17c478bd9Sstevel@tonic-gate/*
27c478bd9Sstevel@tonic-gate * CDDL HEADER START
37c478bd9Sstevel@tonic-gate *
47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
58475e043SDan OpenSolaris Anderson * Common Development and Distribution License (the "License").
68475e043SDan OpenSolaris Anderson * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate *
87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate * and limitations under the License.
127c478bd9Sstevel@tonic-gate *
137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate *
197c478bd9Sstevel@tonic-gate * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
217c478bd9Sstevel@tonic-gate/*
228475e043SDan OpenSolaris Anderson * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate */
257c478bd9Sstevel@tonic-gate
267c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
277c478bd9Sstevel@tonic-gate
287c478bd9Sstevel@tonic-gate#if defined(lint) || defined(__lint)
297c478bd9Sstevel@tonic-gate
307c478bd9Sstevel@tonic-gate#include <sys/types.h>
317c478bd9Sstevel@tonic-gate
/*
 * Lint-only stand-in for big_mul_set_vec(); it performs no arithmetic and
 * always reports a zero carry.  The working routine is the assembly in the
 * non-lint half of this file.
 */
/* ARGSUSED */
uint64_t
big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	return (0);
}
367c478bd9Sstevel@tonic-gate
/*
 * Lint-only stand-in for big_mul_add_vec(); it performs no arithmetic and
 * always reports a zero carry.  The working routine is the assembly in the
 * non-lint half of this file.
 */
/* ARGSUSED */
uint64_t
big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
{
	return (0);
}
417c478bd9Sstevel@tonic-gate
/*
 * Lint-only stand-in for big_sqr_vec(); it leaves both vectors untouched.
 * The working routine is the assembly in the non-lint half of this file.
 */
/* ARGSUSED */
void
big_sqr_vec(uint64_t *r, uint64_t *a, int len)
{
}
467c478bd9Sstevel@tonic-gate
477c478bd9Sstevel@tonic-gate#else	/* lint */
487c478bd9Sstevel@tonic-gate
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_set_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_set_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage:
/	%rdi	r, advanced by 8 digits after each unrolled pass
/	%rsi	a, advanced in step with r
/	%edx	len on entry; afterwards %rdx:%rax hold each 128-bit product
/	%rcx	digit
/	%r8	number of digits still to process
/	%r9	cy, the running carry; copied to %rax on return
/	%r11	next digit of a, loaded ahead of the current multiply
/
/ len is a C int, so only %edx is meaningful on entry: the upper half of
/ %rdx is never read.  len <= 0 returns 0 without touching r or a.
/
	ENTRY(big_mul_set_vec)
	xorq	%r9, %r9		/ cy = 0; must be set before any exit
	testl	%edx, %edx
	jle	.L17			/ if (len <= 0) return (0)

	movl	%edx, %r8d		/ r8 = len (zero-extended); rdx is used by mul

.L15:
	cmpq	$8, %r8			/ len - 8
	jb	.L16			/ fewer than 8 digits left: do the tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	8(%rsi), %r11		/ load a[1] ahead of its use
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ load a[2] ahead of its use
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ load a[3] ahead of its use
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ load a[4] ahead of its use
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ load a[5] ahead of its use
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ load a[6] ahead of its use
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ load a[7] ahead of its use
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L17			/ nothing left: return cy
	jmp	.L15

/ Tail: between 1 and 7 digits remain (%r8 is non-zero and below 8 here).
.L16:
	movq	0(%rsi), %rax
	mulq	%rcx			/ p = a[0] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	8(%rsi), %rax
	mulq	%rcx			/ p = a[1] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	16(%rsi), %rax
	mulq	%rcx			/ p = a[2] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	24(%rsi), %rax
	mulq	%rcx			/ p = a[3] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	32(%rsi), %rax
	mulq	%rcx			/ p = a[4] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	40(%rsi), %rax
	mulq	%rcx			/ p = a[5] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L17

	movq	48(%rsi), %rax
	mulq	%rcx			/ p = a[6] * digit
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
					/ seventh tail digit is always the last

.L17:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_set_vec)
2178475e043SDan OpenSolaris Anderson
2187c478bd9Sstevel@tonic-gate
/ ------------------------------------------------------------------------
/
/  Implementation of big_mul_add_vec which exploits
/  the 64X64->128 bit  unsigned multiply instruction.
/
/  As defined in Sun's bignum library for pkcs11, bignums are
/  composed of an array of 64-bit "digits" or "chunks" along with
/  descriptive information.
/
/ ------------------------------------------------------------------------

/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ big_mul_add_vec(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
/ Register usage:
/	%rdi	r, advanced by 8 digits after each unrolled pass
/	%rsi	a, advanced in step with r
/	%edx	len on entry; afterwards %rdx:%rax hold each 128-bit product
/	%rcx	digit
/	%r8	number of digits still to process
/	%r9	cy, the running carry; copied to %rax on return
/	%r10	current digit of r, loaded ahead of the add
/	%r11	next digit of a, loaded ahead of the current multiply
/
/ len is a C int, so only %edx is meaningful on entry: the upper half of
/ %rdx is never read.  len <= 0 returns 0 without touching r or a.
/
	ENTRY(big_mul_add_vec)
	xorq	%r9, %r9		/ cy = 0; must be set before any exit
	testl	%edx, %edx
	jle	.L27			/ if (len <= 0) return (0)

	movl	%edx, %r8d		/ r8 = len (zero-extended); rdx is used by mul

.L25:
	cmpq	$8, %r8			/ len - 8
	jb	.L26			/ fewer than 8 digits left: do the tail
	movq	0(%rsi), %rax		/ rax = a[0]
	movq	0(%rdi), %r10		/ r10 = r[0]
	movq	8(%rsi), %r11		/ load a[1] ahead of its use
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	movq	8(%rdi), %r10		/ load r[1] ahead of its use
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		/ load a[2] ahead of its use
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	movq	16(%rdi), %r10		/ load r[2] ahead of its use
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		/ load a[3] ahead of its use
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	movq	24(%rdi), %r10		/ load r[3] ahead of its use
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		/ load a[4] ahead of its use
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	movq	32(%rdi), %r10		/ load r[4] ahead of its use
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		/ load a[5] ahead of its use
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	movq	40(%rdi), %r10		/ load r[5] ahead of its use
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		/ load a[6] ahead of its use
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	movq	48(%rdi), %r10		/ load r[6] ahead of its use
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		/ load a[7] ahead of its use
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	movq	56(%rdi), %r10		/ load r[7] ahead of its use
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			/ p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 56(%rdi)		/ r[7] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)

	addq	$64, %rsi		/ a += 8
	addq	$64, %rdi		/ r += 8
	subq	$8, %r8			/ len -= 8

	jz	.L27			/ nothing left: return cy
	jmp	.L25

/ Tail: between 1 and 7 digits remain (%r8 is non-zero and below 8 here).
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			/ p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 0(%rdi)		/ r[0] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			/ p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 8(%rdi)		/ r[1] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			/ p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 16(%rdi)		/ r[2] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			/ p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 24(%rdi)		/ r[3] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			/ p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 32(%rdi)		/ r[4] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			/ p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 40(%rdi)		/ r[5] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			/ p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		/ p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		/ p += cy
	movq	%rax, 48(%rdi)		/ r[6] = lo(p)
	movq	%rdx, %r9		/ cy = hi(p)
					/ seventh tail digit is always the last

.L27:
	movq	%r9, %rax		/ return (cy)
	ret
	SET_SIZE(big_mul_add_vec)
4327c478bd9Sstevel@tonic-gate
4337c478bd9Sstevel@tonic-gate
/ void
/ big_sqr_vec(uint64_t *r, uint64_t *a, int len)
/
/ r = a * a, where a has len digits and r receives 2 * len digits.
/
/ Two phases:
/   1. Accumulate the cross products a[i] * a[j], i < j, into r starting
/      at r[1], one row per call to big_mul_set_vec / big_mul_add_vec.
/   2. Walk r once, doubling each accumulated digit and adding in the
/      squares a[row]**2, propagating the carry as it goes.
/
/ len is a C int, so only %edx is meaningful on entry; it is zero-extended
/ before use.  len <= 0 returns without touching r, and len == 1 is done
/ out of line (.L35) because the row loop below needs len >= 2.
/
/ The nine pushes (72 bytes) leave %rsp 16-byte aligned at both calls.

	ENTRY(big_sqr_vec)
	cmpl	$1, %edx
	jle	.L35			/ len <= 1 is handled out of line
	movl	%edx, %edx		/ zero-extend len into all of %rdx

	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rdx			/ save arg3, len
	pushq	%rsi			/ save arg2, a
	pushq	%rdi			/ save arg1, r

	leaq	8(%rdi), %r13		/ tr = r + 1
	movq	%rsi, %r14		/ ta = a
	movq	%rdx, %r15		/ tlen = len
	decq	%r15			/ tlen = len - 1
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_set_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
.L31:
	decq	%r15			/ --tlen
	jz	.L32			/ while (--tlen != 0)

	addq	$16, %r13		/ tr += 2
	addq	$8, %r14		/ ++ta
	movq	%r13, %rdi		/ arg1 = tr
	leaq	8(%r14), %rsi		/ arg2 = ta + 1
	movq	%r15, %rdx		/ arg3 = tlen
	movq	0(%r14), %rcx		/ arg4 = ta[0]
	call	big_mul_add_vec
	movq	%rax, 0(%r13, %r15, 8)	/ tr[tlen] = cy
	jmp	.L31

.L32:

/ No more function calls after this.
/ Restore arguments to registers.
/ However, don't use %rdx for arg3, len, because it is heavily
/ used by the hardware MUL instruction.  Use %r8, instead.
	movq	0(%rsp), %rdi		/ %rdi == arg1 == r
	movq	8(%rsp), %rsi		/ %rsi == arg2 == a
	movq	16(%rsp), %r8		/ %r8  == arg3 == len

	movq	0(%rsi), %rax		/ %rax = a[0];
	mulq	%rax			/ s = %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(s)
	movq	%rdx, %r9		/ cy = hi64(s)
	xorq	%rdx, %rdx
	movq	8(%rdi), %rax		/ p = %rdx:%rax = r[1]
	addq	%rax, %rax
	adcq	$0, %rdx		/ p = p << 1
	addq	%r9, %rax
	adcq	$0, %rdx		/ p = (r[1] << 1) + cy
	movq	%rax, 8(%rdi)		/ r[1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)
	movq	$1, %r11		/ row = 1
	movq	$2, %r12		/ col = 2
	movq	%r8, %r15
	decq	%r15			/ tlen = len - 1
.L33:
	cmpq	%r8, %r11		/ row - len
	jae	.L34			/ while (row < len)

	movq	0(%rsi, %r11, 8), %rax	/ s = (uint128_t)a[row]
	mulq	%rax			/ s = s * s
	xorq	%rbx, %rbx
	movq	0(%rdi, %r12, 8), %rcx	/ p = (uint128_t)r[col]
	addq	%rcx, %rcx
	adcq	$0, %rbx		/ p = p << 1
	addq	%rcx, %rax
	adcq	%rbx, %rdx		/ t = p + s
	xorq	%r10, %r10
	movq	%rax, %rbp		/ t2 = 0:lo64(t)
	addq	%r9, %rbp
	adcq	$0, %r10		/ t2 = %r10:%rbp = lo64(t) + cy
	movq	%rbp, 0(%rdi, %r12, 8)	/ r[col] = lo64(t2)
	xorq	%rcx, %rcx
	movq	%rdx, %r9
	addq	%r10, %r9
	adcq	$0, %rcx		/ cy = hi64(t) + hi64(t2)
	cmpq	%r11, %r15
	je	.L34			/ if (row == len - 1) break
	xorq	%rdx, %rdx
	movq	8(%rdi, %r12, 8), %rax
	addq	%rax, %rax
	adcq	$0, %rdx
	addq	%r9, %rax
	adcq	%rcx, %rdx		/ p = (lo64(r[col+1]) << 1) + cy
	movq	%rax, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(p)
	movq	%rdx, %r9		/ cy = hi64(p)

	incq	%r11			/ ++row
	addq	$2, %r12		/ col += 2
	jmp	.L33

.L34:
	movq	%r9, 8(%rdi, %r12, 8)	/ r[col+1] = lo64(cy)

	addq	$24, %rsp		/ skip %rdi, %rsi, %rdx
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx

	ret

/ len <= 1.  Nothing has been pushed yet and the flags are still those of
/ the cmpl at entry.
.L35:
	jl	.L36			/ len <= 0: nothing to do
	movq	0(%rsi), %rax		/ %rax = a[0]
	mulq	%rax			/ %rdx:%rax = a[0]**2
	movq	%rax, 0(%rdi)		/ r[0] = lo64(a[0]**2)
	movq	%rdx, 8(%rdi)		/ r[1] = hi64(a[0]**2)
.L36:
	ret

	SET_SIZE(big_sqr_vec)
5487c478bd9Sstevel@tonic-gate
5497c478bd9Sstevel@tonic-gate#endif	/* lint */
550