1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vsin.S"
30
31#include "libm.h"
32
	RO_DATA
	.align	64
! Table of double-precision constants, addressed via the byte offsets
! #defined below.  p1..p4 / pp1,pp2 are sine-polynomial coefficients and
! q1..q4 / qq1,qq2 are cosine-polynomial coefficients (values are close
! to the Taylor terms 1/n!, minimax-adjusted); the remainder are
! argument-reduction constants and bit masks.
constants:
	.word	0x3ec718e3,0xa6972785	! p4  ~  1/9!  (sin poly, x^9 term)
	.word	0x3ef9fd39,0x94293940	! q4  ~  1/8!  (cos poly, x^8 term)
	.word	0xbf2a019f,0x75ee4be1	! p3  ~ -1/7!
	.word	0xbf56c16b,0xba552569	! q3  ~ -1/6!
	.word	0x3f811111,0x1108c703	! p2  ~  1/5!
	.word	0x3fa55555,0x554f5b35	! q2  ~  1/4!
	.word	0xbfc55555,0x555554d0	! p1  ~ -1/3!
	.word	0xbfdfffff,0xffffff85	! q1  ~ -1/2
	.word	0x3ff00000,0x00000000	! one = 1.0
	.word	0xbfc55555,0x5551fc28	! pp1 ~ -1/3! (short sin poly, table path)
	.word	0x3f811107,0x62eacc9d	! pp2 ~  1/5! (short sin poly, table path)
	.word	0xbfdfffff,0xffff6328	! qq1 ~ -1/2  (short cos poly, table path)
	.word	0x3fa55551,0x5f7acf0c	! qq2 ~  1/4! (short cos poly, table path)
	.word	0x3fe45f30,0x6dc9c883	! invpio2 = 2/pi
	.word	0x43380000,0x00000000	! round = 2^52 + 2^51 (round-to-nearest-int bias)
	.word	0x3ff921fb,0x54400000	! pio2_1: pi/2, first 33 bits
	.word	0x3dd0b461,0x1a600000	! pio2_2: pi/2, next 33 bits
	.word	0x3ba3198a,0x2e000000	! pio2_3: pi/2, next 33 bits
	.word	0x397b839a,0x252049c1	! pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3) tail
	.word	0x80000000,0x00004000	! f30val: sign-bit mask / fpadd32s rounding bump
	.word	0xffff8000,0x00000000	! N.B.: low-order words used
	.word	0x3fc90000,0x80000000	! for sign bit hacking; see
	.word	0x3fc40000,0x00000000	! references to "thresh" below
59
/*
 * Byte offsets of each entry in the "constants" table above.
 * (C comments here are stripped by cpp before the assembler runs,
 * so they are safe on #define lines.)
 */
#define p4		0x0	/* sin poly coefficient ~ 1/9! */
#define q4		0x08	/* cos poly coefficient ~ 1/8! */
#define p3		0x10	/* sin poly coefficient ~ -1/7! */
#define q3		0x18	/* cos poly coefficient ~ -1/6! */
#define p2		0x20	/* sin poly coefficient ~ 1/5! */
#define q2		0x28	/* cos poly coefficient ~ 1/4! */
#define p1		0x30	/* sin poly coefficient ~ -1/3! */
#define q1		0x38	/* cos poly coefficient ~ -1/2 */
#define one		0x40	/* 1.0 */
#define pp1		0x48	/* short sin poly coefficient (table path) */
#define pp2		0x50	/* short sin poly coefficient (table path) */
#define qq1		0x58	/* short cos poly coefficient (table path) */
#define qq2		0x60	/* short cos poly coefficient (table path) */
#define invpio2		0x68	/* 2/pi */
#define round		0x70	/* 2^52 + 2^51, round-to-int bias */
#define pio2_1		0x78	/* pi/2 split into three 33-bit pieces ... */
#define pio2_2		0x80
#define pio2_3		0x88
#define pio2_3t		0x90	/* ... plus the remaining tail */
#define f30val		0x98	/* sign-bit mask (hi) / 0x4000 bump (lo) */
#define mask		0xa0	/* 0xffff800000000000, table-index extractor */
#define thresh		0xa8	/* comparison thresholds; low words do sign hacks */

! local storage indices
! (negative offsets from %fp, below the register save area)

#define xsave		STACK_BIAS-0x8	/* saved x pointer argument */
#define ysave		STACK_BIAS-0x10	/* saved y pointer argument */
#define nsave		STACK_BIAS-0x14	/* saved element count n */
#define sxsave		STACK_BIAS-0x18	/* saved stridex */
#define sysave		STACK_BIAS-0x1c	/* saved stridey */
#define biguns		STACK_BIAS-0x20	/* flag: large args seen */
#define n2		STACK_BIAS-0x24	/* integer quadrant counts, one */
#define n1		STACK_BIAS-0x28	/* per pipeline slot */
#define n0		STACK_BIAS-0x2c
#define x2_1		STACK_BIAS-0x40	/* reduced |x| per slot (16 bytes each) */
#define x1_1		STACK_BIAS-0x50
#define x0_1		STACK_BIAS-0x60
#define y2_0		STACK_BIAS-0x70	/* reduction tail y per slot */
#define y1_0		STACK_BIAS-0x80
#define y0_0		STACK_BIAS-0x90
! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x90

!--------------------------------------------------------------
!	Some defines to keep code more readable
#define LIM_l6		%l6
!	in primary range, contains |x| upper limit when cos(x)=1.
!	in transferring to medium range, denotes what loop was active.
!--------------------------------------------------------------
109
110	ENTRY(__vsin)
111	save	%sp,-SA(MINFRAME)-tmps,%sp
112	PIC_SETUP(g5)
113	PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
114	PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
115	PIC_SET(g5,constants,l5)
116	mov	%l5,%g1
117	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
118
119! ========== primary range ==========
120
121! register use
122
123! i0  n
124! i1  x
125! i2  stridex
126! i3  y
127! i4  stridey
128! i5  0x80000000
129
130! l0  hx0
131! l1  hx1
132! l2  hx2
133! l3  __vlibm_TBL_sincos_hi
134! l4  __vlibm_TBL_sincos_lo
135! l5  0x3fc90000
136! l6  0x3e400000
137! l7  0x3fe921fb
138
139! the following are 64-bit registers in both V8+ and V9
140
141! g1  scratch
142! g5
143
144! o0  py0
145! o1  py1
146! o2  py2
147! o3  oy0
148! o4  oy1
149! o5  oy2
150! o7  scratch
151
152! f0  x0
153! f2
154! f4
155! f6
156! f8  scratch for table base
157! f9  signbit0
158! f10 x1
159! f12
160! f14
161! f16
162! f18 scratch for table base
163! f19 signbit1
164! f20 x2
165! f22
166! f24
167! f26
168! f28 scratch for table base
169! f29 signbit2
170! f30 0x80000000
171! f31 0x4000
172! f32
173! f34
174! f36
175! f38
176! f40
177! f42
178! f44 0xffff800000000000
179! f46 p1
180! f48 p2
181! f50 p3
182! f52 p4
183! f54 one
184! f56 pp1
185! f58 pp2
186! f60 qq1
187! f62 qq2
188
189#ifdef __sparcv9
190	stx	%i1,[%fp+xsave]		! save arguments
191	stx	%i3,[%fp+ysave]
192#else
193	st	%i1,[%fp+xsave]		! save arguments
194	st	%i3,[%fp+ysave]
195#endif
196	st	%i0,[%fp+nsave]
197	st	%i2,[%fp+sxsave]
198	st	%i4,[%fp+sysave]
199	sethi	%hi(0x80000000),%i5	! load/set up constants
200	sethi	%hi(0x3fc90000),%l5
201	sethi	%hi(0x3e400000),LIM_l6
202	sethi	%hi(0x3fe921fb),%l7
203	or	%l7,%lo(0x3fe921fb),%l7
204	ldd	[%g1+f30val],%f30
205	ldd	[%g1+mask],%f44
206	ldd	[%g1+p1],%f46
207	ldd	[%g1+p2],%f48
208	ldd	[%g1+p3],%f50
209	ldd	[%g1+p4],%f52
210	ldd	[%g1+one],%f54
211	ldd	[%g1+pp1],%f56
212	ldd	[%g1+pp2],%f58
213	ldd	[%g1+qq1],%f60
214	ldd	[%g1+qq2],%f62
215	sll	%i2,3,%i2		! scale strides
216	sll	%i4,3,%i4
217	add	%fp,x0_1,%o3		! precondition loop
218	add	%fp,x0_1,%o4
219	add	%fp,x0_1,%o5
220	ld	[%i1],%l0		! hx = *x
221	ld	[%i1],%f0
222	ld	[%i1+4],%f1
223	andn	%l0,%i5,%l0		! hx &= ~0x80000000
224	add	%i1,%i2,%i1		! x += stridex
225
226	ba,pt	%icc,.loop0
227! delay slot
228	nop
229
230	.align 32
231.loop0:
232	lda	[%i1]%asi,%l1		! preload next argument
233	sub	%l0,LIM_l6,%g1
234	sub	%l7,%l0,%o7
235	fands	%f0,%f30,%f9		! save signbit
236
237	lda	[%i1]%asi,%f10
238	orcc	%o7,%g1,%g0
239	mov	%i3,%o0			! py0 = y
240	bl,pn	%icc,.range0		! if hx < 0x3e400000 or > 0x3fe921fb
241
242! delay slot
243	lda	[%i1+4]%asi,%f11
244	addcc	%i0,-1,%i0
245	add	%i3,%i4,%i3		! y += stridey
246	ble,pn	%icc,.endloop1
247
248! delay slot
249	andn	%l1,%i5,%l1
250	add	%i1,%i2,%i1		! x += stridex
251	fabsd	%f0,%f0
252	fmuld	%f54,%f54,%f54		! one*one; a nop for alignment only
253
254.loop1:
255	lda	[%i1]%asi,%l2		! preload next argument
256	sub	%l1,LIM_l6,%g1
257	sub	%l7,%l1,%o7
258	fands	%f10,%f30,%f19		! save signbit
259
260	lda	[%i1]%asi,%f20
261	orcc	%o7,%g1,%g0
262	mov	%i3,%o1			! py1 = y
263	bl,pn	%icc,.range1		! if hx < 0x3e400000 or > 0x3fe921fb
264
265! delay slot
266	lda	[%i1+4]%asi,%f21
267	addcc	%i0,-1,%i0
268	add	%i3,%i4,%i3		! y += stridey
269	ble,pn	%icc,.endloop2
270
271! delay slot
272	andn	%l2,%i5,%l2
273	add	%i1,%i2,%i1		! x += stridex
274	fabsd	%f10,%f10
275	fmuld	%f54,%f54,%f54		! one*one; a nop for alignment only
276
277.loop2:
278	st	%f6,[%o3]
279	sub	%l2,LIM_l6,%g1
280	sub	%l7,%l2,%o7
281	fands	%f20,%f30,%f29		! save signbit
282
283	st	%f7,[%o3+4]
284	orcc	%g1,%o7,%g0
285	mov	%i3,%o2			! py2 = y
286	bl,pn	%icc,.range2		! if hx < 0x3e400000 or > 0x3fe921fb
287
288! delay slot
289	add	%i3,%i4,%i3		! y += stridey
290	cmp	%l0,%l5
291	fabsd	%f20,%f20
292	bl,pn	%icc,.case4
293
294! delay slot
295	st	%f16,[%o4]
296	cmp	%l1,%l5
297	fpadd32s %f0,%f31,%f8
298	bl,pn	%icc,.case2
299
300! delay slot
301	st	%f17,[%o4+4]
302	cmp	%l2,%l5
303	fpadd32s %f10,%f31,%f18
304	bl,pn	%icc,.case1
305
306! delay slot
307	st	%f26,[%o5]
308	mov	%o0,%o3
309	sethi	%hi(0x3fc3c000),%o7
310	fpadd32s %f20,%f31,%f28
311
312	st	%f27,[%o5+4]
313	fand	%f8,%f44,%f2
314	mov	%o1,%o4
315
316	fand	%f18,%f44,%f12
317	mov	%o2,%o5
318	sub	%l0,%o7,%l0
319
320	fand	%f28,%f44,%f22
321	sub	%l1,%o7,%l1
322	sub	%l2,%o7,%l2
323
324	fsubd	%f0,%f2,%f0
325	srl	%l0,10,%l0
326	add	%l3,8,%g1
327
328	fsubd	%f10,%f12,%f10
329	srl	%l1,10,%l1
330
331	fsubd	%f20,%f22,%f20
332	srl	%l2,10,%l2
333
334	fmuld	%f0,%f0,%f2
335	andn	%l0,0x1f,%l0
336
337	fmuld	%f10,%f10,%f12
338	andn	%l1,0x1f,%l1
339
340	fmuld	%f20,%f20,%f22
341	andn	%l2,0x1f,%l2
342
343	fmuld	%f2,%f58,%f6
344	ldd	[%l3+%l0],%f32
345
346	fmuld	%f12,%f58,%f16
347	ldd	[%l3+%l1],%f36
348
349	fmuld	%f22,%f58,%f26
350	ldd	[%l3+%l2],%f40
351
352	faddd	%f6,%f56,%f6
353	fmuld	%f2,%f62,%f4
354	ldd	[%g1+%l0],%f34
355
356	faddd	%f16,%f56,%f16
357	fmuld	%f12,%f62,%f14
358	ldd	[%g1+%l1],%f38
359
360	faddd	%f26,%f56,%f26
361	fmuld	%f22,%f62,%f24
362	ldd	[%g1+%l2],%f42
363
364	fmuld	%f2,%f6,%f6
365	faddd	%f4,%f60,%f4
366
367	fmuld	%f12,%f16,%f16
368	faddd	%f14,%f60,%f14
369
370	fmuld	%f22,%f26,%f26
371	faddd	%f24,%f60,%f24
372
373	faddd	%f6,%f54,%f6
374	fmuld	%f2,%f4,%f4
375
376	faddd	%f16,%f54,%f16
377	fmuld	%f12,%f14,%f14
378
379	faddd	%f26,%f54,%f26
380	fmuld	%f22,%f24,%f24
381
382	fmuld	%f0,%f6,%f6
383	ldd	[%l4+%l0],%f2
384
385	fmuld	%f10,%f16,%f16
386	ldd	[%l4+%l1],%f12
387
388	fmuld	%f20,%f26,%f26
389	ldd	[%l4+%l2],%f22
390
391	fmuld	%f4,%f32,%f4
392	lda	[%i1]%asi,%l0		! preload next argument
393
394	fmuld	%f14,%f36,%f14
395	lda	[%i1]%asi,%f0
396
397	fmuld	%f24,%f40,%f24
398	lda	[%i1+4]%asi,%f1
399
400	fmuld	%f6,%f34,%f6
401	add	%i1,%i2,%i1		! x += stridex
402
403	fmuld	%f16,%f38,%f16
404
405	fmuld	%f26,%f42,%f26
406
407	faddd	%f6,%f4,%f6
408
409	faddd	%f16,%f14,%f16
410
411	faddd	%f26,%f24,%f26
412
413	faddd	%f6,%f2,%f6
414
415	faddd	%f16,%f12,%f16
416
417	faddd	%f26,%f22,%f26
418
419	faddd	%f6,%f32,%f6
420
421	faddd	%f16,%f36,%f16
422
423	faddd	%f26,%f40,%f26
424	andn	%l0,%i5,%l0		! hx &= ~0x80000000
425
426	fors	%f6,%f9,%f6
427	addcc	%i0,-1,%i0
428
429	fors	%f16,%f19,%f16
430	bg,pt	%icc,.loop0
431
432! delay slot
433	fors	%f26,%f29,%f26
434
435	ba,pt	%icc,.endloop0
436! delay slot
437	nop
438
439	.align	32
440.case1:
441	st	%f27,[%o5+4]
442	sethi	%hi(0x3fc3c000),%o7
443	add	%l3,8,%g1
444	fand	%f8,%f44,%f2
445
446	sub	%l0,%o7,%l0
447	sub	%l1,%o7,%l1
448	fand	%f18,%f44,%f12
449	fmuld	%f20,%f20,%f22
450
451	fsubd	%f0,%f2,%f0
452	srl	%l0,10,%l0
453	mov	%o0,%o3
454
455	fsubd	%f10,%f12,%f10
456	srl	%l1,10,%l1
457	mov	%o1,%o4
458
459	fmuld	%f22,%f52,%f24
460	mov	%o2,%o5
461
462	fmuld	%f0,%f0,%f2
463	andn	%l0,0x1f,%l0
464
465	fmuld	%f10,%f10,%f12
466	andn	%l1,0x1f,%l1
467
468	faddd	%f24,%f50,%f24
469
470	fmuld	%f2,%f58,%f6
471	ldd	[%l3+%l0],%f32
472
473	fmuld	%f12,%f58,%f16
474	ldd	[%l3+%l1],%f36
475
476	fmuld	%f22,%f24,%f24
477
478	faddd	%f6,%f56,%f6
479	fmuld	%f2,%f62,%f4
480	ldd	[%g1+%l0],%f34
481
482	faddd	%f16,%f56,%f16
483	fmuld	%f12,%f62,%f14
484	ldd	[%g1+%l1],%f38
485
486	faddd	%f24,%f48,%f24
487
488	fmuld	%f2,%f6,%f6
489	faddd	%f4,%f60,%f4
490
491	fmuld	%f12,%f16,%f16
492	faddd	%f14,%f60,%f14
493
494	fmuld	%f22,%f24,%f24
495
496	faddd	%f6,%f54,%f6
497	fmuld	%f2,%f4,%f4
498
499	faddd	%f16,%f54,%f16
500	fmuld	%f12,%f14,%f14
501
502	faddd	%f24,%f46,%f24
503
504	fmuld	%f0,%f6,%f6
505	ldd	[%l4+%l0],%f2
506
507	fmuld	%f10,%f16,%f16
508	ldd	[%l4+%l1],%f12
509
510	fmuld	%f4,%f32,%f4
511	lda	[%i1]%asi,%l0		! preload next argument
512
513	fmuld	%f14,%f36,%f14
514	lda	[%i1]%asi,%f0
515
516	fmuld	%f6,%f34,%f6
517	lda	[%i1+4]%asi,%f1
518
519	fmuld	%f16,%f38,%f16
520	add	%i1,%i2,%i1		! x += stridex
521
522	fmuld	%f22,%f24,%f24
523
524	faddd	%f6,%f4,%f6
525
526	faddd	%f16,%f14,%f16
527
528	fmuld	%f20,%f24,%f24
529
530	faddd	%f6,%f2,%f6
531
532	faddd	%f16,%f12,%f16
533
534	faddd	%f20,%f24,%f26
535
536	faddd	%f6,%f32,%f6
537
538	faddd	%f16,%f36,%f16
539	andn	%l0,%i5,%l0		! hx &= ~0x80000000
540
541	fors	%f26,%f29,%f26
542	addcc	%i0,-1,%i0
543
544	fors	%f6,%f9,%f6
545	bg,pt	%icc,.loop0
546
547! delay slot
548	fors	%f16,%f19,%f16
549
550	ba,pt	%icc,.endloop0
551! delay slot
552	nop
553
554	.align	32
555.case2:
556	st	%f26,[%o5]
557	cmp	%l2,%l5
558	fpadd32s %f20,%f31,%f28
559	bl,pn	%icc,.case3
560
561! delay slot
562	st	%f27,[%o5+4]
563	sethi	%hi(0x3fc3c000),%o7
564	add	%l3,8,%g1
565	fand	%f8,%f44,%f2
566
567	sub	%l0,%o7,%l0
568	sub	%l2,%o7,%l2
569	fand	%f28,%f44,%f22
570	fmuld	%f10,%f10,%f12
571
572	fsubd	%f0,%f2,%f0
573	srl	%l0,10,%l0
574	mov	%o0,%o3
575
576	fsubd	%f20,%f22,%f20
577	srl	%l2,10,%l2
578	mov	%o2,%o5
579
580	fmuld	%f12,%f52,%f14
581	mov	%o1,%o4
582
583	fmuld	%f0,%f0,%f2
584	andn	%l0,0x1f,%l0
585
586	fmuld	%f20,%f20,%f22
587	andn	%l2,0x1f,%l2
588
589	faddd	%f14,%f50,%f14
590
591	fmuld	%f2,%f58,%f6
592	ldd	[%l3+%l0],%f32
593
594	fmuld	%f22,%f58,%f26
595	ldd	[%l3+%l2],%f40
596
597	fmuld	%f12,%f14,%f14
598
599	faddd	%f6,%f56,%f6
600	fmuld	%f2,%f62,%f4
601	ldd	[%g1+%l0],%f34
602
603	faddd	%f26,%f56,%f26
604	fmuld	%f22,%f62,%f24
605	ldd	[%g1+%l2],%f42
606
607	faddd	%f14,%f48,%f14
608
609	fmuld	%f2,%f6,%f6
610	faddd	%f4,%f60,%f4
611
612	fmuld	%f22,%f26,%f26
613	faddd	%f24,%f60,%f24
614
615	fmuld	%f12,%f14,%f14
616
617	faddd	%f6,%f54,%f6
618	fmuld	%f2,%f4,%f4
619
620	faddd	%f26,%f54,%f26
621	fmuld	%f22,%f24,%f24
622
623	faddd	%f14,%f46,%f14
624
625	fmuld	%f0,%f6,%f6
626	ldd	[%l4+%l0],%f2
627
628	fmuld	%f20,%f26,%f26
629	ldd	[%l4+%l2],%f22
630
631	fmuld	%f4,%f32,%f4
632	lda	[%i1]%asi,%l0		! preload next argument
633
634	fmuld	%f24,%f40,%f24
635	lda	[%i1]%asi,%f0
636
637	fmuld	%f6,%f34,%f6
638	lda	[%i1+4]%asi,%f1
639
640	fmuld	%f26,%f42,%f26
641	add	%i1,%i2,%i1		! x += stridex
642
643	fmuld	%f12,%f14,%f14
644
645	faddd	%f6,%f4,%f6
646
647	faddd	%f26,%f24,%f26
648
649	fmuld	%f10,%f14,%f14
650
651	faddd	%f6,%f2,%f6
652
653	faddd	%f26,%f22,%f26
654
655	faddd	%f10,%f14,%f16
656
657	faddd	%f6,%f32,%f6
658
659	faddd	%f26,%f40,%f26
660	andn	%l0,%i5,%l0		! hx &= ~0x80000000
661
662	fors	%f16,%f19,%f16
663	addcc	%i0,-1,%i0
664
665	fors	%f6,%f9,%f6
666	bg,pt	%icc,.loop0
667
668! delay slot
669	fors	%f26,%f29,%f26
670
671	ba,pt	%icc,.endloop0
672! delay slot
673	nop
674
675	.align	32
676.case3:
677	sethi	%hi(0x3fc3c000),%o7
678	add	%l3,8,%g1
679	fand	%f8,%f44,%f2
680	fmuld	%f10,%f10,%f12
681
682	sub	%l0,%o7,%l0
683	fmuld	%f20,%f20,%f22
684
685	fsubd	%f0,%f2,%f0
686	srl	%l0,10,%l0
687	mov	%o0,%o3
688
689	fmuld	%f12,%f52,%f14
690	mov	%o1,%o4
691
692	fmuld	%f22,%f52,%f24
693	mov	%o2,%o5
694
695	fmuld	%f0,%f0,%f2
696	andn	%l0,0x1f,%l0
697
698	faddd	%f14,%f50,%f14
699
700	faddd	%f24,%f50,%f24
701
702	fmuld	%f2,%f58,%f6
703	ldd	[%l3+%l0],%f32
704
705	fmuld	%f12,%f14,%f14
706
707	fmuld	%f22,%f24,%f24
708
709	faddd	%f6,%f56,%f6
710	fmuld	%f2,%f62,%f4
711	ldd	[%g1+%l0],%f34
712
713	faddd	%f14,%f48,%f14
714
715	faddd	%f24,%f48,%f24
716
717	fmuld	%f2,%f6,%f6
718	faddd	%f4,%f60,%f4
719
720	fmuld	%f12,%f14,%f14
721
722	fmuld	%f22,%f24,%f24
723
724	faddd	%f6,%f54,%f6
725	fmuld	%f2,%f4,%f4
726
727	faddd	%f14,%f46,%f14
728
729	faddd	%f24,%f46,%f24
730
731	fmuld	%f0,%f6,%f6
732	ldd	[%l4+%l0],%f2
733
734	fmuld	%f4,%f32,%f4
735	lda	[%i1]%asi,%l0		! preload next argument
736
737	fmuld	%f12,%f14,%f14
738	lda	[%i1]%asi,%f0
739
740	fmuld	%f6,%f34,%f6
741	lda	[%i1+4]%asi,%f1
742
743	fmuld	%f22,%f24,%f24
744	add	%i1,%i2,%i1		! x += stridex
745
746	fmuld	%f10,%f14,%f14
747
748	faddd	%f6,%f4,%f6
749
750	fmuld	%f20,%f24,%f24
751
752	faddd	%f10,%f14,%f16
753
754	faddd	%f6,%f2,%f6
755
756	faddd	%f20,%f24,%f26
757
758	fors	%f16,%f19,%f16
759	andn	%l0,%i5,%l0		! hx &= ~0x80000000
760
761	faddd	%f6,%f32,%f6
762	addcc	%i0,-1,%i0
763
764	fors	%f26,%f29,%f26
765	bg,pt	%icc,.loop0
766
767! delay slot
768	fors	%f6,%f9,%f6
769
770	ba,pt	%icc,.endloop0
771! delay slot
772	nop
773
774	.align	32
775.case4:
776	st	%f17,[%o4+4]
777	cmp	%l1,%l5
778	fpadd32s %f10,%f31,%f18
779	bl,pn	%icc,.case6
780
781! delay slot
782	st	%f26,[%o5]
783	cmp	%l2,%l5
784	fpadd32s %f20,%f31,%f28
785	bl,pn	%icc,.case5
786
787! delay slot
788	st	%f27,[%o5+4]
789	sethi	%hi(0x3fc3c000),%o7
790	add	%l3,8,%g1
791	fand	%f18,%f44,%f12
792
793	sub	%l1,%o7,%l1
794	sub	%l2,%o7,%l2
795	fand	%f28,%f44,%f22
796	fmuld	%f0,%f0,%f2
797
798	fsubd	%f10,%f12,%f10
799	srl	%l1,10,%l1
800	mov	%o1,%o4
801
802	fsubd	%f20,%f22,%f20
803	srl	%l2,10,%l2
804	mov	%o2,%o5
805
806	fmovd	%f0,%f6
807	fmuld	%f2,%f52,%f4
808	mov	%o0,%o3
809
810	fmuld	%f10,%f10,%f12
811	andn	%l1,0x1f,%l1
812
813	fmuld	%f20,%f20,%f22
814	andn	%l2,0x1f,%l2
815
816	faddd	%f4,%f50,%f4
817
818	fmuld	%f12,%f58,%f16
819	ldd	[%l3+%l1],%f36
820
821	fmuld	%f22,%f58,%f26
822	ldd	[%l3+%l2],%f40
823
824	fmuld	%f2,%f4,%f4
825
826	faddd	%f16,%f56,%f16
827	fmuld	%f12,%f62,%f14
828	ldd	[%g1+%l1],%f38
829
830	faddd	%f26,%f56,%f26
831	fmuld	%f22,%f62,%f24
832	ldd	[%g1+%l2],%f42
833
834	faddd	%f4,%f48,%f4
835
836	fmuld	%f12,%f16,%f16
837	faddd	%f14,%f60,%f14
838
839	fmuld	%f22,%f26,%f26
840	faddd	%f24,%f60,%f24
841
842	fmuld	%f2,%f4,%f4
843
844	faddd	%f16,%f54,%f16
845	fmuld	%f12,%f14,%f14
846
847	faddd	%f26,%f54,%f26
848	fmuld	%f22,%f24,%f24
849
850	faddd	%f4,%f46,%f4
851
852	fmuld	%f10,%f16,%f16
853	ldd	[%l4+%l1],%f12
854
855	fmuld	%f20,%f26,%f26
856	ldd	[%l4+%l2],%f22
857
858	fmuld	%f14,%f36,%f14
859	lda	[%i1]%asi,%l0		! preload next argument
860
861	fmuld	%f24,%f40,%f24
862	lda	[%i1]%asi,%f0
863
864	fmuld	%f16,%f38,%f16
865	lda	[%i1+4]%asi,%f1
866
867	fmuld	%f26,%f42,%f26
868	add	%i1,%i2,%i1		! x += stridex
869
870	fmuld	%f2,%f4,%f4
871
872	faddd	%f16,%f14,%f16
873
874	faddd	%f26,%f24,%f26
875
876	fmuld	%f6,%f4,%f4
877
878	faddd	%f16,%f12,%f16
879
880	faddd	%f26,%f22,%f26
881
882	faddd	%f6,%f4,%f6
883
884	faddd	%f16,%f36,%f16
885
886	faddd	%f26,%f40,%f26
887	andn	%l0,%i5,%l0		! hx &= ~0x80000000
888
889	fors	%f6,%f9,%f6
890	addcc	%i0,-1,%i0
891
892	fors	%f16,%f19,%f16
893	bg,pt	%icc,.loop0
894
895! delay slot
896	fors	%f26,%f29,%f26
897
898	ba,pt	%icc,.endloop0
899! delay slot
900	nop
901
902	.align	32
903.case5:
904	sethi	%hi(0x3fc3c000),%o7
905	add	%l3,8,%g1
906	fand	%f18,%f44,%f12
907	fmuld	%f0,%f0,%f2
908
909	sub	%l1,%o7,%l1
910	fmuld	%f20,%f20,%f22
911
912	fsubd	%f10,%f12,%f10
913	srl	%l1,10,%l1
914	mov	%o1,%o4
915
916	fmovd	%f0,%f6
917	fmuld	%f2,%f52,%f4
918	mov	%o0,%o3
919
920	fmuld	%f22,%f52,%f24
921	mov	%o2,%o5
922
923	fmuld	%f10,%f10,%f12
924	andn	%l1,0x1f,%l1
925
926	faddd	%f4,%f50,%f4
927
928	faddd	%f24,%f50,%f24
929
930	fmuld	%f12,%f58,%f16
931	ldd	[%l3+%l1],%f36
932
933	fmuld	%f2,%f4,%f4
934
935	fmuld	%f22,%f24,%f24
936
937	faddd	%f16,%f56,%f16
938	fmuld	%f12,%f62,%f14
939	ldd	[%g1+%l1],%f38
940
941	faddd	%f4,%f48,%f4
942
943	faddd	%f24,%f48,%f24
944
945	fmuld	%f12,%f16,%f16
946	faddd	%f14,%f60,%f14
947
948	fmuld	%f2,%f4,%f4
949
950	fmuld	%f22,%f24,%f24
951
952	faddd	%f16,%f54,%f16
953	fmuld	%f12,%f14,%f14
954
955	faddd	%f4,%f46,%f4
956
957	faddd	%f24,%f46,%f24
958
959	fmuld	%f10,%f16,%f16
960	ldd	[%l4+%l1],%f12
961
962	fmuld	%f14,%f36,%f14
963	lda	[%i1]%asi,%l0		! preload next argument
964
965	fmuld	%f2,%f4,%f4
966	lda	[%i1]%asi,%f0
967
968	fmuld	%f16,%f38,%f16
969	lda	[%i1+4]%asi,%f1
970
971	fmuld	%f22,%f24,%f24
972	add	%i1,%i2,%i1		! x += stridex
973
974	fmuld	%f6,%f4,%f4
975
976	faddd	%f16,%f14,%f16
977
978	fmuld	%f20,%f24,%f24
979
980	faddd	%f6,%f4,%f6
981
982	faddd	%f16,%f12,%f16
983
984	faddd	%f20,%f24,%f26
985
986	fors	%f6,%f9,%f6
987	andn	%l0,%i5,%l0		! hx &= ~0x80000000
988
989	faddd	%f16,%f36,%f16
990	addcc	%i0,-1,%i0
991
992	fors	%f26,%f29,%f26
993	bg,pt	%icc,.loop0
994
995! delay slot
996	fors	%f16,%f19,%f16
997
998	ba,pt	%icc,.endloop0
999! delay slot
1000	nop
1001
1002	.align	32
1003.case6:
1004	st	%f27,[%o5+4]
1005	cmp	%l2,%l5
1006	fpadd32s %f20,%f31,%f28
1007	bl,pn	%icc,.case7
1008
1009! delay slot
1010	sethi	%hi(0x3fc3c000),%o7
1011	add	%l3,8,%g1
1012	fand	%f28,%f44,%f22
1013	fmuld	%f0,%f0,%f2
1014
1015	sub	%l2,%o7,%l2
1016	fmuld	%f10,%f10,%f12
1017
1018	fsubd	%f20,%f22,%f20
1019	srl	%l2,10,%l2
1020	mov	%o2,%o5
1021
1022	fmovd	%f0,%f6
1023	fmuld	%f2,%f52,%f4
1024	mov	%o0,%o3
1025
1026	fmuld	%f12,%f52,%f14
1027	mov	%o1,%o4
1028
1029	fmuld	%f20,%f20,%f22
1030	andn	%l2,0x1f,%l2
1031
1032	faddd	%f4,%f50,%f4
1033
1034	faddd	%f14,%f50,%f14
1035
1036	fmuld	%f22,%f58,%f26
1037	ldd	[%l3+%l2],%f40
1038
1039	fmuld	%f2,%f4,%f4
1040
1041	fmuld	%f12,%f14,%f14
1042
1043	faddd	%f26,%f56,%f26
1044	fmuld	%f22,%f62,%f24
1045	ldd	[%g1+%l2],%f42
1046
1047	faddd	%f4,%f48,%f4
1048
1049	faddd	%f14,%f48,%f14
1050
1051	fmuld	%f22,%f26,%f26
1052	faddd	%f24,%f60,%f24
1053
1054	fmuld	%f2,%f4,%f4
1055
1056	fmuld	%f12,%f14,%f14
1057
1058	faddd	%f26,%f54,%f26
1059	fmuld	%f22,%f24,%f24
1060
1061	faddd	%f4,%f46,%f4
1062
1063	faddd	%f14,%f46,%f14
1064
1065	fmuld	%f20,%f26,%f26
1066	ldd	[%l4+%l2],%f22
1067
1068	fmuld	%f24,%f40,%f24
1069	lda	[%i1]%asi,%l0		! preload next argument
1070
1071	fmuld	%f2,%f4,%f4
1072	lda	[%i1]%asi,%f0
1073
1074	fmuld	%f26,%f42,%f26
1075	lda	[%i1+4]%asi,%f1
1076
1077	fmuld	%f12,%f14,%f14
1078	add	%i1,%i2,%i1		! x += stridex
1079
1080	fmuld	%f6,%f4,%f4
1081
1082	faddd	%f26,%f24,%f26
1083
1084	fmuld	%f10,%f14,%f14
1085
1086	faddd	%f6,%f4,%f6
1087
1088	faddd	%f26,%f22,%f26
1089
1090	faddd	%f10,%f14,%f16
1091
1092	fors	%f6,%f9,%f6
1093	andn	%l0,%i5,%l0		! hx &= ~0x80000000
1094
1095	faddd	%f26,%f40,%f26
1096	addcc	%i0,-1,%i0
1097
1098	fors	%f16,%f19,%f16
1099	bg,pt	%icc,.loop0
1100
1101! delay slot
1102	fors	%f26,%f29,%f26
1103
1104	ba,pt	%icc,.endloop0
1105! delay slot
1106	nop
1107
1108	.align	32
1109.case7:
1110	fmuld	%f0,%f0,%f2
1111	fmovd	%f0,%f6
1112	mov	%o0,%o3
1113
1114	fmuld	%f10,%f10,%f12
1115	mov	%o1,%o4
1116
1117	fmuld	%f20,%f20,%f22
1118	mov	%o2,%o5
1119
1120	fmuld	%f2,%f52,%f4
1121	lda	[%i1]%asi,%l0		! preload next argument
1122
1123	fmuld	%f12,%f52,%f14
1124	lda	[%i1]%asi,%f0
1125
1126	fmuld	%f22,%f52,%f24
1127	lda	[%i1+4]%asi,%f1
1128
1129	faddd	%f4,%f50,%f4
1130	add	%i1,%i2,%i1		! x += stridex
1131
1132	faddd	%f14,%f50,%f14
1133
1134	faddd	%f24,%f50,%f24
1135
1136	fmuld	%f2,%f4,%f4
1137
1138	fmuld	%f12,%f14,%f14
1139
1140	fmuld	%f22,%f24,%f24
1141
1142	faddd	%f4,%f48,%f4
1143
1144	faddd	%f14,%f48,%f14
1145
1146	faddd	%f24,%f48,%f24
1147
1148	fmuld	%f2,%f4,%f4
1149
1150	fmuld	%f12,%f14,%f14
1151
1152	fmuld	%f22,%f24,%f24
1153
1154	faddd	%f4,%f46,%f4
1155
1156	faddd	%f14,%f46,%f14
1157
1158	faddd	%f24,%f46,%f24
1159
1160	fmuld	%f2,%f4,%f4
1161
1162	fmuld	%f12,%f14,%f14
1163
1164	fmuld	%f22,%f24,%f24
1165
1166	fmuld	%f6,%f4,%f4
1167
1168	fmuld	%f10,%f14,%f14
1169
1170	fmuld	%f20,%f24,%f24
1171
1172	faddd	%f6,%f4,%f6
1173
1174	faddd	%f10,%f14,%f16
1175
1176	faddd	%f20,%f24,%f26
1177	andn	%l0,%i5,%l0		! hx &= ~0x80000000
1178
1179	fors	%f6,%f9,%f6
1180	addcc	%i0,-1,%i0
1181
1182	fors	%f16,%f19,%f16
1183	bg,pt	%icc,.loop0
1184
1185! delay slot
1186	fors	%f26,%f29,%f26
1187
1188	ba,pt	%icc,.endloop0
1189! delay slot
1190	nop
1191
1192
1193	.align	32
1194.endloop2:
1195	cmp	%l1,%l5
1196	bl,pn	%icc,1f
1197! delay slot
1198	fabsd	%f10,%f10
1199	sethi	%hi(0x3fc3c000),%o7
1200	fpadd32s %f10,%f31,%f18
1201	add	%l3,8,%g1
1202	fand	%f18,%f44,%f12
1203	sub	%l1,%o7,%l1
1204	fsubd	%f10,%f12,%f10
1205	srl	%l1,10,%l1
1206	fmuld	%f10,%f10,%f12
1207	andn	%l1,0x1f,%l1
1208	fmuld	%f12,%f58,%f20
1209	ldd	[%l3+%l1],%f36
1210	faddd	%f20,%f56,%f20
1211	fmuld	%f12,%f62,%f14
1212	ldd	[%g1+%l1],%f38
1213	fmuld	%f12,%f20,%f20
1214	faddd	%f14,%f60,%f14
1215	faddd	%f20,%f54,%f20
1216	fmuld	%f12,%f14,%f14
1217	fmuld	%f10,%f20,%f20
1218	ldd	[%l4+%l1],%f12
1219	fmuld	%f14,%f36,%f14
1220	fmuld	%f20,%f38,%f20
1221	faddd	%f20,%f14,%f20
1222	faddd	%f20,%f12,%f20
1223	ba,pt	%icc,2f
1224! delay slot
1225	faddd	%f20,%f36,%f20
12261:
1227	fmuld	%f10,%f10,%f12
1228	fmuld	%f12,%f52,%f14
1229	faddd	%f14,%f50,%f14
1230	fmuld	%f12,%f14,%f14
1231	faddd	%f14,%f48,%f14
1232	fmuld	%f12,%f14,%f14
1233	faddd	%f14,%f46,%f14
1234	fmuld	%f12,%f14,%f14
1235	fmuld	%f10,%f14,%f14
1236	faddd	%f10,%f14,%f20
12372:
1238	fors	%f20,%f19,%f20
1239	st	%f20,[%o1]
1240	st	%f21,[%o1+4]
1241
1242.endloop1:
1243	cmp	%l0,%l5
1244	bl,pn	%icc,1f
1245! delay slot
1246	fabsd	%f0,%f0
1247	sethi	%hi(0x3fc3c000),%o7
1248	fpadd32s %f0,%f31,%f8
1249	add	%l3,8,%g1
1250	fand	%f8,%f44,%f2
1251	sub	%l0,%o7,%l0
1252	fsubd	%f0,%f2,%f0
1253	srl	%l0,10,%l0
1254	fmuld	%f0,%f0,%f2
1255	andn	%l0,0x1f,%l0
1256	fmuld	%f2,%f58,%f20
1257	ldd	[%l3+%l0],%f32
1258	faddd	%f20,%f56,%f20
1259	fmuld	%f2,%f62,%f4
1260	ldd	[%g1+%l0],%f34
1261	fmuld	%f2,%f20,%f20
1262	faddd	%f4,%f60,%f4
1263	faddd	%f20,%f54,%f20
1264	fmuld	%f2,%f4,%f4
1265	fmuld	%f0,%f20,%f20
1266	ldd	[%l4+%l0],%f2
1267	fmuld	%f4,%f32,%f4
1268	fmuld	%f20,%f34,%f20
1269	faddd	%f20,%f4,%f20
1270	faddd	%f20,%f2,%f20
1271	ba,pt	%icc,2f
1272! delay slot
1273	faddd	%f20,%f32,%f20
12741:
1275	fmuld	%f0,%f0,%f2
1276	fmuld	%f2,%f52,%f4
1277	faddd	%f4,%f50,%f4
1278	fmuld	%f2,%f4,%f4
1279	faddd	%f4,%f48,%f4
1280	fmuld	%f2,%f4,%f4
1281	faddd	%f4,%f46,%f4
1282	fmuld	%f2,%f4,%f4
1283	fmuld	%f0,%f4,%f4
1284	faddd	%f0,%f4,%f20
12852:
1286	fors	%f20,%f9,%f20
1287	st	%f20,[%o0]
1288	st	%f21,[%o0+4]
1289
1290.endloop0:
1291	st	%f6,[%o3]
1292	st	%f7,[%o3+4]
1293	st	%f16,[%o4]
1294	st	%f17,[%o4+4]
1295	st	%f26,[%o5]
1296	st	%f27,[%o5+4]
1297
1298! return.  finished off with only primary range arguments.
1299
1300	ret
1301	restore
1302
1303
1304	.align	32
1305.range0:
1306	cmp	%l0,LIM_l6
1307	bg,a,pt	%icc,.MEDIUM		! branch if x is not tiny
1308! delay slot, annulled if branch not taken
1309	mov	0x1,LIM_l6		! set "processing loop0"
1310	st	%f0,[%o0]		! *y = *x with inexact if x nonzero
1311	st	%f1,[%o0+4]
1312	fdtoi	%f0,%f2
1313	addcc	%i0,-1,%i0
1314	ble,pn	%icc,.endloop0
1315! delay slot, harmless if branch taken
1316	add	%i3,%i4,%i3		! y += stridey
1317	andn	%l1,%i5,%l0		! hx &= ~0x80000000
1318	fmovd	%f10,%f0
1319	ba,pt	%icc,.loop0
1320! delay slot
1321	add	%i1,%i2,%i1		! x += stridex
1322
1323
1324	.align	32
1325.range1:
1326	cmp	%l1,LIM_l6
1327	bg,a,pt	%icc,.MEDIUM		! branch if x is not tiny
1328! delay slot, annulled if branch not taken
1329	mov	0x2,LIM_l6		! set "processing loop1"
1330	st	%f10,[%o1]		! *y = *x with inexact if x nonzero
1331	st	%f11,[%o1+4]
1332	fdtoi	%f10,%f12
1333	addcc	%i0,-1,%i0
1334	ble,pn	%icc,.endloop1
1335! delay slot, harmless if branch taken
1336	add	%i3,%i4,%i3		! y += stridey
1337	andn	%l2,%i5,%l1		! hx &= ~0x80000000
1338	fmovd	%f20,%f10
1339	ba,pt	%icc,.loop1
1340! delay slot
1341	add	%i1,%i2,%i1		! x += stridex
1342
1343
1344	.align	32
1345.range2:
1346	cmp	%l2,LIM_l6
1347	bg,a,pt	%icc,.MEDIUM		! branch if x is not tiny
1348! delay slot, annulled if branch not taken
1349	mov	0x3,LIM_l6		! set "processing loop2"
1350	st	%f20,[%o2]		! *y = *x with inexact if x nonzero
1351	st	%f21,[%o2+4]
1352	fdtoi	%f20,%f22
13531:
1354	addcc	%i0,-1,%i0
1355	ble,pn	%icc,.endloop2
1356! delay slot
1357	nop
1358	ld	[%i1],%l2
1359	ld	[%i1],%f20
1360	ld	[%i1+4],%f21
1361	andn	%l2,%i5,%l2		! hx &= ~0x80000000
1362	ba,pt	%icc,.loop2
1363! delay slot
1364	add	%i1,%i2,%i1		! x += stridex
1365
1366
1367	.align	32
1368.MEDIUM:
1369
1370! ========== medium range ==========
1371
1372! register use
1373
1374! i0  n
1375! i1  x
1376! i2  stridex
1377! i3  y
1378! i4  stridey
1379! i5  0x80000000
1380
1381! l0  hx0
1382! l1  hx1
1383! l2  hx2
1384! l3  __vlibm_TBL_sincos_hi
1385! l4  __vlibm_TBL_sincos_lo
1386! l5  constants
1387! l6  in transition from pri-range and here, use for biguns
1388! l7  0x413921fb
1389
1390! the following are 64-bit registers in both V8+ and V9
1391
1392! g1  scratch
1393! g5
1394
1395! o0  py0
1396! o1  py1
1397! o2  py2
1398! o3  n0
1399! o4  n1
1400! o5  n2
1401! o7  scratch
1402
1403! f0  x0
1404! f2  n0,y0
1405! f4
1406! f6
1407! f8  scratch for table base
1408! f9  signbit0
1409! f10 x1
1410! f12 n1,y1
1411! f14
1412! f16
1413! f18 scratch for table base
1414! f19 signbit1
1415! f20 x2
1416! f22 n2,y2
1417! f24
1418! f26
1419! f28 scratch for table base
1420! f29 signbit2
1421! f30 0x80000000
1422! f31 0x4000
1423! f32
1424! f34
1425! f36
1426! f38
1427! f40 invpio2
1428! f42 round
1429! f44 0xffff800000000000
1430! f46 pio2_1
1431! f48 pio2_2
1432! f50 pio2_3
1433! f52 pio2_3t
1434! f54 one
1435! f56 pp1
1436! f58 pp2
1437! f60 qq1
1438! f62 qq2
1439
1440	PIC_SET(g5,constants,l5)
1441
1442	! %o3,%o4,%o5 need to be stored
1443	st	%f6,[%o3]
1444	sethi	%hi(0x413921fb),%l7
1445	st	%f7,[%o3+4]
1446	or	%l7,%lo(0x413921fb),%l7
1447	st	%f16,[%o4]
1448	st	%f17,[%o4+4]
1449	st	%f26,[%o5]
1450	st	%f27,[%o5+4]
1451	ldd	[%l5+invpio2],%f40
1452	ldd	[%l5+round],%f42
1453	ldd	[%l5+pio2_1],%f46
1454	ldd	[%l5+pio2_2],%f48
1455	ldd	[%l5+pio2_3],%f50
1456	ldd	[%l5+pio2_3t],%f52
1457	std	%f54,[%fp+x0_1+8]	! set up stack data
1458	std	%f54,[%fp+x1_1+8]
1459	std	%f54,[%fp+x2_1+8]
1460	stx	%g0,[%fp+y0_0+8]
1461	stx	%g0,[%fp+y1_0+8]
1462	stx	%g0,[%fp+y2_0+8]
1463
1464!	branched here in the middle of the array.  Need to adjust
1465!	for the members of the triple that were selected in the primary
1466!	loop.
1467
1468!	no adjustment since all three selected here
1469	subcc	LIM_l6,0x1,%g0		! continue in LOOP0?
1470	bz,a	%icc,.LOOP0
1471	mov	0x0,LIM_l6		! delay slot set biguns=0
1472
!	adjust 1st triple since the 2nd and 3rd were done here
1474	subcc	LIM_l6,0x2,%g0		! continue in LOOP1?
1475	fors	%f0,%f9,%f0		! restore sign bit
1476	fmuld	%f0,%f40,%f2		! adj LOOP0
1477	bz,a	%icc,.LOOP1
1478	mov	0x0,LIM_l6		! delay slot set biguns=0
1479
!	adjust 1st and 2nd triples since the 3rd was done here
1481	subcc	LIM_l6,0x3,%g0		! continue in LOOP2?
1482	!done fmuld	%f0,%f40,%f2		! adj LOOP0
1483	sub	%i3,%i4,%i3		! adjust to not double increment
1484	fors	%f10,%f19,%f10		! restore sign bit
1485	fmuld	%f10,%f40,%f12		! adj LOOP1
1486	faddd	%f2,%f42,%f2		! adj LOOP1
1487	bz,a	%icc,.LOOP2
1488	mov	0x0,LIM_l6		! delay slot set biguns=0
1489
1490	.align 32
1491.LOOP0:
1492	lda	[%i1]%asi,%l1		! preload next argument
1493	mov	%i3,%o0			! py0 = y
1494	lda	[%i1]%asi,%f10
1495	cmp	%l0,%l7
1496	add	%i3,%i4,%i3		! y += stridey
1497	bg,pn	%icc,.BIG0		! if hx > 0x413921fb
1498
1499! delay slot
1500	lda	[%i1+4]%asi,%f11
1501	addcc	%i0,-1,%i0
1502	add	%i1,%i2,%i1		! x += stridex
1503	ble,pn	%icc,.ENDLOOP1
1504
1505! delay slot
1506	andn	%l1,%i5,%l1
1507	nop
1508	fmuld	%f0,%f40,%f2
1509	fabsd	%f54,%f54		! a nop for alignment only
1510
1511.LOOP1:
1512	lda	[%i1]%asi,%l2		! preload next argument
1513	mov	%i3,%o1			! py1 = y
1514
1515	lda	[%i1]%asi,%f20
1516	cmp	%l1,%l7
1517	add	%i3,%i4,%i3		! y += stridey
1518	bg,pn	%icc,.BIG1		! if hx > 0x413921fb
1519
1520! delay slot
1521	lda	[%i1+4]%asi,%f21
1522	addcc	%i0,-1,%i0
1523	add	%i1,%i2,%i1		! x += stridex
1524	ble,pn	%icc,.ENDLOOP2
1525
1526! delay slot
1527	andn	%l2,%i5,%l2
1528	nop
1529	fmuld	%f10,%f40,%f12
1530	faddd	%f2,%f42,%f2
1531
1532.LOOP2:
1533	st	%f3,[%fp+n0]
1534	mov	%i3,%o2			! py2 = y
! --- Medium-argument path, 3-way unrolled: lanes live in %f0 / %f10 / %f20 ---
! Step 1: n = nearest integer to x*(2/pi).  The product (x * constant in %f40,
!         presumably 2/pi from the table -- TODO confirm against setup code
!         above this chunk) is biased by the big rounding constant in %f42 so
!         the low word of the sum is the integer n; that word is spilled to
!         [%fp+n0/n1/n2], then the bias is subtracted back off to get n as a
!         double.
! Step 2: extended-precision reduction x - n*pi/2, with pi/2 split across
!         several doubles (presumably %f46/%f48/%f50/%f52, high to low part).
!         The repeated fsubd "a-b; a-(a-b)" pairs recover the rounding error
!         of each stage so the reduced value keeps full precision.
! Step 3: split the reduced argument into head x (%f0/%f10/%f20) and tail y
!         (%f2/%f12/%f22), save the sign bit, take |x|, and compare |x|
!         against the thresh table entry picked by n&1 (sin row vs cos row);
!         the fcmpgt32 results steer the .CASEn small-argument variants.
! Step 4 (no .CASEn taken, i.e. all three lanes above threshold): table-driven
!         evaluation -- build a table index from the high word of |x|, then
!         evaluate the core polynomial with coefficients in %f54-%f62 and
!         combine with the table rows at [%l3+idx]/[%l4+idx]; falls through
!         into .FIXSIGN.
1535
1536	cmp	%l2,%l7
1537	add	%i3,%i4,%i3		! y += stridey
1538	fmuld	%f20,%f40,%f22		! lane 2: x * (2/pi), start of n computation
1539	bg,pn	%icc,.BIG2		! if hx > 0x413921fb
1540
1541! delay slot
1542	add	%l5,thresh+4,%o7	! %o7 -> low words of thresh entries (sign hack)
1543	faddd	%f12,%f42,%f12
1544	st	%f13,[%fp+n1]		! spill integer n for lane 1
1545
1546! -
1547
1548	add	%l5,thresh,%g1		! %g1 -> high words of thresh entries
1549	faddd	%f22,%f42,%f22
1550	st	%f23,[%fp+n2]		! spill integer n for lane 2
1551
1552	fsubd	%f2,%f42,%f2		! n
1553
1554	fsubd	%f12,%f42,%f12		! n
1555
1556	fsubd	%f22,%f42,%f22		! n
1557
1558	fmuld	%f2,%f46,%f4		! n * (pi/2 high part) -- lane 0
1559
1560	fmuld	%f12,%f46,%f14
1561
1562	fmuld	%f22,%f46,%f24
1563
1564	fsubd	%f0,%f4,%f4		! x - n*pio2_hi (exact by construction)
1565	fmuld	%f2,%f48,%f6		! n * (pi/2 next part)
1566
1567	fsubd	%f10,%f14,%f14
1568	fmuld	%f12,%f48,%f16
1569
1570	fsubd	%f20,%f24,%f24
1571	fmuld	%f22,%f48,%f26
1572
1573	fsubd	%f4,%f6,%f0
1574	ld	[%fp+n0],%o3
1575
1576	fsubd	%f14,%f16,%f10
1577	ld	[%fp+n1],%o4
1578
1579	fsubd	%f24,%f26,%f20
1580	ld	[%fp+n2],%o5
1581
1582	fsubd	%f4,%f0,%f32		! recover rounding error of previous stage
1583	and	%o3,1,%o3		! n & 1 selects sin vs cos row
1584
1585	fsubd	%f14,%f10,%f34
1586	and	%o4,1,%o4
1587
1588	fsubd	%f24,%f20,%f36
1589	and	%o5,1,%o5
1590
1591	fsubd	%f32,%f6,%f32
1592	fmuld	%f2,%f50,%f8		! n * (pi/2 lower part)
1593	sll	%o3,3,%o3		! (n & 1) << 3: byte offset into 8-byte rows
1594
1595	fsubd	%f34,%f16,%f34
1596	fmuld	%f12,%f50,%f18
1597	sll	%o4,3,%o4
1598
1599	fsubd	%f36,%f26,%f36
1600	fmuld	%f22,%f50,%f28
1601	sll	%o5,3,%o5
1602
1603	fsubd	%f8,%f32,%f8
1604	ld	[%g1+%o3],%f6		! thresh high word for this lane's n&1
1605
1606	fsubd	%f18,%f34,%f18
1607	ld	[%g1+%o4],%f16
1608
1609	fsubd	%f28,%f36,%f28
1610	ld	[%g1+%o5],%f26
1611
1612	fsubd	%f0,%f8,%f4
1613
1614	fsubd	%f10,%f18,%f14
1615
1616	fsubd	%f20,%f28,%f24
1617
1618	fsubd	%f0,%f4,%f32
1619
1620	fsubd	%f10,%f14,%f34
1621
1622	fsubd	%f20,%f24,%f36
1623
1624	fsubd	%f32,%f8,%f32
1625	fmuld	%f2,%f52,%f2		! n * (pi/2 lowest part)
1626
1627	fsubd	%f34,%f18,%f34
1628	fmuld	%f12,%f52,%f12
1629
1630	fsubd	%f36,%f28,%f36
1631	fmuld	%f22,%f52,%f22
1632
1633	fsubd	%f2,%f32,%f2
1634	ld	[%o7+%o3],%f8		! thresh low word: sign-hack mask for n&1
1635
1636	fsubd	%f12,%f34,%f12
1637	ld	[%o7+%o4],%f18
1638
1639	fsubd	%f22,%f36,%f22
1640	ld	[%o7+%o5],%f28
1641
1642	fsubd	%f4,%f2,%f0		! x
1643
1644	fsubd	%f14,%f12,%f10		! x
1645
1646	fsubd	%f24,%f22,%f20		! x
1647
1648	fsubd	%f4,%f0,%f4
1649
1650	fsubd	%f14,%f10,%f14
1651
1652	fsubd	%f24,%f20,%f24
1653
1654	fands	%f0,%f30,%f9		! save signbit
1655
1656	fands	%f10,%f30,%f19		! save signbit
1657
1658	fands	%f20,%f30,%f29		! save signbit
1659
1660	fabsd	%f0,%f0
1661	std	%f0,[%fp+x0_1]		! spill |x| head for lane 0
1662
1663	fabsd	%f10,%f10
1664	std	%f10,[%fp+x1_1]
1665
1666	fabsd	%f20,%f20
1667	std	%f20,[%fp+x2_1]
1668
1669	fsubd	%f4,%f2,%f2		! y
1670
1671	fsubd	%f14,%f12,%f12		! y
1672
1673	fsubd	%f24,%f22,%f22		! y
1674
1675	fcmpgt32 %f6,%f0,%l0		! lane 0: thresh > |x| ? (bit 1 = hi-word cmp)
1676
1677	fcmpgt32 %f16,%f10,%l1		! lane 1
1678
1679	fcmpgt32 %f26,%f20,%l2		! lane 2
1680
1681! -- 16 byte aligned
1682	fxors	%f2,%f9,%f2		! give tail y the same sign treatment as x
1683
1684	fxors	%f12,%f19,%f12
1685
1686	fxors	%f22,%f29,%f22
1687
1688	fands	%f9,%f8,%f9		! if (n & 1) clear sign bit
1689	andcc	%l0,2,%g0
1690	bne,pn	%icc,.CASE4		! lane 0 below threshold
1691
1692! delay slot
1693	fands	%f19,%f18,%f19		! if (n & 1) clear sign bit
1694	andcc	%l1,2,%g0
1695	bne,pn	%icc,.CASE2		! lane 1 below threshold
1696
1697! delay slot
1698	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
1699	andcc	%l2,2,%g0
1700	bne,pn	%icc,.CASE1		! lane 2 below threshold
1701
1702! delay slot
1703	fpadd32s %f0,%f31,%f8		! bias hi word of |x| for table indexing
1704	sethi	%hi(0x3fc3c000),%o7	! base subtracted from hi word to form index
1705	ld	[%fp+x0_1],%l0
1706
1707	fpadd32s %f10,%f31,%f18
1708	add	%l3,8,%g1		! %g1 -> second double of each table row
1709	ld	[%fp+x1_1],%l1
1710
1711	fpadd32s %f20,%f31,%f28
1712	ld	[%fp+x2_1],%l2
1713
1714	fand	%f8,%f44,%f4		! nearest tabulated abscissa (masked |x|)
1715	sub	%l0,%o7,%l0
1716
1717	fand	%f18,%f44,%f14
1718	sub	%l1,%o7,%l1
1719
1720	fand	%f28,%f44,%f24
1721	sub	%l2,%o7,%l2
1722
1723	fsubd	%f0,%f4,%f0		! x -= tabulated abscissa
1724	srl	%l0,10,%l0
1725
1726	fsubd	%f10,%f14,%f10
1727	srl	%l1,10,%l1
1728
1729	fsubd	%f20,%f24,%f20
1730	srl	%l2,10,%l2
1731
1732	faddd	%f0,%f2,%f0		! fold in the reduction tail y
1733	andn	%l0,0x1f,%l0		! align index to 32-byte table rows
1734
1735	faddd	%f10,%f12,%f10
1736	andn	%l1,0x1f,%l1
1737
1738	faddd	%f20,%f22,%f20
1739	andn	%l2,0x1f,%l2
1740
1741	fmuld	%f0,%f0,%f2		! z = x*x
1742	add	%l0,%o3,%l0		! + (n&1)<<3: sin vs cos half of the row
1743
1744	fmuld	%f10,%f10,%f12
1745	add	%l1,%o4,%l1
1746
1747	fmuld	%f20,%f20,%f22
1748	add	%l2,%o5,%l2
1749
1750	fmuld	%f2,%f58,%f6		! polynomial in z, coeffs %f54-%f62
1751	ldd	[%l3+%l0],%f32		! table row, first double
1752
1753	fmuld	%f12,%f58,%f16
1754	ldd	[%l3+%l1],%f34
1755
1756	fmuld	%f22,%f58,%f26
1757	ldd	[%l3+%l2],%f36
1758
1759	faddd	%f6,%f56,%f6
1760	fmuld	%f2,%f62,%f4
1761
1762	faddd	%f16,%f56,%f16
1763	fmuld	%f12,%f62,%f14
1764
1765	faddd	%f26,%f56,%f26
1766	fmuld	%f22,%f62,%f24
1767
1768	fmuld	%f2,%f6,%f6
1769	faddd	%f4,%f60,%f4
1770
1771	fmuld	%f12,%f16,%f16
1772	faddd	%f14,%f60,%f14
1773
1774	fmuld	%f22,%f26,%f26
1775	faddd	%f24,%f60,%f24
1776
1777	faddd	%f6,%f54,%f6
1778	fmuld	%f2,%f4,%f4
1779
1780	faddd	%f16,%f54,%f16
1781	fmuld	%f12,%f14,%f14
1782
1783	faddd	%f26,%f54,%f26
1784	fmuld	%f22,%f24,%f24
1785
1786	fmuld	%f0,%f6,%f6		! odd part: multiply by x
1787	ldd	[%g1+%l0],%f2		! table row, second double
1788
1789	fmuld	%f10,%f16,%f16
1790	ldd	[%g1+%l1],%f12
1791
1792	fmuld	%f20,%f26,%f26
1793	ldd	[%g1+%l2],%f22
1794
1795	fmuld	%f4,%f32,%f4
1796	ldd	[%l4+%l0],%f0		! companion table entry
1797
1798	fmuld	%f14,%f34,%f14
1799	ldd	[%l4+%l1],%f10
1800
1801	fmuld	%f24,%f36,%f24
1802	ldd	[%l4+%l2],%f20
1803
1804	fmuld	%f6,%f2,%f6
1805
1806	fmuld	%f16,%f12,%f16
1807
1808	fmuld	%f26,%f22,%f26
1809
1810	faddd	%f6,%f4,%f6		! combine correction terms ...
1811
1812	faddd	%f16,%f14,%f16
1813
1814	faddd	%f26,%f24,%f26
1815
1816	faddd	%f6,%f0,%f6
1817
1818	faddd	%f16,%f10,%f16
1819
1820	faddd	%f26,%f20,%f26
1821
1822	faddd	%f6,%f32,%f6		! ... and add the tabulated base value
1823
1824	faddd	%f16,%f34,%f16
1825
1826	faddd	%f26,%f36,%f26
1827
! .FIXSIGN: common tail for all paths above.  Bit 1 of each lane's saved n
! (i.e. quadrant bit) picks a word from the thresh-4/thresh+... sign-hack
! table whose sign bit is XORed into the saved sign word (%f9/%f19/%f29);
! that sign is then ORed onto each lane's result.  The three results are
! stored through %o0/%o1/%o2, the next argument is preloaded into %f0:%f1
! and %l0 (with its sign stripped), the element count in %i0 is decremented,
! and control returns to .LOOP0 (or exits via .ENDLOOP0 when done).
1828.FIXSIGN:
1829	ld	[%fp+n0],%o3
1830	add	%l5,thresh-4,%g1	! %g1 -> sign-hack words indexed by (n&2)<<2
1831
1832	ld	[%fp+n1],%o4
1833
1834	ld	[%fp+n2],%o5
1835	and	%o3,2,%o3		! quadrant bit of n, lane 0
1836
1837	sll	%o3,2,%o3
1838	and	%o4,2,%o4
1839	lda	[%i1]%asi,%l0		! preload next argument
1840
1841	sll	%o4,2,%o4
1842	and	%o5,2,%o5
1843	ld	[%g1+%o3],%f8		! sign word selected by lane 0's quadrant
1844
1845	sll	%o5,2,%o5
1846	ld	[%g1+%o4],%f18
1847
1848	ld	[%g1+%o5],%f28
1849	fxors	%f9,%f8,%f9		! flip saved sign if n&2
1850
1851	lda	[%i1]%asi,%f0		! preload next argument's hi word
1852	fxors	%f29,%f28,%f29
1853
1854	lda	[%i1+4]%asi,%f1		! ... and lo word
1855	fxors	%f19,%f18,%f19
1856
1857	fors	%f6,%f9,%f6		! tack on sign
1858	add	%i1,%i2,%i1		! x += stridex
1859	st	%f6,[%o0]		! store lane 0 result (hi, lo)
1860
1861	fors	%f26,%f29,%f26		! tack on sign
1862	st	%f7,[%o0+4]
1863
1864	fors	%f16,%f19,%f16		! tack on sign
1865	st	%f26,[%o2]		! store lane 2 result
1866
1867	st	%f27,[%o2+4]
1868	addcc	%i0,-1,%i0		! one more element consumed
1869
1870	st	%f16,[%o1]		! store lane 1 result
1871	andn	%l0,%i5,%l0		! hx &= ~0x80000000
1872	bg,pt	%icc,.LOOP0
1873
1874! delay slot
1875	st	%f17,[%o1+4]
1876
1877	ba,pt	%icc,.ENDLOOP0
1878! delay slot
1879	nop
1880
! .CASE1: lane 2's |x| is below its threshold; lanes 0 and 1 still take the
! table-driven evaluation (same scheme as the fall-through path above), while
! lane 2 is evaluated directly as a polynomial in z2 = x2*x2 with coefficients
! read from the constants area at [%l5+%o5] (+0x10/+0x20/+0x30), the row being
! selected by lane 2's n&1 (sin vs cos), then recombined with the head/tail
! saved at [%fp+x2_1]/[%fp+y2_0].  Exits through .FIXSIGN.
1881	.align	32
1882.CASE1:
1883	fpadd32s %f10,%f31,%f18
1884	sethi	%hi(0x3fc3c000),%o7
1885	ld	[%fp+x0_1],%l0
1886
1887	fand	%f8,%f44,%f4
1888	add	%l3,8,%g1
1889	ld	[%fp+x1_1],%l1
1890
1891	fand	%f18,%f44,%f14
1892	sub	%l0,%o7,%l0
1893
1894	fsubd	%f0,%f4,%f0
1895	srl	%l0,10,%l0
1896	sub	%l1,%o7,%l1
1897
1898	fsubd	%f10,%f14,%f10
1899	srl	%l1,10,%l1
1900
1901	fmuld	%f20,%f20,%f20		! lane 2 small path: z2 = x2*x2
1902	ldd	[%l5+%o5],%f36		! first coefficient of sin/cos row
1903	add	%l5,%o5,%l2
1904
1905	faddd	%f0,%f2,%f0
1906	andn	%l0,0x1f,%l0
1907
1908	faddd	%f10,%f12,%f10
1909	andn	%l1,0x1f,%l1
1910
1911	fmuld	%f20,%f36,%f24
1912	ldd	[%l2+0x10],%f26		! next coefficient
1913	add	%fp,%o5,%o5		! %o5 now offsets the x2_1/y2_0 spill area
1914
1915	fmuld	%f0,%f0,%f2
1916	add	%l0,%o3,%l0
1917
1918	fmuld	%f10,%f10,%f12
1919	add	%l1,%o4,%l1
1920
1921	faddd	%f24,%f26,%f24
1922	ldd	[%l2+0x20],%f36
1923
1924	fmuld	%f2,%f58,%f6
1925	ldd	[%l3+%l0],%f32
1926
1927	fmuld	%f12,%f58,%f16
1928	ldd	[%l3+%l1],%f34
1929
1930	fmuld	%f20,%f24,%f24
1931	ldd	[%l2+0x30],%f26
1932
1933	faddd	%f6,%f56,%f6
1934	fmuld	%f2,%f62,%f4
1935
1936	faddd	%f16,%f56,%f16
1937	fmuld	%f12,%f62,%f14
1938
1939	faddd	%f24,%f36,%f24
1940	ldd	[%o5+x2_1],%f36		! reload lane 2 head |x|
1941
1942	fmuld	%f2,%f6,%f6
1943	faddd	%f4,%f60,%f4
1944
1945	fmuld	%f12,%f16,%f16
1946	faddd	%f14,%f60,%f14
1947
1948	fmuld	%f20,%f24,%f24
1949
1950	faddd	%f6,%f54,%f6
1951	fmuld	%f2,%f4,%f4
1952	ldd	[%g1+%l0],%f2
1953
1954	faddd	%f16,%f54,%f16
1955	fmuld	%f12,%f14,%f14
1956	ldd	[%g1+%l1],%f12
1957
1958	faddd	%f24,%f26,%f24
1959
1960	fmuld	%f0,%f6,%f6
1961	ldd	[%l4+%l0],%f0
1962
1963	fmuld	%f10,%f16,%f16
1964	ldd	[%l4+%l1],%f10
1965
1966	fmuld	%f4,%f32,%f4
1967	std	%f22,[%fp+y2_0]		! spill lane 2 tail for reload below
1968
1969	fmuld	%f14,%f34,%f14
1970
1971	fmuld	%f6,%f2,%f6
1972
1973	fmuld	%f16,%f12,%f16
1974
1975	fmuld	%f20,%f24,%f24
1976
1977	faddd	%f6,%f4,%f6
1978
1979	faddd	%f16,%f14,%f16
1980
1981	fmuld	%f36,%f24,%f24		! scale polynomial by head x2
1982	ldd	[%o5+y2_0],%f22
1983
1984	faddd	%f6,%f0,%f6
1985
1986	faddd	%f16,%f10,%f16
1987
1988	faddd	%f24,%f22,%f24		! add back the tail
1989
1990	faddd	%f6,%f32,%f6
1991
1992	faddd	%f16,%f34,%f16
1993	ba,pt	%icc,.FIXSIGN
1994
1995! delay slot
1996	faddd	%f36,%f24,%f26		! lane 2 result = x2 + correction
1997
! .CASE2: lane 1's |x| is below its threshold.  If lane 2 is below threshold
! too (condition codes from andcc %l2,2), branch to .CASE3; otherwise lanes 0
! and 2 use the table-driven evaluation while lane 1 is computed as a direct
! polynomial in z1 = x1*x1 (coefficient row at [%l5+%o4], chosen by lane 1's
! n&1), recombined with the head/tail saved at [%fp+x1_1]/[%fp+y1_0].
! Exits through .FIXSIGN.
1998	.align	32
1999.CASE2:
2000	fpadd32s %f0,%f31,%f8
2001	ld	[%fp+x0_1],%l0
2002	andcc	%l2,2,%g0
2003	bne,pn	%icc,.CASE3		! lane 2 also below threshold
2004
2005! delay slot
2006	sethi	%hi(0x3fc3c000),%o7
2007	fpadd32s %f20,%f31,%f28
2008	ld	[%fp+x2_1],%l2
2009
2010	fand	%f8,%f44,%f4
2011	sub	%l0,%o7,%l0
2012	add	%l3,8,%g1
2013
2014	fand	%f28,%f44,%f24
2015	sub	%l2,%o7,%l2
2016
2017	fsubd	%f0,%f4,%f0
2018	srl	%l0,10,%l0
2019
2020	fsubd	%f20,%f24,%f20
2021	srl	%l2,10,%l2
2022
2023	fmuld	%f10,%f10,%f10		! lane 1 small path: z1 = x1*x1
2024	ldd	[%l5+%o4],%f34		! first coefficient of sin/cos row
2025	add	%l5,%o4,%l1
2026
2027	faddd	%f0,%f2,%f0
2028	andn	%l0,0x1f,%l0
2029
2030	faddd	%f20,%f22,%f20
2031	andn	%l2,0x1f,%l2
2032
2033	fmuld	%f10,%f34,%f14
2034	ldd	[%l1+0x10],%f16
2035	add	%fp,%o4,%o4		! %o4 now offsets the x1_1/y1_0 spill area
2036
2037	fmuld	%f0,%f0,%f2
2038	add	%l0,%o3,%l0
2039
2040	fmuld	%f20,%f20,%f22
2041	add	%l2,%o5,%l2
2042
2043	faddd	%f14,%f16,%f14
2044	ldd	[%l1+0x20],%f34
2045
2046	fmuld	%f2,%f58,%f6
2047	ldd	[%l3+%l0],%f32
2048
2049	fmuld	%f22,%f58,%f26
2050	ldd	[%l3+%l2],%f36
2051
2052	fmuld	%f10,%f14,%f14
2053	ldd	[%l1+0x30],%f16
2054
2055	faddd	%f6,%f56,%f6
2056	fmuld	%f2,%f62,%f4
2057
2058	faddd	%f26,%f56,%f26
2059	fmuld	%f22,%f62,%f24
2060
2061	faddd	%f14,%f34,%f14
2062	ldd	[%o4+x1_1],%f34		! reload lane 1 head |x|
2063
2064	fmuld	%f2,%f6,%f6
2065	faddd	%f4,%f60,%f4
2066
2067	fmuld	%f22,%f26,%f26
2068	faddd	%f24,%f60,%f24
2069
2070	fmuld	%f10,%f14,%f14
2071
2072	faddd	%f6,%f54,%f6
2073	fmuld	%f2,%f4,%f4
2074	ldd	[%g1+%l0],%f2
2075
2076	faddd	%f26,%f54,%f26
2077	fmuld	%f22,%f24,%f24
2078	ldd	[%g1+%l2],%f22
2079
2080	faddd	%f14,%f16,%f14
2081
2082	fmuld	%f0,%f6,%f6
2083	ldd	[%l4+%l0],%f0
2084
2085	fmuld	%f20,%f26,%f26
2086	ldd	[%l4+%l2],%f20
2087
2088	fmuld	%f4,%f32,%f4
2089	std	%f12,[%fp+y1_0]		! spill lane 1 tail for reload below
2090
2091	fmuld	%f24,%f36,%f24
2092
2093	fmuld	%f6,%f2,%f6
2094
2095	fmuld	%f26,%f22,%f26
2096
2097	fmuld	%f10,%f14,%f14
2098
2099	faddd	%f6,%f4,%f6
2100
2101	faddd	%f26,%f24,%f26
2102
2103	fmuld	%f34,%f14,%f14		! scale polynomial by head x1
2104	ldd	[%o4+y1_0],%f12
2105
2106	faddd	%f6,%f0,%f6
2107
2108	faddd	%f26,%f20,%f26
2109
2110	faddd	%f14,%f12,%f14		! add back the tail
2111
2112	faddd	%f6,%f32,%f6
2113
2114	faddd	%f26,%f36,%f26
2115	ba,pt	%icc,.FIXSIGN
2116
2117! delay slot
2118	faddd	%f34,%f14,%f16		! lane 1 result = x1 + correction
2119
! .CASE3: lanes 1 and 2 are both below threshold; only lane 0 takes the
! table-driven evaluation.  Lanes 1 and 2 are evaluated as direct polynomials
! in x*x (coefficient rows at [%l5+%o4] and [%l5+%o5] respectively), each
! recombined with its spilled head/tail (x1_1/y1_0, x2_1/y2_0).
! Exits through .FIXSIGN.
2120	.align	32
2121.CASE3:
2122	fand	%f8,%f44,%f4
2123	add	%l3,8,%g1
2124	sub	%l0,%o7,%l0
2125
2126	fmuld	%f10,%f10,%f10		! lane 1: z1 = x1*x1
2127	ldd	[%l5+%o4],%f34
2128	add	%l5,%o4,%l1
2129
2130	fsubd	%f0,%f4,%f0
2131	srl	%l0,10,%l0
2132
2133	fmuld	%f20,%f20,%f20		! lane 2: z2 = x2*x2
2134	ldd	[%l5+%o5],%f36
2135	add	%l5,%o5,%l2
2136
2137	fmuld	%f10,%f34,%f14
2138	ldd	[%l1+0x10],%f16
2139	add	%fp,%o4,%o4		! %o4 -> lane 1 spill area
2140
2141	faddd	%f0,%f2,%f0
2142	andn	%l0,0x1f,%l0
2143
2144	fmuld	%f20,%f36,%f24
2145	ldd	[%l2+0x10],%f26
2146	add	%fp,%o5,%o5		! %o5 -> lane 2 spill area
2147
2148	faddd	%f14,%f16,%f14
2149	ldd	[%l1+0x20],%f34
2150
2151	fmuld	%f0,%f0,%f2
2152	add	%l0,%o3,%l0
2153
2154	faddd	%f24,%f26,%f24
2155	ldd	[%l2+0x20],%f36
2156
2157	fmuld	%f10,%f14,%f14
2158	ldd	[%l1+0x30],%f16
2159
2160	fmuld	%f2,%f58,%f6
2161	ldd	[%l3+%l0],%f32
2162
2163	fmuld	%f20,%f24,%f24
2164	ldd	[%l2+0x30],%f26
2165
2166	faddd	%f14,%f34,%f14
2167	ldd	[%o4+x1_1],%f34		! reload lane 1 head |x|
2168
2169	faddd	%f6,%f56,%f6
2170	fmuld	%f2,%f62,%f4
2171
2172	faddd	%f24,%f36,%f24
2173	ldd	[%o5+x2_1],%f36		! reload lane 2 head |x|
2174
2175	fmuld	%f10,%f14,%f14
2176	std	%f12,[%fp+y1_0]		! spill lane 1 tail
2177
2178	fmuld	%f2,%f6,%f6
2179	faddd	%f4,%f60,%f4
2180
2181	fmuld	%f20,%f24,%f24
2182	std	%f22,[%fp+y2_0]		! spill lane 2 tail
2183
2184	faddd	%f14,%f16,%f14
2185
2186	faddd	%f6,%f54,%f6
2187	fmuld	%f2,%f4,%f4
2188	ldd	[%g1+%l0],%f2
2189
2190	faddd	%f24,%f26,%f24
2191
2192	fmuld	%f10,%f14,%f14
2193
2194	fmuld	%f0,%f6,%f6
2195	ldd	[%l4+%l0],%f0
2196
2197	fmuld	%f4,%f32,%f4
2198
2199	fmuld	%f20,%f24,%f24
2200
2201	fmuld	%f6,%f2,%f6
2202
2203	fmuld	%f34,%f14,%f14		! scale lane 1 polynomial by head x1
2204	ldd	[%o4+y1_0],%f12
2205
2206	fmuld	%f36,%f24,%f24		! scale lane 2 polynomial by head x2
2207	ldd	[%o5+y2_0],%f22
2208
2209	faddd	%f6,%f4,%f6
2210
2211	faddd	%f14,%f12,%f14		! lane 1: add back tail
2212
2213	faddd	%f24,%f22,%f24		! lane 2: add back tail
2214
2215	faddd	%f6,%f0,%f6
2216
2217	faddd	%f34,%f14,%f16		! lane 1 result = x1 + correction
2218
2219	faddd	%f36,%f24,%f26		! lane 2 result = x2 + correction
2220	ba,pt	%icc,.FIXSIGN
2221
2222! delay slot
2223	faddd	%f6,%f32,%f6
2224
! .CASE4: lane 0's |x| is below its threshold.  First finishes lane 2's
! deferred sign fixup (delay-slot fands), then dispatches: lane 1 also small
! -> .CASE6; lane 2 also small (andcc %l2,2 tested here, branch below)
! -> .CASE5.  Otherwise this body runs lanes 1 and 2 through the table path
! and evaluates lane 0 as a direct polynomial in z0 = x0*x0 (coefficient row
! at [%l5+%o3]), recombined with the head/tail at x0_1/y0_0.
! Exits through .FIXSIGN.
2225	.align	32
2226.CASE4:
2227	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
2228	sethi	%hi(0x3fc3c000),%o7
2229	andcc	%l1,2,%g0
2230	bne,pn	%icc,.CASE6		! lane 1 also below threshold
2231
2232! delay slot
2233	andcc	%l2,2,%g0		! (also consumed at .CASE6's entry branch)
2234	fpadd32s %f10,%f31,%f18
2235	ld	[%fp+x1_1],%l1
2236	bne,pn	%icc,.CASE5		! lane 2 also below threshold
2237
2238! delay slot
2239	add	%l3,8,%g1
2240	ld	[%fp+x2_1],%l2
2241	fpadd32s %f20,%f31,%f28
2242
2243	fand	%f18,%f44,%f14
2244	sub	%l1,%o7,%l1
2245
2246	fand	%f28,%f44,%f24
2247	sub	%l2,%o7,%l2
2248
2249	fsubd	%f10,%f14,%f10
2250	srl	%l1,10,%l1
2251
2252	fsubd	%f20,%f24,%f20
2253	srl	%l2,10,%l2
2254
2255	fmuld	%f0,%f0,%f0		! lane 0 small path: z0 = x0*x0
2256	ldd	[%l5+%o3],%f32		! first coefficient of sin/cos row
2257	add	%l5,%o3,%l0
2258
2259	faddd	%f10,%f12,%f10
2260	andn	%l1,0x1f,%l1
2261
2262	faddd	%f20,%f22,%f20
2263	andn	%l2,0x1f,%l2
2264
2265	fmuld	%f0,%f32,%f4
2266	ldd	[%l0+0x10],%f6
2267	add	%fp,%o3,%o3		! %o3 -> lane 0 spill area
2268
2269	fmuld	%f10,%f10,%f12
2270	add	%l1,%o4,%l1
2271
2272	fmuld	%f20,%f20,%f22
2273	add	%l2,%o5,%l2
2274
2275	faddd	%f4,%f6,%f4
2276	ldd	[%l0+0x20],%f32
2277
2278	fmuld	%f12,%f58,%f16
2279	ldd	[%l3+%l1],%f34
2280
2281	fmuld	%f22,%f58,%f26
2282	ldd	[%l3+%l2],%f36
2283
2284	fmuld	%f0,%f4,%f4
2285	ldd	[%l0+0x30],%f6
2286
2287	faddd	%f16,%f56,%f16
2288	fmuld	%f12,%f62,%f14
2289
2290	faddd	%f26,%f56,%f26
2291	fmuld	%f22,%f62,%f24
2292
2293	faddd	%f4,%f32,%f4
2294	ldd	[%o3+x0_1],%f32		! reload lane 0 head |x|
2295
2296	fmuld	%f12,%f16,%f16
2297	faddd	%f14,%f60,%f14
2298
2299	fmuld	%f22,%f26,%f26
2300	faddd	%f24,%f60,%f24
2301
2302	fmuld	%f0,%f4,%f4
2303
2304	faddd	%f16,%f54,%f16
2305	fmuld	%f12,%f14,%f14
2306	ldd	[%g1+%l1],%f12
2307
2308	faddd	%f26,%f54,%f26
2309	fmuld	%f22,%f24,%f24
2310	ldd	[%g1+%l2],%f22
2311
2312	faddd	%f4,%f6,%f4
2313
2314	fmuld	%f10,%f16,%f16
2315	ldd	[%l4+%l1],%f10
2316
2317	fmuld	%f20,%f26,%f26
2318	ldd	[%l4+%l2],%f20
2319
2320	fmuld	%f14,%f34,%f14
2321	std	%f2,[%fp+y0_0]		! spill lane 0 tail for reload below
2322
2323	fmuld	%f24,%f36,%f24
2324
2325	fmuld	%f0,%f4,%f4
2326
2327	fmuld	%f16,%f12,%f16
2328
2329	fmuld	%f26,%f22,%f26
2330
2331	fmuld	%f32,%f4,%f4		! scale polynomial by head x0
2332	ldd	[%o3+y0_0],%f2
2333
2334	faddd	%f16,%f14,%f16
2335
2336	faddd	%f26,%f24,%f26
2337
2338	faddd	%f4,%f2,%f4		! add back the tail
2339
2340	faddd	%f16,%f10,%f16
2341
2342	faddd	%f26,%f20,%f26
2343
2344	faddd	%f32,%f4,%f6		! lane 0 result = x0 + correction
2345
2346	faddd	%f16,%f34,%f16
2347	ba,pt	%icc,.FIXSIGN
2348
2349! delay slot
2350	faddd	%f26,%f36,%f26
2351
! .CASE5: lanes 0 and 2 are below threshold; only lane 1 takes the table
! path.  Lanes 0 and 2 are evaluated as direct polynomials in x*x
! (coefficient rows at [%l5+%o3] and [%l5+%o5]), each recombined with its
! spilled head/tail (x0_1/y0_0, x2_1/y2_0).  Exits through .FIXSIGN.
2352	.align	32
2353.CASE5:
2354	fand	%f18,%f44,%f14
2355	sub	%l1,%o7,%l1
2356
2357	fmuld	%f0,%f0,%f0		! lane 0: z0 = x0*x0
2358	ldd	[%l5+%o3],%f32
2359	add	%l5,%o3,%l0
2360
2361	fsubd	%f10,%f14,%f10
2362	srl	%l1,10,%l1
2363
2364	fmuld	%f20,%f20,%f20		! lane 2: z2 = x2*x2
2365	ldd	[%l5+%o5],%f36
2366	add	%l5,%o5,%l2
2367
2368	fmuld	%f0,%f32,%f4
2369	ldd	[%l0+0x10],%f6
2370	add	%fp,%o3,%o3		! %o3 -> lane 0 spill area
2371
2372	faddd	%f10,%f12,%f10
2373	andn	%l1,0x1f,%l1
2374
2375	fmuld	%f20,%f36,%f24
2376	ldd	[%l2+0x10],%f26
2377	add	%fp,%o5,%o5		! %o5 -> lane 2 spill area
2378
2379	faddd	%f4,%f6,%f4
2380	ldd	[%l0+0x20],%f32
2381
2382	fmuld	%f10,%f10,%f12
2383	add	%l1,%o4,%l1
2384
2385	faddd	%f24,%f26,%f24
2386	ldd	[%l2+0x20],%f36
2387
2388	fmuld	%f0,%f4,%f4
2389	ldd	[%l0+0x30],%f6
2390
2391	fmuld	%f12,%f58,%f16
2392	ldd	[%l3+%l1],%f34
2393
2394	fmuld	%f20,%f24,%f24
2395	ldd	[%l2+0x30],%f26
2396
2397	faddd	%f4,%f32,%f4
2398	ldd	[%o3+x0_1],%f32		! reload lane 0 head |x|
2399
2400	faddd	%f16,%f56,%f16
2401	fmuld	%f12,%f62,%f14
2402
2403	faddd	%f24,%f36,%f24
2404	ldd	[%o5+x2_1],%f36		! reload lane 2 head |x|
2405
2406	fmuld	%f0,%f4,%f4
2407	std	%f2,[%fp+y0_0]		! spill lane 0 tail
2408
2409	fmuld	%f12,%f16,%f16
2410	faddd	%f14,%f60,%f14
2411
2412	fmuld	%f20,%f24,%f24
2413	std	%f22,[%fp+y2_0]		! spill lane 2 tail
2414
2415	faddd	%f4,%f6,%f4
2416
2417	faddd	%f16,%f54,%f16
2418	fmuld	%f12,%f14,%f14
2419	ldd	[%g1+%l1],%f12
2420
2421	faddd	%f24,%f26,%f24
2422
2423	fmuld	%f0,%f4,%f4
2424
2425	fmuld	%f10,%f16,%f16
2426	ldd	[%l4+%l1],%f10
2427
2428	fmuld	%f14,%f34,%f14
2429
2430	fmuld	%f20,%f24,%f24
2431
2432	fmuld	%f16,%f12,%f16
2433
2434	fmuld	%f32,%f4,%f4		! scale lane 0 polynomial by head x0
2435	ldd	[%o3+y0_0],%f2
2436
2437	fmuld	%f36,%f24,%f24		! scale lane 2 polynomial by head x2
2438	ldd	[%o5+y2_0],%f22
2439
2440	faddd	%f16,%f14,%f16
2441
2442	faddd	%f4,%f2,%f4		! lane 0: add back tail
2443
2444	faddd	%f24,%f22,%f24		! lane 2: add back tail
2445
2446	faddd	%f16,%f10,%f16
2447
2448	faddd	%f32,%f4,%f6		! lane 0 result = x0 + correction
2449
2450	faddd	%f36,%f24,%f26		! lane 2 result = x2 + correction
2451	ba,pt	%icc,.FIXSIGN
2452
2453! delay slot
2454	faddd	%f16,%f34,%f16
2455
! .CASE6: lanes 0 and 1 are below threshold (entered from .CASE4; the entry
! branch reuses the condition codes set by "andcc %l2,2" back at .CASE4 --
! if lane 2 is small too, go to .CASE7).  Otherwise lane 2 takes the table
! path while lanes 0 and 1 are evaluated as direct polynomials in x*x
! (coefficient rows at [%l5+%o3] / [%l5+%o4]), each recombined with its
! spilled head/tail (x0_1/y0_0, x1_1/y1_0).  Exits through .FIXSIGN.
2456	.align	32
2457.CASE6:
2458	ld	[%fp+x2_1],%l2
2459	add	%l3,8,%g1
2460	bne,pn	%icc,.CASE7		! all three lanes below threshold
2461! delay slot
2462	fpadd32s %f20,%f31,%f28
2463
2464	fand	%f28,%f44,%f24
2465	ldd	[%l5+%o3],%f32
2466	add	%l5,%o3,%l0
2467
2468	fmuld	%f0,%f0,%f0		! lane 0: z0 = x0*x0
2469	sub	%l2,%o7,%l2
2470
2471	fsubd	%f20,%f24,%f20
2472	srl	%l2,10,%l2
2473
2474	fmuld	%f10,%f10,%f10		! lane 1: z1 = x1*x1
2475	ldd	[%l5+%o4],%f34
2476	add	%l5,%o4,%l1
2477
2478	fmuld	%f0,%f32,%f4
2479	ldd	[%l0+0x10],%f6
2480	add	%fp,%o3,%o3		! %o3 -> lane 0 spill area
2481
2482	faddd	%f20,%f22,%f20
2483	andn	%l2,0x1f,%l2
2484
2485	fmuld	%f10,%f34,%f14
2486	ldd	[%l1+0x10],%f16
2487	add	%fp,%o4,%o4		! %o4 -> lane 1 spill area
2488
2489	faddd	%f4,%f6,%f4
2490	ldd	[%l0+0x20],%f32
2491
2492	fmuld	%f20,%f20,%f22
2493	add	%l2,%o5,%l2
2494
2495	faddd	%f14,%f16,%f14
2496	ldd	[%l1+0x20],%f34
2497
2498	fmuld	%f0,%f4,%f4
2499	ldd	[%l0+0x30],%f6
2500
2501	fmuld	%f22,%f58,%f26
2502	ldd	[%l3+%l2],%f36
2503
2504	fmuld	%f10,%f14,%f14
2505	ldd	[%l1+0x30],%f16
2506
2507	faddd	%f4,%f32,%f4
2508	ldd	[%o3+x0_1],%f32		! reload lane 0 head |x|
2509
2510	faddd	%f26,%f56,%f26
2511	fmuld	%f22,%f62,%f24
2512
2513	faddd	%f14,%f34,%f14
2514	ldd	[%o4+x1_1],%f34		! reload lane 1 head |x|
2515
2516	fmuld	%f0,%f4,%f4
2517	std	%f2,[%fp+y0_0]		! spill lane 0 tail
2518
2519	fmuld	%f22,%f26,%f26
2520	faddd	%f24,%f60,%f24
2521
2522	fmuld	%f10,%f14,%f14
2523	std	%f12,[%fp+y1_0]		! spill lane 1 tail
2524
2525	faddd	%f4,%f6,%f4
2526
2527	faddd	%f26,%f54,%f26
2528	fmuld	%f22,%f24,%f24
2529	ldd	[%g1+%l2],%f22
2530
2531	faddd	%f14,%f16,%f14
2532
2533	fmuld	%f0,%f4,%f4
2534
2535	fmuld	%f20,%f26,%f26
2536	ldd	[%l4+%l2],%f20
2537
2538	fmuld	%f24,%f36,%f24
2539
2540	fmuld	%f10,%f14,%f14
2541
2542	fmuld	%f26,%f22,%f26
2543
2544	fmuld	%f32,%f4,%f4		! scale lane 0 polynomial by head x0
2545	ldd	[%o3+y0_0],%f2
2546
2547	fmuld	%f34,%f14,%f14		! scale lane 1 polynomial by head x1
2548	ldd	[%o4+y1_0],%f12
2549
2550	faddd	%f26,%f24,%f26
2551
2552	faddd	%f4,%f2,%f4		! lane 0: add back tail
2553
2554	faddd	%f14,%f12,%f14		! lane 1: add back tail
2555
2556	faddd	%f26,%f20,%f26
2557
2558	faddd	%f32,%f4,%f6		! lane 0 result = x0 + correction
2559
2560	faddd	%f34,%f14,%f16		! lane 1 result = x1 + correction
2561	ba,pt	%icc,.FIXSIGN
2562
2563! delay slot
2564	faddd	%f26,%f36,%f26
2565
! .CASE7: all three lanes are below threshold -- no table lookups at all.
! Each lane is evaluated as a direct polynomial in z = x*x with the
! coefficient rows at [%l5+%o3]/[%l5+%o4]/[%l5+%o5] (+0x10/+0x20/+0x30),
! then scaled by the spilled head |x| (xK_1) and combined with the spilled
! tail (yK_0) to give result = x + correction.  Exits through .FIXSIGN.
2566	.align	32
2567.CASE7:
2568	fmuld	%f0,%f0,%f0		! z0 = x0*x0
2569	ldd	[%l5+%o3],%f32
2570	add	%l5,%o3,%l0
2571
2572	fmuld	%f10,%f10,%f10		! z1 = x1*x1
2573	ldd	[%l5+%o4],%f34
2574	add	%l5,%o4,%l1
2575
2576	fmuld	%f20,%f20,%f20		! z2 = x2*x2
2577	ldd	[%l5+%o5],%f36
2578	add	%l5,%o5,%l2
2579
2580	fmuld	%f0,%f32,%f4
2581	ldd	[%l0+0x10],%f6
2582	add	%fp,%o3,%o3		! %o3/%o4/%o5 -> per-lane spill areas
2583
2584	fmuld	%f10,%f34,%f14
2585	ldd	[%l1+0x10],%f16
2586	add	%fp,%o4,%o4
2587
2588	fmuld	%f20,%f36,%f24
2589	ldd	[%l2+0x10],%f26
2590	add	%fp,%o5,%o5
2591
2592	faddd	%f4,%f6,%f4
2593	ldd	[%l0+0x20],%f32
2594
2595	faddd	%f14,%f16,%f14
2596	ldd	[%l1+0x20],%f34
2597
2598	faddd	%f24,%f26,%f24
2599	ldd	[%l2+0x20],%f36
2600
2601	fmuld	%f0,%f4,%f4
2602	ldd	[%l0+0x30],%f6
2603
2604	fmuld	%f10,%f14,%f14
2605	ldd	[%l1+0x30],%f16
2606
2607	fmuld	%f20,%f24,%f24
2608	ldd	[%l2+0x30],%f26
2609
2610	faddd	%f4,%f32,%f4
2611	ldd	[%o3+x0_1],%f32		! reload lane 0 head |x|
2612
2613	faddd	%f14,%f34,%f14
2614	ldd	[%o4+x1_1],%f34
2615
2616	faddd	%f24,%f36,%f24
2617	ldd	[%o5+x2_1],%f36
2618
2619	fmuld	%f0,%f4,%f4
2620	std	%f2,[%fp+y0_0]		! spill tails for reload below
2621
2622	fmuld	%f10,%f14,%f14
2623	std	%f12,[%fp+y1_0]
2624
2625	fmuld	%f20,%f24,%f24
2626	std	%f22,[%fp+y2_0]
2627
2628	faddd	%f4,%f6,%f4
2629
2630	faddd	%f14,%f16,%f14
2631
2632	faddd	%f24,%f26,%f24
2633
2634	fmuld	%f0,%f4,%f4
2635
2636	fmuld	%f10,%f14,%f14
2637
2638	fmuld	%f20,%f24,%f24
2639
2640	fmuld	%f32,%f4,%f4		! scale by head x0
2641	ldd	[%o3+y0_0],%f2
2642
2643	fmuld	%f34,%f14,%f14
2644	ldd	[%o4+y1_0],%f12
2645
2646	fmuld	%f36,%f24,%f24
2647	ldd	[%o5+y2_0],%f22
2648
2649	faddd	%f4,%f2,%f4		! add back tails
2650
2651	faddd	%f14,%f12,%f14
2652
2653	faddd	%f24,%f22,%f24
2654
2655	faddd	%f32,%f4,%f6		! lane 0 result = x0 + correction
2656
2657	faddd	%f34,%f14,%f16		! lane 1 result
2658	ba,pt	%icc,.FIXSIGN
2659
2660! delay slot
2661	faddd	%f36,%f24,%f26		! lane 2 result
2662
2663
2664	.align	32
2665.ENDLOOP2:
2666	fmuld	%f10,%f40,%f12
2667	add	%l5,thresh,%g1
2668	faddd	%f12,%f42,%f12
2669	st	%f13,[%fp+n1]
2670	fsubd	%f12,%f42,%f12		! n
2671	fmuld	%f12,%f46,%f14
2672	fsubd	%f10,%f14,%f14
2673	fmuld	%f12,%f48,%