/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
29
#include <ucontext.h>
#include <fenv.h>
#include <string.h>
#if defined(__SUNPRO_C)
#include <sunmath.h>
#else
#include <sys/ieeefp.h>
#endif
#include "fex_handler.h"
#include "fenv_inlines.h"
39
/*
 * Fall back to the EIP / EFL register-index names when the generic
 * REG_PC / REG_PS names have not been defined already.
 */
#if !defined(REG_PC)
#define REG_PC	EIP
#endif

#if !defined(REG_PS)
#define REG_PS	EFL
#endif

/*
 * regno(X) converts a general purpose register number X, as assembled
 * from the REX, ModRM and SIB fields of an instruction, into an index
 * into uc_mcontext.gregs[].
 *
 * amd64: 0..3 map to REG_RAX - X, 4 maps to REG_RSP, and anything
 * above 4 maps to REG_RAX + 1 - X.
 * otherwise: X maps to EAX - X.
 *
 * The argument is fully parenthesized so that an expression such as
 * regno(a | b) expands correctly.  Note that the amd64 form evaluates
 * X more than once, so X must not have side effects.
 */
#ifdef __amd64
#define regno(X)	(((X) < 4)? REG_RAX - (X) : \
			(((X) > 4)? REG_RAX + 1 - (X) : REG_RSP))
#else
#define regno(X)	(EAX - (X))
#endif
54
/*
 * Support for SSE instructions
 */
58
59/*
60 * Decode an SSE instruction.  Fill in *inst and return the length of the
61 * instruction in bytes.  Return 0 if the instruction is not recognized.
62 */
63int
64__fex_parse_sse(ucontext_t *uap, sseinst_t *inst)
65{
66	unsigned char	*ip;
67	char		*addr;
68	int		i, dbl, simd, rex, modrm, sib, r;
69
70	i = 0;
71	ip = (unsigned char *)uap->uc_mcontext.gregs[REG_PC];
72
73	/* look for pseudo-prefixes */
74	dbl = 0;
75	simd = SIMD;
76	if (ip[i] == 0xF3) {
77		simd = 0;
78		i++;
79	} else if (ip[i] == 0x66) {
80		dbl = DOUBLE;
81		i++;
82	} else if (ip[i] == 0xF2) {
83		dbl = DOUBLE;
84		simd = 0;
85		i++;
86	}
87
88	/* look for AMD64 REX prefix */
89	rex = 0;
90	if (ip[i] >= 0x40 && ip[i] <= 0x4F) {
91		rex = ip[i];
92		i++;
93	}
94
95	/* parse opcode */
96	if (ip[i++] != 0x0F)
97		return 0;
98	switch (ip[i++]) {
99	case 0x2A:
100		inst->op = (int)cvtsi2ss + simd + dbl;
101		if (!simd)
102			inst->op = (int)inst->op + (rex & 8);
103		break;
104
105	case 0x2C:
106		inst->op = (int)cvttss2si + simd + dbl;
107		if (!simd)
108			inst->op = (int)inst->op + (rex & 8);
109		break;
110
111	case 0x2D:
112		inst->op = (int)cvtss2si + simd + dbl;
113		if (!simd)
114			inst->op = (int)inst->op + (rex & 8);
115		break;
116
117	case 0x2E:
118		/* oddball: scalar instruction in a SIMD opcode group */
119		if (!simd)
120			return 0;
121		inst->op = (int)ucomiss + dbl;
122		break;
123
124	case 0x2F:
125		/* oddball: scalar instruction in a SIMD opcode group */
126		if (!simd)
127			return 0;
128		inst->op = (int)comiss + dbl;
129		break;
130
131	case 0x51:
132		inst->op = (int)sqrtss + simd + dbl;
133		break;
134
135	case 0x58:
136		inst->op = (int)addss + simd + dbl;
137		break;
138
139	case 0x59:
140		inst->op = (int)mulss + simd + dbl;
141		break;
142
143	case 0x5A:
144		inst->op = (int)cvtss2sd + simd + dbl;
145		break;
146
147	case 0x5B:
148		if (dbl) {
149			if (simd)
150				inst->op = cvtps2dq;
151			else
152				return 0;
153		} else {
154			inst->op = (simd)? cvtdq2ps : cvttps2dq;
155		}
156		break;
157
158	case 0x5C:
159		inst->op = (int)subss + simd + dbl;
160		break;
161
162	case 0x5D:
163		inst->op = (int)minss + simd + dbl;
164		break;
165
166	case 0x5E:
167		inst->op = (int)divss + simd + dbl;
168		break;
169
170	case 0x5F:
171		inst->op = (int)maxss + simd + dbl;
172		break;
173
174	case 0xC2:
175		inst->op = (int)cmpss + simd + dbl;
176		break;
177
178	case 0xE6:
179		if (simd) {
180			if (dbl)
181				inst->op = cvttpd2dq;
182			else
183				return 0;
184		} else {
185			inst->op = (dbl)? cvtpd2dq : cvtdq2pd;
186		}
187		break;
188
189	default:
190		return 0;
191	}
192
193	/* locate operands */
194	modrm = ip[i++];
195
196	if (inst->op == cvtss2si || inst->op == cvttss2si ||
197	    inst->op == cvtsd2si || inst->op == cvttsd2si ||
198	    inst->op == cvtss2siq || inst->op == cvttss2siq ||
199	    inst->op == cvtsd2siq || inst->op == cvttsd2siq) {
200		/* op1 is a gp register */
201		r = ((rex & 4) << 1) | ((modrm >> 3) & 7);
202		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.gregs[regno(r)];
203	} else if (inst->op == cvtps2pi || inst->op == cvttps2pi ||
204	    inst->op == cvtpd2pi || inst->op == cvttpd2pi) {
205		/* op1 is a mmx register */
206#ifdef __amd64
207		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.fpregs.fp_reg_set.
208		    fpchip_state.st[(modrm >> 3) & 7];
209#else
210		inst->op1 = (sseoperand_t *)(10 * ((modrm >> 3) & 7) +
211		    (char *)&uap->uc_mcontext.fpregs.fp_reg_set.
212		    fpchip_state.state[7]);
213#endif
214	} else {
215		/* op1 is a xmm register */
216		r = ((rex & 4) << 1) | ((modrm >> 3) & 7);
217		inst->op1 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
218		    fp_reg_set.fpchip_state.xmm[r];
219	}
220
221	if ((modrm >> 6) == 3) {
222		if (inst->op == cvtsi2ss || inst->op == cvtsi2sd ||
223		    inst->op == cvtsi2ssq || inst->op == cvtsi2sdq) {
224			/* op2 is a gp register */
225			r = ((rex & 1) << 3) | (modrm & 7);
226			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.
227			    gregs[regno(r)];
228		} else if (inst->op == cvtpi2ps || inst->op == cvtpi2pd) {
229			/* op2 is a mmx register */
230#ifdef __amd64
231			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
232			    fp_reg_set.fpchip_state.st[modrm & 7];
233#else
234			inst->op2 = (sseoperand_t *)(10 * (modrm & 7) +
235			    (char *)&uap->uc_mcontext.fpregs.fp_reg_set.
236			    fpchip_state.state[7]);
237#endif
238		} else {
239			/* op2 is a xmm register */
240			r = ((rex & 1) << 3) | (modrm & 7);
241			inst->op2 = (sseoperand_t *)&uap->uc_mcontext.fpregs.
242			    fp_reg_set.fpchip_state.xmm[r];
243		}
244	} else if ((modrm & 0xc7) == 0x05) {
245#ifdef __amd64
246		/* address of next instruction + offset */
247		r = i + 4;
248		if (inst->op == cmpss || inst->op == cmpps ||
249		    inst->op == cmpsd || inst->op == cmppd)
250			r++;
251		inst->op2 = (sseoperand_t *)(ip + r + *(int *)(ip + i));
252#else
253		/* absolute address */
254		inst->op2 = (sseoperand_t *)(*(int *)(ip + i));
255#endif
256		i += 4;
257	} else {
258		/* complex address */
259		if ((modrm & 7) == 4) {
260			/* parse sib byte */
261			sib = ip[i++];
262			if ((sib & 7) == 5 && (modrm >> 6) == 0) {
263				/* start with absolute address */
264				addr = (char *)(uintptr_t)(*(int *)(ip + i));
265				i += 4;
266			} else {
267				/* start with base */
268				r = ((rex & 1) << 3) | (sib & 7);
269				addr = (char *)uap->uc_mcontext.gregs[regno(r)];
270			}
271			r = ((rex & 2) << 2) | ((sib >> 3) & 7);
272			if (r != 4) {
273				/* add scaled index */
274				addr += uap->uc_mcontext.gregs[regno(r)]
275				    << (sib >> 6);
276			}
277		} else {
278			r = ((rex & 1) << 3) | (modrm & 7);
279			addr = (char *)uap->uc_mcontext.gregs[regno(r)];
280		}
281
282		/* add displacement, if any */
283		if ((modrm >> 6) == 1) {
284			addr += (char)ip[i++];
285		} else if ((modrm >> 6) == 2) {
286			addr += *(int *)(ip + i);
287			i += 4;
288		}
289		inst->op2 = (sseoperand_t *)addr;
290	}
291
292	if (inst->op == cmpss || inst->op == cmpps || inst->op == cmpsd ||
293	    inst->op == cmppd) {
294		/* get the immediate operand */
295		inst->imm = ip[i++];
296	}
297
298	return i;
299}
300
301static enum fp_class_type
302my_fp_classf(float *x)
303{
304	int	i = *(int *)x & ~0x80000000;
305
306	if (i < 0x7f800000) {
307		if (i < 0x00800000)
308			return ((i == 0)? fp_zero : fp_subnormal);
309		return fp_normal;
310	}
311	else if (i == 0x7f800000)
312		return fp_infinity;
313	else if (i & 0x400000)
314		return fp_quiet;
315	else
316		return fp_signaling;
317}
318
319static enum fp_class_type
320my_fp_class(double *x)
321{
322	int	i = *(1+(int *)x) & ~0x80000000;
323
324	if (i < 0x7ff00000) {
325		if (i < 0x00100000)
326			return (((i | *(int *)x) == 0)? fp_zero : fp_subnormal);
327		return fp_normal;
328	}
329	else if (i == 0x7ff00000 && *(int *)x == 0)
330		return fp_infinity;
331	else if (i & 0x80000)
332		return fp_quiet;
333	else
334		return fp_signaling;
335}
336
337/*
338 * Inspect a scalar SSE instruction that incurred an invalid operation
339 * exception to determine which type of exception it was.
340 */
341static enum fex_exception
342__fex_get_sse_invalid_type(sseinst_t *inst)
343{
344	enum fp_class_type	t1, t2;
345
346	/* check op2 for signaling nan */
347	t2 = ((int)inst->op & DOUBLE)? my_fp_class(&inst->op2->d[0]) :
348	    my_fp_classf(&inst->op2->f[0]);
349	if (t2 == fp_signaling)
350		return fex_inv_snan;
351
352	/* eliminate all single-operand instructions */
353	switch (inst->op) {
354	case cvtsd2ss:
355	case cvtss2sd:
356		/* hmm, this shouldn't have happened */
357		return (enum fex_exception) -1;
358
359	case sqrtss:
360	case sqrtsd:
361		return fex_inv_sqrt;
362
363	case cvtss2si:
364	case cvtsd2si:
365	case cvttss2si:
366	case cvttsd2si:
367	case cvtss2siq:
368	case cvtsd2siq:
369	case cvttss2siq:
370	case cvttsd2siq:
371		return fex_inv_int;
372	default:
373		break;
374	}
375
376	/* check op1 for signaling nan */
377	t1 = ((int)inst->op & DOUBLE)? my_fp_class(&inst->op1->d[0]) :
378	    my_fp_classf(&inst->op1->f[0]);
379	if (t1 == fp_signaling)
380		return fex_inv_snan;
381
382	/* check two-operand instructions for other cases */
383	switch (inst->op) {
384	case cmpss:
385	case cmpsd:
386	case minss:
387	case minsd:
388	case maxss:
389	case maxsd:
390	case comiss:
391	case comisd:
392		return fex_inv_cmp;
393
394	case addss:
395	case addsd:
396	case subss:
397	case subsd:
398		if (t1 == fp_infinity && t2 == fp_infinity)
399			return fex_inv_isi;
400		break;
401
402	case mulss:
403	case mulsd:
404		if ((t1 == fp_zero && t2 == fp_infinity) ||
405		    (t2 == fp_zero && t1 == fp_infinity))
406			return fex_inv_zmi;
407		break;
408
409	case divss:
410	case divsd:
411		if (t1 == fp_zero && t2 == fp_zero)
412			return fex_inv_zdz;
413		if (t1 == fp_infinity && t2 == fp_infinity)
414			return fex_inv_idi;
415	default:
416		break;
417	}
418
419	return (enum fex_exception)-1;
420}
421
/*
 * inline templates
 *
 * Each routine takes pointers to its source operand(s) first and, where
 * it produces a value, a pointer to the destination last.  The 64-bit
 * integer conversions are declared on amd64 only.
 */

/* single precision */
extern void sse_cmpeqss(float *, float *, int *);
extern void sse_cmpltss(float *, float *, int *);
extern void sse_cmpless(float *, float *, int *);
extern void sse_cmpunordss(float *, float *, int *);
extern void sse_minss(float *, float *, float *);
extern void sse_maxss(float *, float *, float *);
extern void sse_addss(float *, float *, float *);
extern void sse_subss(float *, float *, float *);
extern void sse_mulss(float *, float *, float *);
extern void sse_divss(float *, float *, float *);
extern void sse_sqrtss(float *, float *);
extern void sse_ucomiss(float *, float *);
extern void sse_comiss(float *, float *);
extern void sse_cvtss2sd(float *, double *);
extern void sse_cvtsi2ss(int *, float *);
extern void sse_cvttss2si(float *, int *);
extern void sse_cvtss2si(float *, int *);
#ifdef __amd64
extern void sse_cvtsi2ssq(long long *, float *);
extern void sse_cvttss2siq(float *, long long *);
extern void sse_cvtss2siq(float *, long long *);
#endif

/* double precision */
extern void sse_cmpeqsd(double *, double *, long long *);
extern void sse_cmpltsd(double *, double *, long long *);
extern void sse_cmplesd(double *, double *, long long *);
extern void sse_cmpunordsd(double *, double *, long long *);
extern void sse_minsd(double *, double *, double *);
extern void sse_maxsd(double *, double *, double *);
extern void sse_addsd(double *, double *, double *);
extern void sse_subsd(double *, double *, double *);
extern void sse_mulsd(double *, double *, double *);
extern void sse_divsd(double *, double *, double *);
extern void sse_sqrtsd(double *, double *);
extern void sse_ucomisd(double *, double *);
extern void sse_comisd(double *, double *);
extern void sse_cvtsd2ss(double *, float *);
extern void sse_cvtsi2sd(int *, double *);
extern void sse_cvttsd2si(double *, int *);
extern void sse_cvtsd2si(double *, int *);
#ifdef __amd64
extern void sse_cvtsi2sdq(long long *, double *);
extern void sse_cvttsd2siq(double *, long long *);
extern void sse_cvtsd2siq(double *, long long *);
#endif
467
468/*
469 * Fill in *info with the operands, default untrapped result, and
470 * flags produced by a scalar SSE instruction, and return the type
471 * of trapped exception (if any).  On entry, the mxcsr must have
472 * all exceptions masked and all flags clear.  The same conditions
473 * will hold on exit.
474 *
475 * This routine does not work if the instruction specified by *inst
476 * is not a scalar instruction.
477 */
478enum fex_exception
479__fex_get_sse_op(ucontext_t *uap, sseinst_t *inst, fex_info_t *info)
480{
481	unsigned int	e, te, mxcsr, oldmxcsr, subnorm;
482
483	/*
484	 * Perform the operation with traps disabled and check the
485	 * exception flags.  If the underflow trap was enabled, also
486	 * check for an exact subnormal result.
487	 */
488	__fenv_getmxcsr(&oldmxcsr);
489	subnorm = 0;
490	if ((int)inst->op & DOUBLE) {
491		if (inst->op == cvtsi2sd) {
492			info->op1.type = fex_int;
493			info->op1.val.i = inst->op2->i[0];
494			info->op2.type = fex_nodata;
495		} else if (inst->op == cvtsi2sdq) {
496			info->op1.type = fex_llong;
497			info->op1.val.l = inst->op2->l[0];
498			info->op2.type = fex_nodata;
499		} else if (inst->op == sqrtsd || inst->op == cvtsd2ss ||
500		    inst->op == cvttsd2si || inst->op == cvtsd2si ||
501		    inst->op == cvttsd2siq || inst->op == cvtsd2siq) {
502			info->op1.type = fex_double;
503			info->op1.val.d = inst->op2->d[0];
504			info->op2.type = fex_nodata;
505		} else {
506			info->op1.type = fex_double;
507			info->op1.val.d = inst->op1->d[0];
508			info->op2.type = fex_double;
509			info->op2.val.d = inst->op2->d[0];
510		}
511		info->res.type = fex_double;
512		switch (inst->op) {
513		case cmpsd:
514			info->op = fex_cmp;
515			info->res.type = fex_llong;
516			switch (inst->imm & 3) {
517			case 0:
518				sse_cmpeqsd(&info->op1.val.d, &info->op2.val.d,
519				    &info->res.val.l);
520				break;
521
522			case 1:
523				sse_cmpltsd(&info->op1.val.d, &info->op2.val.d,
524				    &info->res.val.l);
525				break;
526
527			case 2:
528				sse_cmplesd(&info->op1.val.d, &info->op2.val.d,
529				    &info->res.val.l);
530				break;
531
532			case 3:
533				sse_cmpunordsd(&info->op1.val.d,
534				    &info->op2.val.d, &info->res.val.l);
535			}
536			if (inst->imm & 4)
537				info->res.val.l ^= 0xffffffffffffffffull;
538			break;
539
540		case minsd:
541			info->op = fex_other;
542			sse_minsd(&info->op1.val.d, &info->op2.val.d,
543			    &info->res.val.d);
544			break;
545
546		case maxsd:
547			info->op = fex_other;
548			sse_maxsd(&info->op1.val.d, &info->op2.val.d,
549			    &info->res.val.d);
550			break;
551
552		case addsd:
553			info->op = fex_add;
554			sse_addsd(&info->op1.val.d, &info->op2.val.d,
555			    &info->res.val.d);
556			if (my_fp_class(&info->res.val.d) == fp_subnormal)
557				subnorm = 1;
558			break;
559
560		case subsd:
561			info->op = fex_sub;
562			sse_subsd(&info->op1.val.d, &info->op2.val.d,
563			    &info->res.val.d);
564			if (my_fp_class(&info->res.val.d) == fp_subnormal)
565				subnorm = 1;
566			break;
567
568		case mulsd:
569			info->op = fex_mul;
570			sse_mulsd(&info->op1.val.d, &info->op2.val.d,
571			    &info->res.val.d);
572			if (my_fp_class(&info->res.val.d) == fp_subnormal)
573				subnorm = 1;
574			break;
575
576		case divsd:
577			info->op = fex_div;
578			sse_divsd(&info->op1.val.d, &info->op2.val.d,
579			    &info->res.val.d);
580			if (my_fp_class(&info->res.val.d) == fp_subnormal)
581				subnorm = 1;
582			break;
583
584		case sqrtsd:
585			info->op = fex_sqrt;
586			sse_sqrtsd(&info->op1.val.d, &info->res.val.d);
587			break;
588
589		case cvtsd2ss:
590			info->op = fex_cnvt;
591			info->res.type = fex_float;
592			sse_cvtsd2ss(&info->op1.val.d, &info->res.val.f);
593			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
594				subnorm = 1;
595			break;
596
597		case cvtsi2sd:
598			info->op = fex_cnvt;
599			sse_cvtsi2sd(&info->op1.val.i, &info->res.val.d);
600			break;
601
602		case cvttsd2si:
603			info->op = fex_cnvt;
604			info->res.type = fex_int;
605			sse_cvttsd2si(&info->op1.val.d, &info->res.val.i);
606			break;
607
608		case cvtsd2si:
609			info->op = fex_cnvt;
610			info->res.type = fex_int;
611			sse_cvtsd2si(&info->op1.val.d, &info->res.val.i);
612			break;
613
614#ifdef __amd64
615		case cvtsi2sdq:
616			info->op = fex_cnvt;
617			sse_cvtsi2sdq(&info->op1.val.l, &info->res.val.d);
618			break;
619
620		case cvttsd2siq:
621			info->op = fex_cnvt;
622			info->res.type = fex_llong;
623			sse_cvttsd2siq(&info->op1.val.d, &info->res.val.l);
624			break;
625
626		case cvtsd2siq:
627			info->op = fex_cnvt;
628			info->res.type = fex_llong;
629			sse_cvtsd2siq(&info->op1.val.d, &info->res.val.l);
630			break;
631#endif
632
633		case ucomisd:
634			info->op = fex_cmp;
635			info->res.type = fex_nodata;
636			sse_ucomisd(&info->op1.val.d, &info->op2.val.d);
637			break;
638
639		case comisd:
640			info->op = fex_cmp;
641			info->res.type = fex_nodata;
642			sse_comisd(&info->op1.val.d, &info->op2.val.d);
643			break;
644		default:
645			break;
646		}
647	} else {
648		if (inst->op == cvtsi2ss) {
649			info->op1.type = fex_int;
650			info->op1.val.i = inst->op2->i[0];
651			info->op2.type = fex_nodata;
652		} else if (inst->op == cvtsi2ssq) {
653			info->op1.type = fex_llong;
654			info->op1.val.l = inst->op2->l[0];
655			info->op2.type = fex_nodata;
656		} else if (inst->op == sqrtss || inst->op == cvtss2sd ||
657		    inst->op == cvttss2si || inst->op == cvtss2si ||
658		    inst->op == cvttss2siq || inst->op == cvtss2siq) {
659			info->op1.type = fex_float;
660			info->op1.val.f = inst->op2->f[0];
661			info->op2.type = fex_nodata;
662		} else {
663			info->op1.type = fex_float;
664			info->op1.val.f = inst->op1->f[0];
665			info->op2.type = fex_float;
666			info->op2.val.f = inst->op2->f[0];
667		}
668		info->res.type = fex_float;
669		switch (inst->op) {
670		case cmpss:
671			info->op = fex_cmp;
672			info->res.type = fex_int;
673			switch (inst->imm & 3) {
674			case 0:
675				sse_cmpeqss(&info->op1.val.f, &info->op2.val.f,
676				    &info->res.val.i);
677				break;
678
679			case 1:
680				sse_cmpltss(&info->op1.val.f, &info->op2.val.f,
681				    &info->res.val.i);
682				break;
683
684			case 2:
685				sse_cmpless(&info->op1.val.f, &info->op2.val.f,
686				    &info->res.val.i);
687				break;
688
689			case 3:
690				sse_cmpunordss(&info->op1.val.f,
691				    &info->op2.val.f, &info->res.val.i);
692			}
693			if (inst->imm & 4)
694				info->res.val.i ^= 0xffffffffu;
695			break;
696
697		case minss:
698			info->op = fex_other;
699			sse_minss(&info->op1.val.f, &info->op2.val.f,
700			    &info->res.val.f);
701			break;
702
703		case maxss:
704			info->op = fex_other;
705			sse_maxss(&info->op1.val.f, &info->op2.val.f,
706			    &info->res.val.f);
707			break;
708
709		case addss:
710			info->op = fex_add;
711			sse_addss(&info->op1.val.f, &info->op2.val.f,
712			    &info->res.val.f);
713			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
714				subnorm = 1;
715			break;
716
717		case subss:
718			info->op = fex_sub;
719			sse_subss(&info->op1.val.f, &info->op2.val.f,
720			    &info->res.val.f);
721			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
722				subnorm = 1;
723			break;
724
725		case mulss:
726			info->op = fex_mul;
727			sse_mulss(&info->op1.val.f, &info->op2.val.f,
728			    &info->res.val.f);
729			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
730				subnorm = 1;
731			break;
732
733		case divss:
734			info->op = fex_div;
735			sse_divss(&info->op1.val.f, &info->op2.val.f,
736			    &info->res.val.f);
737			if (my_fp_classf(&info->res.val.f) == fp_subnormal)
738				subnorm = 1;
739			break;
740
741		case sqrtss:
742			info->op = fex_sqrt;
743			sse_sqrtss(&info->op1.val.f, &info->res.val.f);
744			break;
745
746		case cvtss2sd:
747			info->op = fex_cnvt;
748			info->res.type = fex_double;
749			sse_cvtss2sd(&info->op1.val.f, &info->res.val.d);
750			break;
751
752		case cvtsi2ss:
753			info->op = fex_cnvt;
754			sse_cvtsi2ss(&info->op1.val.i, &info->res.val.f);
755			break;
756
757		case cvttss2si:
758			info->op = fex_cnvt;
759			info->res.type = fex_int;
760			sse_cvttss2si(&info->op1.val.f, &info->res.val.i);
761			break;
762
763		case cvtss2si:
764			info->op = fex_cnvt;
765			info->res.type = fex_int;
766			sse_cvtss2si(&info->op1.val.f, &info->res.val.i);
767			break;
768
769#ifdef __amd64
770		case cvtsi2ssq:
771			info->op = fex_cnvt;
772			sse_cvtsi2ssq(&info->op1.val.l, &info->res.val.f);
773			break;
774
775		case cvttss2siq:
776			info->op = fex_cnvt;
777			info->res.type = fex_llong;
778			sse_cvttss2siq(&info->op1.val.f, &info->res.val.l);
779			break;
780
781		case cvtss2siq:
782			info->op = fex_cnvt;
783			info->res.type = fex_llong;
784			sse_cvtss2siq(&info->op1.val.f, &info->res.val.l);
785			break;
786#endif
787
788		case ucomiss:
789			info->op = fex_cmp;
790			info->res.type = fex_nodata;
791			sse_ucomiss(&info->op1.val.f, &info->op2.val.f);
792			break;
793
794		case comiss:
795			info->op = fex_cmp;
796			info->res.type = fex_nodata;
797			sse_comiss(&info->op1.val.f, &info->op2.val.f);
798			break;
799		default:
800			break;
801		}
802	}
803	__fenv_getmxcsr(&mxcsr);
804	info->flags = mxcsr & 0x3d;
805	__fenv_setmxcsr(&oldmxcsr);
806
807	/* determine which exception would have been trapped */
808	te = ~(uap->uc_mcontext.fpregs.fp_reg_set.fpchip_state.mxcsr
809	    >> 7) & 0x3d;
810	e = mxcsr & te;
811	if (e & FE_INVALID)
812		return __fex_get_sse_invalid_type(inst);
813	if (e & FE_DIVBYZERO)
814		return fex_division;
815	if (e & FE_OVERFLOW)
816		return fex_overflow;
817	if ((e & FE_UNDERFLOW) || (subnorm && (te & FE_UNDERFLOW)))
818		return fex_underflow;
819	if (e & FE_INEXACT)
820		return fex_inexact;
821	return (enum fex_exception)-1;
822}
823
824/*
825 * Emulate a SIMD SSE instruction to determine which exceptions occur
826 * in each part.  For i = 0, 1, 2, and 3, set e[i] to indicate the
827 * trapped exception that would occur if the i-th part of the SIMD
828 * instruction were executed in isolation; set e[i] to -1 if no
829 * trapped exception would occur in this part.  Also fill in info[i]
830 * with the corresponding operands, default untrapped result, and
831 * flags.
832 *
833 * This routine does not work if the instruction specified by *inst
834 * is not a SIMD instruction.
835 */
836void
837__fex_get_simd_op(ucontext_t *uap, sseinst_t *inst, enum fex_exception *e,
838    fex_info_t *info)
839{
840	sseinst_t	dummy;
841	int		i;
842
843	e[0] = e[1] = e[2] = e[3] = -1;
844
845	/* perform each part of the SIMD operation */
846	switch (inst->op) {
847	case cmpps:
848		dummy.op = cmpss;
849		dummy.imm = inst->imm;
850		for (i = 0; i < 4; i++) {
851			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
852			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
853			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
854		}
855		break;
856
857	case minps:
858		dummy.op = minss;
859		for (i = 0; i < 4; i++) {
860			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
861			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
862			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
863		}
864		break;
865
866	case maxps:
867		dummy.op = maxss;
868		for (i = 0; i < 4; i++) {
869			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
870			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
871			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
872		}
873		break;
874
875	case addps:
876		dummy.op = addss;
877		for (i = 0; i < 4; i++) {
878			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
879			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
880			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
881		}
882		break;
883
884	case subps:
885		dummy.op = subss;
886		for (i = 0; i < 4; i++) {
887			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
888			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
889			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
890		}
891		break;
892
893	case mulps:
894		dummy.op = mulss;
895		for (i = 0; i < 4; i++) {
896			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
897			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
898			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
899		}
900		break;
901
902	case divps:
903		dummy.op = divss;
904		for (i = 0; i < 4; i++) {
905			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
906			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
907			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
908		}
909		break;
910
911	case sqrtps:
912		dummy.op = sqrtss;
913		for (i = 0; i < 4; i++) {
914			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
915			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
916			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
917		}
918		break;
919
920	case cvtdq2ps:
921		dummy.op = cvtsi2ss;
922		for (i = 0; i < 4; i++) {
923			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
924			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
925			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
926		}
927		break;
928
929	case cvttps2dq:
930		dummy.op = cvttss2si;
931		for (i = 0; i < 4; i++) {
932			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
933			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
934			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
935		}
936		break;
937
938	case cvtps2dq:
939		dummy.op = cvtss2si;
940		for (i = 0; i < 4; i++) {
941			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
942			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
943			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
944		}
945		break;
946
947	case cvtpi2ps:
948		dummy.op = cvtsi2ss;
949		for (i = 0; i < 2; i++) {
950			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
951			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
952			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
953		}
954		break;
955
956	case cvttps2pi:
957		dummy.op = cvttss2si;
958		for (i = 0; i < 2; i++) {
959			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
960			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
961			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
962		}
963		break;
964
965	case cvtps2pi:
966		dummy.op = cvtss2si;
967		for (i = 0; i < 2; i++) {
968			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
969			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
970			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
971		}
972		break;
973
974	case cmppd:
975		dummy.op = cmpsd;
976		dummy.imm = inst->imm;
977		for (i = 0; i < 2; i++) {
978			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
979			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
980			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
981		}
982		break;
983
984	case minpd:
985		dummy.op = minsd;
986		for (i = 0; i < 2; i++) {
987			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
988			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
989			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
990		}
991		break;
992
993	case maxpd:
994		dummy.op = maxsd;
995		for (i = 0; i < 2; i++) {
996			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
997			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
998			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
999		}
1000		break;
1001
1002	case addpd:
1003		dummy.op = addsd;
1004		for (i = 0; i < 2; i++) {
1005			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1006			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1007			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1008		}
1009		break;
1010
1011	case subpd:
1012		dummy.op = subsd;
1013		for (i = 0; i < 2; i++) {
1014			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1015			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1016			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1017		}
1018		break;
1019
1020	case mulpd:
1021		dummy.op = mulsd;
1022		for (i = 0; i < 2; i++) {
1023			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1024			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1025			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1026		}
1027		break;
1028
1029	case divpd:
1030		dummy.op = divsd;
1031		for (i = 0; i < 2; i++) {
1032			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1033			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1034			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1035		}
1036		break;
1037
1038	case sqrtpd:
1039		dummy.op = sqrtsd;
1040		for (i = 0; i < 2; i++) {
1041			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1042			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1043			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1044		}
1045		break;
1046
1047	case cvtpi2pd:
1048	case cvtdq2pd:
1049		dummy.op = cvtsi2sd;
1050		for (i = 0; i < 2; i++) {
1051			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1052			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1053			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1054		}
1055		break;
1056
1057	case cvttpd2pi:
1058	case cvttpd2dq:
1059		dummy.op = cvttsd2si;
1060		for (i = 0; i < 2; i++) {
1061			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1062			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1063			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1064		}
1065		break;
1066
1067	case cvtpd2pi:
1068	case cvtpd2dq:
1069		dummy.op = cvtsd2si;
1070		for (i = 0; i < 2; i++) {
1071			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1072			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1073			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1074		}
1075		break;
1076
1077	case cvtps2pd:
1078		dummy.op = cvtss2sd;
1079		for (i = 0; i < 2; i++) {
1080			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1081			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1082			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1083		}
1084		break;
1085
1086	case cvtpd2ps:
1087		dummy.op = cvtsd2ss;
1088		for (i = 0; i < 2; i++) {
1089			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1090			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1091			e[i] = __fex_get_sse_op(uap, &dummy, &info[i]);
1092		}
1093	default:
1094		break;
1095	}
1096}
1097
1098/*
1099 * Store the result value from *info in the destination of the scalar
1100 * SSE instruction specified by *inst.  If no result is given but the
1101 * exception is underflow or overflow, supply the default trapped result.
1102 *
1103 * This routine does not work if the instruction specified by *inst
1104 * is not a scalar instruction.
1105 */
1106void
1107__fex_st_sse_result(ucontext_t *uap, sseinst_t *inst, enum fex_exception e,
1108    fex_info_t *info)
1109{
1110	int		i = 0;
1111	long long	l = 0L;;
1112	float		f = 0.0, fscl;
1113	double		d = 0.0L, dscl;
1114
1115	/* for compares that write eflags, just set the flags
1116	   to indicate "unordered" */
1117	if (inst->op == ucomiss || inst->op == comiss ||
1118	    inst->op == ucomisd || inst->op == comisd) {
1119		uap->uc_mcontext.gregs[REG_PS] |= 0x45;
1120		return;
1121	}
1122
1123	/* if info doesn't specify a result value, try to generate
1124	   the default trapped result */
1125	if (info->res.type == fex_nodata) {
1126		/* set scale factors for exponent wrapping */
1127		switch (e) {
1128		case fex_overflow:
1129			fscl = 1.262177448e-29f; /* 2^-96 */
1130			dscl = 6.441148769597133308e-232; /* 2^-768 */
1131			break;
1132
1133		case fex_underflow:
1134			fscl = 7.922816251e+28f; /* 2^96 */
1135			dscl = 1.552518092300708935e+231; /* 2^768 */
1136			break;
1137
1138		default:
1139			(void) __fex_get_sse_op(uap, inst, info);
1140			if (info->res.type == fex_nodata)
1141				return;
1142			goto stuff;
1143		}
1144
1145		/* generate the wrapped result */
1146		if (inst->op == cvtsd2ss) {
1147			info->op1.type = fex_double;
1148			info->op1.val.d = inst->op2->d[0];
1149			info->op2.type = fex_nodata;
1150			info->res.type = fex_float;
1151			info->res.val.f = (float)(fscl * (fscl *
1152			    info->op1.val.d));
1153		} else if ((int)inst->op & DOUBLE) {
1154			info->op1.type = fex_double;
1155			info->op1.val.d = inst->op1->d[0];
1156			info->op2.type = fex_double;
1157			info->op2.val.d = inst->op2->d[0];
1158			info->res.type = fex_double;
1159			switch (inst->op) {
1160			case addsd:
1161				info->res.val.d = dscl * (dscl *
1162				    info->op1.val.d + dscl * info->op2.val.d);
1163				break;
1164
1165			case subsd:
1166				info->res.val.d = dscl * (dscl *
1167				    info->op1.val.d - dscl * info->op2.val.d);
1168				break;
1169
1170			case mulsd:
1171				info->res.val.d = (dscl * info->op1.val.d) *
1172				    (dscl * info->op2.val.d);
1173				break;
1174
1175			case divsd:
1176				info->res.val.d = (dscl * info->op1.val.d) /
1177				    (info->op2.val.d / dscl);
1178				break;
1179
1180			default:
1181				return;
1182			}
1183		} else {
1184			info->op1.type = fex_float;
1185			info->op1.val.f = inst->op1->f[0];
1186			info->op2.type = fex_float;
1187			info->op2.val.f = inst->op2->f[0];
1188			info->res.type = fex_float;
1189			switch (inst->op) {
1190			case addss:
1191				info->res.val.f = fscl * (fscl *
1192				    info->op1.val.f + fscl * info->op2.val.f);
1193				break;
1194
1195			case subss:
1196				info->res.val.f = fscl * (fscl *
1197				    info->op1.val.f - fscl * info->op2.val.f);
1198				break;
1199
1200			case mulss:
1201				info->res.val.f = (fscl * info->op1.val.f) *
1202				    (fscl * info->op2.val.f);
1203				break;
1204
1205			case divss:
1206				info->res.val.f = (fscl * info->op1.val.f) /
1207				    (info->op2.val.f / fscl);
1208				break;
1209
1210			default:
1211				return;
1212			}
1213		}
1214	}
1215
1216	/* put the result in the destination */
1217stuff:
1218	if (inst->op == cmpss || inst->op == cvttss2si || inst->op == cvtss2si
1219	    || inst->op == cvttsd2si || inst->op == cvtsd2si) {
1220		switch (info->res.type) {
1221		case fex_int:
1222			i = info->res.val.i;
1223			break;
1224
1225		case fex_llong:
1226			i = info->res.val.l;
1227			break;
1228
1229		case fex_float:
1230			i = info->res.val.f;
1231			break;
1232
1233		case fex_double:
1234			i = info->res.val.d;
1235			break;
1236
1237		case fex_ldouble:
1238			i = info->res.val.q;
1239			break;
1240
1241		default:
1242			break;
1243		}
1244		inst->op1->i[0] = i;
1245	} else if (inst->op == cmpsd || inst->op == cvttss2siq ||
1246	    inst->op == cvtss2siq || inst->op == cvttsd2siq ||
1247	    inst->op == cvtsd2siq) {
1248		switch (info->res.type) {
1249		case fex_int:
1250			l = info->res.val.i;
1251			break;
1252
1253		case fex_llong:
1254			l = info->res.val.l;
1255			break;
1256
1257		case fex_float:
1258			l = info->res.val.f;
1259			break;
1260
1261		case fex_double:
1262			l = info->res.val.d;
1263			break;
1264
1265		case fex_ldouble:
1266			l = info->res.val.q;
1267			break;
1268
1269		default:
1270			break;
1271		}
1272		inst->op1->l[0] = l;
1273	} else if ((((int)inst->op & DOUBLE) && inst->op != cvtsd2ss) ||
1274	    inst->op == cvtss2sd) {
1275		switch (info->res.type) {
1276		case fex_int:
1277			d = info->res.val.i;
1278			break;
1279
1280		case fex_llong:
1281			d = info->res.val.l;
1282			break;
1283
1284		case fex_float:
1285			d = info->res.val.f;
1286			break;
1287
1288		case fex_double:
1289			d = info->res.val.d;
1290			break;
1291
1292		case fex_ldouble:
1293			d = info->res.val.q;
1294			break;
1295
1296		default:
1297			break;
1298		}
1299		inst->op1->d[0] = d;
1300	} else {
1301		switch (info->res.type) {
1302		case fex_int:
1303			f = info->res.val.i;
1304			break;
1305
1306		case fex_llong:
1307			f = info->res.val.l;
1308			break;
1309
1310		case fex_float:
1311			f = info->res.val.f;
1312			break;
1313
1314		case fex_double:
1315			f = info->res.val.d;
1316			break;
1317
1318		case fex_ldouble:
1319			f = info->res.val.q;
1320			break;
1321
1322		default:
1323			break;
1324		}
1325		inst->op1->f[0] = f;
1326	}
1327}
1328
1329/*
1330 * Store the results from a SIMD instruction.  For each i, store
1331 * the result value from info[i] in the i-th part of the destination
1332 * of the SIMD SSE instruction specified by *inst.  If no result
1333 * is given but the exception indicated by e[i] is underflow or
1334 * overflow, supply the default trapped result.
1335 *
1336 * This routine does not work if the instruction specified by *inst
1337 * is not a SIMD instruction.
1338 */
1339void
1340__fex_st_simd_result(ucontext_t *uap, sseinst_t *inst, enum fex_exception *e,
1341    fex_info_t *info)
1342{
1343	sseinst_t	dummy;
1344	int		i;
1345
1346	/* store each part */
1347	switch (inst->op) {
1348	case cmpps:
1349		dummy.op = cmpss;
1350		dummy.imm = inst->imm;
1351		for (i = 0; i < 4; i++) {
1352			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1353			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1354			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1355		}
1356		break;
1357
1358	case minps:
1359		dummy.op = minss;
1360		for (i = 0; i < 4; i++) {
1361			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1362			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1363			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1364		}
1365		break;
1366
1367	case maxps:
1368		dummy.op = maxss;
1369		for (i = 0; i < 4; i++) {
1370			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1371			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1372			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1373		}
1374		break;
1375
1376	case addps:
1377		dummy.op = addss;
1378		for (i = 0; i < 4; i++) {
1379			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1380			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1381			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1382		}
1383		break;
1384
1385	case subps:
1386		dummy.op = subss;
1387		for (i = 0; i < 4; i++) {
1388			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1389			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1390			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1391		}
1392		break;
1393
1394	case mulps:
1395		dummy.op = mulss;
1396		for (i = 0; i < 4; i++) {
1397			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1398			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1399			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1400		}
1401		break;
1402
1403	case divps:
1404		dummy.op = divss;
1405		for (i = 0; i < 4; i++) {
1406			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1407			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1408			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1409		}
1410		break;
1411
1412	case sqrtps:
1413		dummy.op = sqrtss;
1414		for (i = 0; i < 4; i++) {
1415			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1416			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1417			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1418		}
1419		break;
1420
1421	case cvtdq2ps:
1422		dummy.op = cvtsi2ss;
1423		for (i = 0; i < 4; i++) {
1424			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1425			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1426			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1427		}
1428		break;
1429
1430	case cvttps2dq:
1431		dummy.op = cvttss2si;
1432		for (i = 0; i < 4; i++) {
1433			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1434			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1435			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1436		}
1437		break;
1438
1439	case cvtps2dq:
1440		dummy.op = cvtss2si;
1441		for (i = 0; i < 4; i++) {
1442			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1443			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1444			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1445		}
1446		break;
1447
1448	case cvtpi2ps:
1449		dummy.op = cvtsi2ss;
1450		for (i = 0; i < 2; i++) {
1451			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1452			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1453			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1454		}
1455		break;
1456
1457	case cvttps2pi:
1458		dummy.op = cvttss2si;
1459		for (i = 0; i < 2; i++) {
1460			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1461			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1462			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1463		}
1464		break;
1465
1466	case cvtps2pi:
1467		dummy.op = cvtss2si;
1468		for (i = 0; i < 2; i++) {
1469			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1470			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1471			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1472		}
1473		break;
1474
1475	case cmppd:
1476		dummy.op = cmpsd;
1477		dummy.imm = inst->imm;
1478		for (i = 0; i < 2; i++) {
1479			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1480			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1481			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1482		}
1483		break;
1484
1485	case minpd:
1486		dummy.op = minsd;
1487		for (i = 0; i < 2; i++) {
1488			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1489			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1490			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1491		}
1492		break;
1493
1494	case maxpd:
1495		dummy.op = maxsd;
1496		for (i = 0; i < 2; i++) {
1497			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1498			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1499			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1500		}
1501		break;
1502
1503	case addpd:
1504		dummy.op = addsd;
1505		for (i = 0; i < 2; i++) {
1506			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1507			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1508			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1509		}
1510		break;
1511
1512	case subpd:
1513		dummy.op = subsd;
1514		for (i = 0; i < 2; i++) {
1515			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1516			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1517			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1518		}
1519		break;
1520
1521	case mulpd:
1522		dummy.op = mulsd;
1523		for (i = 0; i < 2; i++) {
1524			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1525			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1526			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1527		}
1528		break;
1529
1530	case divpd:
1531		dummy.op = divsd;
1532		for (i = 0; i < 2; i++) {
1533			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1534			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1535			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1536		}
1537		break;
1538
1539	case sqrtpd:
1540		dummy.op = sqrtsd;
1541		for (i = 0; i < 2; i++) {
1542			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1543			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1544			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1545		}
1546		break;
1547
1548	case cvtpi2pd:
1549	case cvtdq2pd:
1550		dummy.op = cvtsi2sd;
1551		for (i = 0; i < 2; i++) {
1552			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1553			dummy.op2 = (sseoperand_t *)&inst->op2->i[i];
1554			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1555		}
1556		break;
1557
1558	case cvttpd2pi:
1559	case cvttpd2dq:
1560		dummy.op = cvttsd2si;
1561		for (i = 0; i < 2; i++) {
1562			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1563			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1564			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1565		}
1566		/* for cvttpd2dq, zero the high 64 bits of the destination */
1567		if (inst->op == cvttpd2dq)
1568			inst->op1->l[1] = 0ll;
1569		break;
1570
1571	case cvtpd2pi:
1572	case cvtpd2dq:
1573		dummy.op = cvtsd2si;
1574		for (i = 0; i < 2; i++) {
1575			dummy.op1 = (sseoperand_t *)&inst->op1->i[i];
1576			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1577			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1578		}
1579		/* for cvtpd2dq, zero the high 64 bits of the destination */
1580		if (inst->op == cvtpd2dq)
1581			inst->op1->l[1] = 0ll;
1582		break;
1583
1584	case cvtps2pd:
1585		dummy.op = cvtss2sd;
1586		for (i = 0; i < 2; i++) {
1587			dummy.op1 = (sseoperand_t *)&inst->op1->d[i];
1588			dummy.op2 = (sseoperand_t *)&inst->op2->f[i];
1589			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1590		}
1591		break;
1592
1593	case cvtpd2ps:
1594		dummy.op = cvtsd2ss;
1595		for (i = 0; i < 2; i++) {
1596			dummy.op1 = (sseoperand_t *)&inst->op1->f[i];
1597			dummy.op2 = (sseoperand_t *)&inst->op2->d[i];
1598			__fex_st_sse_result(uap, &dummy, e[i], &info[i]);
1599		}
1600		/* zero the high 64 bits of the destination */
1601		inst->op1->l[1] = 0ll;
1602
1603	default:
1604		break;
1605	}
1606}
1607
1608