1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef	_MD5_BYTESWAP_H
28 #define	_MD5_BYTESWAP_H
29 
30 /*
31  * definitions for inline functions for little-endian loads.
32  *
33  * This file has special definitions for UltraSPARC architectures,
34  * which have a special address space identifier for loading 32 and 16 bit
35  * integers in little-endian byte order.
36  */
37 
38 #include <sys/types.h>
39 #if defined(__sparc)
40 #include <v9/sys/asi.h>
41 #elif defined(_LITTLE_ENDIAN)
42 #include <sys/byteorder.h>
43 #endif
44 
45 #ifdef	__cplusplus
46 extern "C" {
47 #endif
48 
49 #if defined(_LITTLE_ENDIAN)
50 
/*
 * Little-endian optimization:  no byte swapping is needed.  On some
 * little-endian machines alignment checks are still required; whether
 * they are is recorded by _MD5_CHECK_ALIGNMENT below.
 */
56 
/*
 * i386 and amd64 don't require aligned 4-byte loads, so they are the only
 * targets left out here.  Everywhere else on this little-endian path the
 * symbol _MD5_CHECK_ALIGNMENT is defined; it indicates below whether the
 * MD5Transform function requires alignment checking.
 */
#if !(defined(__i386) || defined(__amd64))
#define	_MD5_CHECK_ALIGNMENT
#endif /* !(__i386 || __amd64) */

/*
 * Little-endian 32-bit load: read the 32-bit word at addr directly.
 * addr may be any pointer type; it is converted to uint32_t * via void *.
 */
#define	LOAD_LITTLE_32(addr)	(((uint32_t *)(void *)(addr))[0])
67 
68 #else	/* !_LITTLE_ENDIAN */
69 
70 /*
71  * sparc v9/v8plus optimization:
72  *
73  * on the sparc v9/v8plus, we can load data little endian.  however, since
74  * the compiler doesn't have direct support for little endian, we
75  * link to an assembly-language routine `load_little_32' to do
76  * the magic.  note that special care must be taken to ensure the
77  * address is 32-bit aligned -- in the interest of speed, we don't
78  * check to make sure, since careful programming can guarantee this
79  * for us.
80  */
81 #if defined(sun4u)
82 
/*
 * LOAD_LITTLE_32 hands the address, converted to uint32_t *, to
 * load_little_32().  Because that is a 4-byte little-endian load,
 * _MD5_CHECK_ALIGNMENT is defined to request alignment checking.
 */
#define	LOAD_LITTLE_32(addr)    load_little_32((uint32_t *)(void *)(addr))
#define	_MD5_CHECK_ALIGNMENT
86 
87 #if !defined(__lint) && defined(__GNUC__)
88 
89 static __inline__ uint32_t
90 load_little_32(uint32_t *addr)
91 {
92 	uint32_t value;
93 
94 	__asm__(
95 	    "lduwa	[%1] %2, %0\n\t"
96 	    : "=r" (value)
97 	    : "r" (addr), "i" (ASI_PL));
98 
99 	return (value);
100 }
101 #endif	/* !__lint && __GNUC__ */
102 
103 #if !defined(__GNUC__)
/* Prototype for load_little_32() when __GNUC__ is not defined. */
extern	uint32_t load_little_32(uint32_t *addr);
105 #endif	/* !__GNUC__ */
106 
107 /* Placate lint */
108 #if defined(__lint)
/*
 * Stand-in definition of load_little_32() that is compiled only when
 * __lint is defined: an ordinary read of the word at addr.
 */
uint32_t
load_little_32(uint32_t *addr)
{
	uint32_t word = addr[0];

	return (word);
}
114 #endif	/* __lint */
115 
116 #elif defined(_LITTLE_ENDIAN)
/*
 * NOTE(review): this arm is nested inside the !_LITTLE_ENDIAN half of the
 * enclosing conditional, so as the file stands it cannot be selected.
 *
 * Should it ever become reachable, a little-endian host needs no byte
 * swapping: the little-endian load is a plain read of the 32-bit word at
 * addr, equivalent to the definition in the little-endian branch of this
 * header.  The value loaded is the word addr points at, never the pointer
 * itself.
 */
#define	LOAD_LITTLE_32(addr)	(*(uint32_t *)(void *)(addr))
118 
119 #else
/*
 * Portable fallback -- assembles the little-endian word one byte at a
 * time.  It gives the right answer on little-endian hosts too, only more
 * slowly than a word load.  Since only single bytes are read, no alignment
 * check is needed.
 *
 * addr is viewed as a pointer to unsigned bytes and each byte is widened
 * to uint32_t before it is shifted.  That keeps a top byte >= 0x80 from
 * being shifted into the sign bit of an int, keeps bytes read through a
 * signed char pointer from sign-extending, and makes the result a uint32_t
 * like every other LOAD_LITTLE_32 definition.
 *
 * Note that addr is evaluated four times.
 */
#define	LOAD_LITTLE_32(addr)	\
	((uint32_t)((const uint8_t *)(const void *)(addr))[0] | \
	((uint32_t)((const uint8_t *)(const void *)(addr))[1] << 8) | \
	((uint32_t)((const uint8_t *)(const void *)(addr))[2] << 16) | \
	((uint32_t)((const uint8_t *)(const void *)(addr))[3] << 24))
124 #endif	/* sun4u */
125 
126 #if defined(sun4v)
127 
/*
 * For N1 we want to minimize the number of arithmetic operations.  This is
 * best achieved by using the %asi register to specify the ASI for the lduwa
 * operations.  Also, there is a separate inline template for each word, so
 * that the immediate offset in lduwa can be used without relying on the
 * compiler to do the right thing.
 *
 * Moving to 64-bit loads might also be beneficial.
 */
/*
 * One macro per word index, 0 through f.  Each converts addr to
 * uint32_t * (via void *, as the sun4u LOAD_LITTLE_32 does) and passes it
 * to the load_little_32_<index>() routine of the same suffix.
 */
#define	LOAD_LITTLE_32_0(addr)	load_little_32_0((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_1(addr)	load_little_32_1((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_2(addr)	load_little_32_2((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_3(addr)	load_little_32_3((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_4(addr)	load_little_32_4((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_5(addr)	load_little_32_5((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_6(addr)	load_little_32_6((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_7(addr)	load_little_32_7((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_8(addr)	load_little_32_8((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_9(addr)	load_little_32_9((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_a(addr)	load_little_32_a((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_b(addr)	load_little_32_b((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_c(addr)	load_little_32_c((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_d(addr)	load_little_32_d((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_e(addr)	load_little_32_e((uint32_t *)(void *)(addr))
#define	LOAD_LITTLE_32_f(addr)	load_little_32_f((uint32_t *)(void *)(addr))
153 
154 #if !defined(__lint) && defined(__GNUC__)
155 
/*
 * Write the value asi into the %asi register.
 *
 * Despite the name, this sets the ASI register to whatever value is
 * passed in, not necessarily to ASI_PL.
 *
 * The asm statement has no output operands and exists purely for its
 * effect on %asi, hence __volatile__.
 */
static __inline__ void
set_little(uint8_t asi)
{
	__asm__ __volatile__(
	    "wr	%%g0, %0, %%asi\n\t"
	    : /* Nothing */
	    : "r" (asi));
}
167 
/*
 * Read the %asi register and return its contents.  The read is done with
 * a volatile asm statement.
 */
static __inline__ uint8_t
get_little(void)
{
	uint8_t cur_asi;

	__asm__ __volatile__(
	    "rd	%%asi, %0\n\t"
	    : "=r" (cur_asi));

	return (cur_asi);
}
179 
/*
 * We have 16 functions which differ only in the offset from which they
 * load.  Use this preprocessor template to simplify maintenance.  Its
 * argument is the word index in hex, without the 0x.
 *
 * Each generated load_little_32_<index>(addr) issues one lduwa from
 * addr + (index << 2) bytes, i.e. the word addr[index], using the address
 * space identifier currently held in %asi, and returns the loaded value.
 * The template does not set %asi itself.
 *
 * The "m" (addr[index]) input is not referenced by the instruction
 * template.  It tells the compiler that the asm reads that word, so the
 * load is kept ordered after any earlier stores to it.  Without it the
 * compiler only knows that the pointer value itself is consumed.
 */
#define	LL_TEMPLATE(__off)			\
static __inline__ uint32_t			\
load_little_32_##__off(uint32_t *addr)		\
{						\
	uint32_t value;				\
	__asm__(				\
		"lduwa	[%1 + %2]%%asi, %0\n\t"	\
	: "=r" (value)				\
	: "r" (addr), "i" ((0x##__off) << 2),	\
	    "m" (addr[0x##__off]));		\
	return (value);				\
}

/* BEGIN CSTYLED */
LL_TEMPLATE(0)
LL_TEMPLATE(1)
LL_TEMPLATE(2)
LL_TEMPLATE(3)
LL_TEMPLATE(4)
LL_TEMPLATE(5)
LL_TEMPLATE(6)
LL_TEMPLATE(7)
LL_TEMPLATE(8)
LL_TEMPLATE(9)
LL_TEMPLATE(a)
LL_TEMPLATE(b)
LL_TEMPLATE(c)
LL_TEMPLATE(d)
LL_TEMPLATE(e)
LL_TEMPLATE(f)
/* END CSTYLED */
#undef	LL_TEMPLATE
216 
217 #endif	/* !__lint && __GNUC__ */
218 
219 #if !defined(__GNUC__)
/*
 * Using the %asi register to achieve little-endian loads - the register
 * is set using an inline template.
 *
 * This saves a few arithmetic ops, as an immediate offset can now be used
 * with the lduwa instructions.
 */
/* %asi accessors: write a value to the register / read it back. */
extern void set_little(uint32_t asi);
extern uint32_t get_little(void);

/* One loader per word index, 0 through f. */
extern	uint32_t load_little_32_0(uint32_t *addr);
extern	uint32_t load_little_32_1(uint32_t *addr);
extern	uint32_t load_little_32_2(uint32_t *addr);
extern	uint32_t load_little_32_3(uint32_t *addr);
extern	uint32_t load_little_32_4(uint32_t *addr);
extern	uint32_t load_little_32_5(uint32_t *addr);
extern	uint32_t load_little_32_6(uint32_t *addr);
extern	uint32_t load_little_32_7(uint32_t *addr);
extern	uint32_t load_little_32_8(uint32_t *addr);
extern	uint32_t load_little_32_9(uint32_t *addr);
extern	uint32_t load_little_32_a(uint32_t *addr);
extern	uint32_t load_little_32_b(uint32_t *addr);
extern	uint32_t load_little_32_c(uint32_t *addr);
extern	uint32_t load_little_32_d(uint32_t *addr);
extern	uint32_t load_little_32_e(uint32_t *addr);
extern	uint32_t load_little_32_f(uint32_t *addr);
246 #endif	/* !__GNUC__ */
247 #endif	/* sun4v */
248 
249 #endif	/* _LITTLE_ENDIAN */
250 
251 #ifdef	__cplusplus
252 }
253 #endif
254 
255 #endif	/* !_MD5_BYTESWAP_H */
256