/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)

	.file	"cp_subr.s"

/*
 * These are cloned from TSC and time related code in the kernel.  They
 * must be kept in sync should the source values change.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00

/*
 * For __cp_tsc_read calls which incur looping retries due to CPU migration,
 * this represents the maximum number of tries before bailing out.
 */
#define	TSC_READ_MAXLOOP	0x4

/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
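/*
 * A rough C sketch of the logic below (illustrative only; the field names
 * mirror the CP_* offsets used here, and rdtsc_or_rdtscp() is a stand-in
 * for the fenced TSC reads performed by the assembly):
 *
 *	hrtime_t tsc = rdtsc_or_rdtscp(cp->cp_tsc_type, &cpu_id);
 *	if (cp->cp_tsc_ncpu != 0)
 *		tsc += cp->cp_tsc_sync_tick_delta[cpu_id];
 *	return (tsc);
 *
 * In the non-RDTSCP case, the read is retried if the CPU ID changes while
 * it is in progress, bailing out with a result of 0 once TSC_READ_MAXLOOP
 * retries have been exhausted.
 */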
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx.  They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
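	/* That is: tsc = ((uint64_t)edx << 32) | eax */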
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * A zeroed cp_tsc_ncpu (currently held in r8d) indicates that no
	 * per-CPU TSC offsets are required.
	 */
	testl	%r8d, %r8d
	jnz	1f
	ret

1:
	/*
	 * A non-zero cp_tsc_ncpu gives the length of the
	 * cp_tsc_sync_tick_delta array, which contains per-CPU offsets
	 * applied to TSC readings.  The CPU ID furnished by the IA32_TSC_AUX
	 * register via rdtscp (placed in rcx) is used to look up an offset
	 * value in that array and apply it to the TSC value.
	 */
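	/* In C terms: tsc += cp->cp_tsc_sync_tick_delta[cpu_id] */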
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

2:
	/*
	 * TSC reading without RDTSCP
	 *
	 * Check if handling for per-CPU TSC offsets is required.  If not,
	 * immediately skip to the appropriate steps to perform an rdtsc.
	 *
	 * If per-CPU offsets are present, the TSC reading process is more
	 * complicated.  Without rdtscp, there is no way to simultaneously read
	 * the TSC and query the current CPU.  In order to "catch" migrations
	 * during execution, the CPU ID is queried before and after rdtsc.  The
	 * execution is repeated if the results differ, subject to a loop limit.
	 */
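	/*
	 * Sketched in C (getcpu() and fenced_rdtsc() are illustrative
	 * stand-ins for the LSL query and fenced reads below; the loop is
	 * only needed when per-CPU offsets are present):
	 *
	 *	uint_t tries = 0, cpu_id = getcpu();
	 *	for (;;) {
	 *		tsc = fenced_rdtsc(cp->cp_tsc_type);
	 *		uint_t now = getcpu();
	 *		if (now == cpu_id)
	 *			return (tsc + cp->cp_tsc_sync_tick_delta[now]);
	 *		if (tries >= TSC_READ_MAXLOOP)
	 *			return (0);
	 *		tries++;
	 *		cpu_id = now;
	 *	}
	 */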
	xorq	%r9, %r9
	testl	%r8d, %r8d
	jz	3f

	/*
	 * Load the address of the per-CPU offset array, since it is needed.
	 * The attempted loop count is kept in r8.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	xorl	%r8d, %r8d

	/* Query the CPU ID and stash it in r10 for later comparison */
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx
	movl	%edx, %r10d

3:
	cmpl	$TSC_RDTSC_MFENCE, %esi
	jne	4f
	mfence
	rdtsc
	jmp	7f

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here.  Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place.  Since callers must handle a failure from
	 * CPU migration looping, yield the same result as a bail-out: 0
	 */
	xorl	%eax, %eax
	ret

7:
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * With the TSC reading in-hand, check if any per-CPU offset handling
	 * is required.  The address of the array of deltas (r9) will not have
	 * been populated if offset handling is unnecessary.
	 */
	testq	%r9, %r9
	jnz	8f
	ret

8:
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx
	cmpl	%edx, %r10d
	jne	9f
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
	ret

9:
	/*
	 * It appears that a migration has occurred between the first CPU ID
	 * query and now.  Check whether the loop limit has been reached and
	 * retry if it has not.
	 */
	cmpl	$TSC_READ_MAXLOOP, %r8d
	jge	10f
	incl	%r8d
	movl	%edx, %r10d
	jmp	3b

10:
	/* Loop limit was reached. Return bail-out value of 0. */
	xorl	%eax, %eax
	ret

	SET_SIZE(__cp_tsc_read)


/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	/*
	 * If RDTSCP is available, it is a quick way to grab the cpu_id, which
	 * is stored in the TSC_AUX MSR by the kernel.
	 */
	cmpl	$TSC_TSCP, %edi
	jne	1f
	rdtscp
	movl	%ecx, %eax
	ret
1:
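	/*
	 * Otherwise, fall back to the LSL trick: the kernel encodes the CPU
	 * ID in the segment limit of a dedicated GDT entry, which LSL can
	 * retrieve from userspace without a syscall.
	 */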
	mov	$GETCPU_GDT_OFFSET, %eax
	lsl	%ax, %eax
	ret
	SET_SIZE(__cp_getcpu)

/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - comm_page_t *cp
 * %rsp+0x18 - int hres_lock
 */
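/*
 * A condensed C sketch of the flow below (illustrative only; the
 * authoritative logic lives in uts/i86pc/os/timestamp.c):
 *
 *	do {
 *		lock = cp->cp_hres_lock;
 *		tsc_last = cp->cp_tsc_last;
 *		base = cp->cp_tsc_hrtime_base;
 *		tsc = __cp_tsc_read(cp);	(0 means fall back to the
 *		    gethrtime fasttrap)
 *	} while ((lock & ~1) != cp->cp_hres_lock);
 *	delta = edge-case-adjusted (tsc - tsc_last);
 *	return (base + ((delta * cp->cp_nsec_scale) >> (32 - NSEC_SHIFT)));
 */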
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
1:
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read

	/*
	 * Failure is inferred from a TSC reading of 0.  The normal fasttrap
	 * mechanism can be used as a fallback in such cases.
	 */
	testq	%rax, %rax
	jz	6f

	movq	0x10(%rsp), %rdi
	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b


	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases.  That logic is summarized as:
	 * if (tsc >= tsc_last) {
	 *         delta = tsc - tsc_last;
	 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *         delta = 0;
	 * } else {
	 *         delta = MIN(tsc, tsc_resume_cap);
	 * }
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 * delta = tsc - tsc_last;
	 * if (delta < 0) {
	 *         delta += (tsc_max_delta << 1);
	 *         if (delta >= 0) {
	 *                 delta = 0;
	 *         } else {
	 *                 delta = MIN(tsc, tsc_resume_cap);
	 *         }
	 * }
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jbe	3f			/* if (delta < 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
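	/*
	 * mulq leaves the 128-bit product in %rdx:%rax; the shrdq below
	 * shifts %rax right by (32 - NSEC_SHIFT) = 27 bits while pulling in
	 * low-order bits from %rdx, yielding the full product >> 27.
	 */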
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9		/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax		/* delta += (tsc_max_delta << 1) */
	jae	4f			/* delta < 0 */
	xorq	%rax, %rax
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	5f
	movq	%rcx, %rax
5:
	jmp	2b

6:
	movl	$T_GETHRTIME, %eax
	int	$T_FASTTRAP
	addq	$0x20, %rsp
	ret

	SET_SIZE(__cp_gethrtime)

/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	subq	$0x8, %rsp
	movq	%rsi, (%rsp)

	call	__cp_gethrtime

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 * 1000100101110000010111110100000100110110101101001010010110011B-90
	 * = 0x112e0be826d694b3 * 2^-90
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 90
	 *
	 * The imulq below leaves the upper 64 bits of the 128-bit product in
	 * %rdx, which accounts for 64 of those 90 bits; the sarq covers the
	 * remaining 26.
	 *
	 * In order to account for the two's complement of negative inputs, a
	 * final operation completes the process:
	 *
	 * secs -= (nsecs >> 63)
	 */
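	/*
	 * As a worked example with a hypothetical input of
	 * nsecs = 1,500,000,000:
	 * secs = (1500000000 * 0x112e0be826d694b3) >> 90 = 1
	 * tv_nsec = 1500000000 - (1 * NANOSEC) = 500000000
	 */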
	movq	%rax, %r11
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx
	sarq	$0x1a, %rdx
	movq	%r11, %rax
	sarq	$0x3f, %rax
	subq	%rax, %rdx
	movq	(%rsp), %rsi
	movq	%rdx, (%rsi)
	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x8, %rsp
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)

/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - comm_page_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
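/*
 * A condensed C sketch of the loop below (illustrative only):
 *
 *	do {
 *		lock = cp->cp_hres_lock;
 *		nslt = __cp_gethrtime(cp) - cp->cp_hres_last_tick;
 *		(retry from the top if nslt is negative)
 *		now = cp->cp_hrestime;
 *		now.tv_nsec += nslt;
 *		apply up to (nslt >> ADJ_SHIFT) of cp->cp_hrestime_adj;
 *		while (now.tv_nsec >= NANOSEC) {
 *			now.tv_nsec -= NANOSEC;
 *			now.tv_sec++;
 *		}
 *	} while ((lock & ~1) != cp->cp_hres_lock);
 *	*tsp = now;
 *	return (0);
 */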
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax			/* nslt = hrtime - last_tick */
	jb	1b
	movq	CP_HRESTIME(%rdi), %r9
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10			/* now.tv_nsec += nslt */

	cmpl	$0, %r11d
	jb	4f				/* hres_adj > 0 */
	ja	6f				/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f				/* tv_nsec >= NANOSEC */

3:
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret


4:						/* hres_adj > 0 */
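	/* apply adj = MIN(nslt >> ADJ_SHIFT, hres_adj) to tv_nsec */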
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:						/* hres_adj < 0 */
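	/* subtract adj = MIN(nslt >> ADJ_SHIFT, -hres_adj) from tv_nsec */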
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:						/* tv_nsec >= NANOSEC */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)
