/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/var.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/cpuvar.h>
#include <sys/psm_defs.h>
#include <sys/clock.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>
#include <sys/smp_impldefs.h>
#include <sys/dtrace.h>
#include <sys/time.h>
#include <sys/panic.h>
#include <sys/cpu.h>
#include <sys/sdt.h>
#include <sys/comm_page.h>

/*
 * Using the Pentium's TSC register for gethrtime()
 * ------------------------------------------------
 *
 * The Pentium family, like many chip architectures, has a high-resolution
 * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
 * of the timestamp counter are read with the RDTSC instruction.
 *
 * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
 * must be translated into nanoseconds in order to implement gethrtime().
 * We avoid inducing floating point operations in this conversion by
 * implementing the same nsec_scale algorithm as that found in the sun4u
 * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
 * a detailed description of the algorithm; the comment is not reproduced
 * here.  This implementation differs only in its value for NSEC_SHIFT:
 * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
 * 60 MHz Pentiums.
 *
 * While TSC and %tick are both cycle counting registers, TSC's functionality
 * falls short in several critical ways:
 *
 *  (a)	TSCs on different CPUs are not guaranteed to be in sync.  While in
 *	practice they often _are_ in sync, this isn't guaranteed by the
 *	architecture.
 *
 *  (b)	The TSC cannot be reliably set to an arbitrary value.  The architecture
 *	only supports writing the low 32-bits of TSC, making it impractical
 *	to rewrite.
 *
 *  (c)	The architecture doesn't have the capacity to interrupt based on
 *	arbitrary values of TSC; there is no TICK_CMPR equivalent.
 *
 * Together, (a) and (b) imply that software must track the skew between
 * TSCs and account for it (it is assumed that while there may exist skew,
 * there does not exist drift).  To determine the skew between CPUs, we
 * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
 * the online operation calls tsc_sync_master().
 *
 * In the absence of time-of-day clock adjustments, gethrtime() must stay in
 * sync with gettimeofday().  This is problematic; given (c), the software
 * cannot drive its time-of-day source from TSC, and yet they must somehow be
 * kept in sync.  We implement this by having a routine, tsc_tick(), which
 * is called once per second from the interrupt which drives time-of-day.
 *
 * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
 * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
 * monotonically increases.
 */

#define	NSEC_SHIFT 5
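/*
 * NSEC_SHIFT bounds the slowest CPU clock the scaling algorithm can handle:
 * the 32-bit scale factor (presumably computed elsewhere in this file as
 * (NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz, following the sun4u
 * algorithm referenced above) only fits in 32 bits when cpu_freq_hz exceeds
 * NANOSEC / (1 << NSEC_SHIFT), i.e. 31.25 MHz with an NSEC_SHIFT of 5.
 * That comfortably covers 60 MHz Pentiums.
 */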

static uint_t nsec_unscale;

/*
 * These two variables used to be grouped together inside of a structure that
 * lived on a single cache line. A regression (bug ID 4623398) caused the
 * compiler to emit code that "optimized" away the while-loops below. The
 * result was that no synchronization between the onlining and onlined CPUs
 * took place.
 */
static volatile int tsc_ready;
static volatile int tsc_sync_go;

/*
 * Used as indices into the tsc_sync_snaps[] array.
 */
#define	TSC_MASTER		0
#define	TSC_SLAVE		1

/*
 * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 */
#define	TSC_SYNC_STOP		1
#define	TSC_SYNC_GO		2
#define	TSC_SYNC_DONE		3
#define	SYNC_ITERATIONS		10

#define	TSC_CONVERT_AND_ADD(tsc, hrt, scale) {		\
	unsigned int *_l = (unsigned int *)&(tsc);	\
	(hrt) += mul32(_l[1], scale) << NSEC_SHIFT;	\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
}

#define	TSC_CONVERT(tsc, hrt, scale) {			\
	unsigned int *_l = (unsigned int *)&(tsc);	\
	(hrt) = mul32(_l[1], scale) << NSEC_SHIFT;	\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
}
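
/*
 * A note on the conversion arithmetic above: on little-endian x86, _l[1]
 * holds the high 32 bits of the TSC value and _l[0] the low 32 bits, so the
 * two mul32() terms together compute (tsc * scale) >> (32 - NSEC_SHIFT)
 * without needing a 64-by-32 multiply.  As a rough worked example (assuming
 * the sun4u-style scale factor (NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz
 * computed elsewhere in this file), a 1 GHz CPU gets
 * scale = (10^9 << 27) / 10^9 = 2^27, and the conversion collapses to
 * (tsc * 2^27) >> 27 = tsc -- one nanosecond per tick, as expected.
 */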

int tsc_master_slave_sync_needed = 1;

typedef struct tsc_sync {
	volatile hrtime_t master_tsc, slave_tsc;
} tsc_sync_t;
static tsc_sync_t *tscp;

static hrtime_t	tsc_last_jumped = 0;
static int	tsc_jumped = 0;
static uint32_t	tsc_wayback = 0;
/*
 * The cap of 1 second was chosen because tsc_tick() runs once per second;
 * by the time gethrtime() is called, it should never have been more than
 * 1 second since tsc_last was updated.
 */
static hrtime_t tsc_resume_cap_ns = NANOSEC;	/* 1s */

static hrtime_t	shadow_tsc_hrtime_base;
static hrtime_t	shadow_tsc_last;
static uint_t	shadow_nsec_scale;
static uint32_t	shadow_hres_lock;
int get_tsc_ready();

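/*
 * tsc_protect() bounds an apparent TSC delta before it is folded into the
 * hrtime base.  tsc_resume_cap (defined elsewhere in this file) is
 * presumably the TSC-tick equivalent of tsc_resume_cap_ns above; any delta
 * exceeding it is treated as a suspend/resume artifact, counted in
 * tsc_wayback, and clamped to the cap so that gethrtime() cannot leap
 * arbitrarily far forward.
 */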
static inline hrtime_t
tsc_protect(hrtime_t a)
{
	if (a > tsc_resume_cap) {
		atomic_inc_32(&tsc_wayback);
		DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
		    uint32_t, tsc_wayback);
		return (tsc_resume_cap);
	}
	return (a);
}

hrtime_t
tsc_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;

	do {
		old_hres_lock = hres_lock;

		if ((tsc = tsc_read()) >= tsc_last) {
			/*
			 * It would seem to be obvious that this is true
			 * (that is, the past is less than the present),
			 * but it isn't true in the presence of suspend/resume
			 * cycles.  If we manage to call gethrtime()
			 * after a resume, but before the first call to
			 * tsc_tick(), we will see the jump.  In this case,
			 * we will simply use the value in TSC as the delta.
			 */
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
			/*
			 * There is a chance that tsc_tick() has just run on
			 * another CPU, and we have drifted just enough so that
			 * we appear behind tsc_last.  In this case, force the
			 * delta to be zero.
			 */
			tsc = 0;
		} else {
			/*
			 * If we reach this else clause we assume that we have
			 * gone through a suspend/resume cycle and use the
			 * current tsc value as the delta.
			 *
			 * In rare cases we can reach this else clause due to
			 * a lack of monotonicity in the TSC value.  In such
			 * cases using the current TSC value as the delta would
			 * cause us to return a value ~2x of what it should
			 * be.  To protect against these cases we cap the
			 * suspend/resume delta at tsc_resume_cap.
			 */
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}

hrtime_t
tsc_gethrtime_delta(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * We need to disable interrupts here to assure that we
		 * don't migrate between the call to tsc_read() and
		 * adding the CPU's TSC tick delta. Note that disabling
		 * and reenabling preemption is forbidden here because
		 * we may be in the middle of a fast trap. In the amd64
		 * kernel we cannot tolerate preemption during a fast
		 * trap. See _update_sregs().
		 */

		flags = clear_int_flag();
		tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
		restore_int_flag(flags);

		/* See comments in tsc_gethrtime() above */

		if (tsc >= tsc_last) {
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
			tsc = 0;
		} else {
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}

hrtime_t
tsc_gethrtime_tick_delta(void)
{
	hrtime_t hrt;
	ulong_t flags;

	flags = clear_int_flag();
	hrt = tsc_sync_tick_delta[CPU->cpu_id];
	restore_int_flag(flags);

	return (hrt);
}

/* Calculate the hrtime while exposing the parameters of that calculation. */
hrtime_t
tsc_gethrtime_params(uint64_t *tscp, uint32_t *scalep, uint8_t *shiftp)
{
	uint32_t old_hres_lock, scale;
	hrtime_t tsc, last, base;

	do {
		old_hres_lock = hres_lock;

		if (gethrtimef == tsc_gethrtime_delta) {
			ulong_t flags;

			flags = clear_int_flag();
			tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
			restore_int_flag(flags);
		} else {
			tsc = tsc_read();
		}

		last = tsc_last;
		base = tsc_hrtime_base;
		scale = nsec_scale;

	} while ((old_hres_lock & ~1) != hres_lock);

	/* See comments in tsc_gethrtime() above */
	if (tsc >= last) {
		tsc -= last;
	} else if (tsc >= last - 2 * tsc_max_delta) {
		tsc = 0;
	} else {
		tsc = tsc_protect(tsc);
	}

	TSC_CONVERT_AND_ADD(tsc, base, scale);

	if (tscp != NULL) {
		/*
		 * Do not simply communicate the delta applied to the hrtime
		 * base, but rather the effective TSC measurement.
		 */
		*tscp = tsc + last;
	}
	if (scalep != NULL) {
		*scalep = scale;
	}
	if (shiftp != NULL) {
		*shiftp = NSEC_SHIFT;
	}

	return (base);
}
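
/*
 * A caller of tsc_gethrtime_params() (for example, code exporting host time
 * parameters to a guest or to the comm page) could presumably extrapolate a
 * later hrtime from the returned snapshot without retaking hres_lock,
 * roughly as follows (a sketch only, using a plain 64-bit multiply rather
 * than the mul32()-based macros, and ignoring the per-CPU tick delta that
 * applies when gethrtimef == tsc_gethrtime_delta):
 *
 *	hrtime_t base = tsc_gethrtime_params(&tsc0, &scale, &shift);
 *	...
 *	hrtime_t now = base +
 *	    (((tsc_read() - tsc0) * scale) >> (32 - shift));
 */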

/*
 * This is similar to tsc_gethrtime_delta, but it cannot actually spin on
 * hres_lock.  As a result, it caches all of the variables it needs; if the
 * variables don't change, it's done.
 */
hrtime_t
dtrace_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * Interrupts are disabled to ensure that the thread isn't
		 * migrated between the tsc_read() and adding the CPU's
		 * TSC tick delta.
		 */
		flags = clear_int_flag();

		tsc = tsc_read();

		if (gethrtimef == tsc_gethrtime_delta)
			tsc += tsc_sync_tick_delta[CPU->cpu_id];

		restore_int_flag(flags);

		/*
		 * See the comments in tsc_gethrtime(), above.
		 */
		if (tsc >= tsc_last)
			tsc -= tsc_last;
		else if (tsc >= tsc_last - 2 * tsc_max_delta)
			tsc = 0;
		else
			tsc = tsc_protect(tsc);

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);