xref: /illumos-gate/usr/src/uts/i86pc/os/timestamp.c (revision fc3fd29d04595458a24c7d90eb46ce039660a44f)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/var.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/cpuvar.h>
#include <sys/psm_defs.h>
#include <sys/clock.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>
#include <sys/smp_impldefs.h>
#include <sys/dtrace.h>
#include <sys/time.h>
#include <sys/panic.h>
#include <sys/cpu.h>
#include <sys/sdt.h>
#include <sys/comm_page.h>

/*
 * Using the Pentium's TSC register for gethrtime()
 * ------------------------------------------------
 *
 * The Pentium family, like many chip architectures, has a high-resolution
 * timestamp counter ("TSC") which increments once per CPU cycle.  The contents
 * of the timestamp counter are read with the RDTSC instruction.
 *
 * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
 * must be translated into nanoseconds in order to implement gethrtime().
 * We avoid inducing floating point operations in this conversion by
 * implementing the same nsec_scale algorithm as that found in the sun4u
 * platform code.  The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
 * a detailed description of the algorithm; the comment is not reproduced
 * here.  This implementation differs only in its value for NSEC_SHIFT:
 * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
 * 60 MHz Pentiums.
 *
 * While TSC and %tick are both cycle counting registers, TSC's functionality
 * falls short in several critical ways:
 *
 *  (a)	TSCs on different CPUs are not guaranteed to be in sync.  While in
 *	practice they often _are_ in sync, this isn't guaranteed by the
 *	architecture.
 *
 *  (b)	The TSC cannot be reliably set to an arbitrary value.  The architecture
 *	only supports writing the low 32-bits of TSC, making it impractical
 *	to rewrite.
 *
 *  (c)	The architecture doesn't have the capacity to interrupt based on
 *	arbitrary values of TSC; there is no TICK_CMPR equivalent.
 *
 * Together, (a) and (b) imply that software must track the skew between
 * TSCs and account for it (it is assumed that while there may exist skew,
 * there does not exist drift).  To determine the skew between CPUs, we
 * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
 * the online operation calls tsc_sync_master().
 *
 * In the absence of time-of-day clock adjustments, gethrtime() must stay in
 * sync with gettimeofday().  This is problematic; given (c), the software
 * cannot drive its time-of-day source from TSC, and yet they must somehow be
 * kept in sync.  We implement this by having a routine, tsc_tick(), which
 * is called once per second from the interrupt which drives time-of-day.
 *
 * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
 * atomically with nsec_scale under CLOCK_LOCK.  This assures that time
 * monotonically increases.
 */

#define	NSEC_SHIFT 5

static uint_t nsec_unscale;

/*
 * These two variables used to be grouped together inside of a structure that
 * lived on a single cache line. A regression (bug ID 4623398) caused the
 * compiler to emit code that "optimized" away the while-loops below. The
 * result was that no synchronization between the onlining and onlined CPUs
 * took place.
 */
static volatile int tsc_ready;
static volatile int tsc_sync_go;

/*
 * Used as indices into the tsc_sync_snaps[] array.
 */
#define	TSC_MASTER		0
#define	TSC_SLAVE		1

/*
 * Used in the tsc_sync_master()/tsc_sync_slave() rendezvous.
 */
#define	TSC_SYNC_STOP		1
#define	TSC_SYNC_GO		2
#define	TSC_SYNC_DONE		3
#define	SYNC_ITERATIONS		10

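/*
 * These macros compute hrt = (tsc * scale) >> (32 - NSEC_SHIFT) without a
 * 64-bit divide: the 64-bit tsc is split into its 32-bit halves so that each
 * mul32() product fits in 64 bits.  As an illustrative example, a 2 GHz CPU
 * yields nsec_scale = (NANOSEC << 27) / 2000000000 = 2^26, so a single tick
 * scales to 2^26 >> 27 = 0.5 ns.
 */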
#define	TSC_CONVERT_AND_ADD(tsc, hrt, scale) {		\
	unsigned int *_l = (unsigned int *)&(tsc);	\
	(hrt) += mul32(_l[1], scale) << NSEC_SHIFT;	\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
}

#define	TSC_CONVERT(tsc, hrt, scale) {			\
	unsigned int *_l = (unsigned int *)&(tsc);	\
	(hrt) = mul32(_l[1], scale) << NSEC_SHIFT;	\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT); \
}

int tsc_master_slave_sync_needed = 1;

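/*
 * Carries the TSC samples exchanged between the master and slave CPUs during
 * the sync rendezvous below.  It is allocated dynamically in tsc_hrtimeinit()
 * so that it can sit on its own cache line (see the comment there).
 */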
typedef struct tsc_sync {
	volatile hrtime_t master_tsc, slave_tsc;
} tsc_sync_t;
static tsc_sync_t *tscp;

static hrtime_t	tsc_last_jumped = 0;
static int	tsc_jumped = 0;
static uint32_t	tsc_wayback = 0;
/*
 * The cap of 1 second was chosen because tsc_tick() runs once per second;
 * by the time gethrtime() is called, tsc_last should never be more than
 * 1 second stale.
 */
static hrtime_t tsc_resume_cap_ns = NANOSEC;	/* 1s */

static hrtime_t	shadow_tsc_hrtime_base;
static hrtime_t	shadow_tsc_last;
static uint_t	shadow_nsec_scale;
static uint32_t	shadow_hres_lock;
int get_tsc_ready();

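/*
 * A gross delta against tsc_last almost always indicates a suspend/resume
 * cycle or a non-monotonic TSC.  Cap the value applied to the hrtime base
 * at tsc_resume_cap (roughly one second of ticks) and count the event in
 * tsc_wayback for observability.
 */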
static inline hrtime_t
tsc_protect(hrtime_t a)
{
	if (a > tsc_resume_cap) {
		atomic_inc_32(&tsc_wayback);
		DTRACE_PROBE3(tsc__wayback, hrtime_t, a, hrtime_t, tsc_last,
		    uint32_t, tsc_wayback);
		return (tsc_resume_cap);
	}
	return (a);
}

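/*
 * Compute gethrtime() from the TSC: scale the ticks elapsed since the last
 * tsc_tick() and add them to tsc_hrtime_base.  The loop retries until a
 * consistent snapshot is observed (hres_lock not held and unchanged).
 */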
hrtime_t
tsc_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;

	do {
		old_hres_lock = hres_lock;

		if ((tsc = tsc_read()) >= tsc_last) {
			/*
			 * It would seem to be obvious that this is true
			 * (that is, the past is less than the present),
			 * but it isn't true in the presence of suspend/resume
			 * cycles.  If we manage to call gethrtime()
			 * after a resume, but before the first call to
			 * tsc_tick(), we will see the jump.  In this case,
			 * we will simply use the value in TSC as the delta.
			 */
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
			/*
			 * There is a chance that tsc_tick() has just run on
			 * another CPU, and we have drifted just enough so that
			 * we appear behind tsc_last.  In this case, force the
			 * delta to be zero.
			 */
			tsc = 0;
		} else {
			/*
			 * If we reach this else clause we assume that we have
			 * gone through a suspend/resume cycle and use the
			 * current tsc value as the delta.
			 *
			 * In rare cases we can reach this else clause due to
			 * a lack of monotonicity in the TSC value.  In such
			 * cases using the current TSC value as the delta would
			 * cause us to return a value ~2x of what it should
			 * be.  To protect against these cases we cap the
			 * suspend/resume delta at tsc_resume_cap.
			 */
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}

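/*
 * As tsc_gethrtime(), but first adds this CPU's entry from
 * tsc_sync_tick_delta[] so that the skew measured by tsc_sync_master()
 * is compensated for.
 */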
hrtime_t
tsc_gethrtime_delta(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * We need to disable interrupts here to assure that we
		 * don't migrate between the call to tsc_read() and
		 * adding the CPU's TSC tick delta. Note that disabling
		 * and reenabling preemption is forbidden here because
		 * we may be in the middle of a fast trap. In the amd64
		 * kernel we cannot tolerate preemption during a fast
		 * trap. See _update_sregs().
		 */

		flags = clear_int_flag();
		tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
		restore_int_flag(flags);

		/* See comments in tsc_gethrtime() above */

		if (tsc >= tsc_last) {
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
			tsc = 0;
		} else {
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}

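/* Return the currently applied TSC sync delta (in ticks) for this CPU. */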
hrtime_t
tsc_gethrtime_tick_delta(void)
{
	hrtime_t hrt;
	ulong_t flags;

	flags = clear_int_flag();
	hrt = tsc_sync_tick_delta[CPU->cpu_id];
	restore_int_flag(flags);

	return (hrt);
}

/* Calculate the hrtime while exposing the parameters of that calculation. */
hrtime_t
tsc_gethrtime_params(uint64_t *tscp, uint32_t *scalep, uint8_t *shiftp)
{
	uint32_t old_hres_lock, scale;
	hrtime_t tsc, last, base;

	do {
		old_hres_lock = hres_lock;

		if (gethrtimef == tsc_gethrtime_delta) {
			ulong_t flags;

			flags = clear_int_flag();
			tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
			restore_int_flag(flags);
		} else {
			tsc = tsc_read();
		}

		last = tsc_last;
		base = tsc_hrtime_base;
		scale = nsec_scale;

	} while ((old_hres_lock & ~1) != hres_lock);

	/* See comments in tsc_gethrtime() above */
	if (tsc >= last) {
		tsc -= last;
	} else if (tsc >= last - 2 * tsc_max_delta) {
		tsc = 0;
	} else {
		tsc = tsc_protect(tsc);
	}

	/* Use the snapshotted scale so that it stays consistent with base. */
	TSC_CONVERT_AND_ADD(tsc, base, scale);

	if (tscp != NULL) {
		/*
		 * Do not simply communicate the delta applied to the hrtime
		 * base, but rather the effective TSC measurement.
		 */
		*tscp = tsc + last;
	}
	if (scalep != NULL) {
		*scalep = scale;
	}
	if (shiftp != NULL) {
		*shiftp = NSEC_SHIFT;
	}

	return (base);
}

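/*
 * A consumer of tsc_gethrtime_params() (e.g. the comm page) can then
 * extrapolate a later hrtime without taking hres_lock:
 *
 *	hrt = base + (((tsc_read() - tsc) * scale) >> (32 - shift))
 *
 * where base, tsc, scale, and shift are the values returned above.
 */
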
/*
 * This is similar to tsc_gethrtime_delta, but it cannot actually spin on
 * hres_lock.  As a result, it caches all of the variables it needs; if the
 * variables don't change, it's done.
 */
hrtime_t
dtrace_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * Interrupts are disabled to ensure that the thread isn't
		 * migrated between the tsc_read() and adding the CPU's
		 * TSC tick delta.
		 */
		flags = clear_int_flag();

		tsc = tsc_read();

		if (gethrtimef == tsc_gethrtime_delta)
			tsc += tsc_sync_tick_delta[CPU->cpu_id];

		restore_int_flag(flags);

		/*
		 * See the comments in tsc_gethrtime(), above.
		 */
		if (tsc >= tsc_last)
			tsc -= tsc_last;
		else if (tsc >= tsc_last - 2 * tsc_max_delta)
			tsc = 0;
		else
			tsc = tsc_protect(tsc);

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);

		if ((old_hres_lock & ~1) == hres_lock)
			break;

		/*
		 * If we're here, the clock lock is locked -- or it has been
		 * unlocked and locked since we looked.  This may be due to
		 * tsc_tick() running on another CPU -- or it may be because
		 * some code path has ended up in dtrace_probe() with
		 * CLOCK_LOCK held.  We'll try to determine that we're in
		 * the former case by taking another lap if the lock has
		 * changed since when we first looked at it.
		 */
		if (old_hres_lock != hres_lock)
			continue;

		/*
		 * So the lock was and is locked.  We'll use the old data
		 * instead.
		 */
		old_hres_lock = shadow_hres_lock;

		/*
		 * Again, disable interrupts to ensure that the thread
		 * isn't migrated between the tsc_read() and adding
		 * the CPU's TSC tick delta.
		 */
		flags = clear_int_flag();

		tsc = tsc_read();

		if (gethrtimef == tsc_gethrtime_delta)
			tsc += tsc_sync_tick_delta[CPU->cpu_id];

		restore_int_flag(flags);

		/*
		 * See the comments in tsc_gethrtime(), above.
		 */
		if (tsc >= shadow_tsc_last)
			tsc -= shadow_tsc_last;
		else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
			tsc = 0;
		else
			tsc = tsc_protect(tsc);

		hrt = shadow_tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
	} while ((old_hres_lock & ~1) != shadow_hres_lock);

	return (hrt);
}

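/*
 * Return the raw TSC reading, unscaled, plus any jump compensation
 * accumulated across suspend/resume cycles (see tsc_tick()).
 */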
hrtime_t
tsc_gethrtimeunscaled(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc;

	do {
		old_hres_lock = hres_lock;

		/* See tsc_tick(). */
		tsc = tsc_read() + tsc_last_jumped;
	} while ((old_hres_lock & ~1) != hres_lock);

	return (tsc);
}

/*
 * Convert a nanosecond-based timestamp into TSC ticks.
 */
uint64_t
tsc_unscalehrtime(hrtime_t nsec)
{
	hrtime_t tsc;

	if (tsc_gethrtime_enable) {
		TSC_CONVERT(nsec, tsc, nsec_unscale);
		return (tsc);
	}
	return ((uint64_t)nsec);
}

/* Convert a tsc timestamp to nanoseconds */
void
tsc_scalehrtime(hrtime_t *tsc)
{
	hrtime_t hrt;
	hrtime_t mytsc;

	if (tsc == NULL)
		return;
	mytsc = *tsc;

	TSC_CONVERT(mytsc, hrt, nsec_scale);
	*tsc = hrt;
}

hrtime_t
tsc_gethrtimeunscaled_delta(void)
{
	hrtime_t hrt;
	ulong_t flags;

	/*
	 * Similarly to tsc_gethrtime_delta, we need to disable interrupts
	 * to prevent migration between the call to tsc_gethrtimeunscaled
	 * and adding the CPU's hrtime delta. Note that disabling and
	 * reenabling preemption is forbidden here because we may be in the
	 * middle of a fast trap. In the amd64 kernel we cannot tolerate
	 * preemption during a fast trap. See _update_sregs().
	 */

	flags = clear_int_flag();
	hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
	restore_int_flag(flags);

	return (hrt);
}

/*
 * TSC Sync Master
 *
 * Typically called on the boot CPU, this attempts to quantify TSC skew between
 * different CPUs.  If an appreciable difference is found, gethrtimef will be
 * changed to point to tsc_gethrtime_delta().
 *
 * Calculating skews is precise only when the master and slave TSCs are read
 * simultaneously; however, there is no algorithm that can read both CPUs in
 * perfect simultaneity.  The proposed algorithm is an approximate method based
 * on the behaviour of cache management.  The slave CPU continuously polls the
 * TSC while reading a global variable updated by the master CPU.  The latest
 * TSC reading is saved when the master's update (forced via mfence) reaches
 * visibility on the slave.  The master will also take a TSC reading
 * immediately following the mfence.
 *
 * While the delay between cache line invalidation on the slave and mfence
 * completion on the master is not repeatable, the error is heuristically
 * assumed to be 1/4th of the write time recorded by the master.  Multiple
 * samples are taken to control for the variance caused by external factors
 * such as bus contention.  Each sample set is independent per-CPU to control
 * for differing memory latency on NUMA systems.
 *
 * TSC sync is disabled in the context of virtualization because the CPUs
 * assigned to the guest are virtual CPUs, which means the real CPUs on which
 * the guest runs can change during the lifetime of the guest OS.  We would
 * thus end up calculating TSC skews for a set of CPUs during boot, whereas
 * the guest might migrate to a different set of physical CPUs at a later
 * point in time.
 */
void
tsc_sync_master(processorid_t slave)
{
	ulong_t flags, source, min_write_time = ~0UL;
	hrtime_t write_time, mtsc_after, last_delta = 0;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
		return;

	flags = clear_int_flag();
	source = CPU->cpu_id;

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		while (tsc_sync_go != TSC_SYNC_GO)
			SMT_PAUSE();

		tsc->master_tsc = tsc_read();
		membar_enter();
		mtsc_after = tsc_read();
		while (tsc_sync_go != TSC_SYNC_DONE)
			SMT_PAUSE();
		write_time = mtsc_after - tsc->master_tsc;
		if (write_time <= min_write_time) {
			hrtime_t tdelta;

			tdelta = tsc->slave_tsc - mtsc_after;
			if (tdelta < 0)
				tdelta = -tdelta;
			/*
			 * If the margin exists, subtract 1/4th of the measured
			 * write time from the master's TSC value.  This is an
			 * estimate of how late the mfence completion came
			 * after the slave noticed the cache line change.
			 */
			if (tdelta > (write_time / 4)) {
				tdelta = tsc->slave_tsc -
				    (mtsc_after - (write_time / 4));
			} else {
				tdelta = tsc->slave_tsc - mtsc_after;
			}
			last_delta = tsc_sync_tick_delta[source] - tdelta;
			tsc_sync_tick_delta[slave] = last_delta;
			min_write_time = write_time;
		}

		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
		membar_enter();
		tsc_sync_go = TSC_SYNC_STOP;
	}

	/*
	 * Only enable the delta variants of the TSC functions if the measured
	 * skew is greater than the fastest write time.
	 */
	last_delta = (last_delta < 0) ? -last_delta : last_delta;
	if (last_delta > min_write_time) {
		gethrtimef = tsc_gethrtime_delta;
		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
		tsc_ncpu = NCPU;
	}
	restore_int_flag(flags);
}

/*
 * TSC Sync Slave
 *
 * Called by a CPU which has just been onlined.  It is expected that the CPU
 * performing the online operation will call tsc_sync_master().
 *
 * Like tsc_sync_master, this logic is skipped on virtualized platforms.
 */
void
tsc_sync_slave(void)
{
	ulong_t flags;
	hrtime_t s1;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
		return;

	flags = clear_int_flag();

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Re-fill the cache line */
		s1 = tsc->master_tsc;
		membar_enter();
		tsc_sync_go = TSC_SYNC_GO;
		do {
			/*
			 * Do not put an SMT_PAUSE here.  If the master and
			 * slave are the same hyper-threaded CPU, we want the
			 * master to yield as quickly as possible to the slave.
			 */
			s1 = tsc_read();
		} while (tsc->master_tsc == 0);
		tsc->slave_tsc = s1;
		membar_enter();
		tsc_sync_go = TSC_SYNC_DONE;

		while (tsc_sync_go != TSC_SYNC_STOP)
			SMT_PAUSE();
	}

	restore_int_flag(flags);
}

/*
 * Called once per second on some CPU from the cyclic subsystem's
 * CY_HIGH_LEVEL interrupt.  (This is no longer restricted to cpu0.)
 */
void
tsc_tick(void)
{
	hrtime_t now, delta;
	ushort_t spl;

	/*
	 * Before we set the new variables, we set the shadow values.  This
	 * allows for lock free operation in dtrace_gethrtime().
	 */
	lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
	    ipltospl(CBE_HIGH_PIL), &spl);

	shadow_tsc_hrtime_base = tsc_hrtime_base;
	shadow_tsc_last = tsc_last;
	shadow_nsec_scale = nsec_scale;

	shadow_hres_lock++;
	splx(spl);

	CLOCK_LOCK(&spl);

	now = tsc_read();

	if (gethrtimef == tsc_gethrtime_delta)
		now += tsc_sync_tick_delta[CPU->cpu_id];

	if (now < tsc_last) {
		/*
		 * The TSC has just jumped into the past.  We assume that
		 * this is due to a suspend/resume cycle, and we're going
		 * to use the _current_ value of TSC as the delta.  This
		 * will keep tsc_hrtime_base correct.  We're also going to
		 * assume that the rate of the TSC does not change after a
		 * suspend/resume (i.e. nsec_scale remains the same).
		 */
		delta = now;
		delta = tsc_protect(delta);
		tsc_last_jumped += tsc_last;
		tsc_jumped = 1;
	} else {
		/*
		 * Determine the number of TSC ticks since the last clock
		 * tick, and add that to the hrtime base.
		 */
		delta = now - tsc_last;
	}

	TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
	tsc_last = now;

	CLOCK_UNLOCK(spl);
}

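/*
 * Initialize TSC-based timekeeping: derive nsec_scale and nsec_unscale from
 * the measured CPU frequency, seed tsc_max_delta with the observed cost of
 * one gethrtime() call, and install the TSC flavors of the gethrtime-family
 * function pointers.
 */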
void
tsc_hrtimeinit(uint64_t cpu_freq_hz)
{
	extern int gethrtime_hires;
	longlong_t tsc;
	ulong_t flags;

	/*
	 * cpu_freq_hz is the measured cpu frequency in hertz
	 */

	/*
	 * We can't accommodate CPUs slower than 31.25 MHz.
	 */
	ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
	nsec_scale =
	    (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
	nsec_unscale =
	    (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);

	flags = clear_int_flag();
	tsc = tsc_read();
	(void) tsc_gethrtime();
	tsc_max_delta = tsc_read() - tsc;
	restore_int_flag(flags);
	gethrtimef = tsc_gethrtime;
	gethrtimeunscaledf = tsc_gethrtimeunscaled;
	scalehrtimef = tsc_scalehrtime;
	unscalehrtimef = tsc_unscalehrtime;
	hrtime_tick = tsc_tick;
	gethrtime_hires = 1;
	/*
	 * Being part of the comm page, tsc_ncpu communicates the published
	 * length of the tsc_sync_tick_delta array.  This is kept zeroed to
	 * ignore the absent delta data while the TSCs are synced.
	 */
	tsc_ncpu = 0;
	/*
	 * Allocate memory for the structure used in the tsc sync logic.
	 * This structure should be aligned on a multiple of cache line size.
	 */
	tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);

	/*
	 * Convert the TSC resume cap ns value into its unscaled TSC value.
	 * See tsc_gethrtime().
	 */
	if (tsc_resume_cap == 0)
		TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
}

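/* Report the tsc_ready handshake flag (see the sync variables above). */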
int
get_tsc_ready()
{
	return (tsc_ready);
}

/*
 * Adjust all the deltas by adding the passed value to the array, and activate
 * the "delta" versions of the gethrtime functions.  It is possible for the
 * adjustment to be negative; this may occur if the SunOS instance was moved
 * by a virtual machine manager to a machine with a higher TSC value.
 */
void
tsc_adjust_delta(hrtime_t tdelta)
{
	int		i;

	for (i = 0; i < NCPU; i++) {
		tsc_sync_tick_delta[i] += tdelta;
	}

	gethrtimef = tsc_gethrtime_delta;
	gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
	tsc_ncpu = NCPU;
}

/*
 * Functions to manage TSC and high-res time on suspend and resume.
 */

/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
extern tod_ops_t *tod_ops;

static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
static timestruc_t tsc_saved_ts;
static int	tsc_needs_resume = 0;	/* We only want to do this once. */
int		tsc_delta_onsuspend = 0;
int		tsc_adjust_seconds = 1;
int		tsc_suspend_count = 0;
int		tsc_resume_in_cyclic = 0;

/*
 * Take snapshots of the current time and do any other pre-suspend work.
 */
void
tsc_suspend(void)
{
	/*
	 * We need to collect the time at which we suspended here so we know
	 * how much should be added during the resume.  This is called by
	 * each CPU, so reentry must be properly handled.
	 */
	if (tsc_gethrtime_enable) {
		/*
		 * Perform the tsc_read after acquiring the lock to make it as
		 * accurate as possible in the face of contention.
		 */
		mutex_enter(&tod_lock);
		tsc_saved_tsc = tsc_read();
		tsc_saved_ts = TODOP_GET(tod_ops);
		mutex_exit(&tod_lock);
		/* We only want to do this once. */
		if (tsc_needs_resume == 0) {
			if (tsc_delta_onsuspend) {
				tsc_adjust_delta(tsc_saved_tsc);
			} else {
				tsc_adjust_delta(nsec_scale);
			}
			tsc_suspend_count++;
		}
	}

	invalidate_cache();
	tsc_needs_resume = 1;
}

/*
 * Restore all timestamp state based on the snapshots taken at suspend time.
 */
void
tsc_resume(void)
{
	/*
	 * We only need to (and want to) do this once.  So let the first
	 * caller handle this (we are locked by the cpu lock), as it
	 * is preferential that we get the earliest sync.
	 */
	if (tsc_needs_resume) {
		/*
		 * If using the TSC, adjust the delta based on how long
		 * we were sleeping (or away).  We also adjust for
		 * migration and a grown TSC.
		 */
		if (tsc_saved_tsc != 0) {
			timestruc_t	ts;
			hrtime_t	now, sleep_tsc = 0;
			int		sleep_sec;
			extern void	tsc_tick(void);
			extern uint64_t cpu_freq_hz;

			/* tsc_read() MUST be before TODOP_GET() */
			mutex_enter(&tod_lock);
			now = tsc_read();
			ts = TODOP_GET(tod_ops);
			mutex_exit(&tod_lock);

			/* Compute seconds of sleep time */
			sleep_sec = ts.tv_sec - tsc_saved_ts.tv_sec;

			/*
			 * If the current TOD is not later than the saved TOD,
			 * then there is likely a problem with the clock.
			 * Assume at least one second has passed, so that
			 * time goes forward.
			 */
			if (sleep_sec <= 0) {
				sleep_sec = 1;
			}

			/* How many TSC ticks should have occurred while sleeping */
			if (tsc_adjust_seconds)
				sleep_tsc = sleep_sec * cpu_freq_hz;

			/*
			 * We also want to subtract from the "sleep_tsc"
			 * the current value of tsc_read(), so that our
			 * adjustment accounts for the amount of time we
			 * have been resumed _or_ an adjustment based on
			 * the fact that we didn't actually power off the
			 * CPU (migration is another issue, but _should_
			 * also comply with this calculation).  If the CPU
			 * never powered off, then:
			 *    'now == sleep_tsc + saved_tsc'
			 * and the delta will effectively be "0".
			 */
			sleep_tsc -= now;
			if (tsc_delta_onsuspend) {
				tsc_adjust_delta(sleep_tsc);
			} else {
				tsc_adjust_delta(tsc_saved_tsc + sleep_tsc);
			}
			tsc_saved_tsc = 0;

			tsc_tick();
		}
		tsc_needs_resume = 0;
	}
}