17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate * CDDL HEADER START
37c478bd9Sstevel@tonic-gate *
47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5ae115bc7Smrj * Common Development and Distribution License (the "License").
6ae115bc7Smrj * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate *
87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate * and limitations under the License.
127c478bd9Sstevel@tonic-gate *
137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate *
197c478bd9Sstevel@tonic-gate * CDDL HEADER END
207c478bd9Sstevel@tonic-gate */
21843e1988Sjohnlev
227c478bd9Sstevel@tonic-gate /*
237997e108SSurya Prakki * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
247c478bd9Sstevel@tonic-gate * Use is subject to license terms.
2579ec9da8SYuri Pankov *
2679ec9da8SYuri Pankov * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27e014e7f8SPaul Dagnelie * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
28*575694f6SJason King * Copyright 2020 Joyent, Inc.
297c478bd9Sstevel@tonic-gate */
307c478bd9Sstevel@tonic-gate
317c478bd9Sstevel@tonic-gate #include <sys/types.h>
327c478bd9Sstevel@tonic-gate #include <sys/param.h>
337c478bd9Sstevel@tonic-gate #include <sys/systm.h>
347c478bd9Sstevel@tonic-gate #include <sys/disp.h>
357c478bd9Sstevel@tonic-gate #include <sys/var.h>
367c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
377c478bd9Sstevel@tonic-gate #include <sys/debug.h>
387c478bd9Sstevel@tonic-gate #include <sys/x86_archext.h>
397c478bd9Sstevel@tonic-gate #include <sys/archsystm.h>
407c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
417c478bd9Sstevel@tonic-gate #include <sys/psm_defs.h>
427c478bd9Sstevel@tonic-gate #include <sys/clock.h>
437c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
447c478bd9Sstevel@tonic-gate #include <sys/lockstat.h>
457c478bd9Sstevel@tonic-gate #include <sys/smp_impldefs.h>
467c478bd9Sstevel@tonic-gate #include <sys/dtrace.h>
477c478bd9Sstevel@tonic-gate #include <sys/time.h>
48843e1988Sjohnlev #include <sys/panic.h>
49b3c18020SSudheer A #include <sys/cpu.h>
50e014e7f8SPaul Dagnelie #include <sys/sdt.h>
512428aad8SPatrick Mooney #include <sys/comm_page.h>
52*575694f6SJason King #include <sys/bootconf.h>
53*575694f6SJason King #include <sys/kobj.h>
54*575694f6SJason King #include <sys/kobj_lex.h>
55*575694f6SJason King #include <sys/tsc.h>
56*575694f6SJason King #include <sys/prom_debug.h>
57*575694f6SJason King #include <util/qsort.h>
587c478bd9Sstevel@tonic-gate
597c478bd9Sstevel@tonic-gate /*
607c478bd9Sstevel@tonic-gate * Using the Pentium's TSC register for gethrtime()
617c478bd9Sstevel@tonic-gate * ------------------------------------------------
627c478bd9Sstevel@tonic-gate *
637c478bd9Sstevel@tonic-gate * The Pentium family, like many chip architectures, has a high-resolution
647c478bd9Sstevel@tonic-gate * timestamp counter ("TSC") which increments once per CPU cycle. The contents
657c478bd9Sstevel@tonic-gate * of the timestamp counter are read with the RDTSC instruction.
667c478bd9Sstevel@tonic-gate *
677c478bd9Sstevel@tonic-gate * As with its UltraSPARC equivalent (the %tick register), TSC's cycle count
687c478bd9Sstevel@tonic-gate * must be translated into nanoseconds in order to implement gethrtime().
697c478bd9Sstevel@tonic-gate * We avoid inducing floating point operations in this conversion by
707c478bd9Sstevel@tonic-gate * implementing the same nsec_scale algorithm as that found in the sun4u
717c478bd9Sstevel@tonic-gate * platform code. The sun4u NATIVE_TIME_TO_NSEC_SCALE block comment contains
727c478bd9Sstevel@tonic-gate * a detailed description of the algorithm; the comment is not reproduced
737c478bd9Sstevel@tonic-gate * here. This implementation differs only in its value for NSEC_SHIFT:
747c478bd9Sstevel@tonic-gate * we implement an NSEC_SHIFT of 5 (instead of sun4u's 4) to allow for
757c478bd9Sstevel@tonic-gate * 60 MHz Pentiums.
767c478bd9Sstevel@tonic-gate *
777c478bd9Sstevel@tonic-gate * While TSC and %tick are both cycle counting registers, TSC's functionality
787c478bd9Sstevel@tonic-gate * falls short in several critical ways:
797c478bd9Sstevel@tonic-gate *
807c478bd9Sstevel@tonic-gate * (a) TSCs on different CPUs are not guaranteed to be in sync. While in
817c478bd9Sstevel@tonic-gate * practice they often _are_ in sync, this isn't guaranteed by the
827c478bd9Sstevel@tonic-gate * architecture.
837c478bd9Sstevel@tonic-gate *
847c478bd9Sstevel@tonic-gate * (b) The TSC cannot be reliably set to an arbitrary value. The architecture
857c478bd9Sstevel@tonic-gate * only supports writing the low 32-bits of TSC, making it impractical
867c478bd9Sstevel@tonic-gate * to rewrite.
877c478bd9Sstevel@tonic-gate *
887c478bd9Sstevel@tonic-gate * (c) The architecture doesn't have the capacity to interrupt based on
897c478bd9Sstevel@tonic-gate * arbitrary values of TSC; there is no TICK_CMPR equivalent.
907c478bd9Sstevel@tonic-gate *
917c478bd9Sstevel@tonic-gate * Together, (a) and (b) imply that software must track the skew between
927c478bd9Sstevel@tonic-gate * TSCs and account for it (it is assumed that while there may exist skew,
937c478bd9Sstevel@tonic-gate * there does not exist drift). To determine the skew between CPUs, we
947c478bd9Sstevel@tonic-gate * have newly onlined CPUs call tsc_sync_slave(), while the CPU performing
95b3c18020SSudheer A * the online operation calls tsc_sync_master().
967c478bd9Sstevel@tonic-gate *
977c478bd9Sstevel@tonic-gate * In the absence of time-of-day clock adjustments, gethrtime() must stay in
987c478bd9Sstevel@tonic-gate * sync with gettimeofday(). This is problematic; given (c), the software
997c478bd9Sstevel@tonic-gate * cannot drive its time-of-day source from TSC, and yet they must somehow be
1007c478bd9Sstevel@tonic-gate * kept in sync. We implement this by having a routine, tsc_tick(), which
1017c478bd9Sstevel@tonic-gate * is called once per second from the interrupt which drives time-of-day.
1027c478bd9Sstevel@tonic-gate *
1037c478bd9Sstevel@tonic-gate * Note that the hrtime base for gethrtime, tsc_hrtime_base, is modified
1047c478bd9Sstevel@tonic-gate * atomically with nsec_scale under CLOCK_LOCK. This assures that time
1057c478bd9Sstevel@tonic-gate * monotonically increases.
1067c478bd9Sstevel@tonic-gate */
1077c478bd9Sstevel@tonic-gate
/*
 * NSEC_SHIFT of 5 (rather than sun4u's 4) allows the nsec_scale algorithm
 * to represent CPU frequencies as low as 60 MHz; see the block comment above.
 */
#define	NSEC_SHIFT	5

/* Inverse scale factor: converts nanoseconds back to TSC ticks. */
static uint_t nsec_unscale;

/*
 * These two variables used to be grouped together inside of a structure that
 * lived on a single cache line. A regression (bug ID 4623398) caused the
 * compiler to emit code that "optimized" away the while-loops below. The
 * result was that no synchronization between the onlining and onlined CPUs
 * took place.
 */
static volatile int tsc_ready;
static volatile int tsc_sync_go;

/*
 * Used as indices into the tsc_sync_snaps[] array.
 */
#define	TSC_MASTER		0
#define	TSC_SLAVE		1

/*
 * Used in the tsc_master_sync()/tsc_slave_sync() rendezvous.
 * tsc_sync_go steps GO -> DONE -> STOP on each of the SYNC_ITERATIONS
 * sampling rounds.
 */
#define	TSC_SYNC_STOP		1
#define	TSC_SYNC_GO		2
#define	TSC_SYNC_DONE		3
#define	SYNC_ITERATIONS		10
1357c478bd9Sstevel@tonic-gate
/*
 * Convert a 64-bit TSC tick count to nanoseconds using the fixed-point
 * nsec_scale factor (see the sun4u NATIVE_TIME_TO_NSEC_SCALE block comment
 * referenced above).  The value is split into 32-bit halves via a pointer
 * cast, which assumes a little-endian layout (x86): _l[0] is the low word,
 * _l[1] the high word.
 *
 * TSC_CONVERT_AND_ADD accumulates into (hrt); TSC_CONVERT overwrites it.
 * Both are wrapped in do { } while (0) so each invocation expands as a
 * single statement and composes safely with unbraced if/else.
 */
#define	TSC_CONVERT_AND_ADD(tsc, hrt, scale)	do {		\
	unsigned int *_l = (unsigned int *)&(tsc);		\
	(hrt) += mul32(_l[1], scale) << NSEC_SHIFT;		\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT);	\
} while (0)

#define	TSC_CONVERT(tsc, hrt, scale)	do {			\
	unsigned int *_l = (unsigned int *)&(tsc);		\
	(hrt) = mul32(_l[1], scale) << NSEC_SHIFT;		\
	(hrt) += mul32(_l[0], scale) >> (32 - NSEC_SHIFT);	\
} while (0)
1477c478bd9Sstevel@tonic-gate
/* Set to zero (e.g. via /etc/system) to skip master/slave TSC sync. */
int tsc_master_slave_sync_needed = 1;

/*
 * Rendezvous buffer shared by tsc_sync_master() and tsc_sync_slave();
 * the two CPUs exchange TSC samples through these volatile fields.
 */
typedef struct tsc_sync {
	volatile hrtime_t master_tsc, slave_tsc;
} tsc_sync_t;
static tsc_sync_t *tscp;

/*
 * Bookkeeping for observed TSC jumps; tsc_last_jumped is folded into
 * tsc_gethrtimeunscaled() readings (see tsc_tick(), not in this chunk).
 */
static hrtime_t tsc_last_jumped = 0;
static int tsc_jumped = 0;
/* Number of times tsc_protect() clamped a wayward TSC delta. */
static uint32_t tsc_wayback = 0;
/*
 * The cap of 1 second was chosen since it is the frequency at which the
 * tsc_tick() function runs which means that when gethrtime() is called it
 * should never be more than 1 second since tsc_last was updated.
 */
static hrtime_t tsc_resume_cap_ns = NANOSEC;	 /* 1s */

/*
 * Shadow copies of the hres_lock-protected timekeeping state, used by
 * dtrace_gethrtime() when the live copies are locked.
 */
static hrtime_t	shadow_tsc_hrtime_base;
static hrtime_t	shadow_tsc_last;
static uint_t	shadow_nsec_scale;
static uint32_t	shadow_hres_lock;
int get_tsc_ready();

/*
 * Allow an operator specify an explicit TSC calibration source
 * via /etc/system e.g. `set tsc_calibration="pit"`
 */
char *tsc_calibration;

/*
 * The source that was used to calibrate the TSC. This is currently just
 * for diagnostic purposes.
 */
static tsc_calibrate_t *tsc_calibration_source;

/* The TSC frequency after calibration */
static uint64_t tsc_freq;
185*575694f6SJason King
186*575694f6SJason King static inline hrtime_t
tsc_protect(hrtime_t a)187*575694f6SJason King tsc_protect(hrtime_t a)
188*575694f6SJason King {
189e014e7f8SPaul Dagnelie if (a > tsc_resume_cap) {
190e014e7f8SPaul Dagnelie atomic_inc_32(&tsc_wayback);
191e014e7f8SPaul Dagnelie DTRACE_PROBE3(tsc__wayback, htrime_t, a, hrtime_t, tsc_last,
192e014e7f8SPaul Dagnelie uint32_t, tsc_wayback);
193e014e7f8SPaul Dagnelie return (tsc_resume_cap);
194e014e7f8SPaul Dagnelie }
195e014e7f8SPaul Dagnelie return (a);
196e014e7f8SPaul Dagnelie }
197e014e7f8SPaul Dagnelie
/*
 * Return the current high-resolution time in nanoseconds, derived from the
 * local TSC: (tsc_read() - tsc_last) scaled by nsec_scale, plus
 * tsc_hrtime_base.  Readers loop until they observe a stable hres_lock
 * generation (the low bit is masked in the exit test), retrying any read
 * torn by a concurrent tsc_tick().
 */
hrtime_t
tsc_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;

	do {
		old_hres_lock = hres_lock;

		if ((tsc = tsc_read()) >= tsc_last) {
			/*
			 * It would seem to be obvious that this is true
			 * (that is, the past is less than the present),
			 * but it isn't true in the presence of suspend/resume
			 * cycles. If we manage to call gethrtime()
			 * after a resume, but before the first call to
			 * tsc_tick(), we will see the jump. In this case,
			 * we will simply use the value in TSC as the delta.
			 */
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2*tsc_max_delta) {
			/*
			 * There is a chance that tsc_tick() has just run on
			 * another CPU, and we have drifted just enough so that
			 * we appear behind tsc_last. In this case, force the
			 * delta to be zero.
			 */
			tsc = 0;
		} else {
			/*
			 * If we reach this else clause we assume that we have
			 * gone through a suspend/resume cycle and use the
			 * current tsc value as the delta.
			 *
			 * In rare cases we can reach this else clause due to
			 * a lack of monotonicity in the TSC value. In such
			 * cases using the current TSC value as the delta would
			 * cause us to return a value ~2x of what it should
			 * be. To protect against these cases we cap the
			 * suspend/resume delta at tsc_resume_cap.
			 */
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}
249843e1988Sjohnlev
/*
 * As tsc_gethrtime(), but add this CPU's per-CPU skew correction
 * (tsc_sync_tick_delta[]) to the raw TSC reading.  Installed as gethrtimef
 * by tsc_sync_master() when an appreciable inter-CPU TSC skew is measured.
 */
hrtime_t
tsc_gethrtime_delta(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * We need to disable interrupts here to assure that we
		 * don't migrate between the call to tsc_read() and
		 * adding the CPU's TSC tick delta. Note that disabling
		 * and reenabling preemption is forbidden here because
		 * we may be in the middle of a fast trap. In the amd64
		 * kernel we cannot tolerate preemption during a fast
		 * trap. See _update_sregs().
		 */

		flags = clear_int_flag();
		tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
		restore_int_flag(flags);

		/* See comments in tsc_gethrtime() above */

		if (tsc >= tsc_last) {
			tsc -= tsc_last;
		} else if (tsc >= tsc_last - 2 * tsc_max_delta) {
			tsc = 0;
		} else {
			tsc = tsc_protect(tsc);
		}

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);
	} while ((old_hres_lock & ~1) != hres_lock);

	return (hrt);
}
291843e1988Sjohnlev
2929278ddffSRobert Mustacchi hrtime_t
tsc_gethrtime_tick_delta(void)2939278ddffSRobert Mustacchi tsc_gethrtime_tick_delta(void)
2949278ddffSRobert Mustacchi {
2959278ddffSRobert Mustacchi hrtime_t hrt;
2969278ddffSRobert Mustacchi ulong_t flags;
2979278ddffSRobert Mustacchi
2989278ddffSRobert Mustacchi flags = clear_int_flag();
2999278ddffSRobert Mustacchi hrt = tsc_sync_tick_delta[CPU->cpu_id];
3009278ddffSRobert Mustacchi restore_int_flag(flags);
3019278ddffSRobert Mustacchi
3029278ddffSRobert Mustacchi return (hrt);
3039278ddffSRobert Mustacchi }
3049278ddffSRobert Mustacchi
305fc3fd29dSPatrick Mooney /* Calculate the hrtime while exposing the parameters of that calculation. */
306fc3fd29dSPatrick Mooney hrtime_t
tsc_gethrtime_params(uint64_t * tscp,uint32_t * scalep,uint8_t * shiftp)307fc3fd29dSPatrick Mooney tsc_gethrtime_params(uint64_t *tscp, uint32_t *scalep, uint8_t *shiftp)
308fc3fd29dSPatrick Mooney {
309fc3fd29dSPatrick Mooney uint32_t old_hres_lock, scale;
310fc3fd29dSPatrick Mooney hrtime_t tsc, last, base;
311fc3fd29dSPatrick Mooney
312fc3fd29dSPatrick Mooney do {
313fc3fd29dSPatrick Mooney old_hres_lock = hres_lock;
314fc3fd29dSPatrick Mooney
315fc3fd29dSPatrick Mooney if (gethrtimef == tsc_gethrtime_delta) {
316fc3fd29dSPatrick Mooney ulong_t flags;
317fc3fd29dSPatrick Mooney
318fc3fd29dSPatrick Mooney flags = clear_int_flag();
319fc3fd29dSPatrick Mooney tsc = tsc_read() + tsc_sync_tick_delta[CPU->cpu_id];
320fc3fd29dSPatrick Mooney restore_int_flag(flags);
321fc3fd29dSPatrick Mooney } else {
322fc3fd29dSPatrick Mooney tsc = tsc_read();
323fc3fd29dSPatrick Mooney }
324fc3fd29dSPatrick Mooney
325fc3fd29dSPatrick Mooney last = tsc_last;
326fc3fd29dSPatrick Mooney base = tsc_hrtime_base;
327fc3fd29dSPatrick Mooney scale = nsec_scale;
328fc3fd29dSPatrick Mooney
329fc3fd29dSPatrick Mooney } while ((old_hres_lock & ~1) != hres_lock);
330fc3fd29dSPatrick Mooney
331fc3fd29dSPatrick Mooney /* See comments in tsc_gethrtime() above */
332fc3fd29dSPatrick Mooney if (tsc >= last) {
333fc3fd29dSPatrick Mooney tsc -= last;
334fc3fd29dSPatrick Mooney } else if (tsc >= last - 2 * tsc_max_delta) {
335fc3fd29dSPatrick Mooney tsc = 0;
336fc3fd29dSPatrick Mooney } else {
337fc3fd29dSPatrick Mooney tsc = tsc_protect(tsc);
338fc3fd29dSPatrick Mooney }
339fc3fd29dSPatrick Mooney
340fc3fd29dSPatrick Mooney TSC_CONVERT_AND_ADD(tsc, base, nsec_scale);
341fc3fd29dSPatrick Mooney
342fc3fd29dSPatrick Mooney if (tscp != NULL) {
343fc3fd29dSPatrick Mooney /*
344fc3fd29dSPatrick Mooney * Do not simply communicate the delta applied to the hrtime
345fc3fd29dSPatrick Mooney * base, but rather the effective TSC measurement.
346fc3fd29dSPatrick Mooney */
347fc3fd29dSPatrick Mooney *tscp = tsc + last;
348fc3fd29dSPatrick Mooney }
349fc3fd29dSPatrick Mooney if (scalep != NULL) {
350fc3fd29dSPatrick Mooney *scalep = scale;
351fc3fd29dSPatrick Mooney }
352fc3fd29dSPatrick Mooney if (shiftp != NULL) {
353fc3fd29dSPatrick Mooney *shiftp = NSEC_SHIFT;
354fc3fd29dSPatrick Mooney }
355fc3fd29dSPatrick Mooney
356fc3fd29dSPatrick Mooney return (base);
357fc3fd29dSPatrick Mooney }
358fc3fd29dSPatrick Mooney
/*
 * This is similar to tsc_gethrtime_delta, but it cannot actually spin on
 * hres_lock. As a result, it caches all of the variables it needs; if the
 * variables don't change, it's done.
 *
 * DTrace probe context may fire with CLOCK_LOCK already held, so waiting
 * for the lock could deadlock; instead this falls back to the shadow_*
 * copies of the timekeeping state when the live copies appear locked.
 */
hrtime_t
dtrace_gethrtime(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc, hrt;
	ulong_t flags;

	do {
		old_hres_lock = hres_lock;

		/*
		 * Interrupts are disabled to ensure that the thread isn't
		 * migrated between the tsc_read() and adding the CPU's
		 * TSC tick delta.
		 */
		flags = clear_int_flag();

		tsc = tsc_read();

		if (gethrtimef == tsc_gethrtime_delta)
			tsc += tsc_sync_tick_delta[CPU->cpu_id];

		restore_int_flag(flags);

		/*
		 * See the comments in tsc_gethrtime(), above.
		 */
		if (tsc >= tsc_last)
			tsc -= tsc_last;
		else if (tsc >= tsc_last - 2*tsc_max_delta)
			tsc = 0;
		else
			tsc = tsc_protect(tsc);

		hrt = tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, nsec_scale);

		if ((old_hres_lock & ~1) == hres_lock)
			break;

		/*
		 * If we're here, the clock lock is locked -- or it has been
		 * unlocked and locked since we looked. This may be due to
		 * tsc_tick() running on another CPU -- or it may be because
		 * some code path has ended up in dtrace_probe() with
		 * CLOCK_LOCK held. We'll try to determine that we're in
		 * the former case by taking another lap if the lock has
		 * changed since when we first looked at it.
		 */
		if (old_hres_lock != hres_lock)
			continue;

		/*
		 * So the lock was and is locked. We'll use the old data
		 * instead.
		 */
		old_hres_lock = shadow_hres_lock;

		/*
		 * Again, disable interrupts to ensure that the thread
		 * isn't migrated between the tsc_read() and adding
		 * the CPU's TSC tick delta.
		 */
		flags = clear_int_flag();

		tsc = tsc_read();

		if (gethrtimef == tsc_gethrtime_delta)
			tsc += tsc_sync_tick_delta[CPU->cpu_id];

		restore_int_flag(flags);

		/*
		 * See the comments in tsc_gethrtime(), above.
		 */
		if (tsc >= shadow_tsc_last)
			tsc -= shadow_tsc_last;
		else if (tsc >= shadow_tsc_last - 2 * tsc_max_delta)
			tsc = 0;
		else
			tsc = tsc_protect(tsc);

		hrt = shadow_tsc_hrtime_base;

		TSC_CONVERT_AND_ADD(tsc, hrt, shadow_nsec_scale);
	} while ((old_hres_lock & ~1) != shadow_hres_lock);

	return (hrt);
}
454843e1988Sjohnlev
/*
 * Return the raw (unscaled) TSC reading, adjusted by tsc_last_jumped so
 * the result stays consistent across recorded TSC jumps.  Uses the same
 * hres_lock retry protocol as tsc_gethrtime().
 */
hrtime_t
tsc_gethrtimeunscaled(void)
{
	uint32_t old_hres_lock;
	hrtime_t tsc;

	do {
		old_hres_lock = hres_lock;

		/* See tsc_tick(). */
		tsc = tsc_read() + tsc_last_jumped;
	} while ((old_hres_lock & ~1) != hres_lock);

	return (tsc);
}
470843e1988Sjohnlev
471113b131bSEric Saxe /*
472113b131bSEric Saxe * Convert a nanosecond based timestamp to tsc
473113b131bSEric Saxe */
474113b131bSEric Saxe uint64_t
tsc_unscalehrtime(hrtime_t nsec)475113b131bSEric Saxe tsc_unscalehrtime(hrtime_t nsec)
476113b131bSEric Saxe {
477113b131bSEric Saxe hrtime_t tsc;
478113b131bSEric Saxe
479113b131bSEric Saxe if (tsc_gethrtime_enable) {
480113b131bSEric Saxe TSC_CONVERT(nsec, tsc, nsec_unscale);
481113b131bSEric Saxe return (tsc);
482113b131bSEric Saxe }
483113b131bSEric Saxe return ((uint64_t)nsec);
484113b131bSEric Saxe }
485843e1988Sjohnlev
486843e1988Sjohnlev /* Convert a tsc timestamp to nanoseconds */
487843e1988Sjohnlev void
tsc_scalehrtime(hrtime_t * tsc)488843e1988Sjohnlev tsc_scalehrtime(hrtime_t *tsc)
489843e1988Sjohnlev {
490843e1988Sjohnlev hrtime_t hrt;
491843e1988Sjohnlev hrtime_t mytsc;
492843e1988Sjohnlev
493843e1988Sjohnlev if (tsc == NULL)
494843e1988Sjohnlev return;
495843e1988Sjohnlev mytsc = *tsc;
496843e1988Sjohnlev
497843e1988Sjohnlev TSC_CONVERT(mytsc, hrt, nsec_scale);
498843e1988Sjohnlev *tsc = hrt;
499843e1988Sjohnlev }
500843e1988Sjohnlev
501843e1988Sjohnlev hrtime_t
tsc_gethrtimeunscaled_delta(void)502843e1988Sjohnlev tsc_gethrtimeunscaled_delta(void)
503843e1988Sjohnlev {
504843e1988Sjohnlev hrtime_t hrt;
505a563a037Sbholler ulong_t flags;
506843e1988Sjohnlev
507843e1988Sjohnlev /*
508843e1988Sjohnlev * Similarly to tsc_gethrtime_delta, we need to disable preemption
509843e1988Sjohnlev * to prevent migration between the call to tsc_gethrtimeunscaled
510843e1988Sjohnlev * and adding the CPU's hrtime delta. Note that disabling and
511843e1988Sjohnlev * reenabling preemption is forbidden here because we may be in the
512843e1988Sjohnlev * middle of a fast trap. In the amd64 kernel we cannot tolerate
513843e1988Sjohnlev * preemption during a fast trap. See _update_sregs().
514843e1988Sjohnlev */
515843e1988Sjohnlev
516843e1988Sjohnlev flags = clear_int_flag();
517843e1988Sjohnlev hrt = tsc_gethrtimeunscaled() + tsc_sync_tick_delta[CPU->cpu_id];
518843e1988Sjohnlev restore_int_flag(flags);
519843e1988Sjohnlev
520843e1988Sjohnlev return (hrt);
521843e1988Sjohnlev }
522843e1988Sjohnlev
/*
 * TSC Sync Master
 *
 * Typically called on the boot CPU, this attempts to quantify TSC skew between
 * different CPUs. If an appreciable difference is found, gethrtimef will be
 * changed to point to tsc_gethrtime_delta().
 *
 * Calculating skews is precise only when the master and slave TSCs are read
 * simultaneously; however, there is no algorithm that can read both CPUs in
 * perfect simultaneity. The proposed algorithm is an approximate method based
 * on the behaviour of cache management. The slave CPU continuously polls the
 * TSC while reading a global variable updated by the master CPU. The latest
 * TSC reading is saved when the master's update (forced via mfence) reaches
 * visibility on the slave. The master will also take a TSC reading
 * immediately following the mfence.
 *
 * While the delay between cache line invalidation on the slave and mfence
 * completion on the master is not repeatable, the error is heuristically
 * assumed to be 1/4th of the write time recorded by the master. Multiple
 * samples are taken to control for the variance caused by external factors
 * such as bus contention. Each sample set is independent per-CPU to control
 * for differing memory latency on NUMA systems.
 *
 * TSC sync is disabled in the context of virtualization because the CPUs
 * assigned to the guest are virtual CPUs which means the real CPUs on which
 * guest runs keep changing during life time of guest OS. So we would end up
 * calculating TSC skews for a set of CPUs during boot whereas the guest
 * might migrate to a different set of physical CPUs at a later point of
 * time.
 */
void
tsc_sync_master(processorid_t slave)
{
	ulong_t flags, source, min_write_time = ~0UL;
	hrtime_t write_time, mtsc_after, last_delta = 0;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
		return;

	flags = clear_int_flag();
	source = CPU->cpu_id;

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Wait for the slave to signal readiness for this round. */
		while (tsc_sync_go != TSC_SYNC_GO)
			SMT_PAUSE();

		/*
		 * Publish our TSC reading (membar_enter forces visibility),
		 * then immediately re-read the TSC to bound the write time.
		 */
		tsc->master_tsc = tsc_read();
		membar_enter();
		mtsc_after = tsc_read();
		while (tsc_sync_go != TSC_SYNC_DONE)
			SMT_PAUSE();
		write_time =  mtsc_after - tsc->master_tsc;
		/* Keep only the sample with the tightest write window. */
		if (write_time <= min_write_time) {
			hrtime_t tdelta;

			tdelta = tsc->slave_tsc - mtsc_after;
			if (tdelta < 0)
				tdelta = -tdelta;
			/*
			 * If the margin exists, subtract 1/4th of the measured
			 * write time from the master's TSC value. This is an
			 * estimate of how late the mfence completion came
			 * after the slave noticed the cache line change.
			 */
			if (tdelta > (write_time/4)) {
				tdelta = tsc->slave_tsc -
				    (mtsc_after - (write_time/4));
			} else {
				tdelta = tsc->slave_tsc - mtsc_after;
			}
			/*
			 * The slave's delta is expressed relative to the
			 * master's own delta, so skews chain correctly when
			 * the master is not the boot CPU.
			 */
			last_delta = tsc_sync_tick_delta[source] - tdelta;
			tsc_sync_tick_delta[slave] = last_delta;
			min_write_time = write_time;
		}

		/* Reset the rendezvous state for the next iteration. */
		tsc->master_tsc = tsc->slave_tsc = write_time = 0;
		membar_enter();
		tsc_sync_go = TSC_SYNC_STOP;
	}

	/*
	 * Only enable the delta variants of the TSC functions if the measured
	 * skew is greater than the fastest write time.
	 */
	last_delta = (last_delta < 0) ? -last_delta : last_delta;
	if (last_delta > min_write_time) {
		gethrtimef = tsc_gethrtime_delta;
		gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
		tsc_ncpu = NCPU;
	}
	restore_int_flag(flags);
}
6197c478bd9Sstevel@tonic-gate
/*
 * TSC Sync Slave
 *
 * Called by a CPU which has just been onlined. It is expected that the CPU
 * performing the online operation will call tsc_sync_master().
 *
 * Like tsc_sync_master, this logic is skipped on virtualized platforms.
 */
void
tsc_sync_slave(void)
{
	ulong_t flags;
	hrtime_t s1;
	tsc_sync_t *tsc = tscp;
	int cnt;
	int hwtype;

	hwtype = get_hwenv();
	if (!tsc_master_slave_sync_needed || (hwtype & HW_VIRTUAL) != 0)
		return;

	flags = clear_int_flag();

	for (cnt = 0; cnt < SYNC_ITERATIONS; cnt++) {
		/* Re-fill the cache line */
		s1 = tsc->master_tsc;
		membar_enter();
		tsc_sync_go = TSC_SYNC_GO;
		/*
		 * Spin reading the TSC until the master's write to
		 * master_tsc becomes visible; s1 then holds the latest
		 * slave-side TSC sample taken at that moment.
		 */
		do {
			/*
			 * Do not put an SMT_PAUSE here. If the master and
			 * slave are the same hyper-threaded CPU, we want the
			 * master to yield as quickly as possible to the slave.
			 */
			s1 = tsc_read();
		} while (tsc->master_tsc == 0);
		tsc->slave_tsc = s1;
		membar_enter();
		tsc_sync_go = TSC_SYNC_DONE;

		/* Wait for the master to consume this round's sample. */
		while (tsc_sync_go != TSC_SYNC_STOP)
			SMT_PAUSE();
	}

	restore_int_flag(flags);
}
6667c478bd9Sstevel@tonic-gate
/*
 * Called once per second on a CPU from the cyclic subsystem's
 * CY_HIGH_LEVEL interrupt. (No longer just cpu0-only)
 *
 * Advances tsc_hrtime_base by the number of TSC ticks elapsed since the
 * previous tick, guarding against a TSC that has jumped backwards (e.g.
 * across a suspend/resume cycle).
 */
void
tsc_tick(void)
{
	hrtime_t now, delta;
	ushort_t spl;

	/*
	 * Before we set the new variables, we set the shadow values.  This
	 * allows for lock free operation in dtrace_gethrtime().
	 */
	lock_set_spl((lock_t *)&shadow_hres_lock + HRES_LOCK_OFFSET,
	    ipltospl(CBE_HIGH_PIL), &spl);

	shadow_tsc_hrtime_base = tsc_hrtime_base;
	shadow_tsc_last = tsc_last;
	shadow_nsec_scale = nsec_scale;

	/* Odd/even transitions of the lock word delimit the update. */
	shadow_hres_lock++;
	splx(spl);

	CLOCK_LOCK(&spl);

	now = tsc_read();

	/* Fold in this CPU's sync delta when the delta variants are active. */
	if (gethrtimef == tsc_gethrtime_delta)
		now += tsc_sync_tick_delta[CPU->cpu_id];

	if (now < tsc_last) {
		/*
		 * The TSC has just jumped into the past.  We assume that
		 * this is due to a suspend/resume cycle, and we're going
		 * to use the _current_ value of TSC as the delta.  This
		 * will keep tsc_hrtime_base correct.  We're also going to
		 * assume that rate of tsc does not change after a suspend
		 * resume (i.e nsec_scale remains the same).
		 */
		delta = now;
		delta = tsc_protect(delta);
		tsc_last_jumped += tsc_last;
		tsc_jumped = 1;
	} else {
		/*
		 * Determine the number of TSC ticks since the last clock
		 * tick, and add that to the hrtime base.
		 */
		delta = now - tsc_last;
	}

	TSC_CONVERT_AND_ADD(delta, tsc_hrtime_base, nsec_scale);
	tsc_last = now;

	CLOCK_UNLOCK(spl);
}
7247c478bd9Sstevel@tonic-gate
/*
 * Initialize TSC-based high-resolution timekeeping: derive the ns<->tick
 * scaling factors from the measured CPU frequency, install the TSC-backed
 * gethrtime function pointers, and allocate the page used by the
 * master/slave TSC sync protocol.
 */
void
tsc_hrtimeinit(uint64_t cpu_freq_hz)
{
	extern int gethrtime_hires;
	longlong_t tsc;
	ulong_t flags;

	/*
	 * cpu_freq_hz is the measured cpu frequency in hertz
	 */

	/*
	 * We can't accommodate CPUs slower than 31.25 MHz.
	 */
	ASSERT(cpu_freq_hz > NANOSEC / (1 << NSEC_SHIFT));
	nsec_scale =
	    (uint_t)(((uint64_t)NANOSEC << (32 - NSEC_SHIFT)) / cpu_freq_hz);
	nsec_unscale =
	    (uint_t)(((uint64_t)cpu_freq_hz << (32 - NSEC_SHIFT)) / NANOSEC);

	/*
	 * Measure the cost of one tsc_gethrtime() call with interrupts
	 * blocked; the result bounds the expected delta between reads.
	 */
	flags = clear_int_flag();
	tsc = tsc_read();
	(void) tsc_gethrtime();
	tsc_max_delta = tsc_read() - tsc;
	restore_int_flag(flags);
	gethrtimef = tsc_gethrtime;
	gethrtimeunscaledf = tsc_gethrtimeunscaled;
	scalehrtimef = tsc_scalehrtime;
	unscalehrtimef = tsc_unscalehrtime;
	hrtime_tick = tsc_tick;
	gethrtime_hires = 1;
	/*
	 * Being part of the comm page, tsc_ncpu communicates the published
	 * length of the tsc_sync_tick_delta array.  This is kept zeroed to
	 * ignore the absent delta data while the TSCs are synced.
	 */
	tsc_ncpu = 0;
	/*
	 * Allocate memory for the structure used in the tsc sync logic.
	 * This structure should be aligned on a multiple of cache line size.
	 */
	tscp = kmem_zalloc(PAGESIZE, KM_SLEEP);

	/*
	 * Convert the TSC resume cap ns value into its unscaled TSC value.
	 * See tsc_gethrtime().
	 */
	if (tsc_resume_cap == 0)
		TSC_CONVERT(tsc_resume_cap_ns, tsc_resume_cap, nsec_unscale);
}
7752df1fe9cSrandyf
7762df1fe9cSrandyf int
get_tsc_ready()7772df1fe9cSrandyf get_tsc_ready()
7782df1fe9cSrandyf {
7792df1fe9cSrandyf return (tsc_ready);
7802df1fe9cSrandyf }
7812df1fe9cSrandyf
7822df1fe9cSrandyf /*
78386cb0be2SPatrick Mooney * Adjust all the deltas by adding the passed value to the array and activate
78486cb0be2SPatrick Mooney * the "delta" versions of the gethrtime functions. It is possible that the
78586cb0be2SPatrick Mooney * adjustment could be negative. Such may occur if the SunOS instance was
78686cb0be2SPatrick Mooney * moved by a virtual manager to a machine with a higher value of TSC.
7872df1fe9cSrandyf */
7882df1fe9cSrandyf void
tsc_adjust_delta(hrtime_t tdelta)7892df1fe9cSrandyf tsc_adjust_delta(hrtime_t tdelta)
7902df1fe9cSrandyf {
7912df1fe9cSrandyf int i;
7922df1fe9cSrandyf
7932df1fe9cSrandyf for (i = 0; i < NCPU; i++) {
7942df1fe9cSrandyf tsc_sync_tick_delta[i] += tdelta;
7952df1fe9cSrandyf }
7962df1fe9cSrandyf
7972df1fe9cSrandyf gethrtimef = tsc_gethrtime_delta;
7982df1fe9cSrandyf gethrtimeunscaledf = tsc_gethrtimeunscaled_delta;
7992428aad8SPatrick Mooney tsc_ncpu = NCPU;
8002df1fe9cSrandyf }
8012df1fe9cSrandyf
/*
 * Functions to manage TSC and high-res time on suspend and resume.
 */

/* tod_ops from "uts/i86pc/io/todpc_subr.c" */
extern tod_ops_t *tod_ops;

/* Snapshots taken in tsc_suspend() and consumed by tsc_resume(). */
static uint64_t tsc_saved_tsc = 0; /* 1 in 2^64 chance this'll screw up! */
static timestruc_t tsc_saved_ts;
static int tsc_needs_resume = 0; /* We only want to do this once. */

/* Tunables controlling the suspend/resume TSC adjustment policy. */
int tsc_delta_onsuspend = 0;	/* apply the delta at suspend rather than resume */
int tsc_adjust_seconds = 1;	/* account for wall-clock seconds spent asleep */
int tsc_suspend_count = 0;	/* number of suspends that adjusted the delta */
int tsc_resume_in_cyclic = 0;
8162df1fe9cSrandyf
/*
 * Take snapshots of the current time and do any other pre-suspend work.
 */
void
tsc_suspend(void)
{
	/*
	 * We need to collect the time at which we suspended here so we know
	 * how much should be added during the resume.  This is called by each
	 * CPU, so reentry must be properly handled.
	 */
	if (tsc_gethrtime_enable) {
		/*
		 * Perform the tsc_read after acquiring the lock to make it as
		 * accurate as possible in the face of contention.
		 */
		mutex_enter(&tod_lock);
		tsc_saved_tsc = tsc_read();
		tsc_saved_ts = TODOP_GET(tod_ops);
		mutex_exit(&tod_lock);
		/* We only want to do this once. */
		if (tsc_needs_resume == 0) {
			if (tsc_delta_onsuspend) {
				tsc_adjust_delta(tsc_saved_tsc);
			} else {
				tsc_adjust_delta(nsec_scale);
			}
			tsc_suspend_count++;
		}
	}

	invalidate_cache();
	tsc_needs_resume = 1;
}
8512df1fe9cSrandyf
/*
 * Restore all timestamp state based on the snapshots taken at suspend time.
 */
void
tsc_resume(void)
{
	/*
	 * We only need to (and want to) do this once.  So let the first
	 * caller handle this (we are locked by the cpu lock), as it
	 * is preferential that we get the earliest sync.
	 */
	if (tsc_needs_resume) {
		/*
		 * If using the TSC, adjust the delta based on how long
		 * we were sleeping (or away).  We also adjust for
		 * migration and a grown TSC.
		 */
		if (tsc_saved_tsc != 0) {
			timestruc_t ts;
			hrtime_t now, sleep_tsc = 0;
			int sleep_sec;
			extern void tsc_tick(void);
			extern uint64_t cpu_freq_hz;

			/* tsc_read() MUST be before TODOP_GET() */
			mutex_enter(&tod_lock);
			now = tsc_read();
			ts = TODOP_GET(tod_ops);
			mutex_exit(&tod_lock);

			/* Compute seconds of sleep time */
			sleep_sec = ts.tv_sec - tsc_saved_ts.tv_sec;

			/*
			 * If the saved sec is less than or equal to
			 * the current ts, then there is likely a
			 * problem with the clock.  Assume at least
			 * one second has passed, so that time goes forward.
			 */
			if (sleep_sec <= 0) {
				sleep_sec = 1;
			}

			/* How many TSC's should have occurred while sleeping */
			if (tsc_adjust_seconds)
				sleep_tsc = sleep_sec * cpu_freq_hz;

			/*
			 * We also want to subtract from the "sleep_tsc"
			 * the current value of tsc_read(), so that our
			 * adjustment accounts for the amount of time we
			 * have been resumed _or_ an adjustment based on
			 * the fact that we didn't actually power off the
			 * CPU (migration is another issue, but _should_
			 * also comply with this calculation).  If the CPU
			 * never powered off, then:
			 *	'now == sleep_tsc + saved_tsc'
			 * and the delta will effectively be "0".
			 */
			sleep_tsc -= now;
			if (tsc_delta_onsuspend) {
				tsc_adjust_delta(sleep_tsc);
			} else {
				tsc_adjust_delta(tsc_saved_tsc + sleep_tsc);
			}
			tsc_saved_tsc = 0;

			/* Immediately advance the hrtime base with the delta. */
			tsc_tick();
		}
		tsc_needs_resume = 0;
	}

}
925*575694f6SJason King
926*575694f6SJason King static int
tsc_calibrate_cmp(const void * a,const void * b)927*575694f6SJason King tsc_calibrate_cmp(const void *a, const void *b)
928*575694f6SJason King {
929*575694f6SJason King const tsc_calibrate_t * const *a1 = a;
930*575694f6SJason King const tsc_calibrate_t * const *b1 = b;
931*575694f6SJason King const tsc_calibrate_t *l = *a1;
932*575694f6SJason King const tsc_calibrate_t *r = *b1;
933*575694f6SJason King
934*575694f6SJason King /* Sort from highest preference to lowest preference */
935*575694f6SJason King if (l->tscc_preference > r->tscc_preference)
936*575694f6SJason King return (-1);
937*575694f6SJason King if (l->tscc_preference < r->tscc_preference)
938*575694f6SJason King return (1);
939*575694f6SJason King
940*575694f6SJason King /* For equal preference sources, sort alphabetically */
941*575694f6SJason King int c = strcmp(l->tscc_source, r->tscc_source);
942*575694f6SJason King
943*575694f6SJason King if (c < 0)
944*575694f6SJason King return (-1);
945*575694f6SJason King if (c > 0)
946*575694f6SJason King return (1);
947*575694f6SJason King return (0);
948*575694f6SJason King }
949*575694f6SJason King
/* Linker set collecting every compiled-in TSC calibration source. */
SET_DECLARE(tsc_calibration_set, tsc_calibrate_t);
951*575694f6SJason King
952*575694f6SJason King static tsc_calibrate_t *
tsc_calibrate_get_force(const char * source)953*575694f6SJason King tsc_calibrate_get_force(const char *source)
954*575694f6SJason King {
955*575694f6SJason King tsc_calibrate_t **tsccpp;
956*575694f6SJason King
957*575694f6SJason King VERIFY3P(source, !=, NULL);
958*575694f6SJason King
959*575694f6SJason King SET_FOREACH(tsccpp, tsc_calibration_set) {
960*575694f6SJason King tsc_calibrate_t *tsccp = *tsccpp;
961*575694f6SJason King
962*575694f6SJason King if (strcasecmp(source, tsccp->tscc_source) == 0)
963*575694f6SJason King return (tsccp);
964*575694f6SJason King }
965*575694f6SJason King
966*575694f6SJason King /*
967*575694f6SJason King * If an operator explicitly gave a TSC value and we didn't find it,
968*575694f6SJason King * we should let them know.
969*575694f6SJason King */
970*575694f6SJason King cmn_err(CE_NOTE,
971*575694f6SJason King "Explicit TSC calibration source '%s' not found; using default",
972*575694f6SJason King source);
973*575694f6SJason King
974*575694f6SJason King return (NULL);
975*575694f6SJason King }
976*575694f6SJason King
977*575694f6SJason King /*
978*575694f6SJason King * As described in tscc_pit.c, as an intertim measure as we transition to
979*575694f6SJason King * alternate calibration sources besides the PIT, we still want to gather
980*575694f6SJason King * what the values would have been had we used the PIT. Therefore, if we're
981*575694f6SJason King * using a source other than the PIT, we explicitly run the PIT calibration
982*575694f6SJason King * which will store the TSC frequency as measured by the PIT for the
983*575694f6SJason King * benefit of the APIC code (as well as any potential diagnostics).
984*575694f6SJason King */
985*575694f6SJason King static void
tsc_pit_also(void)986*575694f6SJason King tsc_pit_also(void)
987*575694f6SJason King {
988*575694f6SJason King tsc_calibrate_t *pit = tsc_calibrate_get_force("PIT");
989*575694f6SJason King uint64_t dummy;
990*575694f6SJason King
991*575694f6SJason King /* We should always have the PIT as a possible calibration source */
992*575694f6SJason King VERIFY3P(pit, !=, NULL);
993*575694f6SJason King
994*575694f6SJason King /* If we used the PIT to calibrate, we don't need to run again */
995*575694f6SJason King if (tsc_calibration_source == pit)
996*575694f6SJason King return;
997*575694f6SJason King
998*575694f6SJason King /*
999*575694f6SJason King * Since we're not using the PIT as the actual TSC calibration source,
1000*575694f6SJason King * we don't care about the results or saving the result -- tscc_pit.c
1001*575694f6SJason King * saves the frequency in a global for the benefit of the APIC code.
1002*575694f6SJason King */
1003*575694f6SJason King (void) pit->tscc_calibrate(&dummy);
1004*575694f6SJason King }
1005*575694f6SJason King
1006*575694f6SJason King uint64_t
tsc_calibrate(void)1007*575694f6SJason King tsc_calibrate(void)
1008*575694f6SJason King {
1009*575694f6SJason King tsc_calibrate_t **tsccpp, *force;
1010*575694f6SJason King size_t tsc_set_size;
1011*575694f6SJason King int tsc_name_len;
1012*575694f6SJason King
1013*575694f6SJason King /*
1014*575694f6SJason King * Every x86 system since the Pentium has TSC support. Since we
1015*575694f6SJason King * only support 64-bit x86 systems, there should always be a TSC
1016*575694f6SJason King * present, and something's horribly wrong if it's missing.
1017*575694f6SJason King */
1018*575694f6SJason King if (!is_x86_feature(x86_featureset, X86FSET_TSC))
1019*575694f6SJason King panic("System does not have TSC support");
1020*575694f6SJason King
1021*575694f6SJason King /*
1022*575694f6SJason King * If we already successfully calibrated the TSC, no need to do
1023*575694f6SJason King * it again.
1024*575694f6SJason King */
1025*575694f6SJason King if (tsc_freq > 0)
1026*575694f6SJason King return (tsc_freq);
1027*575694f6SJason King
1028*575694f6SJason King PRM_POINT("Calibrating the TSC...");
1029*575694f6SJason King
1030*575694f6SJason King /*
1031*575694f6SJason King * Allow an operator to explicitly specify a calibration source via
1032*575694f6SJason King * `set tsc_calibration=foo` in the bootloader or
1033*575694f6SJason King * `set tsc_calibration="foo"` in /etc/system (preferring a bootloader
1034*575694f6SJason King * supplied value over /etc/system).
1035*575694f6SJason King *
1036*575694f6SJason King * If no source is given, or the specified source is not found, we
1037*575694f6SJason King * fallback to trying all of the known sources in order by preference
1038*575694f6SJason King * (high preference value to low preference value) until one succeeds.
1039*575694f6SJason King */
1040*575694f6SJason King tsc_name_len = BOP_GETPROPLEN(bootops, "tsc_calibration");
1041*575694f6SJason King if (tsc_name_len > 0) {
1042*575694f6SJason King /* Overwrite any /etc/system supplied value */
1043*575694f6SJason King if (tsc_calibration != NULL) {
1044*575694f6SJason King size_t len = strlen(tsc_calibration) + 1;
1045*575694f6SJason King
1046*575694f6SJason King kobj_free_string(tsc_calibration, len);
1047*575694f6SJason King }
1048*575694f6SJason King
1049*575694f6SJason King tsc_calibration = kmem_zalloc(tsc_name_len + 1, KM_SLEEP);
1050*575694f6SJason King BOP_GETPROP(bootops, "tsc_calibration", tsc_calibration);
1051*575694f6SJason King }
1052*575694f6SJason King
1053*575694f6SJason King if (tsc_calibration != NULL &&
1054*575694f6SJason King (force = tsc_calibrate_get_force(tsc_calibration)) != NULL) {
1055*575694f6SJason King if (tsc_name_len > 0) {
1056*575694f6SJason King PRM_POINT("Forcing bootloader specified TSC calibration"
1057*575694f6SJason King " source");
1058*575694f6SJason King } else {
1059*575694f6SJason King PRM_POINT("Forcing /etc/system specified TSC "
1060*575694f6SJason King "calibration source");
1061*575694f6SJason King }
1062*575694f6SJason King PRM_DEBUGS(force->tscc_source);
1063*575694f6SJason King
1064*575694f6SJason King if (!force->tscc_calibrate(&tsc_freq))
1065*575694f6SJason King panic("Failed to calibrate the TSC");
1066*575694f6SJason King
1067*575694f6SJason King tsc_calibration_source = force;
1068*575694f6SJason King
1069*575694f6SJason King /*
1070*575694f6SJason King * We've saved the tsc_calibration_t that matched the value
1071*575694f6SJason King * of tsc_calibration at this point, so we can release the
1072*575694f6SJason King * memory for the value now.
1073*575694f6SJason King */
1074*575694f6SJason King if (tsc_name_len > 0) {
1075*575694f6SJason King kmem_free(tsc_calibration, tsc_name_len + 1);
1076*575694f6SJason King } else if (tsc_calibration != NULL) {
1077*575694f6SJason King size_t len = strlen(tsc_calibration) + 1;
1078*575694f6SJason King
1079*575694f6SJason King kobj_free_string(tsc_calibration, len);
1080*575694f6SJason King }
1081*575694f6SJason King tsc_calibration = NULL;
1082*575694f6SJason King
1083*575694f6SJason King tsc_pit_also();
1084*575694f6SJason King return (tsc_freq);
1085*575694f6SJason King }
1086*575694f6SJason King
1087*575694f6SJason King /*
1088*575694f6SJason King * While we could sort the set contents in place, we'll make a copy
1089*575694f6SJason King * of the set and avoid modifying the original set.
1090*575694f6SJason King */
1091*575694f6SJason King tsc_set_size = SET_COUNT(tsc_calibration_set) *
1092*575694f6SJason King sizeof (tsc_calibrate_t **);
1093*575694f6SJason King tsccpp = kmem_zalloc(tsc_set_size, KM_SLEEP);
1094*575694f6SJason King bcopy(SET_BEGIN(tsc_calibration_set), tsccpp, tsc_set_size);
1095*575694f6SJason King
1096*575694f6SJason King /*
1097*575694f6SJason King * Sort by preference, highest to lowest
1098*575694f6SJason King */
1099*575694f6SJason King qsort(tsccpp, SET_COUNT(tsc_calibration_set),
1100*575694f6SJason King sizeof (tsc_calibrate_t **), tsc_calibrate_cmp);
1101*575694f6SJason King
1102*575694f6SJason King for (uint_t i = 0; i < SET_COUNT(tsc_calibration_set); i++) {
1103*575694f6SJason King PRM_DEBUGS(tsccpp[i]->tscc_source);
1104*575694f6SJason King if (tsccpp[i]->tscc_calibrate(&tsc_freq)) {
1105*575694f6SJason King VERIFY3U(tsc_freq, >, 0);
1106*575694f6SJason King
1107*575694f6SJason King cmn_err(CE_CONT,
1108*575694f6SJason King "?TSC calibrated using %s; freq is %lu MHz\n",
1109*575694f6SJason King tsccpp[i]->tscc_source, tsc_freq / 1000000);
1110*575694f6SJason King
1111*575694f6SJason King /*
1112*575694f6SJason King * Note that tsccpp is just a (sorted) array of
1113*575694f6SJason King * pointers to the tsc_calibration_t's (from the
1114*575694f6SJason King * linker set). The actual tsc_calibration_t's aren't
1115*575694f6SJason King * kmem_alloc()ed (being part of the linker set), so
1116*575694f6SJason King * it's safe to keep a pointer to the one that was
1117*575694f6SJason King * used for calibration (intended for diagnostic
1118*575694f6SJason King * purposes).
1119*575694f6SJason King */
1120*575694f6SJason King tsc_calibration_source = tsccpp[i];
1121*575694f6SJason King
1122*575694f6SJason King kmem_free(tsccpp, tsc_set_size);
1123*575694f6SJason King tsc_pit_also();
1124*575694f6SJason King return (tsc_freq);
1125*575694f6SJason King }
1126*575694f6SJason King }
1127*575694f6SJason King
1128*575694f6SJason King /*
1129*575694f6SJason King * In case it's useful, we don't free tsccpp -- we're about to panic
1130*575694f6SJason King * anyway.
1131*575694f6SJason King */
1132*575694f6SJason King panic("Failed to calibrate TSC");
1133*575694f6SJason King }
1134*575694f6SJason King
1135*575694f6SJason King uint64_t
tsc_get_freq(void)1136*575694f6SJason King tsc_get_freq(void)
1137*575694f6SJason King {
1138*575694f6SJason King VERIFY(tsc_freq > 0);
1139*575694f6SJason King return (tsc_freq);
1140*575694f6SJason King }
1141