1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
22/*	  All Rights Reserved	*/
23
24/*
25 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
26 * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
27 * Copyright (c) 2016 by Delphix. All rights reserved.
28 */
29
30#include <sys/param.h>
31#include <sys/t_lock.h>
32#include <sys/types.h>
33#include <sys/tuneable.h>
34#include <sys/sysmacros.h>
35#include <sys/systm.h>
36#include <sys/cpuvar.h>
37#include <sys/lgrp.h>
38#include <sys/user.h>
39#include <sys/proc.h>
40#include <sys/callo.h>
41#include <sys/kmem.h>
42#include <sys/var.h>
43#include <sys/cmn_err.h>
44#include <sys/swap.h>
45#include <sys/vmsystm.h>
46#include <sys/class.h>
47#include <sys/time.h>
48#include <sys/debug.h>
49#include <sys/vtrace.h>
50#include <sys/spl.h>
51#include <sys/atomic.h>
52#include <sys/dumphdr.h>
53#include <sys/archsystm.h>
54#include <sys/fs/swapnode.h>
55#include <sys/panic.h>
56#include <sys/disp.h>
57#include <sys/msacct.h>
58#include <sys/mem_cage.h>
59
60#include <vm/page.h>
61#include <vm/anon.h>
62#include <vm/rm.h>
63#include <sys/cyclic.h>
64#include <sys/cpupart.h>
65#include <sys/rctl.h>
66#include <sys/task.h>
67#include <sys/sdt.h>
68#include <sys/ddi_periodic.h>
69#include <sys/random.h>
70#include <sys/modctl.h>
71#include <sys/zone.h>
72
73/*
74 * for NTP support
75 */
76#include <sys/timex.h>
77#include <sys/inttypes.h>
78
79#include <sys/sunddi.h>
80#include <sys/clock_impl.h>
81
82/*
83 * clock() is called straight from the clock cyclic; see clock_init().
84 *
85 * Functions:
86 *	reprime clock
87 *	maintain date
88 *	jab the scheduler
89 */
90
91extern kcondvar_t	fsflush_cv;
92extern sysinfo_t	sysinfo;
93extern vminfo_t	vminfo;
94extern int	idleswtch;	/* flag set while idle in pswtch() */
95extern hrtime_t volatile devinfo_freeze;
96
97/*
98 * high-precision avenrun values.  These are needed to make the
99 * regular avenrun values accurate.
100 */
101static uint64_t hp_avenrun[3];
102int	avenrun[3];		/* FSCALED average run queue lengths */
103time_t	time;	/* time in seconds since 1970 - for compatibility only */
104
105static struct loadavg_s loadavg;
106/*
107 * Phase/frequency-lock loop (PLL/FLL) definitions
108 *
109 * The following variables are read and set by the ntp_adjtime() system
110 * call.
111 *
112 * time_state shows the state of the system clock, with values defined
113 * in the timex.h header file.
114 *
115 * time_status shows the status of the system clock, with bits defined
116 * in the timex.h header file.
117 *
118 * time_offset is used by the PLL/FLL to adjust the system time in small
119 * increments.
120 *
121 * time_constant determines the bandwidth or "stiffness" of the PLL.
122 *
123 * time_tolerance determines maximum frequency error or tolerance of the
124 * CPU clock oscillator and is a property of the architecture; however,
125 * in principle it could change as result of the presence of external
126 * discipline signals, for instance.
127 *
128 * time_precision is usually equal to the kernel tick variable; however,
129 * in cases where a precision clock counter or external clock is
 * available, the resolution can be much less than this and depends on
131 * whether the external clock is working or not.
132 *
133 * time_maxerror is initialized by a ntp_adjtime() call and increased by
134 * the kernel once each second to reflect the maximum error bound
135 * growth.
136 *
137 * time_esterror is set and read by the ntp_adjtime() call, but
138 * otherwise not used by the kernel.
139 */
140int32_t time_state = TIME_OK;	/* clock state */
141int32_t time_status = STA_UNSYNC;	/* clock status bits */
142int32_t time_offset = 0;		/* time offset (us) */
143int32_t time_constant = 0;		/* pll time constant */
144int32_t time_tolerance = MAXFREQ;	/* frequency tolerance (scaled ppm) */
145int32_t time_precision = 1;	/* clock precision (us) */
146int32_t time_maxerror = MAXPHASE;	/* maximum error (us) */
147int32_t time_esterror = MAXPHASE;	/* estimated error (us) */
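
/*
 * Illustrative sketch (not part of this file): a user-level time daemon such
 * as ntpd typically drives the variables above through the ntp_adjtime()
 * system call.  The field and flag names below are from <sys/timex.h>; the
 * values are hypothetical.
 *
 *	struct timex tx;
 *
 *	tx.modes = MOD_OFFSET | MOD_MAXERROR | MOD_ESTERROR;
 *	tx.offset = 250;	(measured offset in us, feeds time_offset)
 *	tx.maxerror = 500000;	(error bound in us, seeds time_maxerror)
 *	tx.esterror = 400;	(estimated error in us, stored in time_esterror)
 *	if (ntp_adjtime(&tx) == TIME_ERROR)
 *		... the clock is not synchronized (STA_UNSYNC is set) ...
 */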
148
149/*
150 * The following variables establish the state of the PLL/FLL and the
151 * residual time and frequency offset of the local clock. The scale
152 * factors are defined in the timex.h header file.
153 *
154 * time_phase and time_freq are the phase increment and the frequency
155 * increment, respectively, of the kernel time variable.
156 *
157 * time_freq is set via ntp_adjtime() from a value stored in a file when
158 * the synchronization daemon is first started. Its value is retrieved
159 * via ntp_adjtime() and written to the file about once per hour by the
160 * daemon.
161 *
162 * time_adj is the adjustment added to the value of tick at each timer
163 * interrupt and is recomputed from time_phase and time_freq at each
164 * seconds rollover.
165 *
166 * time_reftime is the second's portion of the system time at the last
167 * call to ntp_adjtime(). It is used to adjust the time_freq variable
168 * and to increase the time_maxerror as the time since last update
169 * increases.
170 */
171int32_t time_phase = 0;		/* phase offset (scaled us) */
172int32_t time_freq = 0;		/* frequency offset (scaled ppm) */
173int32_t time_adj = 0;		/* tick adjust (scaled 1 / hz) */
174int32_t time_reftime = 0;		/* time at last adjustment (s) */
175
176/*
177 * The scale factors of the following variables are defined in the
178 * timex.h header file.
179 *
180 * pps_time contains the time at each calibration interval, as read by
181 * microtime(). pps_count counts the seconds of the calibration
182 * interval, the duration of which is nominally pps_shift in powers of
183 * two.
184 *
185 * pps_offset is the time offset produced by the time median filter
186 * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
187 * this filter.
188 *
189 * pps_freq is the frequency offset produced by the frequency median
190 * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
191 * by this filter.
192 *
193 * pps_usec is latched from a high resolution counter or external clock
194 * at pps_time. Here we want the hardware counter contents only, not the
195 * contents plus the time_tv.usec as usual.
196 *
197 * pps_valid counts the number of seconds since the last PPS update. It
198 * is used as a watchdog timer to disable the PPS discipline should the
199 * PPS signal be lost.
200 *
201 * pps_glitch counts the number of seconds since the beginning of an
202 * offset burst more than tick/2 from current nominal offset. It is used
203 * mainly to suppress error bursts due to priority conflicts between the
204 * PPS interrupt and timer interrupt.
205 *
206 * pps_intcnt counts the calibration intervals for use in the interval-
207 * adaptation algorithm. It's just too complicated for words.
208 */
209struct timeval pps_time;	/* kernel time at last interval */
210int32_t pps_tf[] = {0, 0, 0};	/* pps time offset median filter (us) */
211int32_t pps_offset = 0;		/* pps time offset (us) */
212int32_t pps_jitter = MAXTIME;	/* time dispersion (jitter) (us) */
213int32_t pps_ff[] = {0, 0, 0};	/* pps frequency offset median filter */
214int32_t pps_freq = 0;		/* frequency offset (scaled ppm) */
215int32_t pps_stabil = MAXFREQ;	/* frequency dispersion (scaled ppm) */
216int32_t pps_usec = 0;		/* microsec counter at last interval */
217int32_t pps_valid = PPS_VALID;	/* pps signal watchdog counter */
218int32_t pps_glitch = 0;		/* pps signal glitch counter */
219int32_t pps_count = 0;		/* calibration interval counter (s) */
220int32_t pps_shift = PPS_SHIFT;	/* interval duration (s) (shift) */
221int32_t pps_intcnt = 0;		/* intervals at current duration */
222
223/*
224 * PPS signal quality monitors
225 *
226 * pps_jitcnt counts the seconds that have been discarded because the
227 * jitter measured by the time median filter exceeds the limit MAXTIME
228 * (100 us).
229 *
230 * pps_calcnt counts the frequency calibration intervals, which are
231 * variable from 4 s to 256 s.
232 *
233 * pps_errcnt counts the calibration intervals which have been discarded
234 * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
235 * calibration interval jitter exceeds two ticks.
236 *
237 * pps_stbcnt counts the calibration intervals that have been discarded
238 * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us).
239 */
240int32_t pps_jitcnt = 0;		/* jitter limit exceeded */
241int32_t pps_calcnt = 0;		/* calibration intervals */
242int32_t pps_errcnt = 0;		/* calibration errors */
243int32_t pps_stbcnt = 0;		/* stability limit exceeded */
244
245kcondvar_t lbolt_cv;
246
247/*
248 * Hybrid lbolt implementation:
249 *
 * The service historically provided by the lbolt and lbolt64 variables has
 * been replaced by the ddi_get_lbolt() and ddi_get_lbolt64() routines, and the
 * original symbols have been removed from the system. The once clock-driven
 * variables are now implemented in an event-driven fashion, backed by
 * gethrtime() coarsened to the appropriate clock resolution. The default
 * event-driven implementation is complemented by a cyclic-driven one, active
 * only during periods of intense activity around the DDI lbolt routines, when
 * an lbolt-specific cyclic is reprogrammed to fire at a clock tick interval to
 * serve consumers of lbolt who rely on the original low cost of consulting a
 * memory position.
259 *
 * The implementation uses the number of calls to these routines and their
 * frequency to determine when to transition from event-driven to cyclic-driven
 * mode and vice versa. These values are kept on a per-CPU basis for
 * scalability reasons and to prevent CPUs from constantly invalidating a single
 * cache line when modifying a global variable. The transition from event to
 * cyclic mode happens once the thresholds are crossed, and activity on any CPU
 * can cause such a transition.
267 *
 * The lbolt_hybrid function pointer is called by ddi_get_lbolt() and
 * ddi_get_lbolt64(), and will point to lbolt_event_driven() or
 * lbolt_cyclic_driven() according to the current mode. When the thresholds
 * are exceeded, lbolt_event_driven() will reprogram the lbolt cyclic to
 * fire at a nsec_per_tick interval and increment an internal variable at
 * each firing. lbolt_hybrid will then point to lbolt_cyclic_driven(), which
 * will simply return the value of that variable. lbolt_cyclic() will attempt
 * to shut itself off at each threshold interval (the sampling period for calls
 * to the DDI lbolt routines) and return to the event-driven mode, but will
 * be prevented from doing so if lbolt_cyclic_driven() is being heavily used.
 *
 * lbolt_bootstrap is used during boot to serve lbolt consumers who don't wait
 * for the cyclic subsystem to be initialized.
 */
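
/*
 * Illustrative sketch of a typical lbolt consumer (not part of this file):
 * a driver polling against a tick-based deadline simply compares lbolt
 * values and never needs to know which of the implementations above is
 * active.  work_done() is hypothetical.
 *
 *	clock_t deadline = ddi_get_lbolt() + drv_usectohz(100000);
 *
 *	while (!work_done()) {
 *		if (ddi_get_lbolt() - deadline > 0)
 *			return (ETIMEDOUT);
 *		delay(1);
 *	}
 *
 * Heavy use of a loop like this is what pushes lbolt into cyclic-driven mode.
 */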
283int64_t lbolt_bootstrap(void);
284int64_t lbolt_event_driven(void);
285int64_t lbolt_cyclic_driven(void);
286int64_t (*lbolt_hybrid)(void) = lbolt_bootstrap;
287uint_t lbolt_ev_to_cyclic(caddr_t, caddr_t);
288
289/*
290 * lbolt's cyclic, installed by clock_init().
291 */
292static void lbolt_cyclic(void);
293
294/*
 * Tunable to keep lbolt in cyclic-driven mode. This will prevent the system
 * from switching back to event-driven mode once it reaches cyclic mode.
297 */
298static boolean_t lbolt_cyc_only = B_FALSE;
299
300/*
301 * Cache aligned, per CPU structure with lbolt usage statistics.
302 */
303static lbolt_cpu_t *lb_cpu;
304
305/*
306 * Single, cache aligned, structure with all the information required by
307 * the lbolt implementation.
308 */
309lbolt_info_t *lb_info;
310
311
312int one_sec = 1; /* turned on once every second */
313static int fsflushcnt;	/* counter for t_fsflushr */
314int	dosynctodr = 1;	/* patchable; enable/disable sync to TOD chip */
315int	tod_needsync = 0;	/* need to sync tod chip with software time */
316static int tod_broken = 0;	/* clock chip doesn't work */
317time_t	boot_time = 0;		/* Boot time in seconds since 1970 */
318cyclic_id_t clock_cyclic;	/* clock()'s cyclic_id */
319cyclic_id_t deadman_cyclic;	/* deadman()'s cyclic_id */
320
321extern void clock_tick_schedule(int);
322extern void set_freemem(void);
323extern void pageout_deadman(void);
324
325static int lgrp_ticks;		/* counter to schedule lgrp load calcs */
326
327/*
328 * for tod fault detection
329 */
330#define	TOD_REF_FREQ		((longlong_t)(NANOSEC))
331#define	TOD_STALL_THRESHOLD	(TOD_REF_FREQ * 3 / 2)
332#define	TOD_JUMP_THRESHOLD	(TOD_REF_FREQ / 2)
333#define	TOD_FILTER_N		4
334#define	TOD_FILTER_SETTLE	(4 * TOD_FILTER_N)
335static enum tod_fault_type tod_faulted = TOD_NOFAULT;
336
337static int tod_status_flag = 0;		/* used by tod_validate() */
338
339static hrtime_t prev_set_tick = 0;	/* gethrtime() prior to tod_set() */
340static time_t prev_set_tod = 0;		/* tv_sec value passed to tod_set() */
341
342/* patchable via /etc/system */
343int tod_validate_enable = 1;
344
345/* Diagnose/Limit messages about delay(9F) called from interrupt context */
346int			delay_from_interrupt_diagnose = 0;
347volatile uint32_t	delay_from_interrupt_msg = 20;
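
/*
 * The patchable variables above can be set at boot time from /etc/system,
 * for example (illustrative values):
 *
 *	set tod_validate_enable = 0
 *	set delay_from_interrupt_diagnose = 1
 */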
348
349/*
350 * On non-SPARC systems, TOD validation must be deferred until gethrtime
351 * returns non-zero values (after mach_clkinit's execution).
352 * On SPARC systems, it must be deferred until after hrtime_base
353 * and hres_last_tick are set (in the first invocation of hres_tick).
354 * Since in both cases the prerequisites occur before the invocation of
355 * tod_get() in clock(), the deferment is lifted there.
356 */
357static boolean_t tod_validate_deferred = B_TRUE;
358
359/*
360 * tod_fault_table[] must be aligned with
361 * enum tod_fault_type in systm.h
362 */
363static char *tod_fault_table[] = {
364	"Reversed",			/* TOD_REVERSED */
365	"Stalled",			/* TOD_STALLED */
366	"Jumped",			/* TOD_JUMPED */
367	"Changed in Clock Rate",	/* TOD_RATECHANGED */
368	"Is Read-Only"			/* TOD_RDONLY */
369	/*
370	 * no strings needed for TOD_NOFAULT
371	 */
372};
373
374/*
375 * test hook for tod broken detection in tod_validate
376 */
377int tod_unit_test = 0;
378time_t tod_test_injector;
379
380#define	CLOCK_ADJ_HIST_SIZE	4
381
382static int	adj_hist_entry;
383
384int64_t clock_adj_hist[CLOCK_ADJ_HIST_SIZE];
385
386static void calcloadavg(int, uint64_t *);
387static int genloadavg(struct loadavg_s *);
388static void loadavg_update();
389
390void (*cmm_clock_callout)() = NULL;
391void (*cpucaps_clock_callout)() = NULL;
392
393extern clock_t clock_tick_proc_max;
394
395static int64_t deadman_counter = 0;
396
397static void
398clock(void)
399{
400	kthread_t	*t;
401	uint_t	nrunnable;
402	uint_t	w_io;
403	cpu_t	*cp;
404	cpupart_t *cpupart;
405	void	(*funcp)();
406	int32_t ltemp;
407	int64_t lltemp;
408	int s;
409	int do_lgrp_load;
410	int i;
411	clock_t now = LBOLT_NO_ACCOUNT;	/* current tick */
412
413	if (panicstr)
414		return;
415
416	/*
	 * Make sure that 'freemem' does not drift too far from the truth.
418	 */
419	set_freemem();
420
421
	/*
	 * Before the section which is repeated is executed, we do
	 * the time delta processing which occurs every clock tick.
	 *
	 * There is additional processing which happens every time
	 * the nanosecond counter rolls over; it is described
	 * below, in the section that begins with: if (one_sec)
	 *
	 * This section marks the beginning of the precision-kernel
	 * code fragment.
	 *
	 * First, compute the phase adjustment. If the low-order bits
	 * (time_phase) of the update overflow, bump the higher order
	 * bits (time_update).
	 */
437	time_phase += time_adj;
438	if (time_phase <= -FINEUSEC) {
439		ltemp = -time_phase / SCALE_PHASE;
440		time_phase += ltemp * SCALE_PHASE;
441		s = hr_clock_lock();
442		timedelta -= ltemp * (NANOSEC/MICROSEC);
443		hr_clock_unlock(s);
444	} else if (time_phase >= FINEUSEC) {
445		ltemp = time_phase / SCALE_PHASE;
446		time_phase -= ltemp * SCALE_PHASE;
447		s = hr_clock_lock();
448		timedelta += ltemp * (NANOSEC/MICROSEC);
449		hr_clock_unlock(s);
450	}
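
	/*
	 * Worked example (illustrative numbers): if the accumulated
	 * time_phase has grown past FINEUSEC and ltemp works out to 5,
	 * then 5 us worth of scaled phase is drained from time_phase and
	 * timedelta grows by 5 * (NANOSEC/MICROSEC) = 5000 ns, which the
	 * high-resolution clock code then applies gradually to hrestime.
	 */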
451
452	/*
453	 * End of precision-kernel code fragment which is processed
454	 * every timer interrupt.
455	 *
456	 * Continue with the interrupt processing as scheduled.
457	 */
458	/*
459	 * Count the number of runnable threads and the number waiting
460	 * for some form of I/O to complete -- gets added to
	 * sysinfo.waiting.  To know the state of the system, we must add
462	 * wait counts from all CPUs.  Also add up the per-partition
463	 * statistics.
464	 */
465	w_io = 0;
466	nrunnable = 0;
467
468	/*
469	 * keep track of when to update lgrp/part loads
470	 */
471
472	do_lgrp_load = 0;
473	if (lgrp_ticks++ >= hz / 10) {
474		lgrp_ticks = 0;
475		do_lgrp_load = 1;
476	}
477
478	if (one_sec) {
479		loadavg_update();
480		deadman_counter++;
481		pageout_deadman();
482	}
483
484	/*
485	 * First count the threads waiting on kpreempt queues in each
486	 * CPU partition.
487	 */
488
489	cpupart = cp_list_head;
490	do {
491		uint_t cpupart_nrunnable = cpupart->cp_kp_queue.disp_nrunnable;
492
493		cpupart->cp_updates++;
494		nrunnable += cpupart_nrunnable;
495		cpupart->cp_nrunnable_cum += cpupart_nrunnable;
496		if (one_sec) {
497			cpupart->cp_nrunning = 0;
498			cpupart->cp_nrunnable = cpupart_nrunnable;
499		}
500	} while ((cpupart = cpupart->cp_next) != cp_list_head);
501
502
503	/* Now count the per-CPU statistics. */
504	cp = cpu_list;
505	do {
506		uint_t cpu_nrunnable = cp->cpu_disp->disp_nrunnable;
507
508		nrunnable += cpu_nrunnable;
509		cpupart = cp->cpu_part;
510		cpupart->cp_nrunnable_cum += cpu_nrunnable;
511		if (one_sec) {
512			cpupart->cp_nrunnable += cpu_nrunnable;
513			/*
514			 * Update user, system, and idle cpu times.
515			 */
516			cpupart->cp_nrunning++;
517			/*
518			 * w_io is used to update sysinfo.waiting during
519			 * one_second processing below.  Only gather w_io
520			 * information when we walk the list of cpus if we're
521			 * going to perform one_second processing.
522			 */
523			w_io += CPU_STATS(cp, sys.iowait);
524		}
525
526		if (one_sec && (cp->cpu_flags & CPU_EXISTS)) {
527			int i, load, change;
528			hrtime_t intracct, intrused;
529			const hrtime_t maxnsec = 1000000000;
530			const int precision = 100;
531
532			/*
533			 * Estimate interrupt load on this cpu each second.
534			 * Computes cpu_intrload as %utilization (0-99).
535			 */
536
537			/* add up interrupt time from all micro states */
538			for (intracct = 0, i = 0; i < NCMSTATES; i++)
539				intracct += cp->cpu_intracct[i];
540			scalehrtime(&intracct);
541
542			/* compute nsec used in the past second */
543			intrused = intracct - cp->cpu_intrlast;
544			cp->cpu_intrlast = intracct;
545
546			/* limit the value for safety (and the first pass) */
547			if (intrused >= maxnsec)
548				intrused = maxnsec - 1;
549
550			/* calculate %time in interrupt */
551			load = (precision * intrused) / maxnsec;
552			ASSERT(load >= 0 && load < precision);
553			change = cp->cpu_intrload - load;
554
555			/* jump to new max, or decay the old max */
556			if (change < 0)
557				cp->cpu_intrload = load;
558			else if (change > 0)
559				cp->cpu_intrload -= (change + 3) / 4;
560
561			DTRACE_PROBE3(cpu_intrload,
562			    cpu_t *, cp,
563			    hrtime_t, intracct,
564			    hrtime_t, intrused);
565		}
566
567		if (do_lgrp_load &&
568		    (cp->cpu_flags & CPU_EXISTS)) {
			/*
			 * When updating the lgroup's load average,
			 * account for the thread running on the CPU.
			 * If the CPU is the current one, then we need
			 * to account for the underlying thread which
			 * got the clock interrupt, not the thread that is
			 * handling the interrupt and calculating the load
			 * average.
			 */
578			t = cp->cpu_thread;
579			if (CPU == cp)
580				t = t->t_intr;
581
			/*
			 * Account for the load average for this thread if
			 * it isn't the idle thread, or if it is on the
			 * interrupt stack and not the current CPU handling
			 * the clock interrupt.
			 */
588			if ((t && t != cp->cpu_idle_thread) || (CPU != cp &&
589			    CPU_ON_INTR(cp))) {
590				if (t->t_lpl == cp->cpu_lpl) {
591					/* local thread */
592					cpu_nrunnable++;
593				} else {
594					/*
595					 * This is a remote thread, charge it
596					 * against its home lgroup.  Note that
597					 * we notice that a thread is remote
598					 * only if it's currently executing.
599					 * This is a reasonable approximation,
600					 * since queued remote threads are rare.
601					 * Note also that if we didn't charge
602					 * it to its home lgroup, remote
603					 * execution would often make a system
604					 * appear balanced even though it was
605					 * not, and thread placement/migration
606					 * would often not be done correctly.
607					 */
608					lgrp_loadavg(t->t_lpl,
609					    LGRP_LOADAVG_IN_THREAD_MAX, 0);
610				}
611			}
612			lgrp_loadavg(cp->cpu_lpl,
613			    cpu_nrunnable * LGRP_LOADAVG_IN_THREAD_MAX, 1);
614		}
615	} while ((cp = cp->cpu_next) != cpu_list);
616
617	clock_tick_schedule(one_sec);
618
	/*
	 * Check for a callout that needs to be called from the clock
	 * thread to support the membership protocol in a clustered
	 * system.  Copy the function pointer so that we can reset
	 * this to NULL if needed.
	 */
625	if ((funcp = cmm_clock_callout) != NULL)
626		(*funcp)();
627
628	if ((funcp = cpucaps_clock_callout) != NULL)
629		(*funcp)();
630
631	/*
632	 * Wakeup the cageout thread waiters once per second.
633	 */
634	if (one_sec)
635		kcage_tick();
636
637	if (one_sec) {
638
639		int drift, absdrift;
640		timestruc_t tod;
641		int s;
642
643		/*
644		 * Beginning of precision-kernel code fragment executed
645		 * every second.
646		 *
647		 * On rollover of the second the phase adjustment to be
648		 * used for the next second is calculated.  Also, the
649		 * maximum error is increased by the tolerance.  If the
650		 * PPS frequency discipline code is present, the phase is
651		 * increased to compensate for the CPU clock oscillator
652		 * frequency error.
653		 *
654		 * On a 32-bit machine and given parameters in the timex.h
655		 * header file, the maximum phase adjustment is +-512 ms
656		 * and maximum frequency offset is (a tad less than)
657		 * +-512 ppm. On a 64-bit machine, you shouldn't need to ask.
658		 */
659		time_maxerror += time_tolerance / SCALE_USEC;
660
661		/*
662		 * Leap second processing. If in leap-insert state at
663		 * the end of the day, the system clock is set back one
664		 * second; if in leap-delete state, the system clock is
665		 * set ahead one second. The microtime() routine or
		 * external clock driver will ensure that reported time
667		 * is always monotonic. The ugly divides should be
668		 * replaced.
669		 */
670		switch (time_state) {
671
672		case TIME_OK:
673			if (time_status & STA_INS)
674				time_state = TIME_INS;
675			else if (time_status & STA_DEL)
676				time_state = TIME_DEL;
677			break;
678
679		case TIME_INS:
680			if (hrestime.tv_sec % 86400 == 0) {
681				s = hr_clock_lock();
682				hrestime.tv_sec--;
683				hr_clock_unlock(s);
684				time_state = TIME_OOP;
685			}
686			break;
687
688		case TIME_DEL:
689			if ((hrestime.tv_sec + 1) % 86400 == 0) {
690				s = hr_clock_lock();
691				hrestime.tv_sec++;
692				hr_clock_unlock(s);
693				time_state = TIME_WAIT;
694			}
695			break;
696
697		case TIME_OOP:
698			time_state = TIME_WAIT;
699			break;
700
701		case TIME_WAIT:
702			if (!(time_status & (STA_INS | STA_DEL)))
703				time_state = TIME_OK;
704		default:
705			break;
706		}
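
		/*
		 * Worked example: with STA_INS set, time_state moves to
		 * TIME_INS; at the next midnight UTC (hrestime.tv_sec a
		 * multiple of 86400) hrestime is stepped back one second,
		 * so 23:59:59 repeats, and the state then moves through
		 * TIME_OOP and TIME_WAIT back to TIME_OK once the time
		 * daemon clears STA_INS.
		 */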
707
708		/*
709		 * Compute the phase adjustment for the next second. In
710		 * PLL mode, the offset is reduced by a fixed factor
711		 * times the time constant. In FLL mode the offset is
712		 * used directly. In either mode, the maximum phase
713		 * adjustment for each second is clamped so as to spread
714		 * the adjustment over not more than the number of
715		 * seconds between updates.
716		 */
717		if (time_offset == 0)
718			time_adj = 0;
719		else if (time_offset < 0) {
720			lltemp = -time_offset;
721			if (!(time_status & STA_FLL)) {
722				if ((1 << time_constant) >= SCALE_KG)
723					lltemp *= (1 << time_constant) /
724					    SCALE_KG;
725				else
726					lltemp = (lltemp / SCALE_KG) >>
727					    time_constant;
728			}
729			if (lltemp > (MAXPHASE / MINSEC) * SCALE_UPDATE)
730				lltemp = (MAXPHASE / MINSEC) * SCALE_UPDATE;
731			time_offset += lltemp;
732			time_adj = -(lltemp * SCALE_PHASE) / hz / SCALE_UPDATE;
733		} else {
734			lltemp = time_offset;
735			if (!(time_status & STA_FLL)) {
736				if ((1 << time_constant) >= SCALE_KG)
737					lltemp *= (1 << time_constant) /
738					    SCALE_KG;
739				else
740					lltemp = (lltemp / SCALE_KG) >>
741					    time_constant;
742			}
743			if (lltemp > (MAXPHASE / MINSEC) * SCALE_UPDATE)
744				lltemp = (MAXPHASE / MINSEC) * SCALE_UPDATE;
745			time_offset -= lltemp;
746			time_adj = (lltemp * SCALE_PHASE) / hz / SCALE_UPDATE;
747		}
748
749		/*
750		 * Compute the frequency estimate and additional phase
751		 * adjustment due to frequency error for the next
752		 * second. When the PPS signal is engaged, gnaw on the
753		 * watchdog counter and update the frequency computed by
754		 * the pll and the PPS signal.
755		 */
756		pps_valid++;
757		if (pps_valid == PPS_VALID) {
758			pps_jitter = MAXTIME;
759			pps_stabil = MAXFREQ;
760			time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
761			    STA_PPSWANDER | STA_PPSERROR);
762		}
763		lltemp = time_freq + pps_freq;
764
765		if (lltemp)
766			time_adj += (lltemp * SCALE_PHASE) / (SCALE_USEC * hz);
767
768		/*
769		 * End of precision kernel-code fragment
770		 *
771		 * The section below should be modified if we are planning
772		 * to use NTP for synchronization.
773		 *
774		 * Note: the clock synchronization code now assumes
775		 * the following:
776		 *   - if dosynctodr is 1, then compute the drift between
777		 *	the tod chip and software time and adjust one or
778		 *	the other depending on the circumstances
779		 *
780		 *   - if dosynctodr is 0, then the tod chip is independent
781		 *	of the software clock and should not be adjusted,
		 *	but allowed to free run.  This allows NTP to sync
		 *	hrestime without any interference from the tod chip.
784		 */
785
786		tod_validate_deferred = B_FALSE;
787		mutex_enter(&tod_lock);
788		tod = tod_get();
789		drift = tod.tv_sec - hrestime.tv_sec;
790		absdrift = (drift >= 0) ? drift : -drift;
791		if (tod_needsync || absdrift > 1) {
792			int s;
793			if (absdrift > 2) {
794				if (!tod_broken && tod_faulted == TOD_NOFAULT) {
795					s = hr_clock_lock();
796					hrestime = tod;
797					membar_enter();	/* hrestime visible */
798					timedelta = 0;
799					timechanged++;
800					tod_needsync = 0;
801					hr_clock_unlock(s);
802					callout_hrestime();
803
804				}
805			} else {
806				if (tod_needsync || !dosynctodr) {
807					gethrestime(&tod);
808					tod_set(tod);
809					s = hr_clock_lock();
810					if (timedelta == 0)
811						tod_needsync = 0;
812					hr_clock_unlock(s);
813				} else {
814					/*
815					 * If the drift is 2 seconds on the
816					 * money, then the TOD is adjusting
817					 * the clock;  record that.
818					 */
819					clock_adj_hist[adj_hist_entry++ %
820					    CLOCK_ADJ_HIST_SIZE] = now;
821					s = hr_clock_lock();
					timedelta = (int64_t)drift * NANOSEC;
823					hr_clock_unlock(s);
824				}
825			}
826		}
827		one_sec = 0;
828		time = gethrestime_sec();  /* for crusty old kmem readers */
829		mutex_exit(&tod_lock);
830
831		/*
832		 * Some drivers still depend on this... XXX
833		 */
834		cv_broadcast(&lbolt_cv);
835
836		vminfo.freemem += freemem;
837		{
838			pgcnt_t maxswap, resv, free;
839			pgcnt_t avail =
840			    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
841
842			maxswap = k_anoninfo.ani_mem_resv +
			    k_anoninfo.ani_max + avail;
844			/* Update ani_free */
845			set_anoninfo();
846			free = k_anoninfo.ani_free + avail;
847			resv = k_anoninfo.ani_phys_resv +
848			    k_anoninfo.ani_mem_resv;
849
850			vminfo.swap_resv += resv;
851			/* number of reserved and allocated pages */
852#ifdef	DEBUG
853			if (maxswap < free)
854				cmn_err(CE_WARN, "clock: maxswap < free");
855			if (maxswap < resv)
856				cmn_err(CE_WARN, "clock: maxswap < resv");
857#endif
858			vminfo.swap_alloc += maxswap - free;
859			vminfo.swap_avail += maxswap - resv;
860			vminfo.swap_free += free;
861		}
862		vminfo.updates++;
863		if (nrunnable) {
864			sysinfo.runque += nrunnable;
865			sysinfo.runocc++;
866		}
867		if (nswapped) {
868			sysinfo.swpque += nswapped;
869			sysinfo.swpocc++;
870		}
871		sysinfo.waiting += w_io;
872		sysinfo.updates++;
873
874		/*
875		 * Wake up fsflush to write out DELWRI
876		 * buffers, dirty pages and other cached
877		 * administrative data, e.g. inodes.
878		 */
879		if (--fsflushcnt <= 0) {
880			fsflushcnt = tune.t_fsflushr;
881			cv_signal(&fsflush_cv);
882		}
883
884		vmmeter();
885		calcloadavg(genloadavg(&loadavg), hp_avenrun);
886		for (i = 0; i < 3; i++)
			/*
			 * At the moment avenrun[] can only hold 31
			 * bits of load average as it is a signed
			 * int in the API. We need to ensure that
			 * hp_avenrun[i] >> (16 - FSHIFT) will not be
			 * too large. If it is, we put the largest value
			 * that we can use into avenrun[i]. This is
			 * kludgey, but about all we can do until
			 * avenrun[] is declared as an array of uint64_t.
			 */
897			if (hp_avenrun[i] < ((uint64_t)1<<(31+16-FSHIFT)))
898				avenrun[i] = (int32_t)(hp_avenrun[i] >>
899				    (16 - FSHIFT));
900			else
901				avenrun[i] = 0x7fffffff;
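
		/*
		 * Scaling note (worked example): hp_avenrun[] carries the
		 * load average as a 2^16 fixed-point value, while avenrun[]
		 * is FSCALE (2^FSHIFT) scaled, hence the right shift by
		 * (16 - FSHIFT).  A steady load of one runnable thread shows
		 * up as hp_avenrun[i] == (1 << 16) and avenrun[i] == FSCALE.
		 */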
902
903		cpupart = cp_list_head;
904		do {
905			calcloadavg(genloadavg(&cpupart->cp_loadavg),
906			    cpupart->cp_hp_avenrun);
907		} while ((cpupart = cpupart->cp_next) != cp_list_head);
908
909		/*
910		 * Wake up the swapper thread if necessary.
911		 */
912		if (runin ||
913		    (runout && (avefree < desfree || wake_sched_sec))) {
914			t = &t0;
915			thread_lock(t);
916			if (t->t_state == TS_STOPPED) {
917				runin = runout = 0;
918				wake_sched_sec = 0;
919				t->t_whystop = 0;
920				t->t_whatstop = 0;
921				t->t_schedflag &= ~TS_ALLSTART;
922				THREAD_TRANSITION(t);
923				setfrontdq(t);
924			}
925			thread_unlock(t);
926		}
927	}
928
929	/*
930	 * Wake up the swapper if any high priority swapped-out threads
	 * became runnable during the last tick.
932	 */
933	if (wake_sched) {
934		t = &t0;
935		thread_lock(t);
936		if (t->t_state == TS_STOPPED) {
937			runin = runout = 0;
938			wake_sched = 0;
939			t->t_whystop = 0;
940			t->t_whatstop = 0;
941			t->t_schedflag &= ~TS_ALLSTART;
942			THREAD_TRANSITION(t);
943			setfrontdq(t);
944		}
945		thread_unlock(t);
946	}
947}
948
949void
950clock_init(void)
951{
952	cyc_handler_t clk_hdlr, lbolt_hdlr;
953	cyc_time_t clk_when, lbolt_when;
954	int i, sz;
955	intptr_t buf;
956
957	/*
958	 * Setup handler and timer for the clock cyclic.
959	 */
960	clk_hdlr.cyh_func = (cyc_func_t)clock;
961	clk_hdlr.cyh_level = CY_LOCK_LEVEL;
962	clk_hdlr.cyh_arg = NULL;
963
964	clk_when.cyt_when = 0;
965	clk_when.cyt_interval = nsec_per_tick;
966
967	/*
	 * The lbolt cyclic will be reprogrammed to fire at a nsec_per_tick
969	 * interval to satisfy performance needs of the DDI lbolt consumers.
970	 * It is off by default.
971	 */
972	lbolt_hdlr.cyh_func = (cyc_func_t)lbolt_cyclic;
973	lbolt_hdlr.cyh_level = CY_LOCK_LEVEL;
974	lbolt_hdlr.cyh_arg = NULL;
975
976	lbolt_when.cyt_interval = nsec_per_tick;
977
978	/*
979	 * Allocate cache line aligned space for the per CPU lbolt data and
980	 * lbolt info structures, and initialize them with their default
981	 * values. Note that these structures are also cache line sized.
982	 */
983	sz = sizeof (lbolt_info_t) + CPU_CACHE_COHERENCE_SIZE;
984	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
985	lb_info = (lbolt_info_t *)P2ROUNDUP(buf, CPU_CACHE_COHERENCE_SIZE);
986
987	if (hz != HZ_DEFAULT)
988		lb_info->lbi_thresh_interval = LBOLT_THRESH_INTERVAL *
989		    hz/HZ_DEFAULT;
990	else
991		lb_info->lbi_thresh_interval = LBOLT_THRESH_INTERVAL;
992
993	lb_info->lbi_thresh_calls = LBOLT_THRESH_CALLS;
994
995	sz = (sizeof (lbolt_cpu_t) * max_ncpus) + CPU_CACHE_COHERENCE_SIZE;
996	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
997	lb_cpu = (lbolt_cpu_t *)P2ROUNDUP(buf, CPU_CACHE_COHERENCE_SIZE);
998
999	for (i = 0; i < max_ncpus; i++)
1000		lb_cpu[i].lbc_counter = lb_info->lbi_thresh_calls;
1001
1002	/*
1003	 * Install the softint used to switch between event and cyclic driven
1004	 * lbolt. We use a soft interrupt to make sure the context of the
1005	 * cyclic reprogram call is safe.
1006	 */
1007	lbolt_softint_add();
1008
1009	/*
1010	 * Since the hybrid lbolt implementation is based on a hardware counter
	 * that is reset at every hardware reboot, and since we'd like the
	 * lbolt value to start at zero after both a hardware and a fast
1013	 * reboot, we calculate the number of clock ticks the system's been up
1014	 * and store it in the lbi_debug_time field of the lbolt info structure.
1015	 * The value of this field will be subtracted from lbolt before
1016	 * returning it.
1017	 */
1018	lb_info->lbi_internal = lb_info->lbi_debug_time =
1019	    (gethrtime()/nsec_per_tick);
1020
1021	/*
1022	 * lbolt_hybrid points at lbolt_bootstrap until now. The LBOLT_* macros
1023	 * and lbolt_debug_{enter,return} use this value as an indication that
	 * the initialization above hasn't been completed. Setting lbolt_hybrid
1025	 * to either lbolt_{cyclic,event}_driven here signals those code paths
1026	 * that the lbolt related structures can be used.
1027	 */
1028	if (lbolt_cyc_only) {
1029		lbolt_when.cyt_when = 0;
1030		lbolt_hybrid = lbolt_cyclic_driven;
1031	} else {
1032		lbolt_when.cyt_when = CY_INFINITY;
1033		lbolt_hybrid = lbolt_event_driven;
1034	}
1035
1036	/*
	 * Grab cpu_lock and install the clock and lbolt cyclics.
1038	 */
1039	mutex_enter(&cpu_lock);
1040
1041	clock_cyclic = cyclic_add(&clk_hdlr, &clk_when);
1042	lb_info->id.lbi_cyclic_id = cyclic_add(&lbolt_hdlr, &lbolt_when);
1043
1044	mutex_exit(&cpu_lock);
1045}
1046
1047/*
 * Called before calcloadavg to compute the 10-second moving load average.
1049 */
1050
1051static int
1052genloadavg(struct loadavg_s *avgs)
1053{
1054	int avg;
1055	int spos; /* starting position */
1056	int cpos; /* moving current position */
1057	int i;
1058	int slen;
1059	hrtime_t hr_avg;
1060
	/* 10-second snapshot, calculate first position */
1062	if (avgs->lg_len == 0) {
1063		return (0);
1064	}
1065	slen = avgs->lg_len < S_MOVAVG_SZ ? avgs->lg_len : S_MOVAVG_SZ;
1066
1067	spos = (avgs->lg_cur - 1) >= 0 ? avgs->lg_cur - 1 :
1068	    S_LOADAVG_SZ + (avgs->lg_cur - 1);
1069	for (i = hr_avg = 0; i < slen; i++) {
1070		cpos = (spos - i) >= 0 ? spos - i : S_LOADAVG_SZ + (spos - i);
1071		hr_avg += avgs->lg_loads[cpos];
1072	}
1073
1074	hr_avg = hr_avg / slen;
1075	avg = hr_avg / (NANOSEC / LGRP_LOADAVG_IN_THREAD_MAX);
1076
1077	return (avg);
1078}
1079
1080/*
 * Run every second from clock() to update the load average counts available to
 * the system and the CPU partitions.
 *
 * This works by sampling the accumulated usr, sys, and wait times, computing
 * the delta from the previous sample, and adding that delta to the running
 * load totals.
1087 */
1088
1089static void
1090loadavg_update()
1091{
1092	cpu_t *cp;
1093	cpupart_t *cpupart;
1094	hrtime_t cpu_total;
1095	int prev;
1096
1097	cp = cpu_list;
1098	loadavg.lg_total = 0;
1099
1100	/*
1101	 * first pass totals up per-cpu statistics for system and cpu
1102	 * partitions
1103	 */
1104
1105	do {
1106		struct loadavg_s *lavg;
1107
1108		lavg = &cp->cpu_loadavg;
1109
1110		cpu_total = cp->cpu_acct[CMS_USER] +
1111		    cp->cpu_acct[CMS_SYSTEM] + cp->cpu_waitrq;
1112		/* compute delta against last total */
1113		scalehrtime(&cpu_total);
1114		prev = (lavg->lg_cur - 1) >= 0 ? lavg->lg_cur - 1 :
1115		    S_LOADAVG_SZ + (lavg->lg_cur - 1);
1116		if (lavg->lg_loads[prev] <= 0) {
1117			lavg->lg_loads[lavg->lg_cur] = cpu_total;
1118			cpu_total = 0;
1119		} else {
1120			lavg->lg_loads[lavg->lg_cur] = cpu_total;
1121			cpu_total = cpu_total - lavg->lg_loads[prev];
1122			if (cpu_total < 0)
1123				cpu_total = 0;
1124		}
1125
1126		lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
1127		lavg->lg_len = (lavg->lg_len + 1) < S_LOADAVG_SZ ?
1128		    lavg->lg_len + 1 : S_LOADAVG_SZ;
1129
1130		loadavg.lg_total += cpu_total;
1131		cp->cpu_part->cp_loadavg.lg_total += cpu_total;
1132
1133	} while ((cp = cp->cpu_next) != cpu_list);
1134
1135	loadavg.lg_loads[loadavg.lg_cur] = loadavg.lg_total;
1136	loadavg.lg_cur = (loadavg.lg_cur + 1) % S_LOADAVG_SZ;
1137	loadavg.lg_len = (loadavg.lg_len + 1) < S_LOADAVG_SZ ?
1138	    loadavg.lg_len + 1 : S_LOADAVG_SZ;
1139	/*
1140	 * Second pass updates counts
1141	 */
1142	cpupart = cp_list_head;
1143
1144	do {
1145		struct loadavg_s *lavg;
1146
1147		lavg = &cpupart->cp_loadavg;
1148		lavg->lg_loads[lavg->lg_cur] = lavg->lg_total;
1149		lavg->lg_total = 0;
1150		lavg->lg_cur = (lavg->lg_cur + 1) % S_LOADAVG_SZ;
1151		lavg->lg_len = (lavg->lg_len + 1) < S_LOADAVG_SZ ?
1152		    lavg->lg_len + 1 : S_LOADAVG_SZ;
1153
1154	} while ((cpupart = cpupart->cp_next) != cp_list_head);
1155
1156	/*
1157	 * Third pass totals up per-zone statistics.
1158	 */
1159	zone_loadavg_update();
1160}
1161
1162/*
1163 * clock_update() - local clock update
1164 *
1165 * This routine is called by ntp_adjtime() to update the local clock
1166 * phase and frequency. The implementation is of an
1167 * adaptive-parameter, hybrid phase/frequency-lock loop (PLL/FLL). The
1168 * routine computes new time and frequency offset estimates for each
1169 * call.  The PPS signal itself determines the new time offset,
1170 * instead of the calling argument.  Presumably, calls to
1171 * ntp_adjtime() occur only when the caller believes the local clock
1172 * is valid within some bound (+-128 ms with NTP). If the caller's
1173 * time is far different than the PPS time, an argument will ensue,
1174 * and it's not clear who will lose.
1175 *
 * For uncompensated quartz crystal oscillators and nominal update
1177 * intervals less than 1024 s, operation should be in phase-lock mode
1178 * (STA_FLL = 0), where the loop is disciplined to phase. For update
1179 * intervals greater than this, operation should be in frequency-lock
1180 * mode (STA_FLL = 1), where the loop is disciplined to frequency.
1181 *
1182 * Note: mutex(&tod_lock) is in effect.
1183 */
1184void
1185clock_update(int offset)
1186{
1187	int ltemp, mtemp, s;
1188
1189	ASSERT(MUTEX_HELD(&tod_lock));
1190
1191	if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
1192		return;
1193	ltemp = offset;
1194	if ((time_status & STA_PPSTIME) && (time_status & STA_PPSSIGNAL))
1195		ltemp = pps_offset;
1196
1197	/*
1198	 * Scale the phase adjustment and clamp to the operating range.
1199	 */
1200	if (ltemp > MAXPHASE)
1201		time_offset = MAXPHASE * SCALE_UPDATE;
1202	else if (ltemp < -MAXPHASE)
1203		time_offset = -(MAXPHASE * SCALE_UPDATE);
1204	else
1205		time_offset = ltemp * SCALE_UPDATE;
1206
1207	/*
1208	 * Select whether the frequency is to be controlled and in which
1209	 * mode (PLL or FLL). Clamp to the operating range. Ugly
1210	 * multiply/divide should be replaced someday.
1211	 */
1212	if (time_status & STA_FREQHOLD || time_reftime == 0)
1213		time_reftime = hrestime.tv_sec;
1214
1215	mtemp = hrestime.tv_sec - time_reftime;
1216	time_reftime = hrestime.tv_sec;
1217
1218	if (time_status & STA_FLL) {
1219		if (mtemp >= MINSEC) {
1220			ltemp = ((time_offset / mtemp) * (SCALE_USEC /
1221			    SCALE_UPDATE));
1222			if (ltemp)
1223				time_freq += ltemp / SCALE_KH;
1224		}
1225	} else {
1226		if (mtemp < MAXSEC) {
1227			ltemp *= mtemp;
1228			if (ltemp)
1229				time_freq += (int)(((int64_t)ltemp *
1230				    SCALE_USEC) / SCALE_KF)
1231				    / (1 << (time_constant * 2));
1232		}
1233	}
1234	if (time_freq > time_tolerance)
1235		time_freq = time_tolerance;
1236	else if (time_freq < -time_tolerance)
1237		time_freq = -time_tolerance;
1238
1239	s = hr_clock_lock();
1240	tod_needsync = 1;
1241	hr_clock_unlock(s);
1242}
1243
1244/*
1245 * ddi_hardpps() - discipline CPU clock oscillator to external PPS signal
1246 *
1247 * This routine is called at each PPS interrupt in order to discipline
1248 * the CPU clock oscillator to the PPS signal. It measures the PPS phase
1249 * and leaves it in a handy spot for the clock() routine. It
1250 * integrates successive PPS phase differences and calculates the
1251 * frequency offset. This is used in clock() to discipline the CPU
1252 * clock oscillator so that intrinsic frequency error is cancelled out.
1253 * The code requires the caller to capture the time and hardware counter
1254 * value at the on-time PPS signal transition.
1255 *
1256 * Note that, on some Unix systems, this routine runs at an interrupt
1257 * priority level higher than the timer interrupt routine clock().
1258 * Therefore, the variables used are distinct from the clock()
1259 * variables, except for certain exceptions: The PPS frequency pps_freq
1260 * and phase pps_offset variables are determined by this routine and
1261 * updated atomically. The time_tolerance variable can be considered a
1262 * constant, since it is infrequently changed, and then only when the
1263 * PPS signal is disabled. The watchdog counter pps_valid is updated
1264 * once per second by clock() and is atomically cleared in this
1265 * routine.
1266 *
1267 * tvp is the time of the last tick; usec is a microsecond count since the
1268 * last tick.
1269 *
1270 * Note: In Solaris systems, the tick value is actually given by
1271 *       usec_per_tick.  This is called from the serial driver cdintr(),
1272 *	 or equivalent, at a high PIL.  Because the kernel keeps a
 *	 high-resolution time, the following code can accept either
1274 *	 the traditional argument pair, or the current highres timestamp
1275 *       in tvp and zero in usec.
1276 */
1277void
1278ddi_hardpps(struct timeval *tvp, int usec)
1279{
1280	int u_usec, v_usec, bigtick;
1281	time_t cal_sec;
1282	int cal_usec;
1283
1284	/*
1285	 * An occasional glitch can be produced when the PPS interrupt
1286	 * occurs in the clock() routine before the time variable is
1287	 * updated. Here the offset is discarded when the difference
1288	 * between it and the last one is greater than tick/2, but not
1289	 * if the interval since the first discard exceeds 30 s.
1290	 */
1291	time_status |= STA_PPSSIGNAL;
1292	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
1293	pps_valid = 0;
1294	u_usec = -tvp->tv_usec;
1295	if (u_usec < -(MICROSEC/2))
1296		u_usec += MICROSEC;
1297	v_usec = pps_offset - u_usec;
1298	if (v_usec < 0)
1299		v_usec = -v_usec;
1300	if (v_usec > (usec_per_tick >> 1)) {
1301		if (pps_glitch > MAXGLITCH) {
1302			pps_glitch = 0;
1303			pps_tf[2] = u_usec;
1304			pps_tf[1] = u_usec;
1305		} else {
1306			pps_glitch++;
1307			u_usec = pps_offset;
1308		}
1309	} else
1310		pps_glitch = 0;
1311
1312	/*
1313	 * A three-stage median filter is used to help deglitch the pps
1314	 * time. The median sample becomes the time offset estimate; the
1315	 * difference between the other two samples becomes the time
1316	 * dispersion (jitter) estimate.
1317	 */
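	/*
	 * Worked example: with samples pps_tf[0] = 7, pps_tf[1] = 3 and
	 * pps_tf[2] = 9 (newest first), the median 7 becomes pps_offset and
	 * the spread of the other two samples, 9 - 3 = 6, becomes the jitter
	 * sample v_usec fed into the averaging below.
	 */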
1318	pps_tf[2] = pps_tf[1];
1319	pps_tf[1] = pps_tf[0];
1320	pps_tf[0] = u_usec;
1321	if (pps_tf[0] > pps_tf[1]) {
1322		if (pps_tf[1] > pps_tf[2]) {
1323			pps_offset = pps_tf[1];		/* 0 1 2 */
1324			v_usec = pps_tf[0] - pps_tf[2];
1325		} else if (pps_tf[2] > pps_tf[0]) {
1326			pps_offset = pps_tf[0];		/* 2 0 1 */
1327			v_usec = pps_tf[2] - pps_tf[1];
1328		} else {
1329			pps_offset = pps_tf[2];		/* 0 2 1 */
1330			v_usec = pps_tf[0] - pps_tf[1];
1331		}
1332	} else {
1333		if (pps_tf[1] < pps_tf[2]) {
1334			pps_offset = pps_tf[1];		/* 2 1 0 */
1335			v_usec = pps_tf[2] - pps_tf[0];
1336		} else  if (pps_tf[2] < pps_tf[0]) {
1337			pps_offset = pps_tf[0];		/* 1 0 2 */
1338			v_usec = pps_tf[1] - pps_tf[2];
1339		} else {
1340			pps_offset = pps_tf[2];		/* 1 2 0 */
1341			v_usec = pps_tf[1] - pps_tf[0];
1342		}
1343	}
1344	if (v_usec > MAXTIME)
1345		pps_jitcnt++;
1346	v_usec = (v_usec << PPS_AVG) - pps_jitter;
1347	pps_jitter += v_usec / (1 << PPS_AVG);
1348	if (pps_jitter > (MAXTIME >> 1))
1349		time_status |= STA_PPSJITTER;
1350
1351	/*
1352	 * During the calibration interval adjust the starting time when
1353	 * the tick overflows. At the end of the interval compute the
1354	 * duration of the interval and the difference of the hardware
1355	 * counters at the beginning and end of the interval. This code
	 * is deliciously complicated by the fact that valid differences may
1357	 * exceed the value of tick when using long calibration
1358	 * intervals and small ticks. Note that the counter can be
1359	 * greater than tick if caught at just the wrong instant, but
1360	 * the values returned and used here are correct.
1361	 */
1362	bigtick = (int)usec_per_tick * SCALE_USEC;
1363	pps_usec -= pps_freq;
1364	if (pps_usec >= bigtick)
1365		pps_usec -= bigtick;
1366	if (pps_usec < 0)
1367		pps_usec += bigtick;
1368	pps_time.tv_sec++;
1369	pps_count++;
1370	if (pps_count < (1 << pps_shift))
1371		return;
1372	pps_count = 0;
1373	pps_calcnt++;
1374	u_usec = usec * SCALE_USEC;
1375	v_usec = pps_usec - u_usec;
1376	if (v_usec >= bigtick >> 1)
1377		v_usec -= bigtick;
1378	if (v_usec < -(bigtick >> 1))
1379		v_usec += bigtick;
1380	if (v_usec < 0)
1381		v_usec = -(-v_usec >> pps_shift);
1382	else
1383		v_usec = v_usec >> pps_shift;
1384	pps_usec = u_usec;
1385	cal_sec = tvp->tv_sec;
1386	cal_usec = tvp->tv_usec;
1387	cal_sec -= pps_time.tv_sec;
1388	cal_usec -= pps_time.tv_usec;
1389	if (cal_usec < 0) {
1390		cal_usec += MICROSEC;
1391		cal_sec--;
1392	}
1393	pps_time = *tvp;
1394
1395	/*
1396	 * Check for lost interrupts, noise, excessive jitter and
1397	 * excessive frequency error. The number of timer ticks during
1398	 * the interval may vary +-1 tick. Add to this a margin of one
1399	 * tick for the PPS signal jitter and maximum frequency
1400	 * deviation. If the limits are exceeded, the calibration
1401	 * interval is reset to the minimum and we start over.
1402	 */
1403	u_usec = (int)usec_per_tick << 1;
1404	if (!((cal_sec == -1 && cal_usec > (MICROSEC - u_usec)) ||
1405	    (cal_sec == 0 && cal_usec < u_usec)) ||
1406	    v_usec > time_tolerance || v_usec < -time_tolerance) {
1407		pps_errcnt++;
1408		pps_shift = PPS_SHIFT;
1409		pps_intcnt = 0;
1410		time_status |= STA_PPSERROR;
1411		return;
1412	}
1413
1414	/*
1415	 * A three-stage median filter is used to help deglitch the pps
1416	 * frequency. The median sample becomes the frequency offset
1417	 * estimate; the difference between the other two samples
1418	 * becomes the frequency dispersion (stability) estimate.
1419	 */
1420	pps_ff[2] = pps_ff[1];
1421	pps_ff[1] = pps_ff[0];
1422	pps_ff[0] = v_usec;
1423	if (pps_ff[0] > pps_ff[1]) {
1424		if (pps_ff[1] > pps_ff[2]) {
1425			u_usec = pps_ff[1];		/* 0 1 2 */
1426			v_usec = pps_ff[0] - pps_ff[2];
1427		} else if (pps_ff[2] > pps_ff[0]) {
1428			u_usec = pps_ff[0];		/* 2 0 1 */
1429			v_usec = pps_ff[2] - pps_ff[1];
1430		} else {
1431			u_usec = pps_ff[2];		/* 0 2 1 */
1432			v_usec = pps_ff[0] - pps_ff[1];
1433		}
1434	} else {
1435		if (pps_ff[1] < pps_ff[2]) {
1436			u_usec = pps_ff[1];		/* 2 1 0 */
1437			v_usec = pps_ff[2] - pps_ff[0];
1438		} else  if (pps_ff[2] < pps_ff[0]) {
1439			u_usec = pps_ff[0];		/* 1 0 2 */
1440			v_usec = pps_ff[1] - pps_ff[2];
1441		} else {
1442			u_usec = pps_ff[2];		/* 1 2 0 */
1443			v_usec = pps_ff[1] - pps_ff[0];
1444		}
1445	}
1446
1447	/*
1448	 * Here the frequency dispersion (stability) is updated. If it
1449	 * is less than one-fourth the maximum (MAXFREQ), the frequency
1450	 * offset is updated as well, but clamped to the tolerance. It
1451	 * will be processed later by the clock() routine.
1452	 */
1453	v_usec = (v_usec >> 1) - pps_stabil;
1454	if (v_usec < 0)
1455		pps_stabil -= -v_usec >> PPS_AVG;
1456	else
1457		pps_stabil += v_usec >> PPS_AVG;
1458	if (pps_stabil > MAXFREQ >> 2) {
1459		pps_stbcnt++;
1460		time_status |= STA_PPSWANDER;
1461		return;
1462	}
1463	if (time_status & STA_PPSFREQ) {
1464		if (u_usec < 0) {
1465			pps_freq -= -u_usec >> PPS_AVG;
1466			if (pps_freq < -time_tolerance)
1467				pps_freq = -time_tolerance;
1468			u_usec = -u_usec;
1469		} else {
1470			pps_freq += u_usec >> PPS_AVG;
1471			if (pps_freq > time_tolerance)
1472				pps_freq = time_tolerance;
1473		}
1474	}
1475
1476	/*
1477	 * Here the calibration interval is adjusted. If the maximum
1478	 * time difference is greater than tick / 4, reduce the interval
1479	 * by half. If this is not the case for four consecutive
1480	 * intervals, double the interval.
1481	 */
1482	if (u_usec << pps_shift > bigtick >> 2) {
1483		pps_intcnt = 0;
1484		if (pps_shift > PPS_SHIFT)
1485			pps_shift--;
1486	} else if (pps_intcnt >= 4) {
1487		pps_intcnt = 0;
1488		if (pps_shift < PPS_SHIFTMAX)
1489			pps_shift++;
1490	} else
1491		pps_intcnt++;
1492
1493	/*
1494	 * If recovering from kmdb, then make sure the tod chip gets resynced.
1495	 * If we took an early exit above, then we don't yet have a stable
1496	 * calibration signal to lock onto, so don't mark the tod for sync
1497	 * until we get all the way here.
1498	 */
1499	{
1500		int s = hr_clock_lock();
1501
1502		tod_needsync = 1;
1503		hr_clock_unlock(s);
1504	}
1505}
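
/*
 * Illustrative sketch of a ddi_hardpps() caller (not part of this file): a
 * serial or PPS capture driver, at high PIL, timestamps the on-time edge and
 * hands the result to the routine above.  pps_assert_intr() is hypothetical.
 *
 *	static void
 *	pps_assert_intr(void)
 *	{
 *		timestruc_t ts;
 *		struct timeval tv;
 *
 *		gethrestime(&ts);
 *		tv.tv_sec = ts.tv_sec;
 *		tv.tv_usec = ts.tv_nsec / (NANOSEC / MICROSEC);
 *		ddi_hardpps(&tv, 0);
 *	}
 *
 * Passing zero in usec corresponds to the "current highres timestamp" form
 * described in the comment above ddi_hardpps().
 */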
1506
1507/*
1508 * Handle clock tick processing for a thread.
1509 * Check for timer action, enforce CPU rlimit, do profiling etc.
1510 */
1511void
1512clock_tick(kthread_t *t, int pending)
1513{
1514	struct proc *pp;
1515	klwp_id_t    lwp;
1516	struct as *as;
1517	clock_t	ticks;
1518	int	poke = 0;		/* notify another CPU */
1519	int	user_mode;
1520	size_t	 rss;
1521	int i, total_usec, usec;
1522	rctl_qty_t secs;
1523
1524	ASSERT(pending > 0);
1525
1526	/* Must be operating on a lwp/thread */
1527	if ((lwp = ttolwp(t)) == NULL) {
1528		panic("clock_tick: no lwp");
1529		/*NOTREACHED*/
1530	}
1531
1532	for (i = 0; i < pending; i++) {
1533		CL_TICK(t);	/* Class specific tick processing */
1534		DTRACE_SCHED1(tick, kthread_t *, t);
1535	}
1536
1537	pp = ttoproc(t);
1538
1539	/* pp->p_lock makes sure that the thread does not exit */
1540	ASSERT(MUTEX_HELD(&pp->p_lock));
1541
1542	user_mode = (lwp->lwp_state == LWP_USER);
1543
1544	ticks = (pp->p_utime + pp->p_stime) % hz;
1545	/*
1546	 * Update process times. Should use high res clock and state
1547	 * changes instead of statistical sampling method. XXX
1548	 */
1549	if (user_mode) {
1550		pp->p_utime += pending;
1551	} else {
1552		pp->p_stime += pending;
1553	}
1554
1555	pp->p_ttime += pending;
1556	as = pp->p_as;
1557
1558	/*
1559	 * Update user profiling statistics. Get the pc from the
1560	 * lwp when the AST happens.
1561	 */
1562	if (pp->p_prof.pr_scale) {
1563		atomic_add_32(&lwp->lwp_oweupc, (int32_t)pending);
1564		if (user_mode) {
1565			poke = 1;
1566			aston(t);
1567		}
1568	}
1569
1570	/*
1571	 * If CPU was in user state, process lwp-virtual time
1572	 * interval timer. The value passed to itimerdecr() has to be
1573	 * in microseconds and has to be less than one second. Hence
1574	 * this loop.
1575	 */
1576	total_usec = usec_per_tick * pending;
1577	while (total_usec > 0) {
1578		usec = MIN(total_usec, (MICROSEC - 1));
1579		if (user_mode &&
1580		    timerisset(&lwp->lwp_timer[ITIMER_VIRTUAL].it_value) &&
1581		    itimerdecr(&lwp->lwp_timer[ITIMER_VIRTUAL], usec) == 0) {
1582			poke = 1;
1583			sigtoproc(pp, t, SIGVTALRM);
1584		}
1585		total_usec -= usec;
1586	}
1587
1588	/*
1589	 * If CPU was in user state, process lwp-profile
1590	 * interval timer.
1591	 */
1592	total_usec = usec_per_tick * pending;
1593	while (total_usec > 0) {
1594		usec = MIN(total_usec, (MICROSEC - 1));
1595		if (timerisset(&lwp->lwp_timer[ITIMER_PROF].it_value) &&
1596		    itimerdecr(&lwp->lwp_timer[ITIMER_PROF], usec) == 0) {
1597			poke = 1;
1598			sigtoproc(pp, t, SIGPROF);
1599		}
1600		total_usec -= usec;
1601	}
1602
1603	/*
1604	 * Enforce CPU resource controls:
1605	 *   (a) process.max-cpu-time resource control
1606	 *
	 * Perform the check only if we have accumulated more than a second.
1608	 */
1609	if ((ticks + pending) >= hz) {
1610		(void) rctl_test(rctlproc_legacy[RLIMIT_CPU], pp->p_rctls, pp,
1611		    (pp->p_utime + pp->p_stime)/hz, RCA_UNSAFE_SIGINFO);
1612	}
1613
1614	/*
1615	 *   (b) task.max-cpu-time resource control
1616	 *
1617	 * If we have accumulated enough ticks, increment the task CPU
1618	 * time usage and test for the resource limit. This minimizes the
	 * number of calls to rctl_test(). The task CPU time mutex
	 * is highly contended as many processes can be sharing a task.
1621	 */
1622	if (pp->p_ttime >= clock_tick_proc_max) {
1623		secs = task_cpu_time_incr(pp->p_task, pp->p_ttime);
1624		pp->p_ttime = 0;
1625		if (secs) {
1626			(void) rctl_test(rc_task_cpu_time, pp->p_task->tk_rctls,
1627			    pp, secs, RCA_UNSAFE_SIGINFO);
1628		}
1629	}
1630
1631	/*
1632	 * Update memory usage for the currently running process.
1633	 */
1634	rss = rm_asrss(as);
1635	PTOU(pp)->u_mem += rss;
1636	if (rss > PTOU(pp)->u_mem_max)
1637		PTOU(pp)->u_mem_max = rss;
1638
1639	/*
1640	 * Notify the CPU the thread is running on.
1641	 */
1642	if (poke && t->t_cpu != CPU)
1643		poke_cpu(t->t_cpu->cpu_id);
1644}
1645
1646void
1647profil_tick(uintptr_t upc)
1648{
1649	int ticks;
1650	proc_t *p = ttoproc(curthread);
1651	klwp_t *lwp = ttolwp(curthread);
1652	struct prof *pr = &p->p_prof;
1653
1654	do {
1655		ticks = lwp->lwp_oweupc;
1656	} while (atomic_cas_32(&lwp->lwp_oweupc, ticks, 0) != ticks);
1657
1658	mutex_enter(&p->p_pflock);
1659	if (pr->pr_scale >= 2 && upc >= pr->pr_off) {
1660		/*
1661		 * Old-style profiling
1662		 */
1663		uint16_t *slot = pr->pr_base;
1664		uint16_t old, new;
1665		if (pr->pr_scale != 2) {
1666			uintptr_t delta = upc - pr->pr_off;
1667			uintptr_t byteoff = ((delta >> 16) * pr->pr_scale) +
1668			    (((delta & 0xffff) * pr->pr_scale) >> 16);
1669			if (byteoff >= (uintptr_t)pr->pr_size) {
1670				mutex_exit(&p->p_pflock);
1671				return;
1672			}
1673			slot += byteoff / sizeof (uint16_t);
1674		}
1675		if (fuword16(slot, &old) < 0 ||
1676		    (new = old + ticks) > SHRT_MAX ||
1677		    suword16(slot, new) < 0) {
1678			pr->pr_scale = 0;
1679		}
1680	} else if (pr->pr_scale == 1) {
1681		/*
1682		 * PC Sampling
1683		 */
1684		model_t model = lwp_getdatamodel(lwp);
1685		int result;
1686#ifdef __lint
1687		model = model;
1688#endif
1689		while (ticks-- > 0) {
1690			if (pr->pr_samples == pr->pr_size) {
1691				/* buffer full, turn off sampling */
1692				pr->pr_scale = 0;
1693				break;
1694			}
1695			switch (SIZEOF_PTR(model)) {
1696			case sizeof (uint32_t):
1697				result = suword32(pr->pr_base, (uint32_t)upc);
1698				break;
1699#ifdef _LP64
1700			case sizeof (uint64_t):
1701				result = suword64(pr->pr_base, (uint64_t)upc);
1702				break;
1703#endif
1704			default:
1705				cmn_err(CE_WARN, "profil_tick: unexpected "
1706				    "data model");
1707				result = -1;
1708				break;
1709			}
1710			if (result != 0) {
1711				pr->pr_scale = 0;
1712				break;
1713			}
1714			pr->pr_base = (caddr_t)pr->pr_base + SIZEOF_PTR(model);
1715			pr->pr_samples++;
1716		}
1717	}
1718	mutex_exit(&p->p_pflock);
1719}
1720
1721static void
1722delay_wakeup(void *arg)
1723{
1724	kthread_t	*t = arg;
1725
1726	mutex_enter(&t->t_delay_lock);
1727	cv_signal(&t->t_delay_cv);
1728	mutex_exit(&t->t_delay_lock);
1729}
1730
1731/*
1732 * The delay(9F) man page indicates that it can only be called from user or
1733 * kernel context - detect and diagnose bad calls. The following macro will
1734 * produce a limited number of messages identifying bad callers.  This is done
1735 * in a macro so that caller() is meaningful. When a bad caller is identified,
1736 * switching to 'drv_usecwait(TICK_TO_USEC(ticks));' may be appropriate.
1737 */
1738#define	DELAY_CONTEXT_CHECK()	{					\
1739	uint32_t	m;						\
1740	char		*f;						\
1741	ulong_t		off;						\
1742									\
1743	m = delay_from_interrupt_msg;					\
1744	if (delay_from_interrupt_diagnose && servicing_interrupt() &&	\
1745	    !panicstr && !devinfo_freeze &&				\
1746	    atomic_cas_32(&delay_from_interrupt_msg, m ? m : 1, m-1)) {	\
1747		f = modgetsymname((uintptr_t)caller(), &off);		\
1748		cmn_err(CE_WARN, "delay(9F) called from "		\
1749		    "interrupt context: %s`%s",				\
1750		    mod_containing_pc(caller()), f ? f : "...");	\
1751	}								\
1752}
1753
1754/*
1755 * delay_common: common delay code.
1756 */
1757static void
1758delay_common(clock_t ticks)
1759{
1760	kthread_t	*t = curthread;
1761	clock_t		deadline;
1762	clock_t		timeleft;
1763	callout_id_t	id;
1764
1765	/* If timeouts aren't running all we can do is spin. */
1766	if (panicstr || devinfo_freeze) {
1767		/* Convert delay(9F) call into drv_usecwait(9F) call. */
1768		if (ticks > 0)
1769			drv_usecwait(TICK_TO_USEC(ticks));
1770		return;
1771	}
1772
1773	deadline = ddi_get_lbolt() + ticks;
1774	while ((timeleft = deadline - ddi_get_lbolt()) > 0) {
1775		mutex_enter(&t->t_delay_lock);
1776		id = timeout_default(delay_wakeup, t, timeleft);
1777		cv_wait(&t->t_delay_cv, &t->t_delay_lock);
1778		mutex_exit(&t->t_delay_lock);
1779		(void) untimeout_default(id, 0);
1780	}
1781}
1782
1783/*
1784 * Delay specified number of clock ticks.
1785 */
1786void
1787delay(clock_t ticks)
1788{
1789	DELAY_CONTEXT_CHECK();
1790
1791	delay_common(ticks);
1792}
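/*
 * Usage sketch (illustrative; the values are arbitrary): a driver thread in
 * kernel context may block for roughly 10ms with
 *
 *	delay(drv_usectohz(10000));
 *
 * whereas code that can run in interrupt context must busy-wait instead,
 * e.g. drv_usecwait(10000), which is the substitution suggested by the
 * DELAY_CONTEXT_CHECK() diagnostic above.
 */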
1793
1794/*
1795 * Delay a random number of clock ticks between 1 and ticks.
1796 */
1797void
1798delay_random(clock_t ticks)
1799{
1800	int	r;
1801
1802	DELAY_CONTEXT_CHECK();
1803
1804	(void) random_get_pseudo_bytes((void *)&r, sizeof (r));
1805	if (ticks == 0)
1806		ticks = 1;
1807	ticks = (r % ticks) + 1;
1808	delay_common(ticks);
1809}
1810
1811/*
1812 * Like delay, but interruptible by a signal.
1813 */
1814int
1815delay_sig(clock_t ticks)
1816{
1817	kthread_t	*t = curthread;
1818	clock_t		deadline;
1819	clock_t		rc;
1820
1821	/* If timeouts aren't running all we can do is spin. */
1822	if (panicstr || devinfo_freeze) {
1823		if (ticks > 0)
1824			drv_usecwait(TICK_TO_USEC(ticks));
1825		return (0);
1826	}
1827
1828	deadline = ddi_get_lbolt() + ticks;
1829	mutex_enter(&t->t_delay_lock);
1830	do {
1831		rc = cv_timedwait_sig(&t->t_delay_cv,
1832		    &t->t_delay_lock, deadline);
1833		/* loop until past deadline or signaled */
1834	} while (rc > 0);
1835	mutex_exit(&t->t_delay_lock);
1836	if (rc == 0)
1837		return (EINTR);
1838	return (0);
1839}
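/*
 * Usage sketch (illustrative): a caller wanting an interruptible pause of
 * about one second can check the return value for a pending signal,
 *
 *	if (delay_sig(drv_usectohz(1000000)) == EINTR)
 *		return (EINTR);
 *
 * where EINTR means the wait was cut short by a signal and 0 means the
 * full interval elapsed.
 */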
1840
1841
1842#define	SECONDS_PER_DAY 86400
1843
1844/*
1845 * Initialize the system time based on the TOD chip.  approx is used as
1846 * an approximation of time (e.g. from the filesystem) in the event that
1847 * the TOD chip has been cleared or is unresponsive.  An approx of -1
1848 * means the filesystem doesn't keep time.
1849 */
1850void
1851clkset(time_t approx)
1852{
1853	timestruc_t ts;
1854	int spl;
1855	int set_clock = 0;
1856
1857	mutex_enter(&tod_lock);
1858	ts = tod_get();
1859
1860	if (ts.tv_sec > 365 * SECONDS_PER_DAY) {
1861		/*
1862		 * If the TOD chip is reporting some time after 1971,
1863		 * then it probably didn't lose power or become otherwise
1864		 * cleared in the recent past; check to ensure that
1865		 * the time coming from the filesystem isn't in the future
1866		 * according to the TOD chip.
1867		 */
1868		if (approx != -1 && approx > ts.tv_sec) {
1869			cmn_err(CE_WARN, "Last shutdown is later "
1870			    "than time on time-of-day chip; check date.");
1871		}
1872	} else {
1873		/*
1874		 * If the TOD chip isn't giving correct time, set it to the
1875		 * greater of i) approx and ii) 1987. That way if approx
1876		 * is negative or is earlier than 1987, we set the clock
1877		 * back to a time when Oliver North, ALF and Dire Straits
1878		 * were all on the collective brain:  1987.
1879		 */
1880		timestruc_t tmp;
1881		time_t diagnose_date = (1987 - 1970) * 365 * SECONDS_PER_DAY;
1882		ts.tv_sec = (approx > diagnose_date ? approx : diagnose_date);
1883		ts.tv_nsec = 0;
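		/*
		 * For reference, diagnose_date is 17 * 365 * 86400 =
		 * 536,112,000 seconds past the epoch; leap days are
		 * ignored, so this actually lands a few days before
		 * January 1 1987, which is close enough for a fallback.
		 */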
1884
1885		/*
1886		 * Attempt to write the new time to the TOD chip.  Set spl high
1887		 * to avoid getting preempted between the tod_set and tod_get.
1888		 */
1889		spl = splhi();
1890		tod_set(ts);
1891		tmp = tod_get();
1892		splx(spl);
1893
1894		if (tmp.tv_sec != ts.tv_sec && tmp.tv_sec != ts.tv_sec + 1) {
1895			tod_broken = 1;
1896			dosynctodr = 0;
1897			cmn_err(CE_WARN, "Time-of-day chip unresponsive.");
1898		} else {
1899			cmn_err(CE_WARN, "Time-of-day chip had "
1900			    "incorrect date; check and reset.");
1901		}
1902		set_clock = 1;
1903	}
1904
1905	if (!boot_time) {
1906		boot_time = ts.tv_sec;
1907		set_clock = 1;
1908	}
1909
1910	if (set_clock)
1911		set_hrestime(&ts);
1912
1913	mutex_exit(&tod_lock);
1914}
1915
1916int	timechanged;	/* for testing if the system time has been reset */
1917
1918void
1919set_hrestime(timestruc_t *ts)
1920{
1921	int spl = hr_clock_lock();
1922	hrestime = *ts;
1923	membar_enter();	/* hrestime must be visible before timechanged++ */
1924	timedelta = 0;
1925	timechanged++;
1926	hr_clock_unlock(spl);
1927	callout_hrestime();
1928}
1929
1930static uint_t deadman_seconds;
1931static uint32_t deadman_panics;
1932static int deadman_enabled = 0;
1933static int deadman_panic_timers = 1;
1934
1935static void
1936deadman(void)
1937{
1938	if (panicstr) {
1939		/*
1940		 * During panic, other CPUs besides the panic
1941		 * master continue to handle cyclics and some other
1942		 * interrupts.  The code below is intended to be
1943		 * single threaded, so any CPU other than the master
1944		 * must keep out.
1945		 */
1946		if (CPU->cpu_id != panic_cpu.cpu_id)
1947			return;
1948
1949		if (!deadman_panic_timers)
1950			return; /* allow all timers to be manually disabled */
1951
1952		/*
1953		 * If we are generating a crash dump and the dump timer is set,
1954		 * decrement it and re-enter the panic code to abort it and
1955		 * advance to the next state.  The panic states and triggers
1956		 * are explained in panic.c.
1957		 */
1958		if (panic_dump) {
1959			if (dump_timeleft && (--dump_timeleft == 0)) {
1960				panic("panic dump timeout");
1961				/*NOTREACHED*/
1962			}
1963		}
1964		return;
1965	}
1966
1967	if (deadman_counter != CPU->cpu_deadman_counter) {
1968		CPU->cpu_deadman_counter = deadman_counter;
1969		CPU->cpu_deadman_countdown = deadman_seconds;
1970		return;
1971	}
1972
1973	if (--CPU->cpu_deadman_countdown > 0)
1974		return;
1975
1976	/*
1977	 * Regardless of whether or not we actually bring the system down,
1978	 * bump the deadman_panics variable.
1979	 *
1980	 * N.B. deadman_panics is incremented once for each CPU that
1981	 * passes through here.  It's expected that all the CPUs will
1982	 * detect this condition within one second of each other, so
1983	 * when deadman_enabled is off, deadman_panics will
1984	 * typically be a multiple of the total number of CPUs in
1985	 * the system.
1986	 */
1987	atomic_inc_32(&deadman_panics);
1988
1989	if (!deadman_enabled) {
1990		CPU->cpu_deadman_countdown = deadman_seconds;
1991		return;
1992	}
1993
1994	/*
1995	 * If we're here, we want to bring the system down.
1996	 */
1997	panic("deadman: timed out after %d seconds of clock "
1998	    "inactivity", deadman_seconds);
1999	/*NOTREACHED*/
2000}
2001
2002/*ARGSUSED*/
2003static void
2004deadman_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
2005{
2006	cpu->cpu_deadman_counter = 0;
2007	cpu->cpu_deadman_countdown = deadman_seconds;
2008
2009	hdlr->cyh_func = (cyc_func_t)deadman;
2010	hdlr->cyh_level = CY_HIGH_LEVEL;
2011	hdlr->cyh_arg = NULL;
2012
2013	/*
2014	 * Stagger the CPUs so that they don't all run deadman() at
2015	 * the same time.  Simplest reason to do this is to make it
2016	 * more likely that only one CPU will panic in case of a
2017	 * timeout.  This is (strictly speaking) an aesthetic, not a
2018	 * technical consideration.
2019	 */
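	/*
	 * For example (NCPU value purely illustrative), with NCPU == 256
	 * the per-CPU offset is NANOSEC / 256, about 3.9ms, so CPU 3 first
	 * fires roughly 11.7ms after time zero and every CPU then re-fires
	 * at one second intervals.
	 */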
2020	when->cyt_when = cpu->cpu_id * (NANOSEC / NCPU);
2021	when->cyt_interval = NANOSEC;
2022}
2023
2024
2025void
2026deadman_init(void)
2027{
2028	cyc_omni_handler_t hdlr;
2029
2030	if (deadman_seconds == 0)
2031		deadman_seconds = snoop_interval / MICROSEC;
2032
2033	if (snooping)
2034		deadman_enabled = 1;
2035
2036	hdlr.cyo_online = deadman_online;
2037	hdlr.cyo_offline = NULL;
2038	hdlr.cyo_arg = NULL;
2039
2040	mutex_enter(&cpu_lock);
2041	deadman_cyclic = cyclic_add_omni(&hdlr);
2042	mutex_exit(&cpu_lock);
2043}
2044
2045/*
2046 * tod_fault() is for updating tod validate mechanism state:
2047 * (1) TOD_NOFAULT: for resetting the state to 'normal';
2048 *     currently used for debugging only.
2049 * (2) The following four cases detected by tod validate mechanism:
2050 *       TOD_REVERSED: current tod value is less than previous value.
2051 *       TOD_STALLED: current tod value hasn't advanced.
2052 *       TOD_JUMPED: current tod value advanced too far from previous value.
2053 *       TOD_RATECHANGED: the ratio between average tod delta and
2054 *       average tick delta has changed.
2055 * (3) TOD_RDONLY: when the TOD clock is not writeable e.g. because it is
2056 *     a virtual TOD provided by a hypervisor.
2057 */
2058enum tod_fault_type
2059tod_fault(enum tod_fault_type ftype, int off)
2060{
2061	ASSERT(MUTEX_HELD(&tod_lock));
2062
2063	if (tod_faulted != ftype) {
2064		switch (ftype) {
2065		case TOD_NOFAULT:
2066			plat_tod_fault(TOD_NOFAULT);
2067			cmn_err(CE_NOTE, "Restarted tracking "
2068			    "Time of Day clock.");
2069			tod_faulted = ftype;
2070			break;
2071		case TOD_REVERSED:
2072		case TOD_JUMPED:
2073			if (tod_faulted == TOD_NOFAULT) {
2074				plat_tod_fault(ftype);
2075				cmn_err(CE_WARN, "Time of Day clock error: "
2076				    "reason [%s by 0x%x]. -- "
2077				    " Stopped tracking Time Of Day clock.",
2078				    tod_fault_table[ftype], off);
2079				tod_faulted = ftype;
2080			}
2081			break;
2082		case TOD_STALLED:
2083		case TOD_RATECHANGED:
2084			if (tod_faulted == TOD_NOFAULT) {
2085				plat_tod_fault(ftype);
2086				cmn_err(CE_WARN, "Time of Day clock error: "
2087				    "reason [%s]. -- "
2088				    " Stopped tracking Time Of Day clock.",
2089				    tod_fault_table[ftype]);
2090				tod_faulted = ftype;
2091			}
2092			break;
2093		case TOD_RDONLY:
2094			if (tod_faulted == TOD_NOFAULT) {
2095				plat_tod_fault(ftype);
2096				cmn_err(CE_NOTE, "!Time of Day clock is "
2097				    "Read-Only; set of Date/Time will not "
2098				    "persist across reboot.");
2099				tod_faulted = ftype;
2100			}
2101			break;
2102		default:
2103			break;
2104		}
2105	}
2106	return (tod_faulted);
2107}
2108
2109/*
2110 * Two functions that allow tod_status_flag to be manipulated by functions
2111 * external to this file.
2112 */
2113
2114void
2115tod_status_set(int tod_flag)
2116{
2117	tod_status_flag |= tod_flag;
2118}
2119
2120void
2121tod_status_clear(int tod_flag)
2122{
2123	tod_status_flag &= ~tod_flag;
2124}
2125
2126/*
2127 * Record a timestamp and the value passed to tod_set().  The next call to
2128 * tod_validate() can use these values, prev_set_tick and prev_set_tod,
2129 * when checking the timestruc_t returned by tod_get().  Ordinarily,
2130 * tod_validate() will use prev_tick and prev_tod for this task, but these
2131 * become obsolete and are re-assigned from the prev_set_* values
2132 * when the TOD is re-written.
2133 */
2134void
2135tod_set_prev(timestruc_t ts)
2136{
2137	if ((tod_validate_enable == 0) || (tod_faulted != TOD_NOFAULT) ||
2138	    tod_validate_deferred) {
2139		return;
2140	}
2141	prev_set_tick = gethrtime();
2142	/*
2143	 * A negative value will be set to zero in utc_to_tod() so we fake
2144	 * a zero here in such a case.  This would need to change if the
2145	 * behavior of utc_to_tod() changes.
2146	 */
2147	prev_set_tod = ts.tv_sec < 0 ? 0 : ts.tv_sec;
2148}
2149
2150/*
2151 * tod_validate() is used for checking values returned by tod_get().
2152 * Four error cases can be detected by this routine:
2153 *   TOD_REVERSED: current tod value is less than previous.
2154 *   TOD_STALLED: current tod value hasn't advanced.
2155 *   TOD_JUMPED: current tod value advanced too far from previous value.
2156 *   TOD_RATECHANGED: the ratio between average tod delta and
2157 *   average tick delta has changed.
2158 */
2159time_t
2160tod_validate(time_t tod)
2161{
2162	time_t diff_tod;
2163	hrtime_t diff_tick;
2164
2165	long dtick;
2166	int dtick_delta;
2167
2168	int off = 0;
2169	enum tod_fault_type tod_bad = TOD_NOFAULT;
2170
2171	static int firsttime = 1;
2172
2173	static time_t prev_tod = 0;
2174	static hrtime_t prev_tick = 0;
2175	static long dtick_avg = TOD_REF_FREQ;
2176
2177	int cpr_resume_done = 0;
2178	int dr_resume_done = 0;
2179
2180	hrtime_t tick = gethrtime();
2181
2182	ASSERT(MUTEX_HELD(&tod_lock));
2183
2184	/*
2185	 * tod_validate_enable is patchable via /etc/system.
2186	 * If TOD is already faulted, or if TOD validation is deferred,
2187	 * there is nothing to do.
2188	 */
2189	if ((tod_validate_enable == 0) || (tod_faulted != TOD_NOFAULT) ||
2190	    tod_validate_deferred) {
2191		return (tod);
2192	}
2193
2194	/*
2195	 * If this is the first time through, we just need to save the tod
2196	 * we were called with and hrtime so we can use them next time to
2197	 * validate tod_get().
2198	 */
2199	if (firsttime) {
2200		firsttime = 0;
2201		prev_tod = tod;
2202		prev_tick = tick;
2203		return (tod);
2204	}
2205
2206	/*
2207	 * Handle any flags that have been turned on by tod_status_set().
2208	 * In the case where a tod_set() is done and then a subsequent
2209	 * tod_get() fails (i.e., both TOD_SET_DONE and TOD_GET_FAILED are
2210	 * set), we give TOD_GET_FAILED precedence by switching off that
2211	 * flag, returning tod and leaving TOD_SET_DONE asserted until
2212	 * tod_get() completes successfully.
2213	 */
2214	if (tod_status_flag & TOD_GET_FAILED) {
2215		/*
2216		 * tod_get() has encountered an issue, possibly transitory,
2217		 * when reading TOD.  We'll just return the incoming tod
2218		 * value (which is actually hrestime.tv_sec in this case)
2219		 * and when we get a genuine tod, following a successful
2220		 * tod_get(), we can validate using prev_tod and prev_tick.
2221		 */
2222		tod_status_flag &= ~TOD_GET_FAILED;
2223		return (tod);
2224	} else if (tod_status_flag & TOD_SET_DONE) {
2225		/*
2226		 * TOD has been modified.  Just before the TOD was written,
2227		 * tod_set_prev() saved tod and hrtime; we can now use
2228		 * those values, prev_set_tod and prev_set_tick, to validate
2229		 * the incoming tod that's just been read.
2230		 */
2231		prev_tod = prev_set_tod;
2232		prev_tick = prev_set_tick;
2233		dtick_avg = TOD_REF_FREQ;
2234		tod_status_flag &= ~TOD_SET_DONE;
2235		/*
2236		 * If a tod_set() preceded a cpr_suspend() without an
2237		 * intervening tod_validate(), we need to ensure that a
2238		 * TOD_JUMPED condition is ignored.
2239		 * Note this isn't a concern in the case of DR as we've
2240		 * just reassigned dtick_avg, above.
2241		 */
2242		if (tod_status_flag & TOD_CPR_RESUME_DONE) {
2243			cpr_resume_done = 1;
2244			tod_status_flag &= ~TOD_CPR_RESUME_DONE;
2245		}
2246	} else if (tod_status_flag & TOD_CPR_RESUME_DONE) {
2247		/*
2248		 * The system's coming back from a checkpoint resume.
2249		 */
2250		cpr_resume_done = 1;
2251		tod_status_flag &= ~TOD_CPR_RESUME_DONE;
2252		/*
2253		 * We need to handle the possibility of a CPR suspend
2254		 * operation having been initiated whilst a DR event was
2255		 * in-flight.
2256		 */
2257		if (tod_status_flag & TOD_DR_RESUME_DONE) {
2258			dr_resume_done = 1;
2259			tod_status_flag &= ~TOD_DR_RESUME_DONE;
2260		}
2261	} else if (tod_status_flag & TOD_DR_RESUME_DONE) {
2262		/*
2263		 * A Dynamic Reconfiguration event has taken place.
2264		 */
2265		dr_resume_done = 1;
2266		tod_status_flag &= ~TOD_DR_RESUME_DONE;
2267	}
2268
2269	/* test hook */
2270	switch (tod_unit_test) {
2271	case 1: /* for testing jumping tod */
2272		tod += tod_test_injector;
2273		tod_unit_test = 0;
2274		break;
2275	case 2:	/* for testing stuck tod bit */
2276		tod |= 1 << tod_test_injector;
2277		tod_unit_test = 0;
2278		break;
2279	case 3:	/* for testing stalled tod */
2280		tod = prev_tod;
2281		tod_unit_test = 0;
2282		break;
2283	case 4:	/* reset tod fault status */
2284		(void) tod_fault(TOD_NOFAULT, 0);
2285		tod_unit_test = 0;
2286		break;
2287	default:
2288		break;
2289	}
2290
2291	diff_tod = tod - prev_tod;
2292	diff_tick = tick - prev_tick;
2293
2294	ASSERT(diff_tick >= 0);
2295
2296	if (diff_tod < 0) {
2297		/* ERROR - tod reversed */
2298		tod_bad = TOD_REVERSED;
2299		off = (int)(prev_tod - tod);
2300	} else if (diff_tod == 0) {
2301		/* tod did not advance */
2302		if (diff_tick > TOD_STALL_THRESHOLD) {
2303			/* ERROR - tod stalled */
2304			tod_bad = TOD_STALLED;
2305		} else {
2306			/*
2307			 * Make sure we don't update prev_tick
2308			 * so that diff_tick is calculated since
2309			 * the first diff_tod == 0
2310			 */
2311			return (tod);
2312		}
2313	} else {
2314		/* calculate dtick */
2315		dtick = diff_tick / diff_tod;
2316
2317		/* update dtick averages */
2318		dtick_avg += ((dtick - dtick_avg) / TOD_FILTER_N);
2319
2320		/*
2321		 * Calculate dtick_delta as
2322		 * variation from reference freq in quartiles
2323		 */
2324		dtick_delta = (dtick_avg - TOD_REF_FREQ) /
2325		    (TOD_REF_FREQ >> 2);
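		/*
		 * For example, assuming TOD_REF_FREQ is the nominal NANOSEC
		 * of hrtime per TOD second, a dtick_avg that has drifted to
		 * about 1.3 * NANOSEC gives
		 * dtick_delta = (0.3 * NANOSEC) / (0.25 * NANOSEC) = 1,
		 * i.e. at least a quarter of the reference frequency off,
		 * which is reported below as TOD_RATECHANGED.
		 */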
2326
2327		/*
2328		 * Even with a perfectly functioning TOD device,
2329		 * when the number of elapsed seconds is low the
2330		 * algorithm can calculate a rate that is beyond
2331		 * tolerance, causing an error.  The algorithm is
2332		 * inaccurate when elapsed time is low (less than
2333		 * 5 seconds).
2334		 */
2335		if (diff_tod > 4) {
2336			if (dtick < TOD_JUMP_THRESHOLD) {
2337				/*
2338				 * If we've just done a CPR resume, we detect
2339				 * a jump in the TOD but, actually, what's
2340				 * happened is that the TOD has been increasing
2341				 * whilst the system was suspended and the tick
2342				 * count hasn't kept up.  We consider the first
2343				 * occurrence of this after a resume as normal
2344				 * and ignore it; otherwise, in a non-resume
2345				 * case, we regard it as a TOD problem.
2346				 */
2347				if (!cpr_resume_done) {
2348					/* ERROR - tod jumped */
2349					tod_bad = TOD_JUMPED;
2350					off = (int)diff_tod;
2351				}
2352			}
2353			if (dtick_delta) {
2354				/*
2355				 * If we've just done a DR resume, dtick_avg
2356				 * can go a bit askew so we reset it and carry
2357				 * on; otherwise, the TOD is in error.
2358				 */
2359				if (dr_resume_done) {
2360					dtick_avg = TOD_REF_FREQ;
2361				} else {
2362					/* ERROR - change in clock rate */
2363					tod_bad = TOD_RATECHANGED;
2364				}
2365			}
2366		}
2367	}
2368
2369	if (tod_bad != TOD_NOFAULT) {
2370		(void) tod_fault(tod_bad, off);
2371
2372		/*
2373		 * Disable dosynctodr since we are going to fault
2374		 * the TOD chip anyway here
2375		 */
2376		dosynctodr = 0;
2377
2378		/*
2379		 * Set tod to the correct value from hrestime
2380		 */
2381		tod = hrestime.tv_sec;
2382	}
2383
2384	prev_tod = tod;
2385	prev_tick = tick;
2386	return (tod);
2387}
2388
2389static void
2390calcloadavg(int nrun, uint64_t *hp_ave)
2391{
2392	static int64_t f[3] = { 135, 27, 9 };
2393	uint_t i;
2394	int64_t q, r;
2395
2396	/*
2397	 * Compute load average over the last 1, 5, and 15 minutes
2398	 * (60, 300, and 900 seconds).  The constants in f[3] are for
2399	 * exponential decay:
2400	 * (1 - exp(-1/60)) << 13 = 135,
2401	 * (1 - exp(-1/300)) << 13 = 27,
2402	 * (1 - exp(-1/900)) << 13 = 9.
2403	 */
2404
2405	/*
2406	 * a little hoop-jumping to avoid integer overflow
2407	 */
2408	for (i = 0; i < 3; i++) {
2409		q = (hp_ave[i]  >> 16) << 7;
2410		r = (hp_ave[i]  & 0xffff) << 7;
2411		hp_ave[i] += ((nrun - q) * f[i] - ((r * f[i]) >> 16)) >> 4;
2412	}
2413}
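/*
 * As a check on the constants above: with the 2^13 fixed-point scale,
 * (1 - e^(-1/60)) * 8192 is ~135.4, (1 - e^(-1/300)) * 8192 is ~27.3 and
 * (1 - e^(-1/900)) * 8192 is ~9.1, so each per-second update applies an
 * exponential moving average with weight f[i]/8192, and the q/r split is
 * only there to keep the 64-bit arithmetic from overflowing.
 */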
2414
2415/*
2416 * lbolt_hybrid() is used by ddi_get_lbolt() and ddi_get_lbolt64() to
2417 * calculate the value of lbolt according to the current mode. In the event
2418 * driven mode (the default), lbolt is calculated by dividing the current hires
2419 * time by the number of nanoseconds per clock tick. In the cyclic driven mode
2420 * an internal variable is incremented at each firing of the lbolt cyclic
2421 * and returned by lbolt_cyclic_driven().
2422 *
2423 * The system will transition from event to cyclic driven mode when the number
2424 * of calls to lbolt_event_driven() exceeds the (per CPU) threshold within a
2425 * window of time. It does so by reprogramming lbolt_cyclic from CY_INFINITY to
2426 * nsec_per_tick. The lbolt cyclic will remain ON while at least one CPU is
2427 * causing enough activity to cross the thresholds.
2428 */
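/*
 * For example, with hz = 100 (nsec_per_tick = 10,000,000), a gethrtime()
 * value of 123,456,789,012ns corresponds to 123,456,789,012 / 10,000,000 =
 * 12,345 ticks of lbolt in event driven mode; in cyclic driven mode the
 * same value is maintained by incrementing lbi_internal once per tick.
 */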
2429int64_t
2430lbolt_bootstrap(void)
2431{
2432	return (0);
2433}
2434
2435/* ARGSUSED */
2436uint_t
2437lbolt_ev_to_cyclic(caddr_t arg1, caddr_t arg2)
2438{
2439	hrtime_t ts, exp;
2440	int ret;
2441
2442	ASSERT(lbolt_hybrid != lbolt_cyclic_driven);
2443
2444	kpreempt_disable();
2445
2446	ts = gethrtime();
2447	lb_info->lbi_internal = (ts/nsec_per_tick);
2448
2449	/*
2450	 * Align the next expiration to a clock tick boundary.
2451	 */
2452	exp = ts + nsec_per_tick - 1;
2453	exp = (exp/nsec_per_tick) * nsec_per_tick;
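	/*
	 * For example, with nsec_per_tick = 10,000,000: ts = 25,300,000
	 * rounds up to exp = 30,000,000, the next tick boundary, while a
	 * ts already on a boundary is left unchanged.
	 */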
2454
2455	ret = cyclic_reprogram(lb_info->id.lbi_cyclic_id, exp);
2456	ASSERT(ret);
2457
2458	lbolt_hybrid = lbolt_cyclic_driven;
2459	lb_info->lbi_cyc_deactivate = B_FALSE;
2460	lb_info->lbi_cyc_deac_start = lb_info->lbi_internal;
2461
2462	kpreempt_enable();
2463
2464	ret = atomic_dec_32_nv(&lb_info->lbi_token);
2465	ASSERT(ret == 0);
2466
2467	return (1);
2468}
2469
2470int64_t
2471lbolt_event_driven(void)
2472{
2473	hrtime_t ts;
2474	int64_t lb;
2475	int ret, cpu = CPU->cpu_seqid;
2476
2477	ts = gethrtime();
2478	ASSERT(ts > 0);
2479
2480	ASSERT(nsec_per_tick > 0);
2481	lb = (ts/nsec_per_tick);
2482
2483	/*
2484	 * Switch to cyclic mode if the number of calls to this routine
2485	 * has reached the threshold within the interval.
2486	 */
2487	if ((lb - lb_cpu[cpu].lbc_cnt_start) < lb_info->lbi_thresh_interval) {
2488
2489		if (--lb_cpu[cpu].lbc_counter == 0) {
2490			/*
2491			 * Reached the threshold within the interval, reset
2492			 * the usage statistics.
2493			 */
2494			lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
2495			lb_cpu[cpu].lbc_cnt_start = lb;
2496
2497			/*
2498			 * Make sure only one thread reprograms the
2499			 * lbolt cyclic and changes the mode.
2500			 */
2501			if (panicstr == NULL &&
2502			    atomic_cas_32(&lb_info->lbi_token, 0, 1) == 0) {
2503
2504				if (lbolt_hybrid == lbolt_cyclic_driven) {
2505					ret = atomic_dec_32_nv(
2506					    &lb_info->lbi_token);
2507					ASSERT(ret == 0);
2508				} else {
2509					lbolt_softint_post();
2510				}
2511			}
2512		}
2513	} else {
2514		/*
2515		 * Exceeded the interval, reset the usage statistics.
2516		 */
2517		lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
2518		lb_cpu[cpu].lbc_cnt_start = lb;
2519	}
2520
2521	ASSERT(lb >= lb_info->lbi_debug_time);
2522
2523	return (lb - lb_info->lbi_debug_time);
2524}
2525
2526int64_t
2527lbolt_cyclic_driven(void)
2528{
2529	int64_t lb = lb_info->lbi_internal;
2530	int cpu;
2531
2532	/*
2533	 * If a CPU has already prevented the lbolt cyclic from deactivating
2534	 * itself, don't bother tracking the usage. Otherwise check if we're
2535	 * within the interval and how the per CPU counter is doing.
2536	 */
2537	if (lb_info->lbi_cyc_deactivate) {
2538		cpu = CPU->cpu_seqid;
2539		if ((lb - lb_cpu[cpu].lbc_cnt_start) <
2540		    lb_info->lbi_thresh_interval) {
2541
2542			if (lb_cpu[cpu].lbc_counter == 0)
2543				/*
2544				 * Reached the threshold within the interval,
2545				 * prevent the lbolt cyclic from turning itself
2546				 * off.
2547				 */
2548				lb_info->lbi_cyc_deactivate = B_FALSE;
2549			else
2550				lb_cpu[cpu].lbc_counter--;
2551		} else {
2552			/*
2553			 * Only reset the usage statistics when we have
2554			 * exceeded the interval.
2555			 */
2556			lb_cpu[cpu].lbc_counter = lb_info->lbi_thresh_calls;
2557			lb_cpu[cpu].lbc_cnt_start = lb;
2558		}
2559	}
2560
2561	ASSERT(lb >= lb_info->lbi_debug_time);
2562
2563	return (lb - lb_info->lbi_debug_time);
2564}
2565
2566/*
2567 * The lbolt_cyclic() routine will fire at a nsec_per_tick interval to satisfy
2568 * performance needs of ddi_get_lbolt() and ddi_get_lbolt64() consumers.
2569 * It is inactive by default, and will be activated when switching from event
2570 * to cyclic driven lbolt. The cyclic will turn itself off unless signaled
2571 * by lbolt_cyclic_driven().
2572 */
2573static void
2574lbolt_cyclic(void)
2575{
2576	int ret;
2577
2578	lb_info->lbi_internal++;
2579
2580	if (!lbolt_cyc_only) {
2581
2582		if (lb_info->lbi_cyc_deactivate) {
2583			/*
2584			 * Switching from cyclic to event driven mode.
2585			 */
2586			if (panicstr == NULL &&
2587			    atomic_cas_32(&lb_info->lbi_token, 0, 1) == 0) {
2588
2589				if (lbolt_hybrid == lbolt_event_driven) {
2590					ret = atomic_dec_32_nv(
2591					    &lb_info->lbi_token);
2592					ASSERT(ret == 0);
2593					return;
2594				}
2595
2596				kpreempt_disable();
2597
2598				lbolt_hybrid = lbolt_event_driven;
2599				ret = cyclic_reprogram(
2600				    lb_info->id.lbi_cyclic_id,
2601				    CY_INFINITY);
2602				ASSERT(ret);
2603
2604				kpreempt_enable();
2605
2606				ret = atomic_dec_32_nv(&lb_info->lbi_token);
2607				ASSERT(ret == 0);
2608			}
2609		}
2610
2611		/*
2612		 * The lbolt cyclic should not try to deactivate itself before
2613		 * the sampling period has elapsed.
2614		 */
2615		if (lb_info->lbi_internal - lb_info->lbi_cyc_deac_start >=
2616		    lb_info->lbi_thresh_interval) {
2617			lb_info->lbi_cyc_deactivate = B_TRUE;
2618			lb_info->lbi_cyc_deac_start = lb_info->lbi_internal;
2619		}
2620	}
2621}
2622
2623/*
2624 * Since the lbolt service was historically cyclic driven, it must be 'stopped'
2625 * when the system drops into the kernel debugger. lbolt_debug_entry() is
2626 * called by the KDI system claim callbacks to record a hires timestamp at
2627 * debug enter time. lbolt_debug_return() is called by the system release
2628 * callbacks to account for the time spent in the debugger. The value is then
2629 * accumulated in the lb_info structure and used by lbolt_event_driven() and
2630 * lbolt_cyclic_driven(), as well as the mdb_get_lbolt() routine.
2631 */
2632void
2633lbolt_debug_entry(void)
2634{
2635	if (lbolt_hybrid != lbolt_bootstrap) {
2636		ASSERT(lb_info != NULL);
2637		lb_info->lbi_debug_ts = gethrtime();
2638	}
2639}
2640
2641/*
2642 * Calculate the time spent in the debugger and add it to the lbolt info
2643 * structure. We also update the internal lbolt value in case we were in
2644 * cyclic driven mode going in.
2645 */
2646void
2647lbolt_debug_return(void)
2648{
2649	hrtime_t ts;
2650
2651	if (lbolt_hybrid != lbolt_bootstrap) {
2652		ASSERT(lb_info != NULL);
2653		ASSERT(nsec_per_tick > 0);
2654
2655		ts = gethrtime();
2656		lb_info->lbi_internal = (ts/nsec_per_tick);
2657		lb_info->lbi_debug_time +=
2658		    ((ts - lb_info->lbi_debug_ts)/nsec_per_tick);
2659
2660		lb_info->lbi_debug_ts = 0;
2661	}
2662}
2663