1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008-2009, Intel Corporation.
23  * All Rights Reserved.
24  */
25 
26 #define	MAX_TAG		8
27 #define	MAX_STACK	64
28 
29 #pragma D option aggsize=8m
30 #pragma D option bufsize=16m
31 #pragma D option dynvarsize=16m
32 #pragma D option aggrate=0
33 #pragma D option stackframes=MAX_STACK
34 /*
35  * Our D script needs to compile even if some of the TRANSLATE probes cannot
36  * be found. Missing probes can be caused by older kernel, different
37  * architecture, unloaded modules etc.
38  */
39 #pragma D option zdefs
40 
41 #if defined(ENABLE_SCHED)
42 #if defined(TRACE_PID)
43 #define TRACE_FILTER    / pid == 0 || pid == TRACE_PID /
44 #define TRACE_FILTER_COND(a)    / (pid == 0 || pid == TRACE_PID) && (a) /
45 #elif defined(TRACE_PGID)
46 #define TRACE_FILTER    / pid == 0 || curpsinfo->pr_pgid == TRACE_PGID /
47 #define TRACE_FILTER_COND(a)
48     / (pid == 0 || curpsinfo->pr_pgid == TRACE_PGID) && (a) /
49 #else
50 #define TRACE_FILTER
51 #define TRACE_FILTER_COND(a)	/ (a) /
52 #endif
53 #else	/* ENABLE_SCHED */
54 #if defined(TRACE_PID)
55 #define TRACE_FILTER    / pid == TRACE_PID /
56 #define TRACE_FILTER_COND(a)    / (pid == TRACE_PID) && (a) /
57 #elif defined(TRACE_PGID)
58 #define TRACE_FILTER    / curpsinfo->pr_pgid == TRACE_PGID /
59 #define TRACE_FILTER_COND(a)    / (curpsinfo->pr_pgid == TRACE_PGID) && (a) /
60 #else
61 #define TRACE_FILTER	/ pid != 0 /
62 #define TRACE_FILTER_COND(a)    / (pid != 0) && (a) /
63 #endif
64 #endif /* ENABLE_SCHED */
65 
66 /* Threshold to filter WAKEABLE latencies. */
67 #define FILTER_THRESHOLD	5000000
68 /* From thread.h */
69 #define T_WAKEABLE		2
70 
71 /*
72  * This array is used to store timestamp of when threads are enqueued
73  * to dispatch queue.
74  * self-> is not accessible when enqueue happens.
75  */
76 unsigned long long lt_timestamps[int, int];
77 
78 self unsigned int lt_is_block_wakeable;
79 self unsigned long long lt_sleep_start;
80 self unsigned long long lt_sleep_duration;
81 self unsigned long long lt_sch_delay;
82 self unsigned int lt_counter;		/* only used in low overhead */
83 self unsigned long long lt_timestamp;	/* only used in low overhead */
84 self unsigned int lt_stackp;
85 self unsigned int lt_prio[int];
86 self string lt_cause[int];
87 
88 this unsigned int priority;
89 this string cause;
90 
91 /*
92  * Clean up everything, otherwise we will run out of memory.
93  */
94 proc:::lwp-exit
95 {
96 	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;
97 
98 	self->lt_sleep_start = 0;
99 	self->lt_is_block_wakeable = 0;
100 	self->lt_counter = 0;
101 	self->lt_timestamp = 0;
102 
103 	/*
104 	 * Workaround: no way to clear associative array.
105 	 * We have to manually clear 0 ~ (MAX_TAG-1).
106 	 */
107 
108 	self->lt_prio[0] = 0;
109 	self->lt_prio[1] = 0;
110 	self->lt_prio[2] = 0;
111 	self->lt_prio[3] = 0;
112 	self->lt_prio[4] = 0;
113 	self->lt_prio[5] = 0;
114 	self->lt_prio[6] = 0;
115 	self->lt_prio[7] = 0;
116 
117 	self->lt_cause[0] = 0;
118 	self->lt_cause[1] = 0;
119 	self->lt_cause[2] = 0;
120 	self->lt_cause[3] = 0;
121 	self->lt_cause[4] = 0;
122 	self->lt_cause[5] = 0;
123 	self->lt_cause[6] = 0;
124 	self->lt_cause[7] = 0;
125 }
126 
127 #if !defined(ENABLE_LOW_OVERHEAD)
128 /*
129  * Log timestamp when a thread is taken off the CPU.
130  */
131 sched::resume:off-cpu
132 TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
133 {
134 	self->lt_sleep_start = timestamp;
135 	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;
136 
137 	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] =
138 	    self->lt_sleep_start;
139 }
140 
141 /*
142  * Log timestamp when a thread is put on a dispatch queue and becomes runnable.
143  */
144 sched:::enqueue
145 /lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
146 {
147 	lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] = timestamp;
148 }
149 
150 /*
151  * Calculate latency when the thread is actually on the CPU.
152  * This is necessary in order to get the right stack.
153  */
154 this unsigned long long end;
155 this unsigned long long now;
156 sched::resume:on-cpu
157 /self->lt_sleep_start != 0/
158 {
159 	this->end = lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid];
160 	this->now = timestamp;
161 	lt_timestamps[curpsinfo->pr_pid, curlwpsinfo->pr_lwpid] = 0;
162 	this->end = (this->end != 0 && this->end != self->lt_sleep_start)
163 	    ? this->end : this->now;
164 
165 	self->lt_sch_delay = this->now - this->end;
166 	self->lt_sleep_duration = this->end - self->lt_sleep_start;
167 	self->lt_sleep_start = 0;
168 }
169 
170 /*
171  * Filter: drop all "large" latency when it is interruptible, i.e., sleep()
172  * etc.
173  */
174 #if defined(ENABLE_FILTER)
175 sched::resume:on-cpu
176 /self->lt_sleep_duration > FILTER_THRESHOLD &&
177   self->lt_is_block_wakeable != 0/
178 {
179 	self->lt_sch_delay = 0;
180 	self->lt_sleep_duration = 0;
181 	self->lt_is_block_wakeable = 0;
182 }
183 #endif /* defined(ENABLE_FILTER) */
184 
185 /*
186  * Write sleep time to the aggregation.
187  * lt_sleep_duration is the duration between the time when a thread is taken
188  * off the CPU and the time when it is enqueued again.
189  */
190 sched::resume:on-cpu
191 /self->lt_sleep_duration != 0/
192 {
193 	this->cause = self->lt_stackp > 0 ?
194 	    self->lt_cause[self->lt_stackp - 1] : "";
195 	this->priority = self->lt_stackp > 0 ?
196 	    self->lt_prio[self->lt_stackp - 1] : 0;
197 
198 	@lt_call_count[pid, tid, stack(), this->cause,
199 	    this->priority] = count();
200 	@lt_call_sum[pid, tid, stack(), this->cause,
201 	    this->priority] = sum(self->lt_sleep_duration);
202 	@lt_call_max[pid, tid, stack(),  this->cause,
203 	    this->priority] = max(self->lt_sleep_duration);
204 
205 	self->lt_is_block_wakeable = 0;	/* Clear the flag to avoid leak */
206 	self->lt_sleep_duration = 0;
207 }
208 
209 /*
210  * Write time spent in queue to the aggregation.
211  * lt_sch_delay is the interval between the time when a thread becomes
212  * runnable and the time when it is actually on the CPU.
213  */
214 sched::resume:on-cpu
215 /self->lt_sch_delay != 0/
216 {
217 	@lt_named_count[pid, tid, "Wait for available CPU"] = count();
218 	@lt_named_sum[pid, tid, "Wait for available CPU"] =
219 	    sum(self->lt_sch_delay);
220 	@lt_named_max[pid, tid, "Wait for available CPU"] =
221 	    max(self->lt_sch_delay);
222 
223 	self->lt_sch_delay = 0;
224 }
225 
226 /*
227  * Probes to track latency caused by spinning on a lock.
228  */
229 lockstat:::adaptive-spin
230 TRACE_FILTER
231 {
232 	@lt_named_count[pid, tid, "Adapt. lock spin"] = count();
233 	@lt_named_sum[pid, tid, "Adapt. lock spin"] = sum(arg1);
234 	@lt_named_max[pid, tid, "Adapt. lock spin"] = max(arg1);
235 }
236 
237 lockstat:::spin-spin
238 TRACE_FILTER
239 {
240 	@lt_named_count[pid, tid, "Spinlock spin"] = count();
241 	@lt_named_sum[pid, tid, "Spinlock spin"] = sum(arg1);
242 	@lt_named_max[pid, tid, "Spinlock spin"] = max(arg1);
243 }
244 
245 /*
246  * Probes to track latency caused by blocking on a lock.
247  */
248 lockstat:::adaptive-block
249 TRACE_FILTER
250 {
251 	@lt_named_count[pid, tid, "#Adapt. lock block"] = count();
252 	@lt_named_sum[pid, tid, "#Adapt. lock block"] = sum(arg1);
253 	@lt_named_max[pid, tid, "#Adapt. lock block"] = max(arg1);
254 }
255 
256 lockstat:::rw-block
257 TRACE_FILTER
258 {
259 	@lt_named_count[pid, tid, "#RW. lock block"] = count();
260 	@lt_named_sum[pid, tid, "#RW. lock block"] = sum(arg1);
261 	@lt_named_max[pid, tid, "#RW. lock block"] = max(arg1);
262 }
263 
264 #if defined(ENABLE_SYNCOBJ)
265 /*
266  * Probes to track latency caused by synchronization objects.
267  */
268 this int stype;
269 this unsigned long long wchan;
270 this unsigned long long wtime;
271 
272 sched:::wakeup
273 /*
274  * Currently we are unable to track wakeup from sched, because all its LWP IDs
275  * are zero when we trace it and that makes lt_timestamps unusable.
276  */
277 /args[1]->pr_pid != 0 &&
278     lt_timestamps[args[1]->pr_pid, args[0]->pr_lwpid] != 0/
279 {
280 	this->stype = args[0]->pr_stype;
281 	this->wchan = args[0]->pr_wchan;
282 	/*
283 	 * We can use lt_timestamps[] here, because
284 	 * wakeup is always fired before enqueue.
285 	 * After enqueue, lt_timestamps[] will be overwritten.
286 	 */
287 	this->wtime = timestamp - lt_timestamps[args[1]->pr_pid,
288 	    args[0]->pr_lwpid];
289 
290 	@lt_sync_count[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
291 	    this->wchan] = count();
292 	@lt_sync_sum[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
293 	    this->wchan] = sum(this->wtime);
294 	@lt_sync_max[args[1]->pr_pid, args[0]->pr_lwpid, this->stype,
295 	    this->wchan] = max(this->wtime);
296 }
297 #endif /* defined(ENABLE_SYNCOBJ) */
298 
299 #else /* !defined(ENABLE_LOW_OVERHEAD) */
300 
301 /*
302  * This is the low overhead mode.
303  * In order to reduce the number of instructions executed during each
304  * off-cpu and on-cpu event, we do the following:
305  *
306  *	1. Use sampling and update aggregations only roughly 1/100 times
307  *		(SAMPLE_TIMES).
308  *	2. Do not track anything other than what is needed for "main" window.
309  *	3. Use as few thread local variables as possible.
310  */
311 
312 #define SAMPLE_TIMES		100
313 #define SAMPLE_THRESHOLD	50000000
314 
315 /*
316  * Log timestamp when a thread is off CPU.
317  */
318 sched::resume:off-cpu
319 TRACE_FILTER_COND(curlwpsinfo->pr_state == SSLEEP)
320 {
321 	self->lt_timestamp = timestamp;
322 #if defined(ENABLE_FILTER)
323 	self->lt_is_block_wakeable = curthread->t_flag & T_WAKEABLE;
324 #endif /* defined(ENABLE_FILTER) */
325 }
326 
327 /*
328  * Calculate latency when a thread is actually on the CPU.
329  */
330 this int need_skip;
331 sched::resume:on-cpu
332 /self->lt_timestamp != 0/
333 {
334 	self->lt_timestamp = timestamp - self->lt_timestamp;
335 
336 #if defined(ENABLE_FILTER)
337 	self->lt_timestamp =
338 	    (self->lt_timestamp > FILTER_THRESHOLD &&
339 	    self->lt_is_block_wakeable != 0) ? 0 : self->lt_timestamp;
340 	self->lt_is_block_wakeable = 0;
341 #endif /* defined(ENABLE_FILTER) */
342 
343 	this->need_skip = (self->lt_counter < (SAMPLE_TIMES - 1) &&
344 	    self->lt_timestamp <= SAMPLE_THRESHOLD) ? 1 : 0;
345 	self->lt_timestamp = this->need_skip ? 0 : self->lt_timestamp;
346 	self->lt_counter += this->need_skip;
347 }
348 
349 /*
350  * Track large latency first.
351  */
352 sched::resume:on-cpu
353 /self->lt_timestamp > SAMPLE_THRESHOLD/
354 {
355 	this->cause = self->lt_stackp > 0 ?
356 	    self->lt_cause[self->lt_stackp - 1] : "";
357 	this->priority = self->lt_stackp > 0 ?
358 	    self->lt_prio[self->lt_stackp - 1] : 0;
359 
360 	@lt_call_count[pid, tid, stack(), this->cause,
361 	    this->priority] = sum(1);
362 	@lt_call_sum[pid, tid, stack(), this->cause,
363 	    this->priority] = sum(self->lt_timestamp);
364 	@lt_call_max[pid, tid, stack(), this->cause,
365 	    this->priority] = max(self->lt_timestamp);
366 
367 	self->lt_timestamp = 0;
368 }
369 
370 /*
371  * If we fall back to this probe, that means the latency is small and counter
372  * has reached SAMPLE_TIMES.
373  */
374 sched::resume:on-cpu
375 /self->lt_timestamp != 0/
376 {
377 	this->cause = self->lt_stackp > 0 ?
378 	    self->lt_cause[self->lt_stackp - 1] : "";
379 	this->priority = self->lt_stackp > 0 ?
380 	    self->lt_prio[self->lt_stackp - 1] : 0;
381 
382 	/* Need +1 because lt_counter has not been updated in this cycle. */
383 	@lt_call_count[pid, tid, stack(), this->cause,
384 	    this->priority] = sum(self->lt_counter + 1);
385 	@lt_call_sum[pid, tid, stack(), this->cause,
386 	    this->priority] = sum((self->lt_counter + 1) * self->lt_timestamp);
387 	@lt_call_max[pid, tid, stack(), this->cause,
388 	    this->priority] = max(self->lt_timestamp);
389 
390 	self->lt_timestamp = 0;
391 	self->lt_counter = 0;
392 }
393 
394 #endif /* !defined(ENABLE_LOW_OVERHEAD) */
395 
396 #define	TRANSLATE(entryprobe, returnprobe, cause, priority)		\
397 entryprobe								\
398 TRACE_FILTER_COND(self->lt_stackp == 0 ||				\
399     (self->lt_stackp < MAX_TAG &&					\
400     self->lt_prio[self->lt_stackp - 1] <= priority) )			\
401 {									\
402 	self->lt_prio[self->lt_stackp] = priority;			\
403 	self->lt_cause[self->lt_stackp] = cause;			\
404 	++self->lt_stackp;						\
405 }									\
406 returnprobe								\
407 TRACE_FILTER_COND(self->lt_stackp > 0 &&				\
408     self->lt_cause[self->lt_stackp - 1] == cause)			\
409 {									\
410 	--self->lt_stackp;						\
411 	self->lt_cause[self->lt_stackp] = NULL;				\
412 }
413 
414 /*
415  * Syscalls have a priority of 10. This is to make sure that latency is
416  * traced to one of the syscalls only if nothing else matches.
417  * We put this special probe here because it uses "probefunc" variable instead
418  * of a constant string.
419  */
420 
421 TRANSLATE(syscall:::entry, syscall:::return, probefunc, 10)
422