xref: /illumos-gate/usr/src/cmd/intrd/intrd.pl (revision 9e59f930)
1#!/usr/perl5/bin/perl
2#
3# CDDL HEADER START
4#
5# The contents of this file are subject to the terms of the
6# Common Development and Distribution License (the "License").
7# You may not use this file except in compliance with the License.
8#
9# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10# or http://www.opensolaris.org/os/licensing.
11# See the License for the specific language governing permissions
12# and limitations under the License.
13#
14# When distributing Covered Code, include this CDDL HEADER in each
15# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16# If applicable, add the following below this CDDL HEADER, with the
17# fields enclosed by brackets "[]" replaced with your own identifying
18# information: Portions Copyright [yyyy] [name of copyright owner]
19#
20# CDDL HEADER END
21#
22
23#
24# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25# Use is subject to license terms.
26#
27#ident	"%Z%%M%	%I%	%E% SMI"
28#
29
30require 5.6.1;
31use strict;
32use warnings;
33use POSIX;
34use File::Basename("basename");
35
36my $cmdname = basename($0);
37
my $using_scengen = 0;		# set to 1 by -S: kstat simulator in use
my $debug = 0;			# set to 1 by -D: log at debug level

# Sampling intervals, in seconds. $sleeptime holds whichever of the three
# is currently in effect.
my $normal_sleeptime = 10;	# pause between samples
my $idle_sleeptime = 45;	# pause while the system is idle
my $onecpu_sleeptime = 15 * 60;	# pause when at most 1 CPU is on-line
my $sleeptime = $normal_sleeptime;

my $idle_intrload = 0.1;	# below 10% interrupt load counts as idle

# Largest acceptable ratio of (time spent gathering kstats) / $sleeptime;
# getstat() rejects any snapshot that exceeds it.
my $timerange_toohi = 0.01;

my $statslen = 60;		# time period (in secs) to keep in @deltas
50
51
52# Parse arguments. intrd does not accept any public arguments; the two
53# arguments below are meant for testing purposes. -D generates a significant
54# amount of syslog output. -S <filename> loads the filename as a perl
55# script. That file is expected to implement a kstat "simulator" which
56# can be used to feed information to intrd and verify intrd's responses.
57
# Walk the argument list. defined() is tested explicitly so that an argument
# which is false in Perl ("0" or the empty string) does not end option
# processing early, and a lexical is used instead of the global $_.
# Unrecognized arguments are silently ignored.
while (defined(my $arg = shift(@ARGV))) {
	if ($arg eq "-S" && @ARGV) {
		# -S <filename>: run the kstat simulator script.
		my $scenfile = shift(@ARGV);

		$using_scengen = 1;
		do $scenfile;	# load simulator

		# A simulator that fails to compile (or dies while loading)
		# leaves its error in $@; report it now rather than failing
		# later on the first call into the missing simulator.
		die("$cmdname: cannot load simulator $scenfile: $@") if $@;
	} elsif ($arg eq "-D") {
		# -D: enable debug-level logging.
		$debug = 1;
	}
}
67
# When no simulator was loaded with -S, pull in the real kstat, interrupt
# and syslog interfaces at run time and open the log. The syslog mask admits
# debug messages only when -D was given.
if (!$using_scengen) {
	require Sun::Solaris::Kstat;

	require Sun::Solaris::Intrs;
	Sun::Solaris::Intrs->import(qw(intrmove));

	require Sys::Syslog;
	Sys::Syslog->import();

	openlog($cmdname, 'pid', 'daemon');
	if ($debug > 0) {
		setlogmask(Sys::Syslog::LOG_UPTO(Sys::Syslog::LOG_DEBUG()));
	} else {
		setlogmask(Sys::Syslog::LOG_UPTO(Sys::Syslog::LOG_INFO()));
	}
}
78
79
my $asserted = 0;		# count of failed assertions so far
my $assert_level = 'debug';	# syslog level for assertion failures

#
# VERIFY(condition, format, args...) is a non-fatal assertion. When the
# condition is numerically zero, the format and its arguments are passed to
# syslog() at $assert_level and $asserted is bumped. The return value is
# true when the assertion FAILED, so callers can write
# "if (VERIFY(...)) { recover }".
#
sub VERIFY($@)
{
	my ($cond, $fmt, @fmtargs) = @_;

	my $failed = ($cond == 0);
	return ($failed) unless $failed;

	syslog($assert_level, "VERIFY: $fmt", @fmtargs);
	$asserted++;
	return ($failed);
}
92
93
94
95
96sub getstat($);
97sub generate_delta($$);
98sub compress_deltas($);
99sub dumpdelta($);
100
101sub goodness($);
102sub imbalanced($$);
103sub do_reconfig($);
104
105sub goodness_cpu($$);		# private function
106sub move_intr($$$$);		# private function
107sub ivecs_to_string(@);		# private function
108sub do_find_goal($$$$);		# private function
109sub find_goal($$);		# private function
110sub do_reconfig_cpu2cpu($$$$);	# private function
111sub do_reconfig_cpu($$$);	# private function
112
113
114#
115# What follow are the basic data structures routines of intrd.
116#
117# getstat() is responsible for reading the kstats and generating a "stat" hash.
118#
119# generate_delta() is responsible for taking two "stat" hashes and creating
120# a new "delta" hash that represents what has changed over time.
121#
122# compress_deltas() is responsible for taking a list of deltas and generating
123# a single delta hash that encompasses all the time periods described by the
124# deltas.
125
126
127#
128# getstat() is handed a reference to a kstat and generates a hash, returned
129# by reference, containing all the fields from the kstats which we need.
130# If it returns the scalar 0, it failed to gather the kstats, and the caller
131# should react accordingly.
132#
133# getstat() is also responsible for maintaining a reasonable $sleeptime.
134#
135# {"snaptime"}          kstat's snaptime
136# {<cpuid>}             one hash reference per online cpu
137#  ->{"tot"}            == cpu:<cpuid>:sys:cpu_nsec_{user + kernel + idle}
138#  ->{"crtime"}         == cpu:<cpuid>:sys:crtime
139#  ->{"ivecs"}
140#     ->{<cookie#>}     iterates over pci_intrs::<nexus>:cookie
141#        ->{"time"}     == pci_intrs:<ivec#>:<nexus>:time (in nsec)
142#        ->{"pil"}      == pci_intrs:<ivec#>:<nexus>:pil
143#        ->{"crtime"}   == pci_intrs:<ivec#>:<nexus>:crtime
144#        ->{"ino"}      == pci_intrs:<ivec#>:<nexus>:ino
145#        ->{"buspath"}  == pci_intrs:<ivec#>:<nexus>:buspath
146#        ->{"name"}     == pci_intrs:<ivec#>:<nexus>:name
147#        ->{"ihs"}      == pci_intrs:<ivec#>:<nexus>:ihs
148#
149
sub getstat($)
{
	my ($ks) = @_;

	# Build a "stat" snapshot from the kstat tree $ks. Returns a
	# reference to the snapshot, or the scalar 0 when there are fewer
	# than two on-line CPUs or the kstats were gathered over too wide
	# a span of time to be trusted.

	my %stat = ();
	my $online_cpus = 0;

	# The kernel does not generate kstats atomically, so each one
	# carries its own snaptime. $minsnap and $maxsnap track the
	# earliest and latest snaptime seen; their difference approximates
	# how long the data took to gather. -1 means "nothing seen yet"
	# (snaptime is always a positive number).

	my $minsnap = -1;
	my $maxsnap = -1;

	# Pass 1: cpu:<cpuid>::. Only CPUs whose
	# cpu_info:<cpuid>:cpu_info<cpuid>:state is "on-line" are recorded;
	# any other CPU is not accepting interrupts.

	foreach my $cpuid (keys(%{$ks->{cpu}})) {
		next unless
		    exists($ks->{cpu_info}{$cpuid}{"cpu_info$cpuid"}{state});
		next unless
		    $ks->{cpu_info}{$cpuid}{"cpu_info$cpuid"}{state} =~
		    /^on-line\0/;

		my $sys = $ks->{cpu}{$cpuid}{sys};

		$stat{$cpuid} = {
			tot	=> $sys->{cpu_nsec_idle} +
				   $sys->{cpu_nsec_user} +
				   $sys->{cpu_nsec_kernel},
			crtime	=> $sys->{crtime},
			ivecs	=> {},
		};

		my $snap = $sys->{snaptime};
		$minsnap = $snap if ($minsnap == -1 || $snap < $minsnap);
		$maxsnap = $snap if ($snap > $maxsnap);
		$online_cpus++;
	}

	# With one CPU (or none) there is nothing to balance. Slow the
	# polling right down and report failure.

	if ($online_cpus <= 1) {
		$sleeptime = $onecpu_sleeptime;
		return (0);
	}

	# Pass 2: pci_intrs. Interrupts bound to a CPU that was not
	# recorded above, and interrupts of type "disabled", are ignored.
	# Interrupts sharing the same buspath and ino are folded into one
	# entry keyed by the "cookie" "<buspath> <ino>".

	foreach my $inst (values(%{$ks->{pci_intrs}})) {
		my $intr = (values(%$inst))[0];
		my $cpuid = $intr->{cpu};

		next if !exists($stat{$cpuid});
		next if ($intr->{type} =~ /^disabled\0/);

		my $snap = $intr->{snaptime};
		if ($snap < $minsnap) {
			$minsnap = $snap;
		} elsif ($snap > $maxsnap) {
			$maxsnap = $snap;
		}

		my $ivecs = $stat{$cpuid}{ivecs};
		my $cookie = "$intr->{buspath} $intr->{ino}";

		if (exists $ivecs->{$cookie}) {
			# Another handler on an already-seen cookie:
			# accumulate its time and append its name.
			my $shared = $ivecs->{$cookie};

			$shared->{time} += $intr->{time};
			$shared->{name} .= "/$intr->{name}";

			# Keep the newest crtime of all the sharers, so
			# that generate_delta notices when a handler was
			# added since the previous getstat.
			$shared->{crtime} = $intr->{crtime}
			    if ($intr->{crtime} > $shared->{crtime});
			$shared->{ihs}++;
		} else {
			$ivecs->{$cookie} = {
				time	=> $intr->{time},
				crtime	=> $intr->{crtime},
				pil	=> $intr->{pil},
				ino	=> $intr->{ino},
				buspath	=> $intr->{buspath},
				name	=> $intr->{name},
				ihs	=> 1,
			};
		}
	}

	# The timerange is the time spent gathering the kstats divided by
	# our sleeptime. If gathering took a long time, a delta built from
	# this snapshot would cover substantially different amounts of time
	# depending on which CPU or interrupt is examined. Anything above
	# $timerange_toohi is thrown out as garbage. A snapshot within the
	# bound is treated as a single instant in time, and minsnap is
	# arbitrarily chosen as that instant.

	$stat{snaptime} = $minsnap;
	if (($maxsnap - $minsnap) / $sleeptime > $timerange_toohi) {
		return (0);	# i.e. failure
	}
	return (\%stat);
}
272
273#
274# dumpdelta takes a reference to our "delta" structure:
275# {"missing"}           "1" if the delta's component stats had inconsistencies
276# {"minsnap"}           time of the first kstat snaptime used in this delta
277# {"maxsnap"}           time of the last kstat snaptime used in this delta
278# {"goodness"}          cost function applied to this delta
279# {"avgintrload"}       avg of interrupt load across cpus, as a percentage
280# {"avgintrnsec"}       avg number of nsec spent in interrupts, per cpu
281# {<cpuid>}             iterates over on-line cpus
282#  ->{"intrs"}          cpu's movable intr time (sum of "time" for each ivec)
283#  ->{"tot"}            CPU load from all sources in nsec
284#  ->{"bigintr"}        largest value of {ivecs}{<ivec#>}{time} from below
285#  ->{"intrload"}       intrs / tot
286#  ->{"ivecs"}
287#     ->{<ivec#>}       iterates over ivecs for this cpu
288#        ->{"time"}     time used by this interrupt (in nsec)
289#        ->{"pil"}      pil level of this interrupt
290#        ->{"ino"}      interrupt number
291#        ->{"buspath"}  filename of the directory of the device's bus
292#        ->{"name"}     device name
293#        ->{"ihs"}      number of different handlers sharing this ino
294#
295# It prints out the delta structure in a nice, human readable display.
296#
297
sub dumpdelta($)
{
	my ($delta) = @_;

	# Everything is written through syslog() at 'debug' level.

	# Delta-wide summary first.

	syslog('debug', "dumpdelta:");
	if ($delta->{missing} > 0) {
		syslog('debug', " RECONFIGURATION IN DELTA");
	}
	syslog('debug', " avgintrload: %5.2f%%  avgintrnsec: %d",
	    $delta->{avgintrload} * 100, $delta->{avgintrnsec});
	if (exists($delta->{goodness})) {
		syslog('debug', "    goodness: %5.2f%%",
		    $delta->{goodness} * 100);
	}

	# Then one section per cpu. Per-cpu entries are the only values in
	# the delta which are references; everything else is skipped.

	foreach my $cpuid (keys(%$delta)) {
		my $cpu = $delta->{$cpuid};
		next unless ref($cpu);

		my $total = $cpu->{tot};
		syslog('debug', "    cpu %3d intr %7.3f%%  (bigintr %7.3f%%)",
		    $cpuid, $cpu->{intrload} * 100,
		    $cpu->{bigintr} * 100 / $total);
		syslog('debug', "        intrs %d, bigintr %d",
		    $cpu->{intrs}, $cpu->{bigintr});

		# One line per ivec on this cpu. A shared ivec has its
		# handler count appended to its name.

		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $ivec = $cpu->{ivecs}{$inum};
			my $label = $ivec->{name};

			if ($ivec->{ihs} > 1) {
				$label = "$ivec->{name}($ivec->{ihs})";
			}
			syslog('debug', "    %15s:\"%s\": %7.3f%%  %d",
			    $label, $inum,
			    $ivec->{time} * 100 / $total, $ivec->{time});
		}
	}
}
331
332#
333# generate_delta($stat, $newstat) takes two stat references, returned from
334# getstat(), and creates a %delta. %delta (not surprisingly) contains the
335# same basic info as stat and newstat, but with the timestamps as deltas
336# instead of absolute times. We return a reference to the delta.
337#
338
sub generate_delta($$)
{
	my ($stat, $newstat) = @_;

	# $stat is the older snapshot and $newstat the newer one, both as
	# returned by getstat(). The returned delta always has {missing}
	# set: 0 (false) for a usable delta, 1 when the two snapshots could
	# not be compared (time went backwards, or a CPU or interrupt
	# appeared, disappeared or was recreated).

	my %delta = ();

	# Running totals over all cpus. They start at zero so that a pair
	# of snapshots without any cpu entries does not trip an
	# uninitialized-value warning in the final averaging.
	my $intrload = 0;
	my $intrnsec = 0;
	my $cpus = 0;

	# Give the averages a defined value before any early return, since
	# dumpdelta() formats them even for a delta marked missing.
	$delta{avgintrload} = 0;
	$delta{avgintrnsec} = 0;

	# Take the worstcase timerange
	$delta{minsnap} = $stat->{snaptime};
	$delta{maxsnap} = $newstat->{snaptime};
	if (VERIFY($delta{maxsnap} > $delta{minsnap},
	    "generate_delta: stats aren't ascending")) {
		$delta{missing} = 1;
		return (\%delta);
	}

	# if there are a different number of cpus in the stats, set missing

	$delta{missing} = (keys(%$stat) != keys(%$newstat));
	if (VERIFY($delta{missing} == 0,
	    "generate_delta: number of CPUs changed")) {
		return (\%delta);
	}

	# scan through every cpu in %newstat and compare against %stat

	while (my ($cpu, $newcpst) = each %$newstat) {
		next if !ref($newcpst);		# skip non-cpuid fields

		# If %stat is missing a cpu from %newstat, then it was just
		# onlined. A changed crtime likewise means the cpu's kstat
		# was recreated. Mark missing.

		if (VERIFY(exists $stat->{$cpu} &&
		    $stat->{$cpu}{crtime} == $newcpst->{crtime},
		    "generate_delta: cpu $cpu changed")) {
			$delta{missing} = 1;
			return (\%delta);
		}
		my $cpst = $stat->{$cpu};
		$delta{$cpu}{tot} = $newcpst->{tot} - $cpst->{tot};
		if (VERIFY($delta{$cpu}{tot} >= 0,
		    "generate_delta: deltas are not ascending?")) {
			$delta{missing} = 1;
			delete($delta{$cpu});
			return (\%delta);
		}
		# Avoid remote chance of division by zero
		$delta{$cpu}{tot} = 1 if $delta{$cpu}{tot} == 0;
		$delta{$cpu}{intrs} = 0;
		$delta{$cpu}{bigintr} = 0;
		# Placeholder, replaced below once the ivecs are summed; it
		# keeps the entry printable if we bail out before then.
		$delta{$cpu}{intrload} = 0;

		my %ivecs = ();
		$delta{$cpu}{ivecs} = \%ivecs;

		# if the number of ivecs differs, set missing

		if (VERIFY(keys(%{$cpst->{ivecs}}) ==
			   keys(%{$newcpst->{ivecs}}),
			   "generate_delta: cpu $cpu has more/less".
			   " interrupts")) {
			$delta{missing} = 1;
			return (\%delta);
		}

		while (my ($inum, $newivec) = each %{$newcpst->{ivecs}}) {
			# If this ivec doesn't exist in $stat, or if $stat
			# shows a different crtime, set missing.

			if (VERIFY(exists $cpst->{ivecs}{$inum} &&
				   $cpst->{ivecs}{$inum}{crtime} ==
				   $newivec->{crtime},
				   "generate_delta: cpu $cpu inum $inum".
				   " has changed")) {
				$delta{missing} = 1;
				return (\%delta);
			}
			my $ivec = $cpst->{ivecs}{$inum};

			# Create $delta{$cpu}{ivecs}{$inum}.

			my %dltivec = ();
			$delta{$cpu}{ivecs}{$inum} = \%dltivec;

			# calculate time used by this interrupt

			my $time = $newivec->{time} - $ivec->{time};
			if (VERIFY($time >= 0,
				   "generate_delta: ivec went backwards?")) {
				$delta{missing} = 1;
				# drop the half-built entry
				delete($delta{$cpu}{ivecs}{$inum});
				return (\%delta);
			}
			$delta{$cpu}{intrs} += $time;
			$dltivec{time} = $time;
			if ($time > $delta{$cpu}{bigintr}) {
				$delta{$cpu}{bigintr} = $time;
			}

			# Transfer over basic info about the kstat. We
			# don't have to worry about discrepancies between
			# ivec and newivec because we verified that both
			# have the same crtime.

			$dltivec{pil} = $newivec->{pil};
			$dltivec{ino} = $newivec->{ino};
			$dltivec{buspath} = $newivec->{buspath};
			$dltivec{name} = $newivec->{name};
			$dltivec{ihs} = $newivec->{ihs};
		}
		if ($delta{$cpu}{tot} < $delta{$cpu}{intrs}) {
			# Ewww! Hopefully just a rounding error.
			# Make something up.
			$delta{$cpu}{tot} = $delta{$cpu}{intrs};
		}
		$delta{$cpu}{intrload} =
		       $delta{$cpu}{intrs} / $delta{$cpu}{tot};
		$intrload += $delta{$cpu}{intrload};
		$intrnsec += $delta{$cpu}{intrs};
		$cpus++;
	}

	# Averages across all cpus; they stay at zero when there were none.
	if ($cpus > 0) {
		$delta{avgintrload} = $intrload / $cpus;
		$delta{avgintrnsec} = $intrnsec / $cpus;
	}
	return (\%delta);
}
470
471
# compress_deltas() takes a list of deltas, and returns a single new delta
473# which represents the combined information from all the deltas. The deltas
474# provided are assumed to be sequential in time. The resulting compressed
475# delta looks just like any other delta. This new delta is also more accurate
476# since its statistics are averaged over a longer period than any of the
477# original deltas.
478
sub compress_deltas ($)
{
	my ($deltas) = @_;

	# $deltas is a reference to a time-ordered list of deltas. Returns
	# a reference to the combined delta, or the scalar 0 if the list is
	# empty or contains a delta marked missing. As a side effect,
	# $sleeptime is set to the idle or normal interval according to the
	# highest per-cpu interrupt load found.

	my %newdelta = ();
	my ($intrs, $tot) = (0, 0);	# totals over all cpus and deltas
	my $cpus = 0;
	my $high_intrload = 0;

	if (VERIFY($#$deltas != -1,
		   "compress_deltas: list of delta is empty?")) {
		return (0);
	}
	$newdelta{minsnap} = $deltas->[0]{minsnap};
	$newdelta{maxsnap} = $deltas->[$#$deltas]{maxsnap};
	$newdelta{missing} = 0;

	# Sum intrs, tot and per-ivec time over all the deltas. The
	# descriptive ivec fields are simply copied, so the values from the
	# last delta in the list win.

	foreach my $delta (@$deltas) {
		if (VERIFY($delta->{missing} == 0,
		    "compressing bad deltas?")) {
			return (0);
		}
		while (my ($cpuid, $cpu) = each %$delta) {
			next if !ref($cpu);	# ignore non-cpu fields

			$intrs += $cpu->{intrs};
			$tot += $cpu->{tot};
			$newdelta{$cpuid}{intrs} += $cpu->{intrs};
			$newdelta{$cpuid}{tot} += $cpu->{tot};
			if (!exists $newdelta{$cpuid}{ivecs}) {
				my %ivecs = ();
				$newdelta{$cpuid}{ivecs} = \%ivecs;
			}
			while (my ($inum, $ivec) = each %{$cpu->{ivecs}}) {
				my $newivecs = $newdelta{$cpuid}{ivecs};
				$newivecs->{$inum}{time} += $ivec->{time};
				$newivecs->{$inum}{pil} = $ivec->{pil};
				$newivecs->{$inum}{ino} = $ivec->{ino};
				$newivecs->{$inum}{buspath} = $ivec->{buspath};
				$newivecs->{$inum}{name} = $ivec->{name};
				$newivecs->{$inum}{ihs} = $ivec->{ihs};
			}
		}
	}

	# Derive bigintr and intrload for each combined cpu entry.

	foreach my $cpu (values(%newdelta)) {
		next if !ref($cpu); # ignore non-cpu fields
		$cpus++;

		my $bigintr = 0;
		foreach my $ivec (values(%{$cpu->{ivecs}})) {
			if ($ivec->{time} > $bigintr) {
				$bigintr = $ivec->{time};
			}
		}
		$cpu->{bigintr} = $bigintr;

		# Clamp tot BEFORE dividing by it, so that a zero total can
		# never cause a division by zero.
		$cpu->{tot} = 1 if $cpu->{tot} <= 0;
		$cpu->{intrload} = $cpu->{intrs} / $cpu->{tot};
		if ($high_intrload < $cpu->{intrload}) {
			$high_intrload = $cpu->{intrload};
		}
	}

	# Both averages fall back to zero rather than dividing by zero.
	$newdelta{avgintrnsec} = ($cpus > 0) ? $intrs / $cpus : 0;
	$newdelta{avgintrload} = ($tot > 0) ? $intrs / $tot : 0;

	$sleeptime = ($high_intrload < $idle_intrload) ? $idle_sleeptime :
	    $normal_sleeptime;
	return (\%newdelta);
}
551
552
553
554
555
556# What follow are the core functions responsible for examining the deltas
557# generated above and deciding what to do about them.
558#
# goodness() and its helper goodness_cpu() return a heuristic which describes
560# how good (or bad) the current interrupt balance is. The value returned will
561# be between 0 and 1, with 0 representing maximum goodness, and 1 representing
562# maximum badness.
563#
564# imbalanced() compares a current and historical value of goodness, and
565# determines if there has been enough change to warrant evaluating a
566# reconfiguration of the interrupts
567#
568# do_reconfig(), and its helpers, do_reconfig_cpu(), do_reconfig_cpu2cpu(),
569# find_goal(), do_find_goal(), and move_intr(), are responsible for examining
570# a delta and determining the best possible assignment of interrupts to CPUs.
571#
572# It is important that do_reconfig() be in alignment with goodness(). If
573# do_reconfig were to generate a new interrupt distribution that worsened
574# goodness, we could get into a pathological loop with intrd fighting itself,
575# constantly deciding that things are imbalanced, and then changing things
576# only to make them worse.
577
578
579
580# any goodness over $goodness_unsafe_load is considered really bad
581# goodness must drop by at least $goodness_mindelta for a reconfig
582
my $goodness_unsafe_load = 0.9;	# interrupt load above this is "unsafe"
my $goodness_mindelta = 0.1;	# smallest goodness change worth acting on
585
# goodness(%delta) examines a delta and returns its "goodness". goodness will
# be between 0 (best) and 1 (major bad). goodness is determined by evaluating
# the goodness of each individual cpu, and returning the worst case. This
# helps on systems with many CPUs, where a single pathological CPU
# might otherwise be ignored because the average was OK.
591#
592# To calculate the goodness of an individual CPU, we start by looking at its
593# load due to interrupts. If the load is above a certain high threshold and
594# there is more than one interrupt assigned to this CPU, we set goodness
595# to worst-case. If the load is below the average interrupt load of all CPUs,
596# then we return best-case, since what's to complain about?
597#
598# Otherwise we look at how much the load is above the average, and return
599# that as the goodness, with one caveat: we never return more than the CPU's
600# interrupt load ignoring its largest single interrupt source. This is
601# because a CPU with one high-load interrupt, and no other interrupts, is
602# perfectly balanced. Nothing can be done to improve the situation, and thus
603# it is perfectly balanced even if the interrupt's load is 100%.
604
sub goodness($)
{
	my ($delta) = @_;

	# A delta marked missing cannot be evaluated: report worst case.
	if ($delta->{missing} > 0) {
		return (1);
	}

	# The delta's goodness is that of its worst cpu. Per-cpu entries
	# are the only values in the delta which are references.

	my $worst = 0;

	foreach my $cpu (grep { ref($_) } values(%$delta)) {
		my $score = goodness_cpu($cpu, $delta->{avgintrload});

		# A score outside [0, 1] means something is broken; log
		# the whole delta and report worst case.
		if (VERIFY($score >= 0 && $score <= 1,
		    "goodness: cpu goodness out of range?")) {
			dumpdelta($delta);
			return (1);
		}

		# Nothing can be worse than 1, so stop looking.
		return (1) if ($score == 1);

		$worst = $score if ($score > $worst);
	}
	return ($worst);
}
632
#
# goodness_cpu(\%cpu, $avgintrload) scores one cpu entry of a delta against
# the average interrupt load: 0 is best, 1 is worst.
#
sub goodness_cpu($$)		# private function
{
	my ($cpu, $avgintrload) = @_;

	my $total = $cpu->{tot};
	my $load = $cpu->{intrs} / $total;

	# A cpu below the average load is perfectly good.
	if ($load < $avgintrload) {
		return (0);
	}

	# Load that would remain if the single biggest interrupt were the
	# only one left behind, i.e. the most that offloading interrupts
	# from this cpu could possibly gain.

	my $load_no_bigintr = ($cpu->{intrs} - $cpu->{bigintr}) / $total;

	# A cpu saturated with interrupt handling that has more than one
	# interrupt source is a major imbalance: the other interrupts could
	# be starved if they are of a lower pil. Return the worst possible
	# value, which effectively contaminates the entire delta.

	my $nivecs = keys(%{$cpu->{ivecs}});

	return (1) if ($nivecs > 1 && $load > $goodness_unsafe_load);

	# Otherwise the score is the excess over the average, capped at
	# what offloading could actually achieve.

	my $excess = $load - $avgintrload;

	return (($excess > $load_no_bigintr) ? $load_no_bigintr : $excess);
}
667
668
669# imbalanced() is used by the main routine to determine if the goodness
670# has shifted far enough from our last baseline to warrant a reassignment
671# of interrupts. A very high goodness indicates that a CPU is way out of
672# whack. If the goodness has varied too much since the baseline, then
673# perhaps a reconfiguration is worth considering.
674
sub imbalanced ($$)
{
	my ($goodness, $baseline) = @_;

	# Imbalanced means either outright pathological (goodness above
	# 50%), or drifted from the baseline, in either direction, by more
	# than $goodness_mindelta.

	if ($goodness > 0.5 ||
	    abs($goodness - $baseline) > $goodness_mindelta) {
		return (1);
	}
	return (0);
}
685
686# do_reconfig(), do_reconfig_cpu(), and do_reconfig_cpu2cpu(), are the
687# decision-making functions responsible for generating a new interrupt
688# distribution. They are designed with the definition of goodness() in
689# mind, i.e. they use the same definition of "good distribution" as does
690# goodness().
691#
692# do_reconfig() is responsible for deciding whether a redistribution is
693# actually warranted. If the goodness is already pretty good, it doesn't
694# waste the CPU time to generate a new distribution. If it
695# calculates a new distribution and finds that it is not sufficiently
# improved from the prior distribution, it will not do the redistribution,
697# mainly to avoid the disruption to system performance caused by
698# rejuggling interrupts.
699#
700# Its main loop works by going through a list of cpus sorted from
701# highest to lowest interrupt load. It removes the highest-load cpus
702# one at a time and hands them off to do_reconfig_cpu(). This function
703# then re-sorts the remaining CPUs from lowest to highest interrupt load,
704# and one at a time attempts to rejuggle interrupts between the original
705# high-load CPU and the low-load CPU. Rejuggling on a high-load CPU is
706# considered finished as soon as its interrupt load is within
707# $goodness_mindelta of the average interrupt load. Such a CPU will have
708# a goodness of below the $goodness_mindelta threshold.
709
710#
711# move_intr(\%delta, $inum, $oldcpu, $newcpu)
712# used by reconfiguration code to move an interrupt between cpus within
713# a delta. This manipulates data structures, and does not actually move
714# the interrupt on the running system.
715#
sub move_intr($$$$)		# private function
{
	my ($delta, $inum, $oldcpuid, $newcpuid) = @_;

	my $moved = $delta->{$oldcpuid}{ivecs}{$inum};
	my $src = $delta->{$oldcpuid};

	# Detach the ivec from the source cpu and take its time out of the
	# source's totals.

	$src->{intrs} -= $moved->{time};
	$src->{intrload} = $src->{intrs} / $src->{tot};
	delete($src->{ivecs}{$inum});

	VERIFY($src->{intrs} >= 0, "move_intr: intr's time > total time?");
	VERIFY($moved->{time} <= $src->{bigintr},
	    "move_intr: intr's time > bigintr?");

	# If the ivec just removed was the source's biggest, find the
	# biggest among those that remain (0 if none are left).

	if ($moved->{time} >= $src->{bigintr}) {
		my $biggest = 0;

		foreach my $left (values(%{$src->{ivecs}})) {
			if ($left->{time} > $biggest) {
				$biggest = $left->{time};
			}
		}
		$src->{bigintr} = $biggest;
	}

	# Attach the ivec to the target cpu and add its time to the
	# target's totals.

	my $tgt = $delta->{$newcpuid};

	$moved->{nowcpu} = $newcpuid;
	$tgt->{intrs} += $moved->{time};
	$tgt->{intrload} = $tgt->{intrs} / $tgt->{tot};
	$tgt->{ivecs}{$inum} = $moved;

	$tgt->{bigintr} = $moved->{time}
		if $moved->{time} > $tgt->{bigintr};
}
754
#
# move_intr_check(\%delta, $oldcpuid, $newcpuid)
# Sanity check for use after interrupts have been moved within a delta:
# neither cpu may be carrying more interrupt time than its total time.
# Violations are reported through VERIFY().
#
sub move_intr_check($$$)	# private function
{
	my ($delta, $oldcpuid, $newcpuid) = @_;

	VERIFY($delta->{$oldcpuid}{intrs} <= $delta->{$oldcpuid}{tot},
	    "Moved interrupts left 100+%% load on src cpu");
	VERIFY($delta->{$newcpuid}{intrs} <= $delta->{$newcpuid}{tot},
	    "Moved interrupts left 100+%% load on tgt cpu");
}
764
#
# ivecs_to_string(@ivecs) returns the {inum} of every ivec in the list, each
# preceded by a single space, as one string. An empty list gives "".
#
sub ivecs_to_string(@)		# private function
{
	return (join("", map { " $_->{inum}" } @_));
}
773
774
#
# do_reconfig(\%delta) works out a better assignment of interrupts to cpus
# within the delta and, if it is enough of an improvement, applies it with
# intrmove(). Returns 0 if nothing was moved, 1 if interrupts were moved,
# and -1 if the delta was unusable or any move failed.
#
sub do_reconfig($)
{
	my ($delta) = @_;

	my $oldgoodness = $delta->{goodness};

	# Goodness can never get better than 0. If even reaching 0 would
	# improve things by less than the minimum worth acting on, stop.

	if ($oldgoodness < $goodness_mindelta) {
		syslog('debug', "goodness good enough, don't reconfig");
		return (0);
	}

	syslog('notice', "Optimizing interrupt assignments");

	if (VERIFY($delta->{missing} == 0,
	    "RECONFIG Aborted: should not have a delta with missing")) {
		return (-1);
	}

	# Collect the cpuids, and tag every ivec with the cpu it starts
	# on, the cpu it is on now, and its own inum.

	my @cpusortlist = ();

	foreach my $cpuid (keys(%$delta)) {
		my $cpu = $delta->{$cpuid};
		next unless ref($cpu);	# skip non-cpu entries

		push(@cpusortlist, $cpuid);
		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $ivec = $cpu->{ivecs}{$inum};
			@{$ivec}{qw(origcpu nowcpu inum)} =
			    ($cpuid, $cpuid, $inum);
		}
	}

	# Repeatedly take the most heavily loaded remaining cpu and try to
	# shed its interrupts. Once that cpu is neither above the unsafe
	# load nor more than the threshold above average, no cpu left is
	# worth reconfiguring. A cpu whose own goodness is already under
	# the threshold is simply passed over.

	while (@cpusortlist) {
		# do_reconfig_cpu can move interrupts around, so the
		# ordering has to be recomputed on every pass.

		@cpusortlist =
		    sort({$delta->{$b}{intrload} <=> $delta->{$a}{intrload}}
		    @cpusortlist);

		my $cpuid = shift(@cpusortlist);
		my $load = $delta->{$cpuid}{intrload};
		my $avgload = $delta->{avgintrload};

		if ($load <= $goodness_unsafe_load &&
		    $load <= $avgload + $goodness_mindelta) {
			syslog('debug', "finished reconfig: cpu $cpuid load ".
			    "$load avgload $avgload");
			last;
		}
		next if (goodness_cpu($delta->{$cpuid}, $avgload) <
		    $goodness_mindelta);

		do_reconfig_cpu($delta, \@cpusortlist, $cpuid);
	}

	# Judge the result. Unless we started out pathological and are no
	# longer so (in which case any help is welcome), an improvement
	# below the threshold is not worth the disruption of moving
	# interrupts.

	my $newgoodness = goodness($delta);
	VERIFY($newgoodness <= $oldgoodness,
	    "reconfig: result has worse goodness?");

	my $rescued = ($oldgoodness == 1 && $newgoodness != 1);

	if (!$rescued && $oldgoodness - $newgoodness < $goodness_mindelta) {
		syslog('debug', "goodness already near optimum, ".
		    "don't reconfig");
		return (0);
	}
	syslog('debug', "goodness %5.2f%% --> %5.2f%%", $oldgoodness * 100,
	    $newgoodness * 100);

	# Apply the plan: every ivec that ended up on a cpu other than the
	# one it started on gets moved. The first failure is reported once
	# at warning level; every failure is detailed at debug level.

	my $ret = 1;
	my $warned = 0;

	foreach my $cpuid (keys(%$delta)) {
		next if $cpuid =~ /\D/;

		my $cpu = $delta->{$cpuid};
		foreach my $inum (keys(%{$cpu->{ivecs}})) {
			my $ivec = $cpu->{ivecs}{$inum};

			next if ($ivec->{origcpu} == $cpuid);
			next if intrmove($ivec->{buspath}, $ivec->{ino},
			    $cpuid);

			if ($warned++ == 0) {
				syslog('warning', "Unable to move interrupts");
			}
			syslog('debug', "Unable to move buspath ".
			    "$ivec->{buspath} ino $ivec->{ino} to ".
			    "cpu $cpuid");
			$ret = -1;
		}
	}

	syslog('notice', "Interrupt assignments optimized");
	return ($ret);
}
885
#
# do_reconfig_cpu(\%delta, \@cpusortlist, $oldcpuid)
# Tries to lighten the interrupt load on $oldcpuid by rejuggling interrupts
# between it and the cpus on @cpusortlist, one target cpu at a time.
#
sub do_reconfig_cpu($$$)	# private function
{
	my ($delta, $cpusortlist, $oldcpuid) = @_;

	syslog('debug', "reconfiguring $oldcpuid");

	my $src = $delta->{$oldcpuid};
	my $avgintrload = $delta->{avgintrload};

	# Work on a private, reversed copy of the caller's list. The
	# caller sorts its list from highest to lowest interrupt load, so
	# the copy runs from lowest to highest.

	my @targets = reverse(@$cpusortlist);

	foreach my $tgtcpuid (@targets) {
		# Done as soon as the source cpu's goodness drops below
		# the threshold.
		last if (goodness_cpu($src, $avgintrload) <
		    $goodness_mindelta);

		# Never pair the source with a cpu that is carrying a
		# higher load than the source itself.
		my $srcload = $src->{intrload};
		last if ($delta->{$tgtcpuid}{intrload} > $srcload);

		do_reconfig_cpu2cpu($delta, $oldcpuid, $tgtcpuid, $srcload);
	}
}
916
sub do_reconfig_cpu2cpu($$$$)	# private function
{
	my ($delta, $srccpuid, $tgtcpuid, $srcload) = @_;

	# Consider juggling interrupts between srccpuid (which has a high
	# interrupt load) and tgtcpuid (which has a lower one). $srcload is
	# the load of srccpuid before any juggling; it is only used for the
	# sanity check at the very end.

	syslog('debug', "exchanging intrs between $srccpuid and $tgtcpuid");

	# Pool the ivecs of both CPUs into one list. Nothing to do if there
	# are none; otherwise order them from highest to lowest {time}.

	my @ivecs;
	push(@ivecs, values(%{$delta->{$srccpuid}{ivecs}}));
	push(@ivecs, values(%{$delta->{$tgtcpuid}{ivecs}}));
	return unless @ivecs;

	@ivecs = sort({$b->{time} <=> $a->{time}} @ivecs);

	# The "goal" load for srccpuid starts out as the average load across
	# all CPUs. find_goal() will determine the selection of the
	# available interrupts which comes closest to this goal without
	# falling below it.
	#
	# tgtcpuid is less loaded than srccpuid, but it may still be above
	# avgintrnsec. Raise the goal to the mean of the two CPUs if needed,
	# so that srccpuid is never brought below the load on tgtcpuid.

	my $goal = $delta->{avgintrnsec};
	my $pairavg =
	    ($delta->{$srccpuid}{intrs} + $delta->{$tgtcpuid}{intrs}) / 2;
	$goal = $pairavg if $goal < $pairavg;

	# When the largest interrupt already lives on srccpuid, leave it
	# where it is (this can help minimize the disruption caused by
	# moving interrupts) and only search for the remainder of the goal.

	if ($ivecs[0]->{origcpu} == $srccpuid) {
		syslog('debug', "Keeping $ivecs[0]->{inum} on $srccpuid");
		$goal -= $ivecs[0]->{time};
		shift(@ivecs);
	}

	syslog('debug', "GOAL: inums should total $goal");
	find_goal(\@ivecs, $goal);

	# find_goal() reports its answer through $ivec->{goal}: set means
	# the ivec belongs on srccpuid, clear means tgtcpuid. Use
	# move_intr() to bring $delta in line with that answer.

	foreach my $ivec (@ivecs) {
		syslog('debug', "ivec $ivec->{inum} goal $ivec->{goal}");

		my $nowcpu = $ivec->{nowcpu};
		VERIFY($nowcpu == $srccpuid || $nowcpu == $tgtcpuid,
		    "cpu2cpu found an ".
		    "interrupt not currently on src or tgt cpu");

		my $wanted = $ivec->{goal};
		if ($wanted && $nowcpu != $srccpuid) {
			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
			    $srccpuid);
		} elsif ($wanted == 0 && $nowcpu != $tgtcpuid) {
			move_intr($delta, $ivec->{inum}, $ivec->{nowcpu},
			    $tgtcpuid);
		}
	}
	move_intr_check($delta, $srccpuid, $tgtcpuid); # asserts

	# Sanity check: srccpuid must not have become more loaded than it
	# was, and must still be above the average load.

	my $newload = $delta->{$srccpuid}{intrs} / $delta->{$srccpuid}{tot};
	my $in_range =
	    ($newload <= $srcload && $newload > $delta->{avgintrload});
	VERIFY($in_range,
	    "cpu2cpu: new load didn't end up in expected range");
}
989
990
991# find_goal() and its helper do_find_goal() are used to find the best
992# combination of interrupts in order to generate a load that is as close
993# as possible to a goal load without falling below that goal. Before returning
994# to its caller, find_goal() sets a new value in the hash of each interrupt,
995# {goal}, which if set signifies that this interrupt is one of the interrupts
996# identified as part of the set of interrupts which best meet the goal.
997#
998# The arguments to find_goal are a list of ivecs (hash references), sorted
999# by descending {time}, and the goal load. The goal is relative to {time}.
1000# The best fit is determined by performing a depth-first search. do_find_goal
1001# is the recursive subroutine which carries out the search.
1002#
1003# It is passed an index as an argument, originally 0. On a given invocation,
1004# it is only to consider interrupts in the ivecs array starting at that index.
1005# It then considers two possibilities:
1006#   1) What is the best goal-fit if I include ivecs[index]?
1007#   2) What is the best goal-fit if I exclude ivecs[index]?
1008# To determine case 1, it subtracts the load of ivecs[index] from the goal,
1009# and calls itself recursively with that new goal and index++.
1010# To determine case 2, it calls itself recursively with the same goal and
1011# index++.
1012#
# It then compares the two results, decides which one best meets the goals,
1014# and returns the result. The return value is the best-fit's interrupt load,
1015# followed by a list of all the interrupts which make up that best-fit.
1016#
1017# As an optimization, a second array loads[] is created which mirrors ivecs[].
1018# loads[i] will equal the total loads of all ivecs[i..$#ivecs]. This is used
1019# by do_find_goal to avoid recursing all the way to the end of the ivecs
1020# array if including all remaining interrupts will still leave the best-fit
# at or below goal load. If so, it then includes all remaining interrupts on
1022# the goal list and returns.
1023#
sub find_goal($$)		# private function
{
	my ($ivecs, $goal) = @_;

	# $ivecs is a reference to a list of ivec hashes and $goal is the
	# load, expressed in the same units as {time}, that the chosen
	# ivecs should reach. On return every ivec has {goal} set to 1 if
	# it was chosen and to 0 otherwise.

	my @goals = ();		# for a goal <= 0 the empty set fits best

	unless ($goal <= 0) {
		syslog('debug', "finding goal from intrs %s",
		    ivecs_to_string(@$ivecs));

		# Build @loads so that $loads[i] is the combined {time} of
		# ivecs i through the end of the list: start from the grand
		# total and peel off one ivec at a time.

		my $remaining = 0;
		$remaining += $_->{time} foreach (@$ivecs);

		my @loads;
		foreach my $entry (@$ivecs) {
			push(@loads, $remaining);
			$remaining -= $entry->{time};
		}

		my $load;
		($load, @goals) = do_find_goal($ivecs, \@loads, $goal, 0);
		VERIFY($load >= $goal, "find_goal didn't meet goals");
	}
	syslog('debug', "goals found: %s", ivecs_to_string(@goals));

	# Walk @$ivecs and @goals side by side. Each ivec is compared only
	# with the first goal not yet matched: a match marks the ivec for
	# the source cpu and advances the cursor; anything else is marked
	# for the target cpu.

	my $cursor = 0;
	foreach my $ivec (@$ivecs) {
		if ($cursor <= $#goals && $ivec == $goals[$cursor]) {
			syslog('debug', "inum $ivec->{inum} on source cpu");
			$ivec->{goal} = 1;
			$cursor++;
		} else {
			syslog('debug', "inum $ivec->{inum} on target cpu");
			$ivec->{goal} = 0;
		}
	}
}
1067
1068
sub do_find_goal($$$$)		# private function
{
	my ($ivecs, $loads, $goal, $idx) = @_;

	# Recursive worker for find_goal(). Only $ivecs->[$idx] and later
	# entries are considered. $loads->[$idx] is read as the combined
	# load of all of those entries. Returns a list: the load of the
	# best fit, followed by the ivecs which make up that fit.

	my $last = $#{$ivecs};

	# Ran off the end of the list: an empty fit with no load.
	return (0) if $idx > $last;

	syslog('debug', "$idx: finding goal $goal inum $ivecs->[$idx]{inum}");

	# If taking everything that is left does not exceed the goal, there
	# is nothing better to be found: return $idx and all subsequent
	# ivecs without searching any further.

	if ($loads->[$idx] <= $goal) {
		my @rest = @$ivecs[$idx .. $last];
		syslog('debug',
		    "$idx: including all remaining intrs %s with load %d",
		    ivecs_to_string(@rest),
		    $loads->[$idx]);
		return ($loads->[$idx], @rest);
	}

	# The "with" option: the best fit which includes $ivecs->[$idx].
	# If this ivec reaches the goal all by itself, stop here; adding
	# further interrupts would only take us further from the goal.
	# Otherwise search the rest of the list for what is still missing.

	my $thisload = $ivecs->[$idx]{time};
	my ($with, @goals_with);
	if ($goal <= $thisload) {
		$with = $thisload;
	} else {
		($with, @goals_with) =
		    do_find_goal($ivecs, $loads, $goal - $thisload, $idx + 1);
		$with += $thisload;
	}
	syslog('debug', "$idx: with-load $with intrs %s",
	       ivecs_to_string($ivecs->[$idx], @goals_with));

	# The "without" option: the best fit which excludes $ivecs->[$idx].
	# NOTE(review): this call uses the &name(...) form while the one
	# above does not; the & form bypasses prototype checking. Left
	# as found.

	my ($without, @goals_without) =
	    &do_find_goal($ivecs, $loads, $goal, $idx + 1);
	syslog('debug', "$idx: without-load $without intrs %s",
	       ivecs_to_string(@goals_without));

	# Choose between the two options. If only one of them reaches the
	# goal, that one wins. If both reach it, the smaller wins. If
	# neither reaches it, the greater wins. On a tie we keep "with".

	my $take_without;
	if ($with >= $goal) {
		$take_without = ($without >= $goal) ? ($without < $with) : 0;
	} else {
		$take_without = ($without >= $goal) ? 1 : ($without > $with);
	}

	# Hand back the winning load followed by the ivecs composing it.

	if ($take_without) {
		syslog('debug', "$idx: going without");
		return ($without, @goals_without);
	}

	syslog('debug', "$idx: going with");
	return ($with, $ivecs->[$idx], @goals_with);
}
1148
1149
1150
1151
syslog('debug', "intrd is starting".($debug ? " (debug)" : ""));

# State carried from one pass of the main loop to the next.
my @deltas = ();
my ($deltas_tottime, $baseline_goodness) = (0, 0);
					# $deltas_tottime is the sum of
					# maxsnap-minsnap across @deltas
my ($avggoodness, $compdelta, $do_reconfig);

# Scratch variables used inside the main loop.
my ($goodness, $deltatime, $olddelta, $olddeltatime, $delta);
my ($newstat, $below_statslen, $newtime, $ret);


# INT, HUP and TERM only raise a flag, so that we don't die in the middle
# of retargeting; the flag is checked at the points where exiting is safe.
my $gotsig = 0;
foreach my $signame (qw(INT HUP TERM)) {
	$SIG{$signame} = sub { $gotsig = 1; };
}

# Get the kstats, either from the real kstat interface or, when the
# scenario simulator is in use, from the simulator's myks_update().
my $ks = ($using_scengen == 0) ?
    Sun::Solaris::Kstat->new() : myks_update();

# Without any pci_intrs kstats there is nothing for us to do. We would
# like to exit, but then SMF would restart us and/or report an error to
# the administrator, who cannot do anything about it. So leave a message
# for the SMF logs and silently pause until a signal tells us to go away.

unless (exists($ks->{pci_intrs})) {
	print STDERR "$cmdname: no interrupts were found; ".
	    "your PCI bus may not yet be supported\n";
	while ($gotsig == 0) {
		pause();
	}
	exit 0;
}

my $stat = getstat($ks);
1199
1200
1201
# Throw away everything we have accumulated and start from scratch.
sub clear_deltas {
	@deltas = ();
	$deltas_tottime = 0;
	$stat = 0;   # prevent next gen_delta() from setting {missing}
}

while (1) {
	# 1. Sleep, refresh the kstats, and take a new sample in $newstat.
	# With the simulator there is no sleeping; it hands us the next
	# set of kstats directly.

	exit 0 if $gotsig;		# if we got ^C / SIGTERM, exit
	if ($using_scengen != 0) {
		$ks = myks_update();
	} else {
		sleep($sleeptime);
		exit 0 if $gotsig;	# if we got ^C / SIGTERM, exit
		$ks->update();
	}
	$newstat = getstat($ks);

	# Either of $stat and $newstat may be zero rather than a reference,
	# because it is uninitialized or because getstat() failed. Without
	# a usable $newstat just go around again and hope the problem
	# clears up. Without a usable $stat, the new sample becomes the
	# old one and we go around again to collect a second sample.

	next unless ref($newstat);
	unless (ref($stat)) {
		$stat = $newstat;
		next;
	}


	# 2. Diff $newstat against the previous sample, giving %$delta.

	$delta = generate_delta($stat, $newstat);
	dumpdelta($delta) if $debug;	# Dump most recent stats to stdout.
	$stat = $newstat;	# The new stats now become the old stats.


	# 3. A delta with {missing} set means cpus or interrupts (probably
	# both) have been reconfigured, so the statistics gathered so far
	# are no longer valid.
	#
	# Likewise, a delta spanning a very long range of time means the
	# system has been so overloaded that intrd has not been allowed to
	# run effectively for a while. In both cases toss the old
	# statistics and start from scratch.

	$deltatime = $delta->{maxsnap} - $delta->{minsnap};
	if ($delta->{missing} > 0 || $deltatime > $statslen) {
		clear_deltas();
		syslog('debug', "evaluating interrupt assignments");
		next;
	}


	# 4. Add the new delta to the list of deltas and to the running
	# total. If this delta is the one that takes us from below
	# $statslen seconds of data to at least $statslen seconds, it is
	# time to evaluate a reconfiguration.

	$below_statslen = ($deltas_tottime < $statslen);
	$deltas_tottime += $deltatime;
	$do_reconfig = ($below_statslen && $deltas_tottime >= $statslen);
	push(@deltas, $delta);

	# 5. @deltas is a moving average over the last $statslen seconds.
	# Shift off the oldest deltas for as long as doing so does not
	# take us below $statslen seconds, always keeping at least one.

	while (scalar(@deltas) > 1) {
		$olddelta = $deltas[0];
		$olddeltatime = $olddelta->{maxsnap} - $olddelta->{minsnap};
		$newtime = $deltas_tottime - $olddeltatime;
		if ($newtime < $statslen) {
			last;
		}

		shift(@deltas);
		$deltas_tottime = $newtime;
	}

	# 6. The brains of the operation. Boil @deltas down into a single
	# "compressed" delta summarizing them all and compute its goodness.
	# If we have a full $statslen seconds of data and are imbalanced
	# relative to the baseline, set $do_reconfig (step 4 may already
	# have set it). If $do_reconfig is set, hand the compressed delta
	# to do_reconfig() and look at what comes back:
	#
	# $ret == -1 : failure
	# $ret ==  0 : current config is optimal (or close enough)
	# $ret ==  1 : reconfiguration has occurred
	#
	# For -1 and 1, dump all our deltas and start from scratch; step 4
	# will set $do_reconfig again soon thereafter.
	#
	# For 0 nothing has happened because we're already good enough, so
	# the current goodness becomes the new baseline_goodness.

	$compdelta = compress_deltas(\@deltas);
	if (VERIFY(ref($compdelta) eq "HASH", "couldn't compress deltas")) {
		clear_deltas();
		next;
	}
	$goodness = $compdelta->{goodness} = goodness($compdelta);
	dumpdelta($compdelta) if $debug;

	syslog('debug', "GOODNESS: %5.2f%%", $goodness * 100);

	$do_reconfig = 1
	    if ($deltas_tottime >= $statslen &&
	    imbalanced($goodness, $baseline_goodness));

	if ($do_reconfig) {
		$ret = do_reconfig($compdelta);

		if ($ret == 0) {
			syslog('debug', "setting new baseline of $goodness");
			$baseline_goodness = $goodness;
		} else {
			clear_deltas();
			syslog('debug', "do_reconfig FAILED!") if $ret == -1;
		}
	}
	syslog('debug', "---------------------------------------");
}
1329