1# Run this TCL script using "testfixture" in order get a report that shows
2# how much disk space is used by a particular data to actually store data
3# versus how much space is unused.
4#
5
6# Get the name of the database to analyze
7#
8if {[llength $argv]!=1} {
9  puts stderr "Usage: $argv0 database-name"
10  exit 1
11}
12set file_to_analyze [lindex $argv 0]
13if {![file exists $file_to_analyze]} {
14  puts stderr "No such file: $file_to_analyze"
15  exit 1
16}
17if {![file readable $file_to_analyze]} {
18  puts stderr "File is not readable: $file_to_analyze"
19  exit 1
20}
21if {[file size $file_to_analyze]<2048} {
22  puts stderr "Empty or malformed database: $file_to_analyze"
23  exit 1
24}
25
26# Open the database
27#
28sqlite db [lindex $argv 0]
29set DB [btree_open [lindex $argv 0]]
30
31# In-memory database for collecting statistics
32#
33sqlite mem :memory:
34set tabledef\
35{CREATE TABLE space_used(
36   name clob,        -- Name of a table or index in the database file
37   tblname clob,     -- Name of associated table
38   is_index boolean, -- TRUE if it is an index, false for a table
39   nentry int,       -- Number of entries in the BTree
40   payload int,      -- Total amount of data stored in this table or index
41   mx_payload int,   -- Maximum payload size
42   n_ovfl int,       -- Number of entries that overflow
43   pri_pages int,    -- Number of primary pages used
44   ovfl_pages int,   -- Number of overflow pages used
45   pri_unused int,   -- Number of unused bytes on primary pages
46   ovfl_unused int   -- Number of unused bytes on overflow pages
47);}
48mem eval $tabledef
49
50# This query will be used to find the root page number for every index and
51# table in the database.
52#
53set sql {
54  SELECT name, tbl_name, type, rootpage
55    FROM sqlite_master WHERE type IN ('table','index')
56  UNION ALL
57  SELECT 'sqlite_master', 'sqlite_master', 'table', 2
58  ORDER BY 1
59}
60
61# Analyze every table in the database, one at a time.
62#
63foreach {name tblname type rootpage} [db eval $sql] {
64  puts stderr "Analyzing $name..."
65  set cursor [btree_cursor $DB $rootpage 0]
66  set go [btree_first $cursor]
67  set size 0
68  catch {unset pg_used}
69  set unused_ovfl 0
70  set n_overflow 0
71  set cnt_ovfl 0
72  set n_entry 0
73  set mx_size 0
74  set pg_used($rootpage) 1016
75  while {$go==0} {
76    incr n_entry
77    set payload [btree_payload_size $cursor]
78    incr size $payload
79    set stat [btree_cursor_dump $cursor]
80    set pgno [lindex $stat 0]
81    set freebytes [lindex $stat 4]
82    set pg_used($pgno) $freebytes
83    if {$payload>236} {
84      # if {[lindex $stat 8]==0} {error "overflow is empty with $payload"}
85      set n [expr {($payload-236+1019)/1020}]
86      incr n_overflow $n
87      incr cnt_ovfl
88      incr unused_ovfl [expr {$n*1020+236-$payload}]
89    } else {
90      # if {[lindex $stat 8]!=0} {error "overflow not empty with $payload"}
91    }
92    if {$payload>$mx_size} {set mx_size $payload}
93    set go [btree_next $cursor]
94  }
95  btree_close_cursor $cursor
96  set n_primary [llength [array names pg_used]]
97  set unused_primary 0
98  foreach x [array names pg_used] {incr unused_primary $pg_used($x)}
99  regsub -all ' $name '' name
100  set sql "INSERT INTO space_used VALUES('$name'"
101  regsub -all ' $tblname '' tblname
102  append sql ",'$tblname',[expr {$type=="index"}],$n_entry"
103  append sql ",$size,$mx_size,$cnt_ovfl,"
104  append sql "$n_primary,$n_overflow,$unused_primary,$unused_ovfl);"
105  mem eval $sql
106}
107
108# Generate a single line of output in the statistics section of the
109# report.
110#
111proc statline {title value {extra {}}} {
112  set len [string length $title]
113  set dots [string range {......................................} $len end]
114  set len [string length $value]
115  set sp2 [string range {          } $len end]
116  if {$extra ne ""} {
117    set extra " $extra"
118  }
119  puts "$title$dots $value$sp2$extra"
120}
121
122# Generate a formatted percentage value for $num/$denom
123#
124proc percent {num denom} {
125  if {$denom==0.0} {return ""}
126  set v [expr {$num*100.0/$denom}]
127  if {$v>1.0 && $v<99.0} {
128    return [format %4.1f%% $v]
129  } elseif {$v<0.1 || $v>99.9} {
130    return [format %6.3f%% $v]
131  } else {
132    return [format %5.2f%% $v]
133  }
134}
135
136# Generate a subreport that covers some subset of the database.
137# the $where clause determines which subset to analyze.
138#
139proc subreport {title where} {
140  set hit 0
141  mem eval "SELECT sum(nentry) AS nentry, \
142                   sum(payload) AS payload, \
143                   sum(CASE is_index WHEN 1 THEN 0 ELSE payload-4*nentry END) \
144                       AS data, \
145                   max(mx_payload) AS mx_payload, \
146                   sum(n_ovfl) as n_ovfl, \
147                   sum(pri_pages) AS pri_pages, \
148                   sum(ovfl_pages) AS ovfl_pages, \
149                   sum(pri_unused) AS pri_unused, \
150                   sum(ovfl_unused) AS ovfl_unused \
151            FROM space_used WHERE $where" {} {set hit 1}
152  if {!$hit} {return 0}
153  puts ""
154  set len [string length $title]
155  incr len 5
156  set stars "***********************************"
157  append stars $stars
158  set stars [string range $stars $len end]
159  puts "*** $title $stars"
160  puts ""
161  statline "Percentage of total database" \
162     [percent [expr {$pri_pages+$ovfl_pages}] $::file_pgcnt]
163  statline "Number of entries" $nentry
164  set storage [expr {($pri_pages+$ovfl_pages)*1024}]
165  statline "Bytes of storage consumed" $storage
166  statline "Bytes of payload" $payload [percent $payload $storage]
167  statline "Bytes of data" $data [percent $data $storage]
168  set key [expr {$payload-$data}]
169  statline "Bytes of key" $key [percent $key $storage]
170  set avgpay [expr {$nentry>0?$payload/$nentry:0}]
171  statline "Average payload per entry" $avgpay
172  set avgunused [expr {$nentry>0?($pri_unused+$ovfl_unused)/$nentry:0}]
173  statline "Average unused bytes per entry" $avgunused
174  statline "Average fanout" \
175     [format %.2f [expr {$pri_pages==0?0:($nentry+0.0)/$pri_pages}]]
176  statline "Maximum payload per entry" $mx_payload
177  statline "Entries that use overflow" $n_ovfl [percent $n_ovfl $nentry]
178  statline "Total pages used" [set allpgs [expr {$pri_pages+$ovfl_pages}]]
179  statline "Primary pages used" $pri_pages ;# [percent $pri_pages $allpgs]
180  statline "Overflow pages used" $ovfl_pages ;# [percent $ovfl_pages $allpgs]
181  statline "Unused bytes on primary pages" $pri_unused \
182               [percent $pri_unused [expr {$pri_pages*1024}]]
183  statline "Unused bytes on overflow pages" $ovfl_unused \
184               [percent $ovfl_unused [expr {$ovfl_pages*1024}]]
185  set allunused [expr {$ovfl_unused+$pri_unused}]
186  statline "Unused bytes on all pages" $allunused \
187               [percent $allunused [expr {$allpgs*1024}]]
188  return 1
189}
190
191# Output summary statistics:
192#
193puts "/** Disk-Space Utilization Report For $file_to_analyze"
194puts "*** As of [clock format [clock seconds] -format {%Y-%b-%d %H:%M:%S}]"
195puts ""
196set fsize [file size [lindex $argv 0]]
197set file_pgcnt [expr {$fsize/1024}]
198set usedcnt [mem eval {SELECT sum(pri_pages+ovfl_pages) FROM space_used}]
199set freecnt [expr {$file_pgcnt-$usedcnt-1}]
200set freecnt2 [lindex [btree_get_meta $DB] 0]
201statline {Pages in the whole file (measured)} $file_pgcnt
202set file_pgcnt2 [expr {$usedcnt+$freecnt2+1}]
203statline {Pages in the whole file (calculated)} $file_pgcnt2
204statline {Pages that store data} $usedcnt [percent $usedcnt $file_pgcnt]
205statline {Pages on the freelist (per header)}\
206   $freecnt2 [percent $freecnt2 $file_pgcnt]
207statline {Pages on the freelist (calculated)}\
208   $freecnt [percent $freecnt $file_pgcnt]
209statline {Header pages} 1 [percent 1 $file_pgcnt]
210
211set ntable [db eval {SELECT count(*)+1 FROM sqlite_master WHERE type='table'}]
212statline {Number of tables in the database} $ntable
213set nindex [db eval {SELECT count(*) FROM sqlite_master WHERE type='index'}]
214set autoindex [db eval {SELECT count(*) FROM sqlite_master
215                        WHERE type='index' AND name LIKE '(% autoindex %)'}]
216set manindex [expr {$nindex-$autoindex}]
217statline {Number of indices} $nindex
218statline {Number of named indices} $manindex [percent $manindex $nindex]
219statline {Automatically generated indices} $autoindex \
220     [percent $autoindex $nindex]
221
222set bytes_data [mem eval "SELECT sum(payload-4*nentry) FROM space_used
223                          WHERE NOT is_index AND name!='sqlite_master'"]
224set total_payload [mem eval "SELECT sum(payload) FROM space_used"]
225statline "Size of the file in bytes" $fsize
226statline "Bytes of payload stored" $total_payload \
227    [percent $total_payload $fsize]
228statline "Bytes of user data stored" $bytes_data \
229    [percent $bytes_data $fsize]
230
231# Output table rankings
232#
233puts ""
234puts "*** Page counts for all tables with their indices ********************"
235puts ""
236mem eval {SELECT tblname, count(*) AS cnt, sum(pri_pages+ovfl_pages) AS size
237          FROM space_used GROUP BY tblname ORDER BY size DESC, tblname} {} {
238  statline [string toupper $tblname] $size [percent $size $file_pgcnt]
239}
240
241# Output subreports
242#
243if {$nindex>0} {
244  subreport {All tables and indices} 1
245}
246subreport {All tables} {NOT is_index}
247if {$nindex>0} {
248  subreport {All indices} {is_index}
249}
250foreach tbl [mem eval {SELECT name FROM space_used WHERE NOT is_index
251                       ORDER BY name}] {
252  regsub ' $tbl '' qn
253  set name [string toupper $tbl]
254  set n [mem eval "SELECT count(*) FROM space_used WHERE tblname='$qn'"]
255  if {$n>1} {
256    subreport "Table $name and all its indices" "tblname='$qn'"
257    subreport "Table $name w/o any indices" "name='$qn'"
258    subreport "Indices of table $name" "tblname='$qn' AND is_index"
259  } else {
260    subreport "Table $name" "name='$qn'"
261  }
262}
263
264# Output instructions on what the numbers above mean.
265#
266puts {
267*** Definitions ******************************************************
268
269Number of pages in the whole file
270
271    The number of 1024-byte pages that go into forming the complete database
272
273Pages that store data
274
275    The number of pages that store data, either as primary B*Tree pages or
276    as overflow pages.  The number at the right is the data pages divided by
277    the total number of pages in the file.
278
279Pages on the freelist
280
281    The number of pages that are not currently in use but are reserved for
282    future use.  The percentage at the right is the number of freelist pages
283    divided by the total number of pages in the file.
284
285Header pages
286
287    The number of pages of header overhead in the database.  This value is
288    always 1.  The percentage at the right is the number of header pages
289    divided by the total number of pages in the file.
290
291Number of tables in the database
292
293    The number of tables in the database, including the SQLITE_MASTER table
294    used to store schema information.
295
296Number of indices
297
298    The total number of indices in the database.
299
300Number of named indices
301
302    The number of indices created using an explicit CREATE INDEX statement.
303
304Automatically generated indices
305
306    The number of indices used to implement PRIMARY KEY or UNIQUE constraints
307    on tables.
308
309Size of the file in bytes
310
311    The total amount of disk space used by the entire database files.
312
313Bytes of payload stored
314
315    The total number of bytes of payload stored in the database.  Payload
316    includes both key and data.  The content of the SQLITE_MASTER table is
317    counted when computing this number.  The percentage at the right shows
318    the payload divided by the total file size.
319
320Bytes of user data stored
321
322    The total number of bytes of data stored in the database, not counting
323    the database schema information stored in the SQLITE_MASTER table.  The
324    percentage at the right is the user data size divided by the total file
325    size.
326
327Percentage of total database
328
329    The amount of the complete database file that is devoted to storing
330    information described by this category.
331
332Number of entries
333
334    The total number of B*Tree key/value pairs stored under this category.
335
336Bytes of storage consumed
337
338    The total amount of disk space required to store all B*Tree entries
339    under this category.  The is the total number of pages used times
340    the pages size (1024).
341
342Bytes of payload
343
344    The amount of payload stored under this category.  Payload is the sum
345    of keys and data.  Each table entry has 4 bytes of key and an arbitrary
346    amount of data.  Each index entry has 4 or more bytes of key and no
347    data.  The percentage at the right is the bytes of payload divided by
348    the bytes of storage consumed.
349
350Bytes of data
351
352    The amount of data stored under this category.  The data space reported
353    includes formatting information such as nul-terminators and field-lengths
354    that are stored with the data.  The percentage at the right is the bytes
355    of data divided by bytes of storage consumed.
356
357Bytes of key
358
359    The sum of the sizes of all keys under this category.  The percentage at
360    the right is the bytes of key divided by the bytes of storage consumed.
361
362Average payload per entry
363
364    The average amount of payload on each entry.  This is just the bytes of
365    payload divided by the number of entries.
366
367Average unused bytes per entry
368
369    The average amount of free space remaining on all pages under this
370    category on a per-entry basis.  This is the number of unused bytes on
371    all pages divided by the number of entries.
372
373Maximum payload per entry
374
375    The largest payload size of any entry.
376
377Entries that use overflow
378
379    Up to 236 bytes of payload for each entry are stored directly in the
380    primary B*Tree page.  Any additional payload is stored on a linked list
381    of overflow pages.  This is the number of entries that exceed 236 bytes
382    in size.  The value to the right is the number of entries that overflow
383    divided by the total number of entries.
384
385Total pages used
386
387    This is the number of 1024 byte pages used to hold all information in
388    the current category.  This is the sum of primary and overflow pages.
389
390Primary pages used
391
392    This is the number of primary B*Tree pages used.
393
394Overflow pages used
395
396    The total number of overflow pages used for this category.
397
398Unused bytes on primary pages
399
400    The total number of bytes of unused space on all primary pages.  The
401    percentage at the right is the number of unused bytes divided by the
402    total number of bytes on primary pages.
403
404Unused bytes on overflow pages
405
406    The total number of bytes of unused space on all overflow pages.  The
407    percentage at the right is the number of unused bytes divided by the
408    total number of bytes on overflow pages.
409
410Unused bytes on all pages
411
412    The total number of bytes of unused space on all primary and overflow
413    pages.  The percentage at the right is the number of unused bytes
414    divided by the total number of bytes.
415}
416
417# Output the database
418#
419puts "**********************************************************************"
420puts "The entire text of this report can be sourced into any SQL database"
421puts "engine for further analysis.  All of the text above is an SQL comment."
422puts "The data used to generate this report follows:"
423puts "*/"
424puts "BEGIN;"
425puts $tabledef
426unset -nocomplain x
427mem eval {SELECT * FROM space_used} x {
428  puts -nonewline "INSERT INTO space_used VALUES("
429  regsub ' $x(name) '' qn
430  regsub ' $x(tblname) '' qtn
431  puts -nonewline "'$qn','$qtn',"
432  puts -nonewline "$x(is_index),$x(nentry),$x(payload),$x(mx_payload),"
433  puts -nonewline "$x(n_ovfl),$x(pri_pages),$x(ovfl_pages),$x(pri_unused),"
434  puts "$x(ovfl_unused));"
435}
436puts "COMMIT;"
437