1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2015 Joyent, Inc.
24 */
25
26/*
27 * Copyright (c) 1987, 2010, Oracle and/or its affiliates. All rights reserved.
28 */
29
30/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
31/*	  All Rights Reserved	*/
32
33/*
34 * University Copyright- Copyright (c) 1982, 1986, 1988
35 * The Regents of the University of California
36 * All Rights Reserved
37 *
38 * University Acknowledgment- Portions of this document are derived from
39 * software developed by the University of California, Berkeley, and its
40 * contributors.
41 */
42
43/*
44 * Each physical swap area has an associated bitmap representing
45 * its physical storage. The bitmap records which swap slots are
46 * currently allocated or freed.  Allocation is done by searching
47 * through the bitmap for the first free slot. Thus, there's
48 * no linear relation between offset within the swap device and the
49 * address (within its segment(s)) of the page that the slot backs;
50 * instead, it's an arbitrary one-to-one mapping.
51 *
52 * Associated with each swap area is a swapinfo structure.  These
53 * structures are linked into a linear list that determines the
54 * ordering of swap areas in the logical swap device.  Each contains a
55 * pointer to the corresponding bitmap, the area's size, and its
56 * associated vnode.
57 */
58
59#include <sys/types.h>
60#include <sys/inttypes.h>
61#include <sys/param.h>
62#include <sys/t_lock.h>
63#include <sys/sysmacros.h>
64#include <sys/systm.h>
65#include <sys/errno.h>
66#include <sys/kmem.h>
67#include <sys/vfs.h>
68#include <sys/vnode.h>
69#include <sys/pathname.h>
70#include <sys/cmn_err.h>
71#include <sys/vtrace.h>
72#include <sys/swap.h>
73#include <sys/dumphdr.h>
74#include <sys/debug.h>
75#include <sys/fs/snode.h>
76#include <sys/fs/swapnode.h>
77#include <sys/policy.h>
78#include <sys/zone.h>
79
80#include <vm/as.h>
81#include <vm/seg.h>
82#include <vm/page.h>
83#include <vm/seg_vn.h>
84#include <vm/hat.h>
85#include <vm/anon.h>
86#include <vm/seg_map.h>
87
88/*
89 * To balance the load among multiple swap areas, we don't allow
90 * more than swap_maxcontig allocations to be satisfied from a
91 * single swap area before moving on to the next swap area.  This
92 * effectively "interleaves" allocations among the many swap areas.
93 */
94int swap_maxcontig;	/* set by anon_init() to 1 Mb */
95
96#define	MINIROOTSIZE	12000	/* ~6 Meg XXX */
97
98/*
99 * XXX - this lock is a kludge. It serializes some aspects of swapadd() and
100 * swapdel() (namely VOP_OPEN, VOP_CLOSE, VN_RELE).  It protects against
101 * somebody swapadd'ing and getting swap slots from a vnode, while someone
102 * else is in the process of closing or rele'ing it.
103 */
104static kmutex_t swap_lock;
105
106kmutex_t swapinfo_lock;
107
108/*
109 * protected by the swapinfo_lock
110 */
111extern struct swapinfo	*swapinfo;
112
113static	struct	swapinfo *silast;
114static	int	nswapfiles;
115
116static u_offset_t	swap_getoff(struct swapinfo *);
117static int	swapadd(struct vnode *, ulong_t, ulong_t, char *);
118static int	swapdel(struct vnode *, ulong_t);
119static int	swapslot_free(struct vnode *, u_offset_t, struct swapinfo *);
120
/*
 * Swap device bitmap allocation macros.
 *
 * A swap area's slot bitmap is an array of words; slot i is tracked in
 * word (i >> MAPSHIFT), at bit position (i % NBBW) within that word.
 *
 *	TESTBIT(map, i)		non-zero if slot i is marked allocated
 *	SETBIT(map, i)		mark slot i allocated
 *	CLEARBIT(map, i)	mark slot i free
 *
 * The single-bit mask is built from an unsigned constant: with a plain
 * (signed) 1, selecting bit 31 would shift into the sign bit, which is
 * undefined behavior in C.  Both arguments are evaluated more than once
 * in SETBIT/CLEARBIT-style use, so do not pass expressions with side
 * effects.
 */
#define	MAPSHIFT	5
#define	NBBW		(NBPW * NBBY)	/* number of bits per word */
#define	TESTBIT(map, i)		\
	(((map)[(i) >> MAPSHIFT] & (1U << ((i) % NBBW))))
#define	SETBIT(map, i)		\
	(((map)[(i) >> MAPSHIFT] |= (1U << ((i) % NBBW))))
#define	CLEARBIT(map, i)	\
	(((map)[(i) >> MAPSHIFT] &= ~(1U << ((i) % NBBW))))
129
130int swap_debug = 0;	/* set for debug printf's */
131int swap_verify = 0;	/* set to verify slots when freeing and allocating */
132
133uint_t swapalloc_maxcontig;
134
135/*
136 * Allocate a range of up to *lenp contiguous slots (page) from a physical
137 * swap device. Flags are one of:
138 *	SA_NOT  Must have a slot from a physical swap device other than the
139 *		the one containing input (*vpp, *offp).
140 * Less slots than requested may be returned. *lenp allocated slots are
141 * returned starting at *offp on *vpp.
142 * Returns 1 for a successful allocation, 0 for couldn't allocate any slots.
143 */
144int
145swap_phys_alloc(
146	struct vnode **vpp,
147	u_offset_t *offp,
148	size_t *lenp,
149	uint_t flags)
150{
151	struct swapinfo *sip;
152	offset_t soff, noff;
153	size_t len;
154
155	mutex_enter(&swapinfo_lock);
156	sip = silast;
157
158	/* Find a desirable physical device and allocate from it. */
159	do {
160		if (sip == NULL)
161			break;
162		if (!(sip->si_flags & ST_INDEL) &&
163		    (spgcnt_t)sip->si_nfpgs > 0) {
164			/* Caller wants other than specified swap device */
165			if (flags & SA_NOT) {
166				if (*vpp != sip->si_vp ||
167				    *offp < sip->si_soff ||
168				    *offp >= sip->si_eoff)
169					goto found;
170			/* Caller is loose, will take anything */
171			} else
172				goto found;
173		} else if (sip->si_nfpgs == 0)
174			sip->si_allocs = 0;
175		if ((sip = sip->si_next) == NULL)
176			sip = swapinfo;
177	} while (sip != silast);
178	mutex_exit(&swapinfo_lock);
179	return (0);
180found:
181	soff = swap_getoff(sip);
182	sip->si_nfpgs--;
183	if (soff == -1)
184		panic("swap_alloc: swap_getoff failed!");
185
186	for (len = PAGESIZE; len < *lenp; len += PAGESIZE) {
187		if (sip->si_nfpgs == 0)
188			break;
189		if (swapalloc_maxcontig && len >= swapalloc_maxcontig)
190			break;
191		noff = swap_getoff(sip);
192		if (noff == -1) {
193			break;
194		} else if (noff != soff + len) {
195			CLEARBIT(sip->si_swapslots, btop(noff - sip->si_soff));
196			break;
197		}
198		sip->si_nfpgs--;
199	}
200	*vpp = sip->si_vp;
201	*offp = soff;
202	*lenp = len;
203	ASSERT((spgcnt_t)sip->si_nfpgs >= 0);
204	sip->si_allocs += btop(len);
205	if (sip->si_allocs >= swap_maxcontig) {
206		sip->si_allocs = 0;
207		if ((silast = sip->si_next) == NULL)
208			silast = swapinfo;
209	}
210	TRACE_2(TR_FAC_VM, TR_SWAP_ALLOC,
211	    "swap_alloc:sip %p offset %lx", sip, soff);
212	mutex_exit(&swapinfo_lock);
213	return (1);
214}
215
216int swap_backsearch = 0;
217
218/*
219 * Get a free offset on swap device sip.
220 * Return >=0 offset if succeeded, -1 for failure.
221 */
222static u_offset_t
223swap_getoff(struct swapinfo *sip)
224{
225	uint_t *sp, *ep;
226	size_t aoff, boff, poff, slotnumber;
227
228	ASSERT(MUTEX_HELD(&swapinfo_lock));
229
230	sip->si_alloccnt++;
231	for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
232	    ep = &sip->si_swapslots[sip->si_mapsize / NBPW]; sp < ep; sp++) {
233		if (*sp != (uint_t)0xffffffff)
234			goto foundentry;
235		else
236			sip->si_checkcnt++;
237	}
238	SWAP_PRINT(SW_ALLOC,
239	    "swap_getoff: couldn't find slot from hint %ld to end\n",
240	    sip->si_hint, 0, 0, 0, 0);
241	/*
242	 * Go backwards? Check for faster method XXX
243	 */
244	if (swap_backsearch) {
245		for (sp = &sip->si_swapslots[sip->si_hint >> MAPSHIFT],
246		    ep = sip->si_swapslots; sp > ep; sp--) {
247			if (*sp != (uint_t)0xffffffff)
248				goto foundentry;
249			else
250				sip->si_checkcnt++;
251		}
252	} else {
253		for (sp = sip->si_swapslots,
254		    ep = &sip->si_swapslots[sip->si_hint >> MAPSHIFT];
255		    sp < ep; sp++) {
256			if (*sp != (uint_t)0xffffffff)
257				goto foundentry;
258			else
259				sip->si_checkcnt++;
260		}
261	}
262	if (*sp == 0xffffffff) {
263		cmn_err(CE_WARN, "No free swap slots!");
264		return ((u_offset_t)-1);
265	}
266
267foundentry:
268	/*
269	 * aoff is the page number offset (in bytes) of the si_swapslots
270	 * array element containing a free page
271	 *
272	 * boff is the page number offset of the free page
273	 * (i.e. cleared bit) in si_swapslots[aoff].
274	 */
275	aoff = ((char *)sp - (char *)sip->si_swapslots) * NBBY;
276
277	for (boff = (sip->si_hint % NBBW); boff < NBBW; boff++) {
278		if (!TESTBIT(sip->si_swapslots, aoff + boff))
279			goto foundslot;
280		else
281			sip->si_checkcnt++;
282	}
283	for (boff = 0; boff < (sip->si_hint % NBBW); boff++) {
284		if (!TESTBIT(sip->si_swapslots, aoff + boff))
285			goto foundslot;
286		else
287			sip->si_checkcnt++;
288	}
289	panic("swap_getoff: didn't find slot in word hint %ld", sip->si_hint);
290
291foundslot:
292	/*
293	 * Return the offset of the free page in swap device.
294	 * Convert page number of byte offset and add starting
295	 * offset of swap device.
296	 */
297	slotnumber = aoff + boff;
298	SWAP_PRINT(SW_ALLOC, "swap_getoff: allocating slot %ld\n",
299	    slotnumber, 0, 0, 0, 0);
300	poff = ptob(slotnumber);
301	if (poff + sip->si_soff >= sip->si_eoff)
302		printf("ptob(aoff(%ld) + boff(%ld))(%ld) >= eoff(%ld)\n",
303		    aoff, boff, ptob(slotnumber), (long)sip->si_eoff);
304	ASSERT(poff < sip->si_eoff);
305	/*
306	 * We could verify here that the slot isn't already allocated
307	 * by looking through all the anon slots.
308	 */
309	SETBIT(sip->si_swapslots, slotnumber);
310	sip->si_hint = slotnumber + 1;	/* hint = next slot */
311	return (poff + sip->si_soff);
312}
313
314/*
315 * Free a swap page.
316 */
317void
318swap_phys_free(struct vnode *vp, u_offset_t off, size_t len)
319{
320	struct swapinfo *sip;
321	ssize_t pagenumber, npage;
322
323	mutex_enter(&swapinfo_lock);
324	sip = swapinfo;
325
326	do {
327		if (sip->si_vp == vp &&
328		    sip->si_soff <= off && off < sip->si_eoff) {
329			for (pagenumber = btop(off - sip->si_soff),
330			    npage = btop(len) + pagenumber;
331			    pagenumber < npage; pagenumber++) {
332				SWAP_PRINT(SW_ALLOC,
333				    "swap_phys_free: freeing slot %ld on "
334				    "sip %p\n",
335				    pagenumber, sip, 0, 0, 0);
336				if (!TESTBIT(sip->si_swapslots, pagenumber)) {
337					panic(
338					    "swap_phys_free: freeing free slot "
339					    "%p,%lx\n", (void *)vp,
340					    ptob(pagenumber) + sip->si_soff);
341				}
342				CLEARBIT(sip->si_swapslots, pagenumber);
343				sip->si_nfpgs++;
344			}
345			ASSERT(sip->si_nfpgs <= sip->si_npgs);
346			mutex_exit(&swapinfo_lock);
347			return;
348		}
349	} while ((sip = sip->si_next) != NULL);
350	panic("swap_phys_free");
351	/*NOTREACHED*/
352}
353
354/*
355 * Return the anon struct corresponding for the given
356 * <vnode, off> if it is part of the virtual swap device.
357 * Return the anon struct if found, otherwise NULL.
358 */
359struct anon *
360swap_anon(struct vnode *vp, u_offset_t off)
361{
362	struct anon *ap;
363
364	ASSERT(MUTEX_HELD(AH_MUTEX(vp, off)));
365
366	for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL; ap = ap->an_hash) {
367		if (ap->an_vp == vp && ap->an_off == off)
368			return (ap);
369	}
370	return (NULL);
371}
372
373
374/*
375 * Determine if the vp offset range overlap a swap device.
376 */
377int
378swap_in_range(struct vnode *vp, u_offset_t offset, size_t len)
379{
380	struct swapinfo *sip;
381	u_offset_t eoff;
382
383	eoff = offset + len;
384	ASSERT(eoff > offset);
385
386	mutex_enter(&swapinfo_lock);
387	sip = swapinfo;
388	if (vp && sip) {
389		do {
390			if (vp != sip->si_vp || eoff <= sip->si_soff ||
391			    offset >= sip->si_eoff)
392				continue;
393			mutex_exit(&swapinfo_lock);
394			return (1);
395		} while ((sip = sip->si_next) != NULL);
396	}
397	mutex_exit(&swapinfo_lock);
398	return (0);
399}
400
401/*
402 * See if name is one of our swap files
403 * even though lookupname failed.
404 * This can be used by swapdel to delete
405 * swap resources on remote machines
406 * where the link has gone down.
407 */
408static struct vnode *
409swapdel_byname(
410	char	*name,			/* pathname to delete */
411	ulong_t lowblk)			/* Low block number of area to delete */
412{
413	struct swapinfo **sipp, *osip;
414	u_offset_t soff;
415
416	/*
417	 * Find the swap file entry for the file to
418	 * be deleted. Skip any entries that are in
419	 * transition.
420	 */
421
422	soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */
423
424	mutex_enter(&swapinfo_lock);
425	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
426		if ((strcmp(osip->si_pname, name) == 0) &&
427		    (osip->si_soff == soff) && (osip->si_flags == 0)) {
428			struct vnode *vp = osip->si_vp;
429
430			VN_HOLD(vp);
431			mutex_exit(&swapinfo_lock);
432			return (vp);
433		}
434	}
435	mutex_exit(&swapinfo_lock);
436	return (NULL);
437}
438
439
440/*
441 * New system call to manipulate swap files.
442 */
443int
444swapctl(int sc_cmd, void *sc_arg, int *rv)
445{
446	struct swapinfo *sip, *csip, *tsip;
447	int error = 0;
448	struct swapent st, *ust;
449	struct swapres sr;
450	struct vnode *vp;
451	int cnt = 0;
452	int tmp_nswapfiles;
453	int nswap;
454	int length, nlen;
455	int gplen = 0, plen;
456	char *swapname;
457	char *pname;
458	char *tpname;
459	struct anoninfo ai;
460	spgcnt_t avail;
461	int global = INGLOBALZONE(curproc);
462	struct zone *zp = curproc->p_zone;
463
464	/*
465	 * When running in a zone we want to hide the details of the swap
466	 * devices: we report there only being one swap device named "swap"
467	 * having a size equal to the sum of the sizes of all real swap devices
468	 * on the system.
469	 */
470	switch (sc_cmd) {
471	case SC_GETNSWP:
472		if (global)
473			*rv = nswapfiles;
474		else
475			*rv = 1;
476		return (0);
477
478	case SC_AINFO:
479		/*
480		 * Return anoninfo information with these changes:
481		 * ani_max = maximum amount of swap space
482		 *	(including potentially available physical memory)
483		 * ani_free = amount of unallocated anonymous memory
484		 *	(some of which might be reserved and including
485		 *	 potentially available physical memory)
486		 * ani_resv = amount of claimed (reserved) anonymous memory
487		 */
488		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
489		ai.ani_max = (k_anoninfo.ani_max +
490		    k_anoninfo.ani_mem_resv) + avail;
491
492		/* Update ani_free */
493		set_anoninfo();
494		ai.ani_free = k_anoninfo.ani_free + avail;
495
496		ai.ani_resv = k_anoninfo.ani_phys_resv +
497		    k_anoninfo.ani_mem_resv;
498
499		if (!global && zp->zone_max_swap_ctl != UINT64_MAX) {
500			/*
501			 * We're in a non-global zone with a swap cap.  We
502			 * always report the system-wide values for the global
503			 * zone, even though it too can have a swap cap.
504			 */
505
506			/*
507			 * For a swap-capped zone, the numbers are contrived
508			 * since we don't have a correct value of 'reserved'
509			 * for the zone.
510			 *
511			 * The ani_max value is always the zone's swap cap.
512			 *
513			 * The ani_free value is always the difference between
514			 * the cap and the amount of swap in use by the zone.
515			 *
516			 * The ani_resv value is typically set to be the amount
517			 * of swap in use by the zone, but can be adjusted
518			 * upwards to indicate how much swap is currently
519			 * unavailable to that zone due to usage by entities
520			 * outside the zone.
521			 *
522			 * This works as follows.
523			 *
524			 * In the 'swap -s' output, the data is displayed
525			 * as follows:
526			 *    allocated = ani_max  - ani_free
527			 *    reserved  = ani_resv - allocated
528			 *    available = ani_max  - ani_resv
529			 *
530			 * Taking a contrived example, if the swap cap is 100
531			 * and the amount of swap used by the zone is 75, this
532			 * gives:
533			 *    allocated = ani_max  - ani_free  = 100 - 25 = 75
534			 *    reserved  = ani_resv - allocated =  75 - 75 =  0
535			 *    available = ani_max  - ani_resv  = 100 - 75 = 25
536			 *
537			 * In this typical case, you can see that the 'swap -s'
538			 * 'reserved' will always be 0 inside a swap capped
539			 * zone.
540			 *
541			 * However, if the system as a whole has less free
542			 * swap than the zone limits allow, then we adjust
543			 * the ani_resv value up so that it is the difference
544			 * between the zone cap and the amount of free system
545			 * swap.  Taking the above example, but when the
546			 * system as a whole only has 20 of swap available, we
547			 * get an ani_resv of 100 - 20 = 80.  This gives:
548			 *    allocated = ani_max  - ani_free  = 100 - 25 = 75
549			 *    reserved  = ani_resv - allocated =  80 - 75 =  5
550			 *    available = ani_max  - ani_resv  = 100 - 80 = 20
551			 *
552			 * In this case, you can see how the ani_resv value is
553			 * tweaked up to make the 'swap -s' numbers work inside
554			 * the zone.
555			 */
556			rctl_qty_t cap, used;
557			pgcnt_t pgcap, sys_avail;
558
559			mutex_enter(&zp->zone_mem_lock);
560			cap = zp->zone_max_swap_ctl;
561			used = zp->zone_max_swap;
562			mutex_exit(&zp->zone_mem_lock);
563
564			pgcap = MIN(btop(cap), ai.ani_max);
565			ai.ani_free = pgcap - btop(used);
566
567			/* Get the system-wide swap currently available. */
568			sys_avail = ai.ani_max - ai.ani_resv;
569			if (sys_avail < ai.ani_free)
570				ai.ani_resv = pgcap - sys_avail;
571			else
572				ai.ani_resv = btop(used);
573
574			ai.ani_max = pgcap;
575		}
576
577		if (copyout(&ai, sc_arg, sizeof (struct anoninfo)) != 0)
578			return (EFAULT);
579		return (0);
580
581	case SC_LIST:
582		if (copyin(sc_arg, &length, sizeof (int)) != 0)
583			return (EFAULT);
584		if (!global) {
585			struct swapent st;
586			char *swappath = "swap";
587
588			if (length < 1)
589				return (ENOMEM);
590			ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
591			if (copyin(ust, &st, sizeof (swapent_t)) != 0)
592				return (EFAULT);
593			st.ste_start = PAGESIZE >> SCTRSHFT;
594			st.ste_length = (off_t)0;
595			st.ste_pages = 0;
596			st.ste_free = 0;
597			st.ste_flags = 0;
598
599			mutex_enter(&swapinfo_lock);
600			for (sip = swapinfo, nswap = 0;
601			    sip != NULL && nswap < nswapfiles;
602			    sip = sip->si_next, nswap++) {
603				st.ste_length +=
604				    (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
605				st.ste_pages += sip->si_npgs;
606				st.ste_free += sip->si_nfpgs;
607			}
608			mutex_exit(&swapinfo_lock);
609
610			if (zp->zone_max_swap_ctl != UINT64_MAX) {
611				rctl_qty_t cap, used;
612
613				mutex_enter(&zp->zone_mem_lock);
614				cap = zp->zone_max_swap_ctl;
615				used = zp->zone_max_swap;
616				mutex_exit(&zp->zone_mem_lock);
617
618				st.ste_length = MIN(cap, st.ste_length);
619				st.ste_pages = MIN(btop(cap), st.ste_pages);
620				st.ste_free = MIN(st.ste_pages - btop(used),
621				    st.ste_free);
622			}
623
624			if (copyout(&st, ust, sizeof (swapent_t)) != 0 ||
625			    copyout(swappath, st.ste_path,
626			    strlen(swappath) + 1) != 0) {
627				return (EFAULT);
628			}
629			*rv = 1;
630			return (0);
631		}
632beginning:
633		mutex_enter(&swapinfo_lock);
634		tmp_nswapfiles = nswapfiles;
635		mutex_exit(&swapinfo_lock);
636
637		/*
638		 * Return early if there are no swap entries to report:
639		 */
640		if (tmp_nswapfiles < 1) {
641			*rv = 0;
642			return (0);
643		}
644
645		/* Return an error if not enough space for the whole table. */
646		if (length < tmp_nswapfiles)
647			return (ENOMEM);
648		/*
649		 * Get memory to hold the swap entries and their names. We'll
650		 * copy the real entries into these and then copy these out.
651		 * Allocating the pathname memory is only a guess so we may
652		 * find that we need more and have to do it again.
653		 * All this is because we have to hold the anon lock while
654		 * traversing the swapinfo list, and we can't be doing copyouts
655		 * and/or kmem_alloc()s during this.
656		 */
657		csip = kmem_zalloc(tmp_nswapfiles * sizeof (struct swapinfo),
658		    KM_SLEEP);
659retry:
660		nlen = tmp_nswapfiles * (gplen += 100);
661		pname = kmem_zalloc(nlen, KM_SLEEP);
662
663		mutex_enter(&swapinfo_lock);
664
665		if (tmp_nswapfiles != nswapfiles) {
666			mutex_exit(&swapinfo_lock);
667			kmem_free(pname, nlen);
668			kmem_free(csip,
669			    tmp_nswapfiles * sizeof (struct swapinfo));
670			gplen = 0;
671			goto beginning;
672		}
673		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
674		    sip && nswap < tmp_nswapfiles;
675		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
676			plen = sip->si_pnamelen;
677			if (tpname + plen - pname > nlen) {
678				mutex_exit(&swapinfo_lock);
679				kmem_free(pname, nlen);
680				goto retry;
681			}
682			*tsip = *sip;
683			tsip->si_pname = tpname;
684			(void) strcpy(tsip->si_pname, sip->si_pname);
685		}
686		mutex_exit(&swapinfo_lock);
687
688		if (sip) {
689			error = ENOMEM;
690			goto lout;
691		}
692		ust = (swapent_t *)((swaptbl_t *)sc_arg)->swt_ent;
693		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
694			if (copyin(ust, &st, sizeof (swapent_t)) != 0) {
695				error = EFAULT;
696				goto lout;
697			}
698			st.ste_flags = tsip->si_flags;
699			st.ste_length =
700			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
701			st.ste_start = tsip->si_soff >> SCTRSHFT;
702			st.ste_pages = tsip->si_npgs;
703			st.ste_free = tsip->si_nfpgs;
704			if (copyout(&st, ust, sizeof (swapent_t)) != 0) {
705				error = EFAULT;
706				goto lout;
707			}
708			if (!tsip->si_pnamelen)
709				continue;
710			if (copyout(tsip->si_pname, st.ste_path,
711			    tsip->si_pnamelen) != 0) {
712				error = EFAULT;
713				goto lout;
714			}
715		}
716		*rv = nswap;
717lout:
718		kmem_free(csip, tmp_nswapfiles * sizeof (struct swapinfo));
719		kmem_free(pname, nlen);
720		return (error);
721
722	case SC_ADD:
723	case SC_REMOVE:
724		break;
725	default:
726		return (EINVAL);
727	}
728	if ((error = secpolicy_swapctl(CRED())) != 0)
729		return (error);
730
731	if (copyin(sc_arg, &sr, sizeof (swapres_t)))
732		return (EFAULT);
733
734	/* Allocate the space to read in pathname */
735	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
736		return (ENOMEM);
737
738	error = copyinstr(sr.sr_name, swapname, MAXPATHLEN, 0);
739	if (error)
740		goto out;
741
742	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
743	if (error) {
744		if (sc_cmd == SC_ADD)
745			goto out;
746		/* see if we match by name */
747		vp = swapdel_byname(swapname, (size_t)sr.sr_start);
748		if (vp == NULL)
749			goto out;
750	}
751
752	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
753		VN_RELE(vp);
754		error = ENOSYS;
755		goto out;
756	}
757	switch (vp->v_type) {
758	case VBLK:
759		break;
760
761	case VREG:
762		if (vp->v_vfsp && vn_is_readonly(vp))
763			error = EROFS;
764		else
765			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
766		break;
767
768	case VDIR:
769		error = EISDIR;
770		break;
771	default:
772		error = ENOSYS;
773		break;
774	}
775	if (error == 0) {
776		if (sc_cmd == SC_REMOVE)
777			error = swapdel(vp, sr.sr_start);
778		else
779			error = swapadd(vp, sr.sr_start,
780			    sr.sr_length, swapname);
781	}
782	VN_RELE(vp);
783out:
784	kmem_free(swapname, MAXPATHLEN);
785	return (error);
786}
787
788#if defined(_LP64) && defined(_SYSCALL32)
789
790int
791swapctl32(int sc_cmd, void *sc_arg, int *rv)
792{
793	struct swapinfo *sip, *csip, *tsip;
794	int error = 0;
795	struct swapent32 st, *ust;
796	struct swapres32 sr;
797	struct vnode *vp;
798	int cnt = 0;
799	int tmp_nswapfiles;
800	int nswap;
801	int length, nlen;
802	int gplen = 0, plen;
803	char *swapname;
804	char *pname;
805	char *tpname;
806	struct anoninfo32 ai;
807	size_t s;
808	spgcnt_t avail;
809	int global = INGLOBALZONE(curproc);
810	struct zone *zp = curproc->p_zone;
811
812	/*
813	 * When running in a zone we want to hide the details of the swap
814	 * devices: we report there only being one swap device named "swap"
815	 * having a size equal to the sum of the sizes of all real swap devices
816	 * on the system.
817	 */
818	switch (sc_cmd) {
819	case SC_GETNSWP:
820		if (global)
821			*rv = nswapfiles;
822		else
823			*rv = 1;
824		return (0);
825
826	case SC_AINFO:
827		/*
828		 * Return anoninfo information with these changes:
829		 * ani_max = maximum amount of swap space
830		 *	(including potentially available physical memory)
831		 * ani_free = amount of unallocated anonymous memory
832		 *	(some of which might be reserved and including
833		 *	 potentially available physical memory)
834		 * ani_resv = amount of claimed (reserved) anonymous memory
835		 */
836		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
837		s = (k_anoninfo.ani_max + k_anoninfo.ani_mem_resv) + avail;
838		if (s > UINT32_MAX)
839			return (EOVERFLOW);
840		ai.ani_max = s;
841
842		/* Update ani_free */
843		set_anoninfo();
844		s = k_anoninfo.ani_free + avail;
845		if (s > UINT32_MAX)
846			return (EOVERFLOW);
847		ai.ani_free = s;
848
849		s = k_anoninfo.ani_phys_resv + k_anoninfo.ani_mem_resv;
850		if (s > UINT32_MAX)
851			return (EOVERFLOW);
852		ai.ani_resv = s;
853
854		if (!global && zp->zone_max_swap_ctl != UINT64_MAX) {
855			/*
856			 * We're in a non-global zone with a swap cap.  We
857			 * always report the system-wide values for the global
858			 * zone, even though it too can have a swap cap.
859			 * See the comment for the SC_AINFO case in swapctl()
860			 * which explains the following logic.
861			 */
862			rctl_qty_t cap, used;
863			pgcnt_t pgcap, sys_avail;
864
865			mutex_enter(&zp->zone_mem_lock);
866			cap = zp->zone_max_swap_ctl;
867			used = zp->zone_max_swap;
868			mutex_exit(&zp->zone_mem_lock);
869
870			pgcap = MIN(btop(cap), ai.ani_max);
871			ai.ani_free = pgcap - btop(used);
872
873			/* Get the system-wide swap currently available. */
874			sys_avail = ai.ani_max - ai.ani_resv;
875			if (sys_avail < ai.ani_free)
876				ai.ani_resv = pgcap - sys_avail;
877			else
878				ai.ani_resv = btop(used);
879
880			ai.ani_max = pgcap;
881		}
882
883		if (copyout(&ai, sc_arg, sizeof (ai)) != 0)
884			return (EFAULT);
885		return (0);
886
887	case SC_LIST:
888		if (copyin(sc_arg, &length, sizeof (int32_t)) != 0)
889			return (EFAULT);
890		if (!global) {
891			struct swapent32 st;
892			char *swappath = "swap";
893
894			if (length < 1)
895				return (ENOMEM);
896			ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
897			if (copyin(ust, &st, sizeof (swapent32_t)) != 0)
898				return (EFAULT);
899			st.ste_start = PAGESIZE >> SCTRSHFT;
900			st.ste_length = (off_t)0;
901			st.ste_pages = 0;
902			st.ste_free = 0;
903			st.ste_flags = 0;
904
905			mutex_enter(&swapinfo_lock);
906			for (sip = swapinfo, nswap = 0;
907			    sip != NULL && nswap < nswapfiles;
908			    sip = sip->si_next, nswap++) {
909				st.ste_length +=
910				    (sip->si_eoff - sip->si_soff) >> SCTRSHFT;
911				st.ste_pages += sip->si_npgs;
912				st.ste_free += sip->si_nfpgs;
913			}
914			mutex_exit(&swapinfo_lock);
915
916			if (zp->zone_max_swap_ctl != UINT64_MAX) {
917				rctl_qty_t cap, used;
918
919				mutex_enter(&zp->zone_mem_lock);
920				cap = zp->zone_max_swap_ctl;
921				used = zp->zone_max_swap;
922				mutex_exit(&zp->zone_mem_lock);
923
924				st.ste_length = MIN(cap, st.ste_length);
925				st.ste_pages = MIN(btop(cap), st.ste_pages);
926				st.ste_free = MIN(st.ste_pages - btop(used),
927				    st.ste_free);
928			}
929
930			if (copyout(&st, ust, sizeof (swapent32_t)) != 0 ||
931			    copyout(swappath, (caddr_t)(uintptr_t)st.ste_path,
932			    strlen(swappath) + 1) != 0) {
933				return (EFAULT);
934			}
935			*rv = 1;
936			return (0);
937		}
938beginning:
939		mutex_enter(&swapinfo_lock);
940		tmp_nswapfiles = nswapfiles;
941		mutex_exit(&swapinfo_lock);
942
943		/*
944		 * Return early if there are no swap entries to report:
945		 */
946		if (tmp_nswapfiles < 1) {
947			*rv = 0;
948			return (0);
949		}
950
951		/* Return an error if not enough space for the whole table. */
952		if (length < tmp_nswapfiles)
953			return (ENOMEM);
954		/*
955		 * Get memory to hold the swap entries and their names. We'll
956		 * copy the real entries into these and then copy these out.
957		 * Allocating the pathname memory is only a guess so we may
958		 * find that we need more and have to do it again.
959		 * All this is because we have to hold the anon lock while
960		 * traversing the swapinfo list, and we can't be doing copyouts
961		 * and/or kmem_alloc()s during this.
962		 */
963		csip = kmem_zalloc(tmp_nswapfiles * sizeof (*csip), KM_SLEEP);
964retry:
965		nlen = tmp_nswapfiles * (gplen += 100);
966		pname = kmem_zalloc(nlen, KM_SLEEP);
967
968		mutex_enter(&swapinfo_lock);
969
970		if (tmp_nswapfiles != nswapfiles) {
971			mutex_exit(&swapinfo_lock);
972			kmem_free(pname, nlen);
973			kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
974			gplen = 0;
975			goto beginning;
976		}
977		for (sip = swapinfo, tsip = csip, tpname = pname, nswap = 0;
978		    (sip != NULL) && (nswap < tmp_nswapfiles);
979		    sip = sip->si_next, tsip++, tpname += plen, nswap++) {
980			plen = sip->si_pnamelen;
981			if (tpname + plen - pname > nlen) {
982				mutex_exit(&swapinfo_lock);
983				kmem_free(pname, nlen);
984				goto retry;
985			}
986			*tsip = *sip;
987			tsip->si_pname = tpname;
988			(void) strcpy(tsip->si_pname, sip->si_pname);
989		}
990		mutex_exit(&swapinfo_lock);
991
992		if (sip != NULL) {
993			error = ENOMEM;
994			goto lout;
995		}
996		ust = (swapent32_t *)((swaptbl32_t *)sc_arg)->swt_ent;
997		for (tsip = csip, cnt = 0; cnt < nswap;  tsip++, ust++, cnt++) {
998			if (copyin(ust, &st, sizeof (*ust)) != 0) {
999				error = EFAULT;
1000				goto lout;
1001			}
1002			st.ste_flags = tsip->si_flags;
1003			st.ste_length =
1004			    (tsip->si_eoff - tsip->si_soff) >> SCTRSHFT;
1005			st.ste_start = tsip->si_soff >> SCTRSHFT;
1006			st.ste_pages = tsip->si_npgs;
1007			st.ste_free = tsip->si_nfpgs;
1008			if (copyout(&st, ust, sizeof (st)) != 0) {
1009				error = EFAULT;
1010				goto lout;
1011			}
1012			if (!tsip->si_pnamelen)
1013				continue;
1014			if (copyout(tsip->si_pname,
1015			    (caddr_t)(uintptr_t)st.ste_path,
1016			    tsip->si_pnamelen) != 0) {
1017				error = EFAULT;
1018				goto lout;
1019			}
1020		}
1021		*rv = nswap;
1022lout:
1023		kmem_free(csip, tmp_nswapfiles * sizeof (*csip));
1024		kmem_free(pname, nlen);
1025		return (error);
1026
1027	case SC_ADD:
1028	case SC_REMOVE:
1029		break;
1030	default:
1031		return (EINVAL);
1032	}
1033	if ((error = secpolicy_swapctl(CRED())) != 0)
1034		return (error);
1035
1036	if (copyin(sc_arg, &sr, sizeof (sr)))
1037		return (EFAULT);
1038
1039	/* Allocate the space to read in pathname */
1040	if ((swapname = kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
1041		return (ENOMEM);
1042
1043	error = copyinstr((caddr_t)(uintptr_t)sr.sr_name,
1044	    swapname, MAXPATHLEN, NULL);
1045	if (error)
1046		goto out;
1047
1048	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
1049	if (error) {
1050		if (sc_cmd == SC_ADD)
1051			goto out;
1052		/* see if we match by name */
1053		vp = swapdel_byname(swapname, (uint_t)sr.sr_start);
1054		if (vp == NULL)
1055			goto out;
1056	}
1057
1058	if (vp->v_flag & (VNOMAP | VNOSWAP)) {
1059		VN_RELE(vp);
1060		error = ENOSYS;
1061		goto out;
1062	}
1063	switch (vp->v_type) {
1064	case VBLK:
1065		break;
1066
1067	case VREG:
1068		if (vp->v_vfsp && vn_is_readonly(vp))
1069			error = EROFS;
1070		else
1071			error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL);
1072		break;
1073
1074	case VDIR:
1075		error = EISDIR;
1076		break;
1077	default:
1078		error = ENOSYS;
1079		break;
1080	}
1081	if (error == 0) {
1082		if (sc_cmd == SC_REMOVE)
1083			error = swapdel(vp, sr.sr_start);
1084		else
1085			error = swapadd(vp, sr.sr_start, sr.sr_length,
1086			    swapname);
1087	}
1088	VN_RELE(vp);
1089out:
1090	kmem_free(swapname, MAXPATHLEN);
1091	return (error);
1092}
1093
1094#endif /* _LP64 && _SYSCALL32 */
1095
1096/*
1097 * Add a new swap file.
1098 */
1099int
1100swapadd(struct vnode *vp, ulong_t lowblk, ulong_t nblks, char *swapname)
1101{
1102	struct swapinfo **sipp, *nsip = NULL, *esip = NULL;
1103	struct vnode *cvp;
1104	struct vattr vattr;
1105	pgcnt_t pages;
1106	u_offset_t soff, eoff;
1107	int error;
1108	ssize_t i, start, end;
1109	ushort_t wasswap;
1110	ulong_t startblk;
1111	size_t	returned_mem;
1112
1113	SWAP_PRINT(SW_CTL, "swapadd: vp %p lowblk %ld nblks %ld swapname %s\n",
1114	    vp, lowblk, nblks, swapname, 0);
1115	/*
1116	 * Get the real vnode. (If vp is not a specnode it just returns vp, so
1117	 * it does the right thing, but having this code know about specnodes
1118	 * violates the spirit of having it be indepedent of vnode type.)
1119	 */
1120	cvp = common_specvp(vp);
1121
1122	/*
1123	 * Or in VISSWAP so file system has chance to deny swap-ons during open.
1124	 */
1125	mutex_enter(&cvp->v_lock);
1126	wasswap = cvp->v_flag & VISSWAP;
1127	cvp->v_flag |= VISSWAP;
1128	mutex_exit(&cvp->v_lock);
1129
1130	mutex_enter(&swap_lock);
1131	if (error = VOP_OPEN(&cvp, FREAD|FWRITE, CRED(), NULL)) {
1132		mutex_exit(&swap_lock);
1133		/* restore state of v_flag */
1134		if (!wasswap) {
1135			mutex_enter(&cvp->v_lock);
1136			cvp->v_flag &= ~VISSWAP;
1137			mutex_exit(&cvp->v_lock);
1138		}
1139		return (error);
1140	}
1141	mutex_exit(&swap_lock);
1142
1143	/*
1144	 * Get partition size. Return error if empty partition,
1145	 * or if request does not fit within the partition.
1146	 * If this is the first swap device, we can reduce
1147	 * the size of the swap area to match what is
1148	 * available.  This can happen if the system was built
1149	 * on a machine with a different size swap partition.
1150	 */
1151	vattr.va_mask = AT_SIZE;
1152	if (error = VOP_GETATTR(cvp, &vattr, ATTR_COMM, CRED(), NULL))
1153		goto out;
1154
1155	/*
1156	 * Specfs returns a va_size of MAXOFFSET_T (UNKNOWN_SIZE) when the
1157	 * size of the device can't be determined.
1158	 */
1159	if ((vattr.va_size == 0) || (vattr.va_size == MAXOFFSET_T)) {
1160		error = EINVAL;
1161		goto out;
1162	}
1163
1164#ifdef	_ILP32
1165	/*
1166	 * No support for large swap in 32-bit OS, if the size of the swap is
1167	 * bigger than MAXOFF32_T then the size used by swapfs must be limited.
1168	 * This limitation is imposed by the swap subsystem itself, a D_64BIT
1169	 * driver as the target of swap operation should be able to field
1170	 * the IO.
1171	 */
1172	if (vattr.va_size > MAXOFF32_T) {
1173		cmn_err(CE_NOTE,
1174		    "!swap device %s truncated from 0x%llx to 0x%x bytes",
1175		    swapname, vattr.va_size, MAXOFF32_T);
1176		vattr.va_size = MAXOFF32_T;
1177	}
1178#endif	/* _ILP32 */
1179
1180	/* Fail if file not writeable (try to set size to current size) */
1181	vattr.va_mask = AT_SIZE;
1182	if (error = VOP_SETATTR(cvp, &vattr, 0, CRED(), NULL))
1183		goto out;
1184
1185	/* Fail if fs does not support VOP_PAGEIO */
1186	error = VOP_PAGEIO(cvp, (page_t *)NULL, (u_offset_t)0, 0, 0, CRED(),
1187	    NULL);
1188
1189	if (error == ENOSYS)
1190		goto out;
1191	else
1192		error = 0;
1193	/*
1194	 * If swapping on the root filesystem don't put swap blocks that
1195	 * correspond to the miniroot filesystem on the swap free list.
1196	 */
1197	if (cvp == rootdir)
1198		startblk = roundup(MINIROOTSIZE<<SCTRSHFT, klustsize)>>SCTRSHFT;
1199	else				/* Skip 1st page (disk label) */
1200		startblk = (ulong_t)(lowblk ? lowblk : 1);
1201
1202	soff = startblk << SCTRSHFT;
1203	if (soff >= vattr.va_size) {
1204		error = EINVAL;
1205		goto out;
1206	}
1207
1208	/*
1209	 * If user specified 0 blks, use the size of the device
1210	 */
1211	eoff = nblks ?  soff + (nblks - (startblk - lowblk) << SCTRSHFT) :
1212	    vattr.va_size;
1213
1214	SWAP_PRINT(SW_CTL, "swapadd: va_size %ld soff %ld eoff %ld\n",
1215	    vattr.va_size, soff, eoff, 0, 0);
1216
1217	if (eoff > vattr.va_size) {
1218		error = EINVAL;
1219		goto out;
1220	}
1221
1222	/*
1223	 * The starting and ending offsets must be page aligned.
1224	 * Round soff up to next page boundary, round eoff
1225	 * down to previous page boundary.
1226	 */
1227	soff = ptob(btopr(soff));
1228	eoff = ptob(btop(eoff));
1229	if (soff >= eoff) {
1230		SWAP_PRINT(SW_CTL, "swapadd: soff %ld >= eoff %ld\n",
1231		    soff, eoff, 0, 0, 0);
1232		error = EINVAL;
1233		goto out;
1234	}
1235
1236	pages = btop(eoff - soff);
1237
1238	/* Allocate and partially set up the new swapinfo */
1239	nsip = kmem_zalloc(sizeof (struct swapinfo), KM_SLEEP);
1240	nsip->si_vp = cvp;
1241
1242	nsip->si_soff = soff;
1243	nsip->si_eoff = eoff;
1244	nsip->si_hint = 0;
1245	nsip->si_checkcnt = nsip->si_alloccnt = 0;
1246
1247	nsip->si_pnamelen = (int)strlen(swapname) + 1;
1248	nsip->si_pname = (char *)kmem_zalloc(nsip->si_pnamelen, KM_SLEEP);
1249	bcopy(swapname, nsip->si_pname, nsip->si_pnamelen - 1);
1250	SWAP_PRINT(SW_CTL, "swapadd: allocating swapinfo for %s, %ld pages\n",
1251	    swapname, pages, 0, 0, 0);
1252	/*
1253	 * Size of swapslots map in bytes
1254	 */
1255	nsip->si_mapsize = P2ROUNDUP(pages, NBBW) / NBBY;
1256	nsip->si_swapslots = kmem_zalloc(nsip->si_mapsize, KM_SLEEP);
1257
1258	/*
1259	 * Permanently set the bits that can't ever be allocated,
1260	 * i.e. those from the ending offset to the round up slot for the
1261	 * swapslots bit map.
1262	 */
1263	start = pages;
1264	end = P2ROUNDUP(pages, NBBW);
1265	for (i = start; i < end; i++) {
1266		SWAP_PRINT(SW_CTL, "swapadd: set bit for page %ld\n", i,
1267		    0, 0, 0, 0);
1268		SETBIT(nsip->si_swapslots, i);
1269	}
1270	nsip->si_npgs = nsip->si_nfpgs = pages;
1271	/*
1272	 * Now check to see if we can add it. We wait til now to check because
1273	 * we need the swapinfo_lock and we don't want sleep with it (e.g.,
1274	 * during kmem_alloc()) while we're setting up the swapinfo.
1275	 */
1276	mutex_enter(&swapinfo_lock);
1277	for (sipp = &swapinfo; (esip = *sipp) != NULL; sipp = &esip->si_next) {
1278		if (esip->si_vp == cvp) {
1279			if (esip->si_soff == soff && esip->si_npgs == pages &&
1280			    (esip->si_flags & ST_DOINGDEL)) {
1281				/*
1282				 * We are adding a device that we are in the
1283				 * middle of deleting. Just clear the
1284				 * ST_DOINGDEL flag to signal this and
1285				 * the deletion routine will eventually notice
1286				 * it and add it back.
1287				 */
1288				esip->si_flags &= ~ST_DOINGDEL;
1289				mutex_exit(&swapinfo_lock);
1290				goto out;
1291			}
1292			/* disallow overlapping swap files */
1293			if ((soff < esip->si_eoff) && (eoff > esip->si_soff)) {
1294				error = EEXIST;
1295				mutex_exit(&swapinfo_lock);
1296				goto out;
1297			}
1298		}
1299	}
1300
1301	nswapfiles++;
1302
1303	/*
1304	 * add new swap device to list and shift allocations to it
1305	 * before updating the anoninfo counters
1306	 */
1307	*sipp = nsip;
1308	silast = nsip;
1309
1310	/*
1311	 * Update the total amount of reservable swap space
1312	 * accounting properly for swap space from physical memory
1313	 */
1314	/* New swap device soaks up currently reserved memory swap */
1315	mutex_enter(&anoninfo_lock);
1316
1317	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
1318	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
1319
1320	k_anoninfo.ani_max += pages;
1321	ANI_ADD(pages);
1322	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
1323		returned_mem = MIN(k_anoninfo.ani_mem_resv -
1324		    k_anoninfo.ani_locked_swap,
1325		    k_anoninfo.ani_max - k_anoninfo.ani_phys_resv);
1326
1327		ANI_ADD(-returned_mem);
1328		k_anoninfo.ani_free -= returned_mem;
1329		k_anoninfo.ani_mem_resv -= returned_mem;
1330		k_anoninfo.ani_phys_resv += returned_mem;
1331
1332		mutex_enter(&freemem_lock);
1333		availrmem += returned_mem;
1334		mutex_exit(&freemem_lock);
1335	}
1336	/*
1337	 * At boot time, to permit booting small memory machines using
1338	 * only physical memory as swap space, we allowed a dangerously
1339	 * large amount of memory to be used as swap space; now that
1340	 * more physical backing store is available bump down the amount
1341	 * we can get from memory to a safer size.
1342	 */
1343	if (swapfs_minfree < swapfs_desfree) {
1344		mutex_enter(&freemem_lock);
1345		if (availrmem > swapfs_desfree || !k_anoninfo.ani_mem_resv)
1346			swapfs_minfree = swapfs_desfree;
1347		mutex_exit(&freemem_lock);
1348	}
1349
1350	SWAP_PRINT(SW_CTL, "swapadd: ani_max %ld ani_free %ld\n",
1351	    k_anoninfo.ani_free, k_anoninfo.ani_free, 0, 0, 0);
1352
1353	mutex_exit(&anoninfo_lock);
1354
1355	mutex_exit(&swapinfo_lock);
1356
1357	/* Initialize the dump device */
1358	mutex_enter(&dump_lock);
1359	if (dumpvp == NULL)
1360		(void) dumpinit(vp, swapname, 0);
1361	mutex_exit(&dump_lock);
1362
1363	VN_HOLD(cvp);
1364out:
1365	if (error || esip) {
1366		SWAP_PRINT(SW_CTL, "swapadd: error (%d)\n", error, 0, 0, 0, 0);
1367
1368		if (!wasswap) {
1369			mutex_enter(&cvp->v_lock);
1370			cvp->v_flag &= ~VISSWAP;
1371			mutex_exit(&cvp->v_lock);
1372		}
1373		if (nsip) {
1374			kmem_free(nsip->si_swapslots, (size_t)nsip->si_mapsize);
1375			kmem_free(nsip->si_pname, nsip->si_pnamelen);
1376			kmem_free(nsip, sizeof (*nsip));
1377		}
1378		mutex_enter(&swap_lock);
1379		(void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(),
1380		    NULL);
1381		mutex_exit(&swap_lock);
1382	}
1383	return (error);
1384}
1385
/*
 * Delete a swap file.
 *
 * Finds the swapinfo entry for common_specvp(vp) whose starting offset
 * corresponds to lowblk (0 is treated as block 1; the block number is
 * converted to bytes and rounded up to a page boundary) and whose si_flags
 * is 0, then moves every anon slot off the device and unlinks the entry.
 *
 * Returns 0 on success, and also 0 when the deletion is abandoned because
 * ST_DOINGDEL was cleared on the entry while slots were being freed (the
 * device is then made available again).  Returns EINVAL if no matching
 * entry exists, ENOMEM if removing the device would leave too few pages,
 * or the error from swapslot_free(), in which case the device is likewise
 * put back.
 */
static int
swapdel(
	struct vnode *vp,
	ulong_t lowblk) /* Low block number of area to delete. */
{
	struct swapinfo **sipp, *osip = NULL;
	struct vnode *cvp;
	u_offset_t soff;
	int error = 0;
	u_offset_t toff = 0;
	struct vnode *tvp = NULL;
	spgcnt_t pages;
	struct anon **app, *ap;
	kmutex_t *ahm;
	pgcnt_t adjust_swap = 0;

	/* Find the swap file entry for the file to be deleted */
	cvp = common_specvp(vp);


	lowblk = lowblk ? lowblk : 1;	/* Skip first page (disk label) */
	soff = ptob(btopr(lowblk << SCTRSHFT)); /* must be page aligned */

	mutex_enter(&swapinfo_lock);
	/*
	 * Only entries with no flags set qualify, so an area that is already
	 * being deleted is not matched a second time.
	 */
	for (sipp = &swapinfo; (osip = *sipp) != NULL; sipp = &osip->si_next) {
		if ((osip->si_vp == cvp) &&
		    (osip->si_soff == soff) && (osip->si_flags == 0))
			break;
	}

	/* If the file was not found, error.  */
	if (osip == NULL) {
		error = EINVAL;
		mutex_exit(&swapinfo_lock);
		goto out;
	}

	pages = osip->si_npgs;

	/*
	 * Do not delete if we will be low on swap pages.
	 */
	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	/*
	 * The unreserved physical swap plus whatever memory is available
	 * above swapfs_minfree must be able to cover the pages going away.
	 */
	mutex_enter(&freemem_lock);
	if (((k_anoninfo.ani_max - k_anoninfo.ani_phys_resv) +
	    MAX((spgcnt_t)(availrmem - swapfs_minfree), 0)) < pages) {
		mutex_exit(&freemem_lock);
		mutex_exit(&anoninfo_lock);
		error = ENOMEM;
		cmn_err(CE_WARN, "swapdel - too few free pages");
		mutex_exit(&swapinfo_lock);
		goto out;
	}
	mutex_exit(&freemem_lock);

	k_anoninfo.ani_max -= pages;

	/* If needed, reserve memory swap to replace old device */
	if (k_anoninfo.ani_phys_resv > k_anoninfo.ani_max) {
		/*
		 * adjust_swap is remembered so the failure path below can
		 * reverse these three counter changes.
		 */
		adjust_swap = k_anoninfo.ani_phys_resv - k_anoninfo.ani_max;
		k_anoninfo.ani_phys_resv -= adjust_swap;
		k_anoninfo.ani_mem_resv += adjust_swap;
		mutex_enter(&freemem_lock);
		availrmem -= adjust_swap;
		mutex_exit(&freemem_lock);
		ANI_ADD(adjust_swap);
	}
	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
	mutex_exit(&anoninfo_lock);

	ANI_ADD(-pages);

	/*
	 * Set the delete flag.  This prevents anyone from allocating more
	 * pages from this file. Also set ST_DOINGDEL. Someone who wants to
	 * add the file back while we're deleting it will signify by clearing
	 * this flag.
	 */
	osip->si_flags |= ST_INDEL|ST_DOINGDEL;
	mutex_exit(&swapinfo_lock);

	/*
	 * Free all the allocated physical slots for this file. We do this
	 * by walking through the entire anon hash array, because we need
	 * to update all the anon slots that have physical swap slots on
	 * this file, and this is the only way to find them all. We go back
	 * to the beginning of a bucket after each slot is freed because the
	 * anonhash_lock is not held during the free and thus the hash table
	 * may change under us.
	 */
	for (app = anon_hash; app < &anon_hash[ANON_HASH_SIZE]; app++) {
		ahm = &anonhash_lock[(app - anon_hash) &
		    (AH_LOCK_SIZE - 1)].pad_mutex;
		mutex_enter(ahm);
top:
		for (ap = *app; ap != NULL; ap = ap->an_hash) {
			if (ap->an_pvp == cvp &&
			    ap->an_poff >= osip->si_soff &&
			    ap->an_poff < osip->si_eoff) {
				ASSERT(TESTBIT(osip->si_swapslots,
				    btop((size_t)(ap->an_poff -
				    osip->si_soff))));
				/*
				 * Capture the page name and hold its vnode
				 * before dropping the hash lock; ap itself is
				 * not used again once the lock is released.
				 */
				tvp = ap->an_vp;
				toff = ap->an_off;
				VN_HOLD(tvp);
				mutex_exit(ahm);

				error = swapslot_free(tvp, toff, osip);

				VN_RELE(tvp);
				mutex_enter(ahm);
				if (!error && (osip->si_flags & ST_DOINGDEL)) {
					goto top;
				} else {
					/*
					 * Either the free failed or somebody
					 * cleared ST_DOINGDEL to re-add the
					 * device: abandon the delete and undo
					 * the accounting done above.
					 */
					if (error) {
						cmn_err(CE_WARN,
						    "swapslot_free failed %d",
						    error);
					}

					/*
					 * Add device back before making it
					 * visible.
					 */
					mutex_enter(&swapinfo_lock);
					osip->si_flags &=
					    ~(ST_INDEL | ST_DOINGDEL);
					mutex_exit(&swapinfo_lock);

					/*
					 * Update the anon space available
					 */
					mutex_enter(&anoninfo_lock);

					k_anoninfo.ani_phys_resv += adjust_swap;
					k_anoninfo.ani_mem_resv -= adjust_swap;
					k_anoninfo.ani_max += pages;

					mutex_enter(&freemem_lock);
					availrmem += adjust_swap;
					mutex_exit(&freemem_lock);

					mutex_exit(&anoninfo_lock);

					/*
					 * NOTE(review): the earlier
					 * ANI_ADD(adjust_swap) has no matching
					 * ANI_ADD(-adjust_swap) on this path;
					 * confirm whether that is intended.
					 */
					ANI_ADD(pages);

					mutex_exit(ahm);
					goto out;
				}
			}
		}
		mutex_exit(ahm);
	}

	/* All done, they'd better all be free! */
	mutex_enter(&swapinfo_lock);
	ASSERT(osip->si_nfpgs == osip->si_npgs);

	/* Now remove it from the swapinfo list */
	for (sipp = &swapinfo; *sipp != NULL; sipp = &(*sipp)->si_next) {
		if (*sipp == osip)
			break;
	}
	ASSERT(*sipp);
	*sipp = osip->si_next;
	/*
	 * If silast pointed at the entry being removed, advance it to the
	 * following entry, wrapping to the head of the list at the end.
	 */
	if (silast == osip)
		if ((silast = osip->si_next) == NULL)
			silast = swapinfo;
	nswapfiles--;
	mutex_exit(&swapinfo_lock);

	kmem_free(osip->si_swapslots, osip->si_mapsize);
	kmem_free(osip->si_pname, osip->si_pnamelen);
	kmem_free(osip, sizeof (*osip));

	/* Tear down the dump device if it was this vnode. */
	mutex_enter(&dump_lock);
	if (cvp == dumpvp)
		dumpfini();
	mutex_exit(&dump_lock);

	/* Release the vnode */

	mutex_enter(&swap_lock);
	(void) VOP_CLOSE(cvp, FREAD|FWRITE, 1, (offset_t)0, CRED(), NULL);
	mutex_enter(&cvp->v_lock);
	cvp->v_flag &= ~VISSWAP;
	mutex_exit(&cvp->v_lock);
	/* Drops the hold swapadd() took when the device was added. */
	VN_RELE(cvp);
	mutex_exit(&swap_lock);
out:
	return (error);
}
1586
/*
 * Free up a physical swap slot on swapinfo sip, currently in use by the
 * anonymous page whose name is (vp, off).
 *
 * If the page is not already in memory it is created and read in from its
 * physical slot first.  The slot is then released and the page marked
 * modified, provided the anon slot still refers to a location inside sip.
 *
 * Returns 0 when the slot was freed and also when there turned out to be
 * nothing to free (the anon slot is gone or no longer backed by sip).
 * The only non-zero return is an error from VOP_PAGEIO other than EFAULT.
 */
static int
swapslot_free(
	struct vnode *vp,
	u_offset_t off,
	struct swapinfo *sip)
{
	struct page *pp = NULL;
	struct anon *ap = NULL;
	int error = 0;
	kmutex_t *ahm;
	struct vnode *pvp = NULL;
	u_offset_t poff;
	int	alloc_pg = 0;	/* set when the page was created here */

	ASSERT(sip->si_vp != NULL);
	/*
	 * Get the page for the old swap slot if exists or create a new one.
	 */
again:
	if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
		pp = page_create_va(vp, off, PAGESIZE, PG_WAIT | PG_EXCL,
		    segkmap, NULL);
		/* Creation failed; start over with a fresh lookup. */
		if (pp == NULL)
			goto again;
		alloc_pg = 1;

		/*
		 * Re-read the backing location now that we hold the page.
		 * If the anon slot is gone or no longer lives on sip there is
		 * nothing to free, so discard the new page and report
		 * success.
		 * NOTE(review): the page is io-unlocked here without a
		 * page_io_lock() in this function; presumably
		 * page_create_va() returns it io-locked -- confirm.
		 */
		error = swap_getphysname(vp, off, &pvp, &poff);
		if (error || pvp != sip->si_vp || poff < sip->si_soff ||
		    poff >= sip->si_eoff) {
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
			return (0);
		}

		/* Fill the new page from its physical swap slot. */
		error = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
		    CRED(), NULL);
		if (error) {
			page_io_unlock(pp);
			/* EFAULT from the read is reported as success. */
			if (error == EFAULT)
				error = 0;
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
			return (error);
		}
	}

	/*
	 * The anon could have been removed by anon_decref* and/or reallocated
	 * by anon layer (an_pvp == NULL) with the same vp, off.
	 * In this case the page which has been allocated needs to
	 * be freed.
	 */
	/* A page found by page_lookup() still needs its io lock taken. */
	if (!alloc_pg)
		page_io_lock(pp);
	ahm = AH_MUTEX(vp, off);
	mutex_enter(ahm);
	ap = swap_anon(vp, off);
	if ((ap == NULL || ap->an_pvp == NULL) && alloc_pg) {
		mutex_exit(ahm);
		page_io_unlock(pp);
		/*LINTED: constant in conditional context*/
		VN_DISPOSE(pp, B_INVAL, 0, kcred);
		return (0);
	}

	/*
	 * Free the physical slot. It may have been freed up and replaced with
	 * another one while we were getting the page so we have to re-verify
	 * that this is really one we want. If we do free the slot we have
	 * to mark the page modified, as its backing store is now gone.
	 */
	if ((ap != NULL) && (ap->an_pvp == sip->si_vp && ap->an_poff >=
	    sip->si_soff && ap->an_poff < sip->si_eoff)) {
		swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
		ap->an_pvp = NULL;
		ap->an_poff = 0;
		mutex_exit(ahm);
		hat_setmod(pp);
	} else {
		mutex_exit(ahm);
	}
	page_io_unlock(pp);
	page_unlock(pp);
	return (0);
}
1677
1678
/*
 * Get contig physical backing store for vp, in the range
 * [*offp, *offp + *lenp), May back a subrange of this, but must
 * always include the requested offset or fail. Returns the offsets
 * backed as [*offp, *offp + *lenp) and the physical offsets used to
 * back them from *pvpp in the range [*pstartp, *pstartp + *lenp).
 *
 *	vp, offset	name of the anonymous page that must be backed
 *	offp, lenp	in: candidate range; out: range actually backed
 *	pvpp, poffp	out: backing vnode and its starting physical offset
 *
 * When no new slots can be allocated and the requested page has no anon
 * slot, SE_NOANON is returned and the output arguments are left untouched.
 *
 * Returns	0 for success
 *		SE_NOANON -- no anon slot for requested page
 *		SE_NOSWAP -- no physical swap space available
 */
int
swap_newphysname(
	struct vnode *vp,
	u_offset_t offset,
	u_offset_t *offp,
	size_t *lenp,
	struct vnode **pvpp,
	u_offset_t *poffp)
{
	struct anon *ap = NULL;		/* anon slot for vp, off */
	int error = 0;
	struct vnode *pvp;
	u_offset_t poff, pstart, prem;
	size_t plen;
	u_offset_t off, start;
	kmutex_t *ahm;

	ASSERT(*offp <= offset && offset < *offp + *lenp);

	/* Get new physical swap slots. */
	plen = *lenp;
	if (!swap_phys_alloc(&pvp, &pstart, &plen, 0)) {
		/*
		 * No swap available so return error unless requested
		 * offset is already backed in which case return that.
		 */
		ahm = AH_MUTEX(vp, offset);
		mutex_enter(ahm);
		if ((ap = swap_anon(vp, offset)) == NULL) {
			error = SE_NOANON;
			mutex_exit(ahm);
			return (error);
		}
		/* Report the single page and whatever slot it has, if any. */
		error = (ap->an_pvp ? 0 : SE_NOSWAP);
		*offp = offset;
		*lenp = PAGESIZE;
		*pvpp = ap->an_pvp;
		*poffp = ap->an_poff;
		mutex_exit(ahm);
		return (error);
	}

	/*
	 * We got plen (<= *lenp) contig slots. Use these to back a
	 * subrange of [*offp, *offp + *lenp) which includes offset.
	 * For now we just put offset at the end of the kluster.
	 * Clearly there are other possible choices - which is best?
	 */
	/*
	 * The conditional guards the unsigned subtraction: when the kluster
	 * is longer than offset + PAGESIZE the candidate start is 0, and
	 * MAX() then clamps the result to the caller's lower bound.
	 */
	start = MAX(*offp,
	    (offset + PAGESIZE > plen) ? (offset + PAGESIZE - plen) : 0);
	ASSERT(start + plen <= *offp + *lenp);

	/* Walk logical and physical offsets in step, one page at a time. */
	for (off = start, poff = pstart; poff < pstart + plen;
	    off += PAGESIZE, poff += PAGESIZE) {
		ahm = AH_MUTEX(vp, off);
		mutex_enter(ahm);
		if ((ap = swap_anon(vp, off)) != NULL) {
			/* Free old slot if any, and assign new one */
			if (ap->an_pvp)
				swap_phys_free(ap->an_pvp, ap->an_poff,
				    PAGESIZE);
			ap->an_pvp = pvp;
			ap->an_poff = poff;
		} else {	/* No anon slot for a klustered page, quit. */
			/* Bytes of the allocation not yet handed out. */
			prem = (pstart + plen) - poff;
			/* Already did requested page, do partial kluster */
			if (off > offset) {
				plen = poff - pstart;
				error = 0;
			/* Fail on requested page, error */
			} else if (off == offset)  {
				error = SE_NOANON;
			/* Fail on prior page, fail on requested page, error */
			/*
			 * NOTE(review): this lookup for (vp, offset) runs
			 * under the mutex chosen for (vp, off); confirm that
			 * is sufficient for the requested page's slot.
			 */
			} else if ((ap = swap_anon(vp, offset)) == NULL) {
				error = SE_NOANON;
			/* Fail on prior page, got requested page, do only it */
			} else {
				/* Free old slot if any, and assign new one */
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				ap->an_pvp = pvp;
				ap->an_poff = poff;
				/* One page kluster */
				start = offset;
				plen = PAGESIZE;
				pstart = poff;
				/* Step past the slot just consumed. */
				poff += PAGESIZE;
				prem -= PAGESIZE;
			}
			/* Free unassigned slots */
			swap_phys_free(pvp, poff, prem);
			mutex_exit(ahm);
			break;
		}
		mutex_exit(ahm);
	}
	ASSERT(*offp <= start && start + plen <= *offp + *lenp);
	ASSERT(start <= offset && offset < start + plen);
	*offp = start;
	*lenp = plen;
	*pvpp = pvp;
	*poffp = pstart;
	return (error);
}
1794
1795
1796/*
1797 * Get the physical swap backing store location for a given anonymous page
1798 * named (vp, off). The backing store name is returned in (*pvpp, *poffp).
1799 * Returns	0		success
1800 *		EIDRM --	no anon slot (page is not allocated)
1801 */
1802int
1803swap_getphysname(
1804	struct vnode *vp,
1805	u_offset_t off,
1806	struct vnode **pvpp,
1807	u_offset_t *poffp)
1808{
1809	struct anon *ap;
1810	int error = 0;
1811	kmutex_t *ahm;
1812
1813	ahm = AH_MUTEX(vp, off);
1814	mutex_enter(ahm);
1815
1816	/* Get anon slot for vp, off */
1817	ap = swap_anon(vp, off);
1818	if (ap == NULL) {
1819		error = EIDRM;
1820		goto out;
1821	}
1822	*pvpp = ap->an_pvp;
1823	*poffp = ap->an_poff;
1824out:
1825	mutex_exit(ahm);
1826	return (error);
1827}
1828