1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/errno.h>
30 #include <sys/kmem.h>
31 #include <sys/vnode.h>
32 #include <sys/vfs_opreg.h>
33 #include <sys/swap.h>
34 #include <sys/sysmacros.h>
35 #include <sys/buf.h>
36 #include <sys/callb.h>
37 #include <sys/debug.h>
38 #include <vm/seg.h>
39 #include <sys/fs/swapnode.h>
40 #include <fs/fs_subr.h>
41 #include <sys/cmn_err.h>
42 #include <sys/mem_config.h>
43 #include <sys/atomic.h>
44 
45 extern const fs_operation_def_t swap_vnodeops_template[];
46 
47 /*
48  * swapfs_minfree is the amount of physical memory (actually remaining
49  * availrmem) that we want to keep free for the rest of the system.  This
50  * means that swapfs can only grow to availrmem - swapfs_minfree.  This
51  * can be set as just constant value or a certain percentage of installed
52  * physical memory. It is set in swapinit().
53  *
54  * Users who want to change the amount of memory that can be used as swap
55  * space should do so by setting swapfs_desfree at boot time,
56  * not swapfs_minfree.
57  */
58 
59 pgcnt_t swapfs_desfree = 0;
60 pgcnt_t swapfs_minfree = 0;
61 pgcnt_t swapfs_reserve = 0;
62 
63 #ifdef SWAPFS_DEBUG
64 int swapfs_debug;
65 #endif /* SWAPFS_DEBUG */
66 
67 
68 static int swapfs_vpcount;
69 static kmutex_t swapfs_lock;
70 static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist;
71 
72 static struct vnode **swap_vnodes;	/* ptr's to swap vnodes */
73 
74 static void swap_init_mem_config(void);
75 
76 static pgcnt_t initial_swapfs_desfree;
77 static pgcnt_t initial_swapfs_minfree;
78 static pgcnt_t initial_swapfs_reserve;
79 
80 static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr);
81 
82 static void
83 swapfs_recalc_save_initial(void)
84 {
85 	initial_swapfs_desfree = swapfs_desfree;
86 	initial_swapfs_minfree = swapfs_minfree;
87 	initial_swapfs_reserve = swapfs_reserve;
88 }
89 
90 static int
91 swapfs_recalc(pgcnt_t pgs)
92 {
93 	pgcnt_t new_swapfs_desfree;
94 	pgcnt_t new_swapfs_minfree;
95 	pgcnt_t new_swapfs_reserve;
96 
97 	new_swapfs_desfree = initial_swapfs_desfree;
98 	new_swapfs_minfree = initial_swapfs_minfree;
99 	new_swapfs_reserve = initial_swapfs_reserve;
100 
101 	if (new_swapfs_desfree == 0)
102 		new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */;
103 
104 	if (new_swapfs_minfree == 0) {
105 		/*
106 		 * We set this lower than we'd like here, 2Mb, because we
107 		 * always boot on swapfs. It's up to a safer value,
108 		 * swapfs_desfree, when/if we add physical swap devices
109 		 * in swapadd(). Users who want to change the amount of
110 		 * memory that can be used as swap space should do so by
111 		 * setting swapfs_desfree at boot time, not swapfs_minfree.
112 		 * However, swapfs_minfree is tunable by install as a
113 		 * workaround for bugid 1147463.
114 		 */
115 		new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
116 	}
117 
118 	/*
119 	 * priv processes can reserve memory as swap as long as availrmem
120 	 * remains greater than swapfs_minfree; in the case of non-priv
121 	 * processes, memory can be reserved as swap only if availrmem
122 	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
123 	 * swapfs_reserve amount of memswap is not available to non-priv
124 	 * processes. This protects daemons such as automounter dying
125 	 * as a result of application processes eating away almost entire
126 	 * membased swap. This safeguard becomes useless if apps are run
127 	 * with root access.
128 	 *
129 	 * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever
130 	 * is greater up to the limit of 128 MB.
131 	 */
132 	if (new_swapfs_reserve == 0)
133 		new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024),
134 		    MAX(btopr(4 * 1024 * 1024), pgs >> 7));
135 
136 	/* Test basic numeric viability. */
137 	if (new_swapfs_minfree > pgs)
138 		return (0);
139 
140 	/* Equivalent test to anon_resvmem() check. */
141 	if (availrmem < new_swapfs_minfree) {
142 		/*
143 		 * If ism pages are being used, then there must be agreement
144 		 * between these two policies.
145 		 */
146 		if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) {
147 			new_swapfs_minfree = segspt_minfree;
148 		} else {
149 			return (0);
150 		}
151 	}
152 
153 	swapfs_desfree = new_swapfs_desfree;
154 	swapfs_minfree = new_swapfs_minfree;
155 	swapfs_reserve = new_swapfs_reserve;
156 
157 	return (1);
158 }
159 
160 /*ARGSUSED1*/
161 int
162 swapinit(int fstype, char *name)
163 {							/* reserve for mp */
164 	ssize_t sw_freelist_size = klustsize / PAGESIZE * 2;
165 	int i, error;
166 
167 	static const fs_operation_def_t swap_vfsops[] = {
168 		VFSNAME_SYNC, { .vfs_sync = swap_sync },
169 		NULL, NULL
170 	};
171 
172 	SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0);
173 	mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL);
174 
175 	swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *),
176 	    KM_SLEEP);
177 
178 	swapfs_recalc_save_initial();
179 	if (!swapfs_recalc(physmem))
180 		cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)",
181 		    swapfs_minfree, physmem);
182 
183 	/*
184 	 * Arrange for a callback on memory size change.
185 	 */
186 	swap_init_mem_config();
187 
188 	sw_ar = (struct async_reqs *)
189 	    kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP);
190 
191 	error = vfs_setfsops(fstype, swap_vfsops, NULL);
192 	if (error != 0) {
193 		cmn_err(CE_WARN, "swapinit: bad vfs ops template");
194 		return (error);
195 	}
196 
197 	error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops);
198 	if (error != 0) {
199 		(void) vfs_freevfsops_by_type(fstype);
200 		cmn_err(CE_WARN, "swapinit: bad vnode ops template");
201 		return (error);
202 	}
203 	sw_freelist = sw_ar;
204 	for (i = 0; i < sw_freelist_size - 1; i++)
205 		sw_ar[i].a_next = &sw_ar[i + 1];
206 
207 	return (0);
208 }
209 
210 /*
211  * Get a swapfs vnode corresponding to the specified identifier.
212  */
213 struct vnode *
214 swapfs_getvp(ulong_t vidx)
215 {
216 	struct vnode *vp;
217 
218 	vp = swap_vnodes[vidx];
219 	if (vp) {
220 		return (vp);
221 	}
222 
223 	mutex_enter(&swapfs_lock);
224 	vp = swap_vnodes[vidx];
225 	if (vp == NULL) {
226 		vp = vn_alloc(KM_SLEEP);
227 		vn_setops(vp, swap_vnodeops);
228 		vp->v_type = VREG;
229 		vp->v_flag |= (VISSWAP|VISSWAPFS);
230 		swap_vnodes[vidx] = vp;
231 		swapfs_vpcount++;
232 	}
233 	mutex_exit(&swapfs_lock);
234 	return (vp);
235 }
236 
/*
 * NOTE(review): not referenced anywhere in this file; presumably read or
 * set by other swapfs code - confirm before removing.
 */
int swap_lo;
238 
239 /*ARGSUSED*/
240 static int
241 swap_sync(struct vfs *vfsp, short flag, struct cred *cr)
242 {
243 	struct vnode *vp;
244 	int i;
245 
246 	if (!(flag & SYNC_ALL))
247 		return (1);
248 
249 	/*
250 	 * assumes that we are the only one left to access this so that
251 	 * no need to use swapfs_lock (since it's staticly defined)
252 	 */
253 	for (i = 0; i < MAX_SWAP_VNODES; i++) {
254 		vp = swap_vnodes[i];
255 		if (vp) {
256 			VN_HOLD(vp);
257 			(void) VOP_PUTPAGE(vp, (offset_t)0, 0,
258 			    (B_ASYNC | B_FREE), kcred, NULL);
259 			VN_RELE(vp);
260 		}
261 	}
262 	return (0);
263 }
264 
265 extern int sw_pending_size;
266 
267 /*
268  * Take an async request off the pending queue
269  */
270 struct async_reqs *
271 sw_getreq()
272 {
273 	struct async_reqs *arg;
274 
275 	mutex_enter(&swapfs_lock);
276 	arg = sw_pendlist;
277 	if (arg) {
278 		sw_pendlist = arg->a_next;
279 		arg->a_next = NULL;
280 		sw_pending_size -= PAGESIZE;
281 	}
282 	ASSERT(sw_pending_size >= 0);
283 	mutex_exit(&swapfs_lock);
284 	return (arg);
285 }
286 
287 /*
288  * Put an async request on the pending queue
289  */
290 void
291 sw_putreq(struct async_reqs *arg)
292 {
293 	/* Hold onto it */
294 	VN_HOLD(arg->a_vp);
295 
296 	mutex_enter(&swapfs_lock);
297 	arg->a_next = sw_pendlist;
298 	sw_pendlist = arg;
299 	sw_pending_size += PAGESIZE;
300 	mutex_exit(&swapfs_lock);
301 }
302 
303 /*
304  * Put an async request back on the pending queue
305  */
306 void
307 sw_putbackreq(struct async_reqs *arg)
308 {
309 	mutex_enter(&swapfs_lock);
310 	arg->a_next = sw_pendlist;
311 	sw_pendlist = arg;
312 	sw_pending_size += PAGESIZE;
313 	mutex_exit(&swapfs_lock);
314 }
315 
316 /*
317  * Take an async request structure off the free list
318  */
319 struct async_reqs *
320 sw_getfree()
321 {
322 	struct async_reqs *arg;
323 
324 	mutex_enter(&swapfs_lock);
325 	arg = sw_freelist;
326 	if (arg) {
327 		sw_freelist = arg->a_next;
328 		arg->a_next = NULL;
329 	}
330 	mutex_exit(&swapfs_lock);
331 	return (arg);
332 }
333 
334 /*
335  * Put an async request structure on the free list
336  */
337 void
338 sw_putfree(struct async_reqs *arg)
339 {
340 	/* Release our hold - should have locked the page by now */
341 	VN_RELE(arg->a_vp);
342 
343 	mutex_enter(&swapfs_lock);
344 	arg->a_next = sw_freelist;
345 	sw_freelist = arg;
346 	mutex_exit(&swapfs_lock);
347 }
348 
349 static pgcnt_t swapfs_pending_delete;
350 
351 /*ARGSUSED*/
352 static void
353 swap_mem_config_post_add(
354 	void *arg,
355 	pgcnt_t delta_swaps)
356 {
357 	(void) swapfs_recalc(physmem - swapfs_pending_delete);
358 }
359 
360 /*ARGSUSED*/
361 static int
362 swap_mem_config_pre_del(
363 	void *arg,
364 	pgcnt_t delta_swaps)
365 {
366 	pgcnt_t nv;
367 
368 	nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps);
369 	if (!swapfs_recalc(physmem - nv)) {
370 		/*
371 		 * Tidy-up is done by the call to post_del which
372 		 * is always made.
373 		 */
374 		cmn_err(CE_NOTE, "Memory operation refused to ensure system "
375 		    "doesn't deadlock due to excessive consumption by swapfs.");
376 		return (EBUSY);
377 	}
378 	return (0);
379 }
380 
381 /*ARGSUSED*/
382 static void
383 swap_mem_config_post_del(
384 	void *arg,
385 	pgcnt_t delta_swaps,
386 	int cancelled)
387 {
388 	pgcnt_t nv;
389 
390 	nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps);
391 	(void) swapfs_recalc(physmem - nv);
392 }
393 
394 static kphysm_setup_vector_t swap_mem_config_vec = {
395 	KPHYSM_SETUP_VECTOR_VERSION,
396 	swap_mem_config_post_add,
397 	swap_mem_config_pre_del,
398 	swap_mem_config_post_del,
399 };
400 
401 static void
402 swap_init_mem_config(void)
403 {
404 	int ret;
405 
406 	ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL);
407 	ASSERT(ret == 0);
408 }
409