1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright 2024 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/errno.h>
34 #include <sys/kmem.h>
35 #include <sys/vnode.h>
36 #include <sys/vfs_opreg.h>
37 #include <sys/swap.h>
38 #include <sys/sysmacros.h>
39 #include <sys/buf.h>
40 #include <sys/callb.h>
41 #include <sys/debug.h>
42 #include <vm/seg.h>
43 #include <sys/fs/swapnode.h>
44 #include <fs/fs_subr.h>
45 #include <sys/cmn_err.h>
46 #include <sys/mem_config.h>
47 #include <sys/atomic.h>
48 
49 extern const fs_operation_def_t swap_vnodeops_template[];
50 
51 /*
52  * swapfs_minfree is the amount of physical memory (actually remaining
53  * availrmem) that we want to keep free for the rest of the system.  This
54  * means that swapfs can only grow to availrmem - swapfs_minfree.  This
55  * can be set as just constant value or a certain percentage of installed
56  * physical memory. It is set in swapinit().
57  *
 * Users who want to change the amount of memory that can be used as swap
 * space should do so by setting swapfs_minfree at boot time,
 * not swapfs_desfree.
61  */
62 
63 pgcnt_t swapfs_desfree = 0;
64 pgcnt_t swapfs_minfree = 0;
65 pgcnt_t swapfs_reserve = 0;
66 
67 #ifdef SWAPFS_DEBUG
68 int swapfs_debug;
69 #endif /* SWAPFS_DEBUG */
70 
71 
72 static int swapfs_vpcount;
73 static kmutex_t swapfs_lock;
74 static struct async_reqs *sw_ar, *sw_pendlist, *sw_freelist;
75 
76 static struct vnode **swap_vnodes;	/* ptr's to swap vnodes */
77 
78 static void swap_init_mem_config(void);
79 
80 static pgcnt_t initial_swapfs_desfree;
81 static pgcnt_t initial_swapfs_minfree;
82 static pgcnt_t initial_swapfs_reserve;
83 
84 static int swap_sync(struct vfs *vfsp, short flag, struct cred *cr);
85 
86 static void
swapfs_recalc_save_initial(void)87 swapfs_recalc_save_initial(void)
88 {
89 	initial_swapfs_desfree = swapfs_desfree;
90 	initial_swapfs_minfree = swapfs_minfree;
91 	initial_swapfs_reserve = swapfs_reserve;
92 }
93 
94 static int
swapfs_recalc(pgcnt_t pgs)95 swapfs_recalc(pgcnt_t pgs)
96 {
97 	pgcnt_t new_swapfs_desfree;
98 	pgcnt_t new_swapfs_minfree;
99 	pgcnt_t new_swapfs_reserve;
100 
101 	new_swapfs_desfree = initial_swapfs_desfree;
102 	new_swapfs_minfree = initial_swapfs_minfree;
103 	new_swapfs_reserve = initial_swapfs_reserve;
104 
105 	if (new_swapfs_desfree == 0)
106 		new_swapfs_desfree = btopr(7 * 512 * 1024); /* 3-1/2Mb */;
107 
108 	if (new_swapfs_minfree == 0) {
109 		/*
110 		 * Set swapfs_minfree to be an eighth of physical, but
111 		 * capped at 512 MiB.
112 		 */
113 		new_swapfs_minfree = MIN(btopr(512 * 1024 * 1024), pgs >> 3);
114 	}
115 
116 	/*
117 	 * priv processes can reserve memory as swap as long as availrmem
118 	 * remains greater than swapfs_minfree; in the case of non-priv
119 	 * processes, memory can be reserved as swap only if availrmem
120 	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
121 	 * swapfs_reserve amount of memswap is not available to non-priv
122 	 * processes. This protects daemons such as automounter dying
123 	 * as a result of application processes eating away almost entire
124 	 * membased swap. This safeguard becomes useless if apps are run
125 	 * with root access.
126 	 *
127 	 * set swapfs_reserve to a minimum of 4Mb or 1/128 of physmem whichever
128 	 * is greater up to the limit of 128 MB.
129 	 */
130 	if (new_swapfs_reserve == 0)
131 		new_swapfs_reserve = MIN(btopr(128 * 1024 * 1024),
132 		    MAX(btopr(4 * 1024 * 1024), pgs >> 7));
133 
134 	/* Test basic numeric viability. */
135 	if (new_swapfs_minfree > pgs)
136 		return (0);
137 
138 	/* Equivalent test to anon_resvmem() check. */
139 	if (availrmem < new_swapfs_minfree) {
140 		/*
141 		 * If ism pages are being used, then there must be agreement
142 		 * between these two policies.
143 		 */
144 		if ((availrmem > segspt_minfree) && (segspt_minfree > 0)) {
145 			new_swapfs_minfree = segspt_minfree;
146 		} else {
147 			return (0);
148 		}
149 	}
150 
151 	swapfs_desfree = new_swapfs_desfree;
152 	swapfs_minfree = new_swapfs_minfree;
153 	swapfs_reserve = new_swapfs_reserve;
154 
155 	return (1);
156 }
157 
/*
 * swapfs file system initialization entry point, invoked once by the VFS
 * framework at boot.  Sets up the swapfs lock, the lazily-populated vnode
 * table, the swapfs tunables, the memory-config (DR) callbacks, the VFS
 * and vnode operation vectors, and the async request free list.  Returns
 * 0 on success or an errno from the ops-template registration.
 */
/*ARGSUSED1*/
int
swapinit(int fstype, char *name)
{
	/* reserve for mp */
	ssize_t sw_freelist_size = klustsize / PAGESIZE * 2;
	int i, error;

	static const fs_operation_def_t swap_vfsops[] = {
		VFSNAME_SYNC, { .vfs_sync = swap_sync },
		NULL, NULL
	};

	SWAPFS_PRINT(SWAP_SUBR, "swapinit\n", 0, 0, 0, 0, 0);
	mutex_init(&swapfs_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Zeroed table of swap vnode pointers, filled by swapfs_getvp(). */
	swap_vnodes = kmem_zalloc(MAX_SWAP_VNODES * sizeof (struct vnode *),
	    KM_SLEEP);

	/*
	 * Derive swapfs_desfree/minfree/reserve from physmem; failure means
	 * even the minimum reservation exceeds physical memory, which is
	 * unsurvivable, hence the panic.
	 */
	swapfs_recalc_save_initial();
	if (!swapfs_recalc(physmem))
		cmn_err(CE_PANIC, "swapfs_minfree(%lu) > physmem(%lu)",
		    swapfs_minfree, physmem);

	/*
	 * Arrange for a callback on memory size change.
	 */
	swap_init_mem_config();

	sw_ar = (struct async_reqs *)
	    kmem_zalloc(sw_freelist_size*sizeof (struct async_reqs), KM_SLEEP);

	/*
	 * NOTE(review): on the error returns below, swap_vnodes and sw_ar
	 * are not freed and the mem-config callback remains registered.
	 * Presumably a failure here aborts boot anyway — confirm whether
	 * cleanup is expected by the caller.
	 */
	error = vfs_setfsops(fstype, swap_vfsops, NULL);
	if (error != 0) {
		cmn_err(CE_WARN, "swapinit: bad vfs ops template");
		return (error);
	}

	error = vn_make_ops(name, swap_vnodeops_template, &swap_vnodeops);
	if (error != 0) {
		(void) vfs_freevfsops_by_type(fstype);
		cmn_err(CE_WARN, "swapinit: bad vnode ops template");
		return (error);
	}

	/*
	 * Thread the zeroed request array into a singly-linked free list;
	 * the last element's a_next is already NULL from kmem_zalloc().
	 */
	sw_freelist = sw_ar;
	for (i = 0; i < sw_freelist_size - 1; i++)
		sw_ar[i].a_next = &sw_ar[i + 1];

	return (0);
}
208 
209 /*
210  * Get a swapfs vnode corresponding to the specified identifier.
211  */
212 struct vnode *
swapfs_getvp(ulong_t vidx)213 swapfs_getvp(ulong_t vidx)
214 {
215 	struct vnode *vp;
216 
217 	vp = swap_vnodes[vidx];
218 	if (vp) {
219 		return (vp);
220 	}
221 
222 	mutex_enter(&swapfs_lock);
223 	vp = swap_vnodes[vidx];
224 	if (vp == NULL) {
225 		vp = vn_alloc(KM_SLEEP);
226 		vn_setops(vp, swap_vnodeops);
227 		vp->v_type = VREG;
228 		vp->v_flag |= (VISSWAP|VISSWAPFS);
229 		swap_vnodes[vidx] = vp;
230 		swapfs_vpcount++;
231 	}
232 	mutex_exit(&swapfs_lock);
233 	return (vp);
234 }
235 
236 int swap_lo;
237 
238 /*ARGSUSED*/
239 static int
swap_sync(struct vfs * vfsp,short flag,struct cred * cr)240 swap_sync(struct vfs *vfsp, short flag, struct cred *cr)
241 {
242 	struct vnode *vp;
243 	int i;
244 
245 	if (!(flag & SYNC_ALL))
246 		return (1);
247 
248 	/*
249 	 * assumes that we are the only one left to access this so that
250 	 * no need to use swapfs_lock (since it's staticly defined)
251 	 */
252 	for (i = 0; i < MAX_SWAP_VNODES; i++) {
253 		vp = swap_vnodes[i];
254 		if (vp) {
255 			VN_HOLD(vp);
256 			(void) VOP_PUTPAGE(vp, (offset_t)0, 0,
257 			    (B_ASYNC | B_FREE), kcred, NULL);
258 			VN_RELE(vp);
259 		}
260 	}
261 	return (0);
262 }
263 
264 extern int sw_pending_size;
265 
266 /*
267  * Take an async request off the pending queue
268  */
269 struct async_reqs *
sw_getreq()270 sw_getreq()
271 {
272 	struct async_reqs *arg;
273 
274 	mutex_enter(&swapfs_lock);
275 	arg = sw_pendlist;
276 	if (arg) {
277 		sw_pendlist = arg->a_next;
278 		arg->a_next = NULL;
279 		sw_pending_size -= PAGESIZE;
280 	}
281 	ASSERT(sw_pending_size >= 0);
282 	mutex_exit(&swapfs_lock);
283 	return (arg);
284 }
285 
286 /*
287  * Put an async request on the pending queue
288  */
289 void
sw_putreq(struct async_reqs * arg)290 sw_putreq(struct async_reqs *arg)
291 {
292 	/* Hold onto it */
293 	VN_HOLD(arg->a_vp);
294 
295 	mutex_enter(&swapfs_lock);
296 	arg->a_next = sw_pendlist;
297 	sw_pendlist = arg;
298 	sw_pending_size += PAGESIZE;
299 	mutex_exit(&swapfs_lock);
300 }
301 
302 /*
303  * Put an async request back on the pending queue
304  */
305 void
sw_putbackreq(struct async_reqs * arg)306 sw_putbackreq(struct async_reqs *arg)
307 {
308 	mutex_enter(&swapfs_lock);
309 	arg->a_next = sw_pendlist;
310 	sw_pendlist = arg;
311 	sw_pending_size += PAGESIZE;
312 	mutex_exit(&swapfs_lock);
313 }
314 
315 /*
316  * Take an async request structure off the free list
317  */
318 struct async_reqs *
sw_getfree()319 sw_getfree()
320 {
321 	struct async_reqs *arg;
322 
323 	mutex_enter(&swapfs_lock);
324 	arg = sw_freelist;
325 	if (arg) {
326 		sw_freelist = arg->a_next;
327 		arg->a_next = NULL;
328 	}
329 	mutex_exit(&swapfs_lock);
330 	return (arg);
331 }
332 
333 /*
334  * Put an async request structure on the free list
335  */
336 void
sw_putfree(struct async_reqs * arg)337 sw_putfree(struct async_reqs *arg)
338 {
339 	/* Release our hold - should have locked the page by now */
340 	VN_RELE(arg->a_vp);
341 
342 	mutex_enter(&swapfs_lock);
343 	arg->a_next = sw_freelist;
344 	sw_freelist = arg;
345 	mutex_exit(&swapfs_lock);
346 }
347 
348 static pgcnt_t swapfs_pending_delete;
349 
350 /*ARGSUSED*/
351 static void
swap_mem_config_post_add(void * arg,pgcnt_t delta_swaps)352 swap_mem_config_post_add(
353 	void *arg,
354 	pgcnt_t delta_swaps)
355 {
356 	(void) swapfs_recalc(physmem - swapfs_pending_delete);
357 }
358 
359 /*ARGSUSED*/
360 static int
swap_mem_config_pre_del(void * arg,pgcnt_t delta_swaps)361 swap_mem_config_pre_del(
362 	void *arg,
363 	pgcnt_t delta_swaps)
364 {
365 	pgcnt_t nv;
366 
367 	nv = atomic_add_long_nv(&swapfs_pending_delete, (spgcnt_t)delta_swaps);
368 	if (!swapfs_recalc(physmem - nv)) {
369 		/*
370 		 * Tidy-up is done by the call to post_del which
371 		 * is always made.
372 		 */
373 		cmn_err(CE_NOTE, "Memory operation refused to ensure system "
374 		    "doesn't deadlock due to excessive consumption by swapfs.");
375 		return (EBUSY);
376 	}
377 	return (0);
378 }
379 
380 /*ARGSUSED*/
381 static void
swap_mem_config_post_del(void * arg,pgcnt_t delta_swaps,int cancelled)382 swap_mem_config_post_del(
383 	void *arg,
384 	pgcnt_t delta_swaps,
385 	int cancelled)
386 {
387 	pgcnt_t nv;
388 
389 	nv = atomic_add_long_nv(&swapfs_pending_delete, -(spgcnt_t)delta_swaps);
390 	(void) swapfs_recalc(physmem - nv);
391 }
392 
/* Callback vector handed to the kphysm (DR memory-config) framework. */
static kphysm_setup_vector_t swap_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	swap_mem_config_post_add,
	swap_mem_config_pre_del,
	swap_mem_config_post_del,
};
399 
400 static void
swap_init_mem_config(void)401 swap_init_mem_config(void)
402 {
403 	int ret;
404 
405 	ret = kphysm_setup_func_register(&swap_mem_config_vec, (void *)NULL);
406 	ASSERT(ret == 0);
407 }
408