/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2007 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/rds.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdsv3_impl.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

/*
 * This file implements the receive side of the unconventional congestion
 * management in RDS.
 *
 * Messages waiting in the receive queue on the receiving socket are accounted
 * against the socket's SO_RCVBUF option value.  Only the payload bytes in the
 * message are accounted for.  If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested.  All sends attempted to this socket's
 * address should then block or return -EWOULDBLOCK.
 *
 * Applications are expected to be reasonably tuned such that this situation
 * very rarely occurs.  An application encountering this "back-pressure" is
 * considered a bug.
 *
 * This is implemented by having each node maintain bitmaps which indicate
 * which ports on bound addresses are congested.  As the bitmap changes it is
 * sent through all the connections which terminate in the local address of the
 * bitmap which changed.
 *
 * The bitmaps are allocated as connections are brought up.  This avoids
 * allocation in the interrupt handling path which queues messages on sockets.
 * The dense bitmaps let transports send the entire bitmap on any bitmap change
 * reasonably efficiently.  This is much easier to implement than some
 * finer-grained communication of per-port congestion.  The sender does a very
 * inexpensive bit test to check whether the port it's about to send to is
 * congested or not.
 */
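
/*
 * For reference, the bitmap geometry works out as follows (numbers
 * here assume 4 KB pages): one bit per port covers all 65536 ports in
 * 65536 / 8 = 8192 bytes (RDSV3_CONG_MAP_BYTES), split across
 * RDSV3_CONG_MAP_PAGES pages of PAGE_SIZE bytes, each page holding
 * PAGE_SIZE * 8 = RDSV3_CONG_MAP_PAGE_BITS = 32768 port bits.
 */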

/*
 * Interaction with poll is a tad tricky. We want all processes stuck in
 * poll to wake up and check whether a congested destination became uncongested.
 * The really sad thing is we have no idea which destinations the application
 * wants to send to - we don't even know which rdsv3_connections are involved.
 * So until we implement a more flexible rds poll interface, we have to make
 * do with this:
 * We maintain a global counter that is incremented each time a congestion map
 * update is received. Each rds socket tracks this value, and if rdsv3_poll
 * finds that the saved generation number is smaller than the global generation
 * number, it wakes up the process.
 */
static atomic_t		rdsv3_cong_generation = ATOMIC_INIT(0);
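
/*
 * Hypothetical rdsv3_poll-side sketch (the actual wiring lives in the
 * socket code; the rs_cong_track field is an assumption here): each
 * socket remembers the generation it last saw and rechecks it on
 * every poll pass.
 *
 *	if (rdsv3_cong_updated_since(&rs->rs_cong_track))
 *		mask |= (POLLIN | POLLRDNORM);
 */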

/*
 * Congestion monitoring
 */
static struct list rdsv3_cong_monitor;
static krwlock_t rdsv3_cong_monitor_lock;

/*
 * Yes, a global lock.  It's used so infrequently that it's worth keeping it
 * global to simplify the locking.  It's only used in the following
 * circumstances:
 *
 *  - on connection buildup to associate a conn with its maps
 *  - on map changes to inform conns of a new map to send
 *
 * It's sadly ordered under the socket callback lock and the connection lock.
 * Receive paths can mark ports congested from interrupt context so the
 * lock masks interrupts.
 */
static kmutex_t rdsv3_cong_lock;
static struct avl_tree rdsv3_cong_tree;

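/*
 * Look up the congestion map for an address in the global tree.  If
 * "insert" is non-NULL and no map exists yet for its address, insert
 * it and return NULL; otherwise return the map already in the tree
 * (NULL if there is none).  The caller must hold rdsv3_cong_lock.
 */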
static struct rdsv3_cong_map *
rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
{
	struct rdsv3_cong_map *map;
	avl_index_t where;

	if (insert) {
		map = avl_find(&rdsv3_cong_tree, insert, &where);
		if (map == NULL) {
			avl_insert(&rdsv3_cong_tree, insert, where);
			return (NULL);
		}
	} else {
		struct rdsv3_cong_map map1;
		map1.m_addr = addr;
		map = avl_find(&rdsv3_cong_tree, &map1, &where);
	}

	return (map);
}

/*
 * There is only ever one bitmap for any address.  Connections try to
 * allocate these bitmaps, getting pointers to them in the process.  The
 * bitmaps are only ever freed as the module is removed after all
 * connections have been freed.
 */
static struct rdsv3_cong_map *
rdsv3_cong_from_addr(uint32_be_t addr)
{
	struct rdsv3_cong_map *map;
	struct rdsv3_cong_map *ret = NULL;
	unsigned long zp;
	unsigned long i;

	RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));

	map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
	if (map == NULL)
		return (NULL);

	map->m_addr = addr;
	rdsv3_init_waitqueue(&map->m_waitq);
	list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
	    offsetof(struct rdsv3_connection, c_map_item));

	for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
		zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
		if (zp == 0)
			goto out;
		map->m_page_addrs[i] = zp;
	}

	mutex_enter(&rdsv3_cong_lock);
	ret = rdsv3_cong_tree_walk(addr, map);
	mutex_exit(&rdsv3_cong_lock);

	/*
	 * A NULL return means our new map was inserted; adopt it.
	 * Otherwise another thread won the race and "map" is freed below.
	 */
	if (ret == NULL) {
		ret = map;
		map = NULL;
	}

out:
	if (map) {
		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
		    i++)
			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
		kmem_free(map, sizeof (*map));
	}

	RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
	    ret, ntohl(addr));

	return (ret);
}

/*
 * Put the conn on its local map's list.  This is called when the conn is
 * really added to the hash.  It's nested under the rdsv3_conn_lock, sadly.
 */
void
rdsv3_cong_add_conn(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);

	RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
	    conn, conn->c_lcong);
	mutex_enter(&rdsv3_cong_lock);
	list_insert_tail(&conn->c_lcong->m_conn_list, conn);
	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
}

void
rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
{
	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);

	RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
	    conn, conn->c_lcong);
	mutex_enter(&rdsv3_cong_lock);
	list_remove_node(&conn->c_map_item);
	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
}

int
rdsv3_cong_get_maps(struct rdsv3_connection *conn)
{
	conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
	conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);

	if (conn->c_lcong == NULL || conn->c_fcong == NULL)
		return (-ENOMEM);

	return (0);
}
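
/*
 * Hypothetical connection-setup sketch (the real call sits in the
 * connection establishment code): both maps must resolve before the
 * conn is usable.
 *
 *	if (rdsv3_cong_get_maps(conn))
 *		... tear the connection down, out of memory ...
 */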

void
rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
{
	struct rdsv3_connection *conn;

	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);

	mutex_enter(&rdsv3_cong_lock);

	RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
		/* queue only one update per conn until the send drains it */
		if (!test_and_set_bit(0, &conn->c_map_queued)) {
			rdsv3_stats_inc(s_cong_update_queued);
			rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);
		}
	}

	mutex_exit(&rdsv3_cong_lock);

	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
}

void
rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
{
	RDSV3_DPRINTF4("rdsv3_cong_map_updated",
	    "waking map %p for %u.%u.%u.%u",
	    map, NIPQUAD(map->m_addr));
	rdsv3_stats_inc(s_cong_update_received);
	atomic_add_32(&rdsv3_cong_generation, 1);
	/*
	 * XXX: a waitqueue_active(&map->m_waitq) check used to guard
	 * this; we always issue the wakeup.
	 */
	rdsv3_wake_up(&map->m_waitq);

	if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
		struct rdsv3_sock *rs;

		rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
		RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
		    rs_cong_list) {
			mutex_enter(&rs->rs_lock);
			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
			rs->rs_cong_mask &= ~portmask;
			mutex_exit(&rs->rs_lock);
			if (rs->rs_cong_notify)
				rdsv3_wake_sk_sleep(rs);
		}
		rw_exit(&rdsv3_cong_monitor_lock);
	}

	RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
}

int
rdsv3_cong_updated_since(unsigned long *recent)
{
	unsigned long gen = atomic_get(&rdsv3_cong_generation);

	if (*recent == gen)
		return (0);
	*recent = gen;
	return (1);
}

/*
 * These should be using generic_{test,__{clear,set}}_le_bit() but some old
 * kernels don't have them.  Sigh.
 */
#if defined(sparc)
#define	LE_BIT_XOR	((BITS_PER_LONG-1) & ~0x7)
#else
#define	LE_BIT_XOR	0
#endif
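
/*
 * On big-endian sparc, XORing a bit index with LE_BIT_XOR (56 for
 * 64-bit longs, 24 for 32-bit) reverses the byte order within each
 * long while preserving the bit position within a byte, so the maps
 * stay in little-endian bit order regardless of the native word
 * layout.
 */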

/*
 * We're called under the locking that protects the socket's receive buffer
 * consumption.  This makes it a lot easier for the caller to only call us
 * when it knows that an existing set bit needs to be cleared, and vice versa.
 * We can't block and we need to deal with concurrent sockets working against
 * the same per-address map.
 */
void
rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
{
	unsigned long i;
	unsigned long off;

	RDSV3_DPRINTF4("rdsv3_cong_set_bit",
	    "setting congestion for %u.%u.%u.%u:%u in map %p",
	    NIPQUAD(map->m_addr), ntohs(port), map);

	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;

	set_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]);
}
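
/*
 * Worked example of the mapping above (assuming 4 KB pages, so
 * RDSV3_CONG_MAP_PAGE_BITS is 32768): host-order port 40000 lands in
 * page i = 40000 / 32768 = 1 at bit offset off = 40000 % 32768 = 7232.
 */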

void
rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
{
	unsigned long i;
	unsigned long off;

	RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
	    "clearing congestion for %u.%u.%u.%u:%u in map %p",
	    NIPQUAD(map->m_addr), ntohs(port), map);

	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;

	clear_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]);
}

static int
rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
{
	unsigned long i;
	unsigned long off;

	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;

	RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
	    ntohs(port), i, off);

	return (test_bit(off ^ LE_BIT_XOR, (void *)map->m_page_addrs[i]));
}

#undef LE_BIT_XOR

void
rdsv3_cong_add_socket(struct rdsv3_sock *rs)
{
	RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);

	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
	if (!list_link_active(&rs->rs_cong_list))
		list_insert_head(&rdsv3_cong_monitor, rs);
	rw_exit(&rdsv3_cong_monitor_lock);
}

void
rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
{
	struct rdsv3_cong_map *map;

	RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);

	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
	list_remove_node(&rs->rs_cong_list);
	rw_exit(&rdsv3_cong_monitor_lock);

	/* update congestion map for now-closed port */
	mutex_enter(&rdsv3_cong_lock);
	map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
	mutex_exit(&rdsv3_cong_lock);

	if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
		rdsv3_cong_clear_bit(map, rs->rs_bound_port);
		rdsv3_cong_queue_updates(map);
	}
}

int
rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
    struct rdsv3_sock *rs)
{
	int ret = 0;

	RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
	    rs, nonblock);

	if (!rdsv3_cong_test_bit(map, port))
		return (0);
	if (nonblock) {
		if (rs && rs->rs_cong_monitor) {
			/*
			 * It would have been nice to have an atomic set_bit on
			 * a uint64_t.
			 */
			mutex_enter(&rs->rs_lock);
			rs->rs_cong_mask |=
			    RDSV3_CONG_MONITOR_MASK(ntohs(port));
			mutex_exit(&rs->rs_lock);

			/*
			 * Test again - a congestion update may have arrived in
			 * the meantime.
			 */
			if (!rdsv3_cong_test_bit(map, port))
				return (0);
		}
		rdsv3_stats_inc(s_cong_send_error);
		return (-ENOBUFS);
	}

	rdsv3_stats_inc(s_cong_send_blocked);
	RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
	    map, ntohs(port));

	/*
	 * Open-coded equivalent of
	 *
	 *	ret = rdsv3_wait_sig(&map->m_waitq,
	 *	    !rdsv3_cong_test_bit(map, port));
	 *
	 * sleep on the map's waitqueue until the port's congestion bit
	 * clears or a signal arrives.
	 */
	mutex_enter(&map->m_waitq.waitq_mutex);
	map->m_waitq.waitq_waiters++;
	while (rdsv3_cong_test_bit(map, port)) {
		/* cv_wait_sig() returns 0 when interrupted by a signal */
		ret = cv_wait_sig(&map->m_waitq.waitq_cv,
		    &map->m_waitq.waitq_mutex);
		if (ret == 0) {
			ret = -ERESTART;
			break;
		}
		/*
		 * Woken normally; normalize cv_wait_sig()'s positive
		 * return to 0 so a successful wait returns 0, then
		 * re-test the bit.
		 */
		ret = 0;
	}
	map->m_waitq.waitq_waiters--;
	mutex_exit(&map->m_waitq.waitq_mutex);
	return (ret);
}
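
/*
 * Hypothetical send-path usage sketch (the real caller lives in the
 * send code; names here are illustrative).  -ENOBUFS means a
 * nonblocking send hit a congested port; -ERESTART means a blocking
 * wait was interrupted by a signal.
 *
 *	ret = rdsv3_cong_wait(conn->c_fcong, dport, nonblock, rs);
 *	if (ret)
 *		goto out;
 */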

void
rdsv3_cong_exit(void)
{
	struct rdsv3_cong_map *map;
	unsigned long i;

	RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");

	while ((map = avl_first(&rdsv3_cong_tree)) != NULL) {
		RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p", map);
		avl_remove(&rdsv3_cong_tree, map);
		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
		    i++)
			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
		kmem_free(map, sizeof (*map));
	}

	RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
}


/*
 * Allocate an RDS message containing a congestion update.
 */
struct rdsv3_message *
rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
{
	struct rdsv3_cong_map *map = conn->c_lcong;
	struct rdsv3_message *rm;

	rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
	if (!IS_ERR(rm))
		rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;

	return (rm);
}

static int
rdsv3_cong_compare(const void *map1, const void *map2)
{
#define	addr1	((struct rdsv3_cong_map *)map1)->m_addr
#define	addr2	((struct rdsv3_cong_map *)map2)->m_addr

	if (addr1 < addr2)
		return (-1);
	if (addr1 > addr2)
		return (1);
	return (0);
#undef	addr1
#undef	addr2
}

void
rdsv3_cong_init(void)
{
	list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
	    offsetof(struct rdsv3_sock, rs_cong_list));
	rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
	avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
	    sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
	    m_rb_node));
}