1 /*
2  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3  */
4 
5 /*
6  * This file contains code imported from the OFED rds source file cong.c
7  * Oracle elects to have and use the contents of cong.c under and governed
8  * by the OpenIB.org BSD license (see below for full license text). However,
9  * the following notice accompanied the original version of this file:
10  */
11 
12 
13 /*
14  * Copyright (c) 2007 Oracle.  All rights reserved.
15  *
16  * This software is available to you under a choice of one of two
17  * licenses.  You may choose to be licensed under the terms of the GNU
18  * General Public License (GPL) Version 2, available from the file
19  * COPYING in the main directory of this source tree, or the
20  * OpenIB.org BSD license below:
21  *
22  *     Redistribution and use in source and binary forms, with or
23  *     without modification, are permitted provided that the following
24  *     conditions are met:
25  *
26  *      - Redistributions of source code must retain the above
27  *        copyright notice, this list of conditions and the following
28  *        disclaimer.
29  *
30  *      - Redistributions in binary form must reproduce the above
31  *        copyright notice, this list of conditions and the following
32  *        disclaimer in the documentation and/or other materials
33  *        provided with the distribution.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42  * SOFTWARE.
43  *
44  */
45 #include <sys/rds.h>
46 
47 #include <sys/ib/clients/rdsv3/rdsv3.h>
48 #include <sys/ib/clients/rdsv3/rdsv3_impl.h>
49 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
50 
51 /*
52  * This file implements the receive side of the unconventional congestion
53  * management in RDS.
54  *
55  * Messages waiting in the receive queue on the receiving socket are accounted
56  * against the sockets SO_RCVBUF option value.  Only the payload bytes in the
57  * message are accounted for.  If the number of bytes queued equals or exceeds
 * rcvbuf then the socket is congested.  All sends attempted to this socket's
 * address should either block or return -EWOULDBLOCK.
60  *
61  * Applications are expected to be reasonably tuned such that this situation
62  * very rarely occurs.  An application encountering this "back-pressure" is
63  * considered a bug.
64  *
65  * This is implemented by having each node maintain bitmaps which indicate
66  * which ports on bound addresses are congested.  As the bitmap changes it is
67  * sent through all the connections which terminate in the local address of the
68  * bitmap which changed.
69  *
70  * The bitmaps are allocated as connections are brought up.  This avoids
71  * allocation in the interrupt handling path which queues messages on sockets.
72  * The dense bitmaps let transports send the entire bitmap on any bitmap change
73  * reasonably efficiently.  This is much easier to implement than some
74  * finer-grained communication of per-port congestion.  The sender does a very
75  * inexpensive bit test to test if the port it's about to send to is congested
76  * or not.
77  */
78 
/*
 * Interaction with poll is a tad tricky. We want all processes stuck in
 * poll to wake up and check whether a congested destination became uncongested.
 * The really sad thing is we have no idea which destinations the application
 * wants to send to - we don't even know which rdsv3_connections are involved.
 * So until we implement a more flexible rds poll interface, we have to make
 * do with this:
 * We maintain a global counter that is incremented each time a congestion map
 * update is received. Each rds socket tracks this value, and if rdsv3_poll
 * finds that the saved generation number differs from the global generation
 * number, it wakes up the process.
 *
 * In this file the counter is incremented by rdsv3_cong_map_updated() and
 * compared (for inequality) against a caller-supplied snapshot by
 * rdsv3_cong_updated_since().
 */
static atomic_t		rdsv3_cong_generation = ATOMIC_INIT(0);
92 
/*
 * Congestion monitoring
 *
 * rdsv3_cong_monitor is a list of struct rdsv3_sock linked through
 * rs_cong_list (created in rdsv3_cong_init()).  rdsv3_cong_add_socket() and
 * rdsv3_cong_remove_socket() modify it with rdsv3_cong_monitor_lock held as
 * writer; rdsv3_cong_map_updated() walks it with the lock held as reader.
 */
static struct list rdsv3_cong_monitor;
static krwlock_t rdsv3_cong_monitor_lock;
98 
/*
 * Yes, a global lock.  It's used so infrequently that it's worth keeping it
 * global to simplify the locking.  It's only used in the following
 * circumstances:
 *
 *  - on connection buildup to associate a conn with its maps
 *  - on map changes to inform conns of a new map to send
 *
 *  It's sadly ordered under the socket callback lock and the connection lock.
 *  Receive paths can mark ports congested from interrupt context so the
 *  lock masks interrupts.
 *
 *  NOTE(review): here the lock is a kmutex_t that rdsv3_cong_init() sets up
 *  as MUTEX_DRIVER with a NULL cookie, and rdsv3_cong_set_bit() does not take
 *  it, so the remark about masking interrupts looks inherited from an
 *  earlier implementation - confirm.
 *
 *  Within this file the lock is held around every access to rdsv3_cong_tree
 *  and around every modification or walk of a map's m_conn_list.
 */
static kmutex_t rdsv3_cong_lock;
/* Tree of struct rdsv3_cong_map, ordered by rdsv3_cong_compare(). */
static struct avl_tree rdsv3_cong_tree;
113 
114 static struct rdsv3_cong_map *
rdsv3_cong_tree_walk(uint32_be_t addr,struct rdsv3_cong_map * insert)115 rdsv3_cong_tree_walk(uint32_be_t addr, struct rdsv3_cong_map *insert)
116 {
117 	struct rdsv3_cong_map *map;
118 	avl_index_t where;
119 
120 	if (insert) {
121 		map = avl_find(&rdsv3_cong_tree, insert, &where);
122 		if (map == NULL) {
123 			avl_insert(&rdsv3_cong_tree, insert, where);
124 			return (NULL);
125 		}
126 	} else {
127 		struct rdsv3_cong_map map1;
128 		map1.m_addr = addr;
129 		map = avl_find(&rdsv3_cong_tree, &map1, &where);
130 	}
131 
132 	return (map);
133 }
134 
135 /*
136  * There is only ever one bitmap for any address.  Connections try and allocate
137  * these bitmaps in the process getting pointers to them.  The bitmaps are only
138  * ever freed as the module is removed after all connections have been freed.
139  */
140 static struct rdsv3_cong_map *
rdsv3_cong_from_addr(uint32_be_t addr)141 rdsv3_cong_from_addr(uint32_be_t addr)
142 {
143 	struct rdsv3_cong_map *map;
144 	struct rdsv3_cong_map *ret = NULL;
145 	unsigned long zp;
146 	unsigned long i;
147 
148 	RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr));
149 
150 	map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP);
151 	if (!map)
152 		return (NULL);
153 
154 	map->m_addr = addr;
155 	rdsv3_init_waitqueue(&map->m_waitq);
156 	list_create(&map->m_conn_list, sizeof (struct rdsv3_connection),
157 	    offsetof(struct rdsv3_connection, c_map_item));
158 
159 	for (i = 0; i < RDSV3_CONG_MAP_PAGES; i++) {
160 		zp = (unsigned long)kmem_zalloc(PAGE_SIZE, KM_NOSLEEP);
161 		if (zp == 0)
162 			goto out;
163 		map->m_page_addrs[i] = zp;
164 	}
165 
166 	mutex_enter(&rdsv3_cong_lock);
167 	ret = rdsv3_cong_tree_walk(addr, map);
168 	mutex_exit(&rdsv3_cong_lock);
169 
170 	if (!ret) {
171 		ret = map;
172 		map = NULL;
173 	}
174 
175 out:
176 	if (map) {
177 		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
178 		    i++)
179 			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
180 		kmem_free(map, sizeof (*map));
181 	}
182 
183 	RDSV3_DPRINTF5("rdsv3_cong_from_addr", "map %p for addr %x",
184 	    ret, ntohl(addr));
185 
186 	return (ret);
187 }
188 
189 /*
190  * Put the conn on its local map's list.  This is called when the conn is
191  * really added to the hash.  It's nested under the rdsv3_conn_lock, sadly.
192  */
193 void
rdsv3_cong_add_conn(struct rdsv3_connection * conn)194 rdsv3_cong_add_conn(struct rdsv3_connection *conn)
195 {
196 	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Enter(conn: %p)", conn);
197 
198 	RDSV3_DPRINTF5("rdsv3_cong_add_conn", "conn %p now on map %p",
199 	    conn, conn->c_lcong);
200 	mutex_enter(&rdsv3_cong_lock);
201 	list_insert_tail(&conn->c_lcong->m_conn_list, conn);
202 	mutex_exit(&rdsv3_cong_lock);
203 
204 	RDSV3_DPRINTF4("rdsv3_cong_add_conn", "Return(conn: %p)", conn);
205 }
206 
207 void
rdsv3_cong_remove_conn(struct rdsv3_connection * conn)208 rdsv3_cong_remove_conn(struct rdsv3_connection *conn)
209 {
210 	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Enter(conn: %p)", conn);
211 
212 	RDSV3_DPRINTF5("rdsv3_cong_remove_conn", "removing conn %p from map %p",
213 	    conn, conn->c_lcong);
214 	mutex_enter(&rdsv3_cong_lock);
215 	list_remove_node(&conn->c_map_item);
216 	mutex_exit(&rdsv3_cong_lock);
217 
218 	RDSV3_DPRINTF4("rdsv3_cong_remove_conn", "Return(conn: %p)", conn);
219 }
220 
221 int
rdsv3_cong_get_maps(struct rdsv3_connection * conn)222 rdsv3_cong_get_maps(struct rdsv3_connection *conn)
223 {
224 	conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr);
225 	conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr);
226 
227 	if (!(conn->c_lcong && conn->c_fcong))
228 		return (-ENOMEM);
229 
230 	return (0);
231 }
232 
233 void
rdsv3_cong_queue_updates(struct rdsv3_cong_map * map)234 rdsv3_cong_queue_updates(struct rdsv3_cong_map *map)
235 {
236 	struct rdsv3_connection *conn;
237 
238 	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Enter(map: %p)", map);
239 
240 	mutex_enter(&rdsv3_cong_lock);
241 
242 	RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) {
243 		if (!test_and_set_bit(0, &conn->c_map_queued)) {
244 			rdsv3_stats_inc(s_cong_update_queued);
245 			(void) rdsv3_send_xmit(conn);
246 		}
247 	}
248 
249 	mutex_exit(&rdsv3_cong_lock);
250 
251 	RDSV3_DPRINTF4("rdsv3_cong_queue_updates", "Return(map: %p)", map);
252 }
253 
/*
 * Announce that a congestion map has changed.
 *
 * Bumps the s_cong_update_received statistic and the global congestion
 * generation counter, wakes the map's wait queue, and - when 'portmask' is
 * non-zero and sockets are registered for congestion monitoring - moves the
 * bits of 'portmask' that each monitoring socket had in rs_cong_mask over to
 * its rs_cong_notify and wakes sockets that have notifications pending.
 */
void
rdsv3_cong_map_updated(struct rdsv3_cong_map *map, uint64_t portmask)
{
	RDSV3_DPRINTF4("rdsv3_cong_map_updated",
	    "waking map %p for %u.%u.%u.%u",
	    map, NIPQUAD(map->m_addr));

	rdsv3_stats_inc(s_cong_update_received);
	atomic_inc_32(&rdsv3_cong_generation);
	/*
	 * The waitqueue_active() guard is compiled out, so the wake-up below
	 * is issued unconditionally.
	 */
#if 0
XXX
	if (waitqueue_active(&map->m_waitq))
#endif
		rdsv3_wake_up(&map->m_waitq);

	/* The emptiness check is made before the monitor lock is taken. */
	if (portmask && !list_is_empty(&rdsv3_cong_monitor)) {
		struct rdsv3_sock *rs;

		rw_enter(&rdsv3_cong_monitor_lock, RW_READER);
		RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_cong_monitor,
		    rs_cong_list) {
			/* rs_lock covers the two mask updates only. */
			mutex_enter(&rs->rs_lock);
			rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
			rs->rs_cong_mask &= ~portmask;
			mutex_exit(&rs->rs_lock);
			/* rs_cong_notify is re-read here without rs_lock. */
			if (rs->rs_cong_notify)
				rdsv3_wake_sk_sleep(rs);
		}
		rw_exit(&rdsv3_cong_monitor_lock);
	}

	RDSV3_DPRINTF4("rdsv3_cong_map_updated", "Return(map: %p)", map);
}
287 
288 int
rdsv3_cong_updated_since(unsigned long * recent)289 rdsv3_cong_updated_since(unsigned long *recent)
290 {
291 	unsigned long gen = atomic_get(&rdsv3_cong_generation);
292 
293 	if (*recent == gen)
294 		return (0);
295 	*recent = gen;
296 	return (1);
297 }
298 
299 /*
300  * We're called under the locking that protects the sockets receive buffer
301  * consumption.  This makes it a lot easier for the caller to only call us
302  * when it knows that an existing set bit needs to be cleared, and vice versa.
303  * We can't block and we need to deal with concurrent sockets working against
304  * the same per-address map.
305  */
306 void
rdsv3_cong_set_bit(struct rdsv3_cong_map * map,uint16_be_t port)307 rdsv3_cong_set_bit(struct rdsv3_cong_map *map, uint16_be_t port)
308 {
309 	unsigned long i;
310 	unsigned long off;
311 
312 	RDSV3_DPRINTF4("rdsv3_cong_set_bit",
313 	    "setting congestion for %u.%u.%u.%u:%u in map %p",
314 	    NIPQUAD(map->m_addr), ntohs(port), map);
315 
316 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
317 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
318 	set_le_bit(off, (void *)map->m_page_addrs[i]);
319 }
320 
321 void
rdsv3_cong_clear_bit(struct rdsv3_cong_map * map,uint16_be_t port)322 rdsv3_cong_clear_bit(struct rdsv3_cong_map *map, uint16_be_t port)
323 {
324 	unsigned long i;
325 	unsigned long off;
326 
327 	RDSV3_DPRINTF4("rdsv3_cong_clear_bit",
328 	    "clearing congestion for %u.%u.%u.%u:%u in map %p\n",
329 	    NIPQUAD(map->m_addr), ntohs(port), map);
330 
331 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
332 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
333 	clear_le_bit(off, (void *)map->m_page_addrs[i]);
334 }
335 
336 static int
rdsv3_cong_test_bit(struct rdsv3_cong_map * map,uint16_be_t port)337 rdsv3_cong_test_bit(struct rdsv3_cong_map *map, uint16_be_t port)
338 {
339 	unsigned long i;
340 	unsigned long off;
341 
342 	i = ntohs(port) / RDSV3_CONG_MAP_PAGE_BITS;
343 	off = ntohs(port) % RDSV3_CONG_MAP_PAGE_BITS;
344 
345 	RDSV3_DPRINTF5("rdsv3_cong_test_bit", "port: 0x%x i = %lx off = %lx",
346 	    ntohs(port), i, off);
347 
348 	return (test_le_bit(off, (void *)map->m_page_addrs[i]));
349 }
350 
351 void
rdsv3_cong_add_socket(struct rdsv3_sock * rs)352 rdsv3_cong_add_socket(struct rdsv3_sock *rs)
353 {
354 	RDSV3_DPRINTF4("rdsv3_cong_add_socket", "Enter(rs: %p)", rs);
355 
356 	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
357 	if (!list_link_active(&rs->rs_cong_list))
358 		list_insert_head(&rdsv3_cong_monitor, rs);
359 	rw_exit(&rdsv3_cong_monitor_lock);
360 }
361 
362 void
rdsv3_cong_remove_socket(struct rdsv3_sock * rs)363 rdsv3_cong_remove_socket(struct rdsv3_sock *rs)
364 {
365 	struct rdsv3_cong_map *map;
366 
367 	RDSV3_DPRINTF4("rdsv3_cong_remove_socket", "Enter(rs: %p)", rs);
368 
369 	rw_enter(&rdsv3_cong_monitor_lock, RW_WRITER);
370 	list_remove_node(&rs->rs_cong_list);
371 	rw_exit(&rdsv3_cong_monitor_lock);
372 
373 	/* update congestion map for now-closed port */
374 	mutex_enter(&rdsv3_cong_lock);
375 	map = rdsv3_cong_tree_walk(rs->rs_bound_addr, NULL);
376 	mutex_exit(&rdsv3_cong_lock);
377 
378 	if (map && rdsv3_cong_test_bit(map, rs->rs_bound_port)) {
379 		rdsv3_cong_clear_bit(map, rs->rs_bound_port);
380 		rdsv3_cong_queue_updates(map);
381 	}
382 }
383 
384 int
rdsv3_cong_wait(struct rdsv3_cong_map * map,uint16_be_t port,int nonblock,struct rdsv3_sock * rs)385 rdsv3_cong_wait(struct rdsv3_cong_map *map, uint16_be_t port, int nonblock,
386     struct rdsv3_sock *rs)
387 {
388 	int ret = 0;
389 
390 	RDSV3_DPRINTF4("rdsv3_cong_wait", "Enter(rs: %p, mode: %d)",
391 	    rs, nonblock);
392 
393 	if (!rdsv3_cong_test_bit(map, port))
394 		return (0);
395 	if (nonblock) {
396 		if (rs && rs->rs_cong_monitor) {
397 			/*
398 			 * It would have been nice to have an atomic set_bit on
399 			 * a uint64_t.
400 			 */
401 			mutex_enter(&rs->rs_lock);
402 			rs->rs_cong_mask |=
403 			    RDS_CONG_MONITOR_MASK(ntohs(port));
404 			mutex_exit(&rs->rs_lock);
405 
406 			/*
407 			 * Test again - a congestion update may have arrived in
408 			 * the meantime.
409 			 */
410 			if (!rdsv3_cong_test_bit(map, port))
411 				return (0);
412 		}
413 		rdsv3_stats_inc(s_cong_send_error);
414 		return (-ENOBUFS);
415 	}
416 
417 	rdsv3_stats_inc(s_cong_send_blocked);
418 	RDSV3_DPRINTF3("rdsv3_cong_wait", "waiting on map %p for port %u",
419 	    map, ntohs(port));
420 
421 #if 0
422 	ret = rdsv3_wait_sig(&map->m_waitq, !rdsv3_cong_test_bit(map, port));
423 	if (ret == 0)
424 		return (-ERESTART);
425 	return (0);
426 #else
427 	mutex_enter(&map->m_waitq.waitq_mutex);
428 	map->m_waitq.waitq_waiters++;
429 	while (rdsv3_cong_test_bit(map, port)) {
430 		ret = cv_wait_sig(&map->m_waitq.waitq_cv,
431 		    &map->m_waitq.waitq_mutex);
432 		if (ret == 0) {
433 			ret = -EINTR;
434 			break;
435 		}
436 	}
437 	map->m_waitq.waitq_waiters--;
438 	mutex_exit(&map->m_waitq.waitq_mutex);
439 	return (ret);
440 #endif
441 }
442 
443 void
rdsv3_cong_exit(void)444 rdsv3_cong_exit(void)
445 {
446 	struct rdsv3_cong_map *map;
447 	unsigned long i;
448 
449 	RDSV3_DPRINTF4("rdsv3_cong_exit", "Enter");
450 
451 	while ((map = avl_first(&rdsv3_cong_tree))) {
452 		RDSV3_DPRINTF5("rdsv3_cong_exit", "freeing map %p\n", map);
453 		avl_remove(&rdsv3_cong_tree, map);
454 		for (i = 0; i < RDSV3_CONG_MAP_PAGES && map->m_page_addrs[i];
455 		    i++)
456 			kmem_free((void *)map->m_page_addrs[i], PAGE_SIZE);
457 		kmem_free(map, sizeof (*map));
458 	}
459 
460 	RDSV3_DPRINTF4("rdsv3_cong_exit", "Return");
461 }
462 
463 /*
464  * Allocate a RDS message containing a congestion update.
465  */
466 struct rdsv3_message *
rdsv3_cong_update_alloc(struct rdsv3_connection * conn)467 rdsv3_cong_update_alloc(struct rdsv3_connection *conn)
468 {
469 	struct rdsv3_cong_map *map = conn->c_lcong;
470 	struct rdsv3_message *rm;
471 
472 	rm = rdsv3_message_map_pages(map->m_page_addrs, RDSV3_CONG_MAP_BYTES);
473 	if (!IS_ERR(rm))
474 		rm->m_inc.i_hdr.h_flags = RDSV3_FLAG_CONG_BITMAP;
475 
476 	return (rm);
477 }
478 
479 static int
rdsv3_cong_compare(const void * map1,const void * map2)480 rdsv3_cong_compare(const void *map1, const void *map2)
481 {
482 #define	addr1	((struct rdsv3_cong_map *)map1)->m_addr
483 #define	addr2	((struct rdsv3_cong_map *)map2)->m_addr
484 
485 	if (addr1 < addr2)
486 		return (-1);
487 	if (addr1 > addr2)
488 		return (1);
489 	return (0);
490 }
491 
492 void
rdsv3_cong_init(void)493 rdsv3_cong_init(void)
494 {
495 	list_create(&rdsv3_cong_monitor, sizeof (struct rdsv3_sock),
496 	    offsetof(struct rdsv3_sock, rs_cong_list));
497 	rw_init(&rdsv3_cong_monitor_lock, NULL, RW_DRIVER, NULL);
498 	mutex_init(&rdsv3_cong_lock, NULL, MUTEX_DRIVER, NULL);
499 	avl_create(&rdsv3_cong_tree, rdsv3_cong_compare,
500 	    sizeof (struct rdsv3_cong_map), offsetof(struct rdsv3_cong_map,
501 	    m_rb_node));
502 }
503