1/*
2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3 */
4
5/*
6 * This file contains code imported from the OFED rds source file ib.c
7 * Oracle elects to have and use the contents of ib.c under and governed
8 * by the OpenIB.org BSD license (see below for full license text). However,
9 * the following notice accompanied the original version of this file:
10 */
11
12/*
13 * Copyright (c) 2006 Oracle.  All rights reserved.
14 *
15 * This software is available to you under a choice of one of two
16 * licenses.  You may choose to be licensed under the terms of the GNU
17 * General Public License (GPL) Version 2, available from the file
18 * COPYING in the main directory of this source tree, or the
19 * OpenIB.org BSD license below:
20 *
21 *     Redistribution and use in source and binary forms, with or
22 *     without modification, are permitted provided that the following
23 *     conditions are met:
24 *
25 *      - Redistributions of source code must retain the above
26 *        copyright notice, this list of conditions and the following
27 *        disclaimer.
28 *
29 *      - Redistributions in binary form must reproduce the above
30 *        copyright notice, this list of conditions and the following
31 *        disclaimer in the documentation and/or other materials
32 *        provided with the distribution.
33 *
34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 * SOFTWARE.
42 *
43 */
44#include <sys/sysmacros.h>
45#include <sys/rds.h>
46
47#include <sys/ib/ibtl/ibti.h>
48#include <sys/ib/clients/rdsv3/rdsv3.h>
49#include <sys/ib/clients/rdsv3/ib.h>
50#include <sys/ib/clients/rdsv3/rdsv3_debug.h>
51
/*
 * Retry count for RDS/IB, initialised to the compile-time default.
 * It is not read anywhere in this file.
 * NOTE(review): presumably consumed by the connection-manager code - confirm.
 */
unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

/*
 * List of struct rdsv3_ib_device entries.  Created in rdsv3_ib_init(),
 * appended to by rdsv3_ib_add_one() and destroyed in rdsv3_ib_exit().
 */
struct list	rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
/*
 * List of struct rdsv3_ib_connection linked through their ib_node member.
 * Created in rdsv3_ib_init() and destroyed in rdsv3_ib_exit().
 * NOTE(review): the name suggests connections with no device attached;
 * presumably guarded by ib_nodev_conns_lock - confirm against the users.
 */
list_t ib_nodev_conns;

/*
 * Constructor/destructor pair defined elsewhere; rdsv3_ib_add_one() hands
 * them to kmem_cache_create() for the per-device rdsv3_page_frag cache.
 */
extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
62
63void
64rdsv3_ib_add_one(ib_device_t *device)
65{
66	struct rdsv3_ib_device *rds_ibdev;
67	ibt_hca_attr_t *dev_attr;
68	char name[64];
69
70	RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
71
72	/* Only handle IB (no iWARP) devices */
73	if (device->node_type != RDMA_NODE_IB_CA)
74		return;
75
76	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
77	    KM_NOSLEEP);
78	if (!dev_attr)
79		return;
80
81	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
82		RDSV3_DPRINTF2("rdsv3_ib_add_one",
83		    "Query device failed for %s", device->name);
84		goto free_attr;
85	}
86
87	/* We depend on Reserved Lkey */
88	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
89		RDSV3_DPRINTF2("rdsv3_ib_add_one",
90		    "Reserved Lkey support is required: %s",
91		    device->name);
92		goto free_attr;
93	}
94
95	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
96	if (!rds_ibdev)
97		goto free_attr;
98
99	rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
100	rds_ibdev->hca_attr =  *dev_attr;
101
102	rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
103	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
104
105	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
106	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
107
108	rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
109	rds_ibdev->max_responder_resources =
110	    (uint_t)dev_attr->hca_max_rdma_in_qp;
111
112	rds_ibdev->dev = device;
113	rds_ibdev->pd = ib_alloc_pd(device);
114	if (IS_ERR(rds_ibdev->pd))
115		goto free_dev;
116
117	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
118		goto free_dev;
119	}
120
121	if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
122		rdsv3_ib_destroy_mr_pool(rds_ibdev);
123		goto free_dev;
124	}
125
126	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
127	    (longlong_t)htonll(dev_attr->hca_node_guid));
128	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
129	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
130	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
131	if (rds_ibdev->ib_frag_slab == NULL) {
132		RDSV3_DPRINTF2("rdsv3_ib_add_one",
133		    "kmem_cache_create for ib_frag_slab failed for device: %s",
134		    device->name);
135		rdsv3_ib_destroy_mr_pool(rds_ibdev);
136		rdsv3_ib_destroy_inc_pool(rds_ibdev);
137		goto free_dev;
138	}
139
140	rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
141	    (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
142	if (rds_ibdev->aft_hcagp == NULL) {
143		rdsv3_ib_destroy_mr_pool(rds_ibdev);
144		rdsv3_ib_destroy_inc_pool(rds_ibdev);
145		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
146		goto free_dev;
147	}
148	rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
149	    (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
150	    rds_ibdev->aft_hcagp);
151	if (rds_ibdev->fmr_soft_cq == NULL) {
152		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
153		rdsv3_ib_destroy_mr_pool(rds_ibdev);
154		rdsv3_ib_destroy_inc_pool(rds_ibdev);
155		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
156		goto free_dev;
157	}
158
159	rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
160	    (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
161	    rds_ibdev->aft_hcagp);
162	if (rds_ibdev->inc_soft_cq == NULL) {
163		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
164		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
165		rdsv3_ib_destroy_mr_pool(rds_ibdev);
166		rdsv3_ib_destroy_inc_pool(rds_ibdev);
167		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
168		goto free_dev;
169	}
170
171	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
172	    offsetof(struct rdsv3_ib_ipaddr, list));
173	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
174	    offsetof(struct rdsv3_ib_connection, ib_node));
175
176	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
177
178	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
179
180	RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
181
182	goto free_attr;
183
184err_pd:
185	(void) ib_dealloc_pd(rds_ibdev->pd);
186free_dev:
187	mutex_destroy(&rds_ibdev->spinlock);
188	rw_destroy(&rds_ibdev->rwlock);
189	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
190free_attr:
191	kmem_free(dev_attr, sizeof (*dev_attr));
192}
193
194void
195rdsv3_ib_remove_one(struct ib_device *device)
196{
197	struct rdsv3_ib_device *rds_ibdev;
198	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
199
200	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
201
202	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
203	if (!rds_ibdev)
204		return;
205
206	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
207	    list) {
208		list_remove_node(&i_ipaddr->list);
209		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
210	}
211
212	rdsv3_ib_destroy_conns(rds_ibdev);
213
214	if (rds_ibdev->fmr_soft_cq)
215		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
216	if (rds_ibdev->inc_soft_cq)
217		rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
218
219	rdsv3_ib_destroy_mr_pool(rds_ibdev);
220	rdsv3_ib_destroy_inc_pool(rds_ibdev);
221
222	kmem_cache_destroy(rds_ibdev->ib_frag_slab);
223
224	rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
225
226#if 0
227	while (ib_dealloc_pd(rds_ibdev->pd)) {
228#ifndef __lock_lint
229		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
230		    "%s-%d Failed to dealloc pd %p",
231		    __func__, __LINE__, rds_ibdev->pd);
232#endif
233		delay(drv_usectohz(1000));
234	}
235#else
236	if (ib_dealloc_pd(rds_ibdev->pd)) {
237#ifndef __lock_lint
238		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
239		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
240#endif
241	}
242#endif
243
244	list_destroy(&rds_ibdev->ipaddr_list);
245	list_destroy(&rds_ibdev->conn_list);
246	list_remove_node(&rds_ibdev->list);
247	mutex_destroy(&rds_ibdev->spinlock);
248	rw_destroy(&rds_ibdev->rwlock);
249	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
250
251	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
252}
253
/*
 * IB client record for RDS.  rdsv3_ib_init() fills in the dip member and
 * passes this to ib_register_client(); rdsv3_ib_exit() unregisters it.  It
 * is also the key used with ib_set_client_data()/ib_get_client_data() to
 * find the struct rdsv3_ib_device attached to a device.
 *
 * The __lock_lint build uses positional initialisers instead of designated
 * ones.  NOTE(review): the two NULLs there presumably correspond to
 * clnt_hdl and dip - confirm against the struct ib_client declaration.
 */
#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
	.name		= "rdsv3_ib",
	.add		= rdsv3_ib_add_one,
	.remove		= rdsv3_ib_remove_one,
	.clnt_hdl	= NULL,
	.state		= IB_CLNT_UNINITIALIZED
};
#else
struct ib_client rdsv3_ib_client = {
	"rdsv3_ib",
	rdsv3_ib_add_one,
	rdsv3_ib_remove_one,
	NULL,
	NULL,
	IB_CLNT_UNINITIALIZED
};
#endif
272
273static int
274rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
275    void *buffer)
276{
277	struct rds_info_rdma_connection *iinfo = buffer;
278	struct rdsv3_ib_connection *ic;
279
280	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
281	    conn, buffer);
282
283	/* We will only ever look at IB transports */
284	if (conn->c_trans != &rdsv3_ib_transport)
285		return (0);
286
287	iinfo->src_addr = conn->c_laddr;
288	iinfo->dst_addr = conn->c_faddr;
289
290	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
291	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
292	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
293		struct rdsv3_ib_device *rds_ibdev;
294		struct rdma_dev_addr *dev_addr;
295
296		ic = conn->c_transport_data;
297		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
298
299		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
300		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
301
302		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
303		    &rdsv3_ib_client);
304		iinfo->max_send_wr = ic->i_send_ring.w_nr;
305		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
306		iinfo->max_send_sge = rds_ibdev->max_sge;
307	}
308
309	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
310	    conn, buffer);
311	return (1);
312}
313
314static void
315rds_ib_ic_info(struct rsock *sock, unsigned int len,
316    struct rdsv3_info_iterator *iter,
317    struct rdsv3_info_lengths *lens)
318{
319	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
320	    sock, iter, lens, len);
321
322	rdsv3_for_each_conn_info(sock, len, iter, lens,
323	    rds_ib_conn_info_visitor,
324	    sizeof (struct rds_info_rdma_connection));
325}
326
327/*
328 * Early RDS/IB was built to only bind to an address if there is an IPoIB
329 * device with that address set.
330 *
331 * If it were me, I'd advocate for something more flexible.  Sending and
332 * receiving should be device-agnostic.  Transports would try and maintain
333 * connections between peers who have messages queued.  Userspace would be
334 * allowed to influence which paths have priority.  We could call userspace
335 * asserting this policy "routing".
336 */
337static int
338rds_ib_laddr_check(uint32_be_t addr)
339{
340	int ret;
341	struct rdma_cm_id *cm_id;
342	struct sockaddr_in sin;
343
344	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
345
346	/*
347	 * Create a CMA ID and try to bind it. This catches both
348	 * IB and iWARP capable NICs.
349	 */
350	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
351	if (!cm_id)
352		return (-EADDRNOTAVAIL);
353
354	(void) memset(&sin, 0, sizeof (sin));
355	sin.sin_family = AF_INET;
356	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
357
358	/* rdma_bind_addr will only succeed for IB & iWARP devices */
359	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
360	/*
361	 * due to this, we will claim to support iWARP devices unless we
362	 * check node_type.
363	 */
364	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
365		ret = -EADDRNOTAVAIL;
366
367	RDSV3_DPRINTF5("rds_ib_laddr_check",
368	    "addr %u.%u.%u.%u ret %d node type %d",
369	    NIPQUAD(addr), ret,
370	    cm_id->device ? cm_id->device->node_type : -1);
371
372	rdma_destroy_id(cm_id);
373
374	return (ret);
375}
376
377void
378rdsv3_ib_exit(void)
379{
380	RDSV3_DPRINTF4("rds_ib_exit", "Enter");
381
382	rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
383	rdsv3_ib_destroy_nodev_conns();
384	ib_unregister_client(&rdsv3_ib_client);
385	rdsv3_ib_sysctl_exit();
386	rdsv3_ib_recv_exit();
387	rdsv3_trans_unregister(&rdsv3_ib_transport);
388	kmem_free(rdsv3_ib_stats,
389	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
390	mutex_destroy(&ib_nodev_conns_lock);
391	list_destroy(&ib_nodev_conns);
392	list_destroy(&rdsv3_ib_devices);
393
394	RDSV3_DPRINTF4("rds_ib_exit", "Return");
395}
396
/*
 * Operations table for the InfiniBand transport.  rdsv3_ib_init() registers
 * it with rdsv3_trans_register() and rdsv3_ib_exit() unregisters it;
 * rds_ib_conn_info_visitor() compares a connection's c_trans against its
 * address to recognise IB connections.
 *
 * No xmit_cong_map handler is provided (the slot is NULL).  Under
 * __lock_lint only an uninitialised definition is emitted.
 */
#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
	.laddr_check		= rds_ib_laddr_check,
	.xmit_complete		= rdsv3_ib_xmit_complete,
	.xmit			= rdsv3_ib_xmit,
	.xmit_cong_map		= NULL,
	.xmit_rdma		= rdsv3_ib_xmit_rdma,
	.recv			= rdsv3_ib_recv,
	.conn_alloc		= rdsv3_ib_conn_alloc,
	.conn_free		= rdsv3_ib_conn_free,
	.conn_connect		= rdsv3_ib_conn_connect,
	.conn_shutdown		= rdsv3_ib_conn_shutdown,
	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
	.inc_free		= rdsv3_ib_inc_free,
	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
	.stats_info_copy	= rdsv3_ib_stats_info_copy,
	.exit			= rdsv3_ib_exit,
	.get_mr			= rdsv3_ib_get_mr,
	.sync_mr		= rdsv3_ib_sync_mr,
	.free_mr		= rdsv3_ib_free_mr,
	.flush_mrs		= rdsv3_ib_flush_mrs,
	.t_name			= "infiniband",
	.t_type			= RDS_TRANS_IB
};
#else
struct rdsv3_transport rdsv3_ib_transport;
#endif
426
427int
428rdsv3_ib_init(void)
429{
430	int ret;
431
432	RDSV3_DPRINTF4("rds_ib_init", "Enter");
433
434	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
435	    offsetof(struct rdsv3_ib_device, list));
436	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
437	    offsetof(struct rdsv3_ib_connection, ib_node));
438	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
439
440	/* allocate space for ib statistics */
441	ASSERT(rdsv3_ib_stats == NULL);
442	rdsv3_ib_stats = kmem_zalloc(nr_cpus *
443	    sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
444
445	rdsv3_ib_client.dip = rdsv3_dev_info;
446	ret = ib_register_client(&rdsv3_ib_client);
447	if (ret)
448		goto out;
449
450	ret = rdsv3_ib_sysctl_init();
451	if (ret)
452		goto out_ibreg;
453
454	ret = rdsv3_ib_recv_init();
455	if (ret)
456		goto out_sysctl;
457
458	ret = rdsv3_trans_register(&rdsv3_ib_transport);
459	if (ret)
460		goto out_recv;
461
462	rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
463
464	RDSV3_DPRINTF4("rds_ib_init", "Return");
465
466	return (0);
467
468out_recv:
469	rdsv3_ib_recv_exit();
470out_sysctl:
471	rdsv3_ib_sysctl_exit();
472out_ibreg:
473	ib_unregister_client(&rdsv3_ib_client);
474out:
475	kmem_free(rdsv3_ib_stats,
476	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
477	mutex_destroy(&ib_nodev_conns_lock);
478	list_destroy(&ib_nodev_conns);
479	list_destroy(&rdsv3_ib_devices);
480	return (ret);
481}
482