1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_srq.c
29  *    Tavor Shared Receive Queue Processing Routines
30  *
31  *    Implements all the routines necessary for allocating, freeing, querying,
32  *    modifying and posting shared receive queues.
33  */
34 
35 #include <sys/sysmacros.h>
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/bitmap.h>
42 
43 #include <sys/ib/adapters/tavor/tavor.h>
44 
45 static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
46     tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
47 
48 /*
49  * tavor_srq_alloc()
50  *    Context: Can be called only from user or kernel context.
51  */
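/*
 * Illustrative usage sketch (not part of the original source): a caller
 * typically fills in a tavor_srq_info_t and invokes this routine roughly
 * as follows, using the field names extracted below:
 *
 *	tavor_srq_info_t	info;
 *
 *	info.srqi_ibt_srqhdl = ibt_srqhdl;	(IBTF SRQ handle)
 *	info.srqi_pd	     = pd;		(protection domain handle)
 *	info.srqi_sizes	     = &req_sizes;	(requested WRs and SGLs)
 *	info.srqi_real_sizes = &real_sizes;	(actual sizes returned)
 *	info.srqi_flags	     = flags;		(e.g. IBT_SRQ_USER_MAP)
 *	info.srqi_srqhdl     = &srqhdl;		(returned tavor_srqhdl_t)
 *	status = tavor_srq_alloc(state, &info, TAVOR_SLEEP, NULL);
 */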
52 int
53 tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
54     uint_t sleepflag, tavor_srq_options_t *op)
55 {
56 	ibt_srq_hdl_t		ibt_srqhdl;
57 	tavor_pdhdl_t		pd;
58 	ibt_srq_sizes_t		*sizes;
59 	ibt_srq_sizes_t		*real_sizes;
60 	tavor_srqhdl_t		*srqhdl;
61 	ibt_srq_flags_t		flags;
62 	tavor_rsrc_t		*srqc, *rsrc;
63 	tavor_hw_srqc_t		srqc_entry;
64 	uint32_t		*buf;
65 	tavor_srqhdl_t		srq;
66 	tavor_umap_db_entry_t	*umapdb;
67 	ibt_mr_attr_t		mr_attr;
68 	tavor_mr_options_t	mr_op;
69 	tavor_mrhdl_t		mr;
70 	uint64_t		addr;
71 	uint64_t		value, srq_desc_off;
72 	uint32_t		lkey;
73 	uint32_t		log_srq_size;
74 	uint32_t		uarpg;
75 	uint_t			wq_location, dma_xfer_mode, srq_is_umap;
76 	int			flag, status;
77 	uint_t			max_sgl;
78 	uint_t			wqesz;
79 
80 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))
81 
82 	/*
83 	 * Check the "options" flag.  Currently this flag tells the driver
84 	 * whether the SRQ's work queues should come from normal system
85 	 * memory or whether they should be allocated from DDR memory.
86 	 */
87 	if (op == NULL) {
88 		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
89 	} else {
90 		wq_location = op->srqo_wq_loc;
91 	}
92 
93 	/*
94 	 * Extract the necessary info from the tavor_srq_info_t structure
95 	 */
96 	real_sizes = srqinfo->srqi_real_sizes;
97 	sizes	   = srqinfo->srqi_sizes;
98 	pd	   = srqinfo->srqi_pd;
99 	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
100 	flags	   = srqinfo->srqi_flags;
101 	srqhdl	   = srqinfo->srqi_srqhdl;
102 
103 	/*
104 	 * Determine whether SRQ is being allocated for userland access or
105 	 * whether it is being allocated for kernel access.  If the SRQ is
106 	 * being allocated for userland access, then lookup the UAR doorbell
107 	 * page number for the current process.  Note:  If this is not found
108 	 * (e.g. if the process has not previously open()'d the Tavor driver),
109 	 * then an error is returned.
110 	 */
111 	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
112 	if (srq_is_umap) {
113 		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
114 		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
115 		if (status != DDI_SUCCESS) {
116 			goto srqalloc_fail;
117 		}
118 		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
119 	}
120 
121 	/* Increase PD refcnt */
122 	tavor_pd_refcnt_inc(pd);
123 
124 	/* Allocate an SRQ context entry */
125 	status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
126 	if (status != DDI_SUCCESS) {
127 		goto srqalloc_fail1;
128 	}
129 
130 	/* Allocate the SRQ Handle entry */
131 	status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
132 	if (status != DDI_SUCCESS) {
133 		goto srqalloc_fail2;
134 	}
135 
136 	srq = (tavor_srqhdl_t)rsrc->tr_addr;
137 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))
138 
139 	srq->srq_srqnum = srqc->tr_indx;	/* just use index */
140 
141 	/*
142 	 * If this will be a user-mappable SRQ, then allocate an entry for
143 	 * the "userland resources database".  This will later be added to
144 	 * the database (after all further SRQ operations are successful).
145 	 * If we fail here, we must undo the reference counts and the
146 	 * previous resource allocation.
147 	 */
148 	if (srq_is_umap) {
149 		umapdb = tavor_umap_db_alloc(state->ts_instance,
150 		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
151 		    (uint64_t)(uintptr_t)rsrc);
152 		if (umapdb == NULL) {
153 			goto srqalloc_fail3;
154 		}
155 	}
156 
157 	/*
158 	 * Calculate the appropriate size for the SRQ.
159 	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
160 	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
161 	 * is to round the requested size up to the next highest power-of-2
162 	 */
163 	sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
164 	log_srq_size = highbit(sizes->srq_wr_sz);
165 	if (ISP2(sizes->srq_wr_sz)) {
166 		log_srq_size = log_srq_size - 1;
167 	}
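	/*
	 * Worked example (illustrative): a request for 1000 WRs gives
	 * highbit(1000) == 10 and, since 1000 is not a power-of-2, the size
	 * is rounded up to 2^10 == 1024 entries.  A request for exactly
	 * 1024 gives highbit(1024) == 11, and the ISP2() test drops it back
	 * to log_srq_size == 10, leaving the size at 1024.
	 */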
168 
169 	/*
170 	 * Next we verify that the rounded-up size is valid (i.e. consistent
171 	 * with the device limits and/or software-configured limits).  If not,
172 	 * then obviously we have a lot of cleanup to do before returning.
173 	 */
174 	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
175 		goto srqalloc_fail4;
176 	}
177 
178 	/*
179 	 * Next we verify that the requested number of SGL is valid (i.e.
180 	 * consistent with the device limits and/or software-configured
181 	 * limits).  If not, then obviously the same cleanup needs to be done.
182 	 */
183 	max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
184 	if (sizes->srq_sgl_sz > max_sgl) {
185 		goto srqalloc_fail4;
186 	}
187 
188 	/*
189 	 * Determine the SRQ's WQE sizes.  This depends on the requested
190 	 * number of SGLs.  Note: This also has the side-effect of
191 	 * calculating the real number of SGLs (for the calculated WQE size)
192 	 */
193 	tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
194 	    TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
195 	    &srq->srq_wq_sgl);
196 
197 	/*
198 	 * Allocate the memory for SRQ work queues.  Note:  The location from
199 	 * which we will allocate these work queues has been passed in through
200 	 * the tavor_qp_options_t structure.  Since Tavor work queues are not
201 	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
202 	 * queue memory is very important.  We used to allocate work queues
203 	 * (the combined receive and send queues) so that they would be aligned
204 	 * on their combined size.  That alignment guaranteed that they would
205 	 * never cross the 4GB boundary (Tavor work queues are on the order of
206 	 * MBs at maximum).  Now we are able to relax this alignment constraint
207 	 * by ensuring that the IB address assigned to the queue memory (as a
208 	 * result of the tavor_mr_register() call) is offset from zero.
209 	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
210 	 * guarantee the alignment, but when attempting to use IOMMU bypass
211 	 * mode we found that we were not allowed to specify any alignment that
212 	 * was more restrictive than the system page size.  So we avoided this
213 	 * constraint by passing two alignment values, one for the memory
214 	 * allocation itself and the other for the DMA handle (for later bind).
215 	 * This used to cause more memory than necessary to be allocated (in
216 	 * order to guarantee the more restrictive alignment constraint).  But
217 	 * by guaranteeing the zero-based IB virtual address for the queue, we
218 	 * are able to conserve this memory.
219 	 *
220 	 * Note: If SRQ is not user-mappable, then it may come from either
221 	 * kernel system memory or from HCA-attached local DDR memory.
222 	 *
223 	 * Note2: We align this queue on a pagesize boundary.  This is required
224 	 * to make sure that all the resulting IB addresses will start at 0, for
225 	 * a zero-based queue.  By making sure we are aligned on at least a
226 	 * page, any offset we use into our queue will be the same as when we
227 	 * perform tavor_srq_modify() operations later.
228 	 */
229 	wqesz = (1 << srq->srq_wq_log_wqesz);
230 	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
231 	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
232 	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
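	/*
	 * Illustrative sizing example: with 1024 WQEs of 64 bytes each,
	 * qa_size works out to 64KB, and both the allocation and the later
	 * DMA bind are page-aligned per the note above.
	 */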
233 	if (srq_is_umap) {
234 		srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
235 	} else {
236 		srq->srq_wqinfo.qa_location = wq_location;
237 	}
238 	status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
239 	if (status != DDI_SUCCESS) {
240 		goto srqalloc_fail4;
241 	}
242 	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
243 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
244 
245 	/*
246 	 * Register the memory for the SRQ work queues.  The memory for the SRQ
247 	 * must be registered in the Tavor TPT tables.  This gives us the LKey
248 	 * to specify in the SRQ context later.  Note: If the work queue is to
249 	 * be allocated from DDR memory, then only a "bypass" mapping is
250 	 * appropriate.  And if the SRQ memory is user-mappable, then we force
251 	 * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
252 	 * restriction, we pass the "mro_bind_override_addr" flag in the call
253 	 * to tavor_mr_register().  This guarantees that the resulting IB vaddr
254 	 * will be zero-based (modulo the offset into the first page).  If we
255 	 * fail here, we still have a bunch of resource and reference count
256 	 * cleanup to do.
257 	 */
258 	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
259 	    IBT_MR_NOSLEEP;
260 	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
261 	mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
262 	mr_attr.mr_as    = NULL;
263 	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
264 	if (srq_is_umap) {
265 		mr_op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
266 	} else {
267 		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
268 			mr_op.mro_bind_type =
269 			    state->ts_cfg_profile->cp_iommu_bypass;
270 			dma_xfer_mode =
271 			    state->ts_cfg_profile->cp_streaming_consistent;
272 			if (dma_xfer_mode == DDI_DMA_STREAMING) {
273 				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
274 			}
275 		} else {
276 			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
277 		}
278 	}
279 	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
280 	mr_op.mro_bind_override_addr = 1;
281 	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
282 	if (status != DDI_SUCCESS) {
283 		goto srqalloc_fail5;
284 	}
285 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
286 	addr = mr->mr_bindinfo.bi_addr;
287 	lkey = mr->mr_lkey;
288 
289 	/*
290 	 * Calculate the offset between the kernel virtual address space
291 	 * and the IB virtual address space.  This will be used when
292 	 * posting work requests to properly initialize each WQE.
293 	 */
294 	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
295 	    (uint64_t)mr->mr_bindinfo.bi_addr;
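	/*
	 * In other words, srq_desc_off is the constant difference between a
	 * WQE's kernel virtual address and its IB virtual address; because
	 * of the zero-based bind above, subtracting it from a descriptor's
	 * KVA yields the address that the hardware expects.
	 */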
296 
297 	/*
298 	 * Create WQL and Wridlist for use by this SRQ
299 	 */
300 	srq->srq_wrid_wql = tavor_wrid_wql_create(state);
301 	if (srq->srq_wrid_wql == NULL) {
302 		goto srqalloc_fail6;
303 	}
304 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))
305 
306 	srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
307 	if (srq->srq_wridlist == NULL) {
308 		goto srqalloc_fail7;
309 	}
310 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))
311 
312 	srq->srq_wridlist->wl_srq_en = 1;
313 	srq->srq_wridlist->wl_free_list_indx = -1;
314 
315 	/*
316 	 * Fill in all the return arguments (if necessary).  This includes
317 	 * real queue size and real SGLs.
318 	 */
319 	if (real_sizes != NULL) {
320 		real_sizes->srq_wr_sz = (1 << log_srq_size);
321 		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
322 	}
323 
324 	/*
325 	 * Fill in the SRQC entry.  This is the final step before passing
326 	 * ownership of the SRQC entry to the Tavor hardware.  We use all of
327 	 * the information collected/calculated above to fill in the
328 	 * requisite portions of the SRQC.  Note: If this SRQ is going to be
329 	 * used for userland access, then we need to set the UAR page number
330 	 * appropriately (otherwise it's a "don't care")
331 	 */
332 	bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
333 	srqc_entry.wqe_addr_h	   = (addr >> 32);
334 	srqc_entry.next_wqe_addr_l = 0;
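	/* "ds" expresses the WQE (descriptor) size in 16-byte chunks */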
335 	srqc_entry.ds		   = (wqesz >> 4);
336 	srqc_entry.state	   = TAVOR_SRQ_STATE_HW_OWNER;
337 	srqc_entry.pd		   = pd->pd_pdnum;
338 	srqc_entry.lkey		   = lkey;
339 	srqc_entry.wqe_cnt	   = 0;
340 	if (srq_is_umap) {
341 		srqc_entry.uar	   = uarpg;
342 	} else {
343 		srqc_entry.uar	   = 0;
344 	}
345 
346 	/*
347 	 * Write the SRQC entry to hardware.  Lastly, we pass ownership of
348 	 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
349 	 * command).  Note: In general, this operation shouldn't fail.  But
350 	 * if it does, we have to undo everything we've done above before
351 	 * returning error.
352 	 */
353 	status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
354 	    sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
355 	    sleepflag);
356 	if (status != TAVOR_CMD_SUCCESS) {
357 		cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
358 		    status);
359 		goto srqalloc_fail8;
360 	}
361 
362 	/*
363 	 * Fill in the rest of the Tavor SRQ handle.  We can update
364 	 * the following fields for use in further operations on the SRQ.
365 	 */
366 	srq->srq_srqcrsrcp = srqc;
367 	srq->srq_rsrcp	   = rsrc;
368 	srq->srq_mrhdl	   = mr;
369 	srq->srq_refcnt	   = 0;
370 	srq->srq_is_umap   = srq_is_umap;
371 	srq->srq_uarpg	   = (srq->srq_is_umap) ? uarpg : 0;
372 	srq->srq_umap_dhp  = (devmap_cookie_t)NULL;
373 	srq->srq_pdhdl	   = pd;
374 	srq->srq_wq_lastwqeindx = -1;
375 	srq->srq_wq_bufsz  = (1 << log_srq_size);
376 	srq->srq_wq_buf	   = buf;
377 	srq->srq_desc_off  = srq_desc_off;
378 	srq->srq_hdlrarg   = (void *)ibt_srqhdl;
379 	srq->srq_state	   = 0;
380 	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
381 	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;
382 
383 	/* Determine if later ddi_dma_sync will be necessary */
384 	srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
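	/*
	 * (A sync is generally only required when the queue memory was set
	 * up for streaming, i.e. non-coherent, DMA transfers.)
	 */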
385 
386 	/*
387 	 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list.  Then fill in the
388 	 * "srqhdl" and return success
389 	 */
390 	ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
391 	state->ts_srqhdl[srqc->tr_indx] = srq;
392 
393 	/*
394 	 * If this is a user-mappable SRQ, then we need to insert the
395 	 * previously allocated entry into the "userland resources database".
396 	 * This will allow for later lookup during devmap() (i.e. mmap())
397 	 * calls.
398 	 */
399 	if (srq->srq_is_umap) {
400 		tavor_umap_db_add(umapdb);
401 	} else {
402 		mutex_enter(&srq->srq_wrid_wql->wql_lock);
403 		tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
404 		mutex_exit(&srq->srq_wrid_wql->wql_lock);
405 	}
406 
407 	*srqhdl = srq;
408 
409 	return (status);
410 
411 /*
412  * The following is cleanup for all possible failure cases in this routine
413  */
414 srqalloc_fail8:
415 	kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
416 	    sizeof (tavor_wrid_entry_t));
417 	kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
418 srqalloc_fail7:
419 	tavor_wql_refcnt_dec(srq->srq_wrid_wql);
420 srqalloc_fail6:
421 	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
422 	    TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
423 		TAVOR_WARNING(state, "failed to deregister SRQ memory");
424 	}
425 srqalloc_fail5:
426 	tavor_queue_free(state, &srq->srq_wqinfo);
427 srqalloc_fail4:
428 	if (srq_is_umap) {
429 		tavor_umap_db_free(umapdb);
430 	}
431 srqalloc_fail3:
432 	tavor_rsrc_free(state, &rsrc);
433 srqalloc_fail2:
434 	tavor_rsrc_free(state, &srqc);
435 srqalloc_fail1:
436 	tavor_pd_refcnt_dec(pd);
437 srqalloc_fail:
438 	return (status);
439 }
440 
441 
442 /*
443  * tavor_srq_free()
444  *    Context: Can be called only from user or kernel context.
445  */
446 /* ARGSUSED */
447 int
448 tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
449 {
450 	tavor_rsrc_t		*srqc, *rsrc;
451 	tavor_umap_db_entry_t	*umapdb;
452 	uint64_t		value;
453 	tavor_srqhdl_t		srq;
454 	tavor_mrhdl_t		mr;
455 	tavor_pdhdl_t		pd;
456 	tavor_hw_srqc_t		srqc_entry;
457 	uint32_t		srqnum;
458 	uint32_t		size;
459 	uint_t			maxprot;
460 	int			status;
461 
462 	/*
463 	 * Pull all the necessary information from the Tavor Shared Receive
464 	 * Queue handle.  This is necessary here because the resource for the
465 	 * SRQ handle is going to be freed up as part of this operation.
466 	 */
467 	srq	= *srqhdl;
468 	mutex_enter(&srq->srq_lock);
469 	srqc	= srq->srq_srqcrsrcp;
470 	rsrc	= srq->srq_rsrcp;
471 	pd	= srq->srq_pdhdl;
472 	mr	= srq->srq_mrhdl;
473 	srqnum	= srq->srq_srqnum;
474 
475 	/*
476 	 * If there are QPs still associated with the SRQ, then return
477 	 * an error.  Otherwise, we will be holding the SRQ lock.
478 	 */
479 	if (srq->srq_refcnt != 0) {
480 		mutex_exit(&srq->srq_lock);
481 		return (IBT_SRQ_IN_USE);
482 	}
483 
484 	/*
485 	 * If this was a user-mappable SRQ, then we need to remove its entry
486 	 * from the "userland resources database".  If it is also currently
487 	 * mmap()'d out to a user process, then we need to call
488 	 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
489 	 * We also need to invalidate the SRQ tracking information for the
490 	 * user mapping.
491 	 */
492 	if (srq->srq_is_umap) {
493 		status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
494 		    MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
495 		    &umapdb);
496 		if (status != DDI_SUCCESS) {
497 			mutex_exit(&srq->srq_lock);
498 			TAVOR_WARNING(state, "failed to find in database");
499 			return (ibc_get_ci_failure(0));
500 		}
501 		tavor_umap_db_free(umapdb);
502 		if (srq->srq_umap_dhp != NULL) {
503 			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
504 			status = devmap_devmem_remap(srq->srq_umap_dhp,
505 			    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
506 			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
507 			if (status != DDI_SUCCESS) {
508 				mutex_exit(&srq->srq_lock);
509 				TAVOR_WARNING(state, "failed in SRQ memory "
510 				    "devmap_devmem_remap()");
511 				return (ibc_get_ci_failure(0));
512 			}
513 			srq->srq_umap_dhp = (devmap_cookie_t)NULL;
514 		}
515 	}
516 
517 	/*
518 	 * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
519 	 * in-progress events to detect that the SRQ corresponding to this
520 	 * number has been freed.
521 	 */
522 	state->ts_srqhdl[srqc->tr_indx] = NULL;
523 
524 	mutex_exit(&srq->srq_lock);
525 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
526 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));
527 
528 	/*
529 	 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
530 	 * firmware command).  If the ownership transfer fails for any reason,
531 	 * then it is an indication that something (either in HW or SW) has
532 	 * gone seriously wrong.
533 	 */
534 	status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
535 	    sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
536 	if (status != TAVOR_CMD_SUCCESS) {
537 		TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
538 		cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
539 		    status);
540 		return (IBT_FAILURE);
541 	}
542 
543 	/*
544 	 * Deregister the memory for the Shared Receive Queue.  If this fails
545 	 * for any reason, then it is an indication that something (either
546 	 * in HW or SW) has gone seriously wrong.  So we print a warning
547 	 * message and return.
548 	 */
549 	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
550 	    sleepflag);
551 	if (status != DDI_SUCCESS) {
552 		TAVOR_WARNING(state, "failed to deregister SRQ memory");
553 		return (IBT_FAILURE);
554 	}
555 
556 	/* Calculate the size and free the wridlist container */
557 	if (srq->srq_wridlist != NULL) {
558 		size = (srq->srq_wridlist->wl_size *
559 		    sizeof (tavor_wrid_entry_t));
560 		kmem_free(srq->srq_wridlist->wl_wre, size);
561 		kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
562 
563 		/*
564 		 * Release reference to WQL; If this is the last reference,
565 		 * this call also has the side effect of freeing up the
566 		 * 'srq_wrid_wql' memory.
567 		 */
568 		tavor_wql_refcnt_dec(srq->srq_wrid_wql);
569 	}
570 
571 	/* Free the memory for the SRQ */
572 	tavor_queue_free(state, &srq->srq_wqinfo);
573 
574 	/* Free the Tavor SRQ Handle */
575 	tavor_rsrc_free(state, &rsrc);
576 
577 	/* Free the SRQC entry resource */
578 	tavor_rsrc_free(state, &srqc);
579 
580 	/* Decrement the reference count on the protection domain (PD) */
581 	tavor_pd_refcnt_dec(pd);
582 
583 	/* Set the srqhdl pointer to NULL and return success */
584 	*srqhdl = NULL;
585 
586 	return (DDI_SUCCESS);
587 }
588 
589 
590 /*
591  * tavor_srq_modify()
592  *    Context: Can be called only from user or kernel context.
593  */
594 int
595 tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
596     uint_t *real_size, uint_t sleepflag)
597 {
598 	tavor_qalloc_info_t	new_srqinfo, old_srqinfo;
599 	tavor_rsrc_t		*mtt, *mpt, *old_mtt;
600 	tavor_bind_info_t	bind;
601 	tavor_bind_info_t	old_bind;
602 	tavor_rsrc_pool_info_t	*rsrc_pool;
603 	tavor_mrhdl_t		mr;
604 	tavor_hw_mpt_t		mpt_entry;
605 	tavor_wrid_entry_t	*wre_new, *wre_old;
606 	uint64_t		mtt_ddrbaseaddr, mtt_addr;
607 	uint64_t		srq_desc_off;
608 	uint32_t		*buf, srq_old_bufsz;
609 	uint32_t		wqesz;
610 	uint_t			max_srq_size;
611 	uint_t			dma_xfer_mode, mtt_pgsize_bits;
612 	uint_t			srq_sync, log_srq_size, maxprot;
613 	uint_t			wq_location;
614 	int			status;
615 
616 	/*
617 	 * Check the "inddr" flag.  This flag tells the driver whether
618 	 * the SRQ's work queues should come from normal system memory or
619 	 * whether they should be allocated from DDR memory.
620 	 */
621 	wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;
622 
623 	/*
624 	 * If the requested size is larger than the device capability, return
625 	 * IBT_HCA_WR_EXCEEDED
626 	 */
627 	max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
628 	if (size > max_srq_size) {
629 		return (IBT_HCA_WR_EXCEEDED);
630 	}
631 
632 	/*
633 	 * Calculate the appropriate size for the SRQ.
634 	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
635 	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
636 	 * is to round the requested size up to the next highest power-of-2
637 	 */
638 	size = max(size, TAVOR_SRQ_MIN_SIZE);
639 	log_srq_size = highbit(size);
640 	if (ISP2(size)) {
641 		log_srq_size = log_srq_size - 1;
642 	}
643 
644 	/*
645 	 * Next we verify that the rounded-up size is valid (i.e. consistent
646 	 * with the device limits and/or software-configured limits).
647 	 */
648 	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
649 		goto srqmodify_fail;
650 	}
651 
652 	/*
653 	 * Allocate the memory for newly resized Shared Receive Queue.
654 	 *
655 	 * Note: If SRQ is not user-mappable, then it may come from either
656 	 * kernel system memory or from HCA-attached local DDR memory.
657 	 *
658 	 * Note2: We align this queue on a pagesize boundary.  This is required
659 	 * to make sure that all the resulting IB addresses will start at 0,
660 	 * for a zero-based queue.  By making sure we are aligned on at least a
661 	 * page, any offset we use into our queue will be the same as it was
662 	 * when we allocated it at tavor_srq_alloc() time.
663 	 */
664 	wqesz = (1 << srq->srq_wq_log_wqesz);
665 	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
666 	new_srqinfo.qa_alloc_align = PAGESIZE;
667 	new_srqinfo.qa_bind_align  = PAGESIZE;
668 	if (srq->srq_is_umap) {
669 		new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
670 	} else {
671 		new_srqinfo.qa_location = wq_location;
672 	}
673 	status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
674 	if (status != DDI_SUCCESS) {
675 		goto srqmodify_fail;
676 	}
677 	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
678 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
679 
680 	/*
681 	 * Allocate the memory for the new WRE list.  This will be used later
682 	 * when we resize the wridlist based on the new SRQ size.
683 	 */
684 	wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
685 	    sizeof (tavor_wrid_entry_t), sleepflag);
686 	if (wre_new == NULL) {
687 		goto srqmodify_fail;
688 	}
689 
690 	/*
691 	 * Fill in the "bind" struct.  This struct provides the majority
692 	 * of the information that will be used to distinguish between an
693 	 * "addr" binding (as is the case here) and a "buf" binding (see
694 	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
695 	 * which does most of the "heavy lifting" for the Tavor memory
696 	 * registration routines.
697 	 */
698 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
699 	bzero(&bind, sizeof (tavor_bind_info_t));
700 	bind.bi_type  = TAVOR_BINDHDL_VADDR;
701 	bind.bi_addr  = (uint64_t)(uintptr_t)buf;
702 	bind.bi_len   = new_srqinfo.qa_size;
703 	bind.bi_as    = NULL;
704 	bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
705 	    IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
706 	if (srq->srq_is_umap) {
707 		bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
708 	} else {
709 		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
710 			bind.bi_bypass =
711 			    state->ts_cfg_profile->cp_iommu_bypass;
712 			dma_xfer_mode =
713 			    state->ts_cfg_profile->cp_streaming_consistent;
714 			if (dma_xfer_mode == DDI_DMA_STREAMING) {
715 				bind.bi_flags |= IBT_MR_NONCOHERENT;
716 			}
717 		} else {
718 			bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
719 		}
720 	}
721 	status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
722 	    &mtt_pgsize_bits);
723 	if (status != DDI_SUCCESS) {
724 		kmem_free(wre_new, srq->srq_wq_bufsz *
725 		    sizeof (tavor_wrid_entry_t));
726 		tavor_queue_free(state, &new_srqinfo);
727 		goto srqmodify_fail;
728 	}
729 
730 	/*
731 	 * Calculate the offset between the kernel virtual address space
732 	 * and the IB virtual address space.  This will be used when
733 	 * posting work requests to properly initialize each WQE.
734 	 *
735 	 * Note: bind addr is zero-based (from alloc) so we calculate the
736 	 * correct new offset here.
737 	 */
738 	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
739 	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
740 	    (uint64_t)bind.bi_addr;
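	/*
	 * As at allocation time, masking bi_addr down to its in-page offset
	 * keeps the queue's IB address zero-based, so the descriptor offset
	 * computed here matches the one established in tavor_srq_alloc().
	 */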
741 
742 	/*
743 	 * Get the base address for the MTT table.  This will be necessary
744 	 * below when we are modifying the MPT entry.
745 	 */
746 	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
747 	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
748 
749 	/*
750 	 * Fill in the MPT entry.  This is the final step before posting the
751 	 * MODIFY_MPT command to the hardware below.  We use all of
752 	 * the information collected/calculated above to fill in the
753 	 * requisite portions of the MPT.
754 	 */
755 	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
756 	mpt_entry.reg_win_len	= bind.bi_len;
757 	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
758 	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
759 	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
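	/*
	 * The low-order MTT segment address field carries bits 31:6 of the
	 * address; MTT segments are assumed to be 64-byte aligned, so the
	 * low six bits are implied zero.
	 */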
760 
761 	/*
762 	 * Now we grab the SRQ lock.  Since we will be updating the actual
763 	 * SRQ location and the producer/consumer indexes, we should hold
764 	 * the lock.
765 	 *
766 	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
767 	 * holding the "srq_lock" and if we got raised to interrupt level
768 	 * by priority inversion, we would not want to block in this routine
769 	 * waiting for success.
770 	 */
771 	mutex_enter(&srq->srq_lock);
772 
773 	/*
774 	 * Copy old entries to new buffer
775 	 */
776 	srq_old_bufsz = srq->srq_wq_bufsz;
777 	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
778 
779 	/* Determine if later ddi_dma_sync will be necessary */
780 	srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
781 
782 	/* Sync entire "new" SRQ for use by hardware (if necessary) */
783 	if (srq_sync) {
784 		(void) ddi_dma_sync(bind.bi_dmahdl, 0,
785 		    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
786 	}
787 
788 	/*
789 	 * Setup MPT information for use in the MODIFY_MPT command
790 	 */
791 	mr = srq->srq_mrhdl;
792 	mutex_enter(&mr->mr_lock);
793 	mpt = srq->srq_mrhdl->mr_mptrsrcp;
794 
795 	/*
796 	 * MODIFY_MPT
797 	 *
798 	 * If this fails for any reason, then it is an indication that
799 	 * something (either in HW or SW) has gone seriously wrong.  So we
800 	 * print a warning message and return.
801 	 */
802 	status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
803 	    TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
804 	if (status != TAVOR_CMD_SUCCESS) {
805 		cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
806 		    status);
807 		(void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
808 		    srq->srq_mrhdl->mr_mttrsrcp);
809 		kmem_free(wre_new, srq->srq_wq_bufsz *
810 		    sizeof (tavor_wrid_entry_t));
811 		tavor_queue_free(state, &new_srqinfo);
812 		mutex_exit(&mr->mr_lock);
813 		mutex_exit(&srq->srq_lock);
814 		return (ibc_get_ci_failure(0));
815 	}
816 
817 	/*
818 	 * Update the Tavor Shared Receive Queue handle with all the new
819 	 * information.  At the same time, save away all the necessary
820 	 * information for freeing up the old resources
821 	 */
822 	old_srqinfo	   = srq->srq_wqinfo;
823 	old_mtt		   = srq->srq_mrhdl->mr_mttrsrcp;
824 	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
825 	    sizeof (tavor_bind_info_t));
826 
827 	/* Now set the new info */
828 	srq->srq_wqinfo	   = new_srqinfo;
829 	srq->srq_wq_buf	   = buf;
830 	srq->srq_wq_bufsz  = (1 << log_srq_size);
831 	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
832 	srq->srq_mrhdl->mr_mttrsrcp = mtt;
833 	srq->srq_desc_off  = srq_desc_off;
834 	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
835 
836 	/* Update MR mtt pagesize */
837 	mr->mr_logmttpgsz = mtt_pgsize_bits;
838 	mutex_exit(&mr->mr_lock);
839 
840 #ifdef __lock_lint
841 	mutex_enter(&srq->srq_wrid_wql->wql_lock);
842 #else
843 	if (srq->srq_wrid_wql != NULL) {
844 		mutex_enter(&srq->srq_wrid_wql->wql_lock);
845 	}
846 #endif
847 
848 	/*
849 	 * Initialize new wridlist, if needed.
850 	 *
851 	 * If a wridlist is already set up on this SRQ (i.e. a QP associated
852 	 * with the SRQ has moved "from_reset"), then we must update the
853 	 * wridlist based on the new SRQ size.  We allocate the new number of
854 	 * Work Request ID Entries, copy the old entries over to the new
855 	 * list, and re-initialize the SRQ wridlist in the non-umap case.
856 	 */
857 	wre_old = NULL;
858 	if (srq->srq_wridlist != NULL) {
859 		wre_old = srq->srq_wridlist->wl_wre;
860 
861 		bcopy(wre_old, wre_new, srq_old_bufsz *
862 		    sizeof (tavor_wrid_entry_t));
863 
864 		/* Setup new sizes in wre */
865 		srq->srq_wridlist->wl_wre = wre_new;
866 		srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;
867 
868 		if (!srq->srq_is_umap) {
869 			tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
870 			    srq_old_bufsz);
871 		}
872 	}
873 
874 #ifdef __lock_lint
875 	mutex_exit(&srq->srq_wrid_wql->wql_lock);
876 #else
877 	if (srq->srq_wrid_wql != NULL) {
878 		mutex_exit(&srq->srq_wrid_wql->wql_lock);
879 	}
880 #endif
881 
882 	/*
883 	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
884 	 * to a user process, then we need to call devmap_devmem_remap() to
885 	 * invalidate the mapping to the SRQ memory.  We also need to
886 	 * invalidate the SRQ tracking information for the user mapping.
887 	 *
888 	 * Note: The remap operation really shouldn't ever fail.  So, if it
889 	 * does, it is an indication that something has gone seriously wrong.
890 	 * So we print a warning message and return error (knowing, of course,
891 	 * that the "old" SRQ memory will be leaked)
892 	 */
893 	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
894 		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
895 		status = devmap_devmem_remap(srq->srq_umap_dhp,
896 		    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
897 		    DEVMAP_MAPPING_INVALID, NULL);
898 		if (status != DDI_SUCCESS) {
899 			mutex_exit(&srq->srq_lock);
900 			TAVOR_WARNING(state, "failed in SRQ memory "
901 			    "devmap_devmem_remap()");
902 			/* We can, however, free the memory for old wre */
903 			if (wre_old != NULL) {
904 				kmem_free(wre_old, srq_old_bufsz *
905 				    sizeof (tavor_wrid_entry_t));
906 			}
907 			return (ibc_get_ci_failure(0));
908 		}
909 		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
910 	}
911 
912 	/*
913 	 * Drop the SRQ lock now.  The only thing left to do is to free up
914 	 * the old resources.
915 	 */
916 	mutex_exit(&srq->srq_lock);
917 
918 	/*
919 	 * Unbind the MTT entries.
920 	 */
921 	status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
922 	if (status != DDI_SUCCESS) {
923 		TAVOR_WARNING(state, "failed to unbind old SRQ memory");
924 		goto srqmodify_fail;
925 	}
926 
927 	/* Free the memory for old wre */
928 	if (wre_old != NULL) {
929 		kmem_free(wre_old, srq_old_bufsz *
930 		    sizeof (tavor_wrid_entry_t));
931 	}
932 
933 	/* Free the memory for the old SRQ */
934 	tavor_queue_free(state, &old_srqinfo);
935 
936 	/*
937 	 * Fill in the return arguments (if necessary).  This includes the
938 	 * real new SRQ size.
939 	 */
940 	if (real_size != NULL) {
941 		*real_size = (1 << log_srq_size);
942 	}
943 
944 	return (DDI_SUCCESS);
945 
946 srqmodify_fail:
947 	return (status);
948 }
949 
950 
951 /*
952  * tavor_srq_refcnt_inc()
953  *    Context: Can be called from interrupt or base context.
954  */
955 void
956 tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
957 {
958 	mutex_enter(&srq->srq_lock);
959 	srq->srq_refcnt++;
960 	mutex_exit(&srq->srq_lock);
961 }
962 
963 
964 /*
965  * tavor_srq_refcnt_dec()
966  *    Context: Can be called from interrupt or base context.
967  */
968 void
969 tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
970 {
971 	mutex_enter(&srq->srq_lock);
972 	srq->srq_refcnt--;
973 	mutex_exit(&srq->srq_lock);
974 }
975 
976 
977 /*
978  * tavor_srqhdl_from_srqnum()
979  *    Context: Can be called from interrupt or base context.
980  *
981  *    This routine is important because changing the unconstrained
982  *    portion of the SRQ number is critical to the detection of a
983  *    potential race condition in the SRQ handler code (i.e. the case
984  *    where a SRQ is freed and alloc'd again before an event for the
985  *    "old" SRQ can be handled).
986  *
987  *    While this is not a perfect solution (not sure that one exists)
988  *    it does help to mitigate the chance that this race condition will
989  *    cause us to deliver a "stale" event to the new SRQ owner.  Note:
990  *    this solution does not scale well because the number of constrained
991  *    bits increases (and, hence, the number of unconstrained bits
992  * decreases) as the number of supported SRQs grows.  For small and
993  *    intermediate values, it should hopefully provide sufficient
994  *    protection.
995  */
996 tavor_srqhdl_t
997 tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
998 {
999 	uint_t	srqindx, srqmask;
1000 
1001 	/* Calculate the SRQ table index from the srqnum */
1002 	srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
1003 	srqindx = srqnum & srqmask;
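	/*
	 * Illustrative example: with cp_log_num_srq == 16 the mask is
	 * 0xFFFF, so SRQ numbers that differ only in their upper
	 * (unconstrained) bits all map to the same table slot.
	 */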
1004 	return (state->ts_srqhdl[srqindx]);
1005 }
1006 
1007 
1008 /*
1009  * tavor_srq_sgl_to_logwqesz()
1010  *    Context: Can be called from interrupt or base context.
1011  */
1012 static void
1013 tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
1014     tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
1015 {
1016 	uint_t	max_size, log2, actual_sgl;
1017 
1018 	switch (wq_type) {
1019 	case TAVOR_QP_WQ_TYPE_RECVQ:
1020 		/*
1021 		 * Use requested maximum SGL to calculate max descriptor size
1022 		 * (while guaranteeing that the descriptor size is a
1023 		 * power-of-2 number of cachelines).
1024 		 */
1025 		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
1026 		log2 = highbit(max_size);
1027 		if (ISP2(max_size)) {
1028 			log2 = log2 - 1;
1029 		}
1030 
1031 		/* Make sure descriptor is at least the minimum size */
1032 		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
1033 
1034 		/* Calculate actual number of SGL (given WQE size) */
1035 		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
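		/*
		 * Each SGL entry occupies 16 bytes (hence "num_sgl << 4");
		 * after rounding the descriptor size up to a power-of-2,
		 * actual_sgl reports how many entries fit in the rounded-up
		 * WQE alongside the receive headers.
		 */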
1036 		break;
1037 
1038 	default:
1039 		TAVOR_WARNING(state, "unexpected work queue type");
1040 		break;
1041 	}
1042 
1043 	/* Fill in the return values */
1044 	*logwqesz = log2;
1045 	*max_sgl  = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);
1046 }
1047