/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/pattr.h>			/* HCK_* */
#include <inet/ip.h>			/* ipha_t */
#include <inet/tcp.h>			/* tcph_t */
#include <sys/mac_provider.h>		/* mac_* */
#include <sys/strsun.h>			/* MBLKL */

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static int eib_data_setup_cqs(eib_t *, eib_vnic_t *);
static int eib_data_setup_ud_channel(eib_t *, eib_vnic_t *);
static void eib_data_setup_lso(eib_wqe_t *, mblk_t *, uint32_t,
    eib_ether_hdr_t *);
static int eib_data_prepare_sgl(eib_vnic_t *, eib_wqe_t *, mblk_t *);
static int eib_data_is_mcast_pkt_ok(eib_vnic_t *, uint8_t *, uint64_t *,
    uint64_t *);
static void eib_data_rx_comp_intr(ibt_cq_hdl_t, void *);
static void eib_data_tx_comp_intr(ibt_cq_hdl_t, void *);
static mblk_t *eib_data_rx_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_data_tx_comp(eib_vnic_t *, eib_wqe_t *, eib_chan_t *);
static void eib_data_err_comp(eib_vnic_t *, eib_wqe_t *, ibt_wc_t *);
static void eib_rb_data_setup_cqs(eib_t *, eib_vnic_t *);
static void eib_rb_data_setup_ud_channel(eib_t *, eib_vnic_t *);


int
eib_data_create_qp(eib_t *ss, eib_vnic_t *vnic, int *err)
{
	eib_chan_t *chan = NULL;

	/*
	 * Allocate an eib_chan_t to hold state for this vnic's data qp
	 * and initialize it with the default admin qp pkey parameters.
	 * We'll re-associate this with the pkey we receive from the gw
	 * once we receive the login ack.
	 */
	vnic->vn_data_chan = eib_chan_init();

	chan = vnic->vn_data_chan;
	chan->ch_pkey = ss->ei_admin_chan->ch_pkey;
	chan->ch_pkey_ix = ss->ei_admin_chan->ch_pkey_ix;
	chan->ch_vnic_inst = vnic->vn_instance;

	/*
	 * Setup tx/rx CQs and completion handlers
	 */
	if (eib_data_setup_cqs(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_cqs(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	/*
	 * Setup UD channel
	 */
	if (eib_data_setup_ud_channel(ss, vnic) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_create_qp: "
		    "eib_data_setup_ud_channel(vn_inst=0x%x) failed",
		    vnic->vn_instance);
		*err = ENOMEM;
		goto data_create_qp_fail;
	}

	return (EIB_E_SUCCESS);

data_create_qp_fail:
	eib_rb_data_create_qp(ss, vnic);
	return (EIB_E_FAILURE);
}

/*ARGSUSED*/
uint_t
eib_data_rx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	mblk_t *mp;
	mblk_t *head = NULL;
	mblk_t *tail = NULL;
	ibt_status_t ret;
	uint_t pkts_per_call = 0;
	uint_t polled;
	uint_t rbytes;
	uint_t ipkts;
	uint_t num_wc;
	int i;

	/*
	 * Re-arm the rx notification callback before we start polling
	 * the completion queue.  There's not much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * We don't want to be stuck in receive processing for too long without
	 * giving others a chance.
	 */
	num_wc = (chan->ch_rcv_cq_sz < EIB_MAX_RX_PKTS_ONINTR) ?
	    chan->ch_rcv_cq_sz : EIB_MAX_RX_PKTS_ONINTR;

	/*
	 * Handle rx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_rcv_cq_hdl, chan->ch_rcv_wc,
	    num_wc, &polled)) == IBT_SUCCESS) {

		rbytes = ipkts = 0;
		head = tail = NULL;

		for (wc = chan->ch_rcv_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX);

			/*
			 * Clear the posted-to-hca flag and decrement the
			 * posted-rwqe count
			 */
			wqe->qe_info &= (~EIB_WQE_FLG_POSTED_TO_HCA);
			eib_rsrc_decr_posted_rwqe(ss, chan);

			rbytes += wc->wc_bytes_xfer;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_ierrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				ipkts++;
				mp = eib_data_rx_comp(vnic, wqe, wc);
				if (mp == NULL) {
					continue;
				} else {
					/*
					 * Add this mp to the list we'll
					 * hand up to the nw layer. Note
					 * that the wqe could've been
					 * returned to the pool if we're
					 * running low, so don't touch
					 * the wqe beyond this point.
					 */
					if (head)
						tail->b_next = mp;
					else
						head = mp;
					tail = mp;
				}
			}
		}

		/*
		 * We reduce the number of atomic updates to key statistics
		 * by batching them here, once per ibt_poll_cq().  The accuracy
		 * and consistency of the published statistics within a cq
		 * polling cycle will be compromised a little bit, but that
		 * should be ok, given that we probably gain a little bit by
		 * not having to do these atomic operations per packet.
		 */
		EIB_UPDATE_COUNTER(&stats->st_rbytes, rbytes);
		EIB_UPDATE_COUNTER(&stats->st_ipkts, ipkts);

		pkts_per_call += ipkts;

		if (head) {
			mac_rx(ss->ei_mac_hdl, NULL, head);
		}

		/*
		 * If we have processed too many packets in one attempt, we'll
		 * have to come back here later.
		 */
		if (pkts_per_call >= EIB_MAX_RX_PKTS_ONINTR) {
			(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl,
			    NULL);
			break;
		}

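		/*
		 * Shrink the poll budget for the next ibt_poll_cq() round
		 * by the number of completions we just processed.
		 */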
		num_wc -= polled;
	}

	return (DDI_INTR_CLAIMED);
}

/*ARGSUSED*/
uint_t
eib_data_tx_comp_handler(caddr_t arg1, caddr_t arg2)
{
	eib_vnic_t *vnic = (eib_vnic_t *)(void *)arg1;
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_stats_t *stats = ss->ei_stats;
	ibt_wc_t *wc;
	eib_wqe_t *wqe;
	ibt_status_t ret;
	uint_t polled;
	int i;

	/*
	 * Re-arm the tx notification callback before we start polling
	 * the completion queue.  There's not much we can do if the
	 * enable_cq_notify fails - we issue a warning and move on.
	 */
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_tx_comp_handler: "
		    "ibt_enable_cq_notify() failed, ret=%d", ret);
	}

	/*
	 * Handle tx completions
	 */
	while ((ret = ibt_poll_cq(chan->ch_cq_hdl, chan->ch_wc, chan->ch_cq_sz,
	    &polled)) == IBT_SUCCESS) {
		for (wc = chan->ch_wc, i = 0; i < polled; i++, wc++) {
			wqe = (eib_wqe_t *)(uintptr_t)wc->wc_id;

			ASSERT(EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_TX);

			if (wc->wc_status != IBT_WC_SUCCESS) {
				EIB_INCR_COUNTER(&stats->st_oerrors);
				eib_data_err_comp(vnic, wqe, wc);
			} else {
				eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
			}
		}
	}

	return (DDI_INTR_CLAIMED);
}

void
eib_data_rx_recycle(caddr_t arg)
{
	eib_wqe_t *rwqe = (eib_wqe_t *)(void *)arg;
	eib_t *ss = rwqe->qe_pool->wp_ss;
	eib_chan_t *vn_chan;
	uint_t nic_state;
	int ret;

	/*
	 * We come here from three places: (a) from the nw layer, when it
	 * is done with the rx mblk we handed to it and calls freemsg(),
	 * (b) from eib_data_rx_comp(), if the rx completion processing
	 * discovers that the received EoIB packet has a problem and
	 * (c) from eib_data_err_comp(), if we're tearing down this
	 * channel.  We only need to repost the rwqe if we're being called
	 * back from the nw layer.  For the other two cases, we simply
	 * return the rwqe to the pool. Also, since we would've already
	 * updated the ch_rx_posted counters in the rx completion handler,
	 * we don't pass the chan pointer to eib_rsrc_return_rwqe() from
	 * within this routine.
	 */
	rwqe->qe_mp = NULL;
	if ((rwqe->qe_info & EIB_WQE_FLG_WITH_NW) == 0) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	rwqe->qe_info &= (~EIB_WQE_FLG_WITH_NW);

	/*
	 * If the buffers are being returned by the nw layer after a long
	 * time, this eoib instance could've been stopped by now.  If so,
	 * simply return the rwqe to the pool.
	 */
	nic_state = eib_mac_get_nic_state(ss);
	if ((nic_state & EIB_NIC_STARTED) != EIB_NIC_STARTED) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Or it could've taken even longer and the nic could've been
	 * restarted.  The only thing we can do is to make sure that the
	 * channel pointer recorded in the rwqe still corresponds to this
	 * vnic instance's current data channel.
	 */
	vn_chan = eib_vnic_get_data_chan(ss, rwqe->qe_vnic_inst);
	if (vn_chan == NULL || vn_chan != rwqe->qe_chan) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
		return;
	}

	/*
	 * Try to repost the rwqe if we're not tearing down this channel
	 */
	if (vn_chan->ch_tear_down) {
		eib_rsrc_return_rwqe(ss, rwqe, NULL);
	} else {
		ret = eib_chan_post_recv(ss, vn_chan, rwqe);
		if (ret != EIB_E_SUCCESS) {
			if (rwqe->qe_mp)
				freemsg(rwqe->qe_mp);
			else
				eib_rsrc_return_rwqe(ss, rwqe, NULL);
		}
	}
}

void
eib_data_post_tx(eib_vnic_t *vnic, eib_wqe_t *swqe)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_send_wr_t wrs[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *wqes[EIB_MAX_POST_MULTIPLE];
	eib_wqe_t *elem;
	ibt_status_t ret;
	uint_t n_wrs;
	uint_t n_posted;
	uint_t total_failed = 0;
	uint_t n_failed = 0;
	uint_t i;

	/*
	 * See if we have room for this wqe and then add it to the
	 * list of tx wrs to post in this channel.
	 */
	mutex_enter(&chan->ch_tx_lock);

	if ((chan->ch_tx_posted + 1) >= (chan->ch_max_swqes - 1)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
		    "too many swqes posted already, posted=0x%lx, "
		    "max=0x%lx", chan->ch_tx_posted, chan->ch_max_swqes);
		mutex_exit(&chan->ch_tx_lock);
		return;
	}

	swqe->qe_nxt_post = NULL;
	if (chan->ch_tx) {
		chan->ch_tx_tail->qe_nxt_post = swqe;
	} else {
		chan->ch_tx = swqe;
	}
	chan->ch_tx_tail = swqe;
	chan->ch_tx_posted++;		/* pre-increment */

	/*
	 * If someone's already posting tx wqes in this channel, let
	 * them post ours as well.
	 */
	if (chan->ch_tx_busy == B_TRUE) {
		mutex_exit(&chan->ch_tx_lock);
		return;
	}
	chan->ch_tx_busy = B_TRUE;

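	/*
	 * Drain the tx chain: keep posting as long as wqes remain queued
	 * on ch_tx.  Anything other threads add while we hold ch_tx_busy
	 * gets picked up in a subsequent iteration.
	 */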
	while (chan->ch_tx) {
		/*
		 * Post EIB_MAX_POST_MULTIPLE wrs at a time
		 */
		for (n_wrs = 0, elem = chan->ch_tx;
		    (elem) && (n_wrs < EIB_MAX_POST_MULTIPLE);
		    elem = elem->qe_nxt_post, n_wrs++) {
			wqes[n_wrs] = elem;
			wrs[n_wrs] = (elem->qe_wr).send;
		}
		chan->ch_tx = elem;
		if (elem == NULL) {
			chan->ch_tx_tail = NULL;
		}
		mutex_exit(&chan->ch_tx_lock);

		ASSERT(n_wrs != 0);

		/*
		 * If posting multiple wrs fails for some reason, we'll try
		 * posting the unposted ones one by one.  If even that fails,
		 * we'll release any mappings/buffers/mblks associated with
		 * this wqe and return it to the pool.
		 */
		n_posted = n_failed = 0;
		ret = ibt_post_send(chan->ch_chan, wrs, n_wrs, &n_posted);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_post_tx: "
			    "ibt_post_send(n_wrs=0x%lx, n_posted=0x%lx) "
			    "failed, ret=%d", n_wrs, n_posted, ret);

			for (i = n_posted; i < n_wrs; i++) {
				ret = ibt_post_send(chan->ch_chan, &wrs[i],
				    1, NULL);
				if (ret != IBT_SUCCESS) {
					n_failed++;
					eib_data_tx_comp(vnic, wqes[i], chan);

					EIB_DPRINTF_WARN(ss->ei_instance,
					    "eib_data_post_tx: "
					    "ibt_post_send(n_wrs=1) failed, "
					    "ret=%d", ret);
				}
			}
		}
		total_failed += n_failed;

		mutex_enter(&chan->ch_tx_lock);
	}

	chan->ch_tx_busy = B_FALSE;
	mutex_exit(&chan->ch_tx_lock);

	/*
	 * If we failed to post something, update error stats
	 */
	if (total_failed) {
		EIB_UPDATE_COUNTER(&stats->st_oerrors, total_failed);
	}
}

void
eib_data_parse_ether_hdr(mblk_t *mp, eib_ether_hdr_t *evh)
{
	struct ether_vlan_header *vl_hdr;
	struct ether_header *hdr;

	/*
	 * Assume that the ether header (with or without vlan tag) is
	 * contained in one fragment
	 */
	hdr = (struct ether_header *)(void *)mp->b_rptr;
	vl_hdr = (struct ether_vlan_header *)(void *)mp->b_rptr;

	evh->eh_ether_type = ntohs(hdr->ether_type);
	if (evh->eh_ether_type != ETHERTYPE_VLAN) {
		evh->eh_tagless = 1;
		evh->eh_vlan = 0;
		ether_copy((void *)hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	} else {
		evh->eh_ether_type = ntohs(vl_hdr->ether_type);
		evh->eh_tagless = 0;
		evh->eh_vlan = VLAN_ID(ntohs(vl_hdr->ether_tci));
		ether_copy((void *)vl_hdr->ether_dhost.ether_addr_octet,
		    (void *)evh->eh_dmac);
		ether_copy((void *)vl_hdr->ether_shost.ether_addr_octet,
		    (void *)evh->eh_smac);
	}
}

int
eib_data_lookup_vnic(eib_t *ss, uint8_t *mac, uint16_t vlan, eib_vnic_t **vnicp,
    boolean_t *failed)
{
	eib_vnic_t *vnic;
	eib_vnic_req_t *vrq;
	uint8_t *vn_mac;
	uint16_t vn_vlan;
	uint64_t av;
	int inst = 0;

	if (mac == NULL)
		return (EIB_E_FAILURE);

	/*
	 * For now, do a simple linear search (but only over the instances
	 * we've allocated). Note that if we're in the process of creating
	 * a vnic, the instance might've been allocated, but the vnic entry
	 * would still be NULL.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	av = ss->ei_active_vnics;
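	/*
	 * Walk the active-vnic bitmap, clearing each bit as we examine the
	 * corresponding instance.
	 */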
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			vn_mac = vnic->vn_login_data.ld_assigned_mac;
			vn_vlan = vnic->vn_login_data.ld_assigned_vlan;

			if ((vn_vlan == vlan) &&
			    (bcmp(vn_mac, mac, ETHERADDRL) == 0)) {
				if (vnicp) {
					*vnicp = vnic;
				}
				mutex_exit(&ss->ei_vnic_lock);
				return (EIB_E_SUCCESS);
			}
		}

		av &= (~((uint64_t)1 << inst));
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * If we haven't been able to locate a vnic for this {mac,vlan} tuple,
	 * see if we've already failed a creation request for this vnic, and
	 * return that information.
	 */
	if (failed) {
		mutex_enter(&ss->ei_vnic_req_lock);
		*failed = B_FALSE;
		for (vrq = ss->ei_failed_vnic_req; vrq; vrq = vrq->vr_next) {
			if ((vrq->vr_vlan == vlan) &&
			    (bcmp(vrq->vr_mac, mac, ETHERADDRL) == 0)) {
				*failed = B_TRUE;
			}
		}
		mutex_exit(&ss->ei_vnic_req_lock);
	}

	return (EIB_E_FAILURE);
}

int
eib_data_prepare_frame(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp,
    eib_ether_hdr_t *evh)
{
	uint32_t mss;
	uint32_t lsoflags;
	uint32_t hckflags;

	/*
	 * The swqe defaults are set to use the regular ud work request
	 * member and the IBT_WRC_SEND opcode, so we don't need to do
	 * anything here if this isn't an LSO packet.
	 */
	mac_lso_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) == HW_LSO)
		eib_data_setup_lso(swqe, mp, mss, evh);

	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) {
		swqe->qe_wr.send.wr_flags |= IBT_WR_SEND_CKSUM;
	} else {
		swqe->qe_wr.send.wr_flags &= (~IBT_WR_SEND_CKSUM);
	}

	if (eib_data_prepare_sgl(vnic, swqe, mp) != 0)
		return (EIB_E_FAILURE);

	swqe->qe_mp = mp;

	return (EIB_E_SUCCESS);
}

void
eib_rb_data_create_qp(eib_t *ss, eib_vnic_t *vnic)
{
	eib_rb_data_setup_ud_channel(ss, vnic);

	eib_rb_data_setup_cqs(ss, vnic);

	eib_chan_fini(vnic->vn_data_chan);
	vnic->vn_data_chan = NULL;
}

static int
eib_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint_t snd_sz;
	uint_t rcv_sz;
	int rv;

	/*
	 * Allocate send completion queue. Note that we've already verified
	 * that cp_max_swqe and cp_max_rwqe meet the max cq size requirements
	 * of the hca.
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_swqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_cq_hdl, &snd_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(snd_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_cq_hdl, EIB_TX_COMP_COUNT,
	    EIB_TX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(snd_comp_count=0x%lx, snd_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_TX_COMP_COUNT, EIB_TX_COMP_USEC, ret);
	}

	/*
	 * Allocate receive completion queue
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
	cq_attr.cq_size = ss->ei_caps->cp_max_rwqe + 1;

	ret = ibt_alloc_cq(ss->ei_hca_hdl, &cq_attr, &chan->ch_rcv_cq_hdl,
	    &rcv_sz);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_alloc_cq(rcv_cq_sz=0x%lx) failed, ret=%d",
		    cq_attr.cq_size, ret);
		goto setup_data_cqs_fail;
	}
	ret = ibt_modify_cq(chan->ch_rcv_cq_hdl, EIB_RX_COMP_COUNT,
	    EIB_RX_COMP_USEC, 0);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_modify_cq(rcv_comp_count=0x%lx, rcv_comp_usec=0x%lx) "
		    "failed, ret=%d",
		    EIB_RX_COMP_COUNT, EIB_RX_COMP_USEC, ret);
	}

	/*
	 * Set up parameters for collecting tx and rx completion information
	 */
	chan->ch_cq_sz = snd_sz;
	chan->ch_wc = kmem_zalloc(sizeof (ibt_wc_t) * snd_sz, KM_SLEEP);
	chan->ch_rcv_cq_sz = rcv_sz;
	chan->ch_rcv_wc = kmem_zalloc(sizeof (ibt_wc_t) * rcv_sz, KM_SLEEP);

	/*
	 * Set up the vnic's data tx completion queue handler and allocate
	 * a softint for it as well.
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_tx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_tx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data tx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_cq_hdl, eib_data_tx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for tx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	/*
	 * And then the data rx completion queue handler
	 */
	if ((rv = ddi_intr_add_softint(ss->ei_dip, &vnic->vn_data_rx_si_hdl,
	    EIB_SOFTPRI_DATA, eib_data_rx_comp_handler, vnic)) != DDI_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ddi_intr_add_softint() failed for data rx qp, ret=%d", rv);
		goto setup_data_cqs_fail;
	}
	ibt_set_cq_handler(chan->ch_rcv_cq_hdl, eib_data_rx_comp_intr, vnic);
	ret = ibt_enable_cq_notify(chan->ch_rcv_cq_hdl, IBT_NEXT_COMPLETION);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_cqs: "
		    "ibt_enable_cq_notify() failed for rx cq, ret=%d", ret);
		goto setup_data_cqs_fail;
	}

	return (EIB_E_SUCCESS);

setup_data_cqs_fail:
	eib_rb_data_setup_cqs(ss, vnic);
	return (EIB_E_FAILURE);
}

static int
eib_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_ud_chan_alloc_args_t alloc_attr;
	ibt_ud_chan_query_attr_t query_attr;
	ibt_status_t ret;

	bzero(&alloc_attr, sizeof (ibt_ud_chan_alloc_args_t));
	bzero(&query_attr, sizeof (ibt_ud_chan_query_attr_t));

	alloc_attr.ud_flags = IBT_ALL_SIGNALED;
	if (ss->ei_caps->cp_resv_lkey_capab)
		alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
	if (ss->ei_caps->cp_lso_maxlen)
		alloc_attr.ud_flags |= IBT_USES_LSO;

	alloc_attr.ud_hca_port_num = ss->ei_props->ep_port_num;
	alloc_attr.ud_pkey_ix = chan->ch_pkey_ix;
	alloc_attr.ud_sizes.cs_sq = ss->ei_caps->cp_max_swqe;
	alloc_attr.ud_sizes.cs_rq = ss->ei_caps->cp_max_rwqe;
	alloc_attr.ud_sizes.cs_sq_sgl = ss->ei_caps->cp_max_sgl;
	alloc_attr.ud_sizes.cs_rq_sgl = 1;
	alloc_attr.ud_sizes.cs_inline = 0;

	alloc_attr.ud_qkey = EIB_DATA_QKEY;
	alloc_attr.ud_scq = chan->ch_cq_hdl;
	alloc_attr.ud_rcq = chan->ch_rcv_cq_hdl;
	alloc_attr.ud_pd = ss->ei_pd_hdl;

	ret = ibt_alloc_ud_channel(ss->ei_hca_hdl, IBT_ACHAN_NO_FLAGS,
	    &alloc_attr, &chan->ch_chan, NULL);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
		    "ibt_alloc_ud_channel(port=0x%x, pkey_ix=0x%x, "
		    "cs_sq=0x%lx, cs_rq=0x%lx, sq_sgl=0x%lx) failed, ret=%d",
		    alloc_attr.ud_hca_port_num, chan->ch_pkey_ix,
		    alloc_attr.ud_sizes.cs_sq, alloc_attr.ud_sizes.cs_rq,
		    alloc_attr.ud_sizes.cs_sq_sgl, ret);

		goto setup_data_ud_channel_fail;
	}

	ret = ibt_query_ud_channel(chan->ch_chan, &query_attr);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_setup_ud_channel: "
		    "ibt_query_ud_channel() failed, ret=%d", ret);
		goto setup_data_ud_channel_fail;
	}

	chan->ch_qpn = query_attr.ud_qpn;
	chan->ch_max_swqes = query_attr.ud_chan_sizes.cs_sq;
	chan->ch_max_rwqes = query_attr.ud_chan_sizes.cs_rq;
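	/*
	 * Low-water mark for rwqes is a quarter of the rx queue depth;
	 * the rwqe bucket size is capped at EIB_DATA_RWQE_BKT.
	 */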
	chan->ch_lwm_rwqes = chan->ch_max_rwqes >> 2;
	chan->ch_rwqe_bktsz = (chan->ch_max_rwqes < EIB_DATA_RWQE_BKT) ?
	    chan->ch_max_rwqes : EIB_DATA_RWQE_BKT;
	chan->ch_ip_hdr_align = EIB_IP_HDR_ALIGN;
	chan->ch_alloc_mp = B_TRUE;
	chan->ch_tear_down = B_FALSE;

	return (EIB_E_SUCCESS);

setup_data_ud_channel_fail:
	eib_rb_data_setup_ud_channel(ss, vnic);
	return (EIB_E_FAILURE);
}

static void
eib_data_setup_lso(eib_wqe_t *swqe, mblk_t *mp, uint32_t mss,
    eib_ether_hdr_t *evh)
{
	ibt_wr_lso_t *lso;
	mblk_t  *nmp;
	uint8_t *dst;
	uintptr_t ip_start;
	uintptr_t tcp_start;
	uint_t pending;
	uint_t mblen;
	uint_t eth_hdr_len;
	uint_t ip_hdr_len;
	uint_t tcp_hdr_len;

	/*
	 * When the swqe was grabbed, it would've had its wr_opcode and
	 * wr.ud.udwr_dest set to default values. Since we're now going
	 * to use LSO, we need to change these.
	 */
	swqe->qe_wr.send.wr_opcode = IBT_WRC_SEND_LSO;
	lso = &(swqe->qe_wr.send.wr.ud_lso);
	lso->lso_ud_dest = swqe->qe_dest;
	lso->lso_mss = mss;

	/*
	 * The details of the ethernet header in the mp are already known
	 * to us
	 */
	eth_hdr_len = (evh->eh_tagless) ? (sizeof (struct ether_header)) :
	    (sizeof (struct ether_vlan_header));

	/*
	 * Calculate the LSO header size and set it in the UD LSO structure.
	 * Note that the only assumption we make is that each of the Ethernet,
	 * IP and TCP headers will be contained in a single mblk fragment;
	 * together, the headers may span multiple mblk fragments. Note also
	 * that since the EoIB encapsulation header is not part of the message
	 * block we receive, we'll need to account for the space to insert it
	 * later.
	 */
	nmp = mp;
	ip_start = (uintptr_t)(nmp->b_rptr) + eth_hdr_len;
	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
		ip_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (ip_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)ip_start);

	tcp_start = ip_start + ip_hdr_len;
	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	tcp_hdr_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);

	/*
	 * Since the passed mp fragment never contains the EoIB encapsulation
	 * header, we always have to copy the lso header. Sigh.
	 */
	lso->lso_hdr = swqe->qe_payload_hdr;
	lso->lso_hdr_sz = EIB_ENCAP_HDR_SZ + eth_hdr_len +
	    ip_hdr_len + tcp_hdr_len;

	/*
	 * We already have the EoIB encapsulation header written at the
	 * start of wqe->qe_payload_hdr during swqe acquisition.  Only
	 * copy the remaining headers.
	 */
	dst = lso->lso_hdr + EIB_ENCAP_HDR_SZ;
	pending = lso->lso_hdr_sz - EIB_ENCAP_HDR_SZ;

	for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
		mblen = MBLKL(nmp);
		if (pending > mblen) {
			bcopy(nmp->b_rptr, dst, mblen);
			dst += mblen;
			pending -= mblen;
		} else {
			bcopy(nmp->b_rptr, dst, pending);
			break;
		}
	}
}

static int
eib_data_prepare_sgl(eib_vnic_t *vnic, eib_wqe_t *swqe, mblk_t *mp)
{
	eib_t *ss = vnic->vn_ss;
	eib_stats_t *stats = vnic->vn_ss->ei_stats;
	ibt_iov_t iov_arr[EIB_MAX_SGL];
	ibt_iov_attr_t iov_attr;
	ibt_wr_ds_t *sgl;
	ibt_status_t ret;
	mblk_t *nmp;
	mblk_t *data_mp;
	uchar_t *bufp;
	size_t blksize;
	size_t skip;
	size_t avail;
	uint_t lsohdr_sz;
	uint_t pktsz;
	ptrdiff_t frag_len;
	uint_t pending_hdr;
	uint_t nblks;
	uint_t i;

	/*
	 * Let's skip ahead to the TCP data if this is LSO.  Note that while
	 * the lso header size in the swqe includes the EoIB encapsulation
	 * header size, that encapsulation header itself won't be found in
	 * the mblk.
	 */
	lsohdr_sz = (swqe->qe_wr.send.wr_opcode == IBT_WRC_SEND) ? 0 :
	    swqe->qe_wr.send.wr.ud_lso.lso_hdr_sz;

	data_mp = mp;
	pending_hdr = 0;
	if (lsohdr_sz) {
		pending_hdr = lsohdr_sz - EIB_ENCAP_HDR_SZ;
		for (nmp = mp; nmp; nmp = nmp->b_cont) {
			frag_len =
			    (uintptr_t)nmp->b_wptr - (uintptr_t)nmp->b_rptr;
			if (frag_len > pending_hdr)
				break;
			pending_hdr -= frag_len;
		}
		data_mp = nmp;  /* start of data past lso header */
		ASSERT(data_mp != NULL);
	}

	/*
	 * If this is an LSO packet, we want pktsz to hold the size of the
	 * data following the eoib/ethernet/tcp/ip headers.  If this is a
	 * non-LSO packet, we want pktsz to refer to the size of the entire
	 * packet with all the headers, and nblks to hold the number of
	 * mappings we'll need to iov map this (for reserved lkey request).
	 */
	if (lsohdr_sz == 0) {
		nblks = 1;
		pktsz = EIB_ENCAP_HDR_SZ;
	} else {
		nblks = 0;
		pktsz = 0;
	}
	for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
		pktsz += MBLKL(nmp);
		nblks++;
	}
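	/*
	 * For LSO, the first data fragment may still carry part of the LSO
	 * headers; discount that portion from the payload size.
	 */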
	pktsz -= pending_hdr;

	EIB_UPDATE_COUNTER(&stats->st_obytes, pktsz);
	EIB_INCR_COUNTER(&stats->st_opkts);

	/*
	 * We only do ibt_map_mem_iov() if the pktsz is above the tx copy
	 * threshold and if the number of mp fragments is less than the
	 * maximum acceptable.
	 */
	if ((ss->ei_caps->cp_resv_lkey_capab) && (pktsz > EIB_TX_COPY_THRESH) &&
	    (nblks < ss->ei_caps->cp_hiwm_sgl)) {

		iov_attr.iov_as = NULL;
		iov_attr.iov = iov_arr;
		iov_attr.iov_buf = NULL;
		iov_attr.iov_list_len = nblks;
		iov_attr.iov_wr_nds = ss->ei_caps->cp_max_sgl;
		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
		iov_attr.iov_flags = IBT_IOV_SLEEP;

		i = 0;
		if (lsohdr_sz == 0) {
			iov_arr[i].iov_addr = (caddr_t)swqe->qe_payload_hdr;
			iov_arr[i].iov_len = EIB_ENCAP_HDR_SZ;
			i++;
		}
		for (nmp = data_mp; i < nblks; i++, nmp = nmp->b_cont) {
			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
			iov_arr[i].iov_len = MBLKL(nmp);
			if (nmp == data_mp) {
				iov_arr[i].iov_addr += pending_hdr;
				iov_arr[i].iov_len -= pending_hdr;
			}
		}
		swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_MAPPED;
		swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;

		ret = ibt_map_mem_iov(ss->ei_hca_hdl, &iov_attr,
		    &swqe->qe_wr, &swqe->qe_iov_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_data_prepare_sgl: "
			    "ibt_map_mem_iov(nblks=0x%lx) failed, ret=%d, "
			    "attempting to use copy path", nblks, ret);
			goto prepare_sgl_copy_path;
		}

		return (EIB_E_SUCCESS);
	}

prepare_sgl_copy_path:
	if (pktsz <= swqe->qe_bufsz) {
		swqe->qe_wr.send.wr_nds = 1;
		swqe->qe_wr.send.wr_sgl = &swqe->qe_sgl;
		swqe->qe_sgl.ds_len = pktsz;

		/*
		 * Even though this is the copy path for transfers less than
		 * qe_bufsz, it could still be an LSO packet.  If so, we only
		 * have to write the data following all the headers into the
		 * work request buffer, since we'll be sending the lso header
		 * itself separately. If this is not an LSO send (but pkt size
		 * greater than mtu, say for a jumbo frame), then we need
		 * to write all the headers, including the EoIB encapsulation
		 * header, into the work request buffer.
		 */
		bufp = (uchar_t *)(uintptr_t)swqe->qe_sgl.ds_va;
		if (lsohdr_sz == 0) {
			*(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
			bufp += EIB_ENCAP_HDR_SZ;
		}
		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
			blksize = MBLKL(nmp) - pending_hdr;
			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
			bufp += blksize;
			pending_hdr = 0;
		}

		/*
		 * If the ethernet frame we're going to send is less than
		 * ETHERMIN, pad the buffer up to ETHERMIN (with zeros)
		 */
		if ((pktsz + lsohdr_sz) < (ETHERMIN + EIB_ENCAP_HDR_SZ)) {
			bzero(bufp, (ETHERMIN + EIB_ENCAP_HDR_SZ) -
			    (pktsz + lsohdr_sz));
			swqe->qe_sgl.ds_len = ETHERMIN + EIB_ENCAP_HDR_SZ;
		}
		return (EIB_E_SUCCESS);
	}

	/*
	 * Copy path for transfers greater than swqe->qe_bufsz
	 */
	swqe->qe_wr.send.wr_sgl = swqe->qe_big_sgl;
	if (eib_rsrc_grab_lsobufs(ss, pktsz, swqe->qe_wr.send.wr_sgl,
	    &(swqe->qe_wr.send.wr_nds)) != EIB_E_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_prepare_sgl: "
		    "eib_rsrc_grab_lsobufs() failed");
		return (EIB_E_FAILURE);
	}
	swqe->qe_info |= EIB_WQE_FLG_BUFTYPE_LSO;

	/*
	 * Copy the larger-than-qe_bufsz packet into a set of fixed-size,
	 * pre-mapped LSO buffers. Note that we might need to skip part of
	 * the LSO header in the first fragment as before.
	 */
	nmp = data_mp;
	skip = pending_hdr;
	for (i = 0; i < swqe->qe_wr.send.wr_nds; i++) {
		sgl = swqe->qe_wr.send.wr_sgl + i;
		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
		avail = EIB_LSO_BUFSZ;

		/*
		 * If this is a non-LSO packet (perhaps a jumbo frame?)
		 * we may still need to prefix the EoIB header in the
		 * wr buffer.
		 */
		if ((i == 0) && (lsohdr_sz == 0)) {
			*(uint32_t *)((void *)bufp) = htonl(EIB_TX_ENCAP_HDR);
			bufp += EIB_ENCAP_HDR_SZ;
			avail -= EIB_ENCAP_HDR_SZ;
		}

		while (nmp && avail) {
			blksize = MBLKL(nmp) - skip;
			if (blksize > avail) {
				bcopy(nmp->b_rptr + skip, bufp, avail);
				skip += avail;
				avail = 0;
			} else {
				bcopy(nmp->b_rptr + skip, bufp, blksize);
				skip = 0;
				bufp += blksize;
				avail -= blksize;
				nmp = nmp->b_cont;
			}
		}
	}

	return (EIB_E_SUCCESS);
}

/*ARGSUSED*/
static int
eib_data_is_mcast_pkt_ok(eib_vnic_t *vnic, uint8_t *macaddr, uint64_t *brdcst,
    uint64_t *multicst)
{
	/*
	 * If the dmac is the broadcast address, let it through.  Otherwise,
	 * we should either be in promiscuous mode or the dmac should be in
	 * our list of joined multicast addresses. Currently, we only update
	 * the stat counters and always let the packet through.
	 */
	if (bcmp(macaddr, eib_broadcast_mac, ETHERADDRL) == 0)
		EIB_INCR_COUNTER(brdcst);
	else
		EIB_INCR_COUNTER(multicst);

	return (1);
}

static void
eib_data_rx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;

	if (cq_hdl != chan->ch_rcv_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_rx_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_rcv_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_rcv_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_data_rx_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_data_rx_si_hdl, NULL);
}

static void
eib_data_tx_comp_intr(ibt_cq_hdl_t cq_hdl, void *arg)
{
	eib_vnic_t *vnic = arg;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_t *ss = vnic->vn_ss;

	if (cq_hdl != chan->ch_cq_hdl) {
		EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_data_tx_comp_intr: "
		    "cq_hdl(0x%llx) != chan->ch_cq_hdl(0x%llx), "
		    "ignoring completion", cq_hdl, chan->ch_cq_hdl);
		return;
	}

	ASSERT(vnic->vn_data_tx_si_hdl != NULL);

	(void) ddi_intr_trigger_softint(vnic->vn_data_tx_si_hdl, NULL);
}

static mblk_t *
eib_data_rx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;
	eib_chan_t *chan = vnic->vn_data_chan;
	eib_login_data_t *ld = &vnic->vn_login_data;
	eib_stats_t *stats = ss->ei_stats;
	eib_ether_hdr_t evh;
	mblk_t *mp;
	boolean_t allocd_mp = B_FALSE;
	uint_t ec_hdr;
	uint_t ec_sign;
	uint_t ec_ver;
	uint_t ec_tu_cs;
	uint_t ec_ip_cs;

	/*
	 * Before we process this mblk and send it up to the network layer,
	 * see if we're running low on rwqes in the wqe pool. If so, allocate
	 * a new mblk, copy the received data into it and send it up (and
	 * return the current rwqe back to the pool immediately by calling
	 * freemsg() on the original mblk).
	 */
	if (!eib_rsrc_rxpool_low(wqe)) {
		mp = wqe->qe_mp;
	} else {
		if ((mp = allocb(wc->wc_bytes_xfer, BPRI_HI)) != NULL) {
			bcopy(wqe->qe_mp->b_rptr, mp->b_rptr,
			    wc->wc_bytes_xfer);
			freemsg(wqe->qe_mp);
			allocd_mp = B_TRUE;
		} else {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "wqe level below watermark, dropping rx pkt");
			EIB_INCR_COUNTER(&stats->st_norcvbuf);
			freemsg(wqe->qe_mp);
			return (NULL);
		}
	}

	/*
	 * Adjust the write pointer depending on how much data came in. Note
	 * that since the nw layer will expect us to hand over the mp with
	 * the ethernet header starting at mp->b_rptr, update the b_rptr as
	 * well.
	 */
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;

	/*
	 * We have a problem if this really happens!
	 */
	if (mp->b_next != NULL) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "received packet's b_next not NULL, possible dup from cq");
		mp->b_next = NULL;
	}

	/*
	 * Drop loopback packets
	 */
	if ((wc->wc_slid == ss->ei_props->ep_blid) &&
	    (wc->wc_qpn == chan->ch_qpn)) {
		goto data_rx_comp_fail;
	}

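	/*
	 * Skip past the IB GRH that precedes the EoIB payload in every UD
	 * receive buffer.
	 */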
	mp->b_rptr += EIB_GRH_SZ;

	/*
	 * Since the recv buffer has been aligned for the IP header to start
	 * on a word boundary, it is safe to say that the EoIB and ethernet
	 * headers won't start on a word boundary.
	 */
	bcopy(mp->b_rptr, &ec_hdr, EIB_ENCAP_HDR_SZ);

	/*
	 * Check EoIB signature and version
	 */
	ec_hdr = ntohl(ec_hdr);

	ec_sign = (ec_hdr >> EIB_ENCAP_SIGN_SHIFT) & EIB_ENCAP_SIGN_MASK;
	if (ec_sign != EIB_EH_SIGNATURE) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header signature (0x%lx) unknown",
		    ec_sign);
		goto data_rx_comp_fail;
	}

	ec_ver = (ec_hdr >> EIB_ENCAP_VER_SHIFT) & EIB_ENCAP_VER_MASK;
	if (ec_ver != EIB_EH_VERSION) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header version (0x%lx) unknown",
		    ec_ver);
		goto data_rx_comp_fail;
	}

	/*
	 * Check TCP/UDP and IP checksum
	 */
	ec_tu_cs = (ec_hdr >> EIB_ENCAP_TCPCHK_SHIFT) & EIB_ENCAP_TCPCHK_MASK;
	ec_ip_cs = (ec_hdr >> EIB_ENCAP_IPCHK_SHIFT) & EIB_ENCAP_IPCHK_MASK;

	if ((ec_tu_cs == EIB_EH_UDPCSUM_OK || ec_tu_cs == EIB_EH_TCPCSUM_OK) &&
	    (ec_ip_cs == EIB_EH_IPCSUM_OK)) {
		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
	} else if (ec_tu_cs == EIB_EH_CSUM_BAD || ec_ip_cs == EIB_EH_CSUM_BAD) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "EoIB encapsulation header tcp/udp checksum (0x%lx) or "
		    "ip checksum (0x%lx) is bad", ec_tu_cs, ec_ip_cs);
	}

	/*
	 * Update the message block's b_rptr to the start of the ethernet
	 * header and parse the header information
	 */
	mp->b_rptr += EIB_ENCAP_HDR_SZ;
	eib_data_parse_ether_hdr(mp, &evh);

	/*
	 * If the incoming packet is vlan-tagged, but the tag doesn't match
	 * this vnic's vlan, drop it.
	 */
	if ((evh.eh_tagless == 0) && (evh.eh_vlan != ld->ld_assigned_vlan)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "received packet's vlan unknown, expected=0x%x, got=0x%x",
		    ld->ld_assigned_vlan, evh.eh_vlan);
		goto data_rx_comp_fail;
	}

	/*
	 * Final checks to see if the unicast destination is indeed correct
	 * and to see if the multicast address is ok for us.
	 */
	if (EIB_UNICAST_MAC(evh.eh_dmac)) {
		if (bcmp(evh.eh_dmac, ld->ld_assigned_mac, ETHERADDRL) != 0) {
			uint8_t *exp;
			uint8_t *got;

			exp = ld->ld_assigned_mac;
			got = evh.eh_dmac;

			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "received packet's macaddr mismatch, "
			    "expected=%x:%x:%x:%x:%x:%x, got=%x:%x:%x:%x:%x:%x",
			    exp[0], exp[1], exp[2], exp[3], exp[4], exp[5],
			    got[0], got[1], got[2], got[3], got[4], got[5]);

			goto data_rx_comp_fail;
		}
	} else {
		if (!eib_data_is_mcast_pkt_ok(vnic, evh.eh_dmac,
		    &stats->st_brdcstrcv, &stats->st_multircv)) {
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
			    "multicast packet not ok");
			goto data_rx_comp_fail;
		}
	}

	/*
	 * Strip the ethernet FCS if present in the packet.  ConnectX-2
	 * doesn't support ethernet FCS, so this shouldn't happen anyway.
	 */
	if ((ec_hdr >> EIB_ENCAP_FCS_B_SHIFT) & 0x1) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_data_rx_comp: "
		    "ethernet FCS present (ec_hdr=0x%lx), stripping",
		    ec_hdr);

		mp->b_wptr -= ETHERFCSL;
	}

	/*
	 * If this is the same mp as was in the original rwqe (i.e. we didn't
	 * do any allocb()), then mark the rwqe flag so we know that its mblk
	 * is with the network layer.
	 */
	if (!allocd_mp) {
		wqe->qe_info |= EIB_WQE_FLG_WITH_NW;
	}

	return (mp);

data_rx_comp_fail:
	freemsg(mp);
	return (NULL);
}

static void
eib_data_tx_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, eib_chan_t *chan)
{
	eib_t *ss = vnic->vn_ss;
	ibt_status_t ret;

	if (wqe->qe_mp) {
		if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_MAPPED) {
			ret = ibt_unmap_mem_iov(ss->ei_hca_hdl,
			    wqe->qe_iov_hdl);
			if (ret != IBT_SUCCESS) {
				EIB_DPRINTF_WARN(ss->ei_instance,
				    "eib_data_tx_comp: "
				    "ibt_unmap_mem_iov() failed, ret=%d", ret);
			}
			wqe->qe_iov_hdl = NULL;
		} else if (wqe->qe_info & EIB_WQE_FLG_BUFTYPE_LSO) {
			eib_rsrc_return_lsobufs(ss, wqe->qe_big_sgl,
			    wqe->qe_wr.send.wr_nds);
		}
		freemsg(wqe->qe_mp);
		wqe->qe_mp = NULL;
	}

	eib_rsrc_return_swqe(ss, wqe, chan);
}

static void
eib_data_err_comp(eib_vnic_t *vnic, eib_wqe_t *wqe, ibt_wc_t *wc)
{
	eib_t *ss = vnic->vn_ss;

	/*
	 * Currently, all we do is report
	 */
	switch (wc->wc_status) {
	case IBT_WC_WR_FLUSHED_ERR:
		break;

	case IBT_WC_LOCAL_CHAN_OP_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
		    "IBT_WC_LOCAL_CHAN_OP_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;

	case IBT_WC_LOCAL_PROTECT_ERR:
		EIB_DPRINTF_ERR(ss->ei_instance, "eib_data_err_comp: "
		    "IBT_WC_LOCAL_PROTECT_ERR seen, wqe_info=0x%lx ",
		    wqe->qe_info);
		break;
	}

	/*
	 * When a wc indicates error, we do not attempt to repost the
	 * rwqe but simply return it to the wqe pool. Also for rwqes,
	 * attempting to free the mblk in the wqe invokes the
	 * eib_data_rx_recycle() callback.  For tx wqes, error handling
	 * is the same as successful completion handling.  We still
	 * have to unmap iov/free lsobufs/free mblk and then return the
	 * swqe to the pool.
	 */
	if (EIB_WQE_TYPE(wqe->qe_info) == EIB_WQE_RX) {
		ASSERT(wqe->qe_mp != NULL);
		freemsg(wqe->qe_mp);
	} else {
		eib_data_tx_comp(vnic, wqe, vnic->vn_data_chan);
	}
}

/*ARGSUSED*/
static void
eib_rb_data_setup_cqs(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	/*
	 * Reset any completion handlers we may have set up
	 */
	if (chan->ch_rcv_cq_hdl) {
		ibt_set_cq_handler(chan->ch_rcv_cq_hdl, NULL, NULL);
	}
	if (chan->ch_cq_hdl) {
		ibt_set_cq_handler(chan->ch_cq_hdl, NULL, NULL);
	}

	/*
	 * Remove any softints that were added
	 */
	if (vnic->vn_data_rx_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_data_rx_si_hdl);
		vnic->vn_data_rx_si_hdl = NULL;
	}
	if (vnic->vn_data_tx_si_hdl) {
		(void) ddi_intr_remove_softint(vnic->vn_data_tx_si_hdl);
		vnic->vn_data_tx_si_hdl = NULL;
	}

	/*
	 * Release any work completion buffers we may have allocated
	 */
	if (chan->ch_rcv_wc && chan->ch_rcv_cq_sz) {
		kmem_free(chan->ch_rcv_wc,
		    sizeof (ibt_wc_t) * chan->ch_rcv_cq_sz);
	}
	chan->ch_rcv_cq_sz = 0;
	chan->ch_rcv_wc = NULL;

	if (chan->ch_wc && chan->ch_cq_sz) {
		kmem_free(chan->ch_wc, sizeof (ibt_wc_t) * chan->ch_cq_sz);
	}
	chan->ch_cq_sz = 0;
	chan->ch_wc = NULL;

	/*
	 * Free any completion queues we may have allocated
	 */
	if (chan->ch_rcv_cq_hdl) {
		ret = ibt_free_cq(chan->ch_rcv_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_cqs: "
			    "ibt_free_cq(rcv_cq) failed, ret=%d", ret);
		}
		chan->ch_rcv_cq_hdl = NULL;
	}
	if (chan->ch_cq_hdl) {
		ret = ibt_free_cq(chan->ch_cq_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_cqs: "
			    "ibt_free_cq(snd_cq) failed, ret=%d", ret);
		}
		chan->ch_cq_hdl = NULL;
	}
}

/*ARGSUSED*/
static void
eib_rb_data_setup_ud_channel(eib_t *ss, eib_vnic_t *vnic)
{
	eib_chan_t *chan = vnic->vn_data_chan;
	ibt_status_t ret;

	if (chan == NULL)
		return;

	if (chan->ch_chan) {
		/*
		 * We're trying to tear down this UD channel. Make sure that
		 * we don't attempt to refill (repost) at any point from now
		 * on.
		 */
		chan->ch_tear_down = B_TRUE;
		if ((ret = ibt_flush_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_ud_channel: "
			    "ibt_flush_channel() failed, ret=%d", ret);
		}

		/*
		 * Wait until all posted tx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_tx_lock);
		while (chan->ch_tx_posted > 0)
			cv_wait(&chan->ch_tx_cv, &chan->ch_tx_lock);
		mutex_exit(&chan->ch_tx_lock);

		/*
		 * Wait until all posted rx wqes on this channel are back with
		 * the wqe pool.
		 */
		mutex_enter(&chan->ch_rx_lock);
		while (chan->ch_rx_posted > 0)
			cv_wait(&chan->ch_rx_cv, &chan->ch_rx_lock);
		mutex_exit(&chan->ch_rx_lock);

		/*
		 * Now we're ready to free this channel
		 */
		if ((ret = ibt_free_channel(chan->ch_chan)) != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_rb_data_setup_ud_channel: "
			    "ibt_free_channel() failed, ret=%d", ret);
		}

		chan->ch_alloc_mp = B_FALSE;
		chan->ch_ip_hdr_align = 0;
		chan->ch_rwqe_bktsz = 0;
		chan->ch_lwm_rwqes = 0;
		chan->ch_max_rwqes = 0;
		chan->ch_max_swqes = 0;
		chan->ch_qpn = 0;
		chan->ch_chan = NULL;
	}
}