xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_send.c (revision 09b7f21a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IEEE 802.3ad Link Aggregation - Send code.
28  *
29  * Implements the Distributor function.
30  */
31 
32 #include <sys/conf.h>
33 #include <sys/modctl.h>
34 #include <sys/sunddi.h>
35 #include <sys/callb.h>
36 #include <sys/vlan.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 
41 #include <inet/common.h>
42 #include <inet/led.h>
43 #include <inet/ip.h>
44 #include <inet/ip6.h>
45 #include <inet/tcp.h>
46 #include <netinet/udp.h>
47 
48 #include <sys/aggr.h>
49 #include <sys/aggr_impl.h>
50 
51 /*
52  * Update the TX load balancing policy of the specified group.
53  */
54 void
aggr_send_update_policy(aggr_grp_t * grp,uint32_t policy)55 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
56 {
57 	uint8_t mac_policy = 0;
58 
59 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
60 
61 	if ((policy & AGGR_POLICY_L2) != 0)
62 		mac_policy |= MAC_PKT_HASH_L2;
63 	if ((policy & AGGR_POLICY_L3) != 0)
64 		mac_policy |= MAC_PKT_HASH_L3;
65 	if ((policy & AGGR_POLICY_L4) != 0)
66 		mac_policy |= MAC_PKT_HASH_L4;
67 
68 	grp->lg_tx_policy = policy;
69 	grp->lg_mac_tx_policy = mac_policy;
70 }
71 
/*
 * XOR-fold a value with copies of itself shifted right by 8, 16 and 24
 * bits, so that higher-order bytes also influence the low-order bits of
 * the result.
 */
#define	HASH_HINT(val)	\
	((val) ^ ((val) >> 8) ^ ((val) >> 16) ^ ((val) >> 24))
74 
75 /*
76  * Function invoked by mac layer to find a specific TX ring on a port
77  * to send data.
78  */
79 mblk_t *
aggr_find_tx_ring(void * arg,mblk_t * mp,uintptr_t hint,mac_ring_handle_t * rh)80 aggr_find_tx_ring(void *arg, mblk_t *mp, uintptr_t hint, mac_ring_handle_t *rh)
81 {
82 	aggr_grp_t *grp = arg;
83 	aggr_port_t *port;
84 	uint64_t hash;
85 
86 	rw_enter(&grp->lg_tx_lock, RW_READER);
87 	if (grp->lg_ntx_ports == 0) {
88 		/*
89 		 * We could have returned from aggr_m_start() before
90 		 * the ports were actually attached. Drop the chain.
91 		 */
92 		rw_exit(&grp->lg_tx_lock);
93 		freemsgchain(mp);
94 		return (NULL);
95 	}
96 	hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy, B_TRUE);
97 	port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
98 
99 	/*
100 	 * Use hash as the hint so to direct traffic to
101 	 * different TX rings. Note below bit operation
102 	 * is needed in case hint is 0 to get the most
103 	 * benefit from HASH_HINT() algorithm.
104 	 */
105 	if (port->lp_tx_ring_cnt > 1) {
106 		if (hint == 0) {
107 			hash = (hash << 24 | hash << 16 | hash);
108 			hash = (hash << 32 | hash);
109 		} else {
110 			hash = hint;
111 		}
112 		hash = HASH_HINT(hash);
113 		*rh = port->lp_pseudo_tx_rings[hash % port->lp_tx_ring_cnt];
114 	} else {
115 		*rh = port->lp_pseudo_tx_rings[0];
116 	}
117 	rw_exit(&grp->lg_tx_lock);
118 
119 	return (mp);
120 }
121 
/*
 * aggr_tx_notify_thread:
 *
 * aggr_tx_ring_update() callback function wakes up this thread when
 * it gets called. This thread will call mac_tx_ring_update() to
 * notify upper mac of flow control getting relieved. Note that
 * aggr_tx_ring_update() cannot call mac_tx_ring_update() directly
 * because aggr_tx_ring_update() is called from lower mac with
 * mi_rw_lock held.
 *
 * arg is the aggr_grp_t this thread serves. The thread runs until
 * lg_tx_notify_done is set on the group, then clears
 * lg_tx_notify_thread and exits. All accesses to the blocked-ring
 * array and count are made with lg_tx_flowctl_lock held.
 */
void
aggr_tx_notify_thread(void *arg)
{
	callb_cpr_t	cprinfo;
	aggr_grp_t	*grp = (aggr_grp_t *)arg;
	mac_ring_handle_t	pseudo_mrh;

	/* Register with the CPR framework, using the flow control lock. */
	CALLB_CPR_INIT(&cprinfo, &grp->lg_tx_flowctl_lock, callb_generic_cpr,
	    "aggr_tx_notify_thread");

	mutex_enter(&grp->lg_tx_flowctl_lock);
	while (!grp->lg_tx_notify_done) {
		if ((grp->lg_tx_blocked_cnt) == 0) {
			/*
			 * Nothing queued: sleep on the flow control cv.
			 * After waking, go back to the top of the loop so
			 * that lg_tx_notify_done is re-checked first.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&grp->lg_tx_flowctl_cv,
			    &grp->lg_tx_flowctl_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_tx_flowctl_lock);
			continue;
		}
		/*
		 * Drain the queued rings, taking them from the tail of
		 * lg_tx_blocked_rings[]. The lock is dropped around each
		 * mac_tx_ring_update() call and re-acquired afterwards, so
		 * the count is re-read on every iteration.
		 */
		while (grp->lg_tx_blocked_cnt != 0) {
			grp->lg_tx_blocked_cnt--;
			pseudo_mrh =
			    grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt];
			mutex_exit(&grp->lg_tx_flowctl_lock);
			mac_tx_ring_update(grp->lg_mh, pseudo_mrh);
			mutex_enter(&grp->lg_tx_flowctl_lock);
		}
	}
	/*
	 * The grp is being destroyed, exit the thread.
	 */
	grp->lg_tx_notify_thread = NULL;
	/* No explicit mutex_exit() follows; NOTE(review): CALLB_CPR_EXIT is expected to release the lock - confirm. */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
167 
168 /*
169  * Callback function registered with lower mac to receive wakeups from
170  * drivers when flow control is relieved (i.e. Tx descriptors are
171  * available).
172  */
173 void
aggr_tx_ring_update(void * arg1,uintptr_t arg2)174 aggr_tx_ring_update(void *arg1, uintptr_t arg2)
175 {
176 	aggr_port_t *port = (aggr_port_t *)arg1;
177 	mac_ring_handle_t mrh = (mac_ring_handle_t)arg2;
178 	mac_ring_handle_t pseudo_mrh;
179 	aggr_grp_t *grp = port->lp_grp;
180 	int i = 0;
181 
182 	if (mrh == NULL) {
183 		/*
184 		 * If the underlying NIC does not expose TX rings,
185 		 * still as pseudo TX ring is presented to the
186 		 * aggr mac.
187 		 */
188 		pseudo_mrh = port->lp_pseudo_tx_rings[0];
189 	} else {
190 		for (i = 0; i < port->lp_tx_ring_cnt; i++) {
191 			if (port->lp_tx_rings[i] == mrh)
192 				break;
193 		}
194 		ASSERT(i < port->lp_tx_ring_cnt);
195 		pseudo_mrh = port->lp_pseudo_tx_rings[i];
196 	}
197 	mutex_enter(&grp->lg_tx_flowctl_lock);
198 	/*
199 	 * It could be possible that some (broken?) device driver
200 	 * could send more than one wakeup on the same ring. In
201 	 * such a case, multiple instances of the same pseudo TX
202 	 * ring should not be saved in lg_tx_blocked_rings[]
203 	 * array. So first check if woken up ring (pseudo_mrh) is
204 	 * already in the lg_tx_blocked_rings[] array.
205 	 */
206 	for (i = 0; i < grp->lg_tx_blocked_cnt; i++) {
207 		if (grp->lg_tx_blocked_rings[i] == pseudo_mrh) {
208 			mutex_exit(&grp->lg_tx_flowctl_lock);
209 			return;
210 		}
211 	}
212 	/* A distinct mac_ring_handle. Save and increment count */
213 	grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt] = pseudo_mrh;
214 	grp->lg_tx_blocked_cnt++;
215 	cv_signal(&grp->lg_tx_flowctl_cv);
216 	mutex_exit(&grp->lg_tx_flowctl_lock);
217 }
218 
219 /*
220  * Send function invoked by the MAC service module.
221  */
222 mblk_t *
aggr_ring_tx(void * arg,mblk_t * mp)223 aggr_ring_tx(void *arg, mblk_t *mp)
224 {
225 	aggr_pseudo_tx_ring_t *pseudo_ring = (aggr_pseudo_tx_ring_t *)arg;
226 	aggr_port_t *port = pseudo_ring->atr_port;
227 
228 	return (mac_hwring_send_priv(port->lp_mch, pseudo_ring->atr_hw_rh, mp));
229 }
230 
231 /*
232  * Enable sending on the specified port.
233  */
234 void
aggr_send_port_enable(aggr_port_t * port)235 aggr_send_port_enable(aggr_port_t *port)
236 {
237 	aggr_grp_t *grp = port->lp_grp;
238 
239 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
240 
241 	if (port->lp_tx_enabled || (port->lp_state !=
242 	    AGGR_PORT_STATE_ATTACHED)) {
243 		/* already enabled or port not yet attached */
244 		return;
245 	}
246 
247 	/*
248 	 * Add to group's array of tx ports.
249 	 */
250 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
251 	if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
252 		/* current array too small */
253 		aggr_port_t **new_ports;
254 		uint_t new_size;
255 
256 		new_size = grp->lg_ntx_ports+1;
257 		new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
258 		    KM_SLEEP);
259 
260 		if (grp->lg_tx_ports_size > 0) {
261 			ASSERT(grp->lg_tx_ports != NULL);
262 			bcopy(grp->lg_tx_ports, new_ports,
263 			    grp->lg_ntx_ports * sizeof (aggr_port_t *));
264 			kmem_free(grp->lg_tx_ports,
265 			    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
266 		}
267 
268 		grp->lg_tx_ports = new_ports;
269 		grp->lg_tx_ports_size = new_size;
270 	}
271 
272 	grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
273 	port->lp_tx_idx = grp->lg_ntx_ports-1;
274 	rw_exit(&grp->lg_tx_lock);
275 
276 	port->lp_tx_enabled = B_TRUE;
277 
278 	aggr_grp_update_default(grp);
279 }
280 
281 /*
282  * Disable sending from the specified port.
283  */
284 void
aggr_send_port_disable(aggr_port_t * port)285 aggr_send_port_disable(aggr_port_t *port)
286 {
287 	uint_t idx, ntx;
288 	aggr_grp_t *grp = port->lp_grp;
289 
290 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
291 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
292 
293 	if (!port->lp_tx_enabled) {
294 		/* not yet enabled */
295 		return;
296 	}
297 
298 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
299 	idx = port->lp_tx_idx;
300 	ntx = grp->lg_ntx_ports;
301 	ASSERT(idx < ntx);
302 
303 	/* remove from array of attached ports */
304 	if (idx == (ntx - 1)) {
305 		grp->lg_tx_ports[idx] = NULL;
306 	} else {
307 		/* not the last entry, replace with last one */
308 		aggr_port_t *victim;
309 
310 		victim = grp->lg_tx_ports[ntx - 1];
311 		grp->lg_tx_ports[ntx - 1] = NULL;
312 		victim->lp_tx_idx = idx;
313 		grp->lg_tx_ports[idx] = victim;
314 	}
315 
316 	port->lp_tx_idx = 0;
317 	grp->lg_ntx_ports--;
318 	rw_exit(&grp->lg_tx_lock);
319 
320 	port->lp_tx_enabled = B_FALSE;
321 
322 	aggr_grp_update_default(grp);
323 }
324