xref: /illumos-gate/usr/src/uts/common/inet/optcom.c (revision 0f1702c5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * This file contains common code for handling Options Management requests.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/stropts.h>
34 #include <sys/strsubr.h>
35 #include <sys/errno.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/ddi.h>
41 #include <sys/debug.h>		/* for ASSERT */
42 #include <sys/policy.h>
43 
44 #include <inet/common.h>
45 #include <inet/mi.h>
46 #include <inet/nd.h>
47 #include <netinet/ip6.h>
48 #include <inet/ip.h>
49 #include <inet/mib2.h>
50 #include <netinet/in.h>
51 #include "optcom.h"
52 
53 #include <inet/optcom.h>
54 #include <inet/ipclassifier.h>
55 #include <inet/proto_set.h>
56 
57 /*
58  * Function prototypes
59  */
60 static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *,
61     boolean_t *, size_t *);
62 static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp,
63     mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp,
64     mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp);
65 static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t);
66 static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **,
67     t_uscalar_t *, cred_t *, optdb_obj_t *);
68 static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **,
69     t_uscalar_t *, cred_t *cr, optdb_obj_t *);
70 static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
71     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
72     cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp);
73 static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t);
74 static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t);
75 static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
76 static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
77 static boolean_t opt_bloated_maxsize(opdes_t *);
78 
79 /* Common code for sending back a T_ERROR_ACK. */
80 void
81 optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
82 {
83 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
84 		qreply(q, mp);
85 }
86 
87 /*
88  * The option management routines svr4_optcom_req() and tpi_optcom_req() use
89  * callback functions as arguments. Here are the interfaces expected
90  * of the callback functions
91  *
92  *
93  * (1) deffn(q, optlevel, optname, optvalp)
94  *
95  *	- Function only called when default value comes from protocol
96  *	 specific code and not the option database table (indicated by
97  *	  OP_DEF_FN property in option database.)
98  *	- Error return is -1. Valid returns are >=0.
99  *	- When valid, the return value represents the length used for storing
100  *		the default value of the option.
101  *      - Error return implies the called routine did not recognize this
102  *              option. Something downstream could so input is left unchanged
103  *              in request buffer.
104  *
105  * (2) getfn(q, optlevel, optname, optvalp)
106  *
107  *	- Error return is -1. Valid returns are >=0.
108  *	- When valid, the return value represents the length used for storing
109  *		the actual value of the option.
110  *      - Error return implies the called routine did not recognize this
111  *              option. Something downstream could so input is left unchanged
112  *              in request buffer.
113  *
114  * (3) setfn(q, optset_context, optlevel, optname, inlen, invalp,
115  *	outlenp, outvalp, attrp, cr);
116  *
117  *	- OK return is 0, Error code is returned as a non-zero argument.
118  *      - If negative it is ignored by svr4_optcom_req(). If positive, error
119  *        is returned. A negative return implies that option, while handled on
120  *	  this stack is not handled at this level and will be handled further
121  *	  downstream.
122  *	- Both negative and positive errors are treated as errors in an
123  *	  identical manner by tpi_optcom_req(). The errors affect "status"
124  *	  field of each option's T_opthdr. If successful, an appropriate success
125  *	  result is carried. If error, it is instantiated to "failure" at the
126  *	  topmost level and left unchanged at other levels. (This "failure" can
127  *	  turn to a success at another level).
128  *	- optset_context passed for tpi_optcom_req(). It is interpreted as:
129  *        - SETFN_OPTCOM_CHECKONLY
130  *		semantics are to pretend to set the value and report
131  *		back if it would be successful.
132  *		This is used with T_CHECK semantics in XTI
133  *        - SETFN_OPTCOM_NEGOTIATE
134  *		set the value. Call from option management primitive
135  *		T_OPTMGMT_REQ when T_NEGOTIATE flags is used.
136  *	  - SETFN_UD_NEGOTIATE
137  *		option request came riding on UNITDATA primitive most often
138  *		has  "this datagram" semantics to influence properties
139  *		affecting an outgoing datagram or associated with received
140  *		datagram
141  *		[ Note: XTI permits this use outside of "this datagram"
142  *		semantics also and permits setting "management related"
143  *		options in this	context and its test suite enforces it ]
144  *	  - SETFN_CONN_NEGOTIATE
145  *		option request came riding on CONN_REQ/RES primitive and
146  *		most often has "this connection" (negotiation during
147  *		"connection establishment") semantics.
148  *		[ Note: XTI permits use of these outside of "this connection"
149  *		semantics and permits "management related" options in this
150  *		context and its test suite enforces it. ]
151  *
152  *	- inlen, invalp is the option length,value requested to be set.
153  *	- outlenp, outvalp represent return parameters which contain the
154  *	  value set and it might be different from one passed on input.
155  *	- attrp points to a data structure that's used by v6 modules to
156  *	  store ancillary data options or sticky options.
157  *	- cr points to the caller's credentials
158  *	- the caller might pass same buffers for input and output and the
159  *	  routine should protect against this case by not updating output
160  *	  buffers until it is done referencing input buffers and any other
161  *	  issues (e.g. not use bcopy() if we do not trust what it does).
162  *      - If option is not known, it returns error. We randomly pick EINVAL.
163  *        It can however get called with options that are handled downstream
164  *        or upstream so for svr4_optcom_req(), it does not return error for
165  *        negative return values.
166  *
167  */
168 
169 /*
170  * Upper Level Protocols call this routine when they receive
171  * a T_SVR4_OPTMGMT_REQ message.  They supply callback functions
172  * for setting a new value for a single option, getting the
173  * current value for a single option, and checking for support
174  * of a single option.  svr4_optcom_req validates the option management
175  * buffer passed in, and calls the appropriate routines to do the
176  * job requested.
177  * XXX Code below needs some restructuring after we have some more
178  * macros to support 'struct opthdr' in the headers.
179  *
180  * IP-MT notes: The option management framework functions svr4_optcom_req() and
181  * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual
182  * T_optmgmt_req mblk and pass the chain as an additional parameter to the
183  * protocol set functions. If a protocol set function (such as ip_opt_set)
184  * cannot process the option immediately it can return EINPROGRESS. ip_opt_set
185  * enqueues the message in the appropriate sq and returns EINPROGRESS. Later
186  * the sq framework arranges to restart this operation and passes control to
187  * the restart function ip_restart_optmgmt() which in turn calls
188  * svr4_optcom_req() or tpi_optcom_req() to restart the option processing.
189  *
190  * XXX Remove the asynchronous behavior of svr4_optcom_req() and
191  * tpi_optcom_req().
192  */
193 int
194 svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
195     boolean_t pass_to_ip)
196 {
197 	pfi_t	deffn = dbobjp->odb_deffn;
198 	pfi_t	getfn = dbobjp->odb_getfn;
199 	opt_set_fn setfn = dbobjp->odb_setfn;
200 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
201 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
202 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
203 	opt_restart_t *or;
204 	struct opthdr *restart_opt;
205 	boolean_t is_restart = B_FALSE;
206 	mblk_t	*first_mp;
207 
208 	t_uscalar_t max_optbuf_len;
209 	int len;
210 	mblk_t	*mp1 = NULL;
211 	struct opthdr *next_opt;
212 	struct opthdr *opt;
213 	struct opthdr *opt1;
214 	struct opthdr *opt_end;
215 	struct opthdr *opt_start;
216 	opdes_t	*optd;
217 	boolean_t	pass_to_next = B_FALSE;
218 	struct T_optmgmt_ack *toa;
219 	struct T_optmgmt_req *tor;
220 	int error;
221 
222 	/*
223 	 * Allocate M_CTL and prepend to the packet for restarting this
224 	 * option if needed. IP may need to queue and restart the option
225 	 * if it cannot obtain exclusive conditions immediately. Please see
226 	 * IP-MT notes before the start of svr4_optcom_req
227 	 */
228 	if (mp->b_datap->db_type == M_CTL) {
229 		is_restart = B_TRUE;
230 		first_mp = mp;
231 		mp = mp->b_cont;
232 		ASSERT(mp->b_wptr - mp->b_rptr >=
233 		    sizeof (struct T_optmgmt_req));
234 		tor = (struct T_optmgmt_req *)mp->b_rptr;
235 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
236 
237 		or = (opt_restart_t *)first_mp->b_rptr;
238 		opt_start = or->or_start;
239 		opt_end = or->or_end;
240 		restart_opt = or->or_ropt;
241 		goto restart;
242 	}
243 
244 	tor = (struct T_optmgmt_req *)mp->b_rptr;
245 	/* Verify message integrity. */
246 	if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req))
247 		goto bad_opt;
248 	/* Verify MGMT_flags legal */
249 	switch (tor->MGMT_flags) {
250 	case T_DEFAULT:
251 	case T_NEGOTIATE:
252 	case T_CURRENT:
253 	case T_CHECK:
254 		/* OK - legal request flags */
255 		break;
256 	default:
257 		optcom_err_ack(q, mp, TBADFLAG, 0);
258 		return (0);
259 	}
260 	if (tor->MGMT_flags == T_DEFAULT) {
261 		/* Is it a request for default option settings? */
262 
263 		/*
264 		 * Note: XXX TLI and TPI specification was unclear about
265 		 * semantics of T_DEFAULT and the following historical note
266 		 * and its interpretation is incorrect (it implies a request
267 		 * for default values of only the identified options not all.
268 		 * The semantics have been explained better in XTI spec.)
269 		 * However, we do not modify (comment or code) here to keep
270 		 * compatibility.
271 		 * We can rethink this if it ever becomes an issue.
272 		 * ----historical comment start------
273 		 * As we understand it, the input buffer is meaningless
274 		 * so we ditch the message.  A T_DEFAULT request is a
275 		 * request to obtain a buffer containing defaults for
276 		 * all supported options, so we allocate a maximum length
277 		 * reply.
278 		 * ----historical comment end -------
279 		 */
280 		/* T_DEFAULT not passed down */
281 		ASSERT(topmost_tpiprovider == B_TRUE);
282 		freemsg(mp);
283 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
284 		    opt_arr_cnt);
285 		mp = allocb(max_optbuf_len, BPRI_MED);
286 		if (!mp) {
287 no_mem:;
288 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
289 			return (0);
290 		}
291 
292 		/* Initialize the T_optmgmt_ack header. */
293 		toa = (struct T_optmgmt_ack *)mp->b_rptr;
294 		bzero((char *)toa, max_optbuf_len);
295 		toa->PRIM_type = T_OPTMGMT_ACK;
296 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
297 		/* TODO: Is T_DEFAULT the right thing to put in MGMT_flags? */
298 		toa->MGMT_flags = T_DEFAULT;
299 
300 		/* Now walk the table of options passed in */
301 		opt = (struct opthdr *)&toa[1];
302 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
303 			/*
304 			 * All the options in the table of options passed
305 			 * in are by definition supported by the protocol
306 			 * calling this function.
307 			 */
308 			if (!OA_READ_PERMISSION(optd, cr))
309 				continue;
310 			opt->level = optd->opdes_level;
311 			opt->name = optd->opdes_name;
312 			if (!(optd->opdes_props & OP_DEF_FN) ||
313 			    ((len = (*deffn)(q, opt->level,
314 			    opt->name, (uchar_t *)&opt[1])) < 0)) {
315 				/*
316 				 * Fill length and value from table.
317 				 *
318 				 * Default value not instantiated from function
319 				 * (or the protocol specific function failed it;
320 				 * In this interpretation of T_DEFAULT, this is
321 				 * the best we can do)
322 				 */
323 				switch (optd->opdes_size) {
324 				/*
325 				 * Since options are guaranteed aligned only
326 				 * on a 4 byte boundary (t_scalar_t) any
327 				 * option that is greater in size will default
328 				 * to the bcopy below
329 				 */
330 				case sizeof (int32_t):
331 					*(int32_t *)&opt[1] =
332 					    (int32_t)optd->opdes_default;
333 					break;
334 				case sizeof (int16_t):
335 					*(int16_t *)&opt[1] =
336 					    (int16_t)optd->opdes_default;
337 					break;
338 				case sizeof (int8_t):
339 					*(int8_t *)&opt[1] =
340 					    (int8_t)optd->opdes_default;
341 					break;
342 				default:
343 					/*
344 					 * other length but still assume
345 					 * fixed - use bcopy
346 					 */
347 					bcopy(optd->opdes_defbuf,
348 					    &opt[1], optd->opdes_size);
349 					break;
350 				}
351 				opt->len = optd->opdes_size;
352 			}
353 			else
354 				opt->len = (t_uscalar_t)len;
355 			opt = (struct opthdr *)((char *)&opt[1] +
356 			    _TPI_ALIGN_OPT(opt->len));
357 		}
358 
359 		/* Now record the final length. */
360 		toa->OPT_length = (t_scalar_t)((char *)opt - (char *)&toa[1]);
361 		mp->b_wptr = (uchar_t *)opt;
362 		mp->b_datap->db_type = M_PCPROTO;
363 		/* Ship it back. */
364 		qreply(q, mp);
365 		return (0);
366 	}
367 	/* T_DEFAULT processing complete - no more T_DEFAULT */
368 
369 	/*
370 	 * For T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make a
371 	 * pass through the input buffer validating the details and
372 	 * making sure each option is supported by the protocol.
373 	 */
374 	if ((opt_start = (struct opthdr *)mi_offset_param(mp,
375 	    tor->OPT_offset, tor->OPT_length)) == NULL)
376 		goto bad_opt;
377 	if (!__TPI_OPT_ISALIGNED(opt_start))
378 		goto bad_opt;
379 
380 	opt_end = (struct opthdr *)((uchar_t *)opt_start +
381 	    tor->OPT_length);
382 
383 	for (opt = opt_start; opt < opt_end; opt = next_opt) {
384 		/*
385 		 * Verify we have room to reference the option header
386 		 * fields in the option buffer.
387 		 */
388 		if ((uchar_t *)opt + sizeof (struct opthdr) >
389 		    (uchar_t *)opt_end)
390 			goto bad_opt;
391 		/*
392 		 * We now compute pointer to next option in buffer 'next_opt'
393 		 * The next_opt computation above below 'opt->len' initialized
394 		 * by application which cannot be trusted. The usual value
395 		 * too large will be captured by the loop termination condition
396 		 * above. We check for the following which it will miss.
397 		 * 	-pointer space wraparound arithmetic overflow
398 		 *	-last option in buffer with 'opt->len' being too large
399 		 *	 (only reason 'next_opt' should equal or exceed
400 		 *	 'opt_end' for last option is roundup unless length is
401 		 *	 too-large/invalid)
402 		 */
403 		next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
404 		    _TPI_ALIGN_OPT(opt->len));
405 
406 		if ((uchar_t *)next_opt < (uchar_t *)&opt[1] ||
407 		    ((next_opt >= opt_end) &&
408 		    (((uchar_t *)next_opt - (uchar_t *)opt_end) >=
409 		    __TPI_ALIGN_SIZE)))
410 			goto bad_opt;
411 
412 		/* sanity check */
413 		if (opt->name == T_ALLOPT)
414 			goto bad_opt;
415 
416 		error = proto_opt_check(opt->level, opt->name, opt->len, NULL,
417 		    opt_arr, opt_arr_cnt, topmost_tpiprovider,
418 		    tor->MGMT_flags == T_NEGOTIATE, tor->MGMT_flags == T_CHECK,
419 		    cr);
420 		if (error < 0) {
421 			optcom_err_ack(q, mp, -error, 0);
422 			return (0);
423 		} else if (error > 0) {
424 			optcom_err_ack(q, mp, TSYSERR, error);
425 			return (0);
426 		}
427 	} /* end for loop scanning option buffer */
428 
429 	/* Now complete the operation as required. */
430 	switch (tor->MGMT_flags) {
431 	case T_CHECK:
432 		/*
433 		 * Historically used same as T_CURRENT (which was added to
434 		 * standard later). Code retained for compatibility.
435 		 */
436 		/* FALLTHROUGH */
437 	case T_CURRENT:
438 		/*
439 		 * Allocate a maximum size reply.  Perhaps we are supposed to
440 		 * assume that the input buffer includes space for the answers
441 		 * as well as the opthdrs, but we don't know that for sure.
442 		 * So, instead, we create a new output buffer, using the
443 		 * input buffer only as a list of options.
444 		 */
445 		max_optbuf_len = optcom_max_optbuf_len(opt_arr,
446 		    opt_arr_cnt);
447 		mp1 = allocb_cred(max_optbuf_len, cr);
448 		if (!mp1)
449 			goto no_mem;
450 		/* Initialize the header. */
451 		mp1->b_datap->db_type = M_PCPROTO;
452 		mp1->b_wptr = &mp1->b_rptr[sizeof (struct T_optmgmt_ack)];
453 		toa = (struct T_optmgmt_ack *)mp1->b_rptr;
454 		toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
455 		toa->MGMT_flags = tor->MGMT_flags;
456 		/*
457 		 * Walk through the input buffer again, this time adding
458 		 * entries to the output buffer for each option requested.
459 		 * Note, sanity of option header, last option etc, verified
460 		 * in first pass.
461 		 */
462 		opt1 = (struct opthdr *)&toa[1];
463 
464 		for (opt = opt_start; opt < opt_end; opt = next_opt) {
465 
466 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
467 			    _TPI_ALIGN_OPT(opt->len));
468 
469 			opt1->name = opt->name;
470 			opt1->level = opt->level;
471 			len = (*getfn)(q, opt->level,
472 			    opt->name, (uchar_t *)&opt1[1]);
473 			/*
474 			 * Failure means option is not recognized. Copy input
475 			 * buffer as is
476 			 */
477 			if (len < 0) {
478 				opt1->len = opt->len;
479 				bcopy(&opt[1], &opt1[1], opt->len);
480 			} else {
481 				opt1->len = (t_uscalar_t)len;
482 			}
483 			opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
484 			    _TPI_ALIGN_OPT(opt1->len));
485 		} /* end for loop */
486 
487 		/* Record the final length. */
488 		toa->OPT_length = (t_scalar_t)((uchar_t *)opt1 -
489 		    (uchar_t *)&toa[1]);
490 		mp1->b_wptr = (uchar_t *)opt1;
491 		/* Ditch the input buffer. */
492 		freemsg(mp);
493 		mp = mp1;
494 		/* Always let the next module look at the option. */
495 		pass_to_next = B_TRUE;
496 		break;
497 
498 	case T_NEGOTIATE:
499 		first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
500 		if (first_mp == NULL) {
501 			optcom_err_ack(q, mp, TSYSERR, ENOMEM);
502 			return (0);
503 		}
504 		first_mp->b_datap->db_type = M_CTL;
505 		or = (opt_restart_t *)first_mp->b_rptr;
506 		or->or_start = opt_start;
507 		or->or_end =  opt_end;
508 		or->or_type = T_SVR4_OPTMGMT_REQ;
509 		or->or_private = 0;
510 		first_mp->b_cont = mp;
511 restart:
512 		/*
513 		 * Here we are expecting that the response buffer is exactly
514 		 * the same size as the input buffer.  We pass each opthdr
515 		 * to the protocol's set function.  If the protocol doesn't
516 		 * like it, it can update the value in it return argument.
517 		 */
518 		/*
519 		 * Pass each negotiated option through the protocol set
520 		 * function.
521 		 * Note: sanity check on option header values done in first
522 		 * pass and not repeated here.
523 		 */
524 		toa = (struct T_optmgmt_ack *)tor;
525 
526 		for (opt = is_restart ? restart_opt: opt_start; opt < opt_end;
527 		    opt = next_opt) {
528 			int error;
529 
530 			/*
531 			 * Point to the current option in or, in case this
532 			 * option has to be restarted later on
533 			 */
534 			or->or_ropt = opt;
535 			next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
536 			    _TPI_ALIGN_OPT(opt->len));
537 
538 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
539 			    opt->level, opt->name,
540 			    opt->len, (uchar_t *)&opt[1],
541 			    &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp);
542 			/*
543 			 * Treat positive "errors" as real.
544 			 * Note: negative errors are to be treated as
545 			 * non-fatal by svr4_optcom_req() and are
546 			 * returned by setfn() when it is passed an
547 			 * option it does not handle. Since the option
548 			 * passed proto_opt_lookup(), it is implied that
549 			 * it is valid but was either handled upstream
550 			 * or will be handled downstream.
551 			 */
552 			if (error == EINPROGRESS) {
553 				/*
554 				 * The message is queued and will be
555 				 * reprocessed later. Typically ip queued
556 				 * the message to get some exclusive conditions
557 				 * and later on calls this func again.
558 				 */
559 				return (EINPROGRESS);
560 			} else if (error > 0) {
561 				optcom_err_ack(q, mp, TSYSERR, error);
562 				freeb(first_mp);
563 				return (0);
564 			}
565 			/*
566 			 * error < 0 means option is not recognized.
567 			 * But with OP_PASSNEXT the next module
568 			 * might recognize it.
569 			 */
570 		}
571 		/* Done with the restart control mp. */
572 		freeb(first_mp);
573 		pass_to_next = B_TRUE;
574 		break;
575 	default:
576 		optcom_err_ack(q, mp, TBADFLAG, 0);
577 		return (0);
578 	}
579 
580 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
581 		/* Send it down to the next module and let it reply */
582 		toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */
583 		if (q->q_next != NULL)
584 			putnext(q, mp);
585 		else
586 			ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
587 	} else {
588 		/* Set common fields in the header. */
589 		toa->MGMT_flags = T_SUCCESS;
590 		mp->b_datap->db_type = M_PCPROTO;
591 		toa->PRIM_type = T_OPTMGMT_ACK;
592 		qreply(q, mp);
593 	}
594 	return (0);
595 bad_opt:;
596 	optcom_err_ack(q, mp, TBADOPT, 0);
597 	return (0);
598 }
599 
600 /*
601  * New optcom_req inspired by TPI/XTI semantics
602  */
603 int
604 tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
605     boolean_t pass_to_ip)
606 {
607 	t_scalar_t t_error;
608 	mblk_t *toa_mp;
609 	boolean_t pass_to_next;
610 	size_t toa_len;
611 	struct T_optmgmt_ack *toa;
612 	struct T_optmgmt_req *tor =
613 	    (struct T_optmgmt_req *)mp->b_rptr;
614 
615 	opt_restart_t *or;
616 	boolean_t is_restart = B_FALSE;
617 	mblk_t	*first_mp = NULL;
618 	t_uscalar_t worst_status;
619 	boolean_t queued_status;
620 
621 	/*
622 	 * Allocate M_CTL and prepend to the packet for restarting this
623 	 * option if needed. IP may need to queue and restart the option
624 	 * if it cannot obtain exclusive conditions immediately. Please see
625 	 * IP-MT notes before the start of svr4_optcom_req
626 	 */
627 	if (mp->b_datap->db_type == M_CTL) {
628 		is_restart = B_TRUE;
629 		first_mp = mp;
630 		toa_mp = mp->b_cont;
631 		mp = toa_mp->b_cont;
632 		ASSERT(mp->b_wptr - mp->b_rptr >=
633 		    sizeof (struct T_optmgmt_req));
634 		tor = (struct T_optmgmt_req *)mp->b_rptr;
635 		ASSERT(tor->MGMT_flags == T_NEGOTIATE);
636 
637 		or = (opt_restart_t *)first_mp->b_rptr;
638 		goto restart;
639 	}
640 
641 	/* Verify message integrity. */
642 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) {
643 		optcom_err_ack(q, mp, TBADOPT, 0);
644 		return (0);
645 	}
646 
647 	/* Verify MGMT_flags legal */
648 	switch (tor->MGMT_flags) {
649 	case T_DEFAULT:
650 	case T_NEGOTIATE:
651 	case T_CURRENT:
652 	case T_CHECK:
653 		/* OK - legal request flags */
654 		break;
655 	default:
656 		optcom_err_ack(q, mp, TBADFLAG, 0);
657 		return (0);
658 	}
659 
660 	/*
661 	 * In this design, there are two passes required on the input buffer
662 	 * mostly to accomodate variable length options and "T_ALLOPT" option
663 	 * which has the semantics "all options of the specified level".
664 	 *
665 	 * For T_DEFAULT, T_NEGOTIATE, T_CURRENT, and T_CHECK requests, we make
666 	 * a pass through the input buffer validating the details and making
667 	 * sure each option is supported by the protocol. We also determine the
668 	 * length of the option buffer to return. (Variable length options and
669 	 * T_ALLOPT mean that length can be different for output buffer).
670 	 */
671 
672 	pass_to_next = B_FALSE;	/* initial value */
673 	toa_len = 0;		/* initial value */
674 
675 	/*
676 	 * First pass, we do the following
677 	 *	- estimate cumulative length needed for results
678 	 *	- set "status" field based on permissions, option header check
679 	 *	  etc.
680 	 *	- determine "pass_to_next" whether we need to send request to
681 	 *	  downstream module/driver.
682 	 */
683 	if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp,
684 	    &pass_to_next, &toa_len)) != 0) {
685 		optcom_err_ack(q, mp, t_error, 0);
686 		return (0);
687 	}
688 
689 	/*
690 	 * A validation phase of the input buffer is done. We have also
691 	 * obtained the length requirement and and other details about the
692 	 * input and we liked input buffer so far.  We make another scan
693 	 * through the input now and generate the output necessary to complete
694 	 * the operation.
695 	 */
696 
697 	toa_mp = allocb_cred(toa_len, cr);
698 	if (!toa_mp) {
699 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
700 		return (0);
701 	}
702 
703 	first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
704 	if (first_mp == NULL) {
705 		freeb(toa_mp);
706 		optcom_err_ack(q, mp, TSYSERR, ENOMEM);
707 		return (0);
708 	}
709 	first_mp->b_datap->db_type = M_CTL;
710 	or = (opt_restart_t *)first_mp->b_rptr;
711 	/*
712 	 * Set initial values for generating output.
713 	 */
714 	or->or_worst_status = T_SUCCESS;
715 	or->or_type = T_OPTMGMT_REQ;
716 	or->or_private = 0;
717 	/* remaining fields fileed in do_options_second_pass */
718 
719 restart:
720 	/*
721 	 * This routine makes another pass through the option buffer this
722 	 * time acting on the request based on "status" result in the
723 	 * first pass. It also performs "expansion" of T_ALLOPT into
724 	 * all options of a certain level and acts on each for this request.
725 	 */
726 	if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp,
727 	    first_mp, is_restart, &queued_status)) != 0) {
728 		freemsg(toa_mp);
729 		optcom_err_ack(q, mp, t_error, 0);
730 		return (0);
731 	}
732 	if (queued_status) {
733 		/* Option will be restarted */
734 		return (EINPROGRESS);
735 	}
736 	worst_status = or->or_worst_status;
737 	/* Done with the first mp */
738 	freeb(first_mp);
739 	toa_mp->b_cont = NULL;
740 
741 	/*
742 	 * Following code relies on the coincidence that T_optmgmt_req
743 	 * and T_optmgmt_ack are identical in binary representation
744 	 */
745 	toa = (struct T_optmgmt_ack *)toa_mp->b_rptr;
746 	toa->OPT_length = (t_scalar_t)(toa_mp->b_wptr - (toa_mp->b_rptr +
747 	    sizeof (struct T_optmgmt_ack)));
748 	toa->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_ack);
749 
750 	toa->MGMT_flags = tor->MGMT_flags;
751 
752 
753 	freemsg(mp);		/* free input mblk */
754 
755 	/*
756 	 * If there is atleast one option that requires a downstream
757 	 * forwarding and if it is possible, we forward the message
758 	 * downstream. Else we ack it.
759 	 */
760 	if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
761 		/*
762 		 * We pass it down as T_OPTMGMT_REQ. This code relies
763 		 * on the happy coincidence that T_optmgmt_req and
764 		 * T_optmgmt_ack are identical data structures
765 		 * at the binary representation level.
766 		 */
767 		toa_mp->b_datap->db_type = M_PROTO;
768 		toa->PRIM_type = T_OPTMGMT_REQ;
769 		if (q->q_next != NULL)
770 			putnext(q, toa_mp);
771 		else
772 			ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT);
773 	} else {
774 		toa->PRIM_type = T_OPTMGMT_ACK;
775 		toa_mp->b_datap->db_type = M_PCPROTO;
776 		toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */
777 		qreply(q, toa_mp);
778 	}
779 	return (0);
780 }
781 
782 
783 /*
784  * Following routine makes a pass through option buffer in mp and performs the
785  * following tasks.
786  *	- estimate cumulative length needed for results
787  *	- set "status" field based on permissions, option header check
788  *	  etc.
789  *	- determine "pass_to_next" whether we need to send request to
790  *	  downstream module/driver.
791  */
792 
793 static t_scalar_t
794 process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
795     boolean_t *pass_to_nextp, size_t *toa_lenp)
796 {
797 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
798 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
799 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
800 	optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr;
801 	uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt;
802 	struct T_opthdr *opt;
803 	struct T_opthdr *opt_start, *opt_end;
804 	opdes_t	*optd;
805 	size_t allopt_len;
806 	struct T_optmgmt_req *tor =
807 	    (struct T_optmgmt_req *)mp->b_rptr;
808 
809 	*toa_lenp = sizeof (struct T_optmgmt_ack); /* initial value */
810 
811 	if ((opt_start = (struct T_opthdr *)
812 	    mi_offset_param(mp, tor->OPT_offset, tor->OPT_length)) == NULL) {
813 		return (TBADOPT);
814 	}
815 	if (!__TPI_TOPT_ISALIGNED(opt_start))
816 		return (TBADOPT);
817 
818 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length);
819 
820 	for (opt = opt_start; opt && (opt < opt_end);
821 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
822 		/*
823 		 * Validate the option for length and alignment
824 		 * before accessing anything in it.
825 		 */
826 		if (!(_TPI_TOPT_VALID(opt, opt_start, opt_end)))
827 			return (TBADOPT);
828 
829 		/* Find the option in the opt_arr. */
830 		if (opt->name != T_ALLOPT) {
831 			optd = proto_opt_lookup(opt->level, opt->name,
832 			    opt_arr, opt_arr_cnt);
833 			if (optd == NULL) {
834 				/*
835 				 * Option not found
836 				 *
837 				 * Verify if level is "valid" or not.
838 				 * Note: This check is required by XTI
839 				 *
840 				 * TPI provider always initializes
841 				 * the "not supported" (or whatever) status
842 				 * for the options. Other levels leave status
843 				 * unchanged if they do not understand an
844 				 * option.
845 				 */
846 				if (topmost_tpiprovider) {
847 					if (!opt_level_valid(opt->level,
848 					    valid_level_arr,
849 					    valid_level_arr_cnt))
850 						return (TBADOPT);
851 					/*
852 					 * level is valid - initialize
853 					 * option as not supported
854 					 */
855 					opt->status = T_NOTSUPPORT;
856 				}
857 
858 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
859 				continue;
860 			}
861 		} else {
862 			/*
863 			 * Handle T_ALLOPT case as a special case.
864 			 * Note: T_ALLOPT does not mean anything
865 			 * for T_CHECK operation.
866 			 */
867 			allopt_len = 0;
868 			if (tor->MGMT_flags == T_CHECK ||
869 			    !topmost_tpiprovider ||
870 			    ((allopt_len = opt_level_allopts_lengths(opt->level,
871 			    opt_arr, opt_arr_cnt)) == 0)) {
872 				/*
873 				 * This is confusing but correct !
874 				 * It is not valid to to use T_ALLOPT with
875 				 * T_CHECK flag.
876 				 *
877 				 * T_ALLOPT is assumed "expanded" at the
878 				 * topmost_tpiprovider level so it should not
879 				 * be there as an "option name" if this is not
880 				 * a topmost_tpiprovider call and we fail it.
881 				 *
882 				 * opt_level_allopts_lengths() is used to verify
883 				 * that "level" associated with the T_ALLOPT is
884 				 * supported.
885 				 *
886 				 */
887 				opt->status = T_FAILURE;
888 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
889 				continue;
890 			}
891 			ASSERT(allopt_len != 0); /* remove ? */
892 
893 			*toa_lenp += allopt_len;
894 			opt->status = T_SUCCESS;
895 			/* XXX - always set T_ALLOPT 'pass_to_next' for now */
896 			*pass_to_nextp = B_TRUE;
897 			continue;
898 		}
899 		/*
900 		 * Check if option wants to flow downstream
901 		 */
902 		if (optd->opdes_props & OP_PASSNEXT)
903 			*pass_to_nextp = B_TRUE;
904 
905 		/* Additional checks dependent on operation. */
906 		switch (tor->MGMT_flags) {
907 		case T_DEFAULT:
908 		case T_CURRENT:
909 
910 			/*
911 			 * The proto_opt_lookup() routine call above approved of
912 			 * this option so we can work on the status for it
913 			 * based on the permissions for the operation. (This
914 			 * can override any status for it set at higher levels)
915 			 * We assume this override is OK since chkfn at this
916 			 * level approved of this option.
917 			 *
918 			 * T_CURRENT semantics:
919 			 * The read access is required. Else option
920 			 * status is T_NOTSUPPORT.
921 			 *
922 			 * T_DEFAULT semantics:
923 			 * Note: specification is not clear on this but we
924 			 * interpret T_DEFAULT semantics such that access to
925 			 * read value is required for access even the default
926 			 * value. Otherwise the option status is T_NOTSUPPORT.
927 			 */
928 			if (!OA_READ_PERMISSION(optd, cr)) {
929 				opt->status = T_NOTSUPPORT;
930 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
931 				/* skip to next */
932 				continue;
933 			}
934 
935 			/*
936 			 * T_DEFAULT/T_CURRENT semantics:
937 			 * We know that read access is set. If no other access
938 			 * is set, then status is T_READONLY.
939 			 */
940 			if (OA_READONLY_PERMISSION(optd, cr))
941 				opt->status = T_READONLY;
942 			else
943 				opt->status = T_SUCCESS;
944 			/*
945 			 * Option passes all checks. Make room for it in the
946 			 * ack. Note: size stored in table does not include
947 			 * space for option header.
948 			 */
949 			*toa_lenp += sizeof (struct T_opthdr) +
950 			    _TPI_ALIGN_TOPT(optd->opdes_size);
951 			break;
952 
953 		case T_CHECK:
954 		case T_NEGOTIATE:
955 
956 			/*
957 			 * T_NEGOTIATE semantics:
958 			 * If for fixed length option value on input is not the
959 			 * same as value supplied, then status is T_FAILURE.
960 			 *
961 			 * T_CHECK semantics:
962 			 * If value is supplied, semantics same as T_NEGOTIATE.
963 			 * It is however ok not to supply a value with T_CHECK.
964 			 */
965 
966 			if (tor->MGMT_flags == T_NEGOTIATE ||
967 			    (opt->len != sizeof (struct T_opthdr))) {
968 				/*
969 				 * Implies "value" is specified in T_CHECK or
970 				 * it is a T_NEGOTIATE request.
971 				 * Verify size.
972 				 * Note: This can override anything about this
973 				 * option request done at a higher level.
974 				 */
975 				if (!opt_length_ok(optd, opt)) {
976 					/* bad size */
977 					*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
978 					opt->status = T_FAILURE;
979 					continue;
980 				}
981 			}
982 			/*
983 			 * The proto_opt_lookup()  routine above() approved of
984 			 * this option so we can work on the status for it based
985 			 * on the permissions for the operation. (This can
986 			 * override anything set at a higher level).
987 			 *
988 			 * T_CHECK/T_NEGOTIATE semantics:
989 			 * Set status to T_READONLY if read is the only access
990 			 * permitted
991 			 */
992 			if (OA_READONLY_PERMISSION(optd, cr)) {
993 				opt->status = T_READONLY;
994 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
995 				/* skip to next */
996 				continue;
997 			}
998 
999 			/*
1000 			 * T_CHECK/T_NEGOTIATE semantics:
1001 			 * If write (or execute) access is not set, then status
1002 			 * is T_NOTSUPPORT.
1003 			 */
1004 			if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
1005 				opt->status = T_NOTSUPPORT;
1006 				*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
1007 				/* skip to next option */
1008 				continue;
1009 			}
1010 			/*
1011 			 * Option passes all checks. Make room for it in the
1012 			 * ack and set success in status.
1013 			 * Note: size stored in table does not include header
1014 			 * length.
1015 			 */
1016 			opt->status = T_SUCCESS;
1017 			*toa_lenp += sizeof (struct T_opthdr) +
1018 			    _TPI_ALIGN_TOPT(optd->opdes_size);
1019 			break;
1020 
1021 		default:
1022 			return (TBADFLAG);
1023 		}
1024 	} /* for loop scanning input buffer */
1025 
1026 	return (0);		/* OK return */
1027 }
1028 
1029 /*
1030  * This routine makes another pass through the option buffer this
1031  * time acting on the request based on "status" result in the
1032  * first pass. It also performs "expansion" of T_ALLOPT into
1033  * all options of a certain level and acts on each for this request.
1034  */
1035 static t_scalar_t
1036 do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
1037     optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart,
1038     boolean_t *queued_statusp)
1039 {
1040 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1041 	int failed_option;
1042 	struct T_opthdr *opt;
1043 	struct T_opthdr *opt_start, *opt_end, *restart_opt;
1044 	uchar_t *optr;
1045 	uint_t optset_context;
1046 	struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr;
1047 	opt_restart_t	*or;
1048 	t_uscalar_t	*worst_statusp;
1049 	int	err;
1050 
1051 	*queued_statusp = B_FALSE;
1052 	or = (opt_restart_t *)first_mp->b_rptr;
1053 	worst_statusp = &or->or_worst_status;
1054 
1055 	optr = (uchar_t *)ack_mp->b_rptr +
1056 	    sizeof (struct T_optmgmt_ack); /* assumed int32_t aligned */
1057 
1058 	/*
1059 	 * Set initial values for scanning input
1060 	 */
1061 	if (is_restart) {
1062 		opt_start = (struct T_opthdr *)or->or_start;
1063 		opt_end = (struct T_opthdr *)or->or_end;
1064 		restart_opt = (struct T_opthdr *)or->or_ropt;
1065 	} else {
1066 		opt_start = (struct T_opthdr *)mi_offset_param(reqmp,
1067 		    tor->OPT_offset, tor->OPT_length);
1068 		if (opt_start == NULL)
1069 			return (TBADOPT);
1070 		opt_end = (struct T_opthdr *)((uchar_t *)opt_start +
1071 		    tor->OPT_length);
1072 		or->or_start = (struct opthdr *)opt_start;
1073 		or->or_end = (struct opthdr *)opt_end;
1074 		/*
1075 		 * construct the mp chain, in case the setfn needs to
1076 		 * queue this and restart option processing later on.
1077 		 */
1078 		first_mp->b_cont = ack_mp;
1079 		ack_mp->b_cont = reqmp;
1080 	}
1081 	ASSERT(__TPI_TOPT_ISALIGNED(opt_start)); /* verified in first pass */
1082 
1083 	for (opt = is_restart ? restart_opt : opt_start;
1084 	    opt && (opt < opt_end);
1085 	    opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
1086 		or->or_ropt = (struct opthdr *)opt;
1087 		/* verified in first pass */
1088 		ASSERT(_TPI_TOPT_VALID(opt, opt_start, opt_end));
1089 
1090 		/*
1091 		 * If the first pass in process_topthdrs_first_pass()
1092 		 * has marked the option as a failure case for the MGMT_flags
1093 		 * semantics then there is not much to do.
1094 		 *
1095 		 * Note: For all practical purposes, T_READONLY status is
1096 		 * a "success" for T_DEFAULT/T_CURRENT and "failure" for
1097 		 * T_CHECK/T_NEGOTIATE
1098 		 */
1099 		failed_option =
1100 		    (opt->status == T_NOTSUPPORT) ||
1101 		    (opt->status == T_FAILURE) ||
1102 		    ((tor->MGMT_flags & (T_NEGOTIATE|T_CHECK)) &&
1103 		    (opt->status == T_READONLY));
1104 
1105 		if (failed_option) {
1106 			/*
1107 			 * According to T_DEFAULT/T_CURRENT semantics, the
1108 			 * input values, even if present, are to be ignored.
1109 			 * Note: Specification is not clear on this, but we
1110 			 * interpret that even though we ignore the values, we
1111 			 * can return them as is. So we process them similar to
1112 			 * T_CHECK/T_NEGOTIATE case which has the semantics to
1113 			 * return the values as is. XXX If interpretation is
1114 			 * ever determined incorrect fill in appropriate code
1115 			 * here to treat T_DEFAULT/T_CURRENT differently.
1116 			 *
1117 			 * According to T_CHECK/T_NEGOTIATE semantics,
1118 			 * in the case of T_NOTSUPPORT/T_FAILURE/T_READONLY,
1119 			 * the semantics are to return the "value" part of
1120 			 * option untouched. So here we copy the option
1121 			 * head including value part if any to output.
1122 			 */
1123 
1124 			bcopy(opt, optr, opt->len);
1125 			optr += _TPI_ALIGN_TOPT(opt->len);
1126 
1127 			*worst_statusp = get_worst_status(opt->status,
1128 			    *worst_statusp);
1129 
1130 			/* skip to process next option in buffer */
1131 			continue;
1132 
1133 		} /* end if "failed option" */
1134 		/*
1135 		 * The status is T_SUCCESS or T_READONLY
1136 		 * We process the value part here
1137 		 */
1138 		ASSERT(opt->status == T_SUCCESS || opt->status == T_READONLY);
1139 		switch (tor->MGMT_flags) {
1140 		case T_DEFAULT:
1141 			/*
1142 			 * We fill default value from table or protocol specific
1143 			 * function. If this call fails, we pass input through.
1144 			 */
1145 			if (do_opt_default(q, opt, &optr, worst_statusp,
1146 			    cr, dbobjp) < 0) {
1147 				/* fail or pass transparently */
1148 				if (topmost_tpiprovider)
1149 					opt->status = T_FAILURE;
1150 				bcopy(opt, optr, opt->len);
1151 				optr += _TPI_ALIGN_TOPT(opt->len);
1152 				*worst_statusp = get_worst_status(opt->status,
1153 				    *worst_statusp);
1154 			}
1155 			break;
1156 
1157 		case T_CURRENT:
1158 
1159 			do_opt_current(q, opt, &optr, worst_statusp, cr,
1160 			    dbobjp);
1161 			break;
1162 
1163 		case T_CHECK:
1164 		case T_NEGOTIATE:
1165 			if (tor->MGMT_flags == T_CHECK)
1166 				optset_context = SETFN_OPTCOM_CHECKONLY;
1167 			else	/* T_NEGOTIATE */
1168 				optset_context = SETFN_OPTCOM_NEGOTIATE;
1169 			err = do_opt_check_or_negotiate(q, opt, optset_context,
1170 			    &optr, worst_statusp, cr, dbobjp, first_mp);
1171 			if (err == EINPROGRESS) {
1172 				*queued_statusp = B_TRUE;
1173 				return (0);
1174 			}
1175 			break;
1176 		default:
1177 			return (TBADFLAG);
1178 		}
1179 	} /* end for loop scanning option buffer */
1180 
1181 	ack_mp->b_wptr = optr;
1182 	ASSERT(ack_mp->b_wptr <= ack_mp->b_datap->db_lim);
1183 
1184 	return (0);		/* OK return */
1185 }
1186 
1187 
1188 static t_uscalar_t
1189 get_worst_status(t_uscalar_t status, t_uscalar_t current_worst_status)
1190 {
1191 	/*
1192 	 * Return the "worst" among the arguments "status" and
1193 	 * "current_worst_status".
1194 	 *
1195 	 * Note: Tracking "worst_status" can be made a bit simpler
1196 	 * if we use the property that status codes are bitwise
1197 	 * distinct.
1198 	 *
1199 	 * The pecking order is
1200 	 *
1201 	 * T_SUCCESS ..... best
1202 	 * T_PARTSUCCESS
1203 	 * T_FAILURE
1204 	 * T_READONLY
1205 	 * T_NOTSUPPORT... worst
1206 	 */
1207 	if (status == current_worst_status)
1208 		return (current_worst_status);
1209 	switch (current_worst_status) {
1210 	case T_SUCCESS:
1211 		if (status == T_PARTSUCCESS)
1212 			return (T_PARTSUCCESS);
1213 		/* FALLTHROUGH */
1214 	case T_PARTSUCCESS:
1215 		if (status == T_FAILURE)
1216 			return (T_FAILURE);
1217 		/* FALLTHROUGH */
1218 	case T_FAILURE:
1219 		if (status == T_READONLY)
1220 			return (T_READONLY);
1221 		/* FALLTHROUGH */
1222 	case T_READONLY:
1223 		if (status == T_NOTSUPPORT)
1224 			return (T_NOTSUPPORT);
1225 		/* FALLTHROUGH */
1226 	case T_NOTSUPPORT:
1227 	default:
1228 		return (current_worst_status);
1229 	}
1230 }
1231 
1232 static int
1233 do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1234     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1235 {
1236 	pfi_t	deffn = dbobjp->odb_deffn;
1237 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1238 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1239 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1240 
1241 	struct T_opthdr *topth;
1242 	opdes_t *optd;
1243 
1244 	if (reqopt->name != T_ALLOPT) {
1245 		/*
1246 		 * lookup the option in the table and fill default value
1247 		 */
1248 		optd = proto_opt_lookup(reqopt->level, reqopt->name,
1249 		    opt_arr, opt_arr_cnt);
1250 
1251 		if (optd == NULL) {
1252 			/*
1253 			 * not found - fail this one. Should not happen
1254 			 * for topmost_tpiprovider as calling routine
1255 			 * should have verified it.
1256 			 */
1257 			ASSERT(!topmost_tpiprovider);
1258 			return (-1);
1259 		}
1260 
1261 		topth = (struct T_opthdr *)(*resptrp);
1262 		topth->level = reqopt->level;
1263 		topth->name = reqopt->name;
1264 		topth->status = reqopt->status;
1265 
1266 		*worst_statusp = get_worst_status(reqopt->status,
1267 		    *worst_statusp);
1268 
1269 		if (optd->opdes_props & OP_NODEFAULT) {
1270 			/* header only, no default "value" part */
1271 			topth->len = sizeof (struct T_opthdr);
1272 			*resptrp += sizeof (struct T_opthdr);
1273 		} else {
1274 			int deflen;
1275 
1276 			if (optd->opdes_props & OP_DEF_FN) {
1277 				deflen = (*deffn)(q, reqopt->level,
1278 				    reqopt->name, _TPI_TOPT_DATA(topth));
1279 				if (deflen >= 0) {
1280 					topth->len = (t_uscalar_t)
1281 					    (sizeof (struct T_opthdr) + deflen);
1282 				} else {
1283 					/*
1284 					 * return error, this should 'pass
1285 					 * through' the option and maybe some
1286 					 * other level will fill it in or
1287 					 * already did.
1288 					 * (No change in 'resptrp' upto here)
1289 					 */
1290 					return (-1);
1291 				}
1292 			} else {
1293 				/* fill length and value part */
1294 				switch (optd->opdes_size) {
1295 				/*
1296 				 * Since options are guaranteed aligned only
1297 				 * on a 4 byte boundary (t_scalar_t) any
1298 				 * option that is greater in size will default
1299 				 * to the bcopy below
1300 				 */
1301 				case sizeof (int32_t):
1302 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1303 					    (int32_t)optd->opdes_default;
1304 					break;
1305 				case sizeof (int16_t):
1306 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1307 					    (int16_t)optd->opdes_default;
1308 					break;
1309 				case sizeof (int8_t):
1310 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1311 					    (int8_t)optd->opdes_default;
1312 					break;
1313 				default:
1314 					/*
1315 					 * other length but still assume
1316 					 * fixed - use bcopy
1317 					 */
1318 					bcopy(optd->opdes_defbuf,
1319 					    _TPI_TOPT_DATA(topth),
1320 					    optd->opdes_size);
1321 					break;
1322 				}
1323 				topth->len = (t_uscalar_t)(optd->opdes_size +
1324 				    sizeof (struct T_opthdr));
1325 			}
1326 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1327 		}
1328 		return (0);	/* OK return */
1329 	}
1330 
1331 	/*
1332 	 * T_ALLOPT processing
1333 	 *
1334 	 * lookup and stuff default values of all the options of the
1335 	 * level specified
1336 	 * Note: This expansion of T_ALLOPT should happen in
1337 	 * a topmost_tpiprovider.
1338 	 */
1339 	ASSERT(topmost_tpiprovider);
1340 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1341 		if (reqopt->level != optd->opdes_level)
1342 			continue;
1343 		/*
1344 		 *
1345 		 * T_DEFAULT semantics:
1346 		 * XXX: we interpret T_DEFAULT semantics such that access to
1347 		 * read value is required for access even the default value.
1348 		 * Else option is ignored for T_ALLOPT request.
1349 		 */
1350 		if (!OA_READ_PERMISSION(optd, cr))
1351 			/* skip this one */
1352 			continue;
1353 
1354 		/*
1355 		 * Found option of same level as T_ALLOPT request
1356 		 * that we can return.
1357 		 */
1358 
1359 		topth = (struct T_opthdr *)(*resptrp);
1360 		topth->level = optd->opdes_level;
1361 		topth->name = optd->opdes_name;
1362 
1363 		/*
1364 		 * T_DEFAULT semantics:
1365 		 * We know that read access is set. If no other access is set,
1366 		 * then status is T_READONLY
1367 		 */
1368 		if (OA_READONLY_PERMISSION(optd, cr)) {
1369 			topth->status = T_READONLY;
1370 			*worst_statusp = get_worst_status(T_READONLY,
1371 			    *worst_statusp);
1372 		} else {
1373 			topth->status = T_SUCCESS;
1374 			/*
1375 			 * Note: *worst_statusp has to be T_SUCCESS or
1376 			 * worse so no need to adjust
1377 			 */
1378 		}
1379 
1380 		if (optd->opdes_props & OP_NODEFAULT) {
1381 			/* header only, no value part */
1382 			topth->len = sizeof (struct T_opthdr);
1383 			*resptrp += sizeof (struct T_opthdr);
1384 		} else {
1385 			int deflen;
1386 
1387 			if (optd->opdes_props & OP_DEF_FN) {
1388 				deflen = (*deffn)(q, reqopt->level,
1389 				    reqopt->name, _TPI_TOPT_DATA(topth));
1390 				if (deflen >= 0) {
1391 					topth->len = (t_uscalar_t)(deflen +
1392 					    sizeof (struct T_opthdr));
1393 				} else {
1394 					/*
1395 					 * deffn failed.
1396 					 * return just the header as T_ALLOPT
1397 					 * expansion.
1398 					 * Some other level deffn may
1399 					 * supply value part.
1400 					 */
1401 					topth->len = sizeof (struct T_opthdr);
1402 					topth->status = T_FAILURE;
1403 					*worst_statusp =
1404 					    get_worst_status(T_FAILURE,
1405 					    *worst_statusp);
1406 				}
1407 			} else {
1408 				/*
1409 				 * fill length and value part from
1410 				 * table
1411 				 */
1412 				switch (optd->opdes_size) {
1413 				/*
1414 				 * Since options are guaranteed aligned only
1415 				 * on a 4 byte boundary (t_scalar_t) any
1416 				 * option that is greater in size will default
1417 				 * to the bcopy below
1418 				 */
1419 				case sizeof (int32_t):
1420 					*(int32_t *)_TPI_TOPT_DATA(topth) =
1421 					    (int32_t)optd->opdes_default;
1422 					break;
1423 				case sizeof (int16_t):
1424 					*(int16_t *)_TPI_TOPT_DATA(topth) =
1425 					    (int16_t)optd->opdes_default;
1426 					break;
1427 				case sizeof (int8_t):
1428 					*(int8_t *)_TPI_TOPT_DATA(topth) =
1429 					    (int8_t)optd->opdes_default;
1430 					break;
1431 				default:
1432 					/*
1433 					 * other length but still assume
1434 					 * fixed - use bcopy
1435 					 */
1436 					bcopy(optd->opdes_defbuf,
1437 					    _TPI_TOPT_DATA(topth),
1438 					    optd->opdes_size);
1439 				}
1440 				topth->len = (t_uscalar_t)(optd->opdes_size +
1441 				    sizeof (struct T_opthdr));
1442 			}
1443 			*resptrp += _TPI_ALIGN_TOPT(topth->len);
1444 		}
1445 	}
1446 	return (0);
1447 }
1448 
1449 static void
1450 do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
1451     t_uscalar_t *worst_statusp, cred_t *cr, optdb_obj_t *dbobjp)
1452 {
1453 	pfi_t	getfn = dbobjp->odb_getfn;
1454 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1455 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1456 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1457 
1458 	struct T_opthdr *topth;
1459 	opdes_t *optd;
1460 	int optlen;
1461 	uchar_t *initptr = *resptrp;
1462 
1463 	/*
1464 	 * We call getfn to get the current value of an option. The call may
1465 	 * fail in which case we copy the values from the input buffer. Maybe
1466 	 * something downstream will fill it in or something upstream did.
1467 	 */
1468 
1469 	if (reqopt->name != T_ALLOPT) {
1470 		topth = (struct T_opthdr *)*resptrp;
1471 		*resptrp += sizeof (struct T_opthdr);
1472 		optlen = (*getfn)(q, reqopt->level, reqopt->name, *resptrp);
1473 		if (optlen >= 0) {
1474 			topth->len = (t_uscalar_t)(optlen +
1475 			    sizeof (struct T_opthdr));
1476 			topth->level = reqopt->level;
1477 			topth->name = reqopt->name;
1478 			topth->status = reqopt->status;
1479 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1480 			*worst_statusp = get_worst_status(topth->status,
1481 			    *worst_statusp);
1482 		} else {
1483 			/* failed - reset "*resptrp" pointer */
1484 			*resptrp -= sizeof (struct T_opthdr);
1485 		}
1486 	} else {		/* T_ALLOPT processing */
1487 		ASSERT(topmost_tpiprovider == B_TRUE);
1488 		/* scan and get all options */
1489 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1490 			/* skip other levels */
1491 			if (reqopt->level != optd->opdes_level)
1492 				continue;
1493 
1494 			if (!OA_READ_PERMISSION(optd, cr))
1495 				/* skip this one */
1496 				continue;
1497 
1498 			topth = (struct T_opthdr *)*resptrp;
1499 			*resptrp += sizeof (struct T_opthdr);
1500 
1501 			/* get option of this level */
1502 			optlen = (*getfn)(q, reqopt->level, optd->opdes_name,
1503 			    *resptrp);
1504 			if (optlen >= 0) {
1505 				/* success */
1506 				topth->len = (t_uscalar_t)(optlen +
1507 				    sizeof (struct T_opthdr));
1508 				topth->level = reqopt->level;
1509 				topth->name = optd->opdes_name;
1510 				if (OA_READONLY_PERMISSION(optd, cr))
1511 					topth->status = T_READONLY;
1512 				else
1513 					topth->status = T_SUCCESS;
1514 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1515 			} else {
1516 				/*
1517 				 * failed, return as T_FAILURE and null value
1518 				 * part. Maybe something downstream will
1519 				 * handle this one and fill in a value. Here
1520 				 * it is just part of T_ALLOPT expansion.
1521 				 */
1522 				topth->len = sizeof (struct T_opthdr);
1523 				topth->level = reqopt->level;
1524 				topth->name = optd->opdes_name;
1525 				topth->status = T_FAILURE;
1526 			}
1527 			*worst_statusp = get_worst_status(topth->status,
1528 			    *worst_statusp);
1529 		} /* end for loop */
1530 	}
1531 	if (*resptrp == initptr) {
1532 		/*
1533 		 * getfn failed and does not want to handle this option. Maybe
1534 		 * something downstream will or something upstream did. (If
1535 		 * topmost_tpiprovider, initialize "status" to failure which
1536 		 * can possibly change downstream). Copy the input "as is" from
1537 		 * input option buffer if any to maintain transparency.
1538 		 */
1539 		if (topmost_tpiprovider)
1540 			reqopt->status = T_FAILURE;
1541 		bcopy(reqopt, *resptrp, reqopt->len);
1542 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1543 		*worst_statusp = get_worst_status(reqopt->status,
1544 		    *worst_statusp);
1545 	}
1546 }
1547 
1548 /* ARGSUSED */
1549 static int
1550 do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
1551     uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
1552     cred_t *cr, optdb_obj_t *dbobjp, mblk_t *first_mp)
1553 {
1554 	pfi_t	deffn = dbobjp->odb_deffn;
1555 	opt_set_fn setfn = dbobjp->odb_setfn;
1556 	opdes_t	*opt_arr = dbobjp->odb_opt_des_arr;
1557 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1558 	boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
1559 
1560 	struct T_opthdr *topth;
1561 	opdes_t *optd;
1562 	int error;
1563 	t_uscalar_t optlen;
1564 	t_scalar_t optsize;
1565 	uchar_t *initptr = *resptrp;
1566 
1567 	ASSERT(reqopt->status == T_SUCCESS);
1568 
1569 	if (reqopt->name != T_ALLOPT) {
1570 		topth = (struct T_opthdr *)*resptrp;
1571 		*resptrp += sizeof (struct T_opthdr);
1572 		error = (*setfn)(q, optset_context, reqopt->level, reqopt->name,
1573 		    reqopt->len - sizeof (struct T_opthdr),
1574 		    _TPI_TOPT_DATA(reqopt), &optlen, _TPI_TOPT_DATA(topth),
1575 		    NULL, cr, first_mp);
1576 		if (error) {
1577 			/* failed - reset "*resptrp" */
1578 			*resptrp -= sizeof (struct T_opthdr);
1579 			if (error == EINPROGRESS)
1580 				return (error);
1581 		} else {
1582 			/*
1583 			 * success - "value" already filled in setfn()
1584 			 */
1585 			topth->len = (t_uscalar_t)(optlen +
1586 			    sizeof (struct T_opthdr));
1587 			topth->level = reqopt->level;
1588 			topth->name = reqopt->name;
1589 			topth->status = reqopt->status;
1590 			*resptrp += _TPI_ALIGN_TOPT(optlen);
1591 			*worst_statusp = get_worst_status(topth->status,
1592 			    *worst_statusp);
1593 		}
1594 	} else {		/* T_ALLOPT processing */
1595 		/* only for T_NEGOTIATE case */
1596 		ASSERT(optset_context == SETFN_OPTCOM_NEGOTIATE);
1597 		ASSERT(topmost_tpiprovider == B_TRUE);
1598 
1599 		/* scan and set all options to default value */
1600 		for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
1601 
1602 			/* skip other levels */
1603 			if (reqopt->level != optd->opdes_level)
1604 				continue;
1605 
1606 			if (OA_EXECUTE_PERMISSION(optd, cr) ||
1607 			    OA_NO_PERMISSION(optd, cr)) {
1608 				/*
1609 				 * skip this one too. Does not make sense to
1610 				 * set anything to default value for "execute"
1611 				 * options.
1612 				 */
1613 				continue;
1614 			}
1615 
1616 			if (OA_READONLY_PERMISSION(optd, cr)) {
1617 				/*
1618 				 * Return with T_READONLY status (and no value
1619 				 * part). Note: spec is not clear but
1620 				 * XTI test suite needs this.
1621 				 */
1622 				topth = (struct T_opthdr *)*resptrp;
1623 				topth->len = sizeof (struct T_opthdr);
1624 				*resptrp += topth->len;
1625 				topth->level = reqopt->level;
1626 				topth->name = optd->opdes_name;
1627 				topth->status = T_READONLY;
1628 				*worst_statusp = get_worst_status(topth->status,
1629 				    *worst_statusp);
1630 				continue;
1631 			}
1632 
1633 			/*
1634 			 * It is not read only or execute type
1635 			 * the it must have write permission
1636 			 */
1637 			ASSERT(OA_WRITE_PERMISSION(optd, cr));
1638 
1639 			topth = (struct T_opthdr *)*resptrp;
1640 			*resptrp += sizeof (struct T_opthdr);
1641 
1642 			topth->len = sizeof (struct T_opthdr);
1643 			topth->level = reqopt->level;
1644 			topth->name = optd->opdes_name;
1645 			if (optd->opdes_props & OP_NODEFAULT) {
1646 				/*
1647 				 * Option of "no default value" so it does not
1648 				 * make sense to try to set it. We just return
1649 				 * header with status of T_SUCCESS
1650 				 * XXX should this be failure ?
1651 				 */
1652 				topth->status = T_SUCCESS;
1653 				continue; /* skip setting */
1654 			}
1655 			if (optd->opdes_props & OP_DEF_FN) {
1656 				if ((optd->opdes_props & OP_VARLEN) ||
1657 				    ((optsize = (*deffn)(q, reqopt->level,
1658 				    optd->opdes_name,
1659 				    (uchar_t *)optd->opdes_defbuf)) < 0)) {
1660 					/* XXX - skip these too */
1661 					topth->status = T_SUCCESS;
1662 					continue; /* skip setting */
1663 				}
1664 			} else {
1665 				optsize = optd->opdes_size;
1666 			}
1667 
1668 
1669 			/* set option of this level */
1670 			error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
1671 			    reqopt->level, optd->opdes_name, optsize,
1672 			    (uchar_t *)optd->opdes_defbuf, &optlen,
1673 			    _TPI_TOPT_DATA(topth), NULL, cr, NULL);
1674 			if (error) {
1675 				/*
1676 				 * failed, return as T_FAILURE and null value
1677 				 * part. Maybe something downstream will
1678 				 * handle this one and fill in a value. Here
1679 				 * it is just part of T_ALLOPT expansion.
1680 				 */
1681 				topth->status = T_FAILURE;
1682 				*worst_statusp = get_worst_status(topth->status,
1683 				    *worst_statusp);
1684 			} else {
1685 				/* success */
1686 				topth->len += optlen;
1687 				topth->status = T_SUCCESS;
1688 				*resptrp += _TPI_ALIGN_TOPT(optlen);
1689 			}
1690 		} /* end for loop */
1691 		/* END T_ALLOPT */
1692 	}
1693 
1694 	if (*resptrp == initptr) {
1695 		/*
1696 		 * setfn failed and does not want to handle this option. Maybe
1697 		 * something downstream will or something upstream
1698 		 * did. Copy the input as is from input option buffer if any to
1699 		 * maintain transparency (maybe something at a level above
1700 		 * did something.
1701 		 */
1702 		if (topmost_tpiprovider)
1703 			reqopt->status = T_FAILURE;
1704 		bcopy(reqopt, *resptrp, reqopt->len);
1705 		*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
1706 		*worst_statusp = get_worst_status(reqopt->status,
1707 		    *worst_statusp);
1708 	}
1709 	return (0);
1710 }
1711 
1712 /*
1713  * The following routines process options buffer passed with
1714  * T_CONN_REQ, T_CONN_RES and T_UNITDATA_REQ.
1715  * This routine does the consistency check applied to the
1716  * sanity of formatting of multiple options packed in the
1717  * buffer.
1718  *
1719  * XTI brain damage alert:
1720  * XTI interface adopts the notion of an option being an
1721  * "absolute requirement" from OSI transport service (but applies
1722  * it to all transports including Internet transports).
1723  * The main effect of that is action on failure to "negotiate" a
1724  * requested option to the exact requested value
1725  *
1726  *          - if the option is an "absolute requirement", the primitive
1727  *            is aborted (e.g T_DISCON_REQ or T_UDERR generated)
1728  *          - if the option is NOT an "absolute requirement" it can
1729  *            just be ignored.
1730  *
1731  * We would not support "negotiating" of options on connection
1732  * primitives for Internet transports. However, just in case we are
1733  * forced to in order to pass strange test suites, the design here
1734  * tries to support these notions.
1735  *
1736  * tpi_optcom_buf(q, mp, opt_lenp, opt_offset, cred, dbobjp, thisdg_attrs,
1737  *	*is_absreq_failurep)
1738  *
1739  * - Verify the option buffer, if formatted badly, return error 1
1740  *
1741  * - If it is a "permissions" failure (read-only), return error 2
1742  *
1743  * - Else, process the option "in place", the following can happen,
1744  *	     - if a "privileged" option, mark it as "ignored".
1745  *	     - if "not supported", mark "ignored"
1746  *	     - if "supported" attempt negotiation and fill result in
1747  *	       the outcome
1748  *			- if "absolute requirement", set "*is_absreq_failurep"
1749  *			- if NOT an "absolute requirement", then our
1750  *			  interpretation is to mark it as ignored if
1751  *			  negotiation fails (Spec allows partial success
1752  *			  as in OSI protocols but not failure)
1753  *
1754  *   Then delete "ignored" options from option buffer and return success.
1755  *
1756  */
1757 int
1758 tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
1759     t_scalar_t opt_offset, cred_t *cr, optdb_obj_t *dbobjp,
1760     void *thisdg_attrs, int *is_absreq_failurep)
1761 {
1762 	opt_set_fn setfn = dbobjp->odb_setfn;
1763 	opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
1764 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
1765 	struct T_opthdr *opt, *opt_start, *opt_end;
1766 	mblk_t  *copy_mp_head;
1767 	uchar_t *optr, *init_optr;
1768 	opdes_t *optd;
1769 	uint_t optset_context;
1770 	t_uscalar_t olen;
1771 	int error = 0;
1772 
1773 	ASSERT((uchar_t *)opt_lenp > mp->b_rptr &&
1774 	    (uchar_t *)opt_lenp < mp->b_wptr);
1775 
1776 	copy_mp_head = NULL;
1777 	*is_absreq_failurep = 0;
1778 	switch (((union T_primitives *)mp->b_rptr)->type) {
1779 	case T_CONN_REQ:
1780 	case T_CONN_RES:
1781 		optset_context = SETFN_CONN_NEGOTIATE;
1782 		break;
1783 	case T_UNITDATA_REQ:
1784 		optset_context = SETFN_UD_NEGOTIATE;
1785 		break;
1786 	default:
1787 		/*
1788 		 * should never get here, all possible TPI primitives
1789 		 * where this can be called from should be accounted
1790 		 * for in the cases above
1791 		 */
1792 		return (EINVAL);
1793 	}
1794 
1795 	if ((opt_start = (struct T_opthdr *)
1796 	    mi_offset_param(mp, opt_offset, *opt_lenp)) == NULL) {
1797 		error = ENOPROTOOPT;
1798 		goto error_ret;
1799 	}
1800 	if (!__TPI_TOPT_ISALIGNED(opt_start)) {
1801 		error = ENOPROTOOPT;
1802 		goto error_ret;
1803 	}
1804 
1805 	opt_end = (struct T_opthdr *)((uchar_t *)opt_start
1806 	    + *opt_lenp);
1807 
1808 	if ((copy_mp_head = copyb(mp)) == (mblk_t *)NULL) {
1809 		error = ENOMEM;
1810 		goto error_ret;
1811 	}
1812 
1813 	init_optr = optr = (uchar_t *)&copy_mp_head->b_rptr[opt_offset];
1814 
1815 	for (opt = opt_start; opt && (opt < opt_end);
1816 	    opt = _TPI_TOPT_NEXTHDR(opt_start, *opt_lenp, opt)) {
1817 		/*
1818 		 * Validate the option for length and alignment
1819 		 * before accessing anything in it
1820 		 */
1821 		if (!_TPI_TOPT_VALID(opt, opt_start, opt_end)) {
1822 			error = ENOPROTOOPT;
1823 			goto error_ret;
1824 		}
1825 
1826 		/* Find the option in the opt_arr. */
1827 		optd = proto_opt_lookup(opt->level, opt->name,
1828 		    opt_arr, opt_arr_cnt);
1829 
1830 		if (optd == NULL) {
1831 			/*
1832 			 * Option not found
1833 			 */
1834 			opt->status = T_NOTSUPPORT;
1835 			continue;
1836 		}
1837 
1838 		/*
1839 		 * Weird but as in XTI spec.
1840 		 * Sec 6.3.6 "Privileged and ReadOnly Options"
1841 		 * Permission problems (e.g.readonly) fail with bad access
1842 		 * BUT "privileged" option request from those NOT PRIVILEGED
1843 		 * are to be merely "ignored".
1844 		 * XXX Prevents "probing" of privileged options ?
1845 		 */
1846 		if (OA_READONLY_PERMISSION(optd, cr)) {
1847 			error = EACCES;
1848 			goto error_ret;
1849 		}
1850 		if (OA_MATCHED_PRIV(optd, cr)) {
1851 			/*
1852 			 * For privileged options, we DO perform
1853 			 * access checks as is common sense
1854 			 */
1855 			if (!OA_WX_ANYPRIV(optd)) {
1856 				error = EACCES;
1857 				goto error_ret;
1858 			}
1859 		} else {
1860 			/*
1861 			 * For non privileged, we fail instead following
1862 			 * "ignore" semantics dictated by XTI spec for
1863 			 * permissions problems.
1864 			 * Sec 6.3.6 "Privileged and ReadOnly Options"
1865 			 * XXX Should we do "ignore" semantics ?
1866 			 */
1867 			if (!OA_WX_NOPRIV(optd)) { /* nopriv */
1868 				opt->status = T_FAILURE;
1869 				continue;
1870 			}
1871 		}
1872 		/*
1873 		 *
1874 		 * If the negotiation fails, for options that
1875 		 * are "absolute requirement", it is a fatal error.
1876 		 * For options that are NOT "absolute requirements",
1877 		 * and the value fails to negotiate, the XTI spec
1878 		 * only considers the possibility of partial success
1879 		 * (T_PARTSUCCES - not likely for Internet protocols).
1880 		 * The spec is in denial about complete failure
1881 		 * (T_FAILURE) to negotiate for options that are
1882 		 * carried on T_CONN_REQ/T_CONN_RES/T_UNITDATA
1883 		 * We interpret the T_FAILURE to negotiate an option
1884 		 * that is NOT an absolute requirement that it is safe
1885 		 * to ignore it.
1886 		 */
1887 
1888 		/* verify length */
1889 		if (!opt_length_ok(optd, opt)) {
1890 			/* bad size */
1891 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
1892 				/* option is absolute requirement */
1893 				*is_absreq_failurep = 1;
1894 				error = EINVAL;
1895 				goto error_ret;
1896 			}
1897 			opt->status = T_FAILURE;
1898 			continue;
1899 		}
1900 
1901 		/*
1902 		 * verified generic attributes. Now call set function.
1903 		 * Note: We assume the following to simplify code.
1904 		 * XXX If this is found not to be valid, this routine
1905 		 * will need to be rewritten. At this point it would
1906 		 * be premature to introduce more complexity than is
1907 		 * needed.
1908 		 * Assumption: For variable length options, we assume
1909 		 * that the value returned will be same or less length
1910 		 * (size does not increase). This makes it OK to pass the
1911 		 * same space for output as it is on input.
1912 		 */
1913 
1914 		error = (*setfn)(q, optset_context, opt->level, opt->name,
1915 		    opt->len - (t_uscalar_t)sizeof (struct T_opthdr),
1916 		    _TPI_TOPT_DATA(opt), &olen, _TPI_TOPT_DATA(opt),
1917 		    thisdg_attrs, cr, NULL);
1918 
1919 		if (olen > (int)(opt->len - sizeof (struct T_opthdr))) {
1920 			/*
1921 			 * Space on output more than space on input. Should
1922 			 * not happen and we consider it a bug/error.
1923 			 * More of a restriction than an error in our
1924 			 * implementation. Will see if we can live with this
1925 			 * otherwise code will get more hairy with multiple
1926 			 * passes.
1927 			 */
1928 			error = EINVAL;
1929 			goto error_ret;
1930 		}
1931 		if (error != 0) {
1932 			if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
1933 				/* option is absolute requirement. */
1934 				*is_absreq_failurep = 1;
1935 				goto error_ret;
1936 			}
1937 			/*
1938 			 * failed - but option "not an absolute
1939 			 * requirement"
1940 			 */
1941 			opt->status = T_FAILURE;
1942 			continue;
1943 		}
1944 		/*
1945 		 * Fill in the only possible successful result
1946 		 * (Note: TPI allows for T_PARTSUCCESS - partial
1947 		 * sucess result code which is relevant in OSI world
1948 		 * and not possible in Internet code)
1949 		 */
1950 		opt->status = T_SUCCESS;
1951 
1952 		/*
1953 		 * Add T_SUCCESS result code options to the "output" options.
1954 		 * No T_FAILURES or T_NOTSUPPORT here as they are to be
1955 		 * ignored.
1956 		 * This code assumes output option buffer will
1957 		 * be <= input option buffer.
1958 		 *
1959 		 * Copy option header+value
1960 		 */
1961 		bcopy(opt, optr, opt->len);
1962 		optr +=  _TPI_ALIGN_TOPT(opt->len);
1963 	}
1964 	/*
1965 	 * Overwrite the input mblk option buffer now with the output
1966 	 * and update length, and contents in original mbl
1967 	 * (offset remains unchanged).
1968 	 */
1969 	*opt_lenp = (t_scalar_t)(optr - init_optr);
1970 	if (*opt_lenp > 0) {
1971 		bcopy(init_optr, opt_start, *opt_lenp);
1972 	}
1973 
1974 error_ret:
1975 	if (copy_mp_head != NULL)
1976 		freeb(copy_mp_head);
1977 	return (error);
1978 }
1979 
1980 static boolean_t
1981 opt_level_valid(t_uscalar_t level, optlevel_t *valid_level_arr,
1982     uint_t valid_level_arr_cnt)
1983 {
1984 	optlevel_t		*olp;
1985 
1986 	for (olp = valid_level_arr;
1987 	    olp < &valid_level_arr[valid_level_arr_cnt];
1988 	    olp++) {
1989 		if (level == (uint_t)(*olp))
1990 			return (B_TRUE);
1991 	}
1992 	return (B_FALSE);
1993 }
1994 
1995 
1996 /*
1997  * Compute largest possible size for an option buffer containing
1998  * all options in one buffer.
1999  *
2000  * XXX TBD, investigate use of opt_bloated_maxsize() to avoid
2001  *     wastefully large buffer allocation.
2002  */
2003 static size_t
2004 opt_level_allopts_lengths(t_uscalar_t level, opdes_t *opt_arr,
2005     uint_t opt_arr_cnt)
2006 {
2007 	opdes_t		*optd;
2008 	size_t allopt_len = 0;	/* 0 implies no option at this level */
2009 
2010 	/*
2011 	 * Scan opt_arr computing aggregate length
2012 	 * requirement for storing values of all
2013 	 * options.
2014 	 * Note: we do not filter for permissions
2015 	 * etc. This will be >= the real aggregate
2016 	 * length required (upper bound).
2017 	 */
2018 
2019 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
2020 	    optd++) {
2021 		if (level == optd->opdes_level) {
2022 			allopt_len += sizeof (struct T_opthdr) +
2023 			    _TPI_ALIGN_TOPT(optd->opdes_size);
2024 		}
2025 	}
2026 	return (allopt_len);	/* 0 implies level not found */
2027 }
2028 
2029 /*
2030  * Compute largest possible size for an option buffer containing
2031  * all options in one buffer - a (theoretical?) worst case scenario
2032  * for certain cases.
2033  */
2034 t_uscalar_t
2035 optcom_max_optbuf_len(opdes_t *opt_arr, uint_t opt_arr_cnt)
2036 {
2037 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2038 	opdes_t		*optd;
2039 
2040 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2041 		max_optbuf_len += (t_uscalar_t)sizeof (struct T_opthdr) +
2042 		    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2043 	}
2044 	return (max_optbuf_len);
2045 }
2046 
2047 /*
2048  * Compute largest possible size for OPT_size for a transport.
2049  * Heuristic used is to add all but certain extremely large
2050  * size options; this is done by calling opt_bloated_maxsize().
2051  * It affects user level allocations in TLI/XTI code using t_alloc()
2052  * and other TLI/XTI implementation instance strucutures.
2053  * The large size options excluded are presumed to be
2054  * never accessed through the (theoretical?) worst case code paths
2055  * through TLI/XTI as they are currently IPv6 specific options.
2056  */
2057 
2058 t_uscalar_t
2059 optcom_max_optsize(opdes_t *opt_arr, uint_t opt_arr_cnt)
2060 {
2061 	t_uscalar_t max_optbuf_len = sizeof (struct T_info_ack);
2062 	opdes_t		*optd;
2063 
2064 	for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
2065 		if (!opt_bloated_maxsize(optd)) {
2066 			max_optbuf_len +=
2067 			    (t_uscalar_t)sizeof (struct T_opthdr) +
2068 			    (t_uscalar_t)_TPI_ALIGN_TOPT(optd->opdes_size);
2069 		}
2070 	}
2071 	return (max_optbuf_len);
2072 }
2073 
2074 /*
2075  * The theoretical model used in optcom_max_optsize() and
2076  * opt_level_allopts_lengths() accounts for the worst case of all
2077  * possible options for the theoretical cases and results in wasteful
2078  * memory allocations for certain theoretically correct usage scenarios.
2079  * In practice, the "features" they support are rarely, if ever,
2080  * used and even then only by test suites for those features (VSU, VST).
2081  * However, they result in large allocations due to the increased transport
2082  * T_INFO_ACK OPT_size field affecting t_alloc() users and TLI/XTI library
2083  * instance data structures for applications.
2084  *
2085  * The following routine opt_bloated_maxsize() supports a hack that avoids
2086  * paying the tax for the bloated options by excluding them and pretending
2087  * they don't exist for certain features without affecting features that
2088  * do use them.
2089  *
2090  * XXX Currently implemented only for optcom_max_optsize()
2091  *     (to reduce risk late in release).
2092  *     TBD for future, investigate use in optcom_level_allopts_lengths() and
2093  *     all the instances of T_ALLOPT processing to exclude "bloated options".
2094  *     Will not affect VSU/VST tests as they do not test with IPPROTO_IPV6
2095  *     level options which are the only ones that fit the "bloated maxsize"
2096  *     option profile now.
2097  */
2098 static boolean_t
2099 opt_bloated_maxsize(opdes_t *optd)
2100 {
2101 	if (optd->opdes_level != IPPROTO_IPV6)
2102 		return (B_FALSE);
2103 	switch (optd->opdes_name) {
2104 	case IPV6_HOPOPTS:
2105 	case IPV6_DSTOPTS:
2106 	case IPV6_RTHDRDSTOPTS:
2107 	case IPV6_RTHDR:
2108 	case IPV6_PATHMTU:
2109 		return (B_TRUE);
2110 	default:
2111 		break;
2112 	}
2113 	return (B_FALSE);
2114 }
2115 
2116 static boolean_t
2117 opt_length_ok(opdes_t *optd, struct T_opthdr *opt)
2118 {
2119 	/*
2120 	 * Verify length.
2121 	 * Value specified should match length of fixed length option or be
2122 	 * less than maxlen of variable length option.
2123 	 */
2124 	if (optd->opdes_props & OP_VARLEN) {
2125 		if (opt->len <= optd->opdes_size +
2126 		    (t_uscalar_t)sizeof (struct T_opthdr))
2127 			return (B_TRUE);
2128 	} else {
2129 		/* fixed length option */
2130 		if (opt->len == optd->opdes_size +
2131 		    (t_uscalar_t)sizeof (struct T_opthdr))
2132 			return (B_TRUE);
2133 	}
2134 	return (B_FALSE);
2135 }
2136 
2137 /*
2138  * This routine appends a pssed in hop-by-hop option to the existing
2139  * option (in this case a cipso label encoded in HOPOPT option). The
2140  * passed in option is always padded. The 'reservelen' is the
2141  * length of reserved data (label). New memory will be allocated if
2142  * the current buffer is not large enough. Return failure if memory
2143  * can not be allocated.
2144  */
2145 int
2146 optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky,
2147     uchar_t **optbufp, uint_t *optlenp, uint_t reservelen)
2148 {
2149 	uchar_t *optbuf;
2150 	uchar_t	*optp;
2151 
2152 	if (!sticky) {
2153 		*optbufp = invalp;
2154 		*optlenp = inlen;
2155 		return (0);
2156 	}
2157 
2158 	if (inlen == *optlenp - reservelen) {
2159 		/* Unchanged length - no need to reallocate */
2160 		optp = *optbufp + reservelen;
2161 		bcopy(invalp, optp, inlen);
2162 		if (reservelen != 0) {
2163 			/*
2164 			 * Convert the NextHeader and Length of the
2165 			 * passed in hop-by-hop header to pads
2166 			 */
2167 			optp[0] = IP6OPT_PADN;
2168 			optp[1] = 0;
2169 		}
2170 		return (0);
2171 	}
2172 	if (inlen + reservelen > 0) {
2173 		/* Allocate new buffer before free */
2174 		optbuf = kmem_alloc(inlen + reservelen, KM_NOSLEEP);
2175 		if (optbuf == NULL)
2176 			return (ENOMEM);
2177 	} else {
2178 		optbuf = NULL;
2179 	}
2180 
2181 	/* Copy out old reserved data (label) */
2182 	if (reservelen > 0)
2183 		bcopy(*optbufp, optbuf, reservelen);
2184 
2185 	/* Free old buffer */
2186 	if (*optlenp != 0)
2187 		kmem_free(*optbufp, *optlenp);
2188 
2189 	if (inlen > 0)
2190 		bcopy(invalp, optbuf + reservelen, inlen);
2191 
2192 	if (reservelen != 0) {
2193 		/*
2194 		 * Convert the NextHeader and Length of the
2195 		 * passed in hop-by-hop header to pads
2196 		 */
2197 		optbuf[reservelen] = IP6OPT_PADN;
2198 		optbuf[reservelen + 1] = 0;
2199 		/*
2200 		 * Set the Length of the hop-by-hop header, number of 8
2201 		 * byte-words following the 1st 8 bytes
2202 		 */
2203 		optbuf[1] = (reservelen + inlen - 1) >> 3;
2204 	}
2205 	*optbufp = optbuf;
2206 	*optlenp = inlen + reservelen;
2207 	return (0);
2208 }
2209 
2210 int
2211 process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen,
2212     void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, uint_t, int,
2213     int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *))
2214 {
2215 	struct cmsghdr *cmsg;
2216 	opdes_t *optd;
2217 	t_uscalar_t outlen;
2218 	int error = EOPNOTSUPP;
2219 	t_uscalar_t len;
2220 	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
2221 	opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
2222 
2223 	for (cmsg = (struct cmsghdr *)control;
2224 	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
2225 	    cmsg = CMSG_NEXT(cmsg)) {
2226 
2227 		len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
2228 		/* Find the option in the opt_arr. */
2229 		optd = proto_opt_lookup(cmsg->cmsg_level, cmsg->cmsg_type,
2230 		    opt_arr, opt_arr_cnt);
2231 		if (optd == NULL) {
2232 			return (EINVAL);
2233 		}
2234 		if (OA_READONLY_PERMISSION(optd, connp->conn_cred)) {
2235 			return (EACCES);
2236 		}
2237 		if (OA_MATCHED_PRIV(optd, connp->conn_cred)) {
2238 			/*
2239 			 * For privileged options, we DO perform
2240 			 * access checks as is common sense
2241 			 */
2242 			if (!OA_WX_ANYPRIV(optd)) {
2243 				return (EACCES);
2244 			}
2245 		} else {
2246 			/*
2247 			 * For non privileged, we fail instead following
2248 			 * "ignore" semantics dictated by XTI spec for
2249 			 * permissions problems.
2250 			 */
2251 			if (!OA_WX_NOPRIV(optd)) { /* nopriv */
2252 				return (EACCES);
2253 			}
2254 		}
2255 		error = opt_set_fn(connp, SETFN_UD_NEGOTIATE, optd->opdes_level,
2256 		    optd->opdes_name, len, (uchar_t *)CMSG_CONTENT(cmsg),
2257 		    &outlen, (uchar_t *)CMSG_CONTENT(cmsg), (void *)optbuf,
2258 		    connp->conn_cred);
2259 		if (error > 0) {
2260 			return (error);
2261 		} else if (outlen > len) {
2262 			return (EINVAL);
2263 		} else {
2264 			/*
2265 			 * error can be -ve if the protocol wants to
2266 			 * pass the option to IP. We donot pass auxiliary
2267 			 * options to IP.
2268 			 */
2269 			error = 0;
2270 		}
2271 	}
2272 	return (error);
2273 }
2274