xref: /illumos-gate/usr/src/uts/common/os/msg.c (revision ac2ff9f2)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5b2eb1770Sudpa  * Common Development and Distribution License (the "License").
6b2eb1770Sudpa  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22005d3febSMarek Pospisil  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27*ac2ff9f2SToomas Soome /*	  All Rights Reserved	*/
287c478bd9Sstevel@tonic-gate 
297c478bd9Sstevel@tonic-gate 
307c478bd9Sstevel@tonic-gate /*
317c478bd9Sstevel@tonic-gate  * Inter-Process Communication Message Facility.
327c478bd9Sstevel@tonic-gate  *
337c478bd9Sstevel@tonic-gate  * See os/ipc.c for a description of common IPC functionality.
347c478bd9Sstevel@tonic-gate  *
357c478bd9Sstevel@tonic-gate  * Resource controls
367c478bd9Sstevel@tonic-gate  * -----------------
377c478bd9Sstevel@tonic-gate  *
38824c205fSml  * Control:      zone.max-msg-ids (rc_zone_msgmni)
39824c205fSml  * Description:  Maximum number of message queue ids allowed a zone.
40824c205fSml  *
41824c205fSml  *   When msgget() is used to allocate a message queue, one id is
42824c205fSml  *   allocated.  If the id allocation doesn't succeed, msgget() fails
43824c205fSml  *   and errno is set to ENOSPC.  Upon successful msgctl(, IPC_RMID)
44824c205fSml  *   the id is deallocated.
45824c205fSml  *
467c478bd9Sstevel@tonic-gate  * Control:      project.max-msg-ids (rc_project_msgmni)
477c478bd9Sstevel@tonic-gate  * Description:  Maximum number of message queue ids allowed a project.
487c478bd9Sstevel@tonic-gate  *
497c478bd9Sstevel@tonic-gate  *   When msgget() is used to allocate a message queue, one id is
507c478bd9Sstevel@tonic-gate  *   allocated.  If the id allocation doesn't succeed, msgget() fails
517c478bd9Sstevel@tonic-gate  *   and errno is set to ENOSPC.  Upon successful msgctl(, IPC_RMID)
527c478bd9Sstevel@tonic-gate  *   the id is deallocated.
537c478bd9Sstevel@tonic-gate  *
547c478bd9Sstevel@tonic-gate  * Control:      process.max-msg-qbytes (rc_process_msgmnb)
557c478bd9Sstevel@tonic-gate  * Description:  Maximum number of bytes of messages on a message queue.
567c478bd9Sstevel@tonic-gate  *
577c478bd9Sstevel@tonic-gate  *   When msgget() successfully allocates a message queue, the minimum
587c478bd9Sstevel@tonic-gate  *   enforced value of this limit is used to initialize msg_qbytes.
597c478bd9Sstevel@tonic-gate  *
607c478bd9Sstevel@tonic-gate  * Control:      process.max-msg-messages (rc_process_msgtql)
617c478bd9Sstevel@tonic-gate  * Description:  Maximum number of messages on a message queue.
627c478bd9Sstevel@tonic-gate  *
637c478bd9Sstevel@tonic-gate  *   When msgget() successfully allocates a message queue, the minimum
647c478bd9Sstevel@tonic-gate  *   enforced value of this limit is used to initialize a per-queue
657c478bd9Sstevel@tonic-gate  *   limit on the number of messages.
667c478bd9Sstevel@tonic-gate  */
677c478bd9Sstevel@tonic-gate 
687c478bd9Sstevel@tonic-gate #include <sys/types.h>
697c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
707c478bd9Sstevel@tonic-gate #include <sys/param.h>
717c478bd9Sstevel@tonic-gate #include <sys/cred.h>
727c478bd9Sstevel@tonic-gate #include <sys/user.h>
737c478bd9Sstevel@tonic-gate #include <sys/proc.h>
747c478bd9Sstevel@tonic-gate #include <sys/time.h>
757c478bd9Sstevel@tonic-gate #include <sys/ipc.h>
767c478bd9Sstevel@tonic-gate #include <sys/ipc_impl.h>
777c478bd9Sstevel@tonic-gate #include <sys/msg.h>
787c478bd9Sstevel@tonic-gate #include <sys/msg_impl.h>
797c478bd9Sstevel@tonic-gate #include <sys/list.h>
807c478bd9Sstevel@tonic-gate #include <sys/systm.h>
817c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
827c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
837c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
847c478bd9Sstevel@tonic-gate #include <sys/ddi.h>
857c478bd9Sstevel@tonic-gate #include <sys/errno.h>
867c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
877c478bd9Sstevel@tonic-gate #include <sys/debug.h>
887c478bd9Sstevel@tonic-gate #include <sys/project.h>
897c478bd9Sstevel@tonic-gate #include <sys/modctl.h>
907c478bd9Sstevel@tonic-gate #include <sys/syscall.h>
917c478bd9Sstevel@tonic-gate #include <sys/policy.h>
927c478bd9Sstevel@tonic-gate #include <sys/zone.h>
937c478bd9Sstevel@tonic-gate 
947c478bd9Sstevel@tonic-gate #include <c2/audit.h>
957c478bd9Sstevel@tonic-gate 
967c478bd9Sstevel@tonic-gate /*
977c478bd9Sstevel@tonic-gate  * The following tunables are obsolete.  Though for compatibility we
987c478bd9Sstevel@tonic-gate  * still read and interpret msginfo_msgmnb, msginfo_msgmni, and
997c478bd9Sstevel@tonic-gate  * msginfo_msgtql (see os/project.c and os/rctl_proc.c), the preferred
1007c478bd9Sstevel@tonic-gate  * mechanism for administrating the IPC Message facility is through the
1017c478bd9Sstevel@tonic-gate  * resource controls described at the top of this file.
1027c478bd9Sstevel@tonic-gate  */
1037c478bd9Sstevel@tonic-gate size_t	msginfo_msgmax = 2048;	/* (obsolete) */
1047c478bd9Sstevel@tonic-gate size_t	msginfo_msgmnb = 4096;	/* (obsolete) */
1057c478bd9Sstevel@tonic-gate int	msginfo_msgmni = 50;	/* (obsolete) */
1067c478bd9Sstevel@tonic-gate int	msginfo_msgtql = 40;	/* (obsolete) */
1077c478bd9Sstevel@tonic-gate int	msginfo_msgssz = 8;	/* (obsolete) */
1087c478bd9Sstevel@tonic-gate int	msginfo_msgmap = 0;	/* (obsolete) */
1097c478bd9Sstevel@tonic-gate ushort_t msginfo_msgseg = 1024;	/* (obsolete) */
1107c478bd9Sstevel@tonic-gate 
111824c205fSml extern rctl_hndl_t rc_zone_msgmni;
1127c478bd9Sstevel@tonic-gate extern rctl_hndl_t rc_project_msgmni;
1137c478bd9Sstevel@tonic-gate extern rctl_hndl_t rc_process_msgmnb;
1147c478bd9Sstevel@tonic-gate extern rctl_hndl_t rc_process_msgtql;
1157c478bd9Sstevel@tonic-gate static ipc_service_t *msq_svc;
1167c478bd9Sstevel@tonic-gate static zone_key_t msg_zone_key;
1177c478bd9Sstevel@tonic-gate 
1187c478bd9Sstevel@tonic-gate static void msg_dtor(kipc_perm_t *);
1197c478bd9Sstevel@tonic-gate static void msg_rmid(kipc_perm_t *);
1207c478bd9Sstevel@tonic-gate static void msg_remove_zone(zoneid_t, void *);
1217c478bd9Sstevel@tonic-gate 
1227c478bd9Sstevel@tonic-gate /*
1237c478bd9Sstevel@tonic-gate  * Module linkage information for the kernel.
1247c478bd9Sstevel@tonic-gate  */
1257c478bd9Sstevel@tonic-gate static ssize_t msgsys(int opcode, uintptr_t a0, uintptr_t a1, uintptr_t a2,
1267c478bd9Sstevel@tonic-gate 	uintptr_t a4, uintptr_t a5);
1277c478bd9Sstevel@tonic-gate 
1287c478bd9Sstevel@tonic-gate static struct sysent ipcmsg_sysent = {
1297c478bd9Sstevel@tonic-gate 	6,
1307c478bd9Sstevel@tonic-gate #ifdef	_LP64
1317c478bd9Sstevel@tonic-gate 	SE_ARGC | SE_NOUNLOAD | SE_64RVAL,
1327c478bd9Sstevel@tonic-gate #else
1337c478bd9Sstevel@tonic-gate 	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
1347c478bd9Sstevel@tonic-gate #endif
135*ac2ff9f2SToomas Soome 	(int (*)())(uintptr_t)msgsys
1367c478bd9Sstevel@tonic-gate };
1377c478bd9Sstevel@tonic-gate 
#ifdef	_SYSCALL32_IMPL
static ssize32_t msgsys32(int opcode, uint32_t a0, uint32_t a1, uint32_t a2,
    uint32_t a4, uint32_t a5);

/*
 * System call table entry for msgsys32(), built only when
 * _SYSCALL32_IMPL is defined.
 */
static struct sysent ipcmsg_sysent32 = {
	6,
	SE_ARGC | SE_NOUNLOAD | SE_32RVAL1,
	msgsys32
};
#endif	/* _SYSCALL32_IMPL */
1487c478bd9Sstevel@tonic-gate 
1497c478bd9Sstevel@tonic-gate static struct modlsys modlsys = {
1507c478bd9Sstevel@tonic-gate 	&mod_syscallops, "System V message facility", &ipcmsg_sysent
1517c478bd9Sstevel@tonic-gate };
1527c478bd9Sstevel@tonic-gate 
1537c478bd9Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
1547c478bd9Sstevel@tonic-gate static struct modlsys modlsys32 = {
1557c478bd9Sstevel@tonic-gate 	&mod_syscallops32, "32-bit System V message facility", &ipcmsg_sysent32
1567c478bd9Sstevel@tonic-gate };
1577c478bd9Sstevel@tonic-gate #endif
1587c478bd9Sstevel@tonic-gate 
1592c5b6df1Sdv /*
1602c5b6df1Sdv  *      Big Theory statement for message queue correctness
1612c5b6df1Sdv  *
 * The msgrcv and msgsnd functions no longer use cv_broadcast to wake up
 * receivers who are waiting for an event.  Using the cv_broadcast method
 * resulted in negative scaling when the number of waiting receivers is large
 * (the thundering herd problem).  Instead, the receivers waiting to receive a
 * message are now linked in a queue-like fashion and awakened one at a time
 * in a controlled manner.
1682c5b6df1Sdv  *
1692c5b6df1Sdv  * Receivers can block on two different classes of waiting list:
1702c5b6df1Sdv  *    1) "sendwait" list, which is the more complex list of the two.  The
1712c5b6df1Sdv  *	  receiver will be awakened by a sender posting a new message.  There
1722c5b6df1Sdv  *	  are two types of "sendwait" list used:
 *		a) msg_wait_snd: handles all receivers who are looking for
 *		   a message type >= 0, but were unable to locate a match.
1752c5b6df1Sdv  *
1762c5b6df1Sdv  *		   slot 0: reserved for receivers that have designated they
1772c5b6df1Sdv  *			   will take any message type.
1782c5b6df1Sdv  *		   rest:   consist of receivers requesting a specific type
1792c5b6df1Sdv  *			   but the type was not present.  The entries are
1802c5b6df1Sdv  *			   hashed into a bucket in an attempt to keep
1812c5b6df1Sdv  *			   any list search relatively short.
182*ac2ff9f2SToomas Soome  *		b) msg_wait_snd_ngt: handles all receivers that have designated
1832c5b6df1Sdv  *		   a negative message type. Unlike msg_wait_snd, the hash bucket
1842c5b6df1Sdv  *		   serves a range of negative message types (-1 to -5, -6 to -10
1852c5b6df1Sdv  *		   and so forth), where the last bucket is reserved for all the
1862c5b6df1Sdv  *		   negative message types that hash outside of MSG_MAX_QNUM - 1.
1872c5b6df1Sdv  *		   This is done this way to simplify the operation of locating a
1882c5b6df1Sdv  *		   negative message type.
1892c5b6df1Sdv  *
1902c5b6df1Sdv  *    2) "copyout" list, where the receiver is awakened by another
1912c5b6df1Sdv  *	 receiver after a message is copied out.  This is a linked list
1922c5b6df1Sdv  *	 of waiters that are awakened one at a time.  Although the solution is
1932c5b6df1Sdv  *	 not optimal, the complexity that would be added in for waking
1942c5b6df1Sdv  *	 up the right entry far exceeds any potential pay back (too many
1952c5b6df1Sdv  *	 correctness and corner case issues).
1962c5b6df1Sdv  *
1972c5b6df1Sdv  * The lists are doubly linked.  In the case of the "sendwait"
1982c5b6df1Sdv  * list, this allows the thread to remove itself from the list without having
1992c5b6df1Sdv  * to traverse the list.  In the case of the "copyout" list it simply allows
2002c5b6df1Sdv  * us to use common functions with the "sendwait" list.
2012c5b6df1Sdv  *
2022c5b6df1Sdv  * To make sure receivers are not hung out to dry, we must guarantee:
2032c5b6df1Sdv  *    1. If any queued message matches any receiver, then at least one
2042c5b6df1Sdv  *       matching receiver must be processing the request.
 *    2. Blocking on the copyout queue is only temporary while messages
 *	 are being copied out.  The process is guaranteed to wake up
 *	 when it gets to the front of the queue (copyout is a FIFO).
2082c5b6df1Sdv  *
2092c5b6df1Sdv  * Rules for blocking and waking up:
2102c5b6df1Sdv  *   1. A receiver entering msgrcv must examine all messages for a match
2112c5b6df1Sdv  *      before blocking on a sendwait queue.
 *   2. If the receiver blocks because the message it chose is already
 *	being copied out, then when it wakes up it needs to start
 *	checking the messages from the beginning.
 *   3) Whenever a process returns from msgrcv for any reason, if it
 *	had attempted to copy a message or blocked waiting for a copy
 *	to complete it needs to wake up the next receiver blocked on
 *	a copy out.
2192c5b6df1Sdv  *   4) When a message is sent, the sender selects a process waiting
2202c5b6df1Sdv  *	for that type of message.  This selection process rotates between
2212c5b6df1Sdv  *	receivers types of 0, negative and positive to prevent starvation of
2222c5b6df1Sdv  *	any one particular receiver type.
2232c5b6df1Sdv  *   5) The following are the scenarios for processes that are awakened
2242c5b6df1Sdv  *	by a msgsnd:
2252c5b6df1Sdv  *		a) The process finds the message and is able to copy
2262c5b6df1Sdv  *		   it out.  Once complete, the process returns.
2272c5b6df1Sdv  *		b) The message that was sent that triggered the wakeup is no
2282c5b6df1Sdv  *		   longer available (another process found the message first).
2292c5b6df1Sdv  *		   We issue a wakeup on copy queue and then go back to
2302c5b6df1Sdv  *		   sleep waiting for another matching message to be sent.
2312c5b6df1Sdv  *		c) The message that was supposed to be processed was
2322c5b6df1Sdv  *		   already serviced by another process.  However a different
2332c5b6df1Sdv  *		   message is present which we can service.  The message
2342c5b6df1Sdv  *		   is copied and the process returns.
2352c5b6df1Sdv  *		d) The message is found, but some sort of error occurs that
2362c5b6df1Sdv  *		   prevents the message from being copied.  The receiver
2372c5b6df1Sdv  *		   wakes up the next sender that can service this message
2382c5b6df1Sdv  *		   type and returns an error to the caller.
2392c5b6df1Sdv  *		e) The message is found, but it is marked as being copied
2402c5b6df1Sdv  *		   out.  The receiver then goes to sleep on the copyout
2412c5b6df1Sdv  *		   queue where it will be awakened again sometime in the future.
2422c5b6df1Sdv  *
2432c5b6df1Sdv  *
2442c5b6df1Sdv  *   6) Whenever a message is found that matches the message type designated,
245*ac2ff9f2SToomas Soome  *	but is being copied out we have to block on the copyout queue.
 *	After the process doing the copy finishes the copy out, it must wake
 *	up (either directly or indirectly) all receivers who blocked on its
 *	copyout, so they are guaranteed a chance to examine the remaining
 *	messages.
2492c5b6df1Sdv  *	This is implemented via a chain of wakeups: Y wakes X, who wakes Z,
2502c5b6df1Sdv  *	and so on.  The chain cannot be broken.  This leads to the following
2512c5b6df1Sdv  *	cases:
 *		a) When a receiver has finished copying the message (or has
 *		   encountered an error), the first entry on the copyout
 *		   queue is woken up.
2552c5b6df1Sdv  *		b) When the receiver is woken up, it attempts to locate
2562c5b6df1Sdv  *		   a message type match.
2572c5b6df1Sdv  *		c) If a message type is found and
2582c5b6df1Sdv  *			-- MSG_RCVCOPY flag is not set, the message is
2592c5b6df1Sdv  *			   marked for copying out.  Regardless of the copyout
2602c5b6df1Sdv  *			   success the next entry on the copyout queue is
2612c5b6df1Sdv  *			   awakened and the operation is completed.
2622c5b6df1Sdv  *			-- MSG_RCVCOPY is set, we simply go back to sleep again
2632c5b6df1Sdv  *			   on the copyout queue.
2642c5b6df1Sdv  *		d) If the message type is not found then we wakeup the next
2652c5b6df1Sdv  *		   process on the copyout queue.
 *   7) If a msgsnd is unable to complete for any of the following reasons
267eb9fe4caSDavid Valin  *	  a) the msgq has no space for the message
268eb9fe4caSDavid Valin  *	  b) the maximum number of messages allowed has been reached
269eb9fe4caSDavid Valin  *      then one of two things happen:
270eb9fe4caSDavid Valin  *	  1) If the passed in msg_flag has IPC_NOWAIT set, then
271eb9fe4caSDavid Valin  *	     an error is returned.
 *	  2) The IPC_NOWAIT bit is not set in msg_flag, then the
 *	     thread is put to sleep until the request can be
 *	     serviced.
275eb9fe4caSDavid Valin  *   8) When waking a thread waiting to send a message, a check is done to
276eb9fe4caSDavid Valin  *      verify that the operation being asked for by the thread will complete.
277eb9fe4caSDavid Valin  *      This decision making process is done in a loop where the oldest request
278eb9fe4caSDavid Valin  *      is checked first. The search will continue until there is no more
279eb9fe4caSDavid Valin  *	room on the msgq or we have checked all the waiters.
2802c5b6df1Sdv  */
2812c5b6df1Sdv 
282e5994f96Sdv static uint_t msg_type_hash(long);
2832c5b6df1Sdv static int msgq_check_err(kmsqid_t *qp, int cvres);
2842c5b6df1Sdv static int msg_rcvq_sleep(list_t *, msgq_wakeup_t *, kmutex_t **,
2852c5b6df1Sdv     kmsqid_t *);
2862c5b6df1Sdv static int msg_copyout(kmsqid_t *, long, kmutex_t **, size_t *, size_t,
2872c5b6df1Sdv     struct msg *, struct ipcmsgbuf *, int);
2882c5b6df1Sdv static void msg_rcvq_wakeup_all(list_t *);
289eb9fe4caSDavid Valin static void msg_wakeup_senders(kmsqid_t *);
2902c5b6df1Sdv static void msg_wakeup_rdr(kmsqid_t *, msg_select_t **, long);
2912c5b6df1Sdv static msgq_wakeup_t *msg_fnd_any_snd(kmsqid_t *, int, long);
2922c5b6df1Sdv static msgq_wakeup_t *msg_fnd_any_rdr(kmsqid_t *, int, long);
2932c5b6df1Sdv static msgq_wakeup_t *msg_fnd_neg_snd(kmsqid_t *, int, long);
2942c5b6df1Sdv static msgq_wakeup_t *msg_fnd_spc_snd(kmsqid_t *, int, long);
2952c5b6df1Sdv static struct msg *msgrcv_lookup(kmsqid_t *, long);
2962c5b6df1Sdv 
2972c5b6df1Sdv msg_select_t msg_fnd_sndr[] = {
2982c5b6df1Sdv 	{ msg_fnd_any_snd, &msg_fnd_sndr[1] },
2992c5b6df1Sdv 	{ msg_fnd_spc_snd, &msg_fnd_sndr[2] },
3002c5b6df1Sdv 	{ msg_fnd_neg_snd, &msg_fnd_sndr[0] }
3012c5b6df1Sdv };
3022c5b6df1Sdv 
3032c5b6df1Sdv msg_select_t msg_fnd_rdr[1] = {
3042c5b6df1Sdv 	{ msg_fnd_any_rdr, &msg_fnd_rdr[0] },
3052c5b6df1Sdv };
3062c5b6df1Sdv 
3077c478bd9Sstevel@tonic-gate static struct modlinkage modlinkage = {
3087c478bd9Sstevel@tonic-gate 	MODREV_1,
3097c478bd9Sstevel@tonic-gate 	&modlsys,
3107c478bd9Sstevel@tonic-gate #ifdef _SYSCALL32_IMPL
3117c478bd9Sstevel@tonic-gate 	&modlsys32,
3127c478bd9Sstevel@tonic-gate #endif
3137c478bd9Sstevel@tonic-gate 	NULL
3147c478bd9Sstevel@tonic-gate };
3157c478bd9Sstevel@tonic-gate 
/*
 * All-ones size_t value.  The expansion is fully parenthesized so that it
 * behaves as a single operand in every context; without the outer
 * parentheses "sizeof MSG_SMALL_INIT" would parse as "sizeof (size_t) - 1".
 */
#define	MSG_SMALL_INIT ((size_t)-1)
3177c478bd9Sstevel@tonic-gate int
_init(void)3187c478bd9Sstevel@tonic-gate _init(void)
3197c478bd9Sstevel@tonic-gate {
3207c478bd9Sstevel@tonic-gate 	int result;
3217c478bd9Sstevel@tonic-gate 
322824c205fSml 	msq_svc = ipcs_create("msqids", rc_project_msgmni, rc_zone_msgmni,
323824c205fSml 	    sizeof (kmsqid_t), msg_dtor, msg_rmid, AT_IPC_MSG,
324824c205fSml 	    offsetof(ipc_rqty_t, ipcq_msgmni));
3257c478bd9Sstevel@tonic-gate 	zone_key_create(&msg_zone_key, NULL, msg_remove_zone, NULL);
3267c478bd9Sstevel@tonic-gate 
3277c478bd9Sstevel@tonic-gate 	if ((result = mod_install(&modlinkage)) == 0)
3287c478bd9Sstevel@tonic-gate 		return (0);
3297c478bd9Sstevel@tonic-gate 
3307c478bd9Sstevel@tonic-gate 	(void) zone_key_delete(msg_zone_key);
3317c478bd9Sstevel@tonic-gate 	ipcs_destroy(msq_svc);
3327c478bd9Sstevel@tonic-gate 
3337c478bd9Sstevel@tonic-gate 	return (result);
3347c478bd9Sstevel@tonic-gate }
3357c478bd9Sstevel@tonic-gate 
/*
 * Module unload entry point.  Always reports EBUSY.
 */
int
_fini(void)
{
	return (EBUSY);
}
3417c478bd9Sstevel@tonic-gate 
3427c478bd9Sstevel@tonic-gate int
_info(struct modinfo * modinfop)3437c478bd9Sstevel@tonic-gate _info(struct modinfo *modinfop)
3447c478bd9Sstevel@tonic-gate {
3457c478bd9Sstevel@tonic-gate 	return (mod_info(&modlinkage, modinfop));
3467c478bd9Sstevel@tonic-gate }
3477c478bd9Sstevel@tonic-gate 
3487c478bd9Sstevel@tonic-gate static void
msg_dtor(kipc_perm_t * perm)3497c478bd9Sstevel@tonic-gate msg_dtor(kipc_perm_t *perm)
3507c478bd9Sstevel@tonic-gate {
3517c478bd9Sstevel@tonic-gate 	kmsqid_t *qp = (kmsqid_t *)perm;
352b2eb1770Sudpa 	int		ii;
3537c478bd9Sstevel@tonic-gate 
3542c5b6df1Sdv 	for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
3552c5b6df1Sdv 		ASSERT(list_is_empty(&qp->msg_wait_snd[ii]));
3562c5b6df1Sdv 		ASSERT(list_is_empty(&qp->msg_wait_snd_ngt[ii]));
3572c5b6df1Sdv 		list_destroy(&qp->msg_wait_snd[ii]);
3582c5b6df1Sdv 		list_destroy(&qp->msg_wait_snd_ngt[ii]);
3592c5b6df1Sdv 	}
3602c5b6df1Sdv 	ASSERT(list_is_empty(&qp->msg_cpy_block));
361eb9fe4caSDavid Valin 	ASSERT(list_is_empty(&qp->msg_wait_rcv));
3622c5b6df1Sdv 	list_destroy(&qp->msg_cpy_block);
3637c478bd9Sstevel@tonic-gate 	ASSERT(qp->msg_snd_cnt == 0);
3647c478bd9Sstevel@tonic-gate 	ASSERT(qp->msg_cbytes == 0);
3657c478bd9Sstevel@tonic-gate 	list_destroy(&qp->msg_list);
366eb9fe4caSDavid Valin 	list_destroy(&qp->msg_wait_rcv);
3677c478bd9Sstevel@tonic-gate }
3687c478bd9Sstevel@tonic-gate 
3697c478bd9Sstevel@tonic-gate 
/* msg_hold - take a reference on the message by bumping msg_copycnt. */
#define	msg_hold(mp)	((mp)->msg_copycnt++)
3717c478bd9Sstevel@tonic-gate 
3727c478bd9Sstevel@tonic-gate /*
3737c478bd9Sstevel@tonic-gate  * msg_rele - decrement the reference count on the message.  When count
3747c478bd9Sstevel@tonic-gate  * reaches zero, free message header and contents.
3757c478bd9Sstevel@tonic-gate  */
3767c478bd9Sstevel@tonic-gate static void
msg_rele(struct msg * mp)3777c478bd9Sstevel@tonic-gate msg_rele(struct msg *mp)
3787c478bd9Sstevel@tonic-gate {
3797c478bd9Sstevel@tonic-gate 	ASSERT(mp->msg_copycnt > 0);
3807c478bd9Sstevel@tonic-gate 	if (mp->msg_copycnt-- == 1) {
3817c478bd9Sstevel@tonic-gate 		if (mp->msg_addr)
3827c478bd9Sstevel@tonic-gate 			kmem_free(mp->msg_addr, mp->msg_size);
3837c478bd9Sstevel@tonic-gate 		kmem_free(mp, sizeof (struct msg));
3847c478bd9Sstevel@tonic-gate 	}
3857c478bd9Sstevel@tonic-gate }
3867c478bd9Sstevel@tonic-gate 
3877c478bd9Sstevel@tonic-gate /*
3887c478bd9Sstevel@tonic-gate  * msgunlink - Unlink msg from queue, decrement byte count and wake up anyone
3897c478bd9Sstevel@tonic-gate  * waiting for free bytes on queue.
3907c478bd9Sstevel@tonic-gate  *
3917c478bd9Sstevel@tonic-gate  * Called with queue locked.
3927c478bd9Sstevel@tonic-gate  */
3937c478bd9Sstevel@tonic-gate static void
msgunlink(kmsqid_t * qp,struct msg * mp)3947c478bd9Sstevel@tonic-gate msgunlink(kmsqid_t *qp, struct msg *mp)
3957c478bd9Sstevel@tonic-gate {
3967c478bd9Sstevel@tonic-gate 	list_remove(&qp->msg_list, mp);
3977c478bd9Sstevel@tonic-gate 	qp->msg_qnum--;
3987c478bd9Sstevel@tonic-gate 	qp->msg_cbytes -= mp->msg_size;
3997c478bd9Sstevel@tonic-gate 	msg_rele(mp);
4007c478bd9Sstevel@tonic-gate 
4017c478bd9Sstevel@tonic-gate 	/* Wake up waiting writers */
402eb9fe4caSDavid Valin 	msg_wakeup_senders(qp);
4037c478bd9Sstevel@tonic-gate }
4047c478bd9Sstevel@tonic-gate 
4057c478bd9Sstevel@tonic-gate static void
msg_rmid(kipc_perm_t * perm)4067c478bd9Sstevel@tonic-gate msg_rmid(kipc_perm_t *perm)
4077c478bd9Sstevel@tonic-gate {
4087c478bd9Sstevel@tonic-gate 	kmsqid_t *qp = (kmsqid_t *)perm;
4097c478bd9Sstevel@tonic-gate 	struct msg *mp;
410b2eb1770Sudpa 	int		ii;
4117c478bd9Sstevel@tonic-gate 
4127c478bd9Sstevel@tonic-gate 
4137c478bd9Sstevel@tonic-gate 	while ((mp = list_head(&qp->msg_list)) != NULL)
4147c478bd9Sstevel@tonic-gate 		msgunlink(qp, mp);
4157c478bd9Sstevel@tonic-gate 	ASSERT(qp->msg_cbytes == 0);
4167c478bd9Sstevel@tonic-gate 
4172c5b6df1Sdv 	/*
4182c5b6df1Sdv 	 * Wake up everyone who is in a wait state of some sort
4192c5b6df1Sdv 	 * for this message queue.
4202c5b6df1Sdv 	 */
4212c5b6df1Sdv 	for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
4222c5b6df1Sdv 		msg_rcvq_wakeup_all(&qp->msg_wait_snd[ii]);
4232c5b6df1Sdv 		msg_rcvq_wakeup_all(&qp->msg_wait_snd_ngt[ii]);
424b2eb1770Sudpa 	}
4252c5b6df1Sdv 	msg_rcvq_wakeup_all(&qp->msg_cpy_block);
426eb9fe4caSDavid Valin 	msg_rcvq_wakeup_all(&qp->msg_wait_rcv);
4277c478bd9Sstevel@tonic-gate }
4287c478bd9Sstevel@tonic-gate 
4297c478bd9Sstevel@tonic-gate /*
4307c478bd9Sstevel@tonic-gate  * msgctl system call.
4317c478bd9Sstevel@tonic-gate  *
4327c478bd9Sstevel@tonic-gate  * gets q lock (via ipc_lookup), releases before return.
4337c478bd9Sstevel@tonic-gate  * may call users of msg_lock
4347c478bd9Sstevel@tonic-gate  */
4357c478bd9Sstevel@tonic-gate static int
msgctl(int msgid,int cmd,void * arg)4367c478bd9Sstevel@tonic-gate msgctl(int msgid, int cmd, void *arg)
4377c478bd9Sstevel@tonic-gate {
4387c478bd9Sstevel@tonic-gate 	STRUCT_DECL(msqid_ds, ds);		/* SVR4 queue work area */
4397c478bd9Sstevel@tonic-gate 	kmsqid_t		*qp;		/* ptr to associated q */
4402c5b6df1Sdv 	int			error;
4417c478bd9Sstevel@tonic-gate 	struct	cred		*cr;
4427c478bd9Sstevel@tonic-gate 	model_t	mdl = get_udatamodel();
4437c478bd9Sstevel@tonic-gate 	struct msqid_ds64	ds64;
4447c478bd9Sstevel@tonic-gate 	kmutex_t		*lock;
4457c478bd9Sstevel@tonic-gate 	proc_t			*pp = curproc;
4467c478bd9Sstevel@tonic-gate 
4477c478bd9Sstevel@tonic-gate 	STRUCT_INIT(ds, mdl);
4487c478bd9Sstevel@tonic-gate 	cr = CRED();
4497c478bd9Sstevel@tonic-gate 
4507c478bd9Sstevel@tonic-gate 	/*
4517c478bd9Sstevel@tonic-gate 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
4527c478bd9Sstevel@tonic-gate 	 */
4537c478bd9Sstevel@tonic-gate 	switch (cmd) {
4547c478bd9Sstevel@tonic-gate 	case IPC_SET:
4557c478bd9Sstevel@tonic-gate 		if (copyin(arg, STRUCT_BUF(ds), STRUCT_SIZE(ds)))
4567c478bd9Sstevel@tonic-gate 			return (set_errno(EFAULT));
4577c478bd9Sstevel@tonic-gate 		break;
4587c478bd9Sstevel@tonic-gate 
4597c478bd9Sstevel@tonic-gate 	case IPC_SET64:
4607c478bd9Sstevel@tonic-gate 		if (copyin(arg, &ds64, sizeof (struct msqid_ds64)))
4617c478bd9Sstevel@tonic-gate 			return (set_errno(EFAULT));
4627c478bd9Sstevel@tonic-gate 		break;
4637c478bd9Sstevel@tonic-gate 
4647c478bd9Sstevel@tonic-gate 	case IPC_RMID:
4657c478bd9Sstevel@tonic-gate 		if (error = ipc_rmid(msq_svc, msgid, cr))
4667c478bd9Sstevel@tonic-gate 			return (set_errno(error));
4677c478bd9Sstevel@tonic-gate 		return (0);
4687c478bd9Sstevel@tonic-gate 	}
4697c478bd9Sstevel@tonic-gate 
4707c478bd9Sstevel@tonic-gate 	/*
4717c478bd9Sstevel@tonic-gate 	 * get msqid_ds for this msgid
4727c478bd9Sstevel@tonic-gate 	 */
4737c478bd9Sstevel@tonic-gate 	if ((lock = ipc_lookup(msq_svc, msgid, (kipc_perm_t **)&qp)) == NULL)
4747c478bd9Sstevel@tonic-gate 		return (set_errno(EINVAL));
4757c478bd9Sstevel@tonic-gate 
4767c478bd9Sstevel@tonic-gate 	switch (cmd) {
4777c478bd9Sstevel@tonic-gate 	case IPC_SET:
4787c478bd9Sstevel@tonic-gate 		if (STRUCT_FGET(ds, msg_qbytes) > qp->msg_qbytes &&
4797c478bd9Sstevel@tonic-gate 		    secpolicy_ipc_config(cr) != 0) {
4807c478bd9Sstevel@tonic-gate 			mutex_exit(lock);
4817c478bd9Sstevel@tonic-gate 			return (set_errno(EPERM));
4827c478bd9Sstevel@tonic-gate 		}
4837c478bd9Sstevel@tonic-gate 		if (error = ipcperm_set(msq_svc, cr, &qp->msg_perm,
4847c478bd9Sstevel@tonic-gate 		    &STRUCT_BUF(ds)->msg_perm, mdl)) {
4857c478bd9Sstevel@tonic-gate 			mutex_exit(lock);
4867c478bd9Sstevel@tonic-gate 			return (set_errno(error));
4877c478bd9Sstevel@tonic-gate 		}
4887c478bd9Sstevel@tonic-gate 		qp->msg_qbytes = STRUCT_FGET(ds, msg_qbytes);
4897c478bd9Sstevel@tonic-gate 		qp->msg_ctime = gethrestime_sec();
4907c478bd9Sstevel@tonic-gate 		break;
4917c478bd9Sstevel@tonic-gate 
4927c478bd9Sstevel@tonic-gate 	case IPC_STAT:
4937c478bd9Sstevel@tonic-gate 		if (error = ipcperm_access(&qp->msg_perm, MSG_R, cr)) {
4947c478bd9Sstevel@tonic-gate 			mutex_exit(lock);
4957c478bd9Sstevel@tonic-gate 			return (set_errno(error));
4967c478bd9Sstevel@tonic-gate 		}
4977c478bd9Sstevel@tonic-gate 
4982c5b6df1Sdv 		if (qp->msg_rcv_cnt)
4992c5b6df1Sdv 			qp->msg_perm.ipc_mode |= MSG_RWAIT;
5007c478bd9Sstevel@tonic-gate 		if (qp->msg_snd_cnt)
5017c478bd9Sstevel@tonic-gate 			qp->msg_perm.ipc_mode |= MSG_WWAIT;
5027c478bd9Sstevel@tonic-gate 		ipcperm_stat(&STRUCT_BUF(ds)->msg_perm, &qp->msg_perm, mdl);
5037c478bd9Sstevel@tonic-gate 		qp->msg_perm.ipc_mode &= ~(MSG_RWAIT|MSG_WWAIT);
504*ac2ff9f2SToomas Soome 		STRUCT_FSETP(ds, msg_first, NULL);	/* kernel addr */
5057c478bd9Sstevel@tonic-gate 		STRUCT_FSETP(ds, msg_last, NULL);
5067c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_cbytes, qp->msg_cbytes);
5077c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_qnum, qp->msg_qnum);
5087c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_qbytes, qp->msg_qbytes);
5097c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_lspid, qp->msg_lspid);
5107c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_lrpid, qp->msg_lrpid);
5117c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_stime, qp->msg_stime);
5127c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_rtime, qp->msg_rtime);
5137c478bd9Sstevel@tonic-gate 		STRUCT_FSET(ds, msg_ctime, qp->msg_ctime);
5147c478bd9Sstevel@tonic-gate 		break;
5157c478bd9Sstevel@tonic-gate 
5167c478bd9Sstevel@tonic-gate 	case IPC_SET64:
5177c478bd9Sstevel@tonic-gate 		mutex_enter(&pp->p_lock);
5187c478bd9Sstevel@tonic-gate 		if ((ds64.msgx_qbytes > qp->msg_qbytes) &&
5197c478bd9Sstevel@tonic-gate 		    secpolicy_ipc_config(cr) != 0 &&
5207c478bd9Sstevel@tonic-gate 		    rctl_test(rc_process_msgmnb, pp->p_rctls, pp,
5217c478bd9Sstevel@tonic-gate 		    ds64.msgx_qbytes, RCA_SAFE) & RCT_DENY) {
5227c478bd9Sstevel@tonic-gate 			mutex_exit(&pp->p_lock);
5237c478bd9Sstevel@tonic-gate 			mutex_exit(lock);
5247c478bd9Sstevel@tonic-gate 			return (set_errno(EPERM));
5257c478bd9Sstevel@tonic-gate 		}
5267c478bd9Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
5277c478bd9Sstevel@tonic-gate 		if (error = ipcperm_set64(msq_svc, cr, &qp->msg_perm,
5287c478bd9Sstevel@tonic-gate 		    &ds64.msgx_perm)) {
5297c478bd9Sstevel@tonic-gate 			mutex_exit(lock);
5307c478bd9Sstevel@tonic-gate 			return (set_errno(error));
5317c478bd9Sstevel@tonic-gate 		}
5327c478bd9Sstevel@tonic-gate 		qp->msg_qbytes = ds64.msgx_qbytes;
5337c478bd9Sstevel@tonic-gate 		qp->msg_ctime = gethrestime_sec();
5347c478bd9Sstevel@tonic-gate 		break;
5357c478bd9Sstevel@tonic-gate 
5367c478bd9Sstevel@tonic-gate 	case IPC_STAT64:
5372c5b6df1Sdv 		if (qp->msg_rcv_cnt)
5382c5b6df1Sdv 			qp->msg_perm.ipc_mode |= MSG_RWAIT;
5397c478bd9Sstevel@tonic-gate 		if (qp->msg_snd_cnt)
5407c478bd9Sstevel@tonic-gate 			qp->msg_perm.ipc_mode |= MSG_WWAIT;
5417c478bd9Sstevel@tonic-gate 		ipcperm_stat64(&ds64.msgx_perm, &qp->msg_perm);
5427c478bd9Sstevel@tonic-gate 		qp->msg_perm.ipc_mode &= ~(MSG_RWAIT|MSG_WWAIT);
5437c478bd9Sstevel@tonic-gate 		ds64.msgx_cbytes = qp->msg_cbytes;
5447c478bd9Sstevel@tonic-gate 		ds64.msgx_qnum = qp->msg_qnum;
5457c478bd9Sstevel@tonic-gate 		ds64.msgx_qbytes = qp->msg_qbytes;
5467c478bd9Sstevel@tonic-gate 		ds64.msgx_lspid = qp->msg_lspid;
5477c478bd9Sstevel@tonic-gate 		ds64.msgx_lrpid = qp->msg_lrpid;
5487c478bd9Sstevel@tonic-gate 		ds64.msgx_stime = qp->msg_stime;
5497c478bd9Sstevel@tonic-gate 		ds64.msgx_rtime = qp->msg_rtime;
5507c478bd9Sstevel@tonic-gate 		ds64.msgx_ctime = qp->msg_ctime;
5517c478bd9Sstevel@tonic-gate 		break;
5527c478bd9Sstevel@tonic-gate 
5537c478bd9Sstevel@tonic-gate 	default:
5547c478bd9Sstevel@tonic-gate 		mutex_exit(lock);
5557c478bd9Sstevel@tonic-gate 		return (set_errno(EINVAL));
5567c478bd9Sstevel@tonic-gate 	}
5577c478bd9Sstevel@tonic-gate 
5587c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
5597c478bd9Sstevel@tonic-gate 
5607c478bd9Sstevel@tonic-gate 	/*
5617c478bd9Sstevel@tonic-gate 	 * Do copyout last (after releasing mutex).
5627c478bd9Sstevel@tonic-gate 	 */
5637c478bd9Sstevel@tonic-gate 	switch (cmd) {
5647c478bd9Sstevel@tonic-gate 	case IPC_STAT:
5657c478bd9Sstevel@tonic-gate 		if (copyout(STRUCT_BUF(ds), arg, STRUCT_SIZE(ds)))
5667c478bd9Sstevel@tonic-gate 			return (set_errno(EFAULT));
5677c478bd9Sstevel@tonic-gate 		break;
5687c478bd9Sstevel@tonic-gate 
5697c478bd9Sstevel@tonic-gate 	case IPC_STAT64:
5707c478bd9Sstevel@tonic-gate 		if (copyout(&ds64, arg, sizeof (struct msqid_ds64)))
5717c478bd9Sstevel@tonic-gate 			return (set_errno(EFAULT));
5727c478bd9Sstevel@tonic-gate 		break;
5737c478bd9Sstevel@tonic-gate 	}
5747c478bd9Sstevel@tonic-gate 
5757c478bd9Sstevel@tonic-gate 	return (0);
5767c478bd9Sstevel@tonic-gate }
5777c478bd9Sstevel@tonic-gate 
5787c478bd9Sstevel@tonic-gate /*
5797c478bd9Sstevel@tonic-gate  * Remove all message queues associated with a given zone.  Called by
5807c478bd9Sstevel@tonic-gate  * zone_shutdown when the zone is halted.
5817c478bd9Sstevel@tonic-gate  */
5827c478bd9Sstevel@tonic-gate /*ARGSUSED1*/
5837c478bd9Sstevel@tonic-gate static void
msg_remove_zone(zoneid_t zoneid,void * arg)5847c478bd9Sstevel@tonic-gate msg_remove_zone(zoneid_t zoneid, void *arg)
5857c478bd9Sstevel@tonic-gate {
5867c478bd9Sstevel@tonic-gate 	ipc_remove_zone(msq_svc, zoneid);
5877c478bd9Sstevel@tonic-gate }
5887c478bd9Sstevel@tonic-gate 
5897c478bd9Sstevel@tonic-gate /*
5907c478bd9Sstevel@tonic-gate  * msgget system call.
5917c478bd9Sstevel@tonic-gate  */
5927c478bd9Sstevel@tonic-gate static int
msgget(key_t key,int msgflg)5937c478bd9Sstevel@tonic-gate msgget(key_t key, int msgflg)
5947c478bd9Sstevel@tonic-gate {
5957c478bd9Sstevel@tonic-gate 	kmsqid_t	*qp;
5967c478bd9Sstevel@tonic-gate 	kmutex_t	*lock;
5977c478bd9Sstevel@tonic-gate 	int		id, error;
598b2eb1770Sudpa 	int		ii;
5997c478bd9Sstevel@tonic-gate 	proc_t		*pp = curproc;
6007c478bd9Sstevel@tonic-gate 
6017c478bd9Sstevel@tonic-gate top:
6027c478bd9Sstevel@tonic-gate 	if (error = ipc_get(msq_svc, key, msgflg, (kipc_perm_t **)&qp, &lock))
6037c478bd9Sstevel@tonic-gate 		return (set_errno(error));
6047c478bd9Sstevel@tonic-gate 
6057c478bd9Sstevel@tonic-gate 	if (IPC_FREE(&qp->msg_perm)) {
6067c478bd9Sstevel@tonic-gate 		mutex_exit(lock);
6077c478bd9Sstevel@tonic-gate 		mutex_exit(&pp->p_lock);
6087c478bd9Sstevel@tonic-gate 
6097c478bd9Sstevel@tonic-gate 		list_create(&qp->msg_list, sizeof (struct msg),
6107c478bd9Sstevel@tonic-gate 		    offsetof(struct msg, msg_node));
6117c478bd9Sstevel@tonic-gate 		qp->msg_qnum = 0;
6127c478bd9Sstevel@tonic-gate 		qp->msg_lspid = qp->msg_lrpid = 0;
6137c478bd9Sstevel@tonic-gate 		qp->msg_stime = qp->msg_rtime = 0;
6147c478bd9Sstevel@tonic-gate 		qp->msg_ctime = gethrestime_sec();
6152c5b6df1Sdv 		qp->msg_ngt_cnt = 0;
6162c5b6df1Sdv 		qp->msg_neg_copy = 0;
6172c5b6df1Sdv 		for (ii = 0; ii <= MSG_MAX_QNUM; ii++) {
6182c5b6df1Sdv 			list_create(&qp->msg_wait_snd[ii],
6192c5b6df1Sdv 			    sizeof (msgq_wakeup_t),
6202c5b6df1Sdv 			    offsetof(msgq_wakeup_t, msgw_list));
6212c5b6df1Sdv 			list_create(&qp->msg_wait_snd_ngt[ii],
6222c5b6df1Sdv 			    sizeof (msgq_wakeup_t),
6232c5b6df1Sdv 			    offsetof(msgq_wakeup_t, msgw_list));
6242c5b6df1Sdv 		}
6252c5b6df1Sdv 		/*
6262c5b6df1Sdv 		 * The proper initialization of msg_lowest_type is to the
6272c5b6df1Sdv 		 * highest possible value.  By doing this we guarantee that
6282c5b6df1Sdv 		 * when the first send happens, the lowest type will be set
6292c5b6df1Sdv 		 * properly.
6302c5b6df1Sdv 		 */
631eb9fe4caSDavid Valin 		qp->msg_lowest_type = MSG_SMALL_INIT;
6322c5b6df1Sdv 		list_create(&qp->msg_cpy_block,
6332c5b6df1Sdv 		    sizeof (msgq_wakeup_t),
6342c5b6df1Sdv 		    offsetof(msgq_wakeup_t, msgw_list));
635eb9fe4caSDavid Valin 		list_create(&qp->msg_wait_rcv,
636eb9fe4caSDavid Valin 		    sizeof (msgq_wakeup_t),
637eb9fe4caSDavid Valin 		    offsetof(msgq_wakeup_t, msgw_list));
6382c5b6df1Sdv 		qp->msg_fnd_sndr = &msg_fnd_sndr[0];
6392c5b6df1Sdv 		qp->msg_fnd_rdr = &msg_fnd_rdr[0];
6402c5b6df1Sdv 		qp->msg_rcv_cnt = 0;
641b2eb1770Sudpa 		qp->msg_snd_cnt = 0;
642eb9fe4caSDavid Valin 		qp->msg_snd_smallest = MSG_SMALL_INIT;
6437c478bd9Sstevel@tonic-gate 
6447c478bd9Sstevel@tonic-gate 		if (error = ipc_commit_begin(msq_svc, key, msgflg,
6457c478bd9Sstevel@tonic-gate 		    (kipc_perm_t *)qp)) {
6467c478bd9Sstevel@tonic-gate 			if (error == EAGAIN)
6477c478bd9Sstevel@tonic-gate 				goto top;
6487c478bd9Sstevel@tonic-gate 			return (set_errno(error));
6497c478bd9Sstevel@tonic-gate 		}
6507c478bd9Sstevel@tonic-gate 		qp->msg_qbytes = rctl_enforced_value(rc_process_msgmnb,
6517c478bd9Sstevel@tonic-gate 		    pp->p_rctls, pp);
6527c478bd9Sstevel@tonic-gate 		qp->msg_qmax = rctl_enforced_value(rc_process_msgtql,
6537c478bd9Sstevel@tonic-gate 		    pp->p_rctls, pp);
6547c478bd9Sstevel@tonic-gate 		lock = ipc_commit_end(msq_svc, &qp->msg_perm);
6557c478bd9Sstevel@tonic-gate 	}
656005d3febSMarek Pospisil 
657005d3febSMarek Pospisil 	if (AU_AUDITING())
6587c478bd9Sstevel@tonic-gate 		audit_ipcget(AT_IPC_MSG, (void *)qp);
659005d3febSMarek Pospisil 
6607c478bd9Sstevel@tonic-gate 	id = qp->msg_perm.ipc_id;
6617c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
6627c478bd9Sstevel@tonic-gate 	return (id);
6637c478bd9Sstevel@tonic-gate }
6647c478bd9Sstevel@tonic-gate 
6657c478bd9Sstevel@tonic-gate static ssize_t
msgrcv(int msqid,struct ipcmsgbuf * msgp,size_t msgsz,long msgtyp,int msgflg)6667c478bd9Sstevel@tonic-gate msgrcv(int msqid, struct ipcmsgbuf *msgp, size_t msgsz, long msgtyp, int msgflg)
6677c478bd9Sstevel@tonic-gate {
6687c478bd9Sstevel@tonic-gate 	struct msg	*smp;	/* ptr to best msg on q */
6697c478bd9Sstevel@tonic-gate 	kmsqid_t	*qp;	/* ptr to associated q */
6707c478bd9Sstevel@tonic-gate 	kmutex_t	*lock;
6717c478bd9Sstevel@tonic-gate 	size_t		xtsz;	/* transfer byte count */
6722c5b6df1Sdv 	int		error = 0;
6737c478bd9Sstevel@tonic-gate 	int		cvres;
674e5994f96Sdv 	uint_t		msg_hash;
6752c5b6df1Sdv 	msgq_wakeup_t	msg_entry;
6767c478bd9Sstevel@tonic-gate 
6777c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(CPU, sys, msg, 1);	/* bump msg send/rcv count */
6787c478bd9Sstevel@tonic-gate 
6792c5b6df1Sdv 	msg_hash = msg_type_hash(msgtyp);
6802c5b6df1Sdv 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL) {
6817c478bd9Sstevel@tonic-gate 		return ((ssize_t)set_errno(EINVAL));
6822c5b6df1Sdv 	}
6837c478bd9Sstevel@tonic-gate 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
6847c478bd9Sstevel@tonic-gate 
6852c5b6df1Sdv 	if (error = ipcperm_access(&qp->msg_perm, MSG_R, CRED())) {
6867c478bd9Sstevel@tonic-gate 		goto msgrcv_out;
6872c5b6df1Sdv 	}
6887c478bd9Sstevel@tonic-gate 
6892c5b6df1Sdv 	/*
6902c5b6df1Sdv 	 * Various information (including the condvar_t) required for the
6912c5b6df1Sdv 	 * process to sleep is provided by it's stack.
6922c5b6df1Sdv 	 */
6932c5b6df1Sdv 	msg_entry.msgw_thrd = curthread;
6942c5b6df1Sdv 	msg_entry.msgw_snd_wake = 0;
6952c5b6df1Sdv 	msg_entry.msgw_type = msgtyp;
6967c478bd9Sstevel@tonic-gate findmsg:
6972c5b6df1Sdv 	smp = msgrcv_lookup(qp, msgtyp);
6987c478bd9Sstevel@tonic-gate 
6997c478bd9Sstevel@tonic-gate 	if (smp) {
7007c478bd9Sstevel@tonic-gate 		/*
7012c5b6df1Sdv 		 * We found a possible message to copy out.
7027c478bd9Sstevel@tonic-gate 		 */
7037c478bd9Sstevel@tonic-gate 		if ((smp->msg_flags & MSG_RCVCOPY) == 0) {
704e5994f96Sdv 			long t = msg_entry.msgw_snd_wake;
7056344fb25Sqiao 			long copy_type = smp->msg_type;
7066344fb25Sqiao 
7077c478bd9Sstevel@tonic-gate 			/*
7082c5b6df1Sdv 			 * It is available, attempt to copy it.
7097c478bd9Sstevel@tonic-gate 			 */
7102c5b6df1Sdv 			error = msg_copyout(qp, msgtyp, &lock, &xtsz, msgsz,
7112c5b6df1Sdv 			    smp, msgp, msgflg);
712e5994f96Sdv 
713e5994f96Sdv 			/*
714e5994f96Sdv 			 * It is possible to consume a different message
715e5994f96Sdv 			 * type then what originally awakened for (negative
716e5994f96Sdv 			 * types).  If this happens a check must be done to
717e5994f96Sdv 			 * to determine if another receiver is available
718e5994f96Sdv 			 * for the waking message type,  Failure to do this
719e5994f96Sdv 			 * can result in a message on the queue that can be
720e5994f96Sdv 			 * serviced by a sleeping receiver.
721e5994f96Sdv 			 */
7226344fb25Sqiao 			if (!error && t && (copy_type != t))
723e5994f96Sdv 				msg_wakeup_rdr(qp, &qp->msg_fnd_sndr, t);
724e5994f96Sdv 
7257c478bd9Sstevel@tonic-gate 			/*
7262c5b6df1Sdv 			 * Don't forget to wakeup a sleeper that blocked because
7272c5b6df1Sdv 			 * we were copying things out.
7287c478bd9Sstevel@tonic-gate 			 */
7292c5b6df1Sdv 			msg_wakeup_rdr(qp, &qp->msg_fnd_rdr, 0);
7302c5b6df1Sdv 			goto msgrcv_out;
7312c5b6df1Sdv 		}
7322c5b6df1Sdv 		/*
7332c5b6df1Sdv 		 * The selected message is being copied out, so block.  We do
7342c5b6df1Sdv 		 * not need to wake the next person up on the msg_cpy_block list
7352c5b6df1Sdv 		 * due to the fact some one is copying out and they will get
7362c5b6df1Sdv 		 * things moving again once the copy is completed.
7372c5b6df1Sdv 		 */
7382c5b6df1Sdv 		cvres = msg_rcvq_sleep(&qp->msg_cpy_block,
7392c5b6df1Sdv 		    &msg_entry, &lock, qp);
7402c5b6df1Sdv 		error = msgq_check_err(qp, cvres);
7412c5b6df1Sdv 		if (error) {
7422c5b6df1Sdv 			goto msgrcv_out;
7432c5b6df1Sdv 		}
7442c5b6df1Sdv 		goto findmsg;
7452c5b6df1Sdv 	}
7462c5b6df1Sdv 	/*
7472c5b6df1Sdv 	 * There isn't a message to copy out that matches the designated
7482c5b6df1Sdv 	 * criteria.
7492c5b6df1Sdv 	 */
7502c5b6df1Sdv 	if (msgflg & IPC_NOWAIT) {
7512c5b6df1Sdv 		error = ENOMSG;
7522c5b6df1Sdv 		goto msgrcv_out;
7532c5b6df1Sdv 	}
7542c5b6df1Sdv 	msg_wakeup_rdr(qp,  &qp->msg_fnd_rdr, 0);
7557c478bd9Sstevel@tonic-gate 
7562c5b6df1Sdv 	/*
7572c5b6df1Sdv 	 * Wait for new message.  We keep the negative and positive types
7582c5b6df1Sdv 	 * separate for performance reasons.
7592c5b6df1Sdv 	 */
7602c5b6df1Sdv 	msg_entry.msgw_snd_wake = 0;
7612c5b6df1Sdv 	if (msgtyp >= 0) {
7622c5b6df1Sdv 		cvres = msg_rcvq_sleep(&qp->msg_wait_snd[msg_hash],
7632c5b6df1Sdv 		    &msg_entry, &lock, qp);
7642c5b6df1Sdv 	} else {
7652c5b6df1Sdv 		qp->msg_ngt_cnt++;
7662c5b6df1Sdv 		cvres = msg_rcvq_sleep(&qp->msg_wait_snd_ngt[msg_hash],
7672c5b6df1Sdv 		    &msg_entry, &lock, qp);
7682c5b6df1Sdv 		qp->msg_ngt_cnt--;
7692c5b6df1Sdv 	}
7707c478bd9Sstevel@tonic-gate 
7712c5b6df1Sdv 	if (!(error = msgq_check_err(qp, cvres))) {
7722c5b6df1Sdv 		goto findmsg;
7732c5b6df1Sdv 	}
7747c478bd9Sstevel@tonic-gate 
7752c5b6df1Sdv msgrcv_out:
7762c5b6df1Sdv 	if (error) {
7772c5b6df1Sdv 		msg_wakeup_rdr(qp,  &qp->msg_fnd_rdr, 0);
7782c5b6df1Sdv 		if (msg_entry.msgw_snd_wake) {
7792c5b6df1Sdv 			msg_wakeup_rdr(qp, &qp->msg_fnd_sndr,
7802c5b6df1Sdv 			    msg_entry.msgw_snd_wake);
7812c5b6df1Sdv 		}
7822c5b6df1Sdv 		ipc_rele(msq_svc, (kipc_perm_t *)qp);
7832c5b6df1Sdv 		return ((ssize_t)set_errno(error));
7842c5b6df1Sdv 	}
7852c5b6df1Sdv 	ipc_rele(msq_svc, (kipc_perm_t *)qp);
7862c5b6df1Sdv 	return ((ssize_t)xtsz);
7872c5b6df1Sdv }
7887c478bd9Sstevel@tonic-gate 
7892c5b6df1Sdv static int
msgq_check_err(kmsqid_t * qp,int cvres)7902c5b6df1Sdv msgq_check_err(kmsqid_t *qp, int cvres)
7912c5b6df1Sdv {
7922c5b6df1Sdv 	if (IPC_FREE(&qp->msg_perm)) {
7932c5b6df1Sdv 		return (EIDRM);
7942c5b6df1Sdv 	}
7957c478bd9Sstevel@tonic-gate 
7962c5b6df1Sdv 	if (cvres == 0) {
7972c5b6df1Sdv 		return (EINTR);
7982c5b6df1Sdv 	}
7992c5b6df1Sdv 
8002c5b6df1Sdv 	return (0);
8012c5b6df1Sdv }
8022c5b6df1Sdv 
8032c5b6df1Sdv static int
msg_copyout(kmsqid_t * qp,long msgtyp,kmutex_t ** lock,size_t * xtsz_ret,size_t msgsz,struct msg * smp,struct ipcmsgbuf * msgp,int msgflg)8042c5b6df1Sdv msg_copyout(kmsqid_t *qp, long msgtyp, kmutex_t **lock, size_t *xtsz_ret,
8052c5b6df1Sdv     size_t msgsz, struct msg *smp, struct ipcmsgbuf *msgp, int msgflg)
8062c5b6df1Sdv {
8072c5b6df1Sdv 	size_t		xtsz;
8082c5b6df1Sdv 	STRUCT_HANDLE(ipcmsgbuf, umsgp);
8092c5b6df1Sdv 	model_t		mdl = get_udatamodel();
8102c5b6df1Sdv 	int		copyerror = 0;
8112c5b6df1Sdv 
8122c5b6df1Sdv 	STRUCT_SET_HANDLE(umsgp, mdl, msgp);
8132c5b6df1Sdv 	if (msgsz < smp->msg_size) {
8142c5b6df1Sdv 		if ((msgflg & MSG_NOERROR) == 0) {
8152c5b6df1Sdv 			return (E2BIG);
8162c5b6df1Sdv 		} else {
8172c5b6df1Sdv 			xtsz = msgsz;
8187c478bd9Sstevel@tonic-gate 		}
8192c5b6df1Sdv 	} else {
8202c5b6df1Sdv 		xtsz = smp->msg_size;
8212c5b6df1Sdv 	}
8222c5b6df1Sdv 	*xtsz_ret = xtsz;
8232c5b6df1Sdv 
8242c5b6df1Sdv 	/*
8252c5b6df1Sdv 	 * To prevent a DOS attack we mark the message as being
8262c5b6df1Sdv 	 * copied out and release mutex.  When the copy is completed
8272c5b6df1Sdv 	 * we need to acquire the mutex and make the appropriate updates.
8282c5b6df1Sdv 	 */
8292c5b6df1Sdv 	ASSERT((smp->msg_flags & MSG_RCVCOPY) == 0);
8302c5b6df1Sdv 	smp->msg_flags |= MSG_RCVCOPY;
8312c5b6df1Sdv 	msg_hold(smp);
8322c5b6df1Sdv 	if (msgtyp < 0) {
8332c5b6df1Sdv 		ASSERT(qp->msg_neg_copy == 0);
8342c5b6df1Sdv 		qp->msg_neg_copy = 1;
8352c5b6df1Sdv 	}
8362c5b6df1Sdv 	mutex_exit(*lock);
8377c478bd9Sstevel@tonic-gate 
8382c5b6df1Sdv 	if (mdl == DATAMODEL_NATIVE) {
8392c5b6df1Sdv 		copyerror = copyout(&smp->msg_type, msgp,
8402c5b6df1Sdv 		    sizeof (smp->msg_type));
8417c478bd9Sstevel@tonic-gate 	} else {
8427c478bd9Sstevel@tonic-gate 		/*
8432c5b6df1Sdv 		 * 32-bit callers need an imploded msg type.
8447c478bd9Sstevel@tonic-gate 		 */
8452c5b6df1Sdv 		int32_t	msg_type32 = smp->msg_type;
8462c5b6df1Sdv 
8472c5b6df1Sdv 		copyerror = copyout(&msg_type32, msgp,
8482c5b6df1Sdv 		    sizeof (msg_type32));
8492c5b6df1Sdv 	}
8502c5b6df1Sdv 
8512c5b6df1Sdv 	if (copyerror == 0 && xtsz) {
8522c5b6df1Sdv 		copyerror = copyout(smp->msg_addr,
8532c5b6df1Sdv 		    STRUCT_FADDR(umsgp, mtext), xtsz);
8547c478bd9Sstevel@tonic-gate 	}
8557c478bd9Sstevel@tonic-gate 
8562c5b6df1Sdv 	/*
8572c5b6df1Sdv 	 * Reclaim the mutex and make sure the message queue still exists.
8582c5b6df1Sdv 	 */
8597c478bd9Sstevel@tonic-gate 
8602c5b6df1Sdv 	*lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
8612c5b6df1Sdv 	if (msgtyp < 0) {
8622c5b6df1Sdv 		qp->msg_neg_copy = 0;
8632c5b6df1Sdv 	}
8642c5b6df1Sdv 	ASSERT(smp->msg_flags & MSG_RCVCOPY);
8652c5b6df1Sdv 	smp->msg_flags &= ~MSG_RCVCOPY;
8662c5b6df1Sdv 	msg_rele(smp);
8677c478bd9Sstevel@tonic-gate 	if (IPC_FREE(&qp->msg_perm)) {
8682c5b6df1Sdv 		return (EIDRM);
8697c478bd9Sstevel@tonic-gate 	}
8702c5b6df1Sdv 	if (copyerror) {
8712c5b6df1Sdv 		return (EFAULT);
8727c478bd9Sstevel@tonic-gate 	}
8732c5b6df1Sdv 	qp->msg_lrpid = ttoproc(curthread)->p_pid;
8742c5b6df1Sdv 	qp->msg_rtime = gethrestime_sec();
8752c5b6df1Sdv 	msgunlink(qp, smp);
8762c5b6df1Sdv 	return (0);
8772c5b6df1Sdv }
8787c478bd9Sstevel@tonic-gate 
8792c5b6df1Sdv static struct msg *
msgrcv_lookup(kmsqid_t * qp,long msgtyp)8802c5b6df1Sdv msgrcv_lookup(kmsqid_t *qp, long msgtyp)
8812c5b6df1Sdv {
882*ac2ff9f2SToomas Soome 	struct msg		*smp = NULL;
883e5994f96Sdv 	long			qp_low;
8842c5b6df1Sdv 	struct msg		*mp;	/* ptr to msg on q */
885e5994f96Sdv 	long			low_msgtype;
8862c5b6df1Sdv 	static struct msg	neg_copy_smp;
8877c478bd9Sstevel@tonic-gate 
8882c5b6df1Sdv 	mp = list_head(&qp->msg_list);
8892c5b6df1Sdv 	if (msgtyp == 0) {
8902c5b6df1Sdv 		smp = mp;
8912c5b6df1Sdv 	} else {
8922c5b6df1Sdv 		qp_low = qp->msg_lowest_type;
8932c5b6df1Sdv 		if (msgtyp > 0) {
8942c5b6df1Sdv 			/*
8952c5b6df1Sdv 			 * If our lowest possible message type is larger than
8962c5b6df1Sdv 			 * the message type desired, then we know there is
8972c5b6df1Sdv 			 * no entry present.
8982c5b6df1Sdv 			 */
8992c5b6df1Sdv 			if (qp_low > msgtyp) {
9002c5b6df1Sdv 				return (NULL);
9012c5b6df1Sdv 			}
9022c5b6df1Sdv 
9032c5b6df1Sdv 			for (; mp; mp = list_next(&qp->msg_list, mp)) {
9042c5b6df1Sdv 				if (msgtyp == mp->msg_type) {
9052c5b6df1Sdv 					smp = mp;
9062c5b6df1Sdv 					break;
9072c5b6df1Sdv 				}
9082c5b6df1Sdv 			}
9092c5b6df1Sdv 		} else {
9102c5b6df1Sdv 			/*
9112c5b6df1Sdv 			 * We have kept track of the lowest possible message
9122c5b6df1Sdv 			 * type on the send queue.  This allows us to terminate
9132c5b6df1Sdv 			 * the search early if we find a message type of that
9142c5b6df1Sdv 			 * type.  Note, the lowest type may not be the actual
9152c5b6df1Sdv 			 * lowest value in the system, it is only guaranteed
9162c5b6df1Sdv 			 * that there isn't a value lower than that.
9172c5b6df1Sdv 			 */
9182c5b6df1Sdv 			low_msgtype = -msgtyp;
919e5994f96Sdv 			if (low_msgtype < qp_low) {
9202c5b6df1Sdv 				return (NULL);
9212c5b6df1Sdv 			}
9222c5b6df1Sdv 			if (qp->msg_neg_copy) {
9232c5b6df1Sdv 				neg_copy_smp.msg_flags = MSG_RCVCOPY;
9242c5b6df1Sdv 				return (&neg_copy_smp);
9252c5b6df1Sdv 			}
9262c5b6df1Sdv 			for (; mp; mp = list_next(&qp->msg_list, mp)) {
927e5994f96Sdv 				if (mp->msg_type <= low_msgtype &&
928e5994f96Sdv 				    !(smp && smp->msg_type <= mp->msg_type)) {
9292c5b6df1Sdv 					smp = mp;
9302c5b6df1Sdv 					low_msgtype = mp->msg_type;
9312c5b6df1Sdv 					if (low_msgtype == qp_low) {
9322c5b6df1Sdv 						break;
9332c5b6df1Sdv 					}
9342c5b6df1Sdv 				}
9352c5b6df1Sdv 			}
9362c5b6df1Sdv 			if (smp) {
9372c5b6df1Sdv 				/*
9382c5b6df1Sdv 				 * Update the lowest message type.
9392c5b6df1Sdv 				 */
9402c5b6df1Sdv 				qp->msg_lowest_type = smp->msg_type;
9412c5b6df1Sdv 			}
9422c5b6df1Sdv 		}
9432c5b6df1Sdv 	}
9442c5b6df1Sdv 	return (smp);
9457c478bd9Sstevel@tonic-gate }
9467c478bd9Sstevel@tonic-gate 
9477c478bd9Sstevel@tonic-gate /*
9487c478bd9Sstevel@tonic-gate  * msgids system call.
9497c478bd9Sstevel@tonic-gate  */
9507c478bd9Sstevel@tonic-gate static int
msgids(int * buf,uint_t nids,uint_t * pnids)9517c478bd9Sstevel@tonic-gate msgids(int *buf, uint_t nids, uint_t *pnids)
9527c478bd9Sstevel@tonic-gate {
9537c478bd9Sstevel@tonic-gate 	int error;
9547c478bd9Sstevel@tonic-gate 
9557c478bd9Sstevel@tonic-gate 	if (error = ipc_ids(msq_svc, buf, nids, pnids))
9567c478bd9Sstevel@tonic-gate 		return (set_errno(error));
9577c478bd9Sstevel@tonic-gate 
9587c478bd9Sstevel@tonic-gate 	return (0);
9597c478bd9Sstevel@tonic-gate }
9607c478bd9Sstevel@tonic-gate 
/*
 * Round a message length up to a multiple of the size word of the
 * native (RND) or 32-bit (RND32) data model.
 */
#define	RND(x)		roundup((x), sizeof (size_t))
#define	RND32(x)	roundup((x), sizeof (size32_t))
9637c478bd9Sstevel@tonic-gate 
9647c478bd9Sstevel@tonic-gate /*
9657c478bd9Sstevel@tonic-gate  * msgsnap system call.
9667c478bd9Sstevel@tonic-gate  */
9677c478bd9Sstevel@tonic-gate static int
msgsnap(int msqid,caddr_t buf,size_t bufsz,long msgtyp)9687c478bd9Sstevel@tonic-gate msgsnap(int msqid, caddr_t buf, size_t bufsz, long msgtyp)
9697c478bd9Sstevel@tonic-gate {
9707c478bd9Sstevel@tonic-gate 	struct msg	*mp;	/* ptr to msg on q */
9717c478bd9Sstevel@tonic-gate 	kmsqid_t	*qp;	/* ptr to associated q */
9727c478bd9Sstevel@tonic-gate 	kmutex_t	*lock;
9737c478bd9Sstevel@tonic-gate 	size_t		size;
9747c478bd9Sstevel@tonic-gate 	size_t		nmsg;
9757c478bd9Sstevel@tonic-gate 	struct msg	**snaplist;
9767c478bd9Sstevel@tonic-gate 	int		error, i;
9777c478bd9Sstevel@tonic-gate 	model_t		mdl = get_udatamodel();
9787c478bd9Sstevel@tonic-gate 	STRUCT_DECL(msgsnap_head, head);
9797c478bd9Sstevel@tonic-gate 	STRUCT_DECL(msgsnap_mhead, mhead);
9807c478bd9Sstevel@tonic-gate 
9817c478bd9Sstevel@tonic-gate 	STRUCT_INIT(head, mdl);
9827c478bd9Sstevel@tonic-gate 	STRUCT_INIT(mhead, mdl);
9837c478bd9Sstevel@tonic-gate 
9847c478bd9Sstevel@tonic-gate 	if (bufsz < STRUCT_SIZE(head))
9857c478bd9Sstevel@tonic-gate 		return (set_errno(EINVAL));
9867c478bd9Sstevel@tonic-gate 
9877c478bd9Sstevel@tonic-gate 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL)
9887c478bd9Sstevel@tonic-gate 		return (set_errno(EINVAL));
9897c478bd9Sstevel@tonic-gate 
9907c478bd9Sstevel@tonic-gate 	if (error = ipcperm_access(&qp->msg_perm, MSG_R, CRED())) {
9917c478bd9Sstevel@tonic-gate 		mutex_exit(lock);
9927c478bd9Sstevel@tonic-gate 		return (set_errno(error));
9937c478bd9Sstevel@tonic-gate 	}
9947c478bd9Sstevel@tonic-gate 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
9957c478bd9Sstevel@tonic-gate 
9967c478bd9Sstevel@tonic-gate 	/*
9977c478bd9Sstevel@tonic-gate 	 * First compute the required buffer size and
9987c478bd9Sstevel@tonic-gate 	 * the number of messages on the queue.
9997c478bd9Sstevel@tonic-gate 	 */
10007c478bd9Sstevel@tonic-gate 	size = nmsg = 0;
10017c478bd9Sstevel@tonic-gate 	for (mp = list_head(&qp->msg_list); mp;
10027c478bd9Sstevel@tonic-gate 	    mp = list_next(&qp->msg_list, mp)) {
10037c478bd9Sstevel@tonic-gate 		if (msgtyp == 0 ||
10047c478bd9Sstevel@tonic-gate 		    (msgtyp > 0 && msgtyp == mp->msg_type) ||
10057c478bd9Sstevel@tonic-gate 		    (msgtyp < 0 && mp->msg_type <= -msgtyp)) {
10067c478bd9Sstevel@tonic-gate 			nmsg++;
10077c478bd9Sstevel@tonic-gate 			if (mdl == DATAMODEL_NATIVE)
10087c478bd9Sstevel@tonic-gate 				size += RND(mp->msg_size);
10097c478bd9Sstevel@tonic-gate 			else
10107c478bd9Sstevel@tonic-gate 				size += RND32(mp->msg_size);
10117c478bd9Sstevel@tonic-gate 		}
10127c478bd9Sstevel@tonic-gate 	}
10137c478bd9Sstevel@tonic-gate 
10147c478bd9Sstevel@tonic-gate 	size += STRUCT_SIZE(head) + nmsg * STRUCT_SIZE(mhead);
10157c478bd9Sstevel@tonic-gate 	if (size > bufsz)
10167c478bd9Sstevel@tonic-gate 		nmsg = 0;
10177c478bd9Sstevel@tonic-gate 
10187c478bd9Sstevel@tonic-gate 	if (nmsg > 0) {
10197c478bd9Sstevel@tonic-gate 		/*
10207c478bd9Sstevel@tonic-gate 		 * Mark the messages as being copied.
10217c478bd9Sstevel@tonic-gate 		 */
10227c478bd9Sstevel@tonic-gate 		snaplist = (struct msg **)kmem_alloc(nmsg *
10237c478bd9Sstevel@tonic-gate 		    sizeof (struct msg *), KM_SLEEP);
10247c478bd9Sstevel@tonic-gate 		i = 0;
10257c478bd9Sstevel@tonic-gate 		for (mp = list_head(&qp->msg_list); mp;
10267c478bd9Sstevel@tonic-gate 		    mp = list_next(&qp->msg_list, mp)) {
10277c478bd9Sstevel@tonic-gate 			if (msgtyp == 0 ||
10287c478bd9Sstevel@tonic-gate 			    (msgtyp > 0 && msgtyp == mp->msg_type) ||
10297c478bd9Sstevel@tonic-gate 			    (msgtyp < 0 && mp->msg_type <= -msgtyp)) {
10307c478bd9Sstevel@tonic-gate 				msg_hold(mp);
10317c478bd9Sstevel@tonic-gate 				snaplist[i] = mp;
10327c478bd9Sstevel@tonic-gate 				i++;
10337c478bd9Sstevel@tonic-gate 			}
10347c478bd9Sstevel@tonic-gate 		}
10357c478bd9Sstevel@tonic-gate 	}
10367c478bd9Sstevel@tonic-gate 	mutex_exit(lock);
10377c478bd9Sstevel@tonic-gate 
10387c478bd9Sstevel@tonic-gate 	/*
10397c478bd9Sstevel@tonic-gate 	 * Copy out the buffer header.
10407c478bd9Sstevel@tonic-gate 	 */
10417c478bd9Sstevel@tonic-gate 	STRUCT_FSET(head, msgsnap_size, size);
10427c478bd9Sstevel@tonic-gate 	STRUCT_FSET(head, msgsnap_nmsg, nmsg);
10437c478bd9Sstevel@tonic-gate 	if (copyout(STRUCT_BUF(head), buf, STRUCT_SIZE(head)))
10447c478bd9Sstevel@tonic-gate 		error = EFAULT;
10457c478bd9Sstevel@tonic-gate 
10467c478bd9Sstevel@tonic-gate 	buf += STRUCT_SIZE(head);
10477c478bd9Sstevel@tonic-gate 
10487c478bd9Sstevel@tonic-gate 	/*
10497c478bd9Sstevel@tonic-gate 	 * Now copy out the messages one by one.
10507c478bd9Sstevel@tonic-gate 	 */
10517c478bd9Sstevel@tonic-gate 	for (i = 0; i < nmsg; i++) {
10527c478bd9Sstevel@tonic-gate 		mp = snaplist[i];
10537c478bd9Sstevel@tonic-gate 		if (error == 0) {
10547c478bd9Sstevel@tonic-gate 			STRUCT_FSET(mhead, msgsnap_mlen, mp->msg_size);
10557c478bd9Sstevel@tonic-gate 			STRUCT_FSET(mhead, msgsnap_mtype, mp->msg_type);
10567c478bd9Sstevel@tonic-gate 			if (copyout(STRUCT_BUF(mhead), buf, STRUCT_SIZE(mhead)))
10577c478bd9Sstevel@tonic-gate 				error = EFAULT;
10587c478bd9Sstevel@tonic-gate 			buf += STRUCT_SIZE(mhead);
10597c478bd9Sstevel@tonic-gate 
10607c478bd9Sstevel@tonic-gate 			if (error == 0 &&
10617c478bd9Sstevel@tonic-gate 			    mp->msg_size != 0 &&
10627c478bd9Sstevel@tonic-gate 			    copyout(mp->msg_addr, buf, mp->msg_size))
10637c478bd9Sstevel@tonic-gate 				error = EFAULT;
10647c478bd9Sstevel@tonic-gate 			if (mdl == DATAMODEL_NATIVE)
10657c478bd9Sstevel@tonic-gate 				buf += RND(mp->msg_size);
10667c478bd9Sstevel@tonic-gate 			else
10677c478bd9Sstevel@tonic-gate 				buf += RND32(mp->msg_size);
10687c478bd9Sstevel@tonic-gate 		}
10697c478bd9Sstevel@tonic-gate 		lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
10707c478bd9Sstevel@tonic-gate 		msg_rele(mp);
10717c478bd9Sstevel@tonic-gate 		/* Check for msg q deleted or reallocated */
10727c478bd9Sstevel@tonic-gate 		if (IPC_FREE(&qp->msg_perm))
10737c478bd9Sstevel@tonic-gate 			error = EIDRM;
10747c478bd9Sstevel@tonic-gate 		mutex_exit(lock);
10757c478bd9Sstevel@tonic-gate 	}
10767c478bd9Sstevel@tonic-gate 
10777c478bd9Sstevel@tonic-gate 	(void) ipc_lock(msq_svc, qp->msg_perm.ipc_id);
10787c478bd9Sstevel@tonic-gate 	ipc_rele(msq_svc, (kipc_perm_t *)qp);
10797c478bd9Sstevel@tonic-gate 
10807c478bd9Sstevel@tonic-gate 	if (nmsg > 0)
10817c478bd9Sstevel@tonic-gate 		kmem_free(snaplist, nmsg * sizeof (struct msg *));
10827c478bd9Sstevel@tonic-gate 
10837c478bd9Sstevel@tonic-gate 	if (error)
10847c478bd9Sstevel@tonic-gate 		return (set_errno(error));
10857c478bd9Sstevel@tonic-gate 	return (0);
10867c478bd9Sstevel@tonic-gate }
10877c478bd9Sstevel@tonic-gate 
/*
 * Largest message text size (in bytes) for which msgsnd allocates and
 * copies in the message before looking up the queue, rather than dropping
 * and reacquiring the queue lock to do so.
 */
#define	MSG_PREALLOC_LIMIT 8192
1089e50383f4Sdv 
10907c478bd9Sstevel@tonic-gate /*
10917c478bd9Sstevel@tonic-gate  * msgsnd system call.
10927c478bd9Sstevel@tonic-gate  */
10937c478bd9Sstevel@tonic-gate static int
msgsnd(int msqid,struct ipcmsgbuf * msgp,size_t msgsz,int msgflg)10947c478bd9Sstevel@tonic-gate msgsnd(int msqid, struct ipcmsgbuf *msgp, size_t msgsz, int msgflg)
10957c478bd9Sstevel@tonic-gate {
10967c478bd9Sstevel@tonic-gate 	kmsqid_t	*qp;
1097e50383f4Sdv 	kmutex_t	*lock = NULL;
10987c478bd9Sstevel@tonic-gate 	struct msg	*mp = NULL;
10997c478bd9Sstevel@tonic-gate 	long		type;
1100eb9fe4caSDavid Valin 	int		error = 0, wait_wakeup = 0;
1101eb9fe4caSDavid Valin 	msgq_wakeup_t   msg_entry;
11027c478bd9Sstevel@tonic-gate 	model_t		mdl = get_udatamodel();
11037c478bd9Sstevel@tonic-gate 	STRUCT_HANDLE(ipcmsgbuf, umsgp);
11047c478bd9Sstevel@tonic-gate 
11057c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(CPU, sys, msg, 1);	/* bump msg send/rcv count */
11067c478bd9Sstevel@tonic-gate 	STRUCT_SET_HANDLE(umsgp, mdl, msgp);
11077c478bd9Sstevel@tonic-gate 
11087c478bd9Sstevel@tonic-gate 	if (mdl == DATAMODEL_NATIVE) {
11097c478bd9Sstevel@tonic-gate 		if (copyin(msgp, &type, sizeof (type)))
11107c478bd9Sstevel@tonic-gate 			return (set_errno(EFAULT));
11117c478bd9Sstevel@tonic-gate 	} else {
11127c478bd9Sstevel@tonic-gate 		int32_t	type32;
11137c478bd9Sstevel@tonic-gate 		if (copyin(msgp, &type32, sizeof (type32)))
11147c478bd9Sstevel@tonic-gate 			return (set_errno(EFAULT));
11157c478bd9Sstevel@tonic-gate 		type = type32;
11167c478bd9Sstevel@tonic-gate 	}
11177c478bd9Sstevel@tonic-gate 
11187c478bd9Sstevel@tonic-gate 	if (type < 1)
11197c478bd9Sstevel@tonic-gate 		return (set_errno(EINVAL));
11207c478bd9Sstevel@tonic-gate 
1121e50383f4Sdv 	/*
1122e50383f4Sdv 	 * We want the value here large enough that most of the
1123e50383f4Sdv 	 * the message operations will use the "lockless" path,
1124e50383f4Sdv 	 * but small enough that a user can not reserve large
1125e50383f4Sdv 	 * chunks of kernel memory unless they have a valid
1126e50383f4Sdv 	 * reason to.
1127e50383f4Sdv 	 */
1128e50383f4Sdv 	if (msgsz <= MSG_PREALLOC_LIMIT) {
1129e50383f4Sdv 		/*
1130e50383f4Sdv 		 * We are small enough that we can afford to do the
1131e50383f4Sdv 		 * allocation now.  This saves dropping the lock
1132e50383f4Sdv 		 * and then reacquiring the lock.
1133e50383f4Sdv 		 */
1134e50383f4Sdv 		mp = kmem_zalloc(sizeof (struct msg), KM_SLEEP);
1135e50383f4Sdv 		mp->msg_copycnt = 1;
1136e50383f4Sdv 		mp->msg_size = msgsz;
1137e50383f4Sdv 		if (msgsz) {
1138e50383f4Sdv 			mp->msg_addr = kmem_alloc(msgsz, KM_SLEEP);
1139e50383f4Sdv 			if (copyin(STRUCT_FADDR(umsgp, mtext),
1140e50383f4Sdv 			    mp->msg_addr, msgsz) == -1) {
1141e50383f4Sdv 				error = EFAULT;
1142e50383f4Sdv 				goto msgsnd_out;
1143e50383f4Sdv 			}
1144e50383f4Sdv 		}
1145e50383f4Sdv 	}
1146e50383f4Sdv 
1147e50383f4Sdv 	if ((lock = ipc_lookup(msq_svc, msqid, (kipc_perm_t **)&qp)) == NULL) {
1148e50383f4Sdv 		error = EINVAL;
1149e50383f4Sdv 		goto msgsnd_out;
1150e50383f4Sdv 	}
1151e50383f4Sdv 
11527c478bd9Sstevel@tonic-gate 	ipc_hold(msq_svc, (kipc_perm_t *)qp);
11537c478bd9Sstevel@tonic-gate 
11547c478bd9Sstevel@tonic-gate 	if (msgsz > qp->msg_qbytes) {
11557c478bd9Sstevel@tonic-gate 		error = EINVAL;
11567c478bd9Sstevel@tonic-gate 		goto msgsnd_out;
11577c478bd9Sstevel@tonic-gate 	}
11587c478bd9Sstevel@tonic-gate 
11597c478bd9Sstevel@tonic-gate 	if (error = ipcperm_access(&qp->msg_perm, MSG_W, CRED()))
11607c478bd9Sstevel@tonic-gate 		goto msgsnd_out;
11617c478bd9Sstevel@tonic-gate 
11627c478bd9Sstevel@tonic-gate top:
11637c478bd9Sstevel@tonic-gate 	/*
11647c478bd9Sstevel@tonic-gate 	 * Allocate space on q, message header, & buffer space.
11657c478bd9Sstevel@tonic-gate 	 */
11667c478bd9Sstevel@tonic-gate 	ASSERT(qp->msg_qnum <= qp->msg_qmax);
11677c478bd9Sstevel@tonic-gate 	while ((msgsz > qp->msg_qbytes - qp->msg_cbytes) ||
11687c478bd9Sstevel@tonic-gate 	    (qp->msg_qnum == qp->msg_qmax)) {
11697c478bd9Sstevel@tonic-gate 		int cvres;
11707c478bd9Sstevel@tonic-gate 
11717c478bd9Sstevel@tonic-gate 		if (msgflg & IPC_NOWAIT) {
11727c478bd9Sstevel@tonic-gate 			error = EAGAIN;
11737c478bd9Sstevel@tonic-gate 			goto msgsnd_out;
11747c478bd9Sstevel@tonic-gate 		}
11757c478bd9Sstevel@tonic-gate 
1176eb9fe4caSDavid Valin 		wait_wakeup = 0;
11777c478bd9Sstevel@tonic-gate 		qp->msg_snd_cnt++;
1178eb9fe4caSDavid Valin 		msg_entry.msgw_snd_size = msgsz;
1179eb9fe4caSDavid Valin 		msg_entry.msgw_thrd = curthread;
1180eb9fe4caSDavid Valin 		msg_entry.msgw_type = type;
1181eb9fe4caSDavid Valin 		cv_init(&msg_entry.msgw_wake_cv, NULL, 0, NULL);
1182eb9fe4caSDavid Valin 		list_insert_tail(&qp->msg_wait_rcv, &msg_entry);
1183eb9fe4caSDavid Valin 		if (qp->msg_snd_smallest > msgsz)
1184eb9fe4caSDavid Valin 			qp->msg_snd_smallest = msgsz;
1185eb9fe4caSDavid Valin 		cvres = cv_wait_sig(&msg_entry.msgw_wake_cv, lock);
11867c478bd9Sstevel@tonic-gate 		lock = ipc_relock(msq_svc, qp->msg_perm.ipc_id, lock);
11877c478bd9Sstevel@tonic-gate 		qp->msg_snd_cnt--;
1188eb9fe4caSDavid Valin 		if (list_link_active(&msg_entry.msgw_list))
1189eb9fe4caSDavid Valin 			list_remove(&qp->msg_wait_rcv, &msg_entry);
11902c5b6df1Sdv 		if (error = msgq_check_err(qp, cvres)) {
11917c478bd9Sstevel@tonic-gate 			goto msgsnd_out;
11927c478bd9Sstevel@tonic-gate 		}
1193eb9fe4caSDavid Valin 		wait_wakeup = 1;
11947c478bd9Sstevel@tonic-gate 	}
11957c478bd9Sstevel@tonic-gate 
11967c478bd9Sstevel@tonic-gate 	if (mp == NULL) {
11977c478bd9Sstevel@tonic-gate 		int failure;
11987c478bd9Sstevel@tonic-gate 
11997c478bd9Sstevel@tonic-gate 		mutex_exit(lock);
1200e50383f4Sdv 		ASSERT(msgsz > 0);
12017c478bd9Sstevel@tonic-gate 		mp = kmem_zalloc(sizeof (struct msg), KM_SLEEP);
1202e50383f4Sdv 		mp->msg_addr = kmem_alloc(msgsz, KM_SLEEP);
12037c478bd9Sstevel@tonic-gate 		mp->msg_size = msgsz;
12047c478bd9Sstevel@tonic-gate 		mp->msg_copycnt = 1;
12057c478bd9Sstevel@tonic-gate 
1206e50383f4Sdv 		failure = (copyin(STRUCT_FADDR(umsgp, mtext),
12077c478bd9Sstevel@tonic-gate 		    mp->msg_addr, msgsz) == -1);
12087c478bd9Sstevel@tonic-gate 		lock = ipc_lock(msq_svc, qp->msg_perm.ipc_id);
12097c478bd9Sstevel@tonic-gate 		if (IPC_FREE(&qp->msg_perm)) {
12107c478bd9Sstevel@tonic-gate 			error = EIDRM;
12117c478bd9Sstevel@tonic-gate 			goto msgsnd_out;
12127c478bd9Sstevel@tonic-gate 		}
12137c478bd9Sstevel@tonic-gate 		if (failure) {
12147c478bd9Sstevel@tonic-gate 			error = EFAULT;
12157c478bd9Sstevel@tonic-gate 			goto msgsnd_out;
12167c478bd9Sstevel@tonic-gate 		}
12177c478bd9Sstevel@tonic-gate 		goto top;
12187c478bd9Sstevel@tonic-gate 	}
12197c478bd9Sstevel@tonic-gate 
12207c478bd9Sstevel@tonic-gate 	/*
12217c478bd9Sstevel@tonic-gate 	 * Everything is available, put msg on q.
12227c478bd9Sstevel@tonic-gate 	 */
12237c478bd9Sstevel@tonic-gate 	qp->msg_qnum++;
12247c478bd9Sstevel@tonic-gate 	qp->msg_cbytes += msgsz;
12257c478bd9Sstevel@tonic-gate 	qp->msg_lspid = curproc->p_pid;
12267c478bd9Sstevel@tonic-gate 	qp->msg_stime = gethrestime_sec();
12277c478bd9Sstevel@tonic-gate 	mp->msg_type = type;
12282c5b6df1Sdv 	if (qp->msg_lowest_type > type)
12292c5b6df1Sdv 		qp->msg_lowest_type = type;
12307c478bd9Sstevel@tonic-gate 	list_insert_tail(&qp->msg_list, mp);
1231b2eb1770Sudpa 	/*
12322c5b6df1Sdv 	 * Get the proper receiver going.
1233b2eb1770Sudpa 	 */
12342c5b6df1Sdv 	msg_wakeup_rdr(qp, &qp->msg_fnd_sndr, type);
12357c478bd9Sstevel@tonic-gate 
12367c478bd9Sstevel@tonic-gate msgsnd_out:
	/*
	 * We were woken up from the send wait list, but an
	 * error occurred on placing the message onto the
	 * msg queue.  Given that, we need to do the wakeup
	 * dance again.
	 */
1243eb9fe4caSDavid Valin 
1244eb9fe4caSDavid Valin 	if (wait_wakeup && error) {
1245eb9fe4caSDavid Valin 		msg_wakeup_senders(qp);
1246eb9fe4caSDavid Valin 	}
1247e50383f4Sdv 	if (lock)
1248e50383f4Sdv 		ipc_rele(msq_svc, (kipc_perm_t *)qp);	/* drops lock */
12497c478bd9Sstevel@tonic-gate 
12507c478bd9Sstevel@tonic-gate 	if (error) {
12517c478bd9Sstevel@tonic-gate 		if (mp)
12527c478bd9Sstevel@tonic-gate 			msg_rele(mp);
12537c478bd9Sstevel@tonic-gate 		return (set_errno(error));
12547c478bd9Sstevel@tonic-gate 	}
12557c478bd9Sstevel@tonic-gate 
12567c478bd9Sstevel@tonic-gate 	return (0);
12577c478bd9Sstevel@tonic-gate }
12587c478bd9Sstevel@tonic-gate 
12592c5b6df1Sdv static void
msg_wakeup_rdr(kmsqid_t * qp,msg_select_t ** flist,long type)12602c5b6df1Sdv msg_wakeup_rdr(kmsqid_t *qp, msg_select_t **flist, long type)
12612c5b6df1Sdv {
12622c5b6df1Sdv 	msg_select_t	*walker = *flist;
12632c5b6df1Sdv 	msgq_wakeup_t	*wakeup;
1264e5994f96Sdv 	uint_t		msg_hash;
12652c5b6df1Sdv 
12662c5b6df1Sdv 	msg_hash = msg_type_hash(type);
12672c5b6df1Sdv 
12682c5b6df1Sdv 	do {
12692c5b6df1Sdv 		wakeup = walker->selection(qp, msg_hash, type);
12702c5b6df1Sdv 		walker = walker->next_selection;
12712c5b6df1Sdv 	} while (!wakeup && walker != *flist);
12722c5b6df1Sdv 
12732c5b6df1Sdv 	*flist = (*flist)->next_selection;
12742c5b6df1Sdv 	if (wakeup) {
12752c5b6df1Sdv 		if (type) {
12762c5b6df1Sdv 			wakeup->msgw_snd_wake = type;
12772c5b6df1Sdv 		}
12782c5b6df1Sdv 		cv_signal(&wakeup->msgw_wake_cv);
12792c5b6df1Sdv 	}
12802c5b6df1Sdv }
12812c5b6df1Sdv 
1282e5994f96Sdv static uint_t
msg_type_hash(long msg_type)12832c5b6df1Sdv msg_type_hash(long msg_type)
12842c5b6df1Sdv {
12852c5b6df1Sdv 	if (msg_type < 0) {
1286e5994f96Sdv 		long	hash = -msg_type / MSG_NEG_INTERVAL;
12872c5b6df1Sdv 		/*
12882c5b6df1Sdv 		 * Negative message types are hashed over an
12892c5b6df1Sdv 		 * interval.  Any message type that hashes
12902c5b6df1Sdv 		 * beyond MSG_MAX_QNUM is automatically placed
12912c5b6df1Sdv 		 * in the last bucket.
12922c5b6df1Sdv 		 */
1293e5994f96Sdv 		if (hash > MSG_MAX_QNUM)
12942c5b6df1Sdv 			hash = MSG_MAX_QNUM;
12952c5b6df1Sdv 		return (hash);
12962c5b6df1Sdv 	}
12972c5b6df1Sdv 
12982c5b6df1Sdv 	/*
12992c5b6df1Sdv 	 * 0 or positive message type.  The first bucket is reserved for
13002c5b6df1Sdv 	 * message receivers of type 0, the other buckets we hash into.
13012c5b6df1Sdv 	 */
1302e5994f96Sdv 	if (msg_type)
1303e5994f96Sdv 		return (1 + (msg_type % MSG_MAX_QNUM));
13042c5b6df1Sdv 	return (0);
13052c5b6df1Sdv }
13062c5b6df1Sdv 
/*
 * Routines to see if we have a waiter blocked, either on the type 0
 * receiver list (msg_wait_snd[0]) or on the msg_cpy_block list.  Simply
 * unlink and return the first guy on the list.
 */
13112c5b6df1Sdv 
13122c5b6df1Sdv static msgq_wakeup_t *
1313e5994f96Sdv /* ARGSUSED */
msg_fnd_any_snd(kmsqid_t * qp,int msg_hash,long type)13142c5b6df1Sdv msg_fnd_any_snd(kmsqid_t *qp, int msg_hash, long type)
13152c5b6df1Sdv {
1316e5994f96Sdv 	msgq_wakeup_t	*walker;
1317e5994f96Sdv 
1318e5994f96Sdv 	walker = list_head(&qp->msg_wait_snd[0]);
1319e5994f96Sdv 
1320e5994f96Sdv 	if (walker)
1321e5994f96Sdv 		list_remove(&qp->msg_wait_snd[0], walker);
1322e5994f96Sdv 	return (walker);
13232c5b6df1Sdv }
13242c5b6df1Sdv 
13252c5b6df1Sdv static msgq_wakeup_t *
1326e5994f96Sdv /* ARGSUSED */
msg_fnd_any_rdr(kmsqid_t * qp,int msg_hash,long type)13272c5b6df1Sdv msg_fnd_any_rdr(kmsqid_t *qp, int msg_hash, long type)
13282c5b6df1Sdv {
1329e5994f96Sdv 	msgq_wakeup_t	*walker;
1330e5994f96Sdv 
1331e5994f96Sdv 	walker = list_head(&qp->msg_cpy_block);
1332e5994f96Sdv 	if (walker)
1333e5994f96Sdv 		list_remove(&qp->msg_cpy_block, walker);
1334e5994f96Sdv 	return (walker);
13352c5b6df1Sdv }
13362c5b6df1Sdv 
13372c5b6df1Sdv static msgq_wakeup_t *
msg_fnd_spc_snd(kmsqid_t * qp,int msg_hash,long type)13382c5b6df1Sdv msg_fnd_spc_snd(kmsqid_t *qp, int msg_hash, long type)
13392c5b6df1Sdv {
13402c5b6df1Sdv 	msgq_wakeup_t	*walker;
13412c5b6df1Sdv 
13422c5b6df1Sdv 	walker = list_head(&qp->msg_wait_snd[msg_hash]);
13432c5b6df1Sdv 
1344e5994f96Sdv 	while (walker && walker->msgw_type != type)
1345e5994f96Sdv 		walker = list_next(&qp->msg_wait_snd[msg_hash], walker);
1346e5994f96Sdv 	if (walker)
1347e5994f96Sdv 		list_remove(&qp->msg_wait_snd[msg_hash], walker);
13482c5b6df1Sdv 	return (walker);
13492c5b6df1Sdv }
13502c5b6df1Sdv 
1351e5994f96Sdv /* ARGSUSED */
13522c5b6df1Sdv static msgq_wakeup_t *
msg_fnd_neg_snd(kmsqid_t * qp,int msg_hash,long type)13532c5b6df1Sdv msg_fnd_neg_snd(kmsqid_t *qp, int msg_hash, long type)
13542c5b6df1Sdv {
13552c5b6df1Sdv 	msgq_wakeup_t	*qptr;
13562c5b6df1Sdv 	int		count;
13572c5b6df1Sdv 	int		check_index;
13582c5b6df1Sdv 	int		neg_index;
13592c5b6df1Sdv 	int		nbuckets;
13602c5b6df1Sdv 
13612c5b6df1Sdv 	if (!qp->msg_ngt_cnt) {
13622c5b6df1Sdv 		return (NULL);
13632c5b6df1Sdv 	}
13642c5b6df1Sdv 	neg_index = msg_type_hash(-type);
13652c5b6df1Sdv 
13662c5b6df1Sdv 	/*
13672c5b6df1Sdv 	 * Check for a match among the negative type queues.  Any buckets
13682c5b6df1Sdv 	 * at neg_index or larger can match the type.  Use the last send
13692c5b6df1Sdv 	 * time to randomize the starting bucket to prevent starvation.
13702c5b6df1Sdv 	 * Search all buckets from neg_index to MSG_MAX_QNUM, starting
13712c5b6df1Sdv 	 * from the random starting point, and wrapping around after
13722c5b6df1Sdv 	 * MSG_MAX_QNUM.
13732c5b6df1Sdv 	 */
13742c5b6df1Sdv 
13752c5b6df1Sdv 	nbuckets = MSG_MAX_QNUM - neg_index + 1;
13762c5b6df1Sdv 	check_index = neg_index + (qp->msg_stime % nbuckets);
13772c5b6df1Sdv 
13782c5b6df1Sdv 	for (count = nbuckets; count > 0; count--) {
13792c5b6df1Sdv 		qptr = list_head(&qp->msg_wait_snd_ngt[check_index]);
13802c5b6df1Sdv 		while (qptr) {
13812c5b6df1Sdv 			/*
13822c5b6df1Sdv 			 * The lowest hash bucket may actually contain
13832c5b6df1Sdv 			 * message types that are not valid for this
13842c5b6df1Sdv 			 * request.  This can happen due to the fact that
13852c5b6df1Sdv 			 * the message buckets actually contain a consecutive
13862c5b6df1Sdv 			 * range of types.
13872c5b6df1Sdv 			 */
13882c5b6df1Sdv 			if (-qptr->msgw_type >= type) {
1389e5994f96Sdv 				list_remove(&qp->msg_wait_snd_ngt[check_index],
1390e5994f96Sdv 				    qptr);
13912c5b6df1Sdv 				return (qptr);
13922c5b6df1Sdv 			}
1393e5994f96Sdv 			qptr = list_next(&qp->msg_wait_snd_ngt[check_index],
1394e5994f96Sdv 			    qptr);
13952c5b6df1Sdv 		}
13962c5b6df1Sdv 		if (++check_index > MSG_MAX_QNUM) {
13972c5b6df1Sdv 			check_index = neg_index;
13982c5b6df1Sdv 		}
13992c5b6df1Sdv 	}
14002c5b6df1Sdv 	return (NULL);
14012c5b6df1Sdv }
14022c5b6df1Sdv 
14032c5b6df1Sdv static int
msg_rcvq_sleep(list_t * queue,msgq_wakeup_t * entry,kmutex_t ** lock,kmsqid_t * qp)14042c5b6df1Sdv msg_rcvq_sleep(list_t *queue, msgq_wakeup_t *entry, kmutex_t **lock,
14052c5b6df1Sdv     kmsqid_t *qp)
14062c5b6df1Sdv {
14072c5b6df1Sdv 	int		cvres;
14082c5b6df1Sdv 
14092c5b6df1Sdv 	cv_init(&entry->msgw_wake_cv, NULL, 0, NULL);
14102c5b6df1Sdv 
14112c5b6df1Sdv 	list_insert_tail(queue, entry);
14122c5b6df1Sdv 
14132c5b6df1Sdv 	qp->msg_rcv_cnt++;
14142c5b6df1Sdv 	cvres = cv_wait_sig(&entry->msgw_wake_cv, *lock);
14152c5b6df1Sdv 	*lock = ipc_relock(msq_svc, qp->msg_perm.ipc_id, *lock);
14162c5b6df1Sdv 	qp->msg_rcv_cnt--;
1417e5994f96Sdv 
1418e5994f96Sdv 	if (list_link_active(&entry->msgw_list)) {
1419e5994f96Sdv 		/*
1420e5994f96Sdv 		 * We woke up unexpectedly, remove ourself.
1421e5994f96Sdv 		 */
1422e5994f96Sdv 		list_remove(queue, entry);
1423e5994f96Sdv 	}
14242c5b6df1Sdv 
14252c5b6df1Sdv 	return (cvres);
14262c5b6df1Sdv }
14272c5b6df1Sdv 
14282c5b6df1Sdv static void
msg_rcvq_wakeup_all(list_t * q_ptr)14292c5b6df1Sdv msg_rcvq_wakeup_all(list_t *q_ptr)
14302c5b6df1Sdv {
14312c5b6df1Sdv 	msgq_wakeup_t	*q_walk;
14322c5b6df1Sdv 
1433e5994f96Sdv 	while (q_walk = list_head(q_ptr)) {
1434e5994f96Sdv 		list_remove(q_ptr, q_walk);
14352c5b6df1Sdv 		cv_signal(&q_walk->msgw_wake_cv);
14362c5b6df1Sdv 	}
14372c5b6df1Sdv }
14382c5b6df1Sdv 
14397c478bd9Sstevel@tonic-gate /*
14407c478bd9Sstevel@tonic-gate  * msgsys - System entry point for msgctl, msgget, msgrcv, and msgsnd
14417c478bd9Sstevel@tonic-gate  * system calls.
14427c478bd9Sstevel@tonic-gate  */
14437c478bd9Sstevel@tonic-gate static ssize_t
msgsys(int opcode,uintptr_t a1,uintptr_t a2,uintptr_t a3,uintptr_t a4,uintptr_t a5)14447c478bd9Sstevel@tonic-gate msgsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3,
1445*ac2ff9f2SToomas Soome     uintptr_t a4, uintptr_t a5)
14467c478bd9Sstevel@tonic-gate {
14477c478bd9Sstevel@tonic-gate 	ssize_t error;
14487c478bd9Sstevel@tonic-gate 
14497c478bd9Sstevel@tonic-gate 	switch (opcode) {
14507c478bd9Sstevel@tonic-gate 	case MSGGET:
14517c478bd9Sstevel@tonic-gate 		error = msgget((key_t)a1, (int)a2);
14527c478bd9Sstevel@tonic-gate 		break;
14537c478bd9Sstevel@tonic-gate 	case MSGCTL:
14547c478bd9Sstevel@tonic-gate 		error = msgctl((int)a1, (int)a2, (void *)a3);
14557c478bd9Sstevel@tonic-gate 		break;
14567c478bd9Sstevel@tonic-gate 	case MSGRCV:
14577c478bd9Sstevel@tonic-gate 		error = msgrcv((int)a1, (struct ipcmsgbuf *)a2,
14587c478bd9Sstevel@tonic-gate 		    (size_t)a3, (long)a4, (int)a5);
14597c478bd9Sstevel@tonic-gate 		break;
14607c478bd9Sstevel@tonic-gate 	case MSGSND:
14617c478bd9Sstevel@tonic-gate 		error = msgsnd((int)a1, (struct ipcmsgbuf *)a2,
14627c478bd9Sstevel@tonic-gate 		    (size_t)a3, (int)a4);
14637c478bd9Sstevel@tonic-gate 		break;
14647c478bd9Sstevel@tonic-gate 	case MSGIDS:
14657c478bd9Sstevel@tonic-gate 		error = msgids((int *)a1, (uint_t)a2, (uint_t *)a3);
14667c478bd9Sstevel@tonic-gate 		break;
14677c478bd9Sstevel@tonic-gate 	case MSGSNAP:
14687c478bd9Sstevel@tonic-gate 		error = msgsnap((int)a1, (caddr_t)a2, (size_t)a3, (long)a4);
14697c478bd9Sstevel@tonic-gate 		break;
14707c478bd9Sstevel@tonic-gate 	default:
14717c478bd9Sstevel@tonic-gate 		error = set_errno(EINVAL);
14727c478bd9Sstevel@tonic-gate 		break;
14737c478bd9Sstevel@tonic-gate 	}
14747c478bd9Sstevel@tonic-gate 
14757c478bd9Sstevel@tonic-gate 	return (error);
14767c478bd9Sstevel@tonic-gate }
14777c478bd9Sstevel@tonic-gate 
1478eb9fe4caSDavid Valin /*
1479eb9fe4caSDavid Valin  * Determine if a writer who is waiting can process its message.  If so
1480eb9fe4caSDavid Valin  * wake it up.
1481eb9fe4caSDavid Valin  */
1482eb9fe4caSDavid Valin static void
msg_wakeup_senders(kmsqid_t * qp)1483eb9fe4caSDavid Valin msg_wakeup_senders(kmsqid_t *qp)
1484eb9fe4caSDavid Valin {
1485eb9fe4caSDavid Valin 	struct msgq_wakeup *ptr, *optr;
1486eb9fe4caSDavid Valin 	size_t avail, smallest;
1487eb9fe4caSDavid Valin 	int msgs_out;
1488eb9fe4caSDavid Valin 
1489eb9fe4caSDavid Valin 	/*
1490eb9fe4caSDavid Valin 	 * Is there a writer waiting, and if so, can it be serviced? If
1491eb9fe4caSDavid Valin 	 * not return back to the caller.
1492eb9fe4caSDavid Valin 	 */
1493eb9fe4caSDavid Valin 	if (IPC_FREE(&qp->msg_perm) || qp->msg_qnum >= qp->msg_qmax)
1494eb9fe4caSDavid Valin 		return;
1495eb9fe4caSDavid Valin 
1496eb9fe4caSDavid Valin 	avail = qp->msg_qbytes - qp->msg_cbytes;
1497eb9fe4caSDavid Valin 	if (avail < qp->msg_snd_smallest)
1498eb9fe4caSDavid Valin 		return;
1499eb9fe4caSDavid Valin 
1500eb9fe4caSDavid Valin 	ptr = list_head(&qp->msg_wait_rcv);
1501eb9fe4caSDavid Valin 	if (ptr == NULL) {
1502eb9fe4caSDavid Valin 		qp->msg_snd_smallest = MSG_SMALL_INIT;
1503eb9fe4caSDavid Valin 		return;
1504eb9fe4caSDavid Valin 	}
1505eb9fe4caSDavid Valin 	optr = ptr;
1506eb9fe4caSDavid Valin 
1507eb9fe4caSDavid Valin 	/*
1508eb9fe4caSDavid Valin 	 * smallest:	minimum message size of all queued writers
1509eb9fe4caSDavid Valin 	 *
1510eb9fe4caSDavid Valin 	 * avail:	amount of space left on the msgq
1511eb9fe4caSDavid Valin 	 *		if all the writers we have woken up are successful.
1512eb9fe4caSDavid Valin 	 *
1513eb9fe4caSDavid Valin 	 * msgs_out:	is the number of messages on the message queue if
1514eb9fe4caSDavid Valin 	 *		all the writers we have woken up are successful.
1515eb9fe4caSDavid Valin 	 */
1516eb9fe4caSDavid Valin 
1517eb9fe4caSDavid Valin 	smallest = MSG_SMALL_INIT;
1518eb9fe4caSDavid Valin 	msgs_out = qp->msg_qnum;
1519eb9fe4caSDavid Valin 	while (ptr) {
1520eb9fe4caSDavid Valin 		ptr = list_next(&qp->msg_wait_rcv, ptr);
1521eb9fe4caSDavid Valin 		if (optr->msgw_snd_size <= avail) {
1522eb9fe4caSDavid Valin 			list_remove(&qp->msg_wait_rcv, optr);
1523eb9fe4caSDavid Valin 			avail -= optr->msgw_snd_size;
1524eb9fe4caSDavid Valin 			cv_signal(&optr->msgw_wake_cv);
1525eb9fe4caSDavid Valin 			msgs_out++;
1526eb9fe4caSDavid Valin 			if (msgs_out == qp->msg_qmax ||
1527eb9fe4caSDavid Valin 			    avail < qp->msg_snd_smallest)
1528eb9fe4caSDavid Valin 				break;
1529eb9fe4caSDavid Valin 		} else {
1530eb9fe4caSDavid Valin 			if (smallest > optr->msgw_snd_size)
1531eb9fe4caSDavid Valin 				smallest = optr->msgw_snd_size;
1532eb9fe4caSDavid Valin 		}
1533eb9fe4caSDavid Valin 		optr = ptr;
1534eb9fe4caSDavid Valin 	}
1535eb9fe4caSDavid Valin 
1536eb9fe4caSDavid Valin 	/*
1537eb9fe4caSDavid Valin 	 * Reset the smallest message size if the entire list has been visited
1538eb9fe4caSDavid Valin 	 */
1539eb9fe4caSDavid Valin 	if (ptr == NULL && smallest != MSG_SMALL_INIT)
1540eb9fe4caSDavid Valin 		qp->msg_snd_smallest = smallest;
1541eb9fe4caSDavid Valin }
1542eb9fe4caSDavid Valin 
#ifdef	_SYSCALL32_IMPL
/*
 * msgsys32 - System entry point for the message queue system calls for
 * 32-bit callers on LP64 kernel.
 *
 * Dispatches on opcode to msgget, msgctl, msgrcv, msgsnd, msgids or
 * msgsnap.  Pointer arguments are widened through uintptr_t, and the
 * message type arguments (and the msgsnd size) are sign-extended through
 * int32_t.  An unrecognized opcode fails with EINVAL.
 */
static ssize32_t
msgsys32(int opcode, uint32_t a1, uint32_t a2, uint32_t a3,
    uint32_t a4, uint32_t a5)
{
	switch (opcode) {
	case MSGGET:
		return (msgget((key_t)a1, (int)a2));
	case MSGCTL:
		return (msgctl((int)a1, (int)a2, (void *)(uintptr_t)a3));
	case MSGRCV:
		return (msgrcv((int)a1, (struct ipcmsgbuf *)(uintptr_t)a2,
		    (size_t)a3, (long)(int32_t)a4, (int)a5));
	case MSGSND:
		return (msgsnd((int)a1, (struct ipcmsgbuf *)(uintptr_t)a2,
		    (size_t)(int32_t)a3, (int)a4));
	case MSGIDS:
		return (msgids((int *)(uintptr_t)a1, (uint_t)a2,
		    (uint_t *)(uintptr_t)a3));
	case MSGSNAP:
		return (msgsnap((int)a1, (caddr_t)(uintptr_t)a2, (size_t)a3,
		    (long)(int32_t)a4));
	default:
		break;
	}

	/* Unknown opcode. */
	return (set_errno(EINVAL));
}
#endif	/* _SYSCALL32_IMPL */
1585