1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2020 Oxide Computer Company
26 */
27
28#if !defined(_KERNEL) && !defined(_KMDB)
29#include "lint.h"
30#endif /* !_KERNEL && !_KMDB */
31
32#include <sys/types.h>
33
34#if !defined(_KERNEL) && !defined(_KMDB)
35#include <stdlib.h>
36#include <synch.h>
37#endif /* !_KERNEL && !_KMDB */
38
39#include "qsort.h"
40
41static void swapp32(uint32_t *r1, uint32_t *r2, size_t cnt);
42static void swapp64(uint64_t *r1, uint64_t *r2, size_t cnt);
43static void swapi(uint32_t *r1, uint32_t *r2, size_t cnt);
44static void swapb(char *r1, char *r2, size_t cnt);
45
46typedef int (*cmp_f)(const void *, const void *, void *);
47
48/*
49 * choose a median of 3 values
50 */
51static __GNU_INLINE char *
52med3(char *a, char *b, char *c, cmp_f cmp, void *arg)
53{
54	if (cmp(a, b, arg) < 0) {
55		if (cmp(b, c, arg) < 0) {
56			return (b);
57		} else if (cmp(a, c, arg) < 0) {
58			return (c);
59		} else {
60			return (a);
61		}
62	} else {
63		if (cmp(b, c, arg) > 0) {
64			return (b);
65		} else if (cmp(a, c, arg) > 0) {
66			return (c);
67		} else {
68			return (a);
69		}
70	}
71}
72
73#define	THRESH_L	5	/* threshold for insertion sort */
74#define	THRESH_M3	20	/* threshold for median of 3 */
75#define	THRESH_M9	50	/* threshold for median of 9 */
76
77typedef struct {
78	char	*b_lim;
79	size_t	nrec;
80} stk_t;
81
82/*
83 * qsort() is a general purpose, in-place sorting routine using a
84 * user provided call back function for comparisons.  This implementation
85 * utilizes a ternary quicksort algorithm, and cuts over to an
86 * insertion sort for partitions involving fewer than THRESH_L records.
87 *
88 * Potential User Errors
89 *   There is no return value from qsort, this function has no method
90 *   of alerting the user that a sort did not work or could not work.
91 *   We do not print an error message or exit the process or thread,
92 *   Even if we can detect an error, We CANNOT silently return without
93 *   sorting the data, if we did so the user could never be sure the
94 *   sort completed successfully.
95 *   It is possible we could change the return value of sort from void
96 *   to int and return success or some error codes, but this gets into
97 *   standards  and compatibility issues.
98 *
99 *   Examples of qsort parameter errors might be
100 *   1) record size (rsiz) equal to 0
101 *      qsort will loop and never return.
102 *   2) record size (rsiz) less than 0
103 *      rsiz is unsigned, so a negative value is insanely large
104 *   3) number of records (nrec) is 0
105 *      This is legal - qsort will return without examining any records
106 *   4) number of records (nrec) is less than 0
107 *      nrec is unsigned, so a negative value is insanely large.
108 *   5) nrec * rsiz > memory allocation for sort array
109 *      a segment violation may occur
110 *      corruption of other memory may occur
111 *   6) The base address of the sort array is invalid
112 *      a segment violation may occur
113 *      corruption of other memory may occur
114 *   7) The user call back function is invalid
115 *      we may get alignment errors or segment violations
116 *      we may jump into never-never land
117 *
118 *   Some less obvious errors might be
119 *   8) The user compare function is not comparing correctly
120 *   9) The user compare function modifies the data records
121 */
122
123static int
124qsort_r_wrapper(const void *a, const void *b, void *arg)
125{
126	int (*cmp)(const void *, const void *) =
127	    (int(*)(const void *, const void *))(uintptr_t)arg;
128	return (cmp(a, b));
129}
130
131void
132qsort_r(void *basep, size_t nrec, size_t rsiz,
133    cmp_f cmp, void *arg)
134{
135	size_t		i;		/* temporary variable */
136
137	/* variables used by swap */
138	void		(*swapf)(char *, char *, size_t);
139	size_t		loops;
140
141	/* variables used by sort */
142	stk_t		stack[8 * sizeof (nrec) + 1];
143	stk_t		*sp;
144	char		*b_lim;		/* bottom limit */
145	char		*b_dup;		/* bottom duplicate */
146	char		*b_par;		/* bottom partition */
147	char		*t_lim;		/* top limit */
148	char		*t_dup;		/* top duplicate */
149	char		*t_par;		/* top partition */
150	char		*m1, *m2, *m3;	/* median pointers */
151	uintptr_t	d_bytelength;	/* byte length of duplicate records */
152	int		b_nrec;
153	int		t_nrec;
154	int		cv;		/* results of compare (bottom / top) */
155
156	/*
157	 * choose a swap function based on alignment and size
158	 *
159	 * The qsort function sorts an array of fixed length records.
160	 * We have very limited knowledge about the data record itself.
161	 * It may be that the data record is in the array we are sorting
162	 * or it may be that the array contains pointers or indexes to
163	 * the actual data record and all that we are sorting is the indexes.
164	 *
165	 * The following decision will choose an optimal swap function
166	 * based on the size and alignment of the data records
167	 *   swapp64	will swap 64 bit pointers
168	 *   swapp32	will swap 32 bit pointers
169	 *   swapi	will swap an array of 32 bit integers
170	 *   swapb	will swap an array of 8 bit characters
171	 *
172	 * swapi and swapb will also require the variable loops to be set
173	 * to control the length of the array being swapped
174	 */
175	if ((((uintptr_t)basep & (sizeof (uint64_t) - 1)) == 0) &&
176	    (rsiz == sizeof (uint64_t))) {
177		loops = 1;
178		swapf = (void (*)(char *, char *, size_t))swapp64;
179	} else if ((((uintptr_t)basep & (sizeof (uint32_t) - 1)) == 0) &&
180	    (rsiz == sizeof (uint32_t))) {
181		loops = 1;
182		swapf = (void (*)(char *, char *, size_t))swapp32;
183	} else if ((((uintptr_t)basep & (sizeof (uint32_t) - 1)) == 0) &&
184	    ((rsiz & (sizeof (uint32_t) - 1)) == 0)) {
185		loops = rsiz / sizeof (int);
186		swapf = (void (*)(char *, char *, size_t))swapi;
187	} else {
188		loops = rsiz;
189		swapf = swapb;
190	}
191
192	/*
193	 * qsort is a partitioning sort
194	 *
195	 * the stack is the bookkeeping mechanism to keep track of all
196	 * the partitions.
197	 *
198	 * each sort pass takes one partition and sorts it into two partitions.
199	 * at the top of the loop we simply take the partition on the top
200	 * of the stack and sort it. See the comments at the bottom
201	 * of the loop regarding which partitions to add in what order.
202	 *
203	 * initially put the whole partition on the stack
204	 */
205	sp = stack;
206	sp->b_lim = (char *)basep;
207	sp->nrec = nrec;
208	sp++;
209	while (sp > stack) {
210		sp--;
211		b_lim = sp->b_lim;
212		nrec = sp->nrec;
213
214		/*
215		 * a linear insertion sort i faster than a qsort for
216		 * very small number of records (THRESH_L)
217		 *
218		 * if number records < threshold use linear insertion sort
219		 *
220		 * this also handles the special case where the partition
221		 * 0 or 1 records length.
222		 */
223		if (nrec < THRESH_L) {
224			/*
225			 * Linear insertion sort
226			 */
227			t_par = b_lim;
228			for (i = 1; i < nrec; i++) {
229				t_par += rsiz;
230				b_par = t_par;
231				while (b_par > b_lim) {
232					b_par -= rsiz;
233					if ((*cmp)(b_par, b_par + rsiz, arg) <=
234					    0) {
235						break;
236					}
237					(*swapf)(b_par, b_par + rsiz, loops);
238				}
239			}
240
241			/*
242			 * a linear insertion sort will put all records
243			 * in their final position and will not create
244			 * subpartitions.
245			 *
246			 * therefore when the insertion sort is complete
247			 * just go to the top of the loop and get the
248			 * next partition to sort.
249			 */
250			continue;
251		}
252
253		/* quicksort */
254
255		/*
256		 * choose a pivot record
257		 *
258		 * Ideally the pivot record will divide the partition
259		 * into two equal parts. however we have to balance the
260		 * work involved in selecting the pivot record with the
261		 * expected benefit.
262		 *
263		 * The choice of pivot record depends on the number of
264		 * records in the partition
265		 *
266		 * for small partitions (nrec < THRESH_M3)
267		 *   we just select the record in the middle of the partition
268		 *
269		 * if (nrec >= THRESH_M3 && nrec < THRESH_M9)
270		 *   we select three values and choose the median of 3
271		 *
272		 * if (nrec >= THRESH_M9)
273		 *   then we use an approximate median of 9
274		 *   9 records are selected and grouped in 3 groups of 3
275		 *   the median of each of these 3 groups is fed into another
276		 *   median of 3 decision.
277		 *
278		 * Each median of 3 decision is 2 or 3 compares,
279		 * so median of 9 costs between 8 and 12 compares.
280		 *
281		 * i is byte distance between two consecutive samples
282		 * m2 will point to the pivot record
283		 */
284		if (nrec < THRESH_M3) {
285			m2 = b_lim + (nrec / 2) * rsiz;
286		} else if (nrec < THRESH_M9) {
287			/* use median of 3 */
288			i = ((nrec - 1) / 2) * rsiz;
289			m2 = med3(b_lim, b_lim + i, b_lim + 2 * i, cmp, arg);
290		} else {
291			/* approx median of 9 */
292			i = ((nrec - 1) / 8) * rsiz;
293			m1 = med3(b_lim, b_lim +  i, b_lim + 2 * i, cmp, arg);
294			m2 = med3(b_lim + 3 * i, b_lim + 4 * i, b_lim + 5 * i,
295			    cmp, arg);
296			m3 = med3(b_lim + 6 * i, b_lim + 7 * i, b_lim + 8 * i,
297			    cmp, arg);
298			m2 = med3(m1, m2, m3, cmp, arg);
299		}
300
301		/*
302		 * quick sort partitioning
303		 *
304		 * The partition limits are defined by bottom and top pointers
305		 * b_lim and t_lim.
306		 *
307		 * qsort uses a fairly standard method of moving the
308		 * partitioning pointers, b_par and t_par, to the middle of
309		 * the partition and exchanging records that are in the
310		 * wrong part of the partition.
311		 *
312		 * Two enhancements have been made to the basic algorithm.
313		 * One for handling duplicate records and one to minimize
314		 * the number of swaps.
315		 *
316		 * Two duplicate records pointers are (b_dup and t_dup) are
317		 * initially set to b_lim and t_lim.  Each time a record
318		 * whose sort key value is equal to the pivot record is found
319		 * it will be swapped with the record pointed to by
320		 * b_dup or t_dup and the duplicate pointer will be
321		 * incremented toward the center.
322		 * When partitioning is complete, all the duplicate records
323		 * will have been collected at the upper and lower limits of
324		 * the partition and can easily be moved adjacent to the
325		 * pivot record.
326		 *
327		 * The second optimization is to minimize the number of swaps.
328		 * The pointer m2 points to the pivot record.
329		 * During partitioning, if m2 is ever equal to the partitioning
330		 * pointers, b_par or t_par, then b_par or t_par just moves
331		 * onto the next record without doing a compare.
332		 * If as a result of duplicate record detection,
333		 * b_dup or t_dup is ever equal to m2,
334		 * then m2 is changed to point to the duplicate record and
335		 * b_dup or t_dup is incremented with out swapping records.
336		 *
337		 * When partitioning is done, we may not have the same pivot
338		 * record that we started with, but we will have one with
339		 * an equal sort key.
340		 */
341		b_dup = b_par		= b_lim;
342		t_dup = t_par = t_lim	= b_lim + rsiz * (nrec - 1);
343		for (;;) {
344
345			/* move bottom pointer up */
346			for (; b_par <= t_par; b_par += rsiz) {
347				if (b_par == m2) {
348					continue;
349				}
350				cv = cmp(b_par, m2, arg);
351				if (cv > 0) {
352					break;
353				}
354				if (cv == 0) {
355					if (b_dup == m2) {
356						m2 = b_par;
357					} else if (b_dup != b_par) {
358						(*swapf)(b_dup, b_par, loops);
359					}
360					b_dup += rsiz;
361				}
362			}
363
364			/* move top pointer down */
365			for (; b_par < t_par; t_par -= rsiz) {
366				if (t_par == m2) {
367					continue;
368				}
369				cv = cmp(t_par, m2, arg);
370				if (cv < 0) {
371					break;
372				}
373				if (cv == 0) {
374					if (t_dup == m2) {
375						m2 = t_par;
376					} else if (t_dup != t_par) {
377						(*swapf)(t_dup, t_par, loops);
378					}
379					t_dup -= rsiz;
380				}
381			}
382
383			/* break if we are done partitioning */
384			if (b_par >= t_par) {
385				break;
386			}
387
388			/* exchange records at upper and lower break points */
389			(*swapf)(b_par, t_par, loops);
390			b_par += rsiz;
391			t_par -= rsiz;
392		}
393
394		/*
395		 * partitioning is now complete
396		 *
397		 * there are two termination conditions from the partitioning
398		 * loop above.  Either b_par or t_par have crossed or
399		 * they are equal.
400		 *
401		 * we need to swap the pivot record to its final position
402		 * m2 could be in either the upper or lower partitions
403		 * or it could already be in its final position
404		 */
405		/*
406		 * R[b_par] > R[m2]
407		 * R[t_par] < R[m2]
408		 */
409		if (t_par < b_par) {
410			if (m2 < t_par) {
411				(*swapf)(m2, t_par, loops);
412				m2 = b_par = t_par;
413			} else if (m2 > b_par) {
414				(*swapf)(m2, b_par, loops);
415				m2 = t_par = b_par;
416			} else {
417				b_par = t_par = m2;
418			}
419		} else {
420			if (m2 < t_par) {
421				t_par = b_par = t_par - rsiz;
422			}
423			if (m2 != b_par) {
424				(*swapf)(m2, b_par, loops);
425			}
426			m2 = t_par;
427		}
428
429		/*
430		 * move bottom duplicates next to pivot
431		 * optimized to eliminate overlap
432		 */
433		d_bytelength = b_dup - b_lim;
434		if (b_par - b_dup < d_bytelength) {
435			b_dup = b_lim + (b_par - b_dup);
436		}
437		while (b_dup > b_lim) {
438			b_dup -= rsiz;
439			b_par -= rsiz;
440			(*swapf)(b_dup, b_par, loops);
441		}
442		b_par = m2 - d_bytelength;
443
444		/*
445		 * move top duplicates next to pivot
446		 */
447		d_bytelength = t_lim - t_dup;
448		if (t_dup - t_par < d_bytelength) {
449			t_dup = t_lim - (t_dup - t_par);
450		}
451		while (t_dup < t_lim) {
452			t_dup += rsiz;
453			t_par += rsiz;
454			(*swapf)(t_dup, t_par, loops);
455		}
456		t_par = m2 + d_bytelength;
457
458		/*
459		 * when a qsort pass completes there are three partitions
460		 * 1) the lower contains all records less than pivot
461		 * 2) the upper contains all records greater than pivot
462		 * 3) the pivot partition contains all record equal to pivot
463		 *
464		 * all records in the pivot partition are in their final
465		 * position and do not need to be accounted for by the stack
466		 *
467		 * when adding partitions to the stack
468		 * it is important to add the largest partition first
469		 * to prevent stack overflow.
470		 *
471		 * calculate number of unsorted records in top and bottom
472		 * push resulting partitions on stack
473		 */
474		b_nrec = (b_par - b_lim) / rsiz;
475		t_nrec = (t_lim - t_par) / rsiz;
476		if (b_nrec < t_nrec) {
477			sp->b_lim = t_par + rsiz;
478			sp->nrec = t_nrec;
479			sp++;
480			sp->b_lim = b_lim;
481			sp->nrec = b_nrec;
482			sp++;
483		} else {
484			sp->b_lim = b_lim;
485			sp->nrec = b_nrec;
486			sp++;
487			sp->b_lim = t_par + rsiz;
488			sp->nrec = t_nrec;
489			sp++;
490		}
491	}
492}
493
494void
495qsort(void *basep, size_t nrec, size_t rsiz,
496    int (*cmp)(const void *, const void *))
497{
498	return (qsort_r(basep, nrec, rsiz, qsort_r_wrapper,
499	    (void *)(uintptr_t)cmp));
500}
501
502/*
503 * The following swap functions should not create a stack frame
504 * the SPARC call / return instruction will be executed
505 * but the a save / restore will not be executed
506 * which means we won't do a window turn with the spill / fill overhead
507 * verify this by examining the assembly code
508 */
509
510/* ARGSUSED */
511static void
512swapp32(uint32_t *r1, uint32_t *r2, size_t cnt)
513{
514	uint32_t temp;
515
516	temp = *r1;
517	*r1++ = *r2;
518	*r2++ = temp;
519}
520
521/* ARGSUSED */
522static void
523swapp64(uint64_t *r1, uint64_t *r2, size_t cnt)
524{
525	uint64_t temp;
526
527	temp = *r1;
528	*r1++ = *r2;
529	*r2++ = temp;
530}
531
532static void
533swapi(uint32_t *r1, uint32_t *r2, size_t cnt)
534{
535	uint32_t temp;
536
537	/* character by character */
538	while (cnt--) {
539		temp = *r1;
540		*r1++ = *r2;
541		*r2++ = temp;
542	}
543}
544
545static void
546swapb(char *r1, char *r2, size_t cnt)
547{
548	char	temp;
549
550	/* character by character */
551	while (cnt--) {
552		temp = *r1;
553		*r1++ = *r2;
554		*r2++ = temp;
555	}
556}
557