xref: /illumos-gate/usr/src/cmd/sgs/rtld/amd64/boot_elf.S (revision 5d9d9091)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 * Copyright (c) 2018 Joyent, Inc. All rights reserved.
26 */
27
28/*
29 * Welcome to the magic behind the PLT (procedure linkage table). When rtld
30 * fills out the PLT entries, it will refer initially to the functions in this
31 * file. As such our goal is simple:
32 *
33 *     The lie of the function call must be preserved at all costs.
34 *
35 * This means that we need to prepare the system for an arbitrary series of
36 * instructions to be called. For example, as a side effect of resolving a
37 * symbol we may need to open a shared object which will cause any _init
38 * functions to be called. Those functions can use any and all of the ABI state
39 * that they desire (for example, the FPU registers). Therefore we must save and
40 * restore all the ABI mandated registers here.
41 *
42 * For the full information about what we need to save and restore and why,
43 * please see the System V amd64 PS ABI '3.2.3 Parameter Passing'. For general
44 * purpose registers, we need to take care of the following:
45 *
46 * 	%rax	- Used for information about the number of vector arguments
47 *	%rdi	- arg0
48 *	%rsi	- arg1
49 *	%rdx	- arg2
50 *	%rcx	- arg3
51 *	%r8	- arg4
52 *	%r9	- arg5
53 *	%r10	- static chain pointer
54 *
55 * Unfortunately, the world of the FPU is more complicated.
56 *
57 * The ABI mandates that we must save %xmm0-%xmm7. On newer Intel processors,
58 * %xmm0-%xmm7 shadow %ymm0-%ymm7 and %zmm0-%zmm7. Historically, when saving the
59 * FPU, we only saved and restored these eight registers. Unfortunately, this
60 * process itself ended up having side effects. Because the registers shadow one
61 * another, if we saved a full %zmm register when only a %xmm register was
62 * valid, we would end up causing the processor to think that the full %zmm
63 * register was valid. Once it believed that this was the case, it would then
64 * degrade performance of code that only used the %xmm registers.
65 *
66 * One way to tackle this problem would have been to use xgetbv with ecx=1 to
67 * get information about what was actually in use and only save and restore
68 * that. You can imagine that this logic roughly ends up as something like:
69 *
70 *         if (zmm_inuse)
71 *		save_zmm()
72 *         if (ymm_inuse)
73 *		save_ymm()
74 *         save_xmm()
75 *
76 * However, this logic leaves us at the mercy of the branch predictor. This
77 * means that all of our efforts can end up still causing the CPU to execute
78 * things to make it think that some of these other FPU registers are in use and
79 * thus defeat the optimizations that it has.
80 *
81 * To deal with this problem, Intel has suggested using the xsave family of
82 * instructions. The kernel provides information about the size required for the
83 * floating point registers as well as which of several methods we need to
84 * employ through the aux vector. This gets us out of trying to look at the
85 * hardware capabilities and make decisions every time. As part of the
86 * amd64-specific portion of rtld, it will process those values and determine
87 * the functions on an as-needed basis.
88 *
89 * There are two different functions that we export. The first is elf_rtbndr().
90 * This is basically the glue that gets us into the PLT and to perform
91 * relocations. elf_rtbndr() determines the address of the function that we must
92 * call and arranges its stack such that when we return from elf_rtbndr() we
93 * will instead jump to the actual relocated function which will return to the
94 * original caller. Because of this, we must preserve all of the registers that
95 * are used for arguments and restore them before returning.
96 *
97 * The second function we export is elf_plt_trace(). This is used to add support
98 * for audit libraries among other things. elf_plt_trace() may or may not call
99 * the underlying function as a side effect or merely set up its return to it.
100 * This changes how we handle %rax. If we call the function ourself, then we end
101 * up making sure that %rax is the return value versus the initial value. In
102 * addition, because we get %r11 from the surrounding PLT code, we opt to
103 * preserve it in case some of the relocation logic ever ends up calling back
104 * into us again.
105 */
106
107#if	defined(lint)
108
109#include	<sys/types.h>
110#include	<_rtld.h>
111#include	<_audit.h>
112#include	<_elf.h>
113#include	<sys/regset.h>
114#include	<sys/auxv_386.h>
115
116#else
117
118#include	<link.h>
119#include	<_audit.h>
120#include	<sys/asm_linkage.h>
121#include	<sys/auxv_386.h>
122#include	<sys/x86_archext.h>
123
/*
 * XSAVE_HEADER_ZERO(scratch, loc): clear the 64-byte xsave header.
 *
 *	scratch	- 64-bit register; it is zeroed and left zero (the xorq also
 *		  rewrites the flags).
 *	loc	- register holding the base address of the xsave area. The
 *		  header cleared here occupies bytes 0x200-0x23f of that area.
 */
#define	XSAVE_HEADER_ZERO(scratch, loc) \
	xorq	scratch, scratch;		\
	movq	scratch, 0x200 + 0x00(loc);	\
	movq	scratch, 0x200 + 0x08(loc);	\
	movq	scratch, 0x200 + 0x10(loc);	\
	movq	scratch, 0x200 + 0x18(loc);	\
	movq	scratch, 0x200 + 0x20(loc);	\
	movq	scratch, 0x200 + 0x28(loc);	\
	movq	scratch, 0x200 + 0x30(loc);	\
	movq	scratch, 0x200 + 0x38(loc)
138
139
140	.file	"boot_elf.s"
141	.text
142
143/*
144 * This section of the code contains glue functions that are used to take care
145 * of saving and restoring the FPU. We deal with this in a few different ways
146 * based on the hardware support and what exists. Historically we've only saved
147 * and restored the first 8 floating point registers rather than the entire FPU.
148 * That implementation still exists here and is kept around mostly as an
149 * insurance policy.
150 */
/*
 * _elf_rtbndr_fp_save_orig: legacy save of vector registers 0-7.
 *
 *	In:	 %rdi = save buffer. Register N is written at offset N * 64
 *		 with aligned stores, so the buffer has to satisfy the
 *		 alignment of the widest form selected below.
 *	Clobbers: %r11, %rdx, flags.
 *
 * The width is chosen from the capability words reached through
 * org_scapset: AV_386_2_AVX512F selects %zmm, else AV_386_AVX selects %ymm,
 * else the %xmm registers are saved.
 */
	ENTRY(_elf_rtbndr_fp_save_orig)
	movq	org_scapset@GOTPCREL(%rip), %r11
	movq	(%r11), %r11			/* %r11 = Syscapset_t pointer */
	movl	8(%r11), %edx			/* sc_hw_2 */
	testl	$AV_386_2_AVX512F, %edx
	jne	.fpsave_zmm
	movl	(%r11), %edx			/* sc_hw_1 */
	testl	$AV_386_AVX, %edx
	jne	.fpsave_ymm

	/* Neither AVX flavour is advertised: 128-bit registers only. */
	movdqa	%xmm0, 0*64(%rdi)
	movdqa	%xmm1, 1*64(%rdi)
	movdqa	%xmm2, 2*64(%rdi)
	movdqa	%xmm3, 3*64(%rdi)
	movdqa	%xmm4, 4*64(%rdi)
	movdqa	%xmm5, 5*64(%rdi)
	movdqa	%xmm6, 6*64(%rdi)
	movdqa	%xmm7, 7*64(%rdi)
	ret

.fpsave_ymm:
	vmovdqa	%ymm0, 0*64(%rdi)
	vmovdqa	%ymm1, 1*64(%rdi)
	vmovdqa	%ymm2, 2*64(%rdi)
	vmovdqa	%ymm3, 3*64(%rdi)
	vmovdqa	%ymm4, 4*64(%rdi)
	vmovdqa	%ymm5, 5*64(%rdi)
	vmovdqa	%ymm6, 6*64(%rdi)
	vmovdqa	%ymm7, 7*64(%rdi)
	ret

.fpsave_zmm:
	vmovdqa64	%zmm0, 0*64(%rdi)
	vmovdqa64	%zmm1, 1*64(%rdi)
	vmovdqa64	%zmm2, 2*64(%rdi)
	vmovdqa64	%zmm3, 3*64(%rdi)
	vmovdqa64	%zmm4, 4*64(%rdi)
	vmovdqa64	%zmm5, 5*64(%rdi)
	vmovdqa64	%zmm6, 6*64(%rdi)
	vmovdqa64	%zmm7, 7*64(%rdi)
	ret
	SET_SIZE(_elf_rtbndr_fp_save_orig)
194
/*
 * _elf_rtbndr_fp_restore_orig: legacy reload of vector registers 0-7.
 *
 *	In:	 %rdi = buffer laid out with register N at offset N * 64;
 *		 loads are the aligned forms.
 *	Clobbers: %r11, %rdx, flags (plus the vector registers reloaded).
 *
 * The register width is selected with the same capability tests, in the
 * same order, as _elf_rtbndr_fp_save_orig: AVX512F, then AVX, then SSE.
 */
	ENTRY(_elf_rtbndr_fp_restore_orig)
	movq	org_scapset@GOTPCREL(%rip), %r11
	movq	(%r11), %r11			/* %r11 = Syscapset_t pointer */
	movl	8(%r11), %edx			/* sc_hw_2 */
	testl	$AV_386_2_AVX512F, %edx
	jne	.fprestore_zmm
	movl	(%r11), %edx			/* sc_hw_1 */
	testl	$AV_386_AVX, %edx
	jne	.fprestore_ymm

	/* Neither AVX flavour is advertised: 128-bit registers only. */
	movdqa	0*64(%rdi), %xmm0
	movdqa	1*64(%rdi), %xmm1
	movdqa	2*64(%rdi), %xmm2
	movdqa	3*64(%rdi), %xmm3
	movdqa	4*64(%rdi), %xmm4
	movdqa	5*64(%rdi), %xmm5
	movdqa	6*64(%rdi), %xmm6
	movdqa	7*64(%rdi), %xmm7
	ret

.fprestore_ymm:
	vmovdqa	0*64(%rdi), %ymm0
	vmovdqa	1*64(%rdi), %ymm1
	vmovdqa	2*64(%rdi), %ymm2
	vmovdqa	3*64(%rdi), %ymm3
	vmovdqa	4*64(%rdi), %ymm4
	vmovdqa	5*64(%rdi), %ymm5
	vmovdqa	6*64(%rdi), %ymm6
	vmovdqa	7*64(%rdi), %ymm7
	ret

.fprestore_zmm:
	vmovdqa64	0*64(%rdi), %zmm0
	vmovdqa64	1*64(%rdi), %zmm1
	vmovdqa64	2*64(%rdi), %zmm2
	vmovdqa64	3*64(%rdi), %zmm3
	vmovdqa64	4*64(%rdi), %zmm4
	vmovdqa64	5*64(%rdi), %zmm5
	vmovdqa64	6*64(%rdi), %zmm6
	vmovdqa64	7*64(%rdi), %zmm7
	ret
	SET_SIZE(_elf_rtbndr_fp_restore_orig)
239
/*
 * _elf_rtbndr_fp_fxsave: store the FPU/SSE state with the 64-bit (REX.W)
 * form of fxsave.
 *
 *	In:	%rdi = address of the save area handed to fxsave.
 */
	ENTRY(_elf_rtbndr_fp_fxsave)
	fxsaveq	(%rdi)			/* write the fxsave image at %rdi */
	ret
	SET_SIZE(_elf_rtbndr_fp_fxsave)
244
/*
 * _elf_rtbndr_fp_fxrestore: reload the FPU/SSE state from an image written
 * by _elf_rtbndr_fp_fxsave.
 *
 *	In:	%rdi = address of the fxsave image.
 *
 * The image is produced with fxsaveq, i.e. the REX.W layout in which the
 * x87 instruction and data pointers are stored as 64-bit values. It must be
 * reloaded with the matching 64-bit form: a plain fxrstor would interpret
 * those fields as 32-bit offset plus selector and hand back different x87
 * pointer state than was saved.
 */
	ENTRY(_elf_rtbndr_fp_fxrestore)
	fxrstorq	(%rdi)
	ret
	SET_SIZE(_elf_rtbndr_fp_fxrestore)
249
/*
 * _elf_rtbndr_fp_xsave: store the FPU state with xsave.
 *
 *	In:	 %rdi = base of the xsave area.
 *	Clobbers: %rax, %rdx, flags.
 *
 * The xsave header is cleared first, then xsave is issued with the
 * XFEATURE_FP_ALL mask split across %edx:%eax as the instruction expects.
 */
	ENTRY(_elf_rtbndr_fp_xsave)
	XSAVE_HEADER_ZERO(%rax, %rdi)		/* %rax is reloaded just below */
	movq	$_CONST(XFEATURE_FP_ALL), %rdx
	movl	%edx, %eax			/* %eax = mask[31:0] */
	shrq	$32, %rdx			/* %edx = mask[63:32] */
	xsave	(%rdi)				/* save data */
	ret
	SET_SIZE(_elf_rtbndr_fp_xsave)
258
/*
 * _elf_rtbndr_fp_xrestore: reload the FPU state with xrstor.
 *
 *	In:	 %rdi = base of an xsave area.
 *	Clobbers: %rax, %rdx, flags.
 *
 * xrstor is issued with the same XFEATURE_FP_ALL mask, split across
 * %edx:%eax, that _elf_rtbndr_fp_xsave uses.
 */
	ENTRY(_elf_rtbndr_fp_xrestore)
	movq	$_CONST(XFEATURE_FP_ALL), %rdx
	movl	%edx, %eax			/* %eax = mask[31:0] */
	shrq	$32, %rdx			/* %edx = mask[63:32] */
	xrstor	(%rdi)				/* restore data */
	ret
	SET_SIZE(_elf_rtbndr_fp_xrestore)
266
267#endif
268
269#if	defined(lint)
270
/*
 * Lint-only stand-in for the assembly elf_plt_trace entry point; it exists
 * so that lint sees a definition and simply reports 0.
 */
/* ARGSUSED0 */
int
elf_plt_trace()
{
	int	rv = 0;

	return (rv);
}
277
278#else
279
280/*
281 * On entry the 'glue code' has already  done the following:
282 *
283 *	pushq	%rbp
284 *	movq	%rsp, %rbp
285 *	subq	$0x10, %rsp
286 *	leaq	trace_fields(%rip), %r11
287 *	movq	%r11, -0x8(%rbp)
288 *	movq	$elf_plt_trace, %r11
289 *	jmp	*%r11
290 *
291 * so - -8(%rbp) contains the dyndata ptr
292 *
293 *	0x0	Addr		*reflmp
294 *	0x8	Addr		*deflmp
295 *	0x10	Word		symndx
296 *	0x14	Word		sb_flags
297 *	0x18	Sym		symdef.st_name
298 *	0x1c			symdef.st_info
299 *	0x1d			symdef.st_other
300 *	0x1e			symdef.st_shndx
301 *	0x20			symdef.st_value
302 *	0x28			symdef.st_size
303 *
304 * Also note - on entry 16 bytes have already been subtracted
305 * from the %rsp.  The first 8 bytes is for the dyn_data_ptr,
306 * the second 8 bytes are to align the stack and are available
307 * for use.
308 */
/* Byte offsets into the dyndata block laid out in the table above. */
#define	REFLMP_OFF		0	/* 0x00: reflmp */
#define	DEFLMP_OFF		8	/* 0x08: deflmp */
#define	SYMNDX_OFF		16	/* 0x10: symndx (Word) */
#define	SBFLAGS_OFF		20	/* 0x14: sb_flags (Word) */
#define	SYMDEF_OFF		24	/* 0x18: start of symdef */
#define	SYMDEF_VALUE_OFF	32	/* 0x20: symdef.st_value */
315
316/*
317 * Next, we need to create a bunch of local storage. First, we have to preserve
318 * the standard registers per the amd64 ABI. This means we need to deal with:
319 *	%rax	- Used for information about the number of vector arguments
320 *	%rdi	- arg0
321 *	%rsi	- arg1
322 *	%rdx	- arg2
323 *	%rcx	- arg3
324 *	%r8	- arg4
325 *	%r9	- arg5
326 *	%r10	- static chain pointer
327 *	%r11	- PLT Interwork register, our caller is using this, so it's not
328 *		  a temporary for us.
329 *
330 * In addition, we need to save the amd64 ABI floating point arguments. Finally,
331 * we need to deal with our local storage. We need a La_amd64_regs and a
332 * uint64_t for the previous stack size.
333 *
334 * To deal with this and the potentially variable size of the FPU regs, we have
335 * to play a few different games. We refer to all of the standard registers, the
336 * previous stack size, and La_amd64_regs structure off of %rbp. These are all
337 * values that are below %rbp.
338 */
/*
 * %rbp-relative slots used by elf_plt_trace. PLT_SAVE_OFF is the amount
 * subtracted from %rsp to cover them.
 */
#define	SPDYNOFF	-0x08	/* dyndata pointer (stored by the glue code) */
#define	SPDESTOFF	-0x10	/* destination address to transfer to */
#define	SPPRVSTKOFF	-0x18	/* size of the duplicated caller stack */
#define	SPLAREGOFF	-0x58	/* base of the La_amd64_regs structure */
#define	ORIG_RDI	-0x60
#define	ORIG_RSI	-0x68
#define	ORIG_RDX	-0x70
#define	ORIG_RCX	-0x78
#define	ORIG_R8		-0x80
#define	ORIG_R9		-0x88
#define	ORIG_R10	-0x90
#define	ORIG_R11	-0x98
#define	ORIG_RAX	-0xa0
#define	PLT_SAVE_OFF	0xa8
353
/*
 * elf_plt_trace: PLT target used when audit processing (pltenter/pltexit)
 * applies to a symbol.
 *
 * Entry state, as set up by the per-symbol glue code described above:
 *	0(%rbp)		caller's %rbp
 *	8(%rbp)		return address into the original caller
 *	SPDYNOFF(%rbp)	pointer to this symbol's dyndata
 *	%rsp		%rbp - 16
 *
 * Flow:
 *   1. Save the integer argument registers in the ORIG_* slots and the FPU
 *	state, through *_plt_fp_save, in a 64-byte aligned area below them.
 *   2. Unless LA_SYMB_NOPLTENTER is set in sb_flags, fill in La_amd64_regs
 *	and call audit_pltenter(); its return value becomes the destination.
 *	Otherwise the destination is symdef.st_value.
 *   3. If AF_PLTEXIT is clear in audit_flags, or LA_SYMB_NOPLTEXIT is set
 *	for this symbol, restore everything and "ret" into the destination
 *	so that it returns straight to the original caller.
 *   4. Otherwise duplicate the caller's outgoing stack area (bounded by
 *	audit_argcnt words when that is positive), call the destination,
 *	pass its %rax to audit_pltexit(), and return audit_pltexit()'s
 *	result to the original caller in %rax.
 */
	ENTRY(elf_plt_trace)
	/*
	 * Claim the register save area before writing to it. On entry %rsp
	 * is %rbp - 16, so ORIG_R11 and ORIG_RAX lie more than 128 bytes
	 * below %rsp; storing to them before moving %rsp would leave them
	 * outside the ABI red zone, where an asynchronous signal frame may
	 * overwrite them. The frame pointer has already been pushed for us
	 * by the glue code.
	 */
	subq	$PLT_SAVE_OFF, %rsp

	movq	%rdi, ORIG_RDI(%rbp)
	movq	%rsi, ORIG_RSI(%rbp)
	movq	%rdx, ORIG_RDX(%rbp)
	movq	%rcx, ORIG_RCX(%rbp)
	movq	%r8, ORIG_R8(%rbp)
	movq	%r9, ORIG_R9(%rbp)
	movq	%r10, ORIG_R10(%rbp)
	movq	%r11, ORIG_R11(%rbp)
	movq	%rax, ORIG_RAX(%rbp)

	/*
	 * Carve out the FPU save area, 64-byte align it, and save the FPU
	 * state there. %rsp keeps pointing at that area from here on.
	 */
	movq	_plt_save_size@GOTPCREL(%rip),%r9
	movq	_plt_fp_save@GOTPCREL(%rip),%r10
	subq	(%r9), %rsp
	andq	$-64, %rsp
	movq	%rsp, %rdi
	call	*(%r10)

	/*
	 * Now that we've saved all of our registers, figure out what we need
	 * to do next.
	 */
	movq	SPDYNOFF(%rbp), %rax			/* %rax = dyndata */
	testb	$LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax)	/* <link.h> */
	je	.start_pltenter
	movq	SYMDEF_VALUE_OFF(%rax), %rdi
	movq	%rdi, SPDESTOFF(%rbp)		/* save destination address */
	jmp	.end_pltenter

.start_pltenter:
	/*
	 * save all registers into La_amd64_regs
	 */
	leaq	SPLAREGOFF(%rbp), %rsi	/* %rsi = &La_amd64_regs */
	leaq	8(%rbp), %rdi
	movq	%rdi, 0(%rsi)		/* la_rsp */
	movq	0(%rbp), %rdi
	movq	%rdi, 8(%rsi)		/* la_rbp */
	movq	ORIG_RDI(%rbp), %rdi
	movq	%rdi, 16(%rsi)		/* la_rdi */
	movq	ORIG_RSI(%rbp), %rdi
	movq	%rdi, 24(%rsi)		/* la_rsi */
	movq	ORIG_RDX(%rbp), %rdi
	movq	%rdi, 32(%rsi)		/* la_rdx */
	movq	ORIG_RCX(%rbp), %rdi
	movq	%rdi, 40(%rsi)		/* la_rcx */
	movq	ORIG_R8(%rbp), %rdi
	movq	%rdi, 48(%rsi)		/* la_r8 */
	movq	ORIG_R9(%rbp), %rdi
	movq	%rdi, 56(%rsi)		/* la_r9 */

	/*
	 * prepare for call to audit_pltenter()
	 */
	movq	SPDYNOFF(%rbp), %r11		/* %r11 = &dyndata */
	leaq	SBFLAGS_OFF(%r11), %r9		/* arg6 (&sb_flags) */
	leaq	SPLAREGOFF(%rbp), %r8		/* arg5 (&La_amd64_regs) */
	movl	SYMNDX_OFF(%r11), %ecx		/* arg4 (symndx) */
	leaq	SYMDEF_OFF(%r11), %rdx		/* arg3 (&Sym) */
	movq	DEFLMP_OFF(%r11), %rsi		/* arg2 (dlmp) */
	movq	REFLMP_OFF(%r11), %rdi		/* arg1 (rlmp) */
	call	audit_pltenter@PLT
	movq	%rax, SPDESTOFF(%rbp)		/* save calling address */
.end_pltenter:

	/*
	 * If *no* la_pltexit() routines exist
	 * we do not need to keep the stack frame
	 * before we call the actual routine.  Instead we
	 * jump to it and remove our stack from the stack
	 * at the same time.
	 */
	movl	audit_flags(%rip), %eax
	testl	$AF_PLTEXIT, %eax		/* value of audit.h:AF_PLTEXIT */
	je	.bypass_pltexit
	/*
	 * Has the *nopltexit* flag been set for this entry point
	 */
	movq	SPDYNOFF(%rbp), %r11		/* %r11 = &dyndata */
	testb	$LA_SYMB_NOPLTEXIT, SBFLAGS_OFF(%r11)
	je	.start_pltexit

.bypass_pltexit:
	/*
	 * No PLTEXIT processing required.
	 */
	movq	0(%rbp), %r11
	movq	%r11, -8(%rbp)			/* move prev %rbp */
	movq	SPDESTOFF(%rbp), %r11		/* r11 == calling destination */
	movq	%r11, 0(%rbp)			/* store destination at top */

	/* Restore FPU */
	movq	_plt_fp_restore@GOTPCREL(%rip),%r10

	movq	%rsp, %rdi
	call	*(%r10)

	movq	ORIG_RDI(%rbp), %rdi
	movq	ORIG_RSI(%rbp), %rsi
	movq	ORIG_RDX(%rbp), %rdx
	movq	ORIG_RCX(%rbp), %rcx
	movq	ORIG_R8(%rbp), %r8
	movq	ORIG_R9(%rbp), %r9
	movq	ORIG_R10(%rbp), %r10
	movq	ORIG_R11(%rbp), %r11
	movq	ORIG_RAX(%rbp), %rax

	subq	$8, %rbp			/* adjust %rbp for 'ret' */
	movq	%rbp, %rsp
	/*
	 * At this point, after a little doctoring, we should
	 * have the following on the stack:
	 *
	 *	16(%rsp):  ret addr
	 *	8(%rsp):  dest_addr
	 *	0(%rsp):  Previous %rbp
	 *
	 * So - we pop the previous %rbp, and then
	 * ret to our final destination.
	 */
	popq	%rbp
	ret					/* jmp to final destination */
						/* and clean up stack :) */

.start_pltexit:
	/*
	 * In order to call the destination procedure and then return
	 * to audit_pltexit() for post analysis we must first grow
	 * our stack frame and then duplicate the original caller's
	 * stack state.  This duplicates all of the arguments
	 * that were to be passed to the destination procedure.
	 */
	movq	%rbp, %rdi
	addq	$16, %rdi			/* %rdi = src */
	movq	(%rbp), %rdx
	subq	%rdi, %rdx			/* %rdx == prev frame sz */
	/*
	 * If audit_argcnt > 0 then we limit the number of
	 * arguments that will be duplicated to audit_argcnt.
	 *
	 * If (prev_stack_size > (audit_argcnt * 8))
	 *	prev_stack_size = audit_argcnt * 8;
	 *
	 * The size comparison is unsigned: if the caller does not maintain
	 * %rbp as a frame pointer the computed size can come out "negative",
	 * and it must then be clamped rather than used to move %rsp upward.
	 */
	movl	audit_argcnt(%rip),%eax		/* %eax = audit_argcnt */
	testl	%eax, %eax
	jle	.grow_stack			/* audit_argcnt <= 0: no limit */
	leaq	(,%rax,8), %rax			/* %rax = audit_argcnt * 8 */
	cmpq	%rax,%rdx
	jbe	.grow_stack
	movq	%rax, %rdx
	/*
	 * Grow the stack and duplicate the arguments of the
	 * original caller.
	 */
.grow_stack:
	movq	%rsp, %r11			/* %r11 = FPU save area */
	subq	%rdx, %rsp			/* grow the stack */
	movq	%rdx, SPPRVSTKOFF(%rbp)		/* remember prev frame sz */
	movq	%rsp, %rcx			/* %rcx = dest */
	addq	%rcx, %rdx			/* %rdx == tail of dest */
.while_base:
	cmpq	%rdx, %rcx			/* while (dest < tail) { */
	jae	.end_while
	movq	(%rdi), %rsi
	movq	%rsi,(%rcx)			/*	*dest = *src */
	addq	$8, %rdi			/*	src++ */
	addq	$8, %rcx			/*	dest++ */
	jmp	.while_base			/* } */

	/*
	 * The above stack is now a duplicate of the copied portion of
	 * the stack of the original calling procedure.
	 */
.end_while:
	/*
	 * Restore registers using %r11 which contains our old %rsp value
	 * before growing the stack.
	 */
	movq	_plt_fp_restore@GOTPCREL(%rip),%r10
	movq	%r11, %rdi
	call	*(%r10)

.trace_r2_finish:
	movq	ORIG_RDI(%rbp), %rdi
	movq	ORIG_RSI(%rbp), %rsi
	movq	ORIG_RDX(%rbp), %rdx
	movq	ORIG_RCX(%rbp), %rcx
	movq	ORIG_R8(%rbp), %r8
	movq	ORIG_R9(%rbp), %r9
	movq	ORIG_R10(%rbp), %r10
	movq	ORIG_RAX(%rbp), %rax
	movq	ORIG_R11(%rbp), %r11

	/*
	 * Call to destination function - we'll return here
	 * for pltexit monitoring.
	 */
	call	*SPDESTOFF(%rbp)

	addq	SPPRVSTKOFF(%rbp), %rsp		/* cleanup dupped stack */

	/*
	 * prepare for call to audit_pltexit()
	 *
	 * symndx is a 32-bit Word immediately followed by sb_flags, so it
	 * is loaded with a 32-bit move, as on the pltenter path.
	 */
	movq	SPDYNOFF(%rbp), %r11		/* %r11 = &dyndata */
	movl	SYMNDX_OFF(%r11), %r8d		/* arg5 (symndx) */
	leaq	SYMDEF_OFF(%r11), %rcx		/* arg4 (&Sym) */
	movq	DEFLMP_OFF(%r11), %rdx		/* arg3 (dlmp) */
	movq	REFLMP_OFF(%r11), %rsi		/* arg2 (rlmp) */
	movq	%rax, %rdi			/* arg1 (returnval) */
	call	audit_pltexit@PLT

	/*
	 * Clean up after ourselves and return to the
	 * original calling procedure. Make sure to restore
	 * registers. The SPPRVSTKOFF slot is reused to carry
	 * audit_pltexit()'s return value across the FPU restore.
	 */

	movq	_plt_fp_restore@GOTPCREL(%rip),%r10
	movq	%rsp, %rdi
	movq	%rax, SPPRVSTKOFF(%rbp)
	call	*(%r10)

	movq	ORIG_RDI(%rbp), %rdi
	movq	ORIG_RSI(%rbp), %rsi
	movq	ORIG_RDX(%rbp), %rdx
	movq	ORIG_RCX(%rbp), %rcx
	movq	ORIG_R8(%rbp), %r8
	movq	ORIG_R9(%rbp), %r9
	movq	ORIG_R10(%rbp), %r10
	movq	ORIG_R11(%rbp), %r11
	movq	SPPRVSTKOFF(%rbp), %rax

	movq	%rbp, %rsp
	popq	%rbp
	ret					/* return to caller */
	SET_SIZE(elf_plt_trace)
599#endif
600
601/*
 * We got here because a call to a function resolved to a procedure
 * linkage table entry.  That entry pushed its relocation index and jumped
 * to the first PLT entry (.PLT0), which pushed GOT[1] (the lmp) and in
 * turn jumped to elf_rtbndr through GOT[2].
605 *
606 * the code sequence that got us here was:
607 *
608 * .PLT0:
609 *	pushq	GOT+8(%rip)	#GOT[1]
610 *	jmp	*GOT+16(%rip)	#GOT[2]
611 *	nop
612 *	nop
613 *	nop
614 *	nop
615 *	...
616 * PLT entry for foo:
617 *	jmp	*name1@GOTPCREL(%rip)
 *	pushq	$rel.plt.foo
619 *	jmp	PLT0
620 *
621 * At entry, the stack looks like this:
622 *
623 *	return address			16(%rsp)
624 *	$rel.plt.foo	(plt index)	8(%rsp)
625 *	lmp				0(%rsp)
626 *
627 */
628#if defined(lint)
629
630extern unsigned long	elf_bndr(Rt_map *, unsigned long, caddr_t);
631
632void
633elf_rtbndr(Rt_map * lmp, unsigned long reloc, caddr_t pc)
634{
635	(void) elf_bndr(lmp, reloc, pc);
636}
637
638#else
639
640/*
641 * The PLT code that landed us here placed 2 arguments on the stack as
642 * arguments to elf_rtbndr.
643 * Additionally the pc of caller is below these 2 args.
644 * Our stack will look like this after we establish a stack frame with
645 * push %rbp; movq %rsp, %rbp sequence:
646 *
647 *	8(%rbp)			arg1 - *lmp
648 *	16(%rbp), %rsi		arg2 - reloc index
649 *	24(%rbp), %rdx		arg3 - pc of caller
650 */
/* %rbp-relative locations of the PLT-supplied values once the frame exists. */
#define	LBPLMPOFF	0x08	/* arg1 - *lmp */
#define	LBPRELOCOFF	0x10	/* arg2 - reloc index */
#define	LBRPCOFF	0x18	/* arg3 - pc of caller */
654
655/*
656 * With the above in place, we must now proceed to preserve all temporary
657 * registers that are also used for passing arguments. Specifically this
658 * means:
659 *
660 *	%rax	- Used for information about the number of vector arguments
661 *	%rdi	- arg0
662 *	%rsi	- arg1
663 *	%rdx	- arg2
664 *	%rcx	- arg3
665 *	%r8	- arg4
666 *	%r9	- arg5
667 *	%r10	- static chain pointer
668 *
669 * While we don't have to preserve %r11, we do have to preserve the FPU
670 * registers. The FPU logic is delegated to a specific function that we'll call.
671 * However, it requires that its stack is 64-byte aligned. We defer the
672 * alignment to that point. This will also take care of the fact that a caller
673 * may not call us with a correctly aligned stack pointer per the amd64 ABI.
674 */
675
676	.extern _plt_save_size
677	.extern _plt_fp_save
678	.extern plt_fp_restore
679
680	.weak	_elf_rtbndr
681	_elf_rtbndr = elf_rtbndr
682
/*
 * elf_rtbndr: lazy-binding entry point reached from .PLT0.
 *
 * Entry stack (before the frame is established):
 *	0(%rsp)	 lmp
 *	8(%rsp)	 relocation index
 *	16(%rsp) return address into the original caller
 *
 * The argument registers, %rax, %r10 and the FPU state are preserved
 * around the call to elf_bndr(). The address elf_bndr() returns is written
 * over the relocation-index slot so that the final "ret" transfers to the
 * resolved function with the caller's return address on top of the stack.
 * %r12 is saved and used to remember %rsp across the FPU area alignment.
 */
	ENTRY(elf_rtbndr)
	pushq	%rbp			/* build the frame; args now at */
	movq	%rsp, %rbp		/* LBP*OFF(%rbp) */

	/*
	 * Integer registers that must look untouched to the resolved
	 * function, followed by %r12 which we borrow below.
	 */
	pushq	%rax
	pushq	%rdi
	pushq	%rsi
	pushq	%rdx
	pushq	%rcx
	pushq	%r8
	pushq	%r9
	pushq	%r10
	pushq	%r12

	/*
	 * Remember where the pushes ended, then reserve *_plt_save_size
	 * bytes, round %rsp down to a 64-byte boundary and let the selected
	 * save routine fill the area.
	 */
	movq	%rsp, %r12
	movq	_plt_fp_save@GOTPCREL(%rip), %r10
	movq	_plt_save_size@GOTPCREL(%rip), %r9
	subq	(%r9), %rsp
	andq	$-64, %rsp

	movq	%rsp, %rdi		/* %rdi = FPU save area */
	call	*(%r10)

	/*
	 * Resolve the symbol. The PLT-supplied values are fetched relative
	 * to %rbp.
	 */
	movq	LBRPCOFF(%rbp), %rdx	/* arg3 - pc of caller */
	movq	LBPRELOCOFF(%rbp), %rsi	/* arg2 - reloc index */
	movq	LBPLMPOFF(%rbp), %rdi	/* arg1 - *lmp */
	call	elf_bndr@PLT		/* elf_bndr(lmp, relndx, pc) */
	movq	%rax, LBPRELOCOFF(%rbp)	/* final destination replaces relndx */

	/* Put the FPU state back; %rsp still addresses the save area. */
	movq	%rsp, %rdi
	movq	_plt_fp_restore@GOTPCREL(%rip), %r10
	call	*(%r10)

	/* Drop the FPU area and unwind the pushes in reverse order. */
	movq	%r12, %rsp
	popq	%r12
	popq	%r10
	popq	%r9
	popq	%r8
	popq	%rcx
	popq	%rdx
	popq	%rsi
	popq	%rdi
	popq	%rax

	leave				/* %rsp = %rbp, then pop %rbp */

	addq	$8, %rsp		/* discard lmp; the next slot now */
					/* holds the resolved address */

	ret				/* invoke resolved function */

	SET_SIZE(elf_rtbndr)
751#endif
752