/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/time_impl.h>
#include <sys/tsc.h>
#include <cp_offsets.h>

#define	GETCPU_GDT_OFFSET	SEL_GDT(GDT_CPUID, SEL_UPL)

	.file	"cp_subr.s"

/*
 * These are cloned from TSC and time related code in the kernel. They should
 * be kept in sync in the case that the source values are changed.
 * See: uts/i86pc/os/timestamp.c
 */
#define	NSEC_SHIFT	5
#define	ADJ_SHIFT	4
#define	NANOSEC		0x3b9aca00

/*
 * For __cp_tsc_read calls which incur looping retries due to CPU migration,
 * this represents the maximum number of tries before bailing out.
 */
#define	TSC_READ_MAXLOOP	0x4

/*
 * hrtime_t
 * __cp_tsc_read(comm_page_t *cp)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_tsc_read)
	movl	CP_TSC_TYPE(%rdi), %esi
	movl	CP_TSC_NCPU(%rdi), %r8d

	cmpl	$TSC_TSCP, %esi
	jne	2f
	rdtscp
	/*
	 * When the TSC is read, the low 32 bits are placed in %eax while the
	 * high 32 bits are placed in %edx. They are shifted and ORed together
	 * to obtain the full 64-bit value.
	 */
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * A zeroed cp_tsc_ncpu (currently held in r8d) indicates that no
	 * per-CPU TSC offsets are required.
	 */
	testl	%r8d, %r8d
	jnz	1f
	ret

1:
	/*
	 * A non-zero cp_tsc_ncpu indicates the array length of
	 * cp_tsc_sync_tick_delta containing per-CPU offsets which are applied
	 * to TSC readings. The CPU ID furnished by the IA32_TSC_AUX register
	 * via rdtscp (placed in rcx) is used to look up an offset value in
	 * that array and apply it to the TSC value.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	movq	(%r9, %rcx, 8), %rdx
	addq	%rdx, %rax
	ret

2:
	/*
	 * TSC reading without RDTSCP
	 *
	 * Check if handling for per-CPU TSC offsets is required. If not,
	 * immediately skip to the appropriate steps to perform a rdtsc.
	 *
	 * If per-CPU offsets are present, the TSC reading process is more
	 * complicated. Without rdtscp, there is no way to simultaneously read
	 * the TSC and query the current CPU. In order to "catch" migrations
	 * during execution, the CPU ID is queried before and after rdtsc. The
	 * execution is repeated if the results differ, subject to a loop
	 * limit.
	 */
	xorq	%r9, %r9
	testl	%r8d, %r8d
	jz	3f

	/*
	 * Load the address of the per-CPU offset array, since it is needed.
	 * The attempted loop count is kept in r8.
	 */
	leaq	CP_TSC_SYNC_TICK_DELTA(%rdi), %r9
	xorl	%r8d, %r8d

	/* Query the CPU ID and stash it in r10 for later comparison */
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx
	movl	%edx, %r10d

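	/*
	 * When per-CPU offsets are in play, the retry scheme below roughly
	 * amounts to this C sketch (helper names are hypothetical, for
	 * illustration only):
	 *
	 *	uint_t prev = getcpu(), cur, tries = 0;
	 *	for (;;) {
	 *		tsc = fenced_rdtsc();
	 *		cur = getcpu();
	 *		if (cur == prev)
	 *			return (tsc + tick_delta[cur]);
	 *		if (tries >= TSC_READ_MAXLOOP)
	 *			return (0);
	 *		tries++;
	 *		prev = cur;
	 *	}
	 */
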
3:
	cmpl	$TSC_RDTSC_MFENCE, %esi
	jne	4f
	mfence
	rdtsc
	jmp	7f

4:
	cmpl	$TSC_RDTSC_LFENCE, %esi
	jne	5f
	lfence
	rdtsc
	jmp	7f

5:
	cmpl	$TSC_RDTSC_CPUID, %esi
	jne	6f
	/*
	 * Since the amd64 ABI dictates that %rbx is callee-saved, it must be
	 * preserved here. Its contents will be overwritten when cpuid is used
	 * as a serializing instruction.
	 */
	movq	%rbx, %r11
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r11, %rbx
	jmp	7f

6:
	/*
	 * Other protections should have prevented this function from being
	 * called in the first place. Since callers must handle a failure from
	 * CPU migration looping, yield the same result as a bail-out: 0
	 */
	xorl	%eax, %eax
	ret

7:
	shlq	$0x20, %rdx
	orq	%rdx, %rax

	/*
	 * With the TSC reading in-hand, check if any per-CPU offset handling
	 * is required. The address of the array of deltas (r9) will not have
	 * been populated if offset handling is unnecessary.
	 */
	testq	%r9, %r9
	jnz	8f
	ret

8:
	movl	$GETCPU_GDT_OFFSET, %edx
	lsl	%dx, %edx
	cmpl	%edx, %r10d
	jne	9f
	movq	(%r9, %rdx, 8), %rdx
	addq	%rdx, %rax
	ret

9:
	/*
	 * It appears that a migration has occurred between the first CPU ID
	 * query and now. Check if the loop limit has been reached and retry
	 * if that is not the case.
	 */
	cmpl	$TSC_READ_MAXLOOP, %r8d
	jge	10f
	incl	%r8d
	movl	%edx, %r10d
	jmp	3b

10:
	/* Loop limit was reached. Return bail-out value of 0. */
	xorl	%eax, %eax
	ret

	SET_SIZE(__cp_tsc_read)


/*
 * uint_t
 * __cp_getcpu(comm_page_t *)
 *
 * Stack usage: 0 bytes
 */
	ENTRY_NP(__cp_getcpu)
	movl	CP_TSC_TYPE(%rdi), %edi
	/*
	 * If RDTSCP is available, it is a quick way to grab the cpu_id which
	 * is stored in the TSC_AUX MSR by the kernel.
	 */
	cmpl	$TSC_TSCP, %edi
	jne	1f
	rdtscp
	movl	%ecx, %eax
	ret
1:
	mov	$GETCPU_GDT_OFFSET, %eax
	lsl	%ax, %eax
	ret
	SET_SIZE(__cp_getcpu)

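/*
 * The non-RDTSCP path above depends on the kernel encoding each CPU's ID in
 * the segment limit of the GDT entry selected by GETCPU_GDT_OFFSET, which
 * unprivileged code can query with lsl. As a minimal C sketch of that lookup
 * (function name hypothetical, inline assembly for illustration only):
 *
 *	uint_t
 *	getcpu_lsl(void)
 *	{
 *		uint_t id = GETCPU_GDT_OFFSET;
 *		__asm__ volatile ("lsl %0, %0" : "+r" (id));
 *		return (id);
 *	}
 */
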
/*
 * hrtime_t
 * __cp_gethrtime(comm_page_t *cp)
 *
 * Stack usage: 0x20 local + 0x8 call = 0x28 bytes
 *
 * %rsp+0x00 - hrtime_t tsc_last
 * %rsp+0x08 - hrtime_t hrtime_base
 * %rsp+0x10 - commpage_t *cp
 * %rsp+0x18 - int hres_lock
 */
	ENTRY_NP(__cp_gethrtime)
	subq	$0x20, %rsp
	movq	%rdi, 0x10(%rsp)
1:
	movl	CP_HRES_LOCK(%rdi), %r9d
	movl	%r9d, 0x18(%rsp)

	movq	CP_TSC_LAST(%rdi), %rax
	movq	CP_TSC_HRTIME_BASE(%rdi), %rdx
	movq	%rax, (%rsp)
	movq	%rdx, 0x8(%rsp)

	call	__cp_tsc_read

	/*
	 * Failure is inferred from a TSC reading of 0. The normal fasttrap
	 * mechanism can be used as a fallback in such cases.
	 */
	testq	%rax, %rax
	jz	6f

	movq	0x10(%rsp), %rdi
	movl	0x18(%rsp), %r9d
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %r9d
	cmpl	%r9d, %edx
	jne	1b

	/*
	 * The in-kernel logic for calculating hrtime performs several checks
	 * to protect against edge cases. That logic is summarized as:
	 *
	 * if (tsc >= tsc_last) {
	 *	delta = tsc - tsc_last;
	 * } else if (tsc >= tsc_last - 2*tsc_max_delta) {
	 *	delta = 0;
	 * } else {
	 *	delta = MIN(tsc, tsc_resume_cap);
	 * }
	 *
	 * The below implementation achieves the same result, although it is
	 * structured for speed and optimized for the fast path:
	 *
	 * delta = tsc - tsc_last;
	 * if (delta < 0) {
	 *	delta += (tsc_max_delta << 1);
	 *	if (delta >= 0) {
	 *		delta = 0;
	 *	} else {
	 *		delta = MIN(tsc, tsc_resume_cap);
	 *	}
	 * }
	 */
	movq	(%rsp), %rdx
	subq	%rdx, %rax		/* delta = tsc - tsc_last */
	jb	3f			/* if (delta < 0) */

2:
	/*
	 * Optimized TSC_CONVERT_AND_ADD:
	 * hrtime_base += (tsc_delta * nsec_scale) >> (32 - NSEC_SHIFT)
	 *
	 * Since the multiply and shift are done in 128-bit, there is no need
	 * to worry about overflow.
	 */
	movl	CP_NSEC_SCALE(%rdi), %ecx
	mulq	%rcx
	shrdq	$_CONST(32 - NSEC_SHIFT), %rdx, %rax
	movq	0x8(%rsp), %r8
	addq	%r8, %rax

	addq	$0x20, %rsp
	ret

3:
	movq	%rax, %r9	/* save (tsc - tsc_last) in r9 */
	movl	CP_TSC_MAX_DELTA(%rdi), %ecx
	sall	$1, %ecx
	addq	%rcx, %rax	/* delta += (tsc_max_delta << 1) */
	jae	4f		/* delta < 0 */
	xorq	%rax, %rax
	jmp	2b

4:
	/*
	 * Repopulate %rax with the TSC reading by adding tsc_last to %r9
	 * (which holds tsc - tsc_last)
	 */
	movq	(%rsp), %rax
	addq	%r9, %rax

	/* delta = MIN(tsc, resume_cap) */
	movq	CP_TSC_RESUME_CAP(%rdi), %rcx
	cmpq	%rcx, %rax
	jbe	5f
	movq	%rcx, %rax
5:
	jmp	2b

6:
	movl	$T_GETHRTIME, %eax
	int	$T_FASTTRAP
	addq	$0x20, %rsp
	ret

	SET_SIZE(__cp_gethrtime)

/*
 * int
 * __cp_clock_gettime_monotonic(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x8 local + 0x8 call + 0x28 called func. = 0x38 bytes
 *
 * %rsp+0x00 - timespec_t *tsp
 */
	ENTRY_NP(__cp_clock_gettime_monotonic)
	subq	$0x8, %rsp
	movq	%rsi, (%rsp)

	call	__cp_gethrtime

	/*
	 * Convert from hrtime_t (int64_t in nanoseconds) to timespec_t.
	 * This uses the same approach as hrt2ts, although it has been updated
	 * to utilize 64-bit math.
	 * 1 / 1,000,000,000 =
	 *	1000100101110000010111110100000100110110101101001010110110011B-26
	 * = 0x112e0be826d694b3 * 2^-26
	 *
	 * secs = (nsecs * 0x112e0be826d694b3) >> 26
	 *
	 * In order to account for the two's complement representation of
	 * negative inputs, a final operation completes the process:
	 *
	 * secs -= (nsecs >> 63)
	 */
	movq	%rax, %r11
	movq	$0x112e0be826d694b3, %rdx
	imulq	%rdx
	sarq	$0x1a, %rdx
	movq	%r11, %rax
	sarq	$0x3f, %rax
	subq	%rax, %rdx
	movq	(%rsp), %rsi
	movq	%rdx, (%rsi)
	/*
	 * Populating tv_nsec is easier:
	 * tv_nsec = nsecs - (secs * NANOSEC)
	 */
	imulq	$NANOSEC, %rdx, %rdx
	subq	%rdx, %r11
	movq	%r11, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x8, %rsp
	ret
	SET_SIZE(__cp_clock_gettime_monotonic)

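/*
 * To illustrate the conversion above: imulq leaves the high 64 bits of the
 * 128-bit product in %rdx, so the subsequent sarq by 26 amounts to an
 * effective shift of 64 + 26 = 90 bits. A C rendering with a hypothetical
 * input of 1.5 seconds (assuming __int128 support):
 *
 *	int64_t nsecs = 1500000000;
 *	int64_t secs =
 *	    (int64_t)(((__int128)nsecs * 0x112e0be826d694b3LL) >> 90);
 *	secs -= (nsecs >> 63);
 *	int64_t nsec = nsecs - secs * 1000000000LL;
 *
 * This yields secs == 1 and nsec == 500000000.
 */
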
/*
 * int
 * __cp_clock_gettime_realtime(comm_page_t *cp, timespec_t *tsp)
 *
 * Stack usage: 0x18 local + 0x8 call + 0x28 called func. = 0x48 bytes
 *
 * %rsp+0x00 - commpage_t *cp
 * %rsp+0x08 - timespec_t *tsp
 * %rsp+0x10 - int hres_lock
 */
	ENTRY_NP(__cp_clock_gettime_realtime)
	subq	$0x18, %rsp
	movq	%rdi, (%rsp)
	movq	%rsi, 0x8(%rsp)

1:
	movl	CP_HRES_LOCK(%rdi), %eax
	movl	%eax, 0x10(%rsp)

	call	__cp_gethrtime
	movq	(%rsp), %rdi
	movq	CP_HRES_LAST_TICK(%rdi), %rdx
	subq	%rdx, %rax		/* nslt = hrtime - last_tick */
	jb	1b
	movq	CP_HRESTIME(%rdi), %r9
	movq	_CONST(CP_HRESTIME + CP_HRESTIME_INCR)(%rdi), %r10
	movl	CP_HRESTIME_ADJ(%rdi), %r11d

	addq	%rax, %r10		/* now.tv_nsec += nslt */

	cmpl	$0, %r11d
	jg	4f			/* hres_adj > 0 */
	jl	6f			/* hres_adj < 0 */

2:
	cmpq	$NANOSEC, %r10
	jae	8f			/* tv_nsec >= NANOSEC */

3:
	movl	0x10(%rsp), %eax
	movl	CP_HRES_LOCK(%rdi), %edx
	andl	$0xfffffffe, %edx
	cmpl	%eax, %edx
	jne	1b

	movq	0x8(%rsp), %rsi
	movq	%r9, (%rsi)
	movq	%r10, 0x8(%rsi)

	xorl	%eax, %eax
	addq	$0x18, %rsp
	ret


4:	/* hres_adj > 0 */
	sarq	$ADJ_SHIFT, %rax
	cmpl	%r11d, %eax
	jbe	5f
	movl	%r11d, %eax
5:
	addq	%rax, %r10
	jmp	2b

6:	/* hres_adj < 0 */
	sarq	$ADJ_SHIFT, %rax
	negl	%r11d
	cmpl	%r11d, %eax
	jbe	7f
	movl	%r11d, %eax
7:
	subq	%rax, %r10
	jmp	2b

8:	/* tv_nsec >= NANOSEC */
	subq	$NANOSEC, %r10
	incq	%r9
	cmpq	$NANOSEC, %r10
	jae	8b
	jmp	3b

	SET_SIZE(__cp_clock_gettime_realtime)

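/*
 * A note on the hres_lock checks used above: they mirror the kernel's
 * seqlock-style protocol, in which the low bit of the lock word is set while
 * an update of the time values is in progress. Roughly, in C (the field name
 * is assumed from the CP_HRES_LOCK offset, and load_time_values() is a
 * stand-in for the loads performed above):
 *
 *	uint32_t lock;
 *	do {
 *		lock = cp->cp_hres_lock;
 *		load_time_values(cp);
 *	} while ((cp->cp_hres_lock & ~1U) != lock);
 */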