diff --git a/pal/src/host/linux-sgx/pal_exception.c b/pal/src/host/linux-sgx/pal_exception.c index 6fe7ff6248..a92cb29712 100644 --- a/pal/src/host/linux-sgx/pal_exception.c +++ b/pal/src/host/linux-sgx/pal_exception.c @@ -110,11 +110,12 @@ static void save_pal_context(PAL_CONTEXT* ctx, sgx_cpu_context_t* uc, } } +#include "utils/fast_clock.h" + static void emulate_rdtsc_and_print_warning(sgx_cpu_context_t* uc) { if (FIRST_TIME()) { - /* if we end up emulating RDTSC/RDTSCP instruction, we cannot use invariant TSC */ - extern uint64_t g_tsc_hz; - g_tsc_hz = 0; + /* if we end up emulating RDTSC/RDTSCP instruction, we cannot use TSC-based clock emulation */ + fast_clock__disable(&g_fast_clock); log_warning("all RDTSC/RDTSCP instructions are emulated (imprecisely) via gettime() " "syscall."); } diff --git a/pal/src/host/linux-sgx/pal_linux.h b/pal/src/host/linux-sgx/pal_linux.h index 701f69a438..9a3d849d25 100644 --- a/pal/src/host/linux-sgx/pal_linux.h +++ b/pal/src/host/linux-sgx/pal_linux.h @@ -100,8 +100,6 @@ void _PalExceptionHandler(uint32_t trusted_exit_info_, uint32_t untrusted_external_event, sgx_cpu_context_t* uc, PAL_XREGS_STATE* xregs_state, sgx_arch_exinfo_t* exinfo); -void init_tsc(void); - int init_cpuid(void); int init_enclave(void); diff --git a/pal/src/host/linux-sgx/pal_main.c b/pal/src/host/linux-sgx/pal_main.c index 3d9cbd867d..a6f7834c32 100644 --- a/pal/src/host/linux-sgx/pal_main.c +++ b/pal/src/host/linux-sgx/pal_main.c @@ -407,7 +407,6 @@ static int import_and_init_extra_runtime_domain_names(struct pal_dns_host_conf* extern void* g_enclave_base; extern void* g_enclave_top; extern bool g_allowed_files_warn; -extern uint64_t g_tsc_hz; extern size_t g_unused_tcs_pages_num; static int print_warnings_on_insecure_configs(PAL_HANDLE parent_process) { @@ -552,14 +551,6 @@ static int print_warnings_on_insecure_configs(PAL_HANDLE parent_process) { return ret; } -static void print_warning_on_invariant_tsc(PAL_HANDLE parent_process) { - if (!parent_process && !g_tsc_hz) { - /* Warn only in the first process. */ - log_warning("Could not set up Invariant TSC (CPU is too old or you run on a VM that does " - "not expose corresponding CPUID leaves). This degrades performance."); - } -} - static void print_warnings_on_invalid_dns_host_conf(PAL_HANDLE parent_process) { if (!parent_process) { /* Warn only in the first process. */ @@ -581,8 +572,6 @@ static void post_callback(void) { ocall_exit(1, /*is_exitgroup=*/true); } - print_warning_on_invariant_tsc(g_pal_common_state.parent_process); - print_warnings_on_invalid_dns_host_conf(g_pal_common_state.parent_process); } @@ -721,12 +710,11 @@ noreturn void pal_linux_main(void* uptr_libpal_uri, size_t libpal_uri_len, void* SET_ENCLAVE_TCB(ready_for_exceptions, 1UL); - /* initialize "Invariant TSC" HW feature for fast and accurate gettime and immediately probe - * RDTSC instruction inside SGX enclave (via dummy get_tsc) -- it is possible that - * the CPU supports invariant TSC but doesn't support executing RDTSC inside SGX enclave, in - * this case the SIGILL exception is generated and leads to emulate_rdtsc_and_print_warning() - * which unsets invariant TSC, and we end up falling back to the slower ocall_gettime() */ - init_tsc(); + /* We implement a "fast-path" clock that is emulated internally using x86 RDTSC instruction. + * It is possible that the CPU does not support the RDTSC instruction within SGX enclave, + * in this case the SIGILL exception is generated and leads to emulate_rdtsc_and_print_warning() + * which disables the TSC based clock, and we end up falling back to the slower ocall_gettime() + */ (void)get_tsc(); /* must be after `ready_for_exceptions=1` since it may generate SIGILL */ ret = init_cpuid(); diff --git a/pal/src/host/linux-sgx/pal_misc.c b/pal/src/host/linux-sgx/pal_misc.c index d86a1b21e8..d28aef4505 100644 --- a/pal/src/host/linux-sgx/pal_misc.c +++ b/pal/src/host/linux-sgx/pal_misc.c @@ -22,208 +22,10 @@ #include "spinlock.h" #include "toml_utils.h" #include "topo_info.h" - -/* The timeout of 50ms was found to be a safe TSC drift correction periodicity based on results - * from multiple systems. Any higher or lower could pose risks of negative time drift or - * performance hit respectively. - */ -#define TSC_REFINE_INIT_TIMEOUT_USECS 50000 - -uint64_t g_tsc_hz = 0; /* TSC frequency for fast and accurate time ("invariant TSC" HW feature) */ -static uint64_t g_start_tsc = 0; -static uint64_t g_start_usec = 0; -static seqlock_t g_tsc_lock = INIT_SEQLOCK_UNLOCKED; - -static bool is_tsc_usable(void) { - uint32_t words[CPUID_WORD_NUM]; - _PalCpuIdRetrieve(INVARIANT_TSC_LEAF, 0, words); - return words[CPUID_WORD_EDX] & (1 << 8); -} - -/* return TSC frequency or 0 if invariant TSC is not supported */ -static uint64_t get_tsc_hz_baremetal(void) { - uint32_t words[CPUID_WORD_NUM]; - - /* - * Based on "Time Stamp Counter and Nominal Core Crystal Clock Information" leaf, calculate TSC - * frequency as ECX * EBX / EAX, where - * - EAX is denominator of the TSC/"core crystal clock" ratio, - * - EBX is numerator of the TSC/"core crystal clock" ratio, - * - ECX is core crystal clock (nominal) frequency in Hz. - */ - _PalCpuIdRetrieve(TSC_FREQ_LEAF, 0, words); - if (!words[CPUID_WORD_EAX] || !words[CPUID_WORD_EBX]) { - /* TSC/core crystal clock ratio is not enumerated, can't use RDTSC for accurate time */ - return 0; - } - - if (words[CPUID_WORD_ECX] > 0) { - /* cast to 64-bit first to prevent integer overflow */ - return (uint64_t)words[CPUID_WORD_ECX] * words[CPUID_WORD_EBX] / words[CPUID_WORD_EAX]; - } - - /* some Intel CPUs do not report nominal frequency of crystal clock, let's calculate it - * based on Processor Frequency Information Leaf (CPUID 16H); this leaf always exists if - * TSC Frequency Leaf exists; logic is taken from Linux 5.11's arch/x86/kernel/tsc.c */ - _PalCpuIdRetrieve(PROC_FREQ_LEAF, 0, words); - if (!words[CPUID_WORD_EAX]) { - /* processor base frequency (in MHz) is not enumerated, can't calculate frequency */ - return 0; - } - - /* processor base frequency is in MHz but we need to return TSC frequency in Hz; cast to 64-bit - * first to prevent integer overflow */ - return (uint64_t)words[CPUID_WORD_EAX] * 1000000; -} - -/* return TSC frequency or 0 if invariant TSC is not supported */ -static uint64_t get_tsc_hz_hypervisor(void) { - uint32_t words[CPUID_WORD_NUM]; - - /* - * We rely on the Generic CPUID space for hypervisors: - * - 0x40000000: EAX: The maximum input value for CPUID supported by the hypervisor - * - EBX, ECX, EDX: Hypervisor vendor ID signature (hypervisor_id) - * - * If we detect QEMU/KVM or Cloud Hypervisor/KVM (hypervisor_id = "KVMKVMKVM") or VMWare - * ("VMwareVMware"), then we assume that leaf 0x40000010 contains virtual TSC frequency in kHz - * in EAX. We check hypervisor_id because leaf 0x40000010 is not standardized and e.g. Microsoft - * Hyper-V may use it for other purposes. - * - * Relevant materials: - * - https://github.com/qemu/qemu/commit/9954a1582e18b03ddb66f6c892dccf2c3508f4b2 - * - qemu/target/i386/cpu.h, qemu/target/i386/cpu.c, qemu/target/i386/kvm/kvm.c sources - * - https://github.com/freebsd/freebsd-src/blob/9df6eea/sys/x86/x86/identcpu.c#L1372-L1377 (for - * the list of hypervisor_id values) - */ - _PalCpuIdRetrieve(HYPERVISOR_INFO_LEAF, 0, words); - - bool is_kvm = words[CPUID_WORD_EBX] == 0x4b4d564b - && words[CPUID_WORD_ECX] == 0x564b4d56 - && words[CPUID_WORD_EDX] == 0x0000004d; - bool is_vmware = words[CPUID_WORD_EBX] == 0x61774d56 - && words[CPUID_WORD_ECX] == 0x4d566572 - && words[CPUID_WORD_EDX] == 0x65726177; - - if (!is_kvm && !is_vmware) { - /* not a hypervisor that contains "virtual TSC frequency" in leaf 0x40000010 */ - return 0; - } - - if (words[CPUID_WORD_EAX] < HYPERVISOR_VMWARE_TIME_LEAF) { - /* virtual TSC frequency is not available */ - return 0; - } - - _PalCpuIdRetrieve(HYPERVISOR_VMWARE_TIME_LEAF, 0, words); - if (!words[CPUID_WORD_EAX]) { - /* TSC frequency (in kHz) is not enumerated, can't calculate frequency */ - return 0; - } - - /* TSC frequency is in kHz but we need to return TSC frequency in Hz; cast to 64-bit first to - * prevent integer overflow */ - return (uint64_t)words[CPUID_WORD_EAX] * 1000; -} - -/* initialize the data structures used for date/time emulation using TSC */ -void init_tsc(void) { - if (!is_tsc_usable()) - return; - - g_tsc_hz = get_tsc_hz_baremetal(); - if (g_tsc_hz) - return; - - /* hypervisors may not expose crystal-clock frequency CPUID leaves, so instead try - * hypervisor-special synthetic CPUID leaf 0x40000010 (VMWare-style Timing Information) */ - g_tsc_hz = get_tsc_hz_hypervisor(); - if (g_tsc_hz) - return; -} +#include "utils/fast_clock.h" int _PalSystemTimeQuery(uint64_t* out_usec) { - int ret; - - if (!g_tsc_hz) { - /* RDTSC is not allowed or no Invariant TSC feature -- fallback to the slow ocall */ - return ocall_gettime(out_usec); - } - - uint32_t seq; - uint64_t start_tsc; - uint64_t start_usec; - do { - seq = read_seqbegin(&g_tsc_lock); - start_tsc = g_start_tsc; - start_usec = g_start_usec; - } while (read_seqretry(&g_tsc_lock, seq)); - - uint64_t usec = 0; - /* Last seen RDTSC-calculated time value. This guards against time rewinding. */ - static uint64_t last_usec = 0; - if (start_tsc > 0 && start_usec > 0) { - /* baseline TSC/usec pair was initialized, can calculate time via RDTSC (but should be - * careful with integer overflow during calculations) */ - uint64_t diff_tsc = get_tsc() - start_tsc; - if (diff_tsc < UINT64_MAX / 1000000) { - uint64_t diff_usec = diff_tsc * 1000000 / g_tsc_hz; - if (diff_usec < TSC_REFINE_INIT_TIMEOUT_USECS) { - /* less than TSC_REFINE_INIT_TIMEOUT_USECS passed from the previous update of - * TSC/usec pair (time drift is contained), use the RDTSC-calculated time */ - usec = start_usec + diff_usec; - if (usec < start_usec) - return -PAL_ERROR_OVERFLOW; - - /* It's simply `last_usec = max(last_usec, usec)`, but executed atomically. */ - uint64_t expected_usec = __atomic_load_n(&last_usec, __ATOMIC_ACQUIRE); - while (expected_usec < usec) { - if (__atomic_compare_exchange_n(&last_usec, &expected_usec, usec, - /*weak=*/true, __ATOMIC_RELEASE, - __ATOMIC_ACQUIRE)) { - break; - } - } - - *out_usec = MAX(usec, expected_usec); - return 0; - } - } - } - - /* if we are here, either the baseline TSC/usec pair was not yet initialized or too much time - * passed since the previous TSC/usec update, so let's refresh them to contain the time drift */ - uint64_t tsc_cyc1 = get_tsc(); - ret = ocall_gettime(&usec); - if (ret < 0) - return -PAL_ERROR_DENIED; - uint64_t tsc_cyc2 = get_tsc(); - - uint64_t last_recorded_rdtsc = __atomic_load_n(&last_usec, __ATOMIC_ACQUIRE); - if (usec < last_recorded_rdtsc) { - /* new OCALL-obtained timestamp (`usec`) is "back in time" than the last recorded timestamp - * from RDTSC (`last_recorded_rdtsc`); this can happen if the actual host time drifted - * backwards compared to the RDTSC time. */ - usec = last_recorded_rdtsc; - } - - /* we need to match the OCALL-obtained timestamp (`usec`) with the RDTSC-obtained number of - * cycles (`tsc_cyc`); since OCALL is a time-consuming operation, we estimate `tsc_cyc` as a - * mid-point between the RDTSC values obtained right-before and right-after the OCALL. */ - uint64_t tsc_cyc = tsc_cyc1 + (tsc_cyc2 - tsc_cyc1) / 2; - if (tsc_cyc < tsc_cyc1) - return -PAL_ERROR_OVERFLOW; - - /* refresh the baseline data if no other thread updated g_start_tsc */ - write_seqbegin(&g_tsc_lock); - if (g_start_tsc < tsc_cyc) { - g_start_tsc = tsc_cyc; - g_start_usec = usec; - } - write_seqend(&g_tsc_lock); - - *out_usec = usec; - return 0; + return fast_clock__get_time(&g_fast_clock, out_usec, false); } static uint32_t g_extended_feature_flags_max_supported_sub_leaves = 0; @@ -515,7 +317,6 @@ static const struct cpuid_leaf cpuid_known_leaves[] = { /* hypervisor-specific CPUID leaf functions (0x40000000 - 0x400000FF) start here */ {.leaf = 0x40000000, .zero_subleaf = true, .cache = true}, /* CPUID Info */ {.leaf = 0x40000010, .zero_subleaf = true, .cache = true}, /* VMWare-style Timing Info */ - /* NOTE: currently only the above two leaves are used, see also get_tsc_hz_hypervisor() */ /* invalid CPUID leaf functions (no existing or future CPU will return any meaningful * information in these leaves) occupy 0x40000100 - 0x4FFFFFFF -- they are treated the same as diff --git a/pal/src/host/linux-sgx/utils/fast_clock.c b/pal/src/host/linux-sgx/utils/fast_clock.c index 189fcbf587..3d1a14e6fc 100644 --- a/pal/src/host/linux-sgx/utils/fast_clock.c +++ b/pal/src/host/linux-sgx/utils/fast_clock.c @@ -182,12 +182,6 @@ int fast_clock__get_time(fast_clock_t* fast_clock, uint64_t* time_usec, bool for } } -bool fast_clock__is_enabled(const fast_clock_t* fast_clock) -{ - fast_clock_desc_t descriptor = desc_atomic_load(fast_clock, __ATOMIC_RELAXED); - return (descriptor.state != FC_STATE_RDTSC_DISABLED); -} - static inline bool set_change_state_guard(fast_clock_t* fast_clock, fast_clock_desc_t descriptor) { if ((descriptor.flags & FC_FLAGS_STATE_CHANGING) != 0) { @@ -205,6 +199,25 @@ static inline bool set_change_state_guard(fast_clock_t* fast_clock, fast_clock_d ); } +bool fast_clock__is_enabled(const fast_clock_t* fast_clock) +{ + fast_clock_desc_t descriptor = desc_atomic_load(fast_clock, __ATOMIC_RELAXED); + return (descriptor.state != FC_STATE_RDTSC_DISABLED); +} + +void fast_clock__disable(fast_clock_t* fast_clock) +{ + /* We need to busy-loop until the state change guard is acquired here - since fast-clock + * might be in the midst of transitioning states. We can't simply store the DISABLED state. */ + fast_clock_desc_t descriptor; + do { + descriptor = desc_atomic_load(fast_clock, __ATOMIC_ACQUIRE); + } while(!set_change_state_guard(fast_clock, descriptor)); + + fast_clock_desc_t disabled_desc = advance_state(descriptor, FC_STATE_RDTSC_DISABLED, false); + desc_atomic_store(fast_clock, disabled_desc, __ATOMIC_RELEASE); +} + static inline fast_clock_timepoint_t* get_timepoint(fast_clock_t* fast_clock, fast_clock_desc_t descriptor) { return &fast_clock->time_points[timepoint_index(descriptor)]; diff --git a/pal/src/host/linux-sgx/utils/fast_clock.h b/pal/src/host/linux-sgx/utils/fast_clock.h index a25fe4189d..5808b7a28f 100644 --- a/pal/src/host/linux-sgx/utils/fast_clock.h +++ b/pal/src/host/linux-sgx/utils/fast_clock.h @@ -70,6 +70,7 @@ extern fast_clock_t g_fast_clock; int fast_clock__get_time(fast_clock_t* fast_clock, uint64_t* time_micros, bool force_new_timepoint); void fast_clock__get_timezone(const fast_clock_t* fast_clock, int* tz_minutewest, int* tz_dsttime); bool fast_clock__is_enabled(const fast_clock_t* fast_clock); +void fast_clock__disable(fast_clock_t* fast_clock); #ifdef __cplusplus } /* extern int */