From 664de3ddc19edbc3fedf529da23b2aec86d996bb Mon Sep 17 00:00:00 2001 From: Jonathan Shamir Date: Sun, 24 Mar 2024 17:47:09 +0200 Subject: [PATCH] [PAL/Linux-SGX] Add new implementation for clock emulation without relying on cpuid and steady clock frequency heuristics (fast-clock) Signed-off-by: Jonathan Shamir --- common/include/arch/x86_64/cpu.h | 2 - pal/src/host/linux-sgx/enclave_ocalls.c | 82 +++-- pal/src/host/linux-sgx/enclave_ocalls.h | 2 +- pal/src/host/linux-sgx/host_ocalls.c | 4 +- pal/src/host/linux-sgx/meson.build | 1 + pal/src/host/linux-sgx/pal_exception.c | 9 +- pal/src/host/linux-sgx/pal_linux.h | 2 - pal/src/host/linux-sgx/pal_main.c | 32 +- pal/src/host/linux-sgx/pal_misc.c | 208 +------------ pal/src/host/linux-sgx/pal_ocall_types.h | 1 + pal/src/host/linux-sgx/utils/fast_clock.c | 364 ++++++++++++++++++++++ pal/src/host/linux-sgx/utils/fast_clock.h | 46 +++ 12 files changed, 504 insertions(+), 249 deletions(-) create mode 100644 pal/src/host/linux-sgx/utils/fast_clock.c create mode 100644 pal/src/host/linux-sgx/utils/fast_clock.h diff --git a/common/include/arch/x86_64/cpu.h b/common/include/arch/x86_64/cpu.h index ad42bcbab4..f55c7a21fa 100644 --- a/common/include/arch/x86_64/cpu.h +++ b/common/include/arch/x86_64/cpu.h @@ -45,8 +45,6 @@ enum extended_state_sub_leaf { #define PROC_FREQ_LEAF 0x16 #define AMX_TILE_INFO_LEAF 0x1D #define AMX_TMUL_INFO_LEAF 0x1E -#define HYPERVISOR_INFO_LEAF 0x40000000 -#define HYPERVISOR_VMWARE_TIME_LEAF 0x40000010 #define MAX_INPUT_EXT_VALUE_LEAF 0x80000000 #define EXT_SIGNATURE_AND_FEATURES_LEAF 0x80000001 #define CPU_BRAND_LEAF 0x80000002 diff --git a/pal/src/host/linux-sgx/enclave_ocalls.c b/pal/src/host/linux-sgx/enclave_ocalls.c index c7259056e9..43afc62ffb 100644 --- a/pal/src/host/linux-sgx/enclave_ocalls.c +++ b/pal/src/host/linux-sgx/enclave_ocalls.c @@ -1766,47 +1766,85 @@ int ocall_shutdown(int sockfd, int how) { return retval; } -int ocall_gettime(uint64_t* microsec_ptr) { +int 
ocall_gettime(uint64_t* microsec_ptr, uint64_t* tsc_ptr) { int retval = 0; - struct ocall_gettime* ocall_gettime_args; + struct ocall_gettime* ocall_gettime_args = NULL; void* old_ustack = sgx_prepare_ustack(); ocall_gettime_args = sgx_alloc_on_ustack_aligned(sizeof(*ocall_gettime_args), alignof(*ocall_gettime_args)); if (!ocall_gettime_args) { - sgx_reset_ustack(old_ustack); - return -EPERM; + retval = -EPERM; + goto out; } /* Last seen time value. This guards against time rewinding. */ - static uint64_t last_microsec = 0; - uint64_t last_microsec_before_ocall = __atomic_load_n(&last_microsec, __ATOMIC_ACQUIRE); + struct gettime_guard + { + spinlock_t lock; + uint64_t microsec; + uint64_t tsc; + }; + static struct gettime_guard last_value = { + .lock = INIT_SPINLOCK_UNLOCKED, + .microsec = 0, + .tsc = 0, + }; + + spinlock_lock(&last_value.lock); + uint64_t last_microsec_before_ocall = last_value.microsec; + uint64_t last_tsc_before_ocall = last_value.tsc; + spinlock_unlock(&last_value.lock); + + uint64_t tsc_before_ocall = 0; + uint64_t tsc_after_ocall = 0; do { + tsc_before_ocall = get_tsc(); retval = sgx_exitless_ocall(OCALL_GETTIME, ocall_gettime_args); + tsc_after_ocall = get_tsc(); } while (retval == -EINTR); if (retval < 0 && retval != -EINVAL && retval != -EPERM) { retval = -EPERM; } + if (retval != 0) { + goto out; + } - if (!retval) { - uint64_t microsec = COPY_UNTRUSTED_VALUE(&ocall_gettime_args->microsec); - if (microsec < last_microsec_before_ocall) { - /* Probably a malicious host. */ - log_error("OCALL_GETTIME returned time value smaller than in the previous call"); - _PalProcessExit(1); - } - /* Update `last_microsec`. 
*/ - uint64_t expected_microsec = last_microsec_before_ocall; - while (expected_microsec < microsec) { - if (__atomic_compare_exchange_n(&last_microsec, &expected_microsec, microsec, - /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)) { - break; - } - } - *microsec_ptr = MAX(microsec, expected_microsec); + /* detect malicious host - time and tsc must monotonically increase */ + uint64_t new_microsec = COPY_UNTRUSTED_VALUE(&ocall_gettime_args->microsec); + uint64_t new_tsc = COPY_UNTRUSTED_VALUE(&ocall_gettime_args->tsc); + if (new_microsec < last_microsec_before_ocall) { + log_error("OCALL_GETTIME returned time value smaller than in the previous call"); + _PalProcessExit(1); } + if (new_tsc <= last_tsc_before_ocall) { + log_error("OCALL_GETTIME returned TSC value smaller than in previous call"); + _PalProcessExit(1); + } + if (!((tsc_before_ocall < new_tsc) && (new_tsc < tsc_after_ocall))) { + log_error("OCALL_GETTIME returned TSC value inconsistent with values taken within the enclave"); + _PalProcessExit(1); + } + + /* Update `last_value` guard. 
*/ + spinlock_lock(&last_value.lock); + if (last_value.tsc < new_tsc) { + last_value.microsec = new_microsec; + last_value.tsc = new_tsc; + } else { + /* there was a more recent ocall */ + new_microsec = last_value.microsec; + new_tsc = last_value.tsc; + } + spinlock_unlock(&last_value.lock); + *microsec_ptr = new_microsec; + if (tsc_ptr != NULL) { + *tsc_ptr = new_tsc; + } + +out: sgx_reset_ustack(old_ustack); return retval; } diff --git a/pal/src/host/linux-sgx/enclave_ocalls.h b/pal/src/host/linux-sgx/enclave_ocalls.h index bf8b05f849..a0fcefc84b 100644 --- a/pal/src/host/linux-sgx/enclave_ocalls.h +++ b/pal/src/host/linux-sgx/enclave_ocalls.h @@ -89,7 +89,7 @@ int ocall_create_process(size_t nargs, const char** args, uintptr_t (*reserved_m int ocall_futex(uint32_t* uaddr, int op, int val, uint64_t* timeout_us); -int ocall_gettime(uint64_t* microsec); +int ocall_gettime(uint64_t* microsec, uint64_t* tsc); void ocall_sched_yield(void); diff --git a/pal/src/host/linux-sgx/host_ocalls.c b/pal/src/host/linux-sgx/host_ocalls.c index eba742e55c..fdf1fa9aa0 100644 --- a/pal/src/host/linux-sgx/host_ocalls.c +++ b/pal/src/host/linux-sgx/host_ocalls.c @@ -603,8 +603,10 @@ static long sgx_ocall_shutdown(void* args) { static long sgx_ocall_gettime(void* args) { struct ocall_gettime* ocall_gettime_args = args; struct timeval tv; + uint64_t tsc = get_tsc(); DO_SYSCALL(gettimeofday, &tv, NULL); - ocall_gettime_args->microsec = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; + ocall_gettime_args->microsec = tv.tv_sec * (uint64_t)1000000UL + tv.tv_usec; + ocall_gettime_args->tsc = tsc; return 0; } diff --git a/pal/src/host/linux-sgx/meson.build b/pal/src/host/linux-sgx/meson.build index e10defc768..347ad98924 100644 --- a/pal/src/host/linux-sgx/meson.build +++ b/pal/src/host/linux-sgx/meson.build @@ -86,6 +86,7 @@ libpal_sgx = shared_library('pal', 'pal_sockets.c', 'pal_streams.c', 'pal_threading.c', + 'utils/fast_clock.c', pal_sgx_asm_offsets_h, pal_common_sources, 
pal_linux_common_sources_enclave, diff --git a/pal/src/host/linux-sgx/pal_exception.c b/pal/src/host/linux-sgx/pal_exception.c index 6bc890bdc4..7d38aa3229 100644 --- a/pal/src/host/linux-sgx/pal_exception.c +++ b/pal/src/host/linux-sgx/pal_exception.c @@ -111,11 +111,14 @@ static void save_pal_context(PAL_CONTEXT* ctx, sgx_cpu_context_t* uc, } } +#include "utils/fast_clock.h" + +int g_atomic_is_rdtsc_emulated = 0; static void emulate_rdtsc_and_print_warning(sgx_cpu_context_t* uc) { if (FIRST_TIME()) { - /* if we end up emulating RDTSC/RDTSCP instruction, we cannot use invariant TSC */ - extern uint64_t g_tsc_hz; - g_tsc_hz = 0; + /* if we end up emulating RDTSC/RDTSCP instruction, we cannot use TSC-based clock emulation */ + __atomic_store_n(&g_atomic_is_rdtsc_emulated, 1, __ATOMIC_SEQ_CST); + fast_clock_disable(&g_fast_clock); log_warning("all RDTSC/RDTSCP instructions are emulated (imprecisely) via gettime() " "syscall."); } diff --git a/pal/src/host/linux-sgx/pal_linux.h b/pal/src/host/linux-sgx/pal_linux.h index 914d75f1a4..50a2555b03 100644 --- a/pal/src/host/linux-sgx/pal_linux.h +++ b/pal/src/host/linux-sgx/pal_linux.h @@ -99,8 +99,6 @@ void _PalExceptionHandler(uint32_t trusted_exit_info_, uint32_t untrusted_external_event, sgx_cpu_context_t* uc, PAL_XREGS_STATE* xregs_state, sgx_arch_exinfo_t* exinfo); -void init_tsc(void); - int init_cpuid(void); int init_enclave(void); diff --git a/pal/src/host/linux-sgx/pal_main.c b/pal/src/host/linux-sgx/pal_main.c index a5d32f47f7..3257e6dcff 100644 --- a/pal/src/host/linux-sgx/pal_main.c +++ b/pal/src/host/linux-sgx/pal_main.c @@ -31,6 +31,7 @@ #include "pal_topology.h" #include "toml.h" #include "toml_utils.h" +#include "utils/fast_clock.h" struct pal_linuxsgx_state g_pal_linuxsgx_state; @@ -407,7 +408,6 @@ static int import_and_init_extra_runtime_domain_names(struct pal_dns_host_conf* extern void* g_enclave_base; extern void* g_enclave_top; extern bool g_allowed_files_warn; -extern uint64_t g_tsc_hz; extern size_t 
g_unused_tcs_pages_num; static int print_warnings_on_insecure_configs(PAL_HANDLE parent_process) { @@ -552,11 +552,17 @@ static int print_warnings_on_insecure_configs(PAL_HANDLE parent_process) { return ret; } -static void print_warning_on_invariant_tsc(PAL_HANDLE parent_process) { - if (!parent_process && !g_tsc_hz) { - /* Warn only in the first process. */ - log_warning("Could not set up Invariant TSC (CPU is too old or you run on a VM that does " - "not expose corresponding CPUID leaves). This degrades performance."); +static void print_warnings_on_disabled_clock_emulation(PAL_HANDLE parent_process) { + if (parent_process) { + return; /* Warn only in the first process */ + } + + /* We call get_tsc() early in pal_linux_main - + * if rdtsc opcode is emulated, the error handler disables fast-clock + */ + if (!fast_clock_is_enabled(&g_fast_clock)) { + log_warning("Could not enable fast clock emulation (CPU is too old or VM does " + "not support TSC within SGX enclave). This degrades performance."); } } @@ -581,8 +587,7 @@ static void post_callback(void) { ocall_exit(1, /*is_exitgroup=*/true); } - print_warning_on_invariant_tsc(g_pal_common_state.parent_process); - + print_warnings_on_disabled_clock_emulation(g_pal_common_state.parent_process); print_warnings_on_invalid_dns_host_conf(g_pal_common_state.parent_process); } @@ -725,12 +730,11 @@ noreturn void pal_linux_main(void* uptr_libpal_uri, size_t libpal_uri_len, void* SET_ENCLAVE_TCB(ready_for_exceptions, 1UL); - /* initialize "Invariant TSC" HW feature for fast and accurate gettime and immediately probe - * RDTSC instruction inside SGX enclave (via dummy get_tsc) -- it is possible that - * the CPU supports invariant TSC but doesn't support executing RDTSC inside SGX enclave, in - * this case the SIGILL exception is generated and leads to emulate_rdtsc_and_print_warning() - * which unsets invariant TSC, and we end up falling back to the slower ocall_gettime() */ - init_tsc(); + /* We implement a "fast-path" clock 
that is emulated internally using x86 RDTSC instruction. + * It is possible that the CPU does not support the RDTSC instruction within SGX enclave, + * in this case the SIGILL exception is generated and leads to emulate_rdtsc_and_print_warning() + * which disables the TSC based clock, and we end up falling back to the slower ocall_gettime() + */ (void)get_tsc(); /* must be after `ready_for_exceptions=1` since it may generate SIGILL */ ret = init_cpuid(); diff --git a/pal/src/host/linux-sgx/pal_misc.c b/pal/src/host/linux-sgx/pal_misc.c index d86a1b21e8..d2b118b337 100644 --- a/pal/src/host/linux-sgx/pal_misc.c +++ b/pal/src/host/linux-sgx/pal_misc.c @@ -22,208 +22,10 @@ #include "spinlock.h" #include "toml_utils.h" #include "topo_info.h" - -/* The timeout of 50ms was found to be a safe TSC drift correction periodicity based on results - * from multiple systems. Any higher or lower could pose risks of negative time drift or - * performance hit respectively. - */ -#define TSC_REFINE_INIT_TIMEOUT_USECS 50000 - -uint64_t g_tsc_hz = 0; /* TSC frequency for fast and accurate time ("invariant TSC" HW feature) */ -static uint64_t g_start_tsc = 0; -static uint64_t g_start_usec = 0; -static seqlock_t g_tsc_lock = INIT_SEQLOCK_UNLOCKED; - -static bool is_tsc_usable(void) { - uint32_t words[CPUID_WORD_NUM]; - _PalCpuIdRetrieve(INVARIANT_TSC_LEAF, 0, words); - return words[CPUID_WORD_EDX] & (1 << 8); -} - -/* return TSC frequency or 0 if invariant TSC is not supported */ -static uint64_t get_tsc_hz_baremetal(void) { - uint32_t words[CPUID_WORD_NUM]; - - /* - * Based on "Time Stamp Counter and Nominal Core Crystal Clock Information" leaf, calculate TSC - * frequency as ECX * EBX / EAX, where - * - EAX is denominator of the TSC/"core crystal clock" ratio, - * - EBX is numerator of the TSC/"core crystal clock" ratio, - * - ECX is core crystal clock (nominal) frequency in Hz. 
- */ - _PalCpuIdRetrieve(TSC_FREQ_LEAF, 0, words); - if (!words[CPUID_WORD_EAX] || !words[CPUID_WORD_EBX]) { - /* TSC/core crystal clock ratio is not enumerated, can't use RDTSC for accurate time */ - return 0; - } - - if (words[CPUID_WORD_ECX] > 0) { - /* cast to 64-bit first to prevent integer overflow */ - return (uint64_t)words[CPUID_WORD_ECX] * words[CPUID_WORD_EBX] / words[CPUID_WORD_EAX]; - } - - /* some Intel CPUs do not report nominal frequency of crystal clock, let's calculate it - * based on Processor Frequency Information Leaf (CPUID 16H); this leaf always exists if - * TSC Frequency Leaf exists; logic is taken from Linux 5.11's arch/x86/kernel/tsc.c */ - _PalCpuIdRetrieve(PROC_FREQ_LEAF, 0, words); - if (!words[CPUID_WORD_EAX]) { - /* processor base frequency (in MHz) is not enumerated, can't calculate frequency */ - return 0; - } - - /* processor base frequency is in MHz but we need to return TSC frequency in Hz; cast to 64-bit - * first to prevent integer overflow */ - return (uint64_t)words[CPUID_WORD_EAX] * 1000000; -} - -/* return TSC frequency or 0 if invariant TSC is not supported */ -static uint64_t get_tsc_hz_hypervisor(void) { - uint32_t words[CPUID_WORD_NUM]; - - /* - * We rely on the Generic CPUID space for hypervisors: - * - 0x40000000: EAX: The maximum input value for CPUID supported by the hypervisor - * - EBX, ECX, EDX: Hypervisor vendor ID signature (hypervisor_id) - * - * If we detect QEMU/KVM or Cloud Hypervisor/KVM (hypervisor_id = "KVMKVMKVM") or VMWare - * ("VMwareVMware"), then we assume that leaf 0x40000010 contains virtual TSC frequency in kHz - * in EAX. We check hypervisor_id because leaf 0x40000010 is not standardized and e.g. Microsoft - * Hyper-V may use it for other purposes. 
- * - * Relevant materials: - * - https://github.com/qemu/qemu/commit/9954a1582e18b03ddb66f6c892dccf2c3508f4b2 - * - qemu/target/i386/cpu.h, qemu/target/i386/cpu.c, qemu/target/i386/kvm/kvm.c sources - * - https://github.com/freebsd/freebsd-src/blob/9df6eea/sys/x86/x86/identcpu.c#L1372-L1377 (for - * the list of hypervisor_id values) - */ - _PalCpuIdRetrieve(HYPERVISOR_INFO_LEAF, 0, words); - - bool is_kvm = words[CPUID_WORD_EBX] == 0x4b4d564b - && words[CPUID_WORD_ECX] == 0x564b4d56 - && words[CPUID_WORD_EDX] == 0x0000004d; - bool is_vmware = words[CPUID_WORD_EBX] == 0x61774d56 - && words[CPUID_WORD_ECX] == 0x4d566572 - && words[CPUID_WORD_EDX] == 0x65726177; - - if (!is_kvm && !is_vmware) { - /* not a hypervisor that contains "virtual TSC frequency" in leaf 0x40000010 */ - return 0; - } - - if (words[CPUID_WORD_EAX] < HYPERVISOR_VMWARE_TIME_LEAF) { - /* virtual TSC frequency is not available */ - return 0; - } - - _PalCpuIdRetrieve(HYPERVISOR_VMWARE_TIME_LEAF, 0, words); - if (!words[CPUID_WORD_EAX]) { - /* TSC frequency (in kHz) is not enumerated, can't calculate frequency */ - return 0; - } - - /* TSC frequency is in kHz but we need to return TSC frequency in Hz; cast to 64-bit first to - * prevent integer overflow */ - return (uint64_t)words[CPUID_WORD_EAX] * 1000; -} - -/* initialize the data structures used for date/time emulation using TSC */ -void init_tsc(void) { - if (!is_tsc_usable()) - return; - - g_tsc_hz = get_tsc_hz_baremetal(); - if (g_tsc_hz) - return; - - /* hypervisors may not expose crystal-clock frequency CPUID leaves, so instead try - * hypervisor-special synthetic CPUID leaf 0x40000010 (VMWare-style Timing Information) */ - g_tsc_hz = get_tsc_hz_hypervisor(); - if (g_tsc_hz) - return; -} +#include "utils/fast_clock.h" int _PalSystemTimeQuery(uint64_t* out_usec) { - int ret; - - if (!g_tsc_hz) { - /* RDTSC is not allowed or no Invariant TSC feature -- fallback to the slow ocall */ - return ocall_gettime(out_usec); - } - - uint32_t seq; - 
uint64_t start_tsc; - uint64_t start_usec; - do { - seq = read_seqbegin(&g_tsc_lock); - start_tsc = g_start_tsc; - start_usec = g_start_usec; - } while (read_seqretry(&g_tsc_lock, seq)); - - uint64_t usec = 0; - /* Last seen RDTSC-calculated time value. This guards against time rewinding. */ - static uint64_t last_usec = 0; - if (start_tsc > 0 && start_usec > 0) { - /* baseline TSC/usec pair was initialized, can calculate time via RDTSC (but should be - * careful with integer overflow during calculations) */ - uint64_t diff_tsc = get_tsc() - start_tsc; - if (diff_tsc < UINT64_MAX / 1000000) { - uint64_t diff_usec = diff_tsc * 1000000 / g_tsc_hz; - if (diff_usec < TSC_REFINE_INIT_TIMEOUT_USECS) { - /* less than TSC_REFINE_INIT_TIMEOUT_USECS passed from the previous update of - * TSC/usec pair (time drift is contained), use the RDTSC-calculated time */ - usec = start_usec + diff_usec; - if (usec < start_usec) - return -PAL_ERROR_OVERFLOW; - - /* It's simply `last_usec = max(last_usec, usec)`, but executed atomically. 
*/ - uint64_t expected_usec = __atomic_load_n(&last_usec, __ATOMIC_ACQUIRE); - while (expected_usec < usec) { - if (__atomic_compare_exchange_n(&last_usec, &expected_usec, usec, - /*weak=*/true, __ATOMIC_RELEASE, - __ATOMIC_ACQUIRE)) { - break; - } - } - - *out_usec = MAX(usec, expected_usec); - return 0; - } - } - } - - /* if we are here, either the baseline TSC/usec pair was not yet initialized or too much time - * passed since the previous TSC/usec update, so let's refresh them to contain the time drift */ - uint64_t tsc_cyc1 = get_tsc(); - ret = ocall_gettime(&usec); - if (ret < 0) - return -PAL_ERROR_DENIED; - uint64_t tsc_cyc2 = get_tsc(); - - uint64_t last_recorded_rdtsc = __atomic_load_n(&last_usec, __ATOMIC_ACQUIRE); - if (usec < last_recorded_rdtsc) { - /* new OCALL-obtained timestamp (`usec`) is "back in time" than the last recorded timestamp - * from RDTSC (`last_recorded_rdtsc`); this can happen if the actual host time drifted - * backwards compared to the RDTSC time. */ - usec = last_recorded_rdtsc; - } - - /* we need to match the OCALL-obtained timestamp (`usec`) with the RDTSC-obtained number of - * cycles (`tsc_cyc`); since OCALL is a time-consuming operation, we estimate `tsc_cyc` as a - * mid-point between the RDTSC values obtained right-before and right-after the OCALL. 
*/ - uint64_t tsc_cyc = tsc_cyc1 + (tsc_cyc2 - tsc_cyc1) / 2; - if (tsc_cyc < tsc_cyc1) - return -PAL_ERROR_OVERFLOW; - - /* refresh the baseline data if no other thread updated g_start_tsc */ - write_seqbegin(&g_tsc_lock); - if (g_start_tsc < tsc_cyc) { - g_start_tsc = tsc_cyc; - g_start_usec = usec; - } - write_seqend(&g_tsc_lock); - - *out_usec = usec; - return 0; + return fast_clock_get_time(&g_fast_clock, out_usec); } static uint32_t g_extended_feature_flags_max_supported_sub_leaves = 0; @@ -512,10 +314,8 @@ static const struct cpuid_leaf cpuid_known_leaves[] = { {.leaf = 0x1F, .zero_subleaf = false, .cache = false}, /* Intel V2 Ext Topology Enumeration */ /* basic CPUID leaf functions end here */ - /* hypervisor-specific CPUID leaf functions (0x40000000 - 0x400000FF) start here */ - {.leaf = 0x40000000, .zero_subleaf = true, .cache = true}, /* CPUID Info */ - {.leaf = 0x40000010, .zero_subleaf = true, .cache = true}, /* VMWare-style Timing Info */ - /* NOTE: currently only the above two leaves are used, see also get_tsc_hz_hypervisor() */ + /* hypervisor-specific CPUID leaf functions (0x40000000 - 0x400000FF) */ + /* not used, see code below */ /* invalid CPUID leaf functions (no existing or future CPU will return any meaningful * information in these leaves) occupy 0x40000100 - 0x4FFFFFFF -- they are treated the same as diff --git a/pal/src/host/linux-sgx/pal_ocall_types.h b/pal/src/host/linux-sgx/pal_ocall_types.h index 793282c81e..20d3a1e44e 100644 --- a/pal/src/host/linux-sgx/pal_ocall_types.h +++ b/pal/src/host/linux-sgx/pal_ocall_types.h @@ -294,6 +294,7 @@ struct ocall_shutdown { struct ocall_gettime { uint64_t microsec; + uint64_t tsc; }; struct ocall_poll { diff --git a/pal/src/host/linux-sgx/utils/fast_clock.c b/pal/src/host/linux-sgx/utils/fast_clock.c new file mode 100644 index 0000000000..f9b202616f --- /dev/null +++ b/pal/src/host/linux-sgx/utils/fast_clock.c @@ -0,0 +1,364 @@ +#include "api.h" +#include "cpu.h" +#include "enclave_ocalls.h" 
+#include "pal_internal.h" +#include "utils/fast_clock.h" + +/** + * FastClock + * + * The purpose of this module is to provide a fast sgx implementation for gettimeofday(). + * - What this does: avoids OCALL on every gettimeofday() invocation. Given a "ground truth" + * timepoint, we can calculate the current time directly inside the enclave. + * - What this doesn't do: this solution does *NOT* provide a trusted time implementation. + * This still relies on the untrusted host time. + * + * In order to calculate the current time inside the enclave, we need the following: + * 1. t0 - a point in time that all fast clock times will be calculated against. + * 2. tsc0 - the clock cycle counter for that point in time + * 3. clock_frequency - how many clock cycles do we have per second. The tsc value is synced + * between all cores. + * Using the above, given the current tsc we can calculate the current time. + * + * Note: old SGX enclaves (prior to SGX2) do not support using the `rdtsc` opcode to read the TSC. + * + * *** Implementation *** + * + * FastClock is implemented as a state machine. This was done since we don't have a good portable + * way to get the cpu clock frequency. So, our general strategy is to simply "calculate" it, by + * comparing two timeval values and their corresponding tsc values. + * + * The naive way of making this calculation is to take two timepoints during initialization with a + * "sleep" in between. Instead, we're letting the program run "organically", and using the time + * that passes between calls to gettimeofday() as our sleep. This means FastClock will perform an + * OCALL when needed, and calculate the time internally when it can. + * + * FastClock has the following states: + * + * INIT┌─►CALIBRATING┌─►RDTSC──►RDTSC_RECALIBRATE─┐ + * │ │ │ + * └─►DISABLED └────────────────────────────┘ + * + * 1. INIT - this is the initial state for fast_clock. All calls are OCALLs + * a. 
check if rdtsc() is allowed from sgx within the current enclave (-> DISABLED otherwise) + * b. take the initial t0 and tsc0 values used for calibration (-> CALIBRATING) + * 2. DISABLED - slow path, all calls will be OCALLs + * 3. CALIBRATING - wait for some time to pass so we can calculate the clock_frequency + * a. OCALL to get the current time + * b. if "enough" time has passed since t0, we calculate clock_frequency (-> RDTSC) + * 4. RDTSC - fast path, time calculation is done within the enclave + * a. calculate the current time by using clock_frequency, t0 and the tsc value taken within + * the enclave. + * b. if a "long" time has passed since we last synced with the host, OCALL to get new values + * for t0 and tsc0 to reduce divergence (-> RDTSC_RECALIBRATE) + * 5. RDTSC_RECALIBRATE - similar to CALIBRATING, calculate an updated clock_frequency + * a. since we have a previous calculation of clock_frequency, we still use the "fast path" to + * calculate the time within the enclave. + * b. when enough time has passed to re-calculate the frequency, we OCALL to get a second + * "ground truth" and calculate a new clock_frequency (-> RDTSC) + * + * *** Thread safety *** + * + * As far as multithreading goes, we had the following goals. We wanted the solution to give + * consistent times between all threads. This means FastClock state can't be thread-local, and + * needs to be thread-safe. And since this is a performance optimization, we need this to be + * lockless (and definitely no OCALLs other than gettimeofday). + * + * To achieve the above, we use the following data structures. + * + * 1. fast_clock_timepoint - this contains all the internal state needed by FastClock to + * calculate the time as discussed above. FastClock internally has *two* timepoints, which + * are used in round-robin (alternating). + * 2. fast_clock_desc - this is read and written to atomically, which is how the lockless + * thread safety is implemented.
The descriptor contains: + * - The current "state" of the FastClock state machine. + * - The round-robin index of the timepoint that is currently in use. + * - A flag that guards state transitions: in case of concurrent calls, only a single thread + * should calculate the new timepoint data and transition the state. + * + * By using an atomic descriptor and round-robin timepoints, we can make sure only a single thread + * is changing the timepoint values, and no one can read "intermediate" state. We will only store + * the new descriptor pointing to the "next" state and timepoint after it's usable. + * + * Note: in theory this is not thread-safe, as we can have the following - + * 1. Thread A reads descriptor, starts flow using timepoint #0, then context switch. + * 2. Some time passes and we transition to timepoint #1. + * 3. Some more time passes and we transition back to timepoint #0. + * 4. Thread A wakes up and reads inconsistent state in timepoint #0. In the worst case this might + * lead to a negative or maximal time value. + * In practice this is not expected to happen, since a long time passes between transitioning timepoints. + */ + + +/** + * We got these values experimentally (on Azure dc#sv3 machines, SGX2 secure compute) - + * 1. increasing CALIBRATION_TIME beyond 1 second doesn't increase the accuracy of the calculated + * clock frequency or times, + * 2. 120 seconds keeps the time-drift with host time typically in the 50us range, and very rarely + * at the 1ms range. + * + * Note, time drift can vary, "ground truth" values can be "bad" and skew the calculation. This is + * true regardless of the numbers we choose or the implementation (as long as we rely on OCALLing to + * tell the time). The recalibration interval is used to bound the resulting error.
+ */ +#define RDTSC_CALIBRATION_TIME ((uint64_t)1 * TIME_US_IN_S) +#define RDTSC_RECALIBRATION_INTERVAL ((uint64_t)120 * TIME_US_IN_S) + +typedef enum +{ + FC_STATE_RDTSC, + FC_STATE_RDTSC_RECALIBRATE, + FC_STATE_CALIBRATING, + FC_STATE_INIT, + + FC_STATE_RDTSC_DISABLED, +} fast_clock_state; + +fast_clock g_fast_clock = { + .atomic_descriptor = { + .state = FC_STATE_INIT, + .timepoint_index = 0, + .state_changing = 0, + }, + .time_points = { [0 ... FC_NUM_TIMEPOINTS-1] = { + .clock_freq = 0, + .tsc0 = 0, + .t0_usec = 0, + .expiration_usec = 0, + }} +}; + + +static inline fast_clock_desc advance_state(fast_clock_desc curr, fast_clock_state new_state, bool advance_timepoint) +{ + fast_clock_desc new_descriptor = { + .state = new_state, + .timepoint_index = advance_timepoint ? curr.timepoint_index + 1 : curr.timepoint_index, + .state_changing = 0, + }; + return new_descriptor; +} + +static inline bool is_expired(const fast_clock_timepoint* timepoint, uint64_t now_usec) +{ + return (timepoint->expiration_usec < now_usec); +} + +static inline void calc_time(const fast_clock_timepoint* timepoint, uint64_t* time_usec) +{ + uint64_t tsc = get_tsc(); + uint64_t dtsc = tsc - timepoint->tsc0; + uint64_t dt_usec = (dtsc * TIME_US_IN_S) / timepoint->clock_freq; + *time_usec = timepoint->t0_usec + dt_usec; +} + +static inline void reset_clock_frequency(fast_clock_timepoint* timepoint, uint64_t tsc, uint64_t time_usec) +{ + // calculate clock frequency in Hz + uint64_t dt_usec = time_usec - timepoint->t0_usec; + uint64_t dtsc = tsc - timepoint->tsc0; + timepoint->clock_freq = (dtsc * TIME_US_IN_S) / dt_usec; +} + +static inline long reset_timepoint(fast_clock_timepoint* timepoint) +{ + int ret = ocall_gettime(&timepoint->t0_usec, &timepoint->tsc0); + return ret; +} + +static inline void reset_expiration(fast_clock_timepoint* timepoint, uint64_t next_expiration) +{ + timepoint->expiration_usec = timepoint->t0_usec + next_expiration; +} + +static inline bool 
set_change_state_guard(fast_clock* fast_clock, fast_clock_desc descriptor)
+{
+    if (descriptor.state_changing != 0) {
+        return false;
+    }
+
+    fast_clock_desc state_change_guard_desc = descriptor;
+    state_change_guard_desc.state_changing = 1;
+    return __atomic_compare_exchange_n(
+        &fast_clock->atomic_descriptor.desc, &descriptor.desc, state_change_guard_desc.desc,
+        /*weak=*/false, __ATOMIC_RELAXED, __ATOMIC_RELAXED
+    );
+}
+
+/* Returns the timepoint slot that `descriptor.timepoint_index` selects in `fast_clock`. */
+static inline fast_clock_timepoint* get_timepoint(fast_clock* fast_clock, fast_clock_desc descriptor)
+{
+    return &fast_clock->time_points[descriptor.timepoint_index];
+}
+
+/* Returns true if `g_atomic_is_rdtsc_emulated` is zero, i.e. rdtsc is not being emulated. */
+static bool is_rdtsc_available(void) {
+    // we only need to know whether the rdtsc opcode is emulated (using an emulated rdtsc is
+    // really slow)
+    extern int g_atomic_is_rdtsc_emulated;
+
+    // "optimistic path", assume rdtsc() was called at least once
+    int is_emulated = __atomic_load_n(&g_atomic_is_rdtsc_emulated, __ATOMIC_SEQ_CST);
+    if (is_emulated) {
+        return false;
+    }
+
+    // the flag may not have been set yet - execute rdtsc once to make sure it is initialized,
+    // then read it again
+    (void)get_tsc();
+    is_emulated = __atomic_load_n(&g_atomic_is_rdtsc_emulated, __ATOMIC_SEQ_CST);
+    return !is_emulated;
+}
+
+/* Slow path: fetches the time with ocall_gettime() (no TSC value is requested) and returns its
+ * result. */
+static int handle_state_rdtsc_disabled(uint64_t* time_usec)
+{
+    // slow path - OCALL to get time
+    return ocall_gettime(time_usec, NULL);
+}
+
+/*
+ * FC_STATE_INIT handler. The thread that wins the state change guard decides whether rdtsc is
+ * usable: if it is not, the clock moves to FC_STATE_RDTSC_DISABLED; otherwise a first timepoint
+ * is taken and the clock moves to FC_STATE_CALIBRATING. Threads that lose the guard fall back to
+ * the OCALL. Returns 0 on success or the error code of the failed time query.
+ */
+static int handle_state_init(fast_clock* fast_clock, fast_clock_desc descriptor, uint64_t* time_usec)
+{
+    // another thread is already initializing (or the descriptor changed) - just use the slow path
+    if (!set_change_state_guard(fast_clock, descriptor)) {
+        return handle_state_rdtsc_disabled(time_usec);
+    }
+
+    if (!is_rdtsc_available()) {
+        // storing the new descriptor overwrites the guarded one
+        // NOTE(review): presumably advance_state() keeps the (clear) guard bit of `descriptor`
+        fast_clock_desc next_desc = advance_state(descriptor, FC_STATE_RDTSC_DISABLED, false);
+        __atomic_store_n(&fast_clock->atomic_descriptor.desc, next_desc.desc, __ATOMIC_RELAXED);
+        return handle_state_rdtsc_disabled(time_usec);
+    }
+
+    fast_clock_desc next_desc = advance_state(descriptor, FC_STATE_CALIBRATING, false);
+    fast_clock_timepoint* timepoint = get_timepoint(fast_clock, next_desc);
+    int ret = reset_timepoint(timepoint);
+
+    // reset_timepoint() failed - restore the original descriptor (this also drops the guard, as
+    // `descriptor` has the guard bit clear)
+    if (ret != 0) {
+        __atomic_store_n(&fast_clock->atomic_descriptor.desc, descriptor.desc, __ATOMIC_RELAXED);
+        return ret;
+    }
+
+    // advance state; the release store publishes the timepoint written above
+    reset_expiration(timepoint, RDTSC_CALIBRATION_TIME);
+    __atomic_store_n(&fast_clock->atomic_descriptor.desc, next_desc.desc, __ATOMIC_RELEASE);
+
+    // output results from the timepoint
+    *time_usec = timepoint->t0_usec;
+    return ret;
+}
+
+/*
+ * FC_STATE_CALIBRATING handler. Every caller gets its time from the OCALL. Once the timepoint has
+ * expired, the caller that wins the state change guard calculates the clock frequency from the
+ * fresh (tsc, time) pair and moves the clock to FC_STATE_RDTSC. Returns the result of
+ * ocall_gettime().
+ */
+static int handle_state_calibrating(fast_clock* fast_clock, fast_clock_desc descriptor, uint64_t* time_usec)
+{
+    // all callers in this state will perform an OCALL - no need to set the change_state_guard before OCALLing
+    uint64_t tmp_tsc = 0;
+    int ret = ocall_gettime(time_usec, &tmp_tsc);
+    if (ret != 0) {
+        return ret;
+    }
+
+    // calibration still running, or another thread is finishing it - the OCALL result is enough
+    fast_clock_timepoint* timepoint = get_timepoint(fast_clock, descriptor);
+    if (!is_expired(timepoint, *time_usec) || !set_change_state_guard(fast_clock, descriptor)) {
+        return ret;
+    }
+
+    // calculate the clock_freq and advance state
+    reset_clock_frequency(timepoint, tmp_tsc, *time_usec);
+    reset_expiration(timepoint, RDTSC_RECALIBRATION_INTERVAL);
+    fast_clock_desc new_desc = advance_state(descriptor, FC_STATE_RDTSC, false);
+    __atomic_store_n(&fast_clock->atomic_descriptor.desc, new_desc.desc, __ATOMIC_RELEASE);
+
+    return ret;
+}
+
+/*
+ * FC_STATE_RDTSC handler. Calculates the time from the current timepoint without an OCALL. When
+ * the timepoint has expired, the caller that wins the state change guard takes a new timepoint
+ * and moves the clock to FC_STATE_RDTSC_RECALIBRATE. Returns 0 on success or the error code of
+ * reset_timepoint().
+ */
+static inline int handle_state_rdtsc(fast_clock* fast_clock, fast_clock_desc descriptor, uint64_t* time_usec)
+{
+    fast_clock_timepoint* timepoint = get_timepoint(fast_clock, descriptor);
+
+    // fast path - calculate time with rdtsc
+    calc_time(timepoint, time_usec);
+    bool should_advance = is_expired(timepoint, *time_usec);
+    if (!should_advance || !set_change_state_guard(fast_clock, descriptor)) {
+        return 0;
+    }
+
+    // the state_change_guard is held now - prepare the next state (get new ground truth timepoint)
+    // NOTE(review): presumably the `true` argument makes advance_state() select the other
+    // timepoint slot, so that `timepoint` stays intact for concurrent readers - confirm
+    fast_clock_desc next_desc = advance_state(descriptor, FC_STATE_RDTSC_RECALIBRATE, true);
+    fast_clock_timepoint* next_timepoint = get_timepoint(fast_clock, next_desc);
+
+    int ret = reset_timepoint(next_timepoint);
+    if (ret != 0) {
+        // reset_timepoint() failed - restore the original descriptor (drops the guard) and return
+        __atomic_store_n(&fast_clock->atomic_descriptor.desc, descriptor.desc, __ATOMIC_RELAXED);
+        return ret;
+    }
+
+    // use current clock freq until the FC_STATE_RDTSC_RECALIBRATE state ends and the new
+    // clock_freq can be calculated
+    next_timepoint->clock_freq = timepoint->clock_freq;
+    reset_expiration(next_timepoint, RDTSC_CALIBRATION_TIME);
+    __atomic_store_n(&fast_clock->atomic_descriptor.desc, next_desc.desc, __ATOMIC_RELEASE);
+
+    return ret;
+}
+
+/*
+ * FC_STATE_RDTSC_RECALIBRATE handler. Calculates the time from the current timepoint without an
+ * OCALL. When the timepoint has expired, the caller that wins the state change guard queries the
+ * time with an OCALL, recalculates the clock frequency and moves the clock back to
+ * FC_STATE_RDTSC; that caller returns the OCALL time in `*time_usec`. Returns 0 on success or the
+ * error code of ocall_gettime().
+ */
+static inline int handle_state_rdtsc_recalibrate(fast_clock* fast_clock, fast_clock_desc descriptor, uint64_t* time_usec)
+{
+    fast_clock_timepoint* timepoint = get_timepoint(fast_clock, descriptor);
+
+    // fast path - calculate time with rdtsc
+    calc_time(timepoint, time_usec);
+    if (!is_expired(timepoint, *time_usec) || !set_change_state_guard(fast_clock, descriptor)) {
+        return 0;
+    }
+
+    uint64_t tsc = 0;
+    int ret = ocall_gettime(time_usec, &tsc);
+    if (ret != 0) {
+        // OCALL failed - restore the original descriptor (drops the guard) and return
+        __atomic_store_n(&fast_clock->atomic_descriptor.desc, descriptor.desc, __ATOMIC_RELAXED);
+        return ret;
+    }
+
+    reset_clock_frequency(timepoint, tsc, *time_usec);
+    reset_expiration(timepoint, RDTSC_RECALIBRATION_INTERVAL);
+    fast_clock_desc next_desc = advance_state(descriptor, FC_STATE_RDTSC, false);
+    __atomic_store_n(&fast_clock->atomic_descriptor.desc, next_desc.desc, __ATOMIC_RELEASE);
+
+    return ret;
+}
+
+/*
+ * Stores the current time in microseconds in `*time_usec`. Loads the descriptor once and
+ * dispatches to the handler of the state it contains; unknown states are handled like
+ * FC_STATE_RDTSC_DISABLED. Returns 0 on success or the error code reported by the handler.
+ */
+int fast_clock_get_time(fast_clock* fast_clock, uint64_t* time_usec)
+{
+    fast_clock_desc descriptor = {
+        .desc = __atomic_load_n(&fast_clock->atomic_descriptor.desc, __ATOMIC_ACQUIRE),
+    };
+    switch (descriptor.state)
+    {
+    case FC_STATE_RDTSC:
+        return handle_state_rdtsc(fast_clock, descriptor, time_usec);
+    case FC_STATE_RDTSC_RECALIBRATE:
+        return handle_state_rdtsc_recalibrate(fast_clock, descriptor, time_usec);
+    case FC_STATE_CALIBRATING:
+        return handle_state_calibrating(fast_clock, descriptor, time_usec);
+    case FC_STATE_INIT:
+        return handle_state_init(fast_clock, descriptor, time_usec);
+    case FC_STATE_RDTSC_DISABLED:
+    default:
+        return handle_state_rdtsc_disabled(time_usec);
+    }
+}
+
+/* Returns true unless the clock is in the FC_STATE_RDTSC_DISABLED state. */
+bool fast_clock_is_enabled(const fast_clock* fast_clock)
+{
+    fast_clock_desc descriptor = {
+        .desc = __atomic_load_n(&fast_clock->atomic_descriptor.desc, __ATOMIC_RELAXED),
+    };
+    return (descriptor.state != FC_STATE_RDTSC_DISABLED);
+}
+
+/* Switches the clock to the FC_STATE_RDTSC_DISABLED state. */
+void fast_clock_disable(fast_clock* fast_clock)
+{
+    /* We need to busy-loop until the state change guard is acquired here - since fast-clock
+     * might be in the midst of transitioning states. We can't simply store the DISABLED state. */
+    fast_clock_desc descriptor;
+    do {
+        descriptor.desc = __atomic_load_n(&fast_clock->atomic_descriptor.desc, __ATOMIC_ACQUIRE);
+    } while (!set_change_state_guard(fast_clock, descriptor));
+
+    fast_clock_desc disabled_desc = advance_state(descriptor, FC_STATE_RDTSC_DISABLED, false);
+    __atomic_store_n(&fast_clock->atomic_descriptor.desc, disabled_desc.desc, __ATOMIC_RELEASE);
+}
diff --git a/pal/src/host/linux-sgx/utils/fast_clock.h b/pal/src/host/linux-sgx/utils/fast_clock.h
new file mode 100644
index 0000000000..6d85452101
--- /dev/null
+++ b/pal/src/host/linux-sgx/utils/fast_clock.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "api.h"
+
+
+#define _FC_NUM_TIMEPOINT_BITS (1)
+#define FC_NUM_TIMEPOINTS (1<<_FC_NUM_TIMEPOINT_BITS)
+
+/* Packed 16-bit descriptor of the fast clock. `desc` aliases the bit-fields so that the whole
+ * descriptor can be loaded, stored and compare-exchanged as a single uint16_t. */
+typedef union
+{
+    struct
+    {
+        /* current FC_STATE_* value */
+        uint16_t state : 4;
+        /* index into `fast_clock.time_points` of the timepoint currently in use */
+        uint16_t timepoint_index : _FC_NUM_TIMEPOINT_BITS;
+        uint16_t _pad0 : (16 - _FC_NUM_TIMEPOINT_BITS - 5);
+        /* set to 1 while a thread is in the middle of a state transition */
+        uint16_t state_changing : 1;
+    };
+
+    uint16_t desc;
+} fast_clock_desc;
+
+static_assert(_FC_NUM_TIMEPOINT_BITS >= 1, "timepoint_index must have at minimum 1-bit");
+static_assert(_FC_NUM_TIMEPOINT_BITS + 5 <= 16, "timepoint_index uses too many bits");
+static_assert(sizeof(fast_clock_desc) == sizeof(uint16_t), "fast_clock_desc size mismatch");
+
+/* Calibration data used to derive the current time from the TSC.
+ * NOTE(review): field meanings are inferred from their names and users - presumably `tsc0` and
+ * `t0_usec` are a TSC reading and the matching time in microseconds, `clock_freq` is the
+ * estimated TSC frequency and `expiration_usec` is the time at which the timepoint expires. */
+typedef struct
+{
+    uint64_t clock_freq;
+    uint64_t tsc0;
+    uint64_t t0_usec;
+    uint64_t expiration_usec;
+} fast_clock_timepoint;
+
+/* Fast clock instance. fast_clock.c accesses `atomic_descriptor` through `__atomic_*` builtins
+ * on its `desc` member; `time_points` holds the slots selected by `timepoint_index`. */
+typedef struct
+{
+    fast_clock_desc atomic_descriptor;
+    fast_clock_timepoint time_points[FC_NUM_TIMEPOINTS];
+} fast_clock;
+
+extern fast_clock g_fast_clock;
+
+/* Stores the current time in microseconds in `*time_usec`; returns 0 on success, otherwise the
+ * error code of the failed time query. */
+int fast_clock_get_time(fast_clock* fast_clock, uint64_t* time_usec);
+/* Returns true unless the clock is in the FC_STATE_RDTSC_DISABLED state. */
+bool fast_clock_is_enabled(const fast_clock* fast_clock);
+/* Switches the clock to FC_STATE_RDTSC_DISABLED, busy-waiting until it can acquire the state
+ * change guard (i.e. until no other state transition is in progress). */
+void fast_clock_disable(fast_clock* fast_clock);