Skip to content

Commit

Permalink
[PAL/LinuxSGX] enable fast-clock implementation of pal time query (ge…
Browse files Browse the repository at this point in the history
…ttimeofday)

Signed-off-by: Jonathan Shamir <[email protected]>
  • Loading branch information
jonathan-sha committed Apr 8, 2024
1 parent 4f1ebde commit e8f263a
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 228 deletions.
2 changes: 0 additions & 2 deletions common/include/arch/x86_64/cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ enum extended_state_sub_leaf {
#define PROC_FREQ_LEAF 0x16
#define AMX_TILE_INFO_LEAF 0x1D
#define AMX_TMUL_INFO_LEAF 0x1E
#define HYPERVISOR_INFO_LEAF 0x40000000
#define HYPERVISOR_VMWARE_TIME_LEAF 0x40000010
#define MAX_INPUT_EXT_VALUE_LEAF 0x80000000
#define EXT_SIGNATURE_AND_FEATURES_LEAF 0x80000001
#define CPU_BRAND_LEAF 0x80000002
Expand Down
7 changes: 4 additions & 3 deletions pal/src/host/linux-sgx/pal_exception.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,12 @@ static void save_pal_context(PAL_CONTEXT* ctx, sgx_cpu_context_t* uc,
}
}

#include "utils/fast_clock.h"

static void emulate_rdtsc_and_print_warning(sgx_cpu_context_t* uc) {
if (FIRST_TIME()) {
/* if we end up emulating RDTSC/RDTSCP instruction, we cannot use invariant TSC */
extern uint64_t g_tsc_hz;
g_tsc_hz = 0;
/* if we end up emulating RDTSC/RDTSCP instruction, we cannot use TSC-based clock emulation */
fast_clock_disable(&g_fast_clock);
log_warning("all RDTSC/RDTSCP instructions are emulated (imprecisely) via gettime() "
"syscall.");
}
Expand Down
2 changes: 0 additions & 2 deletions pal/src/host/linux-sgx/pal_linux.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,6 @@ void _PalExceptionHandler(uint32_t trusted_exit_info_,
uint32_t untrusted_external_event, sgx_cpu_context_t* uc,
PAL_XREGS_STATE* xregs_state, sgx_arch_exinfo_t* exinfo);

void init_tsc(void);

int init_cpuid(void);

int init_enclave(void);
Expand Down
22 changes: 5 additions & 17 deletions pal/src/host/linux-sgx/pal_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,6 @@ static int import_and_init_extra_runtime_domain_names(struct pal_dns_host_conf*
extern void* g_enclave_base;
extern void* g_enclave_top;
extern bool g_allowed_files_warn;
extern uint64_t g_tsc_hz;
extern size_t g_unused_tcs_pages_num;

static int print_warnings_on_insecure_configs(PAL_HANDLE parent_process) {
Expand Down Expand Up @@ -552,14 +551,6 @@ static int print_warnings_on_insecure_configs(PAL_HANDLE parent_process) {
return ret;
}

/* Emit a one-time performance warning when no TSC frequency could be determined (`g_tsc_hz` is
 * zero). Stays silent when `parent_process` is set, so only the first process prints the
 * message. */
static void print_warning_on_invariant_tsc(PAL_HANDLE parent_process) {
    if (parent_process || g_tsc_hz) {
        /* either not the first process, or the TSC frequency is known -- nothing to report */
        return;
    }

    log_warning("Could not set up Invariant TSC (CPU is too old or you run on a VM that does "
                "not expose corresponding CPUID leaves). This degrades performance.");
}

static void print_warnings_on_invalid_dns_host_conf(PAL_HANDLE parent_process) {
if (!parent_process) {
/* Warn only in the first process. */
Expand All @@ -581,8 +572,6 @@ static void post_callback(void) {
ocall_exit(1, /*is_exitgroup=*/true);
}

print_warning_on_invariant_tsc(g_pal_common_state.parent_process);

print_warnings_on_invalid_dns_host_conf(g_pal_common_state.parent_process);
}

Expand Down Expand Up @@ -721,12 +710,11 @@ noreturn void pal_linux_main(void* uptr_libpal_uri, size_t libpal_uri_len, void*

SET_ENCLAVE_TCB(ready_for_exceptions, 1UL);

/* initialize "Invariant TSC" HW feature for fast and accurate gettime and immediately probe
* RDTSC instruction inside SGX enclave (via dummy get_tsc) -- it is possible that
* the CPU supports invariant TSC but doesn't support executing RDTSC inside SGX enclave, in
* this case the SIGILL exception is generated and leads to emulate_rdtsc_and_print_warning()
* which unsets invariant TSC, and we end up falling back to the slower ocall_gettime() */
init_tsc();
/* We implement a "fast-path" clock that is emulated internally using the x86 RDTSC instruction.
* It is possible that the CPU does not support the RDTSC instruction within an SGX enclave;
* in this case a SIGILL exception is generated and leads to emulate_rdtsc_and_print_warning(),
* which disables the TSC-based clock, and we end up falling back to the slower ocall_gettime().
*/
(void)get_tsc(); /* must be after `ready_for_exceptions=1` since it may generate SIGILL */

ret = init_cpuid();
Expand Down
208 changes: 4 additions & 204 deletions pal/src/host/linux-sgx/pal_misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,208 +22,10 @@
#include "spinlock.h"
#include "toml_utils.h"
#include "topo_info.h"

/* The timeout of 50ms was found to be a safe TSC drift correction periodicity based on results
* from multiple systems. Any higher or lower could pose risks of negative time drift or
* performance hit respectively.
*/
#define TSC_REFINE_INIT_TIMEOUT_USECS 50000

uint64_t g_tsc_hz = 0; /* TSC frequency for fast and accurate time ("invariant TSC" HW feature) */
static uint64_t g_start_tsc = 0;
static uint64_t g_start_usec = 0;
static seqlock_t g_tsc_lock = INIT_SEQLOCK_UNLOCKED;

/* Report whether bit 8 of EDX in CPUID leaf INVARIANT_TSC_LEAF (sub-leaf 0) is set; the callers
 * treat a set bit as "TSC may be used for timekeeping". */
static bool is_tsc_usable(void) {
    const uint32_t invariant_tsc_mask = 1u << 8;
    uint32_t regs[CPUID_WORD_NUM];

    _PalCpuIdRetrieve(INVARIANT_TSC_LEAF, 0, regs);

    return (regs[CPUID_WORD_EDX] & invariant_tsc_mask) != 0;
}

/* Derive the TSC frequency (in Hz) from the architectural CPUID leaves; returns 0 if the
 * required values are not enumerated. */
static uint64_t get_tsc_hz_baremetal(void) {
    uint32_t regs[CPUID_WORD_NUM];

    /*
     * "Time Stamp Counter and Nominal Core Crystal Clock Information" leaf:
     *   - EAX is denominator of the TSC/"core crystal clock" ratio,
     *   - EBX is numerator of the TSC/"core crystal clock" ratio,
     *   - ECX is core crystal clock (nominal) frequency in Hz,
     * so that TSC frequency = ECX * EBX / EAX.
     */
    _PalCpuIdRetrieve(TSC_FREQ_LEAF, 0, regs);
    const uint32_t ratio_denominator = regs[CPUID_WORD_EAX];
    const uint32_t ratio_numerator   = regs[CPUID_WORD_EBX];
    const uint32_t crystal_hz        = regs[CPUID_WORD_ECX];

    if (ratio_denominator == 0 || ratio_numerator == 0) {
        /* TSC/core crystal clock ratio is not enumerated, can't use RDTSC for accurate time */
        return 0;
    }

    if (crystal_hz != 0) {
        /* widen to 64 bits before multiplying so that the product cannot overflow */
        return (uint64_t)crystal_hz * ratio_numerator / ratio_denominator;
    }

    /* Some Intel CPUs do not report the nominal crystal clock frequency; fall back to the
     * Processor Frequency Information Leaf (CPUID 16H), which always exists if the TSC Frequency
     * Leaf exists. Logic is taken from Linux 5.11's arch/x86/kernel/tsc.c. */
    _PalCpuIdRetrieve(PROC_FREQ_LEAF, 0, regs);
    const uint32_t base_mhz = regs[CPUID_WORD_EAX];

    if (base_mhz == 0) {
        /* processor base frequency (in MHz) is not enumerated, can't calculate frequency */
        return 0;
    }

    /* MHz -> Hz, again widened to 64 bits first to prevent integer overflow */
    return (uint64_t)base_mhz * 1000000;
}

/* Obtain the virtual TSC frequency (in Hz) from hypervisor-specific CPUID leaves; returns 0 if
 * the hypervisor is not recognized or does not enumerate the frequency. */
static uint64_t get_tsc_hz_hypervisor(void) {
    uint32_t regs[CPUID_WORD_NUM];

    /*
     * We rely on the Generic CPUID space for hypervisors:
     * - 0x40000000: EAX: The maximum input value for CPUID supported by the hypervisor
     * - EBX, ECX, EDX: Hypervisor vendor ID signature (hypervisor_id)
     *
     * If we detect QEMU/KVM or Cloud Hypervisor/KVM (hypervisor_id = "KVMKVMKVM") or VMWare
     * ("VMwareVMware"), then we assume that leaf 0x40000010 contains virtual TSC frequency in kHz
     * in EAX. We check hypervisor_id because leaf 0x40000010 is not standardized and e.g. Microsoft
     * Hyper-V may use it for other purposes.
     *
     * Relevant materials:
     * - https://github.com/qemu/qemu/commit/9954a1582e18b03ddb66f6c892dccf2c3508f4b2
     * - qemu/target/i386/cpu.h, qemu/target/i386/cpu.c, qemu/target/i386/kvm/kvm.c sources
     * - https://github.com/freebsd/freebsd-src/blob/9df6eea/sys/x86/x86/identcpu.c#L1372-L1377 (for
     * the list of hypervisor_id values)
     */
    _PalCpuIdRetrieve(HYPERVISOR_INFO_LEAF, 0, regs);

    const uint32_t max_hypervisor_leaf = regs[CPUID_WORD_EAX];
    const uint32_t sig_ebx = regs[CPUID_WORD_EBX];
    const uint32_t sig_ecx = regs[CPUID_WORD_ECX];
    const uint32_t sig_edx = regs[CPUID_WORD_EDX];

    /* signature words below are the little-endian ASCII bytes of the hypervisor_id strings */
    const bool vendor_is_kvm = sig_ebx == 0x4b4d564b     /* "KVMK" */
                               && sig_ecx == 0x564b4d56  /* "VMKV" */
                               && sig_edx == 0x0000004d; /* "M\0\0\0" */
    const bool vendor_is_vmware = sig_ebx == 0x61774d56     /* "VMwa" */
                                  && sig_ecx == 0x4d566572  /* "reVM" */
                                  && sig_edx == 0x65726177; /* "ware" */

    if (!(vendor_is_kvm || vendor_is_vmware)) {
        /* not a hypervisor that contains "virtual TSC frequency" in leaf 0x40000010 */
        return 0;
    }

    if (max_hypervisor_leaf < HYPERVISOR_VMWARE_TIME_LEAF) {
        /* virtual TSC frequency is not available */
        return 0;
    }

    _PalCpuIdRetrieve(HYPERVISOR_VMWARE_TIME_LEAF, 0, regs);
    const uint32_t tsc_khz = regs[CPUID_WORD_EAX];

    if (tsc_khz == 0) {
        /* TSC frequency (in kHz) is not enumerated, can't calculate frequency */
        return 0;
    }

    /* kHz -> Hz; widen to 64 bits first to prevent integer overflow */
    return (uint64_t)tsc_khz * 1000;
}

/* Set up `g_tsc_hz` for TSC-based date/time emulation. `g_tsc_hz` is left untouched if
 * is_tsc_usable() reports false; otherwise it receives the bare-metal frequency, or -- when that
 * is zero -- whatever the hypervisor probe returns (possibly zero as well). */
void init_tsc(void) {
    if (!is_tsc_usable()) {
        return;
    }

    g_tsc_hz = get_tsc_hz_baremetal();
    if (!g_tsc_hz) {
        /* hypervisors may not expose crystal-clock frequency CPUID leaves, so instead try
         * hypervisor-special synthetic CPUID leaf 0x40000010 (VMWare-style Timing Information) */
        g_tsc_hz = get_tsc_hz_hypervisor();
    }
}
#include "utils/fast_clock.h"

/*
 * Report the current time in microseconds through `out_usec`.
 *
 * Fast path: if a baseline (TSC, usec) pair is recorded and less than
 * TSC_REFINE_INIT_TIMEOUT_USECS have elapsed since it was taken, the time is extrapolated from
 * the baseline using RDTSC and `g_tsc_hz`, without leaving the enclave.
 * Slow path: otherwise the time is obtained via ocall_gettime() and the baseline is refreshed.
 *
 * Returns 0 on success; the result of ocall_gettime() when `g_tsc_hz` is zero;
 * -PAL_ERROR_OVERFLOW if a timestamp computation wraps around; -PAL_ERROR_DENIED if
 * ocall_gettime() fails while refreshing the baseline.
 */
int _PalSystemTimeQuery(uint64_t* out_usec) {
    int ret;

    if (!g_tsc_hz) {
        /* RDTSC is not allowed or no Invariant TSC feature -- fallback to the slow ocall */
        return ocall_gettime(out_usec);
    }

    /* take a consistent snapshot of the baseline pair; retry if a writer updated it meanwhile */
    uint32_t seq;
    uint64_t start_tsc;
    uint64_t start_usec;
    do {
        seq = read_seqbegin(&g_tsc_lock);
        start_tsc = g_start_tsc;
        start_usec = g_start_usec;
    } while (read_seqretry(&g_tsc_lock, seq));

    uint64_t usec = 0;
    /* Last seen RDTSC-calculated time value. This guards against time rewinding. */
    static uint64_t last_usec = 0;
    if (start_tsc > 0 && start_usec > 0) {
        /* baseline TSC/usec pair was initialized, can calculate time via RDTSC (but should be
         * careful with integer overflow during calculations) */
        uint64_t diff_tsc = get_tsc() - start_tsc;
        /* the bound guarantees that `diff_tsc * 1000000` below fits into 64 bits */
        if (diff_tsc < UINT64_MAX / 1000000) {
            uint64_t diff_usec = diff_tsc * 1000000 / g_tsc_hz;
            if (diff_usec < TSC_REFINE_INIT_TIMEOUT_USECS) {
                /* less than TSC_REFINE_INIT_TIMEOUT_USECS passed from the previous update of
                 * TSC/usec pair (time drift is contained), use the RDTSC-calculated time */
                usec = start_usec + diff_usec;
                if (usec < start_usec)
                    return -PAL_ERROR_OVERFLOW;

                /* It's simply `last_usec = max(last_usec, usec)`, but executed atomically. */
                uint64_t expected_usec = __atomic_load_n(&last_usec, __ATOMIC_ACQUIRE);
                while (expected_usec < usec) {
                    /* on failure `expected_usec` is reloaded with the current `last_usec` */
                    if (__atomic_compare_exchange_n(&last_usec, &expected_usec, usec,
                                                    /*weak=*/true, __ATOMIC_RELEASE,
                                                    __ATOMIC_ACQUIRE)) {
                        break;
                    }
                }

                /* never report a value smaller than one that was already observed in
                 * `last_usec` */
                *out_usec = MAX(usec, expected_usec);
                return 0;
            }
        }
    }

    /* if we are here, either the baseline TSC/usec pair was not yet initialized or too much time
     * passed since the previous TSC/usec update, so let's refresh them to contain the time drift */
    uint64_t tsc_cyc1 = get_tsc();
    ret = ocall_gettime(&usec);
    if (ret < 0)
        return -PAL_ERROR_DENIED;
    uint64_t tsc_cyc2 = get_tsc();

    uint64_t last_recorded_rdtsc = __atomic_load_n(&last_usec, __ATOMIC_ACQUIRE);
    if (usec < last_recorded_rdtsc) {
        /* the new OCALL-obtained timestamp (`usec`) is earlier than the last recorded timestamp
         * from RDTSC (`last_recorded_rdtsc`); this can happen if the actual host time drifted
         * backwards compared to the RDTSC time. */
        usec = last_recorded_rdtsc;
    }

    /* we need to match the OCALL-obtained timestamp (`usec`) with the RDTSC-obtained number of
     * cycles (`tsc_cyc`); since OCALL is a time-consuming operation, we estimate `tsc_cyc` as a
     * mid-point between the RDTSC values obtained right-before and right-after the OCALL. */
    uint64_t tsc_cyc = tsc_cyc1 + (tsc_cyc2 - tsc_cyc1) / 2;
    if (tsc_cyc < tsc_cyc1)
        return -PAL_ERROR_OVERFLOW;

    /* refresh the baseline data if no other thread updated g_start_tsc */
    write_seqbegin(&g_tsc_lock);
    if (g_start_tsc < tsc_cyc) {
        g_start_tsc = tsc_cyc;
        g_start_usec = usec;
    }
    write_seqend(&g_tsc_lock);

    *out_usec = usec;
    return 0;
    /* NOTE(review): the statement below is unreachable because of the `return 0` right above it.
     * It looks like the replacement implementation from the other side of a diff that got
     * interleaved with the old body -- confirm which of the two bodies is intended. */
    return fast_clock_get_time(&g_fast_clock, out_usec, false);
}

static uint32_t g_extended_feature_flags_max_supported_sub_leaves = 0;
Expand Down Expand Up @@ -512,10 +314,8 @@ static const struct cpuid_leaf cpuid_known_leaves[] = {
{.leaf = 0x1F, .zero_subleaf = false, .cache = false}, /* Intel V2 Ext Topology Enumeration */
/* basic CPUID leaf functions end here */

/* hypervisor-specific CPUID leaf functions (0x40000000 - 0x400000FF) start here */
{.leaf = 0x40000000, .zero_subleaf = true, .cache = true}, /* CPUID Info */
{.leaf = 0x40000010, .zero_subleaf = true, .cache = true}, /* VMWare-style Timing Info */
/* NOTE: currently only the above two leaves are used, see also get_tsc_hz_hypervisor() */
/* hypervisor-specific CPUID leaf functions (0x40000000 - 0x400000FF) */
/* not used, see code below */

/* invalid CPUID leaf functions (no existing or future CPU will return any meaningful
* information in these leaves) occupy 0x40000100 - 0x4FFFFFFF -- they are treated the same as
Expand Down

0 comments on commit e8f263a

Please sign in to comment.