diff --git a/common/include/arch/x86_64/cpu.h b/common/include/arch/x86_64/cpu.h
index 8eaeef1c01..b40bac8d4f 100644
--- a/common/include/arch/x86_64/cpu.h
+++ b/common/include/arch/x86_64/cpu.h
@@ -44,6 +44,8 @@ enum extended_state_sub_leaf {
 #define PROC_FREQ_LEAF 0x16
 #define AMX_TILE_INFO_LEAF 0x1D
 #define AMX_TMUL_INFO_LEAF 0x1E
+#define HYPERVISOR_INFO_LEAF 0x40000000
+#define HYPERVISOR_TIME_LEAF 0x40000010
 #define MAX_INPUT_EXT_VALUE_LEAF 0x80000000
 #define EXT_SIGNATURE_AND_FEATURES_LEAF 0x80000001
 #define CPU_BRAND_LEAF 0x80000002
diff --git a/pal/src/host/linux-sgx/pal_linux.h b/pal/src/host/linux-sgx/pal_linux.h
index 643b3533bd..35b523f594 100644
--- a/pal/src/host/linux-sgx/pal_linux.h
+++ b/pal/src/host/linux-sgx/pal_linux.h
@@ -101,8 +101,6 @@ void _PalExceptionHandler(unsigned int exit_info, sgx_cpu_context_t* uc,
  * its underlying type. */
 void _PalHandleExternalEvent(long event_, sgx_cpu_context_t* uc, PAL_XREGS_STATE* xregs_state);
 
-bool is_tsc_usable(void);
-uint64_t get_tsc_hz(void);
 void init_tsc(void);
 
 int init_cpuid(void);
diff --git a/pal/src/host/linux-sgx/pal_misc.c b/pal/src/host/linux-sgx/pal_misc.c
index b2ca85f5d9..e88e6bbefc 100644
--- a/pal/src/host/linux-sgx/pal_misc.c
+++ b/pal/src/host/linux-sgx/pal_misc.c
@@ -23,24 +23,208 @@
 #include "toml_utils.h"
 #include "topo_info.h"
 
-/* The timeout of 50ms was found to be a safe TSC drift correction periodicity based on results
- * from multiple systems. Any higher or lower could pose risks of negative time drift or
- * performance hit respectively.
- */
+/* The timeout of 50ms was found to be a safe TSC drift correction periodicity based on results from
+ * multiple systems. Any higher or lower could pose risks of negative time drift or performance hit
+ * respectively. */
 #define TSC_REFINE_INIT_TIMEOUT_USECS 50000
 
+/* frequency unit conversions to Hz (argument is parenthesized, so expressions are safe) */
+#define KHZ_TO_HZ(x) ((x) * 1000)
+#define MHZ_TO_HZ(x) (KHZ_TO_HZ(x) * 1000)
+#define GHZ_TO_HZ(x) (MHZ_TO_HZ(x) * 1000)
+#define THZ_TO_HZ(x) (GHZ_TO_HZ(x) * 1000)
+
 uint64_t g_tsc_hz = 0; /* TSC frequency for fast and accurate time ("invariant TSC" HW feature) */
 static uint64_t g_start_tsc = 0;
 static uint64_t g_start_usec = 0;
 static seqlock_t g_tsc_lock = INIT_SEQLOCK_UNLOCKED;
 
-/**
- * Initialize the data structures used for date/time emulation using TSC
- */
-void init_tsc(void) {
-    if (is_tsc_usable()) {
-        g_tsc_hz = get_tsc_hz();
+/* return true if CPUID reports bit 8 of EDX in INVARIANT_TSC_LEAF (invariant TSC feature) */
+static bool is_tsc_usable(void) {
+    uint32_t words[CPUID_WORD_NUM];
+    _PalCpuIdRetrieve(INVARIANT_TSC_LEAF, 0, words);
+    return words[CPUID_WORD_EDX] & (1u << 8);
+}
+
+/* return TSC frequency or 0 if invariant TSC is not supported */
+static uint64_t get_tsc_hz_baremetal(void) {
+    uint32_t words[CPUID_WORD_NUM];
+
+    _PalCpuIdRetrieve(TSC_FREQ_LEAF, 0, words);
+    if (!words[CPUID_WORD_EAX] || !words[CPUID_WORD_EBX]) {
+        /* TSC/core crystal clock ratio is not enumerated, can't use RDTSC for accurate time */
+        return 0;
+    }
+
+    if (words[CPUID_WORD_ECX] > 0) {
+        /* calculate TSC frequency as core crystal clock frequency (ECX) * EBX / EAX; cast to 64-bit
+         * first to prevent integer overflow */
+        uint64_t ecx_hz = words[CPUID_WORD_ECX];
+        return ecx_hz * words[CPUID_WORD_EBX] / words[CPUID_WORD_EAX];
+    }
+
+    /* some Intel CPUs do not report nominal frequency of crystal clock, let's calculate it
+     * based on Processor Frequency Information Leaf (CPUID 16H); this leaf always exists if
+     * TSC Frequency Leaf exists; logic is taken from Linux 5.11's arch/x86/kernel/tsc.c */
+    _PalCpuIdRetrieve(PROC_FREQ_LEAF, 0, words);
+    if (!words[CPUID_WORD_EAX]) {
+        /* processor base frequency (in MHz) is not enumerated, can't calculate frequency */
+        return 0;
+    }
+
+    /* processor base frequency is in MHz but we need to return TSC frequency in Hz; cast to 64-bit
+     * first to prevent integer overflow */
+    uint64_t base_frequency_mhz = words[CPUID_WORD_EAX];
+    return MHZ_TO_HZ(base_frequency_mhz);
+}
+
+/* return TSC frequency or 0 if the hypervisor is unrecognized or does not report it */
+static uint64_t get_tsc_hz_hypervisor(void) {
+    uint32_t words[CPUID_WORD_NUM];
+
+    /*
+     * We rely on the Generic CPUID space for hypervisors:
+     *   - 0x40000000: EAX: The maximum input value for CPUID supported by the hypervisor
+     *   -             EBX, ECX, EDX: Hypervisor vendor ID signature (`hypervisor_id`)
+     *
+     * If we detect QEMU/KVM or Cloud Hypervisor/KVM (hypervisor_id = "KVMKVMKVM") or VMware
+     * ("VMwareVMware"), then we assume that leaf 0x40000010 contains virtual TSC frequency in kHz
+     * in EAX. We check hypervisor_id because leaf 0x40000010 is not standardized and e.g. Microsoft
+     * Hyper-V may use it for other purposes.
+     *
+     * Relevant materials:
+     * - https://github.com/qemu/qemu/commit/9954a1582e18b03ddb66f6c892dccf2c3508f4b2
+     * - qemu/target/i386/cpu.h, qemu/target/i386/cpu.c, qemu/target/i386/kvm/kvm.c sources
+     * - https://github.com/freebsd/freebsd-src/blob/9df6eea/sys/x86/x86/identcpu.c#L1372-L1377 (for
+     *   the list of hypervisor_id values)
+     */
+    _PalCpuIdRetrieve(HYPERVISOR_INFO_LEAF, 0, words);
+
+    uint32_t hypervisor_id[3];
+    hypervisor_id[0] = words[CPUID_WORD_EBX];
+    hypervisor_id[1] = words[CPUID_WORD_ECX];
+    hypervisor_id[2] = words[CPUID_WORD_EDX];
+    if (memcmp(hypervisor_id, "KVMKVMKVM\0\0\0", 12) && memcmp(hypervisor_id, "VMwareVMware", 12)) {
+        /* QEMU/KVM, Cloud Hypervisor/KVM and VMware expose TSC frequency in leaf 0x40000010 */
+        return 0;
+    }
+
+    if (words[CPUID_WORD_EAX] < HYPERVISOR_TIME_LEAF) {
+        /* virtual TSC frequency is not available */
+        return 0;
+    }
+
+    _PalCpuIdRetrieve(HYPERVISOR_TIME_LEAF, 0, words);
+    if (!words[CPUID_WORD_EAX]) {
+        /* TSC frequency (in kHz) is not enumerated, can't calculate frequency */
+        return 0;
+    }
+
+    /* TSC frequency is in kHz but we need to return TSC frequency in Hz; cast to 64-bit first to
+     * prevent integer overflow */
+    uint64_t tsc_frequency_khz = words[CPUID_WORD_EAX];
+    return KHZ_TO_HZ(tsc_frequency_khz);
+}
+
+/* return TSC frequency or 0 if cannot parse CPU brand string */
+static uint64_t get_tsc_hz_cpu_model_name(void) {
+    uint32_t words[CPUID_WORD_NUM];
+
+    char brand_string[48 + 1] = {0};
+    static_assert(sizeof(brand_string) == sizeof(uint32_t) * CPUID_WORD_NUM * 3 + 1,
+                  "wrong sizeof(brand_string)");
+
+    _PalCpuIdRetrieve(CPU_BRAND_LEAF, 0, words);
+    memcpy(&brand_string[ 0], words, sizeof(uint32_t) * CPUID_WORD_NUM);
+    _PalCpuIdRetrieve(CPU_BRAND_CNTD_LEAF, 0, words);
+    memcpy(&brand_string[16], words, sizeof(uint32_t) * CPUID_WORD_NUM);
+    _PalCpuIdRetrieve(CPU_BRAND_CNTD2_LEAF, 0, words);
+    memcpy(&brand_string[32], words, sizeof(uint32_t) * CPUID_WORD_NUM);
+    brand_string[sizeof(brand_string) - 1] = '\0';
+
+    /* we roughly follow the algo suggested in the Intel SDM (specifically "Algorithm for Extracting
+     * Processor Frequency" in Section 3.2 CPUID, Volume. 2A) */
+    const char* hz_str = &brand_string[sizeof(brand_string) - 1];
+    while (hz_str > brand_string && *hz_str == '\0')
+        hz_str--;
+    if (hz_str - brand_string < 3 || *hz_str-- != 'z' || *hz_str-- != 'H')
+        return 0;
+
+    uint64_t multiplier = 0;
+    if (*hz_str == 'T')
+        multiplier = THZ_TO_HZ(1UL);
+    else if (*hz_str == 'G')
+        multiplier = GHZ_TO_HZ(1UL);
+    else if (*hz_str == 'M')
+        multiplier = MHZ_TO_HZ(1UL);
+    else
+        return 0;
+
+    /* scan digits in reverse order until we hit space/tab or beginning of string */
+    const char* s = hz_str;
+    while (s > brand_string && *s != ' ' && *s != '\t')
+        s--;
+
+    char* end = NULL;
+    long base = 0, fractional = 0;
+
+    base = strtol(s, &end, 10);
+    if (end == s) {
+        /* no frequency specified at all (no base digits found) */
+        return 0;
     }
+    if (base < 0 || base >= 1000) {
+        /* unsupported format of smth like "-3GHz" or "1000GHz" (but "0.8GHz" is supported) */
+        return 0;
+    }
+    s = end;
+
+    if (*s == '.') {
+        s++;
+        fractional = strtol(s, &end, 10);
+        if (fractional < 0 || end - s > 3) {
+            /* don't support negative fractional or more than 3 digits after dot */
+            return 0;
+        }
+        for (int i = 0; i < 3 - (end - s); i++)
+            fractional *= 10;
+        s = end;
+    }
+
+    if (s != hz_str) {
+        /* frequency number is not immediately followed by "MHz", "GHz", "THz" suffix */
+        return 0;
+    }
+
+    /* base and fractional are less than 1000, so no danger of int overflow */
+    assert(base < 1000 && fractional < 1000 && multiplier > 0);
+    return base * multiplier + fractional * multiplier / 1000;
+}
+
+/* initialize the data structures used for date/time emulation using TSC */
+void init_tsc(void) {
+    if (!is_tsc_usable())
+        return;
+
+    g_tsc_hz = get_tsc_hz_baremetal();
+    if (g_tsc_hz)
+        return;
+
+    /* hypervisors may not expose crystal-clock frequency CPUID leaves, so instead try
+     * hypervisor-special synthetic CPUID leaf 0x40000010 (Timing Information) */
+    g_tsc_hz = get_tsc_hz_hypervisor();
+    if (g_tsc_hz)
+        return;
+
+    /* final fallback -- parse "Processor Brand String" CPUID leaves (guaranteed to exist on CPUs
+     * with SGX), extract the CPU frequency from there and assume it reflects TSC frequency */
+    g_tsc_hz = get_tsc_hz_cpu_model_name();
+    if (g_tsc_hz)
+        return;
+
+    /* can't use log_warning because at this point we didn't parse the manifest yet */
+    log_error("Could not set up Invariant TSC (CPU is too old or you run on a VM that does not "
+              "expose corresponding CPUID leaves). This degrades performance.");
 }
 
 /* TODO: result comes from the untrusted host, introduce some schielding */
@@ -396,8 +580,13 @@ static const struct cpuid_leaf cpuid_known_leaves[] = {
     {.leaf = 0x1F, .zero_subleaf = false, .cache = false}, /* Intel V2 Ext Topology Enumeration */
     /* basic CPUID leaf functions end here */
 
+    /* hypervisor-specific CPUID leaf functions (0x40000000 - 0x400000FF) start here */
+    {.leaf = 0x40000000, .zero_subleaf = true, .cache = true}, /* CPUID Info */
+    {.leaf = 0x40000010, .zero_subleaf = true, .cache = true}, /* Timing Info */
+    /* NOTE: currently only the above two leaves are used, see also get_tsc_hz_hypervisor() */
+
     /* invalid CPUID leaf functions (no existing or future CPU will return any meaningful
-     * information in these leaves) occupy 40000000 - 4FFFFFFFH -- they are treated the same as
+     * information in these leaves) occupy 0x40000100 - 0x4FFFFFFF -- they are treated the same as
      * unrecognized leaves, see code below */
 
     /* extended CPUID leaf functions start here */
@@ -655,44 +844,6 @@ ssize_t read_file_buffer(const char* filename, char* buf, size_t buf_size) {
     return n;
 }
 
-bool is_tsc_usable(void) {
-    uint32_t words[CPUID_WORD_NUM];
-    _PalCpuIdRetrieve(INVARIANT_TSC_LEAF, 0, words);
-    return words[CPUID_WORD_EDX] & 1 << 8;
-}
-
-/* return TSC frequency or 0 if invariant TSC is not supported */
-uint64_t get_tsc_hz(void) {
-    uint32_t words[CPUID_WORD_NUM];
-
-    _PalCpuIdRetrieve(TSC_FREQ_LEAF, 0, words);
-    if (!words[CPUID_WORD_EAX] || !words[CPUID_WORD_EBX]) {
-        /* TSC/core crystal clock ratio is not enumerated, can't use RDTSC for accurate time */
-        return 0;
-    }
-
-    if (words[CPUID_WORD_ECX] > 0) {
-        /* calculate TSC frequency as core crystal clock frequency (EAX) * EBX / EAX; cast to 64-bit
-         * first to prevent integer overflow */
-        uint64_t ecx_hz = words[CPUID_WORD_ECX];
-        return ecx_hz * words[CPUID_WORD_EBX] / words[CPUID_WORD_EAX];
-    }
-
-    /* some Intel CPUs do not report nominal frequency of crystal clock, let's calculate it
-     * based on Processor Frequency Information Leaf (CPUID 16H); this leaf always exists if
-     * TSC Frequency Leaf exists; logic is taken from Linux 5.11's arch/x86/kernel/tsc.c */
-    _PalCpuIdRetrieve(PROC_FREQ_LEAF, 0, words);
-    if (!words[CPUID_WORD_EAX]) {
-        /* processor base frequency (in MHz) is not enumerated, can't calculate frequency */
-        return 0;
-    }
-
-    /* processor base frequency is in MHz but we need to return TSC frequency in Hz; cast to 64-bit
-     * first to prevent integer overflow */
-    uint64_t base_frequency_mhz = words[CPUID_WORD_EAX];
-    return base_frequency_mhz * 1000000;
-}
-
 int _PalRandomBitsRead(void* buffer, size_t size) {
     uint32_t rand;
     for (size_t i = 0; i < size; i += sizeof(rand)) {