Skip to content

Commit

Permalink
fby35: gl: Support monitor cpu thermal event (#1351)
Browse files Browse the repository at this point in the history
Summary:
# Description:
Support monitoring CPU thermal events via PCS index 20:
Bit 0: TCC Activation
Bit 2: PROCHOT
Bit 4: CPU Critical Temperature

# Motivation:
Currently, there is no PCH on the BHS CPU platform.
Thus, the BIC must monitor the CPU thermal status via PECI and send a SEL to the BMC when an event is triggered.

Pull Request resolved: #1351

Test Plan:
Check the SEL when the CPU is overheated - pass

# Test Log:
Overheat:
root@bmc-oob:~# log-util slot1 --print
1    slot1    2018-03-09 10:27:06    ipmid            SEL Entry: FRU: 1, Record: Standard (0x02), Time: 2018-03-09 10:27:06, Sensor: CPU0_THERM_STATUS (0x1C), Event Data: (01FFFF) PROCHOT# Assertion
1    slot1    2018-03-09 10:27:06    ipmid            SEL Entry: FRU: 1, Record: Standard (0x02), Time: 2018-03-09 10:27:06, Sensor: CPU0_THERM_STATUS (0x1C), Event Data: (00FFFF) CPU Critical Temperature Assertion
1    slot1    2018-03-09 10:27:07    ipmid            SEL Entry: FRU: 1, Record: Standard (0x02), Time: 2018-03-09 10:27:07, Sensor: CPU0_THERM_STATUS (0x1C), Event Data: (02FFFF) TCC Activation Assertion

Not overheat:
1    slot1    2018-03-09 10:27:10    ipmid            SEL Entry: FRU: 1, Record: Standard (0x02), Time: 2018-03-09 10:27:10, Sensor: CPU0_THERM_STATUS (0x1C), Event Data: (01FFFF) PROCHOT# Deassertion
1    slot1    2018-03-09 10:27:10    ipmid            SEL Entry: FRU: 1, Record: Standard (0x02), Time: 2018-03-09 10:27:10, Sensor: CPU0_THERM_STATUS (0x1C), Event Data: (00FFFF) CPU Critical Temperature Deassertion
1    slot1    2018-03-09 10:27:11    ipmid            SEL Entry: FRU: 1, Record: Standard (0x02), Time: 2018-03-09 10:27:11, Sensor: CPU0_THERM_STATUS (0x1C), Event Data: (02FFFF) TCC Activation Deassertion

Reviewed By: jagpalgill

Differential Revision: D49262471

fbshipit-source-id: e017996b2e38be02a380055108b02d900db370a0
  • Loading branch information
WayneLiu-wiwynn authored and facebook-github-bot committed Sep 15, 2023
1 parent df3f100 commit 85e682a
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 2 deletions.
8 changes: 7 additions & 1 deletion common/service/ipmi/include/libipmi.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
Expand Down Expand Up @@ -46,6 +46,7 @@
#define IPMI_OEM_SENSOR_TYPE_SYS_BOOT_STA 0xCA
#define IPMI_OEM_SENSOR_TYPE_VR 0xCB
#define IPMI_OEM_SENSOR_TYPE_HDT 0xCC
#define IPMI_OEM_SENSOR_TYPE_CPU_THERM_STATUS 0xCD

/* event/reading type, see IPMI spec 42.1, table 42-1 */
#define IPMI_EVENT_TYPE_THRESHOLD 0x01
Expand Down Expand Up @@ -190,6 +191,11 @@
#define IPMI_OEM_EVENT_OFFSET_1OU 0x01
#define IPMI_OEM_EVENT_OFFSET_2OU 0x02

/* Sensor-specific event offsets for CPUx_THERM_STATUS (carried in SEL event data 1) */
#define IPMI_OEM_EVENT_OFFSET_CPU_CRIT_TEMP 0x00
#define IPMI_OEM_EVENT_OFFSET_PROCHOT 0x01
#define IPMI_OEM_EVENT_OFFSET_TCC_ACT 0x02

enum ipmi_chassis_control_e {
IPMI_CHASSIS_CTRL_POWER_DOWN,
IPMI_CHASSIS_CTRL_POWER_UP,
Expand Down
126 changes: 125 additions & 1 deletion meta-facebook/yv35-gl/src/platform/plat_cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,135 @@
#include <stdint.h>
#include <drivers/peci.h>
#include "intel_peci.h"
#include "ipmi.h"
#include "libipmi.h"
#include "libutil.h"
#include "hal_peci.h"
#include <logging/log.h>
#include <plat_sensor_table.h>
#include "power_status.h"

LOG_MODULE_REGISTER(pal_cpu);
LOG_MODULE_REGISTER(plat_cpu);

K_THREAD_STACK_DEFINE(monitor_cpu_stack, MONITOR_CPU_STACK_SIZE);
struct k_thread monitor_cpu_thread;
k_tid_t monitor_cpu_tid;

void start_monitor_cpu_thread()
{
LOG_INF("Start thread to monitor CPU");

monitor_cpu_tid =
k_thread_create(&monitor_cpu_thread, monitor_cpu_stack,
K_THREAD_STACK_SIZEOF(monitor_cpu_stack), monitor_cpu_handler, NULL,
NULL, NULL, CONFIG_MAIN_THREAD_PRIORITY, 0, K_NO_WAIT);
k_thread_name_set(&monitor_cpu_thread, "monitor_cpu_thread");
}

/*
 * Thread entry that polls the CPU package thermal status over PECI and
 * reports state changes to the BMC as SEL events.
 *
 * Every MONITOR_CPU_TIME_MS, once POST has completed, the handler reads
 * RDPKG_IDX_PKG_THERMAL_STATUS and inspects three bits of the first data byte:
 *   bit 0 - TCC Activation
 *   bit 2 - PROCHOT
 *   bit 4 - CPU Critical Temperature
 * For each bit, an assert SEL is sent on a 0 -> 1 transition and a deassert
 * SEL on a 1 -> 0 transition; the last reported state is kept in the static
 * *_assert flags so each transition is reported once.
 *
 * NOTE(review): any PECI read failure or non-success completion code leaves
 * the loop, frees the buffer and returns, which ends the monitoring -
 * confirm this is the intended policy rather than retrying on the next cycle.
 */
void monitor_cpu_handler()
{
	uint8_t command = PECI_RD_PKG_CFG0_CMD;
	uint8_t readlen = 0x05;
	uint8_t prochot = 0, tcc_act = 0, cpu_crti_temp = 0;
	static bool tcc_act_assert = false, prochot_assert = false, cpu_crti_temp_assert = false;
	common_addsel_msg_t sel_msg;

	int ret = 0;
	uint8_t *readbuf = (uint8_t *)malloc(readlen * sizeof(uint8_t));
	if (!readbuf) {
		LOG_ERR("%s fail to allocate readbuf memory", __func__);
		return;
	}

	/* Fields shared by every SEL sent from this thread. */
	sel_msg.InF_target = BMC_IPMB;
	sel_msg.sensor_type = IPMI_OEM_SENSOR_TYPE_CPU_THERM_STATUS;
	sel_msg.sensor_number = SENSOR_NUM_CPU0_THERM_STATUS;
	sel_msg.event_data2 = 0xFF;
	sel_msg.event_data3 = 0xFF;

	while (1) {
		// PECI can only be accessed after post complete
		if (!get_post_status()) {
			k_msleep(MONITOR_CPU_TIME_MS);
			continue;
		}

		ret = peci_read(command, CPU_PECI_ADDR, RDPKG_IDX_PKG_THERMAL_STATUS, 0, readlen,
				readbuf);
		if (ret) {
			LOG_ERR("Get cpu thermal status peci read error");
			goto cleanup;
		}

		/* readbuf[0] carries the PECI completion code. */
		if (readbuf[0] != PECI_CC_RSP_SUCCESS) {
			if (readbuf[0] == PECI_CC_ILLEGAL_REQUEST) {
				LOG_ERR("Get cpu thermal status unknown request");
			} else {
				LOG_ERR("Get cpu thermal status peci control hardware, firmware or associated logic error");
			}
			goto cleanup;
		}

		/* Bit 0: TCC Activation */
		tcc_act = GETBIT(readbuf[1], 0);
		sel_msg.event_data1 = IPMI_OEM_EVENT_OFFSET_TCC_ACT;
		if ((tcc_act == THERMAL_STATUS_ASSERT) && (tcc_act_assert == false)) {
			sel_msg.event_type = IPMI_EVENT_TYPE_SENSOR_SPECIFIC;
			tcc_act_assert = true;
			if (!common_add_sel_evt_record(&sel_msg)) {
				LOG_ERR("Failed to add TCC Activation Assert SEL");
			}
		}
		if ((tcc_act == THERMAL_STATUS_DEASSERT) && (tcc_act_assert == true)) {
			sel_msg.event_type = IPMI_OEM_EVENT_TYPE_DEASSERT;
			tcc_act_assert = false;
			if (!common_add_sel_evt_record(&sel_msg)) {
				LOG_ERR("Failed to add TCC Activation Deassert SEL");
			}
		}

		/* Bit 2: PROCHOT */
		prochot = GETBIT(readbuf[1], 2);
		sel_msg.event_data1 = IPMI_OEM_EVENT_OFFSET_PROCHOT;
		if ((prochot == THERMAL_STATUS_ASSERT) && (prochot_assert == false)) {
			sel_msg.event_type = IPMI_EVENT_TYPE_SENSOR_SPECIFIC;
			prochot_assert = true;
			if (!common_add_sel_evt_record(&sel_msg)) {
				LOG_ERR("Failed to add PROCHOT Assert SEL\n");
			}
		}
		if ((prochot == THERMAL_STATUS_DEASSERT) && (prochot_assert == true)) {
			sel_msg.event_type = IPMI_OEM_EVENT_TYPE_DEASSERT;
			prochot_assert = false;
			if (!common_add_sel_evt_record(&sel_msg)) {
				LOG_ERR("Failed to add PROCHOT Deassert SEL\n");
			}
		}

		/* Bit 4: CPU Critical Temperature */
		cpu_crti_temp = GETBIT(readbuf[1], 4);
		sel_msg.event_data1 = IPMI_OEM_EVENT_OFFSET_CPU_CRIT_TEMP;
		if ((cpu_crti_temp == THERMAL_STATUS_ASSERT) && (cpu_crti_temp_assert == false)) {
			sel_msg.event_type = IPMI_EVENT_TYPE_SENSOR_SPECIFIC;
			cpu_crti_temp_assert = true;
			if (!common_add_sel_evt_record(&sel_msg)) {
				LOG_ERR("Failed to add CPU critical temperature Assert SEL\n");
			}
		}
		/*
		 * The deassert must follow the critical-temperature bit itself; keying it
		 * off the PROCHOT bit would report a deassert while the CPU is still
		 * critical, or never report it while PROCHOT stays asserted.
		 */
		if ((cpu_crti_temp == THERMAL_STATUS_DEASSERT) && (cpu_crti_temp_assert == true)) {
			sel_msg.event_type = IPMI_OEM_EVENT_TYPE_DEASSERT;
			cpu_crti_temp_assert = false;
			if (!common_add_sel_evt_record(&sel_msg)) {
				LOG_ERR("Failed to add CPU critical temperature Deassert SEL\n");
			}
		}

		/* Clear the buffer so stale status bits cannot leak into the next poll. */
		memset(readbuf, 0, readlen * sizeof(uint8_t));

		k_msleep(MONITOR_CPU_TIME_MS);
	}

cleanup:
	SAFE_FREE(readbuf);
	return;
}

bool pal_get_cpu_energy(uint8_t addr, uint32_t *pkg_energy, uint32_t *run_time)
{
Expand Down
10 changes: 10 additions & 0 deletions meta-facebook/yv35-gl/src/platform/plat_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,14 @@

#define CPU_TIME_UNIT 100000000

/* Stack size of the CPU monitor thread and its polling period in milliseconds */
#define MONITOR_CPU_STACK_SIZE 600
#define MONITOR_CPU_TIME_MS 1000

/* Index passed to the PECI package config read for the package thermal status (PCS 20) */
#define RDPKG_IDX_PKG_THERMAL_STATUS 0x14
/* Value of a single thermal status bit */
#define THERMAL_STATUS_DEASSERT 0
#define THERMAL_STATUS_ASSERT 1

/* Thread entry: polls the CPU thermal status and sends SEL events on changes */
void monitor_cpu_handler();
/* Creates and names the thread that runs monitor_cpu_handler() */
void start_monitor_cpu_thread();

#endif
2 changes: 2 additions & 0 deletions meta-facebook/yv35-gl/src/platform/plat_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "plat_dimm.h"
#include "plat_i3c.h"
#include "plat_pmic.h"
#include "plat_cpu.h"

/*
* The operating voltage of GPIO input pins are lower than actual voltage because the chip
Expand Down Expand Up @@ -78,6 +79,7 @@ void pal_device_init()
init_i3c_dimm_prsnt_status();
start_get_dimm_info_thread();
start_monitor_pmic_error_thread();
start_monitor_cpu_thread();
}

#define DEF_PROJ_GPIO_PRIORITY 78
Expand Down
1 change: 1 addition & 0 deletions meta-facebook/yv35-gl/src/platform/plat_sensor_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@
// Event-only sensor number definition
#define SENSOR_NUM_SYSTEM_STATUS 0x10
#define SENSOR_NUM_END_OF_POST 0x11
#define SENSOR_NUM_CPU0_THERM_STATUS 0x1C
#define SENSOR_NUM_POWER_ERROR 0x56
#define SENSOR_NUM_PROC_FAIL 0x65
#define SENSOR_NUM_VR_HOT 0xB2
Expand Down

0 comments on commit 85e682a

Please sign in to comment.