diff --git a/usr/usr/bin/hw-management-done.sh b/usr/usr/bin/hw-management-done.sh
new file mode 100755
index 000000000..78624cf32
--- /dev/null
+++ b/usr/usr/bin/hw-management-done.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+##################################################################################
+# Copyright (c) 2020 - 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. Neither the names of the copyright holders nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# Alternatively, this software may be distributed under the terms of the
+# GNU General Public License ("GPL") version 2 as published by the Free
+# Software Foundation.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# hw-management "done" watchdog.
+# Waits until check_n_link() is called for the first time, then declares
+# hw-management init done (status file set to "1", sysfs ready file created)
+# once no link refresh has been seen for WATCHDOG_TIMEOUT seconds.
+#
+source hw-management-helpers.sh
+
+log_info "hw-mngmt-done started."
+
+# Wait until the first run file is created by check_n_link.
+while [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; do
+	sleep "$WATCHDOG_DELAY"
+done
+
+log_info "first run file created, starting Watchdog logic."
+
+# Watchdog loop
+while true; do
+	# Get the current time with milliseconds.
+	current_time=$(date +%s%3N)
+	# Read the last reset time from both files (the writer updates the
+	# less recently written file, so the newer value is taken below).
+	last_reset_time_A=$(cat "$WATCHDOG_RESET_FILE_A" 2>/dev/null || echo 0)
+	last_reset_time_B=$(cat "$WATCHDOG_RESET_FILE_B" 2>/dev/null || echo 0)
+	# Treat empty or non-numeric content as 0; raw file content must not
+	# reach arithmetic expansion, which would evaluate it as an expression.
+	[[ "$last_reset_time_A" =~ ^[0-9]+$ ]] || last_reset_time_A=0
+	[[ "$last_reset_time_B" =~ ^[0-9]+$ ]] || last_reset_time_B=0
+	# Determine which file has the most recent reset time.
+	last_reset_time=$(( last_reset_time_A > last_reset_time_B ? last_reset_time_A : last_reset_time_B ))
+	# Calculate the time difference in milliseconds.
+	time_diff=$((current_time - last_reset_time))
+	# Check if the time difference exceeds the timeout in milliseconds.
+	if [ "$time_diff" -ge $((WATCHDOG_TIMEOUT * 1000)) ]; then
+		# Update the status file atomically using a temporary file.
+		echo "1" > "$WATCHDOG_STATUS_TEMP_FILE"
+		mv "$WATCHDOG_STATUS_TEMP_FILE" "$WATCHDOG_STATUS_FILE" # Atomic move.
+		# Update syslog.
+		log_info "current_time $current_time"
+		log_info "last_reset_time $last_reset_time"
+		log_info "WD time diff = $time_diff"
+		log_info "hw-management script done!"
+		# Create the sysfs ready file
+		touch "$HW_MGMT_SYSFS_RDY"
+		# Exit the watchdog.
+		exit 0
+	fi
+	# Sleep for the specified delay period in seconds.
+	sleep "$WATCHDOG_DELAY"
+done
diff --git a/usr/usr/bin/hw-management-helpers.sh b/usr/usr/bin/hw-management-helpers.sh
index a1b342c24..d4744c36b 100755
--- a/usr/usr/bin/hw-management-helpers.sh
+++ b/usr/usr/bin/hw-management-helpers.sh
@@ -31,6 +31,20 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #
 
+# HW-MGMT WATCHDOG GLOBALS
+WATCHDOG_TIMEOUT=20 # Total WD T/O.
+WATCHDOG_DELAY=1 # Internal delay for WD loop to free CPU.
+MONITOR_INTERVAL=1 # Internal delay for Monitor WD loop to free CPU.
+HW_MGMT_SYSFS_RDY="/var/run/HW_MGMT_SYSFS_RDY"
+WATCHDOG_FIRST_RUN_FILE="/tmp/watchdog_first_run"
+WATCHDOG_RESET_FILE_A="/tmp/watchdog_reset_time_a"
+WATCHDOG_RESET_FILE_B="/tmp/watchdog_reset_time_b"
+WATCHDOG_PID_FILE="/tmp/watchdog.pid"
+WATCHDOG_STATUS_FILE="/tmp/watchdog_status"
+WATCHDOG_STATUS_TEMP_FILE="/tmp/watchdog_status.tmp"
+WATCHDOG_SCRIPT="hw-management-done.sh"
+
+
 hw_management_path=/var/run/hw-management
 environment_path=$hw_management_path/environment
 alarm_path=$hw_management_path/alarm
@@ -284,12 +298,155 @@ check_simx()
 	fi
 }
 
+# Reset the hw-management-done WD state files before a new run.
+# Exits with status 1 if the files could not be prepared.
+# return none
+init_hw_management_done_wd_files()
+{
+	local FILE
+	local file_exist=true
+	local WD_FILES=(
+		"$WATCHDOG_RESET_FILE_A"
+		"$WATCHDOG_RESET_FILE_B"
+		"$WATCHDOG_STATUS_FILE"
+		"$WATCHDOG_STATUS_TEMP_FILE"
+	)
+
+	# Remove all WD files if they exist from previous runs.
+	# They might contain garbage.
+	for FILE in "${WD_FILES[@]}"; do
+		[ -f "$FILE" ] && rm "$FILE"
+		touch "$FILE"
+	done
+
+	# Remove the sysfs ready file if it exists.
+	if [[ -f "$HW_MGMT_SYSFS_RDY" ]]; then
+		rm "$HW_MGMT_SYSFS_RDY"
+	fi
+
+	# WATCHDOG_FIRST_RUN_FILE is created the first time
+	# check_n_link function is called.
+	if [[ -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
+		rm "$WATCHDOG_FIRST_RUN_FILE"
+	fi
+
+	echo "0" > "$WATCHDOG_STATUS_FILE"
+
+	# Verify that every WD file exists after the re-creation above.
+	for FILE in "${WD_FILES[@]}"; do
+		if [ ! -f "$FILE" ]; then
+			file_exist=false
+			break
+		fi
+	done
+	# In case one of the WD files was not created,
+	# or in case the first run file was not removed,
+	# exit with error.
+	if [ "$file_exist" != true ] || [ -f "$WATCHDOG_FIRST_RUN_FILE" ]; then
+		log_info "Error init. WD files."
+		exit 1
+	fi
+
+	log_info "Successfully init. WD files."
+}
+
+# Kill a stale WD process (if any) and start a new one in the background.
+# The PID of the new process is stored in WATCHDOG_PID_FILE.
+# return none
+init_hw_management_done_wd()
+{
+	# Remove older WD process if it exists.
+	if [ -f "$WATCHDOG_PID_FILE" ]; then
+		local WATCHDOG_PID
+		WATCHDOG_PID=$(cat "$WATCHDOG_PID_FILE")
+		if kill -0 "$WATCHDOG_PID" 2>/dev/null; then
+			if kill "$WATCHDOG_PID"; then
+				log_info "HW Mangement old WD process killed succesfully."
+				rm -f "$WATCHDOG_PID_FILE"
+			else
+				log_info "HW Mangement failed to kill old WD process."
+				exit 1
+			fi
+		else
+			log_info "HW Mangement old WD process $WATCHDOG_PID already dead, remove the pid file."
+			rm -f "$WATCHDOG_PID_FILE"
+		fi
+	fi
+
+	# Start the watchdog process in the background.
+	bash "$WATCHDOG_SCRIPT" &
+	# Save the PID of the watchdog process.
+	touch "$WATCHDOG_PID_FILE"
+	echo $! > "$WATCHDOG_PID_FILE"
+	log_info "HW Mangement WD process created."
+}
+
+# Record "now" (in milliseconds) as the latest WD refresh time.
+# The value is written to the less recently updated of the two reset files.
+# return none
+refresh_hw_management_done_wd()
+{
+	local current_time
+	local last_reset_time_A
+	local last_reset_time_B
+
+	# Capture the current time with milliseconds.
+	current_time=$(date +%s%3N)
+	# Read the last update time from both reset files.
+	last_reset_time_A=$(cat "$WATCHDOG_RESET_FILE_A" 2>/dev/null || echo 0)
+	last_reset_time_B=$(cat "$WATCHDOG_RESET_FILE_B" 2>/dev/null || echo 0)
+	# Ensure both variables are valid integers, defaulting to 0 if empty or invalid.
+	[[ "$last_reset_time_A" =~ ^[0-9]+$ ]] || last_reset_time_A=0
+	[[ "$last_reset_time_B" =~ ^[0-9]+$ ]] || last_reset_time_B=0
+	# Determine which file was written most recently.
+	if [ "$last_reset_time_A" -gt "$last_reset_time_B" ]; then
+		# Write the current time to the less recently updated file (B).
+		echo "$current_time" > "$WATCHDOG_RESET_FILE_B"
+	else
+		# Write the current time to the less recently updated file (A).
+		echo "$current_time" > "$WATCHDOG_RESET_FILE_A"
+	fi
+}
+
+# Function to monitor the watchdog status.
+# Blocks until WATCHDOG_STATUS_FILE contains "1".
+# return 0 once the WD reported completion
+monitor_link_wd()
+{
+	local status
+
+	log_info "monitor_link_wd was called"
+	while true; do
+		if [[ -f "$WATCHDOG_STATUS_FILE" ]]; then
+			status=$(cat "$WATCHDOG_STATUS_FILE")
+			if [[ "$status" == "1" ]]; then
+				log_info "Watchdog status is 1: Condition met."
+				return 0
+			else
+				log_info "Watchdog status is not 1: Current status is $status."
+			fi
+		else
+			log_info "Watchdog Status: Not available"
+		fi
+		# Sleep for the specified interval to avoid stressing the CPU.
+		sleep "$MONITOR_INTERVAL"
+	done
+}
+
 # Check if file exists and create soft link
 # $1 - file path
 # $2 - link path
 # return none
 check_n_link()
 {
+	# Refresh the WD timestamp before signalling the WD, so the WD
+	# never starts monitoring with a missing (zero) timestamp.
+	refresh_hw_management_done_wd
+	# In case this is the first call to check_n_link,
+	# signal the WD to start monitoring.
+	if [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
+		touch "$WATCHDOG_FIRST_RUN_FILE"
+	fi
 	if [ -f "$1" ];
 	then
 		ln -sf "$1" "$2"
diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh
index 662485044..5206ae44e 100755
--- a/usr/usr/bin/hw-management.sh
+++ b/usr/usr/bin/hw-management.sh
@@ -3076,6 +3076,8 @@ map_asic_pci_to_i2c_bus()
 
 do_start()
 {
+	init_hw_management_done_wd_files
+	init_hw_management_done_wd
 	create_symbolic_links
 	check_cpu_type
 	pre_devtr_init
@@ -3133,6 +3135,9 @@ do_start()
 		cp $thermal_control_configs_path/tc_config_default.json $config_path/tc_config.json
 	fi
 	log_info "Init completed."
+
+	# Wait for hw-mgmt done WD to finish.
+	monitor_link_wd
 }
 
 do_stop()