Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

hw-mgmt: scripts: Added hw-mgmt done WatchDog. #1298

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions usr/usr/bin/hw-management-done.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash
##################################################################################
# Copyright (c) 2020 - 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the names of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# Alternatively, this software may be distributed under the terms of the
# GNU General Public License ("GPL") version 2 as published by the Free
# Software Foundation.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Load the shared watchdog settings (WATCHDOG_* and HW_MGMT_SYSFS_RDY).
source hw-management-helpers.sh

log_info "hw-mngmt-done started."

# Wait until the first run file is created by check_n_link.
while [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; do
	sleep "$WATCHDOG_DELAY"
done

log_info "first run file created, starting Watchdog logic."

# Baseline for the timeout. check_n_link creates the first run file *before*
# it writes the first timestamp, so both reset files may still be empty when
# we get here. Without a baseline an empty file would count as "last reset at
# the epoch" and the watchdog would expire immediately.
wd_start_time=$(date +%s%3N)

# Watchdog loop: expire once no reset has been seen for WATCHDOG_TIMEOUT
# seconds, then publish the result and exit.
while true; do
	# Get the current time with milliseconds.
	current_time=$(date +%s%3N)
	# Read both reset files. The writer alternates between them, so at
	# least one should hold a complete timestamp even if the other one is
	# caught in the middle of being rewritten.
	last_reset_time_A=$(cat "$WATCHDOG_RESET_FILE_A" 2>/dev/null)
	last_reset_time_B=$(cat "$WATCHDOG_RESET_FILE_B" 2>/dev/null)
	# Anything that is not a plain decimal number (missing, empty or
	# garbage content) counts as "never reset".
	[[ "$last_reset_time_A" =~ ^[0-9]+$ ]] || last_reset_time_A=0
	[[ "$last_reset_time_B" =~ ^[0-9]+$ ]] || last_reset_time_B=0
	# Take the most recent reset time. The 10# prefix forces base 10 so a
	# value with leading zeros is not parsed as octal.
	last_reset_time=$(( 10#$last_reset_time_A > 10#$last_reset_time_B ? 10#$last_reset_time_A : 10#$last_reset_time_B ))
	# Never measure from before the moment the watchdog logic started.
	if (( last_reset_time < wd_start_time )); then
		last_reset_time=$wd_start_time
	fi
	# Calculate the time difference in milliseconds.
	time_diff=$(( current_time - last_reset_time ))
	# WATCHDOG_TIMEOUT is in seconds; compare in milliseconds.
	if (( time_diff >= WATCHDOG_TIMEOUT * 1000 )); then
		# Update the status file atomically using a temporary file.
		echo "1" > "$WATCHDOG_STATUS_TEMP_FILE"
		mv "$WATCHDOG_STATUS_TEMP_FILE" "$WATCHDOG_STATUS_FILE" # Atomic move.
		# Update syslog.
		log_info "current_time $current_time"
		log_info "last_reset_time $last_reset_time"
		log_info "WD time diff = $time_diff"
		log_info "hw-management script done!"
		# Create the sysfs ready file.
		touch "$HW_MGMT_SYSFS_RDY"
		# Exit the watchdog.
		exit 0
	fi
	# Sleep for the specified delay period in seconds.
	sleep "$WATCHDOG_DELAY"
done
136 changes: 136 additions & 0 deletions usr/usr/bin/hw-management-helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,20 @@
# POSSIBILITY OF SUCH DAMAGE.
#

# HW-MGMT WATCHDOG GLOBALS
# Shared between the watchdog script and the functions below. All state files
# use absolute paths so that every process resolves them identically,
# whatever its current working directory is.
WATCHDOG_TIMEOUT=20     # Total WD T/O, in seconds.
WATCHDOG_DELAY=1        # Internal delay (seconds) for WD loop to free CPU.
MONITOR_INTERVAL=1      # Internal delay (seconds) for Monitor WD loop to free CPU.
HW_MGMT_SYSFS_RDY="/var/run/HW_MGMT_SYSFS_RDY"
WATCHDOG_FIRST_RUN_FILE="/tmp/watchdog_first_run"
WATCHDOG_RESET_FILE_A="/tmp/watchdog_reset_time_a"
WATCHDOG_RESET_FILE_B="/tmp/watchdog_reset_time_b"
WATCHDOG_PID_FILE="/tmp/watchdog.pid"
WATCHDOG_STATUS_FILE="/tmp/watchdog_status"
WATCHDOG_STATUS_TEMP_FILE="/tmp/watchdog_status.tmp"
WATCHDOG_SCRIPT="hw-management-done.sh"


# Base directory; the paths below are derived from it.
hw_management_path="/var/run/hw-management"
environment_path="${hw_management_path}/environment"
alarm_path="${hw_management_path}/alarm"
Expand Down Expand Up @@ -284,12 +298,134 @@ check_simx()
fi
}

# Reset all watchdog state files to a known-clean state.
#
# Recreates the two reset-time files, the status file and its temporary file
# as empty files, removes the sysfs-ready and first-run marker files, and
# writes "0" into the status file.
#
# Globals read: WATCHDOG_RESET_FILE_A, WATCHDOG_RESET_FILE_B,
#               WATCHDOG_STATUS_FILE, WATCHDOG_STATUS_TEMP_FILE,
#               HW_MGMT_SYSFS_RDY, WATCHDOG_FIRST_RUN_FILE
# Globals set:  WD_FILES
# Exits the calling shell with status 1 if any WD file could not be created
# or the first-run file could not be removed.
init_hw_management_done_wd_files()
{
	local wd_file
	local files_ok=true

	# Kept global, as before. NOTE(review): could become local if nothing
	# else in the project reads WD_FILES.
	WD_FILES=(
		"$WATCHDOG_RESET_FILE_A"
		"$WATCHDOG_RESET_FILE_B"
		"$WATCHDOG_STATUS_FILE"
		"$WATCHDOG_STATUS_TEMP_FILE"
	)

	# Remove all WD files if they exist from previous runs
	# (they might contain garbage) and recreate them empty.
	for wd_file in "${WD_FILES[@]}"; do
		if [ -f "$wd_file" ]; then
			rm -f "$wd_file"
		fi
		touch "$wd_file"
	done

	# Remove the sysfs ready file if it exists.
	if [[ -f "$HW_MGMT_SYSFS_RDY" ]]; then
		rm -f "$HW_MGMT_SYSFS_RDY"
	fi

	# WATCHDOG_FIRST_RUN_FILE is created the first time
	# check_n_link function is called.
	if [[ -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
		rm -f "$WATCHDOG_FIRST_RUN_FILE"
	fi

	echo "0" > "$WATCHDOG_STATUS_FILE"

	# Verify that every WD file really exists now.
	for wd_file in "${WD_FILES[@]}"; do
		if [ ! -f "$wd_file" ]; then
			files_ok=false
			break
		fi
	done

	# In case one of the WD files was not created,
	# or in case the first run file was not removed,
	# exit with error.
	if [ "$files_ok" != true ] || [ -f "$WATCHDOG_FIRST_RUN_FILE" ]; then
		log_info "Error init. WD files."
		exit 1
	fi

	log_info "Successfully init. WD files."
}

# (Re)start the background watchdog process.
#
# If WATCHDOG_PID_FILE exists, the process it names is killed when still
# alive and the PID file is removed; failing to kill a live process exits
# the calling shell with status 1. Afterwards WATCHDOG_SCRIPT is launched in
# the background and its PID is stored in WATCHDOG_PID_FILE.
init_hw_management_done_wd()
{
	local stale_pid

	# Get rid of a watchdog left over from a previous run.
	if [ -f "$WATCHDOG_PID_FILE" ]; then
		stale_pid=$(cat "$WATCHDOG_PID_FILE")
		if ! kill -0 "$stale_pid" 2>/dev/null; then
			# Signal 0 only probes for existence: nothing to kill.
			log_info "HW Mangement old WD process $stale_pid already dead, remove the pid file."
		elif kill "$stale_pid"; then
			log_info "HW Mangement old WD process killed succesfully."
		else
			log_info "HW Mangement failed to kill old WD process."
			exit 1
		fi
		rm -f "$WATCHDOG_PID_FILE"
	fi

	# Launch the new watchdog in the background and record its PID.
	bash "$WATCHDOG_SCRIPT" &
	touch "$WATCHDOG_PID_FILE"
	printf '%s\n' "$!" > "$WATCHDOG_PID_FILE"
	log_info "HW Mangement WD process created."
}

# Record "now" as the latest watchdog reset time.
#
# The timestamp (milliseconds since the epoch) is written to whichever of
# WATCHDOG_RESET_FILE_A / WATCHDOG_RESET_FILE_B currently holds the older
# value, so the file with the newest complete timestamp is never the one
# being rewritten.
#
# Globals read: WATCHDOG_RESET_FILE_A, WATCHDOG_RESET_FILE_B
refresh_hw_management_done_wd()
{
	local current_time
	local last_reset_time_A
	local last_reset_time_B

	# Capture the current time with milliseconds.
	current_time=$(date +%s%3N)
	# Read the last update time from both reset files.
	last_reset_time_A=$(cat "$WATCHDOG_RESET_FILE_A" 2>/dev/null)
	last_reset_time_B=$(cat "$WATCHDOG_RESET_FILE_B" 2>/dev/null)
	# Ensure both variables are valid integers, defaulting to 0 if the
	# file is missing, empty or holds anything but a decimal number.
	[[ "$last_reset_time_A" =~ ^[0-9]+$ ]] || last_reset_time_A=0
	[[ "$last_reset_time_B" =~ ^[0-9]+$ ]] || last_reset_time_B=0
	# Determine which file was written most recently. The 10# prefix
	# forces base 10 so leading zeros are not parsed as octal.
	if (( 10#$last_reset_time_A > 10#$last_reset_time_B )); then
		# Write the current time to the less recently updated file (B).
		echo "$current_time" > "$WATCHDOG_RESET_FILE_B"
	else
		# Write the current time to the less recently updated file (A).
		echo "$current_time" > "$WATCHDOG_RESET_FILE_A"
	fi
}

# Block until the watchdog reports completion.
#
# Polls WATCHDOG_STATUS_FILE every MONITOR_INTERVAL seconds, logging the
# state seen on each pass, and returns 0 as soon as the file content is "1".
# The last value read is left in the global variable `status`.
monitor_link_wd()
{
	log_info "monitor_link_wd was called"
	while :; do
		if [[ ! -f "$WATCHDOG_STATUS_FILE" ]]; then
			log_info "Watchdog Status: Not available"
		else
			status=$(cat "$WATCHDOG_STATUS_FILE")
			if [[ "$status" != "1" ]]; then
				log_info "Watchdog status is not 1: Current status is $status."
			else
				log_info "Watchdog status is 1: Condition met."
				return 0
			fi
		fi
		# Pause between polls to avoid stressing the CPU.
		sleep "$MONITOR_INTERVAL"
	done
}

# Check if file exists and create soft link
# $1 - file path
# $2 - link path
# return none
check_n_link()
{
# In case this is the first call to check_n_link,
# Signal the WD to start monitoring.
# This will insure that the WD uses valid timestamps.
if [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
touch "$WATCHDOG_FIRST_RUN_FILE"
fi
refresh_hw_management_done_wd
if [ -f "$1" ];
then
ln -sf "$1" "$2"
Expand Down
5 changes: 5 additions & 0 deletions usr/usr/bin/hw-management.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3076,6 +3076,8 @@ map_asic_pci_to_i2c_bus()

do_start()
{
init_hw_management_done_wd_files
init_hw_management_done_wd
create_symbolic_links
check_cpu_type
pre_devtr_init
Expand Down Expand Up @@ -3133,6 +3135,9 @@ do_start()
cp $thermal_control_configs_path/tc_config_default.json $config_path/tc_config.json
fi
log_info "Init completed."

# Wait for hw-mgmt done WD to finish.
monitor_link_wd
}

do_stop()
Expand Down