Skip to content

Commit

Permalink
hw-mgmt: scripts: Added hw-mgmt done WatchDog.
Browse files Browse the repository at this point in the history
Signed-off-by: Roy Cohen <[email protected]>
  • Loading branch information
bspguy committed Jul 11, 2024
1 parent 53fb87d commit aa14170
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 0 deletions.
65 changes: 65 additions & 0 deletions usr/usr/bin/hw-management-done.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash
##################################################################################
# Copyright (c) 2020 - 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the names of the copyright holders nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# Alternatively, this software may be distributed under the terms of the
# GNU General Public License ("GPL") version 2 as published by the Free
# Software Foundation.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# hw-management "done" watchdog.
#
# Polls the reset timestamp stored in WATCHDOG_RESET_FILE (milliseconds since
# the epoch). Once that timestamp is at least WATCHDOG_TIMEOUT seconds old,
# the watchdog writes "1" to WATCHDOG_STATUS_FILE, logs a syslog message and
# exits. All WATCHDOG_* settings come from hw-management-helpers.sh.
source hw-management-helpers.sh

trace_udev_events "hw-mngmt done called"

# Watchdog loop
while true; do
	# Get the current time with milliseconds.
	current_time=$(date +%s%3N)

	# Read the last reset time. The file may be missing, or may exist but be
	# empty (e.g. freshly created with touch) before the first timestamp has
	# been written, so start from an empty value and validate it below.
	last_reset_time=""
	if [[ -f "$WATCHDOG_RESET_FILE" ]]; then
		last_reset_time=$(cat "$WATCHDOG_RESET_FILE" 2>/dev/null)
	fi

	# An empty or non-numeric value would break the arithmetic below and
	# stop the watchdog before it reports completion. Treat it like a
	# missing file: no reset has been recorded yet, so keep waiting.
	if [[ ! "$last_reset_time" =~ ^[0-9]+$ ]]; then
		last_reset_time=$current_time
	fi

	# Calculate the time difference in milliseconds.
	# 10# forces base 10 so a value with leading zeros is not read as octal.
	time_diff=$((current_time - 10#$last_reset_time))

	# Check if the time difference exceeds the timeout in milliseconds.
	if ((time_diff >= WATCHDOG_TIMEOUT * 1000)); then
		# Update the status file.
		echo "1" > "$WATCHDOG_STATUS_FILE"
		# Update syslog.
		logger "hw-management script done!"

		# Exit the watchdog.
		exit 0
	fi

	# Sleep for the specified delay period in seconds.
	sleep "$WATCHDOG_DELAY"
done
38 changes: 38 additions & 0 deletions usr/usr/bin/hw-management-helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@
# POSSIBILITY OF SUCH DAMAGE.
#

# HW-MGMT WATCHDOG GLOBALS
#
# Files used by the "done" watchdog:
#   WATCHDOG_RESET_FILE  - last reset timestamp (ms since epoch)
#   WATCHDOG_TEMP_FILE   - scratch file that is moved over the reset file
#   WATCHDOG_STATUS_FILE - "0" while waiting, "1" once the watchdog fired
#   WATCHDOG_PID_FILE    - PID of the background watchdog process
#   WATCHDOG_SCRIPT      - script that runs the watchdog loop
WATCHDOG_RESET_FILE="/tmp/watchdog_reset_time"
WATCHDOG_TEMP_FILE="/tmp/watchdog_reset_time.tmp"
WATCHDOG_STATUS_FILE="/tmp/watchdog_status"
WATCHDOG_PID_FILE="/tmp/watchdog.pid"
WATCHDOG_SCRIPT="hw-management-done.sh"

# Timings, all in seconds:
#   WATCHDOG_TIMEOUT - age of the reset timestamp at which the watchdog fires
#   WATCHDOG_DELAY   - pause between two watchdog polls
#   MONITOR_INTERVAL - pause between two status checks in monitor_link_wd
WATCHDOG_TIMEOUT=5
WATCHDOG_DELAY=1
MONITOR_INTERVAL=2

# Root of the hw-management runtime tree and two of its sub-directories.
hw_management_path="/var/run/hw-management"
environment_path="${hw_management_path}/environment"
alarm_path="${hw_management_path}/alarm"
Expand Down Expand Up @@ -284,12 +294,40 @@ check_simx()
fi
}

# Function to monitor the watchdog status.
# Polls WATCHDOG_STATUS_FILE every MONITOR_INTERVAL seconds until it
# contains "1".
# Globals read: WATCHDOG_STATUS_FILE, WATCHDOG_PID_FILE, MONITOR_INTERVAL
# return 0 - the watchdog reported completion (status is "1")
# return 1 - the process recorded in WATCHDOG_PID_FILE is no longer running
#            and never reported completion
monitor_link_wd() {
	# Keep these local so callers' own "status" variable is not clobbered.
	local status wd_pid

	trace_udev_events "monitor_link_wd was called"
	while true; do
		if [[ -f "$WATCHDOG_STATUS_FILE" ]]; then
			status=$(cat "$WATCHDOG_STATUS_FILE")
			if [[ "$status" == "1" ]]; then
				trace_udev_events "Watchdog status is 1: Condition met."
				return 0
			else
				echo "Watchdog status is not 1: Current status is $status."
			fi
		else
			echo "Watchdog Status: Not available"
		fi

		# Do not wait forever for a watchdog that has died. This check is
		# skipped when no valid PID is recorded, which keeps the plain
		# polling behaviour in that case.
		wd_pid=""
		if [[ -r "$WATCHDOG_PID_FILE" ]]; then
			wd_pid=$(cat "$WATCHDOG_PID_FILE" 2>/dev/null)
		fi
		if [[ "$wd_pid" =~ ^[0-9]+$ ]] && ! kill -0 "$wd_pid" 2>/dev/null; then
			# The watchdog exits right after writing "1", so it may have
			# finished between the status read above and the liveness
			# check. Re-read the status once before giving up.
			status=$(cat "$WATCHDOG_STATUS_FILE" 2>/dev/null)
			if [[ "$status" == "1" ]]; then
				trace_udev_events "Watchdog status is 1: Condition met."
				return 0
			fi
			trace_udev_events "monitor_link_wd: watchdog process $wd_pid exited without reporting completion"
			return 1
		fi

		# Sleep for the specified interval to avoid stressing the CPU.
		sleep "$MONITOR_INTERVAL"
	done
}

# Refresh the watchdog reset timestamp, then check if file exists and
# create soft link
# $1 - file path
# $2 - link path
# return none
check_n_link()
{
# Write the current time with milliseconds to a temporary file
local current_time=$(date +%s%3N)
echo "$current_time" > "$WATCHDOG_TEMP_FILE"

# Atomically move the temporary file to the reset file
mv "$WATCHDOG_TEMP_FILE" "$WATCHDOG_RESET_FILE"

if [ -f "$1" ];
then
ln -sf "$1" "$2"
Expand Down
19 changes: 19 additions & 0 deletions usr/usr/bin/hw-management.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3076,6 +3076,22 @@ map_asic_pci_to_i2c_bus()

do_start()
{
# Create WD files.
touch "$WATCHDOG_RESET_FILE"
touch "$WATCHDOG_TEMP_FILE"
touch "$WATCHDOG_PID_FILE"
touch "$WATCHDOG_STATUS_FILE"
echo "0" > "$WATCHDOG_STATUS_FILE"

# Start the watchdog process if it's not running.
if [[ ! -f "$WATCHDOG_PID_FILE" ]] || ! kill -0 $(cat "$WATCHDOG_PID_FILE") 2>/dev/null; then
trace_udev_events "do_start: wd script created"
# Start the watchdog script in the background.
bash "$WATCHDOG_SCRIPT" &
# Save the PID of the watchdog process.
echo $! > "$WATCHDOG_PID_FILE"
fi

create_symbolic_links
check_cpu_type
pre_devtr_init
Expand Down Expand Up @@ -3133,6 +3149,9 @@ do_start()
cp $thermal_control_configs_path/tc_config_default.json $config_path/tc_config.json
fi
log_info "Init completed."

# Wait for hw-mgmt done WD to finish.
monitor_link_wd
}

do_stop()
Expand Down

0 comments on commit aa14170

Please sign in to comment.