From aa141703de2fe69179f574f63297736e2133daa5 Mon Sep 17 00:00:00 2001
From: Roy Cohen
Date: Mon, 27 May 2024 16:41:43 +0300
Subject: [PATCH] hw-mgmt: scripts: Added hw-mgmt done WatchDog.

Signed-off-by: Roy Cohen
---
 usr/usr/bin/hw-management-done.sh    | 65 ++++++++++++++++++++++++++++
 usr/usr/bin/hw-management-helpers.sh | 38 ++++++++++++++++
 usr/usr/bin/hw-management.sh         | 19 ++++++++
 3 files changed, 122 insertions(+)
 create mode 100755 usr/usr/bin/hw-management-done.sh

diff --git a/usr/usr/bin/hw-management-done.sh b/usr/usr/bin/hw-management-done.sh
new file mode 100755
index 000000000..738586810
--- /dev/null
+++ b/usr/usr/bin/hw-management-done.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+##################################################################################
+# Copyright (c) 2020 - 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. Neither the names of the copyright holders nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# Alternatively, this software may be distributed under the terms of the
+# GNU General Public License ("GPL") version 2 as published by the Free
+# Software Foundation.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+source hw-management-helpers.sh
+
+trace_udev_events "hw-mngmt done called"
+
+# Baseline for the idle timer until the first reset timestamp is written.
+start_time=$(date +%s%3N)
+
+# Watchdog loop: report "done" once no reset was seen for WATCHDOG_TIMEOUT seconds.
+while true; do
+    # Get the current time with milliseconds.
+    current_time=$(date +%s%3N)
+
+    # Read the last reset time; fall back to the baseline if none was written yet.
+    if [[ -s "$WATCHDOG_RESET_FILE" ]]; then
+        last_reset_time=$(<"$WATCHDOG_RESET_FILE")
+    else
+        last_reset_time=$start_time
+    fi
+
+    # Calculate the time difference in milliseconds.
+    time_diff=$((current_time - last_reset_time))
+
+    # Check if the time difference exceeds the timeout in milliseconds.
+    if [ "$time_diff" -ge $((WATCHDOG_TIMEOUT * 1000)) ]; then
+        # Update the status file and syslog, then exit the watchdog.
+        echo "1" > "$WATCHDOG_STATUS_FILE"
+        logger "hw-management script done!"
+        exit 0
+    fi
+
+    # Sleep for the specified delay period in seconds.
+    sleep "$WATCHDOG_DELAY"
+done
diff --git a/usr/usr/bin/hw-management-helpers.sh b/usr/usr/bin/hw-management-helpers.sh
index a1b342c24..1f1b10ace 100755
--- a/usr/usr/bin/hw-management-helpers.sh
+++ b/usr/usr/bin/hw-management-helpers.sh
@@ -31,6 +31,16 @@
 # POSSIBILITY OF SUCH DAMAGE.
 #
 
+# HW-MGMT WATCHDOG GLOBALS
+WATCHDOG_TIMEOUT=5
+WATCHDOG_DELAY=1
+WATCHDOG_RESET_FILE="/tmp/watchdog_reset_time"
+WATCHDOG_TEMP_FILE="/tmp/watchdog_reset_time.tmp"
+WATCHDOG_PID_FILE="/tmp/watchdog.pid"
+WATCHDOG_STATUS_FILE="/tmp/watchdog_status"
+WATCHDOG_SCRIPT="hw-management-done.sh"
+MONITOR_INTERVAL=2
+
 hw_management_path=/var/run/hw-management
 environment_path=$hw_management_path/environment
 alarm_path=$hw_management_path/alarm
@@ -284,12 +294,40 @@ check_simx()
 	fi
 }
 
+# Block until the watchdog status file contains "1" (polls every MONITOR_INTERVAL s).
+monitor_link_wd() {
+    trace_udev_events "monitor_link_wd was called"
+    while true; do
+        if [[ -f "$WATCHDOG_STATUS_FILE" ]]; then
+            status=$(cat "$WATCHDOG_STATUS_FILE")
+            if [[ "$status" == "1" ]]; then
+                trace_udev_events "Watchdog status is 1: Condition met."
+                return 0
+            else
+                echo "Watchdog status is not 1: Current status is $status."
+            fi
+        else
+            echo "Watchdog Status: Not available"
+        fi
+
+        # Sleep for the specified interval to avoid stressing the CPU.
+        sleep "$MONITOR_INTERVAL"
+    done
+}
+
 # Check if file exists and create soft link
 # $1 - file path
 # $2 - link path
 # return none
 check_n_link()
 {
+    # Write the current time with milliseconds to a temporary file
+    local current_time=$(date +%s%3N)
+    echo "$current_time" > "$WATCHDOG_TEMP_FILE"
+
+    # Atomically move the temporary file to the reset file
+    mv "$WATCHDOG_TEMP_FILE" "$WATCHDOG_RESET_FILE"
+
     if [ -f "$1" ];
     then
         ln -sf "$1" "$2"
diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh
index 662485044..070df15ec 100755
--- a/usr/usr/bin/hw-management.sh
+++ b/usr/usr/bin/hw-management.sh
@@ -3076,6 +3076,22 @@ map_asic_pci_to_i2c_bus()
 
 do_start()
 {
+	# Create WD files; seed the reset time so a stale/empty value can't trip the WD.
+	date +%s%3N > "$WATCHDOG_RESET_FILE"
+	touch "$WATCHDOG_TEMP_FILE"
+	touch "$WATCHDOG_PID_FILE"
+	touch "$WATCHDOG_STATUS_FILE"
+	echo "0" > "$WATCHDOG_STATUS_FILE"
+
+	# Start the watchdog process if it's not running.
+	if [[ ! -f "$WATCHDOG_PID_FILE" ]] || ! kill -0 "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null; then
+		trace_udev_events "do_start: wd script created"
+		# Start the watchdog script in the background.
+		bash "$WATCHDOG_SCRIPT" &
+		# Save the PID of the watchdog process.
+		echo $! > "$WATCHDOG_PID_FILE"
+	fi
+
 	create_symbolic_links
 	check_cpu_type
 	pre_devtr_init
@@ -3133,6 +3149,9 @@ do_start()
 		cp $thermal_control_configs_path/tc_config_default.json $config_path/tc_config.json
 	fi
 	log_info "Init completed."
+
+	# Wait for hw-mgmt done WD to finish.
+	monitor_link_wd
 }
 
 do_stop()