Skip to content

Commit

Permalink
hw-mgmt: scripts: Added hw-mgmt done WatchDog.
Browse files Browse the repository at this point in the history
Signed-off-by: Roy Cohen <[email protected]>
  • Loading branch information
bspguy committed Aug 25, 2024
1 parent 437ed26 commit 12a87b1
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 48 deletions.
33 changes: 19 additions & 14 deletions usr/usr/bin/hw-management-done.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,34 +32,39 @@
#
# Watchdog body: waits for the first-run marker, then polls the reset
# timestamps and reports (via the status file) once no reset has been seen
# for WATCHDOG_TIMEOUT seconds.
# NOTE(review): this span comes from a rendered diff and appears to contain
# both the pre-commit and post-commit wording of several statements (start-up
# log line, reset-time read, status-file write) - confirm which lines are live.
# The WATCHDOG_* settings are defined in hw-management-helpers.sh; log_info is
# presumably provided there as well.
source hw-management-helpers.sh

log_info "hw-mngmt done called"
log_info "hw-mngmt-done started."

# Block until the first-run marker file exists, polling every WATCHDOG_DELAY
# seconds, so the loop below never evaluates timestamps written before
# monitoring was meant to start.
while [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; do
sleep "$WATCHDOG_DELAY"
done

log_info "first run file created, starting Watchdog logic."

# Watchdog loop: runs until the timeout condition is met, then exits.
while true; do
# Current time in milliseconds since the epoch (%s seconds + %3N milliseconds).
current_time=$(date +%s%3N)

# Read the last reset time from the single reset file; fall back to "now"
# when the file is missing.
# NOTE(review): last_reset_time is reassigned unconditionally by the A/B
# comparison below, so the value computed here is never used.
if [[ -f "$WATCHDOG_RESET_FILE" ]]; then
last_reset_time=$(cat "$WATCHDOG_RESET_FILE")
else
last_reset_time=$current_time
fi

# Read both reset files. "|| echo 0" only covers a failing cat (e.g. missing
# file); an existing but empty file yields an empty string, which the
# arithmetic below evaluates as 0.
# NOTE(review): these literal paths duplicate the values of
# WATCHDOG_RESET_FILE_A / WATCHDOG_RESET_FILE_B and must be kept in sync.
last_reset_time_A=$(cat "/tmp/watchdog_reset_time_a" 2>/dev/null || echo 0)
last_reset_time_B=$(cat "/tmp/watchdog_reset_time_b" 2>/dev/null || echo 0)
# Use whichever file holds the larger (most recent) reset time.
last_reset_time=$(( last_reset_time_A > last_reset_time_B ? last_reset_time_A : last_reset_time_B ))
# Milliseconds elapsed since the most recent reset.
time_diff=$((current_time - last_reset_time))

# WATCHDOG_TIMEOUT is multiplied by 1000 to compare it against the
# millisecond difference.
if [ "$time_diff" -ge $((WATCHDOG_TIMEOUT * 1000)) ]; then
# Write the status file directly.
echo "1" > "$WATCHDOG_STATUS_FILE"
# Update the status file atomically using a temporary file.
echo "1" > "$WATCHDOG_STATUS_TEMP_FILE"
mv "$WATCHDOG_STATUS_TEMP_FILE" "$WATCHDOG_STATUS_FILE" # Atomic move.
# Log the values that led to the timeout decision.
log_info "current_time $current_time"
log_info "last_reset_time $last_reset_time"
log_info "WD time diff = $time_diff"
log_info "hw-management script done!"

# Exit the watchdog; status "1" has been published for readers of the
# status file.
exit 0
fi

# Sleep for the specified delay period in seconds.
sleep "$WATCHDOG_DELAY"
done
118 changes: 100 additions & 18 deletions usr/usr/bin/hw-management-helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,18 @@
#

# HW-MGMT WATCHDOG GLOBALS
WATCHDOG_TIMEOUT=5
WATCHDOG_DELAY=1
WATCHDOG_RESET_FILE="/tmp/watchdog_reset_time"
WATCHDOG_TEMP_FILE="/tmp/watchdog_reset_time.tmp"
WATCHDOG_TIMEOUT=5 # Total WD timeout; the watchdog compares it in ms (value * 1000).
WATCHDOG_DELAY=1 # Sleep between watchdog loop iterations, to free the CPU.
MONITOR_INTERVAL=1 # Sleep between monitor_link_wd polls, to free the CPU.
# Marker created on the first check_n_link call. It must be an absolute path
# like the other WD files: a relative "tmp/..." would resolve against the
# working directory of whichever process happens to touch or test it.
WATCHDOG_FIRST_RUN_FILE="/tmp/watchdog_first_run"
# The two reset-timestamp files read by the watchdog (it uses the newer one).
WATCHDOG_RESET_FILE_A="/tmp/watchdog_reset_time_a"
WATCHDOG_RESET_FILE_B="/tmp/watchdog_reset_time_b"
# Scratch file the timestamp is written to before being moved into place.
WATCHDOG_TIMESTAMP_TEMP_FILE="/tmp/watchdog_reset_time.tmp"
# PID of the background watchdog process.
WATCHDOG_PID_FILE="/tmp/watchdog.pid"
# Watchdog result: "0" after init, "1" once the timeout has been reached.
WATCHDOG_STATUS_FILE="/tmp/watchdog_status"
WATCHDOG_STATUS_TEMP_FILE="/tmp/watchdog_status.tmp"
# Script started in the background as the watchdog process.
WATCHDOG_SCRIPT="hw-management-done.sh"
MONITOR_INTERVAL=2


hw_management_path=/var/run/hw-management
environment_path=$hw_management_path/environment
Expand Down Expand Up @@ -294,22 +298,106 @@ check_simx()
fi
}

# Reset all watchdog state files to a known-clean state.
# Globals (read): WATCHDOG_RESET_FILE_A, WATCHDOG_RESET_FILE_B,
#                 WATCHDOG_PID_FILE, WATCHDOG_STATUS_FILE,
#                 WATCHDOG_STATUS_TEMP_FILE, WATCHDOG_TIMESTAMP_TEMP_FILE,
#                 WATCHDOG_FIRST_RUN_FILE
# Effects: every WD file is recreated empty, the status file is set to "0"
#          and the first-run marker is removed.
# Exits the calling shell with status 1 if a WD file could not be created
# or the first-run marker could not be removed.
init_hw_management_done_wd_files()
{
    local wd_file
    local wd_files_ok=true
    local -a wd_files=(
        "$WATCHDOG_RESET_FILE_A"
        "$WATCHDOG_RESET_FILE_B"
        "$WATCHDOG_PID_FILE"
        "$WATCHDOG_STATUS_FILE"
        "$WATCHDOG_STATUS_TEMP_FILE"
        "$WATCHDOG_TIMESTAMP_TEMP_FILE"
    )

    # Recreate every WD file from scratch; leftovers from a previous run
    # might contain garbage.
    for wd_file in "${wd_files[@]}"; do
        rm -f -- "$wd_file"
        touch -- "$wd_file"
    done

    # WATCHDOG_FIRST_RUN_FILE is created the first time check_n_link is
    # called, so a leftover marker must not survive initialization.
    if [[ -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
        rm -f -- "$WATCHDOG_FIRST_RUN_FILE"
    fi

    echo "0" > "$WATCHDOG_STATUS_FILE"

    # Verify that every WD file really exists now. The test has to guard
    # the assignment, otherwise the flag is cleared unconditionally.
    for wd_file in "${wd_files[@]}"; do
        if [[ ! -f "$wd_file" ]]; then
            wd_files_ok=false
            break
        fi
    done

    # Compare the flag as a string: a bare [ ! "$flag" ] only checks for
    # emptiness and would never detect "false".
    if [[ "$wd_files_ok" != true || -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
        log_info "Error init. WD files."
        exit 1
    fi

    log_info "Successfully init. WD files."
}

# Launch the watchdog script in the background unless one is already alive.
# Globals (read): WATCHDOG_PID_FILE, WATCHDOG_SCRIPT
# Effects: may start "$WATCHDOG_SCRIPT" as a background job and store its
#          PID in "$WATCHDOG_PID_FILE".
init_hw_management_done_wd()
{
    # A watchdog counts as running only when the PID file exists and the
    # PID stored in it accepts signal 0 (an existence probe, nothing is
    # actually delivered).
    if [[ -f "$WATCHDOG_PID_FILE" ]] && kill -0 $(cat "$WATCHDOG_PID_FILE") 2>/dev/null; then
        log_info "WD Process already running."
        return
    fi

    log_info "HW Mangement WD process created."
    # Run the watchdog detached from this shell's control flow and remember
    # its PID for later liveness checks.
    bash "$WATCHDOG_SCRIPT" &
    echo $! > "$WATCHDOG_PID_FILE"
}

# Record "now" (milliseconds since the epoch) as the latest watchdog reset.
# Globals (read): WATCHDOG_TIMESTAMP_TEMP_FILE, WATCHDOG_FIRST_RUN_FILE,
#                 WATCHDOG_RESET_FILE_A, WATCHDOG_RESET_FILE_B
# Effects: the timestamp is written to the temp file and moved over one of
#          the two reset files; on the first call the first-run marker is
#          created as well.
# Returns: non-zero if the timestamp could not be obtained or staged.
refresh_hw_management_done_wd()
{
    local current_time
    local stamp_a=0
    local stamp_b=0

    # Declaration and assignment are split so a failing date is not masked
    # by the exit status of "local".
    current_time=$(date +%s%3N) || return
    # Stage the timestamp in a temporary file; without it there is nothing
    # valid to move into place.
    echo "$current_time" > "$WATCHDOG_TIMESTAMP_TEMP_FILE" || return

    # In case this is the first call to check_n_link, signal the WD to
    # start monitoring. The marker is created only after the timestamp is
    # in place, to ensure that the WD uses valid timestamps.
    if [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
        mv "$WATCHDOG_TIMESTAMP_TEMP_FILE" "$WATCHDOG_RESET_FILE_A"
        touch "$WATCHDOG_FIRST_RUN_FILE"
        log_info "[A] first run check_n_link $current_time"
        return
    fi

    # Double-buffering: overwrite whichever reset file holds the older
    # timestamp. Testing only for the existence of file A would never
    # alternate, because file A exists from the first run onwards.
    [[ -r "$WATCHDOG_RESET_FILE_A" ]] && read -r stamp_a < "$WATCHDOG_RESET_FILE_A"
    [[ -r "$WATCHDOG_RESET_FILE_B" ]] && read -r stamp_b < "$WATCHDOG_RESET_FILE_B"
    # Missing, empty or non-numeric content counts as "oldest possible".
    [[ "$stamp_a" =~ ^[0-9]+$ ]] || stamp_a=0
    [[ "$stamp_b" =~ ^[0-9]+$ ]] || stamp_b=0

    # 10# forces base 10 so a value with leading zeros is not read as octal.
    if (( 10#$stamp_a <= 10#$stamp_b )); then
        # Atomically move the temporary file to the reset file.
        mv "$WATCHDOG_TIMESTAMP_TEMP_FILE" "$WATCHDOG_RESET_FILE_A"
        log_info "[A] check_n_link $current_time"
    else
        mv "$WATCHDOG_TIMESTAMP_TEMP_FILE" "$WATCHDOG_RESET_FILE_B"
        log_info "[B] check_n_link $current_time"
    fi
}

# Function to monitor the watchdog status.
monitor_link_wd() {
log_info "monitor_link_wd was called"
monitor_link_wd()
{
log_info "monitor_link_wd was called"
while true; do
if [[ -f "$WATCHDOG_STATUS_FILE" ]]; then
status=$(cat "$WATCHDOG_STATUS_FILE")
if [[ "$status" == "1" ]]; then
log_info "Watchdog status is 1: Condition met."
return 0
return 0
else
echo "Watchdog status is not 1: Current status is $status."
log_info "Watchdog status is not 1: Current status is $status."
fi
else
echo "Watchdog Status: Not available"
log_info "Watchdog Status: Not available"
fi

# Sleep for the specified interval to avoid stressing the CPU.
sleep "$MONITOR_INTERVAL"
done
Expand All @@ -321,13 +409,7 @@ monitor_link_wd() {
# return none
check_n_link()
{
# Write the current time with milliseconds to a temporary file
local current_time=$(date +%s%3N)
echo "$current_time" > "$WATCHDOG_TEMP_FILE"

# Atomically move the temporary file to the reset file
mv "$WATCHDOG_TEMP_FILE" "$WATCHDOG_RESET_FILE"

refresh_hw_management_done_wd
if [ -f "$1" ];
then
ln -sf "$1" "$2"
Expand Down
18 changes: 2 additions & 16 deletions usr/usr/bin/hw-management.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3076,22 +3076,8 @@ map_asic_pci_to_i2c_bus()

do_start()
{
# Create WD files.
touch "$WATCHDOG_RESET_FILE"
touch "$WATCHDOG_TEMP_FILE"
touch "$WATCHDOG_PID_FILE"
touch "$WATCHDOG_STATUS_FILE"
echo "0" > "$WATCHDOG_STATUS_FILE"

# Start the watchdog process if it's not running.
if [[ ! -f "$WATCHDOG_PID_FILE" ]] || ! kill -0 $(cat "$WATCHDOG_PID_FILE") 2>/dev/null; then
log_info "do_start: wd script created"
# Start the watchdog script in the background.
bash "$WATCHDOG_SCRIPT" &
# Save the PID of the watchdog process.
echo $! > "$WATCHDOG_PID_FILE"
fi

init_hw_management_done_wd_files
init_hw_management_done_wd
create_symbolic_links
check_cpu_type
pre_devtr_init
Expand Down

0 comments on commit 12a87b1

Please sign in to comment.