From aa30ff4d4f204f3ad1fc08c1d1d21f48a4cb40c1 Mon Sep 17 00:00:00 2001
From: Roy Cohen
Date: Mon, 27 May 2024 16:41:43 +0300
Subject: [PATCH] hw-mgmt: scripts: Added hw-mgmt done WatchDog.

Signed-off-by: Roy Cohen
---
Review notes (ignored by git-am, not part of the commit message):
- WATCHDOG_FIRST_RUN_FILE was a relative path ("tmp/..."); it is now
  absolute so the watchdog and its clients agree on the marker file
  regardless of their working directory.
- init_hw_management_done_wd_files: the post-init check always set
  file_exist=false and then tested it as a non-empty string, so the error
  path could never trigger. The check now really tests each file.
- refresh_hw_management_done_wd: reset file A always exists after init, so
  every update after the first went to file B. The older of the two files
  is now replaced, which gives the alternation the comments describe.
- hw-management-done.sh reads the reset files through the
  WATCHDOG_RESET_FILE_A/B variables instead of repeating the paths.
- Indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy of this patch, and the blob hashes on the
  "index" lines predate the edits above; regenerate with git format-patch
  before applying.

 usr/usr/bin/hw-management-done.sh    |  33 ++++----
 usr/usr/bin/hw-management-helpers.sh | 129 +++++++++++++++++++++++----
 usr/usr/bin/hw-management.sh         |  18 +---
 3 files changed, 132 insertions(+), 48 deletions(-)

diff --git a/usr/usr/bin/hw-management-done.sh b/usr/usr/bin/hw-management-done.sh
index cbc02bc1b..9200a6228 100755
--- a/usr/usr/bin/hw-management-done.sh
+++ b/usr/usr/bin/hw-management-done.sh
@@ -32,34 +32,39 @@
 #
 
 source hw-management-helpers.sh
-log_info "hw-mngmt done called"
+log_info "hw-mngmt-done called."
+
+# Wait until the first run file is created by check_n_link.
+while [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; do
+    sleep "$WATCHDOG_DELAY"
+done
+
+log_info "first run file created, starting Watchdog logic."
 
 # Watchdog loop
 while true; do
     # Get the current time with milliseconds.
     current_time=$(date +%s%3N)
-
-    # Read the last reset time atomically.
-    if [[ -f "$WATCHDOG_RESET_FILE" ]]; then
-        last_reset_time=$(cat "$WATCHDOG_RESET_FILE")
-    else
-        last_reset_time=$current_time
-    fi
-
+    # Read the last reset time atomically by checking both files.
+    last_reset_time_A=$(cat "$WATCHDOG_RESET_FILE_A" 2>/dev/null || echo 0)
+    last_reset_time_B=$(cat "$WATCHDOG_RESET_FILE_B" 2>/dev/null || echo 0)
+    # Determine which file has the most recent reset time.
+    last_reset_time=$(( last_reset_time_A > last_reset_time_B ? last_reset_time_A : last_reset_time_B ))
     # Calculate the time difference in milliseconds.
     time_diff=$((current_time - last_reset_time))
-
     # Check if the time difference exceeds the timeout in milliseconds.
     if [ "$time_diff" -ge $((WATCHDOG_TIMEOUT * 1000)) ]; then
-        # Update the status file.
-        echo "1" > "$WATCHDOG_STATUS_FILE"
+        # Update the status file atomically using a temporary file.
+        echo "1" > "$WATCHDOG_STATUS_TEMP_FILE"
+        mv "$WATCHDOG_STATUS_TEMP_FILE" "$WATCHDOG_STATUS_FILE" # Atomic move.
         # Update syslog.
+        log_info "current_time $current_time"
+        log_info "last_reset_time $last_reset_time"
+        log_info "WD time diff = $time_diff"
         log_info "hw-management script done!"
-
         # Exit the watchdog.
         exit 0
     fi
-
     # Sleep for the specified delay period in seconds.
     sleep "$WATCHDOG_DELAY"
 done
\ No newline at end of file
diff --git a/usr/usr/bin/hw-management-helpers.sh b/usr/usr/bin/hw-management-helpers.sh
index f8a081a06..dc211fff4 100755
--- a/usr/usr/bin/hw-management-helpers.sh
+++ b/usr/usr/bin/hw-management-helpers.sh
@@ -32,14 +32,18 @@
 #
 
 # HW-MGMT WATCHDOG GLOBALS
-WATCHDOG_TIMEOUT=5
-WATCHDOG_DELAY=1
-WATCHDOG_RESET_FILE="/tmp/watchdog_reset_time"
-WATCHDOG_TEMP_FILE="/tmp/watchdog_reset_time.tmp"
+WATCHDOG_TIMEOUT=5    # Total WD T/O, in seconds.
+WATCHDOG_DELAY=1      # Internal delay (seconds) for WD loop to free CPU.
+MONITOR_INTERVAL=1    # Internal delay (seconds) for Monitor WD loop to free CPU.
+WATCHDOG_FIRST_RUN_FILE="/tmp/watchdog_first_run"
+WATCHDOG_RESET_FILE_A="/tmp/watchdog_reset_time_a"
+WATCHDOG_RESET_FILE_B="/tmp/watchdog_reset_time_b"
+WATCHDOG_TIMESTAMP_TEMP_FILE="/tmp/watchdog_reset_time.tmp"
 WATCHDOG_PID_FILE="/tmp/watchdog.pid"
 WATCHDOG_STATUS_FILE="/tmp/watchdog_status"
+WATCHDOG_STATUS_TEMP_FILE="/tmp/watchdog_status.tmp"
 WATCHDOG_SCRIPT="hw-management-done.sh"
-MONITOR_INTERVAL=2
+
 
 hw_management_path=/var/run/hw-management
 environment_path=$hw_management_path/environment
@@ -294,22 +298,117 @@ check_simx()
 	fi
 }
 
+# Reset all hw-management-done watchdog files to a clean state.
+# Removes and re-creates every file listed in WD_FILES, removes the
+# first-run marker and writes "0" to the status file.
+# Logs an error and exits with status 1 if the resulting state is wrong.
+init_hw_management_done_wd_files()
+{
+    WD_FILES=(
+        "$WATCHDOG_RESET_FILE_A"
+        "$WATCHDOG_RESET_FILE_B"
+        "$WATCHDOG_PID_FILE"
+        "$WATCHDOG_STATUS_FILE"
+        "$WATCHDOG_STATUS_TEMP_FILE"
+        "$WATCHDOG_TIMESTAMP_TEMP_FILE"
+    )
+
+    # Remove all WD files if they exist from previous runs.
+    # They might contain garbage.
+    for FILE in "${WD_FILES[@]}"; do
+        [ -f "$FILE" ] && rm "$FILE"
+        touch "$FILE"
+    done
+
+    # WATCHDOG_FIRST_RUN_FILE is created the first time
+    # check_n_link function is called.
+    if [[ -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
+        rm "$WATCHDOG_FIRST_RUN_FILE"
+    fi
+
+    echo "0" > "$WATCHDOG_STATUS_FILE"
+
+    # Verify that every WD file was really (re)created.
+    file_exist=true
+    for FILE in "${WD_FILES[@]}"; do
+        if [ ! -f "$FILE" ]; then
+            file_exist=false
+            break
+        fi
+    done
+    # In case one of the WD files was not created,
+    # or in case the first run file was not removed,
+    # exit with error.
+    if [ "$file_exist" != true ] || [ -f "$WATCHDOG_FIRST_RUN_FILE" ]; then
+        log_info "Error init. WD files."
+        exit 1
+    fi
+
+    log_info "Successfully init. WD files."
+}
+
+# Start WATCHDOG_SCRIPT in the background and store its PID in
+# WATCHDOG_PID_FILE, unless the PID already stored there is still alive.
+init_hw_management_done_wd()
+{
+    # Start the watchdog process if it's not running.
+    if [[ ! -f "$WATCHDOG_PID_FILE" ]] || ! kill -0 "$(cat "$WATCHDOG_PID_FILE")" 2>/dev/null; then
+        log_info "HW Mangement WD process created."
+        # Start the watchdog process in the background.
+        bash "$WATCHDOG_SCRIPT" &
+        # Save the PID of the watchdog process.
+        echo $! > "$WATCHDOG_PID_FILE"
+    else
+        log_info "WD Process already running."
+    fi
+}
+
+# Record the current time (epoch milliseconds) as the latest WD reset time.
+# The first call also creates WATCHDOG_FIRST_RUN_FILE, which the watchdog
+# script waits for before it starts comparing timestamps.
+refresh_hw_management_done_wd()
+{
+    # Write the current time with milliseconds to a temporary file.
+    local current_time=$(date +%s%3N)
+    echo "$current_time" > "$WATCHDOG_TIMESTAMP_TEMP_FILE"
+    # In case this is the first call to check_n_link,
+    # Signal the WD to start monitoring.
+    # This will ensure that the WD uses valid timestamps.
+    if [[ ! -f "$WATCHDOG_FIRST_RUN_FILE" ]]; then
+        mv "$WATCHDOG_TIMESTAMP_TEMP_FILE" "$WATCHDOG_RESET_FILE_A"
+        touch "$WATCHDOG_FIRST_RUN_FILE"
+        log_info "[A] first run check_n_link $current_time"
+    else
+        # Use double-buffering:
+        # Alternate between the two reset files by always replacing
+        # the older one, so the newest timestamp is never the target.
+        if [[ "$WATCHDOG_RESET_FILE_A" -nt "$WATCHDOG_RESET_FILE_B" ]]; then
+            # Atomically move the temporary file to the reset file
+            mv "$WATCHDOG_TIMESTAMP_TEMP_FILE" "$WATCHDOG_RESET_FILE_B"
+            log_info "[B] check_n_link $current_time"
+        else
+            mv "$WATCHDOG_TIMESTAMP_TEMP_FILE" "$WATCHDOG_RESET_FILE_A"
+            log_info "[A] check_n_link $current_time"
+        fi
+    fi
+}
+
 # Function to monitor the watchdog status.
-monitor_link_wd() {
-	log_info "monitor_link_wd was called"
+monitor_link_wd()
+{
+    log_info "monitor_link_wd was called"
     while true; do
         if [[ -f "$WATCHDOG_STATUS_FILE" ]]; then
             status=$(cat "$WATCHDOG_STATUS_FILE")
             if [[ "$status" == "1" ]]; then
                 log_info "Watchdog status is 1: Condition met."
-		return 0
+                return 0
             else
-                echo "Watchdog status is not 1: Current status is $status."
+                log_info "Watchdog status is not 1: Current status is $status."
             fi
         else
-            echo "Watchdog Status: Not available"
+            log_info "Watchdog Status: Not available"
         fi
-
         # Sleep for the specified interval to avoid stressing the CPU.
         sleep "$MONITOR_INTERVAL"
     done
@@ -321,13 +420,7 @@ monitor_link_wd() {
 # return none
 check_n_link()
 {
-    # Write the current time with milliseconds to a temporary file
-    local current_time=$(date +%s%3N)
-    echo "$current_time" > "$WATCHDOG_TEMP_FILE"
-
-    # Atomically move the temporary file to the reset file
-    mv "$WATCHDOG_TEMP_FILE" "$WATCHDOG_RESET_FILE"
-
+	refresh_hw_management_done_wd
 	if [ -f "$1" ];
 	then
 		ln -sf "$1" "$2"
diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh
index 29238a359..5206ae44e 100755
--- a/usr/usr/bin/hw-management.sh
+++ b/usr/usr/bin/hw-management.sh
@@ -3076,22 +3076,8 @@ map_asic_pci_to_i2c_bus()
 
 do_start()
 {
-    # Create WD files.
-    touch "$WATCHDOG_RESET_FILE"
-    touch "$WATCHDOG_TEMP_FILE"
-    touch "$WATCHDOG_PID_FILE"
-    touch "$WATCHDOG_STATUS_FILE"
-    echo "0" > "$WATCHDOG_STATUS_FILE"
-
-    # Start the watchdog process if it's not running.
-    if [[ ! -f "$WATCHDOG_PID_FILE" ]] || ! kill -0 $(cat "$WATCHDOG_PID_FILE") 2>/dev/null; then
-        log_info "do_start: wd script created"
-        # Start the watchdog script in the background.
-        bash "$WATCHDOG_SCRIPT" &
-        # Save the PID of the watchdog process.
-        echo $! > "$WATCHDOG_PID_FILE"
-    fi
-
+	init_hw_management_done_wd_files
+	init_hw_management_done_wd
 	create_symbolic_links
 	check_cpu_type
 	pre_devtr_init