From b6ebb058e03e1346f67ed7abb90b70074fad52e4 Mon Sep 17 00:00:00 2001
From: Jarell <91372088+jarelllama@users.noreply.github.com>
Date: Mon, 1 Apr 2024 18:34:37 +0800
Subject: [PATCH] Tidy test_functions.sh

---
 functions/test_functions.sh | 390 ++++++++++++++++++++++++------------
 1 file changed, 266 insertions(+), 124 deletions(-)
diff --git a/functions/test_functions.sh b/functions/test_functions.sh
index 965245e79..19e1e937b 100644
--- a/functions/test_functions.sh
+++ b/functions/test_functions.sh
@@ -1,6 +1,10 @@
 #!/bin/bash
 
-# This script is used to test the various functions.
+# This script is used to test the various functions/scripts of this project.
+# Each test consist of an input file which will be processed by
+# the called script, and an output file which is the expected results
+# from the processing. The input and output files are compared to determine
+# the success or failure of the test.
 
 readonly RAW='data/raw.txt'
 readonly RAW_LIGHT='data/raw_light.txt'
@@ -16,32 +20,39 @@ readonly DEAD_DOMAINS='data/dead_domains.txt'
 readonly PARKED_DOMAINS='data/parked_domains.txt'
 readonly DOMAIN_LOG='config/domain_log.csv'
 
-function main {
-    : > "$RAW"  # Initialize raw file
-    sed -i '1q' "$DOMAIN_LOG"  # Initialize domain log file
+main() {
+    # Initialize
+    : > "$RAW"
+    sed -i '1q' "$DOMAIN_LOG"
+    error=false
 
     case "$1" in
         ('retrieve')
-            test_retrieve_validate "$1" ;;
+            TEST_RETRIEVE_VALIDATE "$1" ;;
         ('validate')
-            test_retrieve_validate "$1" ;;
+            TEST_RETRIEVE_VALIDATE "$1" ;;
         ('dead')
             test_dead_check ;;
         ('parked')
             test_parked_check ;;
         ('shellcheck')
-            shellcheck ;;
+            SHELLCHECK ;;
     esac
 
     exit 0
 }
 
-function shellcheck {
+# Function 'SHELLCHECK' runs ShellCheck for all scripts along with other checks
+# for common errors/mistakes.
+SHELLCHECK() {
+    # Download ShellCheck
     url='https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz'
-    wget -qO - "$url" | tar -xJ  # Download ShellCheck
+    wget -qO - "$url" | tar -xJ
+
     printf "%s\n" "$(shellcheck-stable/shellcheck --version)"
 
-    scripts=$(find . ! -path "./legacy/*" -type f -name "*.sh")  # Find scripts
+    # Find scripts
+    scripts=$(find . ! -path "./legacy/*" -type f -name "*.sh")
 
     # Run ShellCheck for each script
     while read -r script; do
@@ -65,76 +76,106 @@ function shellcheck {
     fi
 
     printf "\n[info] Scripts checked (%s):\n%s\n" "$(wc -l <<< "$scripts")" "$scripts"
-    [[ "$error" == true ]] && { printf "\n"; exit 1; }  # Exit with error if test failed
+
+    # Exit with error if test failed
+    if [[ "$error" == true ]]; then
+        printf "\n"
+        exit 1
+    fi
 }
 
-test_retrieve_validate() {
-    script_to_test="$1"
-    [[ -d data/pending ]] && rm -r data/pending  # Initialize pending directory
-    [[ "$script_to_test" == 'retrieve' ]] && mkdir data/pending
+# The 'test_<function>' scripts are to test individual functions within scripts
+# The input.txt file is to be processed by the called script. The out_raw.txt
+# file is the expected raw file after processing by the called script.
 
-    if [[ "$script_to_test" == 'retrieve' ]]; then
-        # Test removal of known dead domains
-        {
-            printf "dead-test.com\n"
-            printf "www.dead-test-2.com\n"
-        } > "$DEAD_DOMAINS"  # Sample data
-        {
-            printf "dead-test.com\n"
-            printf "www.dead-test-2.com\n"
-        } >> input.txt  # Input
-        # No expected output (dead domains check does not log)
-    fi
+# TEST: conversion from URLs to domains
+test_conversion() {
+    # INPUT
+    printf "https://conversion-test.com/folder/" >> input.txt
+    # EXPECTED OUTPUT
+    printf "conversion-test.com" >> out_raw.txt
+}
+
+# TEST: removal of known dead domains
+test_known_dead_removal() {
+    {
+        printf "dead-test.com\n"
+        printf "www.dead-test-2.com\n"
+    } >> "$DEAD_DOMAINS"  # Known dead domains
+    {
+        printf "dead-test.com\n"
+        printf "www.dead-test-2.com\n"
+    } >> input.txt  # INPUTS
+
+    # No expected output (dead domains check does not log)
+}
 
-    # Test removal of common subdomains
-    : > "$SUBDOMAINS"  # Initialize subdomains file
-    : > "$ROOT_DOMAINS"  # Initialize root domains file
+# TEST: removal of common subdomains
+test_subdomain_removal() {
     while read -r subdomain; do
         subdomain="${subdomain}.subdomain-test.com"
-        printf "%s\n" "$subdomain" >> input.txt  # Input
-        printf "%s\n" "$subdomain" >> out_subdomains.txt  # Expected output
-        grep -v 'www.' <(printf "subdomain,%s" "$subdomain") >> out_log.txt  # Expected output
+
+        # INPUT
+        printf "%s\n" "$subdomain" >> input.txt
+
+        # EXPECTED OUTPUTS
+        printf "%s\n" "$subdomain" >> out_subdomains.txt
+        grep -v 'www.' <(printf "subdomain,%s" "$subdomain") >> out_log.txt
     done < "$SUBDOMAINS_TO_REMOVE"
-    # Expected output
-    [[ "$script_to_test" == 'validate' ]] && printf "subdomain,www.subdomain-test.com\n" >> out_log.txt  # The Check script does not exclude 'www' subdomains
+
+    # EXPECTED OUTPUTS
+    if [[ "$script_to_test" == 'validate' ]]; then
+        # Only the retrieval script skips logging 'www.' subdomains
+        printf "subdomain,www.subdomain-test.com\n" >> out_log.txt
+    fi
     printf "subdomain-test.com\n" >> out_raw.txt
     printf "subdomain-test.com\n" >> out_root_domains.txt
+}
 
-    # Removal of domains already in raw file is redundant to test
-
-    if [[ "$script_to_test" == 'retrieve' ]]; then
-        # Test removal of known parked domains
-        printf "parked-domains-test.com\n" > "$PARKED_DOMAINS"  # Sample data
-        printf "parked-domains-test.com\n" >> input.txt  # Input
-        printf "parked,parked-domains-test.com\n" >> out_log.txt  # Expected output
-    fi
+# TEST: removal of know parked domains
+test_known_parked_removal() {
+    # Known parked domain
+    printf "parked-domains-test.com\n" >> "$PARKED_DOMAINS"
+    # INPUT
+    printf "parked-domains-test.com\n" >> input.txt
+    # EXPECTED OUTPUT
+    printf "parked,parked-domains-test.com\n" >> out_log.txt
+}
 
-    # Test removal of whitelisted domains and blacklist exclusion
-    # Sample data
-    printf "whitelist\n" > "$WHITELIST"
-    printf "whitelist-blacklisted-test.com\n" > "$BLACKLIST"
-    # Input
+# TEST: whitelisted domains removal
+test_whitelist_removal() {
+    # Sample whitelist term
+    printf "whitelist\n" >> "$WHITELIST"
+    # Sample blacklisted domain
+    printf "whitelist-blacklisted-test.com\n" >> "$BLACKLIST"
+    # INPUTS
     printf "whitelist-test.com\n" >> input.txt
     printf "whitelist-blacklisted-test.com\n" >> input.txt
-    # Expected output
+
+    # EXPECTED OUTPUTS
     printf "whitelist-blacklisted-test.com\n" >> out_raw.txt
     printf "whitelist,whitelist-test.com\n" >> out_log.txt
-    [[ "$script_to_test" == 'retrieve' ]] && printf "blacklist,whitelist-blacklisted-test.com\n" \
-        >> out_log.txt  # The check script does not log blacklisted domains
+    # The check script does not log blacklisted domains
+    [[ "$script_to_test" == 'check' ]] && return
+    printf "blacklist,whitelist-blacklisted-test.com\n" >> out_log.txt
+}
 
-    # Test removal of domains with whitelisted TLDs
+# TEST: removal of domains with whitelisted TLDs
+test_whitelisted_tld_removal() {
     {
         printf "white-tld-test.gov\n"
         printf "white-tld-test.edu\n"
         printf "white-tld-test.mil\n"
-    } >> input.txt  # Input
+    } >> input.txt  # INPUTS
     {
         printf "tld,white-tld-test.gov\n"
         printf "tld,white-tld-test.edu\n"
         printf "tld,white-tld-test.mil\n"
-    } >> out_log.txt  # Expected output
+    } >> out_log.txt  # EXPECTED OUTPUTS
+}
 
-    # Test removal of invalid entries and IP addresses
+# TEST: removal of invalid entries and IP addresses
+test_invalid_removal() {
     {
         printf "invalid-test-com\n"
         printf "100.100.100.100\n"
@@ -142,94 +183,170 @@ test_retrieve_validate() {
         printf "invalid-test.x\n"
         printf "invalid-test.100\n"
         printf "invalid-test.1x\n"
-    } >> input.txt  # Input
-    printf "invalid-test.xn--903fds\n" >> out_raw.txt  # Expected output
-    [[ "$script_to_test" == 'retrieve' ]] &&
-        {
-            printf "invalid-test-com\n"
-            printf "100.100.100.100\n"
-            printf "invalid-test.x\n"
-            printf "invalid-test.100\n"
-            printf "invalid-test.1x\n"
-        } >> out_manual.txt  # Expected output
+    } >> input.txt  # INPUTS
+
+    # EXPECTED OUTPUTS
+    printf "invalid-test.xn--903fds\n" >> out_raw.txt
     {
         printf "invalid,invalid-test-com\n"
         printf "invalid,100.100.100.100\n"
         printf "invalid,invalid-test.x\n"
         printf "invalid,invalid-test.100\n"
         printf "invalid,invalid-test.1x\n"
-    } >> out_log.txt  # Expected output
+    } >> out_log.txt
 
-    : > "$REDUNDANT_DOMAINS"  # Initialize redundant domains file
+    # Check script does not save invalid domains to manual review file
+    [[ "$script_to_test" == 'check' ]] && return
+
+    {
+        printf "invalid-test-com\n"
+        printf "100.100.100.100\n"
+        printf "invalid-test.x\n"
+        printf "invalid-test.100\n"
+        printf "invalid-test.1x\n"
+    } >> out_manual.txt
+}
+
+test_redundant_removal() {
     if [[ "$script_to_test" == 'retrieve' ]]; then
-        # Test removal of new redundant domains
-        printf "redundant-test.com\n" > "$WILDCARDS"  # Sample data
-        printf "redundant-test.com\n" >> out_wildcards.txt  # Wildcard should already be in expected wildcards file
-        printf "domain.redundant-test.com\n" >> input.txt  # Input
-        printf "redundant,domain.redundant-test.com\n" >> out_log.txt  # Expected output
-    elif [[ "$script_to_test" == 'validate' ]]; then
-        # Test addition of new wildcard from wildcard file (manually adding a new wildcard to wildcards file)
-        printf "domain.redundant-test.com\n" >> input.txt  # Sample data
-        printf "redundant-test.com\n" > "$WILDCARDS"  # Input
-        # Expected output
-        printf "redundant-test.com\n" >> out_raw.txt
+        # Test removal of redundant domains during retrieval
+        printf "redundant-test.com\n" > "$WILDCARDS"
+         # Wildcard should already be in expected wildcards file
         printf "redundant-test.com\n" >> out_wildcards.txt
-        printf "domain.redundant-test.com\n" >> out_redundant.txt
+        # INPUT
+        printf "domain.redundant-test.com\n" >> input.txt
+        # EXPECTED OUTPUT
         printf "redundant,domain.redundant-test.com\n" >> out_log.txt
+        return
     fi
+    # Test addition of new wildcard from wildcard file
+    # (manually adding a new wildcard to wildcards file)
+    # Existing redundant domain in raw file
+    printf "domain.redundant-test.com\n" >> input.txt
+    # INPUT
+    printf "redundant-test.com\n" > "$WILDCARDS"
+    # EXPECTED OUTPUTS
+    printf "redundant-test.com\n" >> out_raw.txt
+    printf "redundant-test.com\n" >> out_wildcards.txt
+    printf "domain.redundant-test.com\n" >> out_redundant.txt
+    printf "redundant,domain.redundant-test.com\n" >> out_log.txt
+}
 
-    # Test toplist check
-    if [[ "$script_to_test" == 'validate' ]]; then
-        printf "microsoft.com\n" >> input.txt  # Input
-        printf "microsoft.com\n" >> out_raw.txt  # Expected output
-    elif [[ "$script_to_test" == 'retrieve' ]]; then
-        printf "microsoft.com\n" >> data/pending/domains_guntab.com.tmp  # Input
-        # Expected output
-        printf "microsoft.com\n" >> out_pending.txt
+# TEST: removal of domains found in toplist
+test_toplist_removal() {
+    if [[ "$script_to_test" == 'retrieve' ]]; then
+        # INPUT
+        printf "microsoft.com\n" >> data/pending/domains_manual.tmp
+        # EXPECTED OUTPUTS
+        printf "microsoft.com\n" >> out_manual.txt
         printf "toplist,microsoft.com\n" >> out_log.txt
+        return
     fi
+    # INPUT
+    printf "microsoft.com\n" >> input.txt
+    # EXPECTED OUTPUTS
+    printf "microsoft.com\n" >> out_raw.txt
+    printf "toplist,microsoft.com\n" >> out_log.txt
+}
+
+# TEST: test exclusion of specific sources from light version
+test_light_build() {
+    cp "$RAW" "$RAW_LIGHT"
+    # INPUT
+    printf "raw-light-test.com\n" >> data/pending/domains_guntab.com.tmp
+    # EXPECTED OUTPUT
+    printf "raw-light-test.com\n" >> out_raw.txt
+    # Domain from excluded source should not be in output
+    grep -vF "raw-light-test.com" out_raw.txt > out_raw_light.txt
+}
+
+# Function 'TEST_RETRIEVE_VALIDATE' can test both the retrieval process and the
+# validation process depending on which argument is passed to the function.
+# $1: script to test, can either be 'retrieve' or 'validate'
+TEST_RETRIEVE_VALIDATE() {
+    script_to_test="$1"
+
+    # Initialize files
+    : > "$DEAD_DOMAINS"
+    : > "$SUBDOMAINS"
+    : > "$ROOT_DOMAINS"
+    : > "$PARKED_DOMAINS"
+    : > "$WHITELIST"
+    : > "$BLACKLIST"
+    : > "$REDUNDANT_DOMAINS"
+
+    # Initialize pending directory
+    [[ -d data/pending ]] && rm -r data/pending
+    [[ "$script_to_test" == 'retrieve' ]] && mkdir data/pending
+
+    # Note removal of domains already in raw file is redundant to test
 
-    # Test light raw file exclusion of specific sources
     if [[ "$script_to_test" == 'retrieve' ]]; then
-        cp "$RAW" "$RAW_LIGHT"
-        printf "raw-light-test.com\n" >> data/pending/domains_guntab.com.tmp  # Input
-        printf "raw-light-test.com\n" >> out_raw.txt  # Expected output
-        grep -vF "raw-light-test.com" out_raw.txt > out_raw_light.txt  # Expected output for light (source excluded from light)
-    elif [[ "$script_to_test" == 'validate' ]]; then
-        cp out_raw.txt out_raw_light.txt  # Expected output for light
+        test_conversion
+        test_known_dead_removal
+        test_known_parked_removal
+        test_light_build
     fi
 
+    test_subdomain_removal
+    test_whitelist_removal
+    test_whitelisted_tld_removal
+    test_invalid_removal
+    test_redundant_removal
+    test_toplist_removal
+
+    # Prepare and run retrieval script
     if [[ "$script_to_test" == 'retrieve' ]]; then
         # Distribute the sample input into various sources
         split -n l/3 input.txt
         mv xaa data/pending/domains_aa419.org.tmp
         mv xab data/pending/domains_google_search_search-term-1.tmp
         mv xac data/pending/domains_google_search_search-term-2.tmp
-        run_script "retrieve_domains.sh" "exit 0"
-    elif [[ "$script_to_test" == 'validate' ]]; then
-        cp input.txt "$RAW"  # Input
-        mv input.txt "$RAW_LIGHT"  # Input
-        run_script "validate_raw.sh" "exit 0"
+
+        run_script retrieve_domains.sh
     fi
 
-    check_output "$RAW" "out_raw.txt" "Raw"  # Check raw file
-    check_output "$RAW_LIGHT" "out_raw_light.txt" "Raw light"  # Check raw light file
-    check_output "$SUBDOMAINS" "out_subdomains.txt" "Subdomains"  # Check subdomains file
-    check_output "$ROOT_DOMAINS" "out_root_domains.txt" "Root domains"  # Check root domains file
+    # Prepare and run check script
+    if [[ "$script_to_test" == 'validate' ]]; then
+        # Use input.txt as sample raw files to test
+        cp input.txt "$RAW"
+        cp input.txt "$RAW_LIGHT"
+
+        # Expected output for light version
+        cp out_raw.txt out_raw_light.txt
+
+        run_script validate_raw.sh
+    fi
+
+    # Check and verify outputs
+
+    check_output "$RAW" out_raw.txt Raw
+    check_output "$RAW_LIGHT" out_raw_light.txt "Raw light"
+    check_output "$SUBDOMAINS" out_subdomains.txt Subdomains
+    check_output "$ROOT_DOMAINS" out_root_domains.txt "Root domains"
+
+    # Check entries saved for manual review
     if [[ "$script_to_test" == 'retrieve' ]]; then
-        check_output "data/pending/domains_guntab.com.tmp" "out_pending.txt" "Manual review"  # Check manual review file
-    elif [[ "$script_to_test" == 'validate' ]]; then
-        check_output "$REDUNDANT_DOMAINS" "out_redundant.txt" "Redundant domains"  # Check redundant domains file
-        check_output "$WILDCARDS" "out_wildcards.txt" "Wildcards"  # Check wildcards file
+        check_output "data/pending/domains_guntab.com.tmp" "out_manual.txt" "Manual review"
     fi
-    check_log  # Check log file
 
-    [[ "$error" != true ]] && printf "\e[1m[success] Test completed. No errors found\e[0m\n\n"
-    [[ "$log_error" != true ]] && printf "Log:\n%s\n" "$(<$DOMAIN_LOG)"
-    [[ "$error" == true ]] && { printf "\n"; exit 1; }  # Exit with error if test failed
+    if [[ "$script_to_test" == 'validate' ]]; then
+        check_output "$REDUNDANT_DOMAINS" "out_redundant.txt" "Redundant domains"
+        check_output "$WILDCARDS" "out_wildcards.txt" "Wildcards"
+    fi
+
+    check_log
+
+    on_exit
 }
 
-function test_dead_check {
+# Function 'test_dead_check' tests the removal/addition of dead and resurrected
+# domains respectively.
+test_dead_check() {
+    # Initialize files
+    : > "$SUBDOMAINS"
+    : > "$ROOT_DOMAINS"
+
     # Test addition of resurrected domains
     # Input
     printf "www.google.com\n" > "$DEAD_DOMAINS"  # Subdomains should be stripped
@@ -239,16 +356,19 @@ function test_dead_check {
     printf "584031dead-domain-test.com\n" >> out_dead.txt
     printf "resurrected,google.com,dead_domains_file\n" >> out_log.txt
 
-    # Test removal of dead domains with subdomains
-    : > "$SUBDOMAINS"  # Initialize subdomains file
-    printf "584308-dead-subdomain-test.com\n" >> "$RAW"  # Input
-    printf "584308-dead-subdomain-test.com\n" > "$ROOT_DOMAINS"  # Input
+    # TEST: removal of dead domains with subdomains
+    # Dead root domains
+    printf "584308-dead-subdomain-test.com\n" >> "$RAW"
+    printf "584308-dead-subdomain-test.com\n" >> "$ROOT_DOMAINS"
     while read -r subdomain; do
         subdomain="${subdomain}.584308-dead-subdomain-test.com"
-        printf "%s\n" "$subdomain" >> "$SUBDOMAINS"  # Input
-        printf "%s\n" "$subdomain" >> out_dead.txt  # Expected output
+        # INPUT
+        printf "%s\n" "$subdomain" >> "$SUBDOMAINS"
+        # EXPECTED OUTPUT
+        printf "%s\n" "$subdomain" >> out_dead.txt
     done < "$SUBDOMAINS_TO_REMOVE"
-    printf "%s\n" "dead,584308-dead-subdomain-test.com,raw" >> out_log.txt  # Expected output
+    # EXPECTED OUTPUT
+    printf "%s\n" "dead,584308-dead-subdomain-test.com,raw" >> out_log.txt
 
     # Test removal of dead redundant domains and wildcards
     : > "$REDUNDANT_DOMAINS"  # Initialize redundant domains file
@@ -341,20 +461,42 @@ function test_parked_check {
     pick_exit
 }
 
-function run_script {
-    for file in out_*; do  # Format expected output files
-        [[ "$file" != out_dead.txt ]] && [[ "$file" != out_parked.txt ]] && sort "$file" -o "$file"
+# Function 'run_script' executes the script passed by the caller.
+# Input:
+#   $1: scrip to execute
+# Output:
+#   return 1 if script has an exit status of 1
+run_script() {
+    # Format expected output files
+    for file in out_*; do
+        if [[ "$file" != out_dead.txt && "$file" != out_parked.txt ]]; then
+            sort "$file" -o "$file"
+        fi
     done
+
     printf "\e[1m[start] %s\e[0m\n" "$1"
     printf "%s\n" "----------------------------------------------------------------------"
+
+    # Run script
     bash "functions/${1}" || errored=true
+
     printf "%s\n" "----------------------------------------------------------------------"
-    [[ -z "$2" ]] && [[ "$errored" == true ]] && { printf "\e[1m[warn] Script returned an error\e[0m\n"; error=true; }  # Check exit status
+
+    # Return 1 if script has an exit status of 1
+    if [[ "$errored" == true ]]; then
+        return 1
+    fi
 }
 
-pick_exit() {
+# Function 'on_exit' checks if the script should exit with an
+# exit status of 1 or 0.
+on_exit() {
+    # Test completed successfully
     [[ "$error" != true ]] && printf "\e[1m[success] Test completed. No errors found\e[0m\n\n"
+
+    # Print log if not already printed by 'check_log' function
     [[ "$log_error" != true ]] && printf "Log:\n%s\n" "$(<$DOMAIN_LOG)"
+
     # Exit with error if test failed
     [[ "$error" == true ]] && { printf "\n"; exit 1; }
 }