HPCC-32791 Partition the index LRU cache to reduce contention #440
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Test Hyperlinks | |
on: | |
pull_request: | |
branches: | |
- "master" | |
- "candidate-*" | |
- "!candidate-9.4.*" | |
- "!candidate-9.2.*" | |
- "!candidate-9.0.*" | |
- "!candidate-8.*" | |
- "!candidate-7.*" | |
- "!candidate-6.*" | |
workflow_call: | |
inputs: | |
event-type: | |
type: string | |
default: "workflow_call" | |
required: false | |
file-path: | |
type: string | |
description: Specify the path for the directory or file. To specify multiple directories or files, separate them by Commas(,). Eg. docs/EN_US,docs/PT_BR | |
default: "docs/" | |
required: false | |
file-type: | |
type: string | |
description: Specify the files which need to be scanned (md/xml/rst). To specify multiple file types separate them by Commas(,). Eg. xml,md | |
default: 'xml' | |
required: false | |
debug-mode: | |
type: boolean | |
description: Run in Debug mode to upload all created files | |
default: false | |
required: false | |
workflow_dispatch: | |
inputs: | |
file-path: | |
type: string | |
description: Specify the path for the directory or file. To specify multiple directories or files, separate them by Commas(,). Eg. docs/EN_US,devdoc/ | |
default: "/" | |
required: false | |
file-type: | |
type: string | |
description: Specify the files which need to be scanned (md/xml/rst). To specify multiple file types separate them by Commas(,). Eg. xml,md | |
default: 'xml,md,rst' | |
required: false | |
debug-mode: | |
type: boolean | |
description: Run in Debug mode to upload all created files | |
default: false | |
required: false | |
jobs: | |
main: | |
runs-on: ubuntu-22.04 | |
steps: | |
- name: Checkout repository | |
uses: actions/checkout@v4 | |
with: | |
repository: hpcc-systems/HPCC-Platform | |
fetch-depth: 2 | |
- name: List Documentation files | |
run: | | |
# Determine the event type that triggered this workflow | |
# When a workflow is triggered by `workflow_call`, it doesn't explicitly provide | |
# the event type of the call. Instead, it shares the event context of the calling workflow. | |
# To identify if the workflow was triggered by `workflow_call`, we use an input parameter | |
# called `event-type`. If this input is provided, it helps us identify that the workflow was | |
# triggered by `workflow_call`. If the input is not present, we use the github.event_name to determine the event. | |
if [ -n "${{ inputs.event-type }}" ]; then | |
EVENT_TYPE="${{ inputs.event-type }}" | |
else | |
EVENT_TYPE="${{ github.event_name }}" | |
fi | |
touch xmlFilesList.txt mdFilesList.txt rstFilesList.txt | |
if [[ "${EVENT_TYPE}" == "workflow_dispatch" || "${EVENT_TYPE}" == "workflow_call" ]]; then | |
IFS=',' read -a DIR_LIST <<< "${{ inputs.file-path }}" | |
IFS=',' read -a FILE_TYPE_LIST <<< "${{ inputs.file-type }}" | |
for DIR in ${DIR_LIST[@]} | |
do | |
DIR=${PWD}/${DIR} #gets the complete path | |
DIR=$( realpath ${DIR} ) #gets the actual path ex: HPCC-Platform//docs --> HPCC-Platform/docs | |
if [[ -f ${DIR} ]]; then #if the specified path points to a file append it to respective list | |
FILE_TYPE=${DIR##*.} #extract the file extension | |
echo ${DIR} | tee -a ${FILE_TYPE}FilesList.txt | |
continue | |
fi | |
for FILE_TYPE in ${FILE_TYPE_LIST[@]} | |
do | |
FILE_TYPE=${FILE_TYPE#.} #remove leading dot(.) if present | |
FILE_TYPE=${FILE_TYPE,,} #convert the FILE_TYPE to lowercase | |
find ${DIR} -name "*.${FILE_TYPE}" -type f | tee -a ${FILE_TYPE}FilesList.txt | |
# remove if any duplicate files are present | |
sort -u ${FILE_TYPE}FilesList.txt -o ${FILE_TYPE}FilesList.txt | |
done | |
done | |
elif [[ "${EVENT_TYPE}" == "pull_request" ]]; then | |
git diff --name-only HEAD^1 HEAD > updatedFiles.txt | |
cat updatedFiles.txt | grep -E "*\.xml" | tee xmlFilesList.txt | |
cat updatedFiles.txt | grep -E "*\.md" | tee mdFilesList.txt | |
cat updatedFiles.txt | grep -E "*\.rst" | tee rstFilesList.txt | |
fi | |
- name: List links from Documentation files | |
run: | | |
IFS=$'\n' | |
touch missingFiles.txt | |
for FILE in $( cat xmlFilesList.txt ) | |
do | |
#check if the file is missing | |
if [[ ! -f $FILE ]]; then | |
echo -e "$FILE -\e[31m file missing\e[0m" | |
echo $FILE >> missingFiles.txt | |
continue | |
fi | |
grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" ${FILE} | sed 's/url="//' > links.tmp | |
FLAG=0 | |
for LINE in $( cat links.tmp ) | |
do | |
LINK=$( echo $LINE | cut -d ':' -f3- ) | |
if [[ ${LINK:0:6} == '<ulink' ]]; then | |
FLAG=1 | |
continue | |
elif [[ ${LINK:0:8} == '</ulink>' ]]; then | |
FLAG=0 | |
continue | |
fi | |
if [[ $FLAG -eq 1 ]]; then | |
echo $LINE >> linksList.txt | |
fi | |
done | |
done | |
for FILE in $( cat mdFilesList.txt ) | |
do | |
#check if the file is missing | |
if [[ ! -f $FILE ]]; then | |
echo -e "$FILE -\e[31m file missing\e[0m" | |
echo $FILE >> missingFiles.txt | |
continue | |
fi | |
grep -onHE -e "\]\([^\)]+" -e "\`\`\`" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" ${FILE} | sed 's/](//' > links.tmp | |
FLAG=0 | |
for LINE in $( cat links.tmp ) | |
do | |
LINK=$( echo $LINE | cut -d ':' -f3- ) | |
if [[ ${LINK:0:3} == "\`\`\`" ]]; then | |
FLAG=$(( 1 - FLAG )) | |
continue | |
fi | |
if [[ $FLAG -eq 0 ]]; then | |
echo $LINE >> linksList.txt | |
fi | |
done | |
done | |
for FILE in $( cat rstFilesList.txt ) | |
do | |
#check if the file is missing | |
if [[ ! -f $FILE ]]; then | |
echo -e "$FILE -\e[31m file missing\e[0m" | |
echo $FILE >> missingFiles.txt | |
continue | |
fi | |
grep -onHE -e ".. _[^\]+" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" ${FILE} | sed 's/.. _[^\:]*: //' >> linksList.txt | |
done | |
if [[ -f linksList.txt ]]; then | |
echo "External links: " | |
cat linksList.txt | grep -vE '127.0.0.1|localhost|\$|\[' | grep -E 'https://|http://' | tee externalLinks.txt | |
echo -e "\nInternal links: " | |
cat linksList.txt | grep -vE '127.0.0.1|localhost|\$|\[' | grep -vE 'https://|http://' | tee internalLinks.txt | |
fi | |
- name: Test External links | |
run: | | |
touch checkedLinksCache.txt | |
IFS=$'\n' | |
if [[ -f externalLinks.txt ]]; then | |
for LINE in $(cat externalLinks.txt ) | |
do | |
LINK=$( echo $LINE | cut -d ':' -f3- ) | |
LINK=${LINK%.} #remove trailing dot(.) | |
LINK=${LINK% } #remove trailing space | |
CHECK_CACHE=$( cat checkedLinksCache.txt | grep "$LINK~" | wc -w ) | |
TRY=3 #Max attempts to check status code of hyperlinks | |
if [[ $CHECK_CACHE -eq 0 ]]; then | |
while [[ $TRY -ne 0 ]] | |
do | |
HTTP_RESPONSE_CODE=$( curl -o /dev/null -m 60 -sL -w "%{response_code}" $LINK ) || true | |
if [[ $HTTP_RESPONSE_CODE -ne 0 ]]; then | |
echo "$LINK~$HTTP_RESPONSE_CODE" >> checkedLinksCache.txt | |
break | |
else | |
echo $LINE | |
echo "retrying..." | |
TRY=$(( TRY - 1)) | |
fi | |
done | |
else | |
HTTP_RESPONSE_CODE=$( cat checkedLinksCache.txt | grep "$LINK~" | cut -d '~' -f2 ) | |
fi | |
if [[ $HTTP_RESPONSE_CODE -eq 404 ]]; then | |
echo -e "${LINK} - \e[31m404 Error\e[0m" | |
echo "${LINE}" >> error-report.log | |
elif [[ $HTTP_RESPONSE_CODE -eq 0 ]]; then | |
HTTP_ERROR_MESSAGE=$( curl -o /dev/null -m 60 -sSL $LINK 2>&1) || true | |
echo -e "${LINK} - \e[31m${HTTP_ERROR_MESSAGE}\e[0m" | |
HTTP_ERROR_MESSAGE=$( echo $HTTP_ERROR_MESSAGE | sed 's/ /-/g' ) | |
echo "${LINE}(${HTTP_ERROR_MESSAGE})" >> error-report.log | |
else | |
echo "${LINK} - ${HTTP_RESPONSE_CODE}" | |
fi | |
done | |
fi | |
- name: Test Internal Links | |
run: | | |
if [[ -f internalLinks.txt ]]; then | |
for LINE in $( cat internalLinks.txt ) | |
do | |
REFERENCE=$( echo $LINE | cut -d ':' -f3- ) | |
FILE=$( echo $LINE | cut -d ':' -f1 ) | |
if [[ ${REFERENCE:0:1} == '#' ]]; then | |
LINK_TEXT=$( cat $FILE | grep -oE "\[.*\]\(${REFERENCE}\)" | sed 's/\[//' | cut -d ']' -f1 ) | |
IS_PRESENT=$(cat $FILE | grep -oE "# ${LINK_TEXT}" | wc -w) | |
if [[ $IS_PRESENT -eq 0 ]]; then | |
echo -e "${LINE} -\e[31m invalid reference\e[0m" | |
echo "${LINE}" >> error-report.log | |
else | |
echo -e "${LINE} -\e[32m valid reference\e[0m" | |
fi | |
else | |
if [[ ${REFERENCE:0:1} == '/' ]]; then | |
BASE_DIR=$PWD | |
else | |
BASE_DIR=${FILE/$( basename $FILE )} | |
fi | |
SEARCH_PATH="$BASE_DIR/${REFERENCE}" | |
SEARCH_PATH=$( realpath $SEARCH_PATH ) | |
# if it is neither a valid file nor valid a directory, then it is an invalid reference | |
if [[ ! -f $SEARCH_PATH && ! -d $SEARCH_PATH ]]; then | |
echo -e "${LINE} -\e[31m invalid reference\e[0m" | |
echo ${LINE/$REFERENCE/$SEARCH_PATH} >> error-report.log | |
else | |
echo -e "${LINE} -\e[32m valid reference\e[0m" | |
fi | |
fi | |
done | |
fi | |
- name: Report Error links | |
run: | | |
if [[ -f error-report.log ]]; then | |
NUMBER_OF_404_LINKS=$( cat error-report.log | wc -l ) | |
fi | |
echo -e "\e[32mNo. of files scanned : $( cat *FilesList.txt | wc -l )\e[0m" | |
if [[ $NUMBER_OF_404_LINKS -ne 0 ]]; then | |
echo -e "\e[31mNo. of unique broken links : $( cat error-report.log | cut -d: -f3- | sort | uniq | wc -l )\e[0m" | |
echo -e "\e[31mTotal No. of reference to broken links : $( cat error-report.log | cut -d: -f3- | sort | wc -l )\e[0m" | |
echo "Checkout the log artifact in the summary page for more details about the broken links." | |
echo "Note: If any of the reported broken links are just example links or placeholders and are not valid links, please enclose them in triple backticks to ignore them." | |
echo "For example: \`\`\`https://This/is/not/a/valid/link.com\`\`\`" | |
exit -1 | |
else | |
echo -e "\e[32mNo Broken-links found\e[0m" | |
fi | |
- name: Modify log file | |
if: ${{ failure() || cancelled() }} | |
run: | | |
BASE_DIR=${PWD} | |
BASE_DIR=$(echo $BASE_DIR | sed 's/\//\\\//g') | |
sed -i "s/${BASE_DIR}/HPCC-Platform/g" error-report.log | |
FILE_NAMES_LIST=$(cat error-report.log | cut -d ':' -f1 | sort | uniq ) | |
FILE_COUNT=1 | |
for LINE in $FILE_NAMES_LIST | |
do | |
LINKS_LIST=$( cat error-report.log | grep $LINE | cut -d ':' -f2- ) | |
echo "$FILE_COUNT. $LINE" >> error-reportTmp.log | |
FILE_COUNT=$(( FILE_COUNT + 1)) | |
for LINK in $LINKS_LIST | |
do | |
echo -e "\t Line $LINK" | sed 's/:/ : /' >> error-reportTmp.log | |
done | |
done | |
if [[ $(cat missingFiles.txt | wc -w ) -eq 0 ]]; then | |
echo -e "Broken links: \n" > error-report.log | |
cat error-reportTmp.log >> error-report.log | |
else | |
echo -e "Missing Files:" > error-report.log | |
FILE_COUNT=1 | |
for FILE in $( cat missingFiles.txt ) | |
do | |
echo -e "${FILE_COUNT}. ${FILE}" >> error-report.log | |
FILE_COUNT=$(( FILE_COUNT + 1 )) | |
done | |
echo -e "\nBroken links: \n" >> error-report.log | |
cat error-reportTmp.log >> error-report.log | |
fi | |
if [ -z ${{ inputs.debug-mode }} ]; then | |
DEBUG_MODE=false | |
else | |
DEBUG_MODE=${{ inputs.debug-mode }} | |
fi | |
if [[ ${{ github.event_name }} == "pull_request" || $DEBUG_MODE == false ]]; then | |
rm -rf *FilesList.txt \ | |
checkedLinksCache.txt \ | |
*Links.txt \ | |
linksList.txt | |
fi | |
- name: Upload logs | |
uses: actions/upload-artifact@v4 | |
if: ${{ failure() || cancelled() || inputs.debug-mode == 'true'}} | |
with: | |
name: Hyperlinks-testing-log | |
path: | | |
./error-report.log | |
./*FilesList.txt | |
./checkedLinksCache.txt | |
./*Links.txt | |
./linksList.txt | |
if-no-files-found: ignore |