#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Every URL that looks like a social media link (including YouTube) is run through social-media-extract-profile-link;
# everything else is run through website-extract-social-media.
# This is repeated recursively until no new links are discovered.
# URLs are passed through url-normalise before and during processing so that equivalent but slightly different
# forms of the same URL do not produce duplicates; the output is additionally deduplicated within each section at the end.
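#
# Usage (sketch): wiki-recursive-extract-normalise [--verbose|-v] <page >page-plus-links
#
# Expected input shape, as inferred from the parsing below (not a formal spec):
# a '==' line starts a new section, and each URL is a '* ' bullet, optionally
# followed by ' | ' and a label, e.g.:
#   == Example section ==
#   * https://example.org/ | Example site
#   * https://www.youtube.com/user/example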
verbose=
while [[ $# -gt 0 ]]
do
	if [[ "$1" == '--verbose' || "$1" == '-v' ]]
	then
		verbose='--verbose'
	else
		echo "Unknown option: $1" >&2
		exit 1
	fi
	shift
done
# Print a message only if --verbose was given.
function verbose_echo {
	if [[ "${verbose}" ]]
	then
		echo "$@"
	fi
}

# Run a command, prefixing each line of its stderr with "[name]".
# An empty name defaults to the command's basename.
function stderr_annotate {
	name="$1"
	shift
	if [[ "${name}" == '' ]]; then name="${1##*/}"; fi
	"$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
}
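# For example (hypothetical command), stderr_annotate 'fetch' curl -sS "${someUrl}"
# would rewrite a stderr line 'curl: (6) Could not resolve host'
# as '[fetch] curl: (6) Could not resolve host'.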
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
declare -A sectionUrls
# Normalise the incoming page first, then process it line by line.
# The mawk stage at the very end deduplicates '* ' lines within each section.
stderr_annotate 'url-normalise/before' "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
	echo "${line}"
	if [[ "${line}" == '=='* ]]
	then
		# A '==' header starts a new section; reset the per-section URL set.
		verbose_echo "${line}" >&2
		unset sectionUrls
		declare -A sectionUrls
	fi
if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
then
url="${line:2}"
if [[ "${url}" == *' | '* ]]
then
url="${url%% | *}"
fi
if [[ "${sectionUrls[${url}]}" ]]
then
# Processed already, skip
continue
fi
sectionUrls["${url}"]=1
toProcess=("${url}")
while [[ ${#toProcess[@]} -gt 0 ]]
do
curUrl="${toProcess[0]}"
toProcess=("${toProcess[@]:1}")
if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
then
mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-social' "${scriptpath}/url-normalise" ${verbose})
else
mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-web' "${scriptpath}/url-normalise" ${verbose})
fi
for outUrl in "${outUrls[@]}"
do
if [[ "${sectionUrls[${outUrl}]}" ]]
then
# The discovered URL was processed already, skip it entirely
continue
else
# Not-yet-known URL, add to the list of URLs to process, mark as seen, and print
toProcess+=("${outUrl}")
sectionUrls["${outUrl}"]=1
echo "* ${outUrl}"
fi
done
done
fi
done | mawk -W interactive '! /^\*/ { print; } /^\*/ && !seen[$0]++ { print; } /^==/ { delete seen; }'
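
# A sketch of the final dedup stage above (with hypothetical URLs): given
#   == Section A ==
#   * https://example.org/
#   * https://example.org/
#   == Section B ==
#   * https://example.org/
# mawk prints the duplicate in Section A only once but keeps the copy in
# Section B, since 'seen' is cleared at every '==' header. Non-bullet lines
# pass through unchanged, and '-W interactive' keeps the output unbuffered.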