url-normalise

#!/bin/bash
# Takes a list of URLs from stdin (optionally in new-viewer style wiki format) and normalises every URL as follows:
# - For social media URLs, the correct capitalisation is extracted and extraneous parameters are removed.
# - For YouTube user or channel URLs, the canonical base URL is extracted.
# - For anything else, retrieval is attempted and the final, post-redirect URL is used. (To not follow redirects, use --other-no-redirects.)
# Lines that are not URLs are printed back without normalisation. Use --verbose (or -v) to report on stderr which kind of normalisation is applied to each URL.

# Command-line handling.
#   --other-no-redirects  clears the redirect-following flag used for "other" URLs
#   --verbose, -v         turns on the progress messages printed by verbose_echo
# Any other argument aborts the script with exit status 1.
otherCurlRedirectOpt='-L'
verbose=
while (( $# > 0 ))
do
	case "$1" in
		--other-no-redirects)
			otherCurlRedirectOpt=
			;;
		--verbose | -v)
			verbose=1
			;;
		*)
			echo "Unknown option: $1" >&2
			exit 1
			;;
	esac
	shift
done

# Print the given arguments with echo, but only when the global ${verbose} is non-empty.
# Globals:   verbose (read)
# Arguments: passed straight through to echo
# Returns:   0 when silent, otherwise the status of echo
verbose_echo() {
	[[ -n "${verbose}" ]] || return 0
	echo "$@"
}

# Browser-like User-Agent string, passed to curl via -A "${userAgent}" by the requests further down in this script.
userAgent='Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0'

# Main loop: read stdin line by line and print one output line per input line on stdout.
#
# Globals read: otherCurlRedirectOpt, userAgent, verbose_echo (all defined earlier in this script).
# Per line:
#   1. Lines that are not an http(s) URL (optionally preceded by the wiki bullet '* ') are printed back as read.
#   2. The bullet prefix and an optional ' | …' suffix are split off and re-attached to the output.
#   3. The host part of the URL is lower-cased.
#   4. The URL is dispatched to the Facebook / Twitter / Instagram / YouTube handler, or to the generic
#      "fetch it and report the effective URL" handler.
# Whenever a handler cannot produce a result, a message goes to stderr and the original line is printed.
#
# read (default IFS) strips leading/trailing whitespace from each line; the '|| [[ … ]]' part makes sure
# a final line without a trailing newline is still processed instead of being dropped.
while read -r line || [[ "${line}" ]]
do
	# printf instead of echo so that lines such as '-n' or '-e' are reproduced rather than parsed as options.
	if [[ "${line}" != 'http://'* && "${line}" != 'https://'* && "${line}" != '* http://'* && "${line}" != '* https://'* ]]
	then
		printf '%s\n' "${line}"
		continue
	fi

	# Split off the wiki bullet.
	if [[ "${line}" == '* '* ]]
	then
		prefix="${line::2}"
		url="${line:2}"
	else
		prefix=""
		url="${line}"
	fi

	# Split off everything from the first ' | ' onwards.
	if [[ "${url}" == *' | '* ]]
	then
		suffix=" | ${url#* | }"
		url="${url%% | *}"
	else
		suffix=""
	fi

	# Normalise domain (only possible when there is a '/' after the host)
	if [[ "${url}" =~ ^https?://.*/ ]]
	then
		domain="${url#*://}"
		domain="${domain%%/*}"
		url="${url%%://*}://${domain,,}/${url#*://*/}"
	fi

	# Facebook login redirect: unwrap the percent-encoded target in the 'next' parameter.
	# The dots in the host patterns are escaped so that e.g. 'myfacebook.com' is not mistaken for Facebook.
	# URLs containing an encoded newline or NUL (in either hex case) are skipped so that the decoding
	# below cannot introduce those bytes.
	if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/login/.*[?\&]next=https?%3A%2F%2F((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com%2F && "${url^^}" != *'%0A'* && "${url}" != *'%00'* ]]
	then
		url="${url##*\?next=}"
		url="${url##*&next=}"
		url="${url%%&*}"
		# Percent-decode: turn every %XX into \xXX and let printf %b expand the escapes.
		url="$(printf '%b' "${url//%/\\x}")"
	fi

	if [[ "${url}" =~ ^https?://((www|m|[a-z][a-z]-[a-z][a-z])\.)?facebook\.com/([^/]+/?(\?|$)|pages/[^/]+/[0-9]+/?(\?|$)|pages/category/[^/]+/[^/]+/?(\?|$)|pg/[^/]+([/?]|$)|profile\.php\?id=[0-9]+(&|$)) ]]
	then
		verbose_echo "Normalising Facebook URL: ${url}" >&2
		# Drop extra parameters; profile.php URLs must keep their '?id=…' part.
		if [[ "${url}" == *profile.php* ]]
		then
			url="${url%%&*}"
		else
			url="${url%%\?*}"
		fi
		page="$(curl -sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "https://www.facebook.com/${url#*facebook.com/}")"
		# The page name is taken from the first path segment of the link inside the 'tab_home' element.
		user="$(grep -Po '<div\s[^>]*(?<=\s)data-key\s*=\s*"tab_home".*?</div>' <<< "${page}" | grep -Po '<a\s[^>]*(?<=\s)href="/\K[^/]+')"
		if [[ "${user}" ]]
		then
			printf '%s\n' "${prefix}https://www.facebook.com/${user}/${suffix}"
			continue
		elif grep -q 'id="pagelet_loggedout_sign_up"' <<< "${page}"
		then
			# Profile page which is only visible when logged in
			# Extract canonical URL
			user="$(grep -Po '<link rel="canonical" href="\K[^"]+' <<< "${page}")"
			if [[ "${user}" ]]
			then
				printf '%s\n' "${prefix}${user}${suffix}"
				continue
			fi
		fi
		printf '%s\n' "Failed to normalise Facebook URL: ${url}" >&2
		printf '%s\n' "${line}"
	elif [[ "${url}" =~ ^https?://(www\.)?twitter\.com/[^/]+/?(\?.*)?$ ]]
	then
		# Paths that are site features rather than user names are left untouched.
		if [[ "${url}" =~ ^https?://(www\.)?twitter\.com/(i|web|search|hashtag)[/?] ]]
		then
			verbose_echo "Leaving Twitter URL alone: ${url}" >&2
			printf '%s\n' "${line}"
			continue
		fi
		verbose_echo "Normalising Twitter URL: ${url}" >&2
		url="${url%%\?*}"
		url="${url%/}"
		unnormalisedUser="${url##*/}"
		# NOTE(review): a different User-Agent is sent here, presumably to obtain the page variant
		# that contains the ProfileHeaderCard markup - confirm this still works.
		user="$(curl -sL --max-time 10 -A "Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18" "https://twitter.com/${unnormalisedUser}" | grep -Po '<a class="([^"]*\s)?ProfileHeaderCard-screennameLink(\s[^"]*)?" href="/\K[^/"]+(?=")')"
		if [[ "${user}" ]]
		then
			printf '%s\n' "${prefix}https://twitter.com/${user}${suffix}"
		else
			printf '%s\n' "Failed to normalise Twitter URL: ${url}" >&2
			printf '%s\n' "${line}"
		fi
	elif [[ "${url}" =~ ^https?://(www\.)?instagram\.com/[^/]+/?(\?.*)?$ ]]
	then
		# 'p' and 'explore' are not user names; recognise them with or without a trailing '/' or query string.
		if [[ "${url}" =~ ^https?://(www\.)?instagram\.com/(p|explore)([/?]|$) ]]
		then
			verbose_echo "Leaving Instagram URL alone: ${url}" >&2
			printf '%s\n' "${line}"
			continue
		fi
		verbose_echo "Normalising Instagram URL: ${url}" >&2
		# Strip the query string first so that it cannot end up inside the user name.
		user="${url%%\?*}"
		user="${user%/}"
		user="${user##*/}"
		printf '%s\n' "${prefix}https://www.instagram.com/${user,,}/${suffix}"
	elif [[ "${url}" =~ ^https?://(www\.)?youtube\.com/ ]]
	then
		verbose_echo "Normalising YouTube URL: ${url}" >&2
		# NOTE(review): disable_polymer=1 presumably requests the page layout that carries the
		# <link itemprop="url"> element parsed below - confirm.
		if [[ "${url}" == *'?'* ]]
		then
			rurl="${url}&disable_polymer=1"
		else
			rurl="${url}?disable_polymer=1"
		fi
		page="$(curl -4sL --max-time 10 -A "${userAgent}" -H 'Accept-Language: en-US,en;q=0.5' "${rurl}")"
		# A user/… canonical URL is preferred; channel/… is the fallback.
		canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kuser/[^"]+' <<< "${page}")"
		if [[ "${canonical}" ]]
		then
			printf '%s\n' "${prefix}https://www.youtube.com/${canonical}${suffix}"
		else
			canonical="$(grep -Po '<link itemprop="url" href="http://www\.youtube\.com/\Kchannel/[^"]+' <<< "${page}")"
			if [[ "${canonical}" ]]
			then
				printf '%s\n' "${prefix}https://www.youtube.com/${canonical}${suffix}"
			else
				printf '%s\n' "Failed to normalise YouTube URL: ${url}" >&2
				printf '%s\n' "${line}"
			fi
		fi
	else
		verbose_echo "Normalising other URL: ${url}" >&2
		# The redirect option is passed only when it is non-empty; the body is discarded and curl
		# reports the URL it ended up at.
		canonical="$(curl -sS ${otherCurlRedirectOpt:+"${otherCurlRedirectOpt}"} --max-time 10 -A "${userAgent}" -o /dev/null -w '%{url_effective}' "${url}")"
		if [[ "${canonical}" ]]
		then
			printf '%s\n' "${prefix}${canonical}${suffix}"
		else
			printf '%s\n' "Failed to normalise other URL: ${url}" >&2
			printf '%s\n' "${line}"
		fi
	fi
done