#!/bin/bash
# Takes a wiki page in new-style viewer format on stdin.
# Every URL that looks like a social media link (including YouTube) is run through social-media-extract-profile-link;
# everything else is run through website-extract-social-media.
# This is repeated recursively until no new links are discovered.
# URLs are passed through url-normalise before and during processing so that equivalent but slightly different
# forms of the same URL do not produce duplicates; the output is additionally deduplicated within each section at the end.
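#
# Usage (sketch): wiki-recursive-extract-normalise [--verbose|-v] <page >page-plus-links
#
# Expected input shape, as inferred from the parsing below (not a formal spec):
# a '==' line starts a new section, and each URL is a '* ' bullet, optionally
# followed by ' | ' and a label, e.g.:
#   == Example section ==
#   * https://example.org/ | Example site
#   * https://www.youtube.com/user/example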
verbose=
while [[ $# -gt 0 ]]
do
	if [[ "$1" == '--verbose' || "$1" == '-v' ]]
	then
		verbose='--verbose'
	else
		echo "Unknown option: $1" >&2
		exit 1
	fi
	shift
done
# Print a message only if --verbose was given.
function verbose_echo {
	if [[ "${verbose}" ]]
	then
		echo "$@"
	fi
}

# Run a command, prefixing each line of its stderr with "[name]".
# An empty name defaults to the command's basename.
function stderr_annotate {
	name="$1"
	shift
	if [[ "${name}" == '' ]]; then name="${1##*/}"; fi
	"$@" 2> >(while read -r line; do echo "[${name}] ${line}"; done >&2)
}
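# For example (hypothetical command), stderr_annotate 'fetch' curl -sS "${someUrl}"
# would rewrite a stderr line 'curl: (6) Could not resolve host'
# as '[fetch] curl: (6) Could not resolve host'.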
scriptpath="$(cd "$(dirname "$0")"; pwd -P)"
declare -A sectionUrls
# Normalise the incoming page first, then process it line by line.
# The mawk stage at the very end deduplicates '* ' lines within each section.
stderr_annotate 'url-normalise/before' "${scriptpath}/url-normalise" ${verbose} | while read -r line
do
	echo "${line}"
	if [[ "${line}" == '=='* ]]
	then
		# A '==' header starts a new section; reset the per-section URL set.
		verbose_echo "${line}" >&2
		unset sectionUrls
		declare -A sectionUrls
	fi
if [[ "${line}" == '* http://'* || "${line}" == '* https://'* ]]
then
url="${line:2}"
if [[ "${url}" == *' | '* ]]
then
url="${url%% | *}"
fi
if [[ "${sectionUrls[${url}]}" ]]
then
# Processed already, skip
continue
fi
sectionUrls["${url}"]=1
toProcess=("${url}")
while [[ ${#toProcess[@]} -gt 0 ]]
do
curUrl="${toProcess[0]}"
toProcess=("${toProcess[@]:1}")
if grep -Pq '//([^/]+\.)?(facebook\.com|flickr\.com|instagram\.com|twitter\.com|vk\.com|youtube\.com|youtu\.be)/' <<<"${curUrl}"
then
mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/social-media-extract-profile-link" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-social' "${scriptpath}/url-normalise" ${verbose})
else
mapfile -t outUrls < <(stderr_annotate '' "${scriptpath}/website-extract-social-media" ${verbose} "${curUrl}" < <(:) | stderr_annotate 'url-normalise/post-web' "${scriptpath}/url-normalise" ${verbose})
fi
for outUrl in "${outUrls[@]}"
do
if [[ "${sectionUrls[${outUrl}]}" ]]
then
# The discovered URL was processed already, skip it entirely
continue
else
# Not-yet-known URL, add to the list of URLs to process, mark as seen, and print
toProcess+=("${outUrl}")
sectionUrls["${outUrl}"]=1
echo "* ${outUrl}"
fi
done
done
fi
done | mawk -W interactive '! /^\*/ { print; } /^\*/ && !seen[$0]++ { print; } /^==/ { delete seen; }'
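
# A sketch of the final dedup stage above (with hypothetical URLs): given
#   == Section A ==
#   * https://example.org/
#   * https://example.org/
#   == Section B ==
#   * https://example.org/
# mawk prints the duplicate in Section A only once but keeps the copy in
# Section B, since 'seen' is cleared at every '==' header. Non-bullet lines
# pass through unchanged, and '-W interactive' keeps the output unbuffered.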