-
Notifications
You must be signed in to change notification settings - Fork 6
/
website-extract-social-media
executable file
·101 lines (97 loc) · 2.67 KB
/
website-extract-social-media
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/bin/bash
# Echo the given arguments to stdout, but only when the global `verbose`
# holds a non-empty value; otherwise print nothing and succeed.
verbose_echo() {
  [[ -z "${verbose}" ]] && return 0
  echo "$@"
}
# Download one web page and print the social-media links found in it.
#
# Arguments: $1 - URL of the page to fetch
# Outputs:   stdout - one normalised https:// URL per line, each printed only
#                     once; the per-network extractors run concurrently, so
#                     the relative order of links from different networks is
#                     not fixed
#            stderr - a "Fetching ..." notice (through verbose_echo) and any
#                     error message printed by curl
# Returns:   the exit status of the final awk de-duplication stage
function fetch_n_extract {
  local url="$1"
  verbose_echo "Fetching ${url}" >&2
  {
    # '--' ends curl's option parsing, so a URL that starts with '-' cannot be
    # mistaken for a curl option. The fixed-string grep is a cheap pre-filter:
    # only lines mentioning one of the networks reach the extractors. tee then
    # hands those lines to every extractor; tee's own copy is discarded.
    curl -sSL --max-time 10 -A 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0' -- "${url}" | \
      grep -Fi -e 'facebook' -e 'flickr' -e 'instagram' -e 'twitter' -e 't.me' -e 'youtube' -e 'youtu.be' -e 'vk.com' | \
      tee \
        >(
          # Facebook: keep page/people/group links, drop tracking pixel,
          # plugin, redirect and share endpoints. These exclusions are basic
          # regexes, where '\?' means "optional" rather than a literal '?',
          # so a literal question mark is written as [?]; written as 'tr\?'
          # the pixel filter also discarded every page starting with "t".
          grep -Poi 'facebook\.com/((pages(/category)?|people)/((?!")[^/ <"'"'"'])+/|groups/|pg/)?((?!")[^/ <"'"'"'])+' | \
            sed 's,^,https://www.,' | \
            grep -vi -e '^https://www\.facebook\.com/2008$' -e '^https://www\.facebook\.com/tr[?]' -e '^https://www\.facebook\.com/tr$' -e '^https://www\.facebook\.com/plugins$' -e '^https://www\.facebook\.com/l\.php[?]' -e '^https://www\.facebook\.com/l\.php$' | \
            grep -Pvi '^https://www\.facebook\.com/share(r(\.php)?)?(\?|$)'
        ) \
        >(
          # Flickr: photo-stream links only
          grep -Poi 'flickr\.com/photos/((?!")[^/ <"'"'"'])+' | \
            sed 's,^,https://www.,'
        ) \
        >(
          # Instagram: profile and post links; a bare /p or /vp path is dropped
          grep -Poi 'instagram\.com/(p/)?((?!")[^/ <"'"'"'])+' | \
            sed 's,^,https://www.,' | \
            grep -Pvi -e '^https://www\.instagram\.com/v?p$'
        ) \
        >(
          # Telegram: the leading '//' is required so that other host names
          # merely ending in "t.me" do not match; "www." is stripped
          grep -Poi '//(www\.)?t\.me/((?!")[^/ <"'"'"'])+' | \
            sed 's,^//,,; s,^www\.,,; s,^,https://,'
        ) \
        >(
          # Twitter: legacy '#!/' paths are normalised, home/share/intent/
          # widget endpoints are dropped (literal '?' written as [?], see the
          # Facebook note), and the ref_src tracking parameter is removed
          grep -Poi 'twitter\.com/(#!/)?(hashtag/)?((?!")[^/ <"'"'"'])+' | \
            sed 's,^twitter\.com/#!/,twitter.com/,; s,^,https://,' | \
            grep -vi -e '^https://twitter\.com/home[?]' -e '^https://twitter\.com/home$' -e '^https://twitter\.com/widgets\.js$' -e '^https://twitter\.com/share[?]' -e '^https://twitter\.com/share$' -e '^https://twitter\.com/intent$' | \
            sed 's,\([?&]\)ref_src=[^&]\+&\?,\1,; s,?$,,'
        ) \
        >(
          # VKontakte
          grep -Poi 'vk\.com/((?!")[^/ <"'"'"'])+' | \
            sed 's,^,https://,'
        ) \
        >(
          # YouTube: youtube.com links get a "www." prefix, youtu.be links do
          # not; a bare /vi path is dropped
          grep -Poi '(youtube\.com/((user|channel|c|embed)/)?((?!")[^/ <"'"'"'])+|youtu\.be/((?!")[^/ <"'"'"'])+)' | \
            awk '/^youtube/ { print "https://www." $0 } /^youtu\.be/ { print "https://" $0 }' | \
            grep -vi -e '^https://www\.youtube\.com/vi$'
        ) \
        >/dev/null
    # awk prints each distinct line the first time it is seen
  } | awk '!seen[$0]++'
}
# Parse options
printInputUrl=
verbose=

# Consume the leading command-line options.
#
# Globals:   printInputUrl (written) - set to "true" by --print-input-url(s)
#            verbose (written)       - set to 1 by --verbose / -v
#            parsedOptionCount (written) - number of leading arguments that
#              were options (including a terminating "--"); the caller shifts
#              this many arguments away
# Arguments: the script's command-line arguments
# Outputs:   an error message on stderr for an unknown option
# Returns:   0; exits the script with status 1 on an unknown option
function parse_options {
  parsedOptionCount=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --print-input-urls | --print-input-url)
        printInputUrl=true
        ;;
      --verbose | -v | v)
        # "-v" is the intended short form; the bare "v" spelling is still
        # accepted only so existing invocations keep working.
        verbose=1
        ;;
      --)
        # End of options: consume the marker itself and stop
        parsedOptionCount=$((parsedOptionCount + 1))
        return 0
        ;;
      -?*)
        # Anything else starting with a dash is an option we do not know;
        # treating it as a URL would hand it to curl as one of its options.
        echo "Unknown option: $1" >&2
        exit 1
        ;;
      *)
        # Assume end of options
        return 0
        ;;
    esac
    shift
    parsedOptionCount=$((parsedOptionCount + 1))
  done
}
parse_options "$@"
shift "${parsedOptionCount}"
# Run fetch_n_extract on every input URL: first the URLs given as arguments,
# then, when stdin is not a terminal, one URL per line read from stdin.
#
# Globals:   printInputUrl (read) - when non-empty, each input URL is printed
#              on stdout before the links extracted for it
# Arguments: URLs, one per argument (may be none)
# Outputs:   whatever fetch_n_extract prints for each URL
function process_urls {
  local arg url
  {
    for arg in "$@"; do
      # printf instead of echo, so an argument such as "-n" is passed on
      # verbatim rather than being taken as an echo option
      printf '%s\n' "${arg}"
    done
    if [[ ! -t 0 ]]; then
      cat
    fi
  } | while read -r url || [[ -n "${url}" ]]; do
    # read fails on a final line without a trailing newline but still fills
    # in the variable; the extra test keeps that last URL.
    # Blank lines carry no URL, so there is nothing to fetch for them.
    [[ -n "${url}" ]] || continue
    if [[ "${printInputUrl}" ]]; then
      printf '%s\n' "${url}"
    fi
    fetch_n_extract "${url}"
  done
}
process_urls "$@"