-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstandardize.rb
95 lines (68 loc) · 2.87 KB
/
standardize.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
require './config/environment'
require 'uri'
# Pulls a Twitter handle out of text that contains a twitter.com profile URL.
#
# @param url [String, nil] a URL or free text that may contain one
# @return [String, nil] the first handle found (1-15 letters, digits or
#   underscores, matched case-insensitively), or nil when +url+ is nil or
#   contains no twitter.com link
def twitter_username_from_url(url)
  # The dot is escaped so only a literal "twitter.com/" is accepted; an
  # unescaped dot would also accept any character in that position.
  match = url && url.match(/twitter\.com\/([a-z0-9_]{1,15})\/?/i)
  match && match[1]
end
# Pulls a GitHub username out of text that contains github.com links.
#
# When several github.com links are present, the shortest captured name is
# returned.
#
# @param url [String, nil] a URL or free text that may contain one or more
# @return [String, nil] the shortest captured name (1-39 letters, digits or
#   hyphens, matched case-insensitively), or nil when +url+ is nil or
#   contains no github.com link
def github_username_from_url(url)
  # Tolerate a missing value, as twitter_username_from_url does.
  return nil if url.nil?

  # The dot is escaped so only a literal "github.com/" is accepted.
  candidates = url.scan(/github\.com\/([a-z0-9\-]{1,39})\/?/i).map(&:last)
  candidates.min_by(&:size)
end
# Collects URI-like tokens from free text and parses them.
#
# Two sources are combined: http/https URLs found by URI.extract, and bare
# dotted tokens such as "example.com" that carry no protocol.
#
# @param string [String, nil] text to search
# @return [Array<URI::Generic>] parsed URIs in order of first appearance;
#   empty when +string+ is nil or contains no candidates
def uris_from_string(string)
  return [] if string.nil?

  candidates = URI.extract(string, ['http', 'https'])
  candidates += string.scan(/\b[a-z0-9\-\_\~]+(?:\.[a-z0-9\-\_\~]+)+\b/i) # No protocol

  # "http://example.com" and a bare "example.com" count as the same
  # candidate; the first one seen (the one with the protocol) is kept.
  candidates = candidates.uniq { |candidate| candidate.sub(/\Ahttps?:\/\//i, '') }

  candidates.each_with_object([]) do |candidate, uris|
    # URI.extract keeps a closing parenthesis that follows a URL, e.g. in
    # "(see http://example.com/a)"; drop one trailing ")" before parsing.
    cleaned = candidate.chomp(')')
    begin
      uris << URI.parse(cleaned)
    rescue URI::InvalidURIError
      # A candidate that is not a well-formed URI is skipped so that one
      # bad token does not discard every other URI in the text.
      next
    end
  end
end
# Finds e-mail addresses in free text.
#
# @param string [String, nil] text to search
# @return [Array<String>] every address found, in order of appearance and
#   with its original letter case; empty when +string+ is nil or has none
def emails_from_string(string)
  return [] if string.nil?

  # The character classes list only A-Z, so the /i flag is required for
  # lower-case addresses to match. The top-level domain has no upper
  # length bound, so TLDs longer than four letters are accepted too.
  string.scan(/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i)
end
# Query object obtained from the current Neo4j session.
query = Neo4j::Session.current.query

# Properties that this script recomputes; each is set to nil on the nodes
# matched by the :User label before the per-user passes run.
derived_properties = {
  domains: nil,
  twitter_username: nil,
  uris: nil,
  usernames: nil,
  github_username: nil,
  emails: nil
}
query.match(u: :User).set(u: derived_properties).exec
# Derives usernames, URIs, e-mails and domains for every StackOverflowUser
# from its website_url, about_me and display_name, then saves the record.
StackOverflowUser.all.each do |user|
  # Coerce to String so a missing profile field behaves like empty text
  # instead of raising inside the extraction helpers (the GitHub pass does
  # the same with `user.blog.to_s`).
  website = user.website_url.to_s
  about = user.about_me.to_s

  # The website field takes precedence over the free-text bio.
  user.twitter_username = twitter_username_from_url(website) || twitter_username_from_url(about)
  user.github_username = github_username_from_url(website) || github_username_from_url(about)

  user.uris = (uris_from_string(about) + uris_from_string(website)).map(&:to_s)
  user.emails = emails_from_string(about)

  # Host part of each URI (text before the first "/", after an optional
  # "scheme://" prefix), plus the domain of each e-mail address.
  domains = user.uris.map do |uri|
    host_match = uri.match(/^([^\/]+\/\/)?([^\/]+)/)
    host_match && host_match[2]
  end
  domains += user.emails.map { |email| email.split('@')[1] }

  # Lower-case before stripping "www." and before de-duplicating, so that
  # "WWW.Example.com" and "example.com" collapse into a single entry.
  domains = domains.compact.map { |domain| domain.downcase.sub(/\Awww\./, '') }.uniq
  user.domains = domains
  # Starts as a copy of the full list; entries are removed from it later
  # in the script.
  user.uncommon_domains = domains.dup

  usernames = user.emails.map { |email| email.split('@')[0] }
  usernames.push(user.display_name, user.twitter_username, user.github_username)
  # Drop missing values first, then lower-case, then de-duplicate, so names
  # that differ only by case are stored once.
  user.usernames = usernames.compact.map(&:downcase).uniq

  user.save
  putc 's' # progress marker
end
# Derives usernames, URIs and domains for every GitHubUser from its login
# and blog fields, then saves the record.
GitHubUser.all.each do |user|
  # Coerce to String so a missing blog field behaves like empty text.
  blog = user.blog.to_s

  user.twitter_username = twitter_username_from_url(blog)

  # Drop missing values first, then lower-case, then de-duplicate, so names
  # that differ only by case are stored once.
  user.usernames = [user.login, user.twitter_username].compact.map(&:downcase).uniq

  user.uris = uris_from_string(blog).map(&:to_s)

  # Host part of each URI: text before the first "/", after an optional
  # "scheme://" prefix.
  domains = user.uris.map do |uri|
    host_match = uri.match(/^([^\/]+\/\/)?([^\/]+)/)
    host_match && host_match[2]
  end

  # Lower-case before stripping "www." and before de-duplicating, so that
  # "WWW.Example.com" and "example.com" collapse into a single entry.
  domains = domains.compact.map { |domain| domain.downcase.sub(/\Awww\./, '') }.uniq
  user.domains = domains
  # Starts as a copy of the full list; entries are removed from it later
  # in the script.
  user.uncommon_domains = domains.dup

  user.save
  putc 'g' # progress marker
end
# Domains whose occurrence count across the unwound `domains` lists of
# :User nodes is greater than two.
COMMON_DOMAINS = query
  .match(u: :User)
  .unwind(domain: 'u.domains')
  .with(:domain, count: 'count(domain)')
  .where('count > 2')
  .pluck(:domain)

# Print a label line followed by the list itself.
puts 'COMMON_DOMAINS'
puts COMMON_DOMAINS.inspect
# Remove every COMMON_DOMAINS entry from each user's uncommon_domains and
# save the record. StackOverflow users are processed first, then GitHub
# users; one progress character is printed per saved record.
[[StackOverflowUser, 's'], [GitHubUser, 'g']].each do |model, progress_mark|
  model.all.each do |record|
    record.uncommon_domains = record.uncommon_domains - COMMON_DOMAINS
    record.save
    putc progress_mark
  end
end