# models.rb

# Assumed external dependencies (this file is probably loaded by an app that
# already requires them; require is idempotent, so repeating it here is safe):
require 'mongo_mapper'
require 'twitter'
require 'hpricot'
require 'youtube_it'
require 'vimeo'
require 'open-uri'
require 'time'

class Campaign
  include MongoMapper::Document
  plugin MongoMapper::Plugins::Sluggable
  sluggable :name

  belongs_to :theme
  many :terms

  key :page_title
  key :theme_id, ObjectId
  # deprecated?
  # key :slug, String, :required => true
  key :name, String, :required => true
  key :edit_link, String, :required => true
  key :front_page, String
  key :description, String
  key :conditions # any other conditions, as a mongo doc
  key :description_link, String
  key :start_timestamp
  key :end_timestamp
  key :bamp, String
  key :ordering_key, String
  key :ordering_dir, String
  key :cover_image, String
  key :media_count

  def update_media
    @search_terms = Term.all({:conditions => {:campaign_id => self.id}})
    puts @search_terms.inspect
    @search_terms.each do |term|
      # get the tweets from the internet
      # formatting, link expansion etc. happens inside Term#crawl,
      # which also saves each tweet
      tweets = term.crawl
      since_id = term.since_id
      puts "got back from crawler"
      tweets.each do |tweet|
        puts tweet["text"]
        # does it conform to campaign settings (has media?)
        if tweet["entities"] && tweet["entities"]["media"]
          # build a campaign media object
          ct = CampaignMedia.new
          ct.media_id = tweet.id_str
          ct.campaign_id = self.id
          ct.ordering_key = tweet.timestamp
          # save it
          ct.save!
          # and the aggregate table
          puts "entities"
          puts tweet.entities.inspect
          url = tweet["entities"]["media"][0]["media_url"]
          v = {:media_url => url,
               :media_type => "tweet",
               :ordering_key => ct.ordering_key,
               :media_id => ct.media_id}
          AggregateMedia.collection.update({:media_url => url}, {'$set' => v}, {:upsert => true})
          a = AggregateMedia.first({:media_url => url})
          a.add_to_set(:campaign_id => ct.campaign_id)
          a.add_to_set(:campaign_media_id => ct.id)
          a.set(:score => a.campaign_media_id.size)
          puts a.inspect
        end
        since_id = tweet.id_str
      end
      # update last-checked time and since id (highest tweet id crawled) for the term
      puts since_id
      # raw collection updates must match on :_id, not :id
      Term.collection.update({:_id => term.id}, {'$set' => {:since_id => since_id, :last_checked => Time.now}}, {:upsert => false})
      # term.since_id = since_id
      # term.last_checked = Time.now
      # term.save
    end
    # weird bug where a campaign would lose recently saved data.
    # suspect it was caused here, not sure
    Campaign.collection.update({:slug => self.slug},
      {'$set' => {:media_count => AggregateMedia.count({:hidden => {'$exists' => false}, :campaign_id => self.id})}})
    # self.save
  end

  def build_edit_link
    # generate a random edit link if one hasn't been set yet
    if self.edit_link.nil?
      self.edit_link = (0...31).map { (97 + rand(26)).chr }.join
    end
  end
end
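
# Example (hypothetical) of how a cron job or rake task might drive the crawl,
# assuming MongoMapper is already connected and this file is loaded:
#   Campaign.all.each do |campaign|
#     campaign.build_edit_link
#     campaign.save
#     campaign.update_media
#   end
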
# join table between campaigns and crawled media
class CampaignMedia
  include MongoMapper::Document

  belongs_to :campaign
  belongs_to :tweet

  key :campaign_id, ObjectId
  key :media_id
  key :media_type, String
  key :ordering_key
  timestamps!

  def aggregate_media
    # rebuild the aggregate media collection:
    #   for each campaign
    #     scoop all media links
    #     upsert by media_url
    #     increment the score on each insert (plus retweet score)
    #     keep the CampaignMedia ids in an array on the aggregate doc
    #     keep the campaign id in the aggregate doc
    # reverse chronological order so the final aggregated media key is the original tweet
    # skip the blocked media
    @campaigns = Campaign.all({:hidden => {'$exists' => false}})
    @campaigns.each do |campaign|
      cm = CampaignMedia.all({
        :conditions => {:campaign_id => campaign.id},
        :order => :ordering_key.desc})
      cm.each do |c|
        t = Tweet.first({:id_str => c.media_id})
        begin
          url = t[:entities]['media'][0]['media_url']
          v = {:media_url => url,
               :media_type => c.media_type,
               :ordering_key => c.ordering_key,
               :media_id => c.media_id}
          AggregateMedia.collection.update({:media_url => url}, {'$set' => v}, {:upsert => true})
          a = AggregateMedia.first({:media_url => url})
          a.increment(:score => 1)
          a.add_to_set(:campaign_id => c.campaign_id)
          a.add_to_set(:campaign_media_id => c.id)
        rescue
          # tweet missing or has no media entity; skip it
        end
      end
    end
  end
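
  # Illustrative shape of the AggregateMedia document the loop above upserts
  # (values are made up; see the AggregateMedia class near the end of the file):
  #   { "media_url"         => "http://p.twimg.com/example.jpg",
  #     "media_type"        => "tweet",
  #     "ordering_key"      => 1318200000,
  #     "media_id"          => "123456789012345678",
  #     "campaign_id"       => [<campaign _id>, ...],
  #     "campaign_media_id" => [<CampaignMedia _id>, ...],
  #     "score"             => 3 }
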
  def build_collection
    # rebuild CampaignMedia for every campaign from the saved tweets
    @campaigns = Campaign.all
    @campaigns.each do |campaign|
      terms = Term.all({:campaign_id => campaign.id})
      tags = []
      terms.each do |term|
        # gsub (not gsub!) so a term without a '#' isn't turned into nil
        tags << term['term'].gsub('#', '')
      end
      if terms.empty?
        terms = {}
      else
        # match tweets tagged with any of the campaign's terms
        terms = {:ows_meta_tags => {'$in' => tags}}
      end
      # build the conditions hash
      conditions = {
        :conditions => {
          'entities.media.0.media_url' => {'$exists' => true},
          'entities.media.0.sizes.small.h' => {'$exists' => true},
          :timestamp => {'$gte' => campaign[:start_timestamp], '$lte' => campaign[:end_timestamp]},
          :block => {'$exists' => false}
        }.merge(terms)
      }
      puts conditions.inspect
      # get all matching tweets
      tweets = Tweet.all(conditions)
      puts campaign.name
      puts tweets.size
      t_count = tweets.size
      Campaign.collection.update({:slug => campaign[:slug]}, {'$set' => {:media_count => t_count}})
      tweets.each do |t|
        # upsert a CampaignMedia row for each tweet
        CampaignMedia.collection.update(
          {:campaign_id => campaign.id, :media_id => t.id_str.to_s, :media_type => 'tweet'},
          {:media_id => t.id_str,
           :media_type => 'tweet',
           :campaign_id => campaign.id,
           :ordering_key => t.timestamp},
          {:upsert => true})
      end
    end
  end

  # save a piece of media from a raw URL (a tweet permalink or any web page)
  def save_from_url(url, c)
    if !c.slug.nil?
      self.campaign_id = c.id
      self.ordering_key = 'timestamp'
      if url.split("twitter.com").size > 1
        # ugly split to pull the status id out of the permalink
        id = url.split("twitter.com")[1].split("/")[4]
        a_tweet = Twitter.status(id).attrs
        self.media_type = 'twitter'
        # keep the twitter id as a string and drop :id so mongo assigns its own _id
        a_tweet[:id_str] = a_tweet[:id].to_s
        a_tweet[:id] = nil
        Tweet.collection.update({:id_str => a_tweet[:id_str].to_s}, a_tweet, {:upsert => true})
        # now view the tweet
        self.media_id = a_tweet[:id_str]
        self.save
        @tweet = a_tweet[:text]
      else
        # not a tweet: parse what we can with hpricot and just save the whole page
        html = ""
        open(url) { |f| f.each_line { |line| html << line } }
        domain = url.split("/")[2]
        @html = Hpricot(html)
        title = (@html/"title")[0].inner_html
        # bookmarked so we know it was intentionally saved, not crawled
        Tweet.collection.update({:url => url},
          {:html => html, :url => url, :title => title, :id_str => url, :origin => domain, :bookmarked => true},
          {:upsert => true})
        self.media_type = domain
        self.media_id = url
        @tweet = title
      end
      @tweet
    end
  end
end
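
# Example (hypothetical): manually pin a tweet or an arbitrary page to a campaign by URL.
#   cm = CampaignMedia.new
#   cm.save_from_url("https://twitter.com/#!/someuser/status/123456789012345678", campaign)
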
class Tweet
  include MongoMapper::Document

  key :ows_meta_tags, Array
  attr_accessor :dims, :image_size, :score, :sized, :sizes

  def removeIds
  end

  def removeRetweets
  end

  # pick a display size for the tweet's photo and compute its scaled dimensions
  def dimensions!(pixels)
    pixels = @sized.to_i * pixels
    @image_size = ":small"
    begin
      if @sizes["thumb"] != nil
        @image_size = ":small"
      end
    rescue
      @image_size = ""
    end
    r = 1
    h = pixels
    biggest = 1
    begin
      d = @sizes["medium"]
      puts "original size " + d.inspect
      d["w"] = d["w"] / 4
      d["h"] = d["h"] / 4
      pixels = d["w"] * @sized.to_i
      @sizes.keys.reverse.each do |k|
        if pixels > @sizes[k]["w"].to_i
          @image_size = ":" + k
          d = @sizes[k]
          biggest = @sizes[k]["w"]
        end
      end
      pixels = (pixels > @sizes["large"]["w"]) ? @sizes["large"]["w"] : pixels
      h = (d["h"].to_f / d['w'].to_f) * pixels.to_f
      # clip is presumably a numeric helper defined elsewhere in the app (caps a value at a maximum)
      ww = (d["w"] * @sized.to_i).to_f.clip(@sizes["large"]["w"])
      hh = (d["h"] * @sized.to_i).to_f.clip(@sizes["large"]["h"])
      @dims = {:width => ww, :height => hh}
    rescue Exception => ex
      @dims = {:width => pixels, :height => pixels}
    end
    true
  end

  # true when the tweet's media has a "large" size available
  def sizes?
    begin
      @sizes["large"] != nil
    rescue
      false
    end
  end

  # clear the size suffix for instagram-hosted images
  def not_instagram!(url)
    if url.split("instagram").size > 1
      @image_size = ""
      puts "url " + url
      puts "is instagram"
    else
      puts "url " + url
      puts "is not instagram"
    end
  end

  # copy hashtag text out of the tweet entities into ows_meta_tags
  def build_hashtag_array
    tags = []
    if !self.entities.empty? && self[:entities][:hashtags] != nil
      self[:entities][:hashtags].each do |tag|
        tags << tag[:text]
      end
    end
    self.ows_meta_tags = tags
  end

  # tweetstache - crawl and save media from a url
  def expand_urls!
    self[:entities][:urls].each do |url|
      begin
        # String#expand_urls! is presumably a helper patched in elsewhere in the app
        url[:expanded_url].expand_urls!
      rescue NoMethodError
        url[:expanded_url] = ''
      end
    end
    self.save
  end

  def add_timestamp!
  end
end
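
# Example (hypothetical) of sizing a tweet's photo for display, assuming the view
# copies the twitter "sizes" hash onto the accessor first:
#   tweet = Tweet.first(:id_str => "123456789012345678")
#   tweet.sizes = tweet[:entities]["media"][0]["sizes"]
#   tweet.sized = 1
#   tweet.dimensions!(320) if tweet.sizes?
#   tweet.dims  # => {:width => ..., :height => ...}
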
# a search term for the crawler: campaign_id, start_time, end_time
class Term
  include MongoMapper::Document

  key :campaign_id, ObjectId
  key :term, String
  key :start_time, Time
  key :end_time, Time
  key :since_id, String
  key :last_checked, Time

  belongs_to :campaign
  timestamps!

  # fetch new tweets for this term from the twitter search API and save them
  def crawl
    @blocked = BlockedUser.all
    @block = {}
    sleep 1
    @blocked.each do |block|
      @block[block["user_id"]] = block["user_id"]
    end
    @tweets = []
    puts self.term
    d = Time.at(self.campaign.end_timestamp).to_datetime
    # only do this if current time is before campaign.end_timestamp
    # search up to the day after the campaign ends (strftime zero-pads and handles month rollover)
    date_until = (d + 1).strftime('%Y-%m-%d')
    1.times do |p|
      begin
        # campaign.since_id, campaign.end_date
        query = {:count => 100, :since_id => self.since_id, :until => date_until, :include_entities => true}
        tweets = Twitter.search(self.term.to_s + " -rt", query)
puts "tweets found"
puts tweets.results.size
rescue Exception => ex
puts ex.inspect
puts "bad gateway"
sleep 30
tweets = []
end
begin
puts tweets.results.size
rescue NoMethodError
tweets = []
end
begin
if tweets.results.size==0
break
end
rescue
break
end
      tweets.results.each do |a_tweet|
        # add an integer timestamp
        begin
          a_tweet.attrs["timestamp"] = Time.parse(a_tweet.attrs[:created_at]).to_i
        rescue NoMethodError
          a_tweet.attrs["timestamp"] = 1
        end
        # extract vids for embed code
        if a_tweet.attrs[:entities]
          if a_tweet.attrs[:entities][:urls] != nil
            a_tweet.attrs[:entities][:urls].each do |url|
              # corny method to traverse urls that have been encoded multiple times
              3.times do |x|
                begin
                  url[:expanded_url].expand_urls!
                rescue NoMethodError
                  url[:expanded_url] = ""
                end
              end
if url[:expanded_url].split("youtube.com").size >1 || url[:expanded_url].split("youtu.be").size > 1
client = YouTubeIt::Client.new(:dev_key => @devkey)
begin
vid = client.video_by(url[:expanded_url])
a_tweet.attrs["video_embed"] = vid.embed_html
rescue OpenURI::HTTPError => e
end
#vimeo
elsif (url[:expanded_url].split("vimeo.com").size > 1)
video_id = url[:expanded_url].split("/").last
vid = Vimeo::Simple::Video.info(video_id)
a_tweet.attrs["video_embed"] = '<iframe src="http://player.vimeo.com/video/#{vid.id}" width="500" height="313" frameborder="0" webkitAllowFullScreen mozallowfullscreen allowFullScreen></iframe>'
#ht.ly is only used by porn spammers
elsif (url[:expanded_url].split("ht.ly").size > 1)
a_tweet.attrs[:block] =1
#manually grab instagrams for the thumbnail
elsif ((url[:expanded_url]).split("instagr.am").size > 1) || ((url[:expanded_url]).split("instagram.com").size > 1)
begin OpenURI::HTTPError
#add the media link
html = ""
open(url[:expanded_url]) {|f|
f.each_line {|line| html << line}
}
@html = Hpricot(html)
a_tweet.attrs[:entities][:media] = [:media_url=>(@html/"img.photo")[0][:src] , :expanded_url=> (@html/"img.photo")[0][:src],:size=>{:small=>{:h=>320}}]
rescue
end
end
end
end
end
        # block this tweet if the user is in the blocked list
        if @block[a_tweet.attrs[:from_user_id].to_s] != nil
          a_tweet.attrs[:block] = 1
        end
        begin
          # kill the twitter ID so mongo assigns its own ObjectId instead
          a_tweet.attrs[:id] = nil
          # save / update: save all tweets, to mine them later
          puts "id_str"
          puts a_tweet.attrs[:id_str]
          Tweet.collection.update({:id_str => a_tweet.attrs[:id_str].to_s}, a_tweet.attrs, {:upsert => true})
          puts "upsert"
          tweet = Tweet.first({:id_str => a_tweet.attrs[:id_str].to_s})
          puts tweet.inspect
          puts "find again"
          puts tweet["text"]
          begin
            tweet.build_hashtag_array
          rescue Exception => ex
            puts ex.inspect
          end
          @tweets << tweet
        rescue
        end
      end
    end
    # return the list of tweets so the caller can build CampaignMedia objects
    @tweets
  end
end
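
# Example (hypothetical): register a search term so update_media / crawl picks it up.
#   Term.create(:campaign_id => campaign.id, :term => "#ows",
#               :start_time => Time.now, :since_id => "0")
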
# just a collection of user ids that are blocked
class BlockedUser
  include MongoMapper::Document

  # purge all existing media from this user
  def purge
  end
end
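
# Example (hypothetical): block a spam account so the crawler marks its tweets with :block => 1.
#   BlockedUser.create(:user_id => "12345678")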

# indexes
Tweet.ensure_index(:timestamp)
Tweet.ensure_index([[:id_str, 1]], :unique => true)
CampaignMedia.ensure_index(:media_id)
CampaignMedia.ensure_index(:campaign_id)
CampaignMedia.ensure_index([[:media_id, 1], [:campaign_id, 1]], :unique => true)
CampaignMedia.ensure_index([[:ordering_key, 1]])
CampaignMedia.ensure_index([[:ordering_key, -1]])

# deduped media across campaigns, keyed by media_url
class AggregateMedia
  include MongoMapper::Document

  key :media_url, String
  key :campaign_id, Array
  key :campaign_media_id, Array
  key :score, Integer, :default => 0
end
AggregateMedia.ensure_index(:score)
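
# Example (hypothetical): a page of one campaign's visible media, best-scoring first.
#   AggregateMedia.all(:campaign_id => campaign.id, :hidden => {'$exists' => false},
#                      :order => :score.desc, :limit => 20)
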
# scratch collection used by #cleanup to dedupe CampaignMedia rows that share a media_id
class NewTweet
  include MongoMapper::Document

  key :media_id

  def cleanup
    @cm = CampaignMedia.all({:order => :media_id.asc})
    @cm.each do |cm|
      ct = CampaignMedia.all({:media_id => cm[:media_id]})
      x = ct.size - 1
      ct[1..x].each { |p| p.destroy }
    end
    @prev_id = ""
    @cm = CampaignMedia.all({:order => :media_id.asc})
    @cm.each do |cm|
      if cm.media_id == @prev_id
        NewTweet.collection.update({:media_id => cm.media_id}, {:twid => cm.id, :media_id => cm.media_id}, {:upsert => true})
      end
      @prev_id = cm.media_id
    end
    @nt = NewTweet.all
    @nt.each do |tw|
      cm = CampaignMedia.first(:id => tw[:twid])
      cm.destroy
    end
  end
end
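
# Example (hypothetical): run the one-off dedup pass.
#   NewTweet.new.cleanup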

class Theme
  include MongoMapper::Document
  plugin MongoMapper::Plugins::Sluggable
  sluggable :name

  key :name, String
  key :template_name, String
  key :cover_image, String
end