Skip to content
This repository has been archived by the owner on Oct 27, 2022. It is now read-only.

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
* develop:
  changing timestamp format to work with MMD
  Gemfile update
  fixing double parens
  more merge conflicts
  adding dedup
  Fix for GaugesLogger error message
  Added a --dedup option for removing duplicate entries
  merge
  Adding parenthesis to all plugin tags
  Working out some Ruby 2.0 encoding issues
  Fixing triple entries of most tweeted posts on feed errors
  adding encoding settings to try to fix bloglogger error
  Fix for GaugesLogger error message
  Added a --dedup option for removing duplicate entries
  debugging Markdownify and encoding
  Adding parenthesis to all plugin tags
  Working out some Ruby 2.0 encoding issues
  Fixing triple entries of most tweeted posts on feed errors
  adding encoding settings to try to fix bloglogger error
  • Loading branch information
ttscoff committed Mar 14, 2014
2 parents e733368 + 3868f82 commit 06aba2a
Show file tree
Hide file tree
Showing 21 changed files with 823 additions and 110 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ gem 'feed-normalizer'
gem 'twitter', '~> 5.3.0'
gem 'twitter_oauth'
gem 'json'
gem 'levenshtein'

gem 'nokogiri'

Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ GEM
http_parser.rb
http_parser.rb (0.6.0)
json (1.8.1)
levenshtein (0.2.2)
memoizable (0.4.0)
thread_safe (~> 0.1.3)
mime-types (2.1)
Expand Down Expand Up @@ -64,6 +65,7 @@ PLATFORMS
DEPENDENCIES
feed-normalizer
json
levenshtein
rake
rspec
twitter (~> 5.3.0)
Expand Down
185 changes: 184 additions & 1 deletion lib/dayone.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
require 'fileutils'
require 'digest/md5'
require 'levenshtein'
require 'pp'

class DayOne < Slogger
def to_dayone(options = {})
Expand Down Expand Up @@ -82,7 +85,6 @@ def process_image(image)
unless ext =~ /\.jpg$/
case ext
when '.jpeg'
@log.info("81")
target = orig.gsub(/\.jpeg$/,'.jpg')
FileUtils.mv(orig,target)
return target
Expand Down Expand Up @@ -126,4 +128,185 @@ def store_single_photo(file, options = {}, copy = false)
return self.to_dayone(options)
end
end

# Computes the Levenshtein edit distance between +s+ and +t+: the minimum
# number of single-element insertions, deletions and substitutions needed
# to turn one into the other.
#
# s - first sequence (anything responding to #size and #[start, length]).
# t - second sequence, same requirements.
#
# Returns the distance as an Integer.
def levenshtein_distance(s, t)
  rows = s.size
  cols = t.size

  # table[r][c] is the distance between the first r elements of s and the
  # first c elements of t.
  table = Array.new(rows + 1) { Array.new(cols + 1) }

  # Turning a prefix into the empty sequence (or back) costs its length.
  (0..rows).each { |r| table[r][0] = r }
  (0..cols).each { |c| table[0][c] = c }

  cols.times do |c|
    rows.times do |r|
      table[r + 1][c + 1] =
        if s[r, 1] == t[c, 1]
          # Elements match: no additional cost over the diagonal.
          table[r][c]
        else
          cheapest = [
            table[r][c + 1], # deletion
            table[r + 1][c], # insertion
            table[r][c]      # substitution
          ].min
          cheapest + 1
        end
    end
  end

  table[rows][cols]
end

# Finds duplicate journal entries and photos under +storage_path+ and moves
# them into ~/Desktop/DayOneDuplicates ("entries" and "photos" subfolders).
# Files are moved, never permanently deleted.
#
# similar - false (default): two entries are duplicates when the MD5 of
#           their entry text + tags + starred flag is identical; of each
#           duplicate group the entry with the earliest 'Creation Date'
#           is kept and the rest are moved.
#           true: entries created on the same day are compared with a
#           normalized Levenshtein distance and candidate pairs are
#           printed. This mode calls +exit+ once the scan finishes, so
#           nothing is moved.
#
# In the default mode, photos with identical MD5 content are also
# de-duplicated: a duplicate photo is moved only if no entry with the same
# basename still exists.
def dedup(similar=false)
  entry_path = File.join(storage_path, 'entries')
  photo_path = File.join(storage_path, 'photos')
  files = Dir.glob(File.join(entry_path, '*.doentry'))
  to_delete = []

  if similar
    to_keep = []
    similar_threshold = 30
    dot_counter = 0

    files.each do |file|
      next if to_keep.include?(file) || to_delete.include?(file)

      to_keep.push(file)
      # Entries that have a photo attached are always kept and never compared.
      next if File.exist?(File.join(photo_path, File.basename(file, '.doentry') + '.jpg'))

      data = Plist::parse_xml(file)
      date = data['Creation Date'].strftime('%Y%m%d')
      lines = data['Entry Text'].split("\n")
      lines.delete_if { |line| line =~ /^\s*$/ }
      body1 = lines.join('')
      text1 = body1[0..30]

      files.each do |file2|
        next if to_keep.include?(file2) || to_delete.include?(file2)

        # Same photo rule for the comparison candidate. This must look at
        # file2's photo; checking the outer file again could never match here.
        if File.exist?(File.join(photo_path, File.basename(file2, '.doentry') + '.jpg'))
          to_keep.push(file2)
          next
        end

        data2 = Plist::parse_xml(file2)
        # Only entries created on the same calendar day are compared.
        next unless data2['Creation Date'].strftime('%Y%m%d') == date

        lines2 = data2['Entry Text'].split("\n")
        lines2.delete_if { |line| line =~ /^\s*$/ }
        body2 = lines2.join('')
        text2 = body2[0..30]

        # Cheap first pass on the leading characters, scaled to a percentage.
        distance = Levenshtein.normalized_distance(text1, text2, nil) * 100
        if distance < similar_threshold
          distance2 = Levenshtein.normalized_distance(body1[0..500], body2[0..500])
          # NOTE(review): distance2 is not scaled by 100 like `distance`, so
          # (presumably being in 0..1) it can never exceed similar_threshold
          # and this branch looks unreachable. Confirm the intended
          # comparison before relying on this mode.
          if distance2 > similar_threshold
            printf("\r%02.4f: %s => %s\n", distance, File.basename(file), File.basename(file2))
            dot_counter = 0
            # Keep whichever entry has more text.
            if lines2.join("\n").length > lines.join("\n").length
              to_delete.push(file)
              to_keep.delete(file)
            else
              to_delete.push(file2)
              to_keep.delete(file2)
            end
          end
        else
          # Progress indicator; rewind to the start of the line every 91 dots.
          print "."
          dot_counter += 1
          if dot_counter == 91
            print "\r"
            dot_counter = 0
          end
          to_keep.push(file2)
        end
      end
    end
    # NOTE(review): the similar-entry scan terminates the process here, so
    # the move-to-trash code below never runs in this mode. Looks like a
    # deliberate dry run while the heuristic is tuned; confirm.
    exit
  else
    hashes = []
    files.each do |file|
      data = Plist::parse_xml(file)
      tags = data['Tags'].nil? ? '' : data['Tags'].join('')
      # to_s guards against entries that have no 'Entry Text' key at all.
      # NOTE(review): any attached photo is not part of this fingerprint, so
      # photo-only entries with identical (e.g. empty) text hash the same.
      digest = Digest::MD5.hexdigest(data['Entry Text'].to_s + tags + data['Starred'].to_s)
      hashes.push({ 'filename' => file, 'date' => data['Creation Date'], 'hash' => digest })
    end

    # Oldest first, so the earliest entry of each duplicate group survives.
    hashes.sort_by! { |entry| entry['date'] }

    seen = {}
    hashes.each do |entry|
      if seen.key?(entry['hash'])
        to_delete.push(entry['filename'])
      else
        seen[entry['hash']] = true
      end
    end
  end

  image_hashes = Dir.glob(File.join(photo_path, '*.jpg')).map do |image|
    { 'filename' => image, 'hash' => Digest::MD5.file(image).hexdigest, 'date' => File.stat(image).ctime }
  end
  image_hashes.sort_by! { |image| image['date'] }

  seen_images = {}
  images_to_delete = []
  image_hashes.each do |image|
    if seen_images.key?(image['hash'])
      images_to_delete.push(image['filename'])
    else
      seen_images[image['hash']] = true
    end
  end

  trash = File.expand_path('~/Desktop/DayOneDuplicates')
  trash_photos = File.join(trash, 'photos')
  trash_entries = File.join(trash, 'entries')
  FileUtils.mkdir_p(trash_photos)
  FileUtils.mkdir_p(trash_entries)

  to_delete.each do |file|
    photo = File.join(photo_path, File.basename(file, '.doentry') + '.jpg')
    if File.exist?(photo)
      # The photo travels with its entry; make sure it is not moved twice.
      images_to_delete.delete(photo)
      FileUtils.mv(photo, trash_photos)
    end

    FileUtils.mv(file, trash_entries)
  end

  images_deleted = 0
  images_to_delete.each do |file|
    # A duplicate photo that still belongs to a surviving entry is left alone.
    entry = File.join(entry_path, File.basename(file, '.jpg') + '.doentry')
    next if File.exist?(entry)
    next unless File.exist?(file)

    FileUtils.mv(file, trash_photos)
    images_deleted += 1
  end

  @log.info("Moved #{to_delete.length} entries/photos to #{trash}.")
  @log.info("Found and moved #{images_deleted} images without entries.")
  # %x{open -a Finder #{trash}}
end
end
Loading

0 comments on commit 06aba2a

Please sign in to comment.