Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rake task for importing annotations #233

Merged
merged 2 commits into from
Feb 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,7 @@ TOKEN_PRIVATE_KEY='MIIEogIBAAKCAQEAufNrDQRl6Gj1yuga0DVHeJ4fi+lNWtn4S8XRU8/nBwm9v

# In production, set up Sentry.io for error tracking
# SENTRY_DSN=

# Set these if you are running rake tasks to import data from Google Sheets
# GOOGLE_CLIENT_ID=XYZ
# GOOGLE_CLIENT_SECRET=XYZ
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ gem 'oj', '~> 3.4'
gem 'sentry-raven'
gem 'readthis'
gem 'hiredis'
gem 'google-api-client'

# See https://github.com/rails/execjs#readme for more supported runtimes
# gem 'therubyracer', platforms: :ruby
Expand Down
39 changes: 39 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ GEM
crack (0.4.3)
safe_yaml (~> 1.0.0)
crass (1.0.3)
declarative (0.0.10)
declarative-option (0.1.0)
devise (4.4.1)
bcrypt (~> 3.0)
orm_adapter (~> 0.1)
Expand All @@ -82,10 +84,26 @@ GEM
ffi (1.9.21)
globalid (0.4.1)
activesupport (>= 4.2.0)
google-api-client (0.19.8)
addressable (~> 2.5, >= 2.5.1)
googleauth (>= 0.5, < 0.7.0)
httpclient (>= 2.8.1, < 3.0)
mime-types (~> 3.0)
representable (~> 3.0)
retriable (>= 2.0, < 4.0)
googleauth (0.6.2)
faraday (~> 0.12)
jwt (>= 1.4, < 3.0)
logging (~> 2.0)
memoist (~> 0.12)
multi_json (~> 1.11)
os (~> 0.9)
signet (~> 0.7)
hashdiff (0.3.7)
hiredis (0.6.1)
httparty (0.16.0)
multi_xml (>= 0.5.2)
httpclient (2.8.3)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
jmespath (1.3.1)
Expand All @@ -95,12 +113,20 @@ GEM
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
ruby_dep (~> 1.2)
little-plugger (1.1.4)
logging (2.2.2)
little-plugger (~> 1.1)
multi_json (~> 1.10)
loofah (2.2.0)
crass (~> 1.0.2)
nokogiri (>= 1.5.9)
mail (2.7.0)
mini_mime (>= 0.1.1)
memoist (0.16.0)
method_source (0.9.0)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_mime (1.0.0)
mini_portile2 (2.3.0)
minitest (5.11.3)
Expand All @@ -115,6 +141,7 @@ GEM
mini_portile2 (~> 2.3.0)
oj (3.4.0)
orm_adapter (0.5.0)
os (0.9.6)
parallel (1.12.1)
parser (2.5.0.0)
ast (~> 2.4.0)
Expand Down Expand Up @@ -173,6 +200,10 @@ GEM
redis (4.0.1)
redis-namespace (1.6.0)
redis (>= 3.0.4)
representable (3.0.4)
declarative (< 0.1.0)
declarative-option (< 0.2.0)
uber (< 0.2.0)
responders (2.4.0)
actionpack (>= 4.2.0, < 5.3)
railties (>= 4.2.0, < 5.3)
Expand All @@ -182,6 +213,7 @@ GEM
redis-namespace (~> 1.3)
sinatra (>= 0.9.2)
vegas (~> 0.1.2)
retriable (3.1.1)
rubocop (0.52.1)
parallel (~> 1.10)
parser (>= 2.4.0.2, < 3.0)
Expand All @@ -205,6 +237,11 @@ GEM
tilt (>= 1.1, < 3)
sentry-raven (2.7.2)
faraday (>= 0.7.6, < 1.0)
signet (0.8.1)
addressable (~> 2.3)
faraday (~> 0.9)
jwt (>= 1.5, < 3.0)
multi_json (~> 1.10)
sinatra (2.0.1)
mustermann (~> 1.0)
rack (~> 2.0)
Expand All @@ -227,6 +264,7 @@ GEM
tilt (2.0.8)
tzinfo (1.2.5)
thread_safe (~> 0.1)
uber (0.1.0)
uglifier (4.1.6)
execjs (>= 0.3.0, < 3)
unicode-display_width (1.3.0)
Expand Down Expand Up @@ -255,6 +293,7 @@ DEPENDENCIES
byebug
devise
dotenv-rails
google-api-client
hiredis
httparty
jwt (~> 2.1)
Expand Down
175 changes: 175 additions & 0 deletions lib/tasks/import_from_sheets.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
require 'google/apis/sheets_v4'
require 'googleauth'
require 'googleauth/stores/file_token_store'

# Marker stored under the `_importer` key of every annotation hash built by
# `annotation_data_for_row`.
IMPORT_TYPE = 'rake_task_v1'.freeze
# Value passed as `base_url` to the Google user authorizer, both when building
# the authorization URL and when exchanging the pasted code for credentials.
OOB_URI = 'urn:ietf:wg:oauth:2.0:oob'.freeze
# Application name set on the Sheets service's client options in `sheets_client`.
APPLICATION_NAME = 'Web Monitoring DB Importer'.freeze


# Imports annotations from a Google spreadsheet.
#
# Task arguments:
#   sheet_id   - ID of the spreadsheet to read.
#   user_email - e-mail of the User the annotations are attributed to; the
#                task fails if no such user exists.
#   tabs       - optional comma-separated list of tab titles; when omitted,
#                every tab in the spreadsheet is imported.
#   start_row  - first spreadsheet row to read (defaults to 7).
#   end_row    - last spreadsheet row to read (defaults to open-ended).
#
# Set the VERBOSE environment variable to print a line per created annotation.
# A summary of created/skipped/errored rows is always printed, even when the
# import aborts part-way through.
desc 'Create annotations from data in analysts’ Google sheets -- only sheet ID & user e-mail are required.'
task :import_annotations_from_sheet, [:sheet_id, :user_email, :tabs, :start_row, :end_row] => [:environment] do |_t, args|
  verbose = ENV['VERBOSE']
  sheet_id = args[:sheet_id]
  start_row = args.fetch(:start_row, 7).to_i
  end_row = args[:end_row] || ''

  # Validate the user before building the Sheets client, so a mistyped e-mail
  # fails immediately instead of after the interactive OAuth prompt.
  user = User.find_by!(email: args[:user_email])
  client = sheets_client

  tab_count = 0
  annotated_count = 0
  skipped_count = 0
  error_count = 0

  tabs =
    if args[:tabs]
      args[:tabs].split(',').map(&:strip)
    else
      client.get_spreadsheet(sheet_id).sheets.map { |sheet| sheet.properties.title }
    end

  begin
    tabs.each do |tab_title|
      puts "Importing spreadsheet tab '#{tab_title}'"

      # A1 notation requires single-quoting titles that contain spaces or
      # special characters; embedded single quotes are escaped by doubling.
      quoted_title = "'#{tab_title.gsub("'", "''")}'"

      # `values` is nil when the requested range contains no data at all.
      rows = client.get_spreadsheet_values(
        sheet_id,
        "#{quoted_title}!A#{start_row}:AL#{end_row}"
      ).values || []

      rows.each_with_index do |row, index|
        begin
          # Index 9 (spreadsheet column J) is latest-to-base
          change = change_for_version_url(row[9])
        rescue StandardError => error
          puts "Row #{start_row + index}: #{error.message}"
          error_count += 1
          next
        end

        # No change URL in this row: nothing to annotate, but count it so the
        # summary reflects how many rows were passed over.
        if change.nil?
          skipped_count += 1
          next
        end

        change.annotate(annotation_data_for_row(row), user)
        annotated_count += 1

        puts "Annotated '#{change.version.page.url}' change '#{change.api_id}'" if verbose
      end

      tab_count += 1
    end
  ensure
    puts ''
    puts 'RESULTS:'
    puts '--------'
    puts "Created #{annotated_count} annotations"
    puts "Skipped #{skipped_count} rows"
    puts "Errored #{error_count} rows"
    puts "In #{tab_count} spreadsheet tabs"
    puts ''
  end
end

# Resolves a change URL taken from the spreadsheet into a Change record.
#
# Two URL shapes are understood:
#   * Versionista comparison URLs ending in "<id_a>:<id_b>", where <id_a> is
#     looked up as the "to" version and <id_b> as the "from" version via the
#     `version_id` stored in each Version's source metadata.
#   * monitoring.envirodatagov.org page URLs ending in "<from>..<to>", where
#     both parts are passed straight to `Version.find`.
#
# Returns nil when `url` is not present. Raises StandardError for a URL that
# matches neither shape; lookup failures from Version propagate to the caller.
def change_for_version_url(url)
  return unless url.present?

  find_versionista = lambda do |version_id|
    Version.find_by!(
      "source_type = 'versionista' AND source_metadata->>'version_id' = ?",
      version_id
    )
  end

  if (versionista = /versionista\.com\/\d+\/\d+\/(\d+):(\d+)/.match(url))
    # The newer ("to") version comes first in Versionista URLs.
    newer = find_versionista.call(versionista[1])
    older = find_versionista.call(versionista[2])
    Change.between(from: older, to: newer, create: true)
  elsif (ours = /monitoring\.envirodatagov\.org\/page\/[^\/]+\/([^\/.]+)\.\.([^\/.]+)/.match(url))
    older = Version.find(ours[1])
    newer = Version.find(ours[2])
    Change.between(from: older, to: newer, create: true)
  else
    raise StandardError, "Unknown change URL format: '#{url}'"
  end
end

# Builds the annotation hash for one spreadsheet row.
#
# Starting at row index 17, eighteen consecutive cells are read as boolean
# flags (true when the cell is present, false otherwise), followed by one
# cell that is copied verbatim as the free-text 'notes' value. The result
# also carries the `_importer` marker so imported annotations can be
# identified later.
def annotation_data_for_row(row)
  first_column = 17
  # Field names from the UI project, in spreadsheet column order.
  flag_names = %w[
    indiv_1 indiv_2 indiv_3 indiv_4 indiv_5 indiv_6
    repeat_7 repeat_8 repeat_9 repeat_10 repeat_11 repeat_12
    sig_1 sig_2 sig_3 sig_4 sig_5 sig_6
  ]

  annotation = { _importer: IMPORT_TYPE }
  flag_names.each_with_index do |name, offset|
    annotation[name] = row[first_column + offset].present?
  end
  # The notes cell sits immediately after the last flag column.
  annotation['notes'] = row[first_column + flag_names.length]
  annotation
end

# Returns a Google Sheets v4 service object with the application name set and
# authorization supplied by `authorize_google`.
def sheets_client
  Google::Apis::SheetsV4::SheetsService.new.tap do |sheets|
    sheets.client_options.application_name = APPLICATION_NAME
    sheets.authorization = authorize_google
  end
end

# Obtains Google user credentials with read-only spreadsheet scope.
#
# Reads the OAuth client ID and secret from the `GOOGLE_CLIENT_ID` and
# `GOOGLE_CLIENT_SECRET` environment variables. When the token store has no
# credentials for the 'default' user, prints an authorization URL and reads
# the resulting code from standard input.
#
# Returns the credentials object produced by the Google authorizer.
# Raises RuntimeError when either environment variable is missing or blank,
# or when no authorization code can be read from standard input.
def authorize_google
  google_id = ENV['GOOGLE_CLIENT_ID']
  google_secret = ENV['GOOGLE_CLIENT_SECRET']
  # An empty or whitespace-only value is as unusable as a missing one, so
  # report it here rather than letting the OAuth request fail obscurely.
  if [google_id, google_secret].any? { |value| value.nil? || value.strip.empty? }
    raise 'You must have both `GOOGLE_CLIENT_ID` and `GOOGLE_CLIENT_SECRET` environment variables set.'
  end

  client_id = Google::Auth::ClientId.new(google_id, google_secret)
  scope = Google::Apis::SheetsV4::AUTH_SPREADSHEETS_READONLY
  # The token store is backed by a fresh temporary file on each call.
  token_store = Google::Auth::Stores::FileTokenStore.new(file: Tempfile.new)
  authorizer = Google::Auth::UserAuthorizer.new(client_id, scope, token_store)

  user_id = 'default'
  credentials = authorizer.get_credentials(user_id)
  if credentials.nil?
    url = authorizer.get_authorization_url(base_url: OOB_URI)
    puts 'Open the following URL in your browser and enter the ' \
         'resulting code after authorization:'
    puts url

    # `gets` returns nil at end-of-input (e.g. when run non-interactively).
    code = STDIN.gets.to_s.strip
    raise 'No authorization code was entered.' if code.empty?

    credentials = authorizer.get_and_store_credentials_from_code(
      user_id: user_id,
      code: code,
      base_url: OOB_URI
    )
  end

  credentials
end