Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a reader for harvesting directly from purl-fetcher HTTP API #1511

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,7 @@ gem 'stanford-geo', '0.2.0'
# traject brings in httpclient, and we'll need this for ruby 3.4 support:
gem 'mutex_m'

gem 'faraday', '~> 2.9'
gem 'faraday-net_http_persistent', '~> 2.1'
gem 'match_map', '~> 3.0'
gem 'progress_bar'
21 changes: 21 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,13 @@ GEM
erubi (1.13.0)
factory_bot (6.5.0)
activesupport (>= 5.0.0)
faraday (2.9.1)
faraday-net_http (>= 2.0, < 3.2)
faraday-net_http (3.1.0)
net-http
faraday-net_http_persistent (2.1.0)
faraday (~> 2.5)
net-http-persistent (~> 4.0)
ffi (1.17.0-arm64-darwin)
ffi (1.17.0-x86_64-darwin)
ffi (1.17.0-x86_64-linux-gnu)
Expand All @@ -154,6 +161,8 @@ GEM
rake
hashdiff (1.1.1)
hashie (5.0.0)
highline (3.1.0)
reline
honeybadger (5.15.6)
http (5.2.0)
addressable (~> 2.8)
Expand Down Expand Up @@ -205,6 +214,10 @@ GEM
view_component
multi_json (1.15.0)
mutex_m (0.2.0)
net-http (0.4.1)
uri
net-http-persistent (4.0.2)
connection_pool (~> 2.2)
net-scp (4.0.0)
net-ssh (>= 2.6.5, < 8.0.0)
net-sftp (4.0.0)
Expand All @@ -224,13 +237,17 @@ GEM
openapi_parser (1.0.0)
optimist (3.1.0)
ostruct (0.6.0)
options (2.3.2)
parallel (1.26.3)
parser (3.3.5.0)
ast (~> 2.4.1)
racc
patience_diff (1.2.0)
optimist (~> 3.0)
pg (1.5.8)
progress_bar (1.3.4)
highline (>= 1.6)
options (~> 2.3.0)
psych (5.1.2)
stringio
public_suffix (6.0.1)
Expand Down Expand Up @@ -345,6 +362,7 @@ GEM
concurrent-ruby (~> 1.0)
unf (0.2.0)
unicode-display_width (2.5.0)
uri (0.13.0)
useragent (0.16.10)
view_component (3.14.0)
activesupport (>= 5.2.0, < 8.0)
Expand Down Expand Up @@ -381,6 +399,8 @@ DEPENDENCIES
dor-event-client
dor-rights-auth
factory_bot (~> 6.2)
faraday (~> 2.9)
faraday-net_http_persistent (~> 2.1)
honeybadger
http
i18n
Expand All @@ -390,6 +410,7 @@ DEPENDENCIES
mutex_m
parallel
pg
progress_bar
rake
retriable
rspec
Expand Down
8 changes: 3 additions & 5 deletions lib/public_cocina_record.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# frozen_string_literal: true

require 'http'

class PublicCocinaRecord
attr_reader :public_cocina_doc, :druid, :purl_url

def self.fetch(druid, purl_url: 'https://purl.stanford.edu')
response = HTTP.get("#{purl_url}/#{druid}.json")
new(druid, response.body, purl_url:) if response.status.ok?
def self.fetch(druid, purl_url: 'https://purl.stanford.edu', client: Faraday.new)
response = client.get("#{purl_url}/#{druid}.json")
new(druid, response.body, purl_url:) if response.success?
end

def initialize(druid, public_cocina, purl_url: 'https://purl.stanford.edu')
Expand Down
7 changes: 3 additions & 4 deletions lib/public_xml_record.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
# frozen_string_literal: true

require 'http'
require 'mods_display'
require 'dor/rights_auth'

class PublicXmlRecord
attr_reader :public_xml_doc, :druid, :purl_url

def self.fetch(druid, purl_url: 'https://purl.stanford.edu')
response = HTTP.get("#{purl_url}/#{druid}.xml")
new(druid, response.body, purl_url:) if response.status.ok?
def self.fetch(druid, purl_url: 'https://purl.stanford.edu', client: Faraday.new)
response = client.get("#{purl_url}/#{druid}.xml")
new(druid, response.body, purl_url:) if response.success?
end

def initialize(druid, public_xml, purl_url: 'https://purl.stanford.edu')
Expand Down
9 changes: 5 additions & 4 deletions lib/purl_record.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
require 'active_support/core_ext/module/delegation'

class PurlRecord
attr_reader :druid, :purl_url
attr_reader :druid, :purl_url, :client

def initialize(druid, purl_url: 'https://purl.stanford.edu')
def initialize(druid, purl_url: 'https://purl.stanford.edu', client: Faraday.new)
@druid = druid
@purl_url = purl_url
@client = client
end

def searchworks_id
Expand All @@ -21,11 +22,11 @@ def druid_tree
end

def public_xml
@public_xml ||= PublicXmlRecord.fetch(druid, purl_url:)
@public_xml ||= PublicXmlRecord.fetch(druid, purl_url:, client:)
end

def public_cocina
@public_cocina ||= PublicCocinaRecord.fetch(druid, purl_url:)
@public_cocina ||= PublicCocinaRecord.fetch(druid, purl_url:, client:)
end

def public_meta_json
Expand Down
45 changes: 45 additions & 0 deletions lib/traject/readers/purl_fetcher_reader.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# frozen_string_literal: true

require 'faraday'
require 'progress_bar'

module Traject
  # A reader that fetches all items released to a target from purl-fetcher
  # and yields one PurlRecord per released item.
  #
  # Settings read by this reader:
  #   purl_fetcher.url        base URL of the purl-fetcher API
  #                           (defaults to https://purl-fetcher.stanford.edu)
  #   purl_fetcher.target     release target to list (defaults to Searchworks)
  #   purl.url                handed to each PurlRecord as its purl_url
  #   processing_thread_pool  handed to the HTTP adapter as its pool_size
  class PurlFetcherReader
    # Raised when purl-fetcher answers the released-items request with a
    # non-success HTTP status.
    class RequestError < StandardError; end

    attr_reader :input_stream, :settings

    # @param input_stream [Object] kept for the reader interface; this reader
    #   only stores it and never reads from it
    # @param settings [Hash] settings, wrapped in Traject::Indexer::Settings
    def initialize(input_stream, settings)
      @settings = Traject::Indexer::Settings.new settings
      @input_stream = input_stream
    end

    # Yields a PurlRecord for every entry in the released-items listing,
    # advancing a progress bar after each one.
    #
    # @yieldparam record [PurlRecord]
    # @return [Enumerator] when called without a block
    # @raise [RequestError] if the listing request does not succeed
    def each
      return to_enum(:each) unless block_given?

      records = released_records
      bar = ProgressBar.new(records.length)

      records.each do |record|
        # Entries carry a namespaced id ("druid:xyz"); PurlRecord gets the bare id.
        druid = record['druid'].delete_prefix('druid:')
        yield PurlRecord.new(druid, purl_url: @settings['purl.url'], client:)
        bar.increment!
      end
    end

    private

    # Requests the released-items listing for the configured target and
    # returns the parsed JSON body.
    # NOTE(review): the body is presumably a JSON array of objects that each
    # have a 'druid' key -- confirm against the purl-fetcher API.
    #
    # The status is checked here instead of with Faraday's :raise_error
    # middleware because the same client is handed to every PurlRecord, and
    # that middleware would then apply to their requests as well.
    def released_records
      path = "/released/#{target}.json"
      response = client.get(path)
      unless response.success?
        raise RequestError, "purl-fetcher request for #{path} failed with HTTP status #{response.status}"
      end

      JSON.parse(response.body)
    end

    def target
      @settings['purl_fetcher.target'] || 'Searchworks'
    end

    def host
      @settings['purl_fetcher.url'] || 'https://purl-fetcher.stanford.edu'
    end

    # Memoized so that the listing request and every yielded PurlRecord share
    # one connection object.
    def client
      @client ||= Faraday.new(url: host) do |builder|
        builder.adapter(:net_http_persistent, pool_size: @settings['processing_thread_pool'])
      end
    end
  end
end