diff --git a/Gemfile b/Gemfile
index 5d0be16a..fd186669 100644
--- a/Gemfile
+++ b/Gemfile
@@ -49,4 +49,7 @@ gem 'stanford-geo', '0.2.0'
 
 # traject brings in httpclient, and we'll need this for ruby 3.4 support:
 gem 'mutex_m'
+gem 'faraday', '~> 2.9'
+gem 'faraday-net_http_persistent', '~> 2.1'
 gem 'match_map', '~> 3.0'
+gem 'progress_bar'
diff --git a/Gemfile.lock b/Gemfile.lock
index 97c6ebe6..a79c6434 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -146,6 +146,13 @@ GEM
     erubi (1.13.0)
     factory_bot (6.4.6)
       activesupport (>= 5.0.0)
+    faraday (2.9.1)
+      faraday-net_http (>= 2.0, < 3.2)
+    faraday-net_http (3.1.0)
+      net-http
+    faraday-net_http_persistent (2.1.0)
+      faraday (~> 2.5)
+      net-http-persistent (~> 4.0)
     ffi (1.17.0-arm64-darwin)
     ffi (1.17.0-x86_64-darwin)
     ffi (1.17.0-x86_64-linux-gnu)
@@ -154,6 +161,8 @@ GEM
       rake
     hashdiff (1.1.1)
     hashie (5.0.0)
+    highline (3.1.0)
+      reline
     honeybadger (5.15.6)
     http (5.2.0)
       addressable (~> 2.8)
@@ -205,6 +214,10 @@ GEM
       view_component
     multi_json (1.15.0)
     mutex_m (0.2.0)
+    net-http (0.4.1)
+      uri
+    net-http-persistent (4.0.2)
+      connection_pool (~> 2.2)
     net-scp (4.0.0)
       net-ssh (>= 2.6.5, < 8.0.0)
     net-sftp (4.0.0)
@@ -223,6 +236,7 @@ GEM
       commonmarker (>= 1.0)
     openapi_parser (1.0.0)
     optimist (3.1.0)
+    options (2.3.2)
     parallel (1.26.3)
     parser (3.3.4.2)
       ast (~> 2.4.1)
@@ -230,6 +244,9 @@ GEM
     patience_diff (1.2.0)
       optimist (~> 3.0)
     pg (1.5.7)
+    progress_bar (1.3.4)
+      highline (>= 1.6)
+      options (~> 2.3.0)
     psych (5.1.2)
       stringio
     public_suffix (6.0.1)
@@ -345,6 +362,7 @@ GEM
       concurrent-ruby (~> 1.0)
     unf (0.2.0)
     unicode-display_width (2.5.0)
+    uri (0.13.0)
     useragent (0.16.10)
     view_component (3.14.0)
       activesupport (>= 5.2.0, < 8.0)
@@ -381,6 +399,8 @@ DEPENDENCIES
   dor-event-client
   dor-rights-auth
   factory_bot (~> 6.2)
+  faraday (~> 2.9)
+  faraday-net_http_persistent (~> 2.1)
   honeybadger
   http
   i18n
@@ -390,6 +410,7 @@ DEPENDENCIES
   mutex_m
   parallel
   pg
+  progress_bar
   rake
   retriable
   rspec
diff --git a/lib/public_cocina_record.rb b/lib/public_cocina_record.rb
index 6ee77d47..eba65a20 100644
--- a/lib/public_cocina_record.rb
+++ b/lib/public_cocina_record.rb
@@ -1,19 +1,25 @@
 # frozen_string_literal: true
 
-require 'http'
+require 'faraday'
+require 'json'
 
 class PublicCocinaRecord
   attr_reader :public_cocina_doc, :druid, :purl_url
 
-  def self.fetch(druid, purl_url: 'https://purl.stanford.edu')
-    response = HTTP.get("#{purl_url}/#{druid}.json")
-    new(druid, response.body, purl_url:) if response.status.ok?
+  # Fetches "#{purl_url}/#{druid}.json" and wraps the body in a record.
+  # @param client [#get] HTTP client; a bare Faraday connection by default
+  # @return [PublicCocinaRecord, nil] nil unless the response is successful
+  def self.fetch(druid, purl_url: 'https://purl.stanford.edu', client: Faraday.new)
+    response = client.get("#{purl_url}/#{druid}.json")
+    new(druid, response.body, purl_url:) if response.success?
   end
 
   def initialize(druid, public_cocina, purl_url: 'https://purl.stanford.edu')
     @druid = druid
     @purl_url = purl_url
-    @public_cocina_doc = JSON.parse(public_cocina)
+    # A client without JSON response middleware (such as the default
+    # Faraday.new) hands back the raw body, so parse strings here.
+    @public_cocina_doc = public_cocina.is_a?(String) ? JSON.parse(public_cocina) : public_cocina
   end
 
   def cocina_access
diff --git a/lib/public_xml_record.rb b/lib/public_xml_record.rb
index e364a80f..ad316086 100644
--- a/lib/public_xml_record.rb
+++ b/lib/public_xml_record.rb
@@ -1,15 +1,18 @@
 # frozen_string_literal: true
 
-require 'http'
+require 'faraday'
 require 'mods_display'
 require 'dor/rights_auth'
 
 class PublicXmlRecord
   attr_reader :public_xml_doc, :druid, :purl_url
 
-  def self.fetch(druid, purl_url: 'https://purl.stanford.edu')
-    response = HTTP.get("#{purl_url}/#{druid}.xml")
-    new(druid, response.body, purl_url:) if response.status.ok?
+  # Fetches "#{purl_url}/#{druid}.xml" and wraps the body in a record.
+  # @param client [#get] HTTP client; a bare Faraday connection by default
+  # @return [PublicXmlRecord, nil] nil unless the response is successful
+  def self.fetch(druid, purl_url: 'https://purl.stanford.edu', client: Faraday.new)
+    response = client.get("#{purl_url}/#{druid}.xml")
+    new(druid, response.body, purl_url:) if response.success?
   end
 
   def initialize(druid, public_xml, purl_url: 'https://purl.stanford.edu')
diff --git a/lib/purl_record.rb b/lib/purl_record.rb
index 752b0fef..f83836e3 100644
--- a/lib/purl_record.rb
+++ b/lib/purl_record.rb
@@ -5,11 +5,17 @@
 require 'active_support/core_ext/module/delegation'
+require 'faraday'
 
 class PurlRecord
-  attr_reader :druid, :purl_url
+  attr_reader :druid, :purl_url, :client
 
-  def initialize(druid, purl_url: 'https://purl.stanford.edu')
+  # @param client [#get, nil] HTTP client shared by the XML and Cocina
+  #   fetches; a new Faraday connection is built when none is given
+  def initialize(druid, purl_url: 'https://purl.stanford.edu', client: nil)
     @druid = druid
     @purl_url = purl_url
+    # Never store nil: it is forwarded as an explicit keyword below and
+    # would override the fetchers' own default client.
+    @client = client || Faraday.new
   end
 
   def searchworks_id
@@ -21,11 +27,11 @@ def druid_tree
   end
 
   def public_xml
-    @public_xml ||= PublicXmlRecord.fetch(druid, purl_url:)
+    @public_xml ||= PublicXmlRecord.fetch(druid, purl_url:, client:)
   end
 
   def public_cocina
-    @public_cocina ||= PublicCocinaRecord.fetch(druid, purl_url:)
+    @public_cocina ||= PublicCocinaRecord.fetch(druid, purl_url:, client:)
   end
 
   def public_meta_json
diff --git a/lib/traject/readers/purl_fetcher_reader.rb b/lib/traject/readers/purl_fetcher_reader.rb
new file mode 100644
index 00000000..e2e1f38e
--- /dev/null
+++ b/lib/traject/readers/purl_fetcher_reader.rb
@@ -0,0 +1,64 @@
+# frozen_string_literal: true
+
+require 'faraday'
+require 'progress_bar'
+
+module Traject
+  # A reader that fetches all items released to a target from purl-fetcher.
+  #
+  # Settings read by this class:
+  #   purl_fetcher.target     release target to list (default 'Searchworks')
+  #   purl_fetcher.url        purl-fetcher base URL (default 'https://purl-fetcher.stanford.edu')
+  #   purl.url                base URL handed to each PurlRecord
+  #   processing_thread_pool  pool_size passed to the net_http_persistent adapter
+  class PurlFetcherReader
+    attr_reader :input_stream, :settings
+
+    # @param input_stream [Object] stored and exposed, but not read by this reader
+    # @param settings [Hash] traject settings
+    def initialize(input_stream, settings)
+      @settings = Traject::Indexer::Settings.new settings
+      @input_stream = input_stream
+    end
+
+    # Yields one PurlRecord per object listed by purl-fetcher for the target.
+    # Every record is handed this reader's HTTP client.
+    #
+    # @yieldparam record [PurlRecord]
+    # @return [Enumerator] when no block is given
+    # @raise [RuntimeError] when purl-fetcher does not answer with a success status
+    def each
+      return to_enum(:each) unless block_given?
+
+      response = client.get("/released/#{target}.json")
+      # Fail loudly here; iterating an error body would only blow up later
+      # with a less helpful message.
+      raise "purl-fetcher request for #{target} failed with status #{response.status}" unless response.success?
+
+      bar = ProgressBar.new(response.body.length)
+      response.body.each do |obj|
+        yield PurlRecord.new(obj['druid'].delete_prefix('druid:'), purl_url: @settings['purl.url'], client:)
+        bar.increment!
+      end
+    end
+
+    private
+
+    def target
+      @settings['purl_fetcher.target'] || 'Searchworks'
+    end
+
+    def host
+      @settings['purl_fetcher.url'] || 'https://purl-fetcher.stanford.edu'
+    end
+
+    # Memoized connection. JSON parsing is limited to JSON content types, so
+    # other bodies (e.g. the PURL XML) are returned untouched.
+    def client
+      @client ||= Faraday.new(url: host) do |builder|
+        builder.response :json, content_type: /\bjson$/
+        builder.adapter(:net_http_persistent, pool_size: @settings['processing_thread_pool'])
+      end
+    end
+  end
+end