Skip to content

Commit

Permalink
Change the way of downloading the .csv from .zip, changed the tests, …
Browse files Browse the repository at this point in the history
…added rubyzip for extracting .csv from the .zip
  • Loading branch information
tomasdrga committed Nov 9, 2023
1 parent 32ae261 commit 1fba5dd
Show file tree
Hide file tree
Showing 12 changed files with 98 additions and 26 deletions.
3 changes: 2 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ gem 'dotenv'
gem 'sidekiq', '~> 5.2.9' # Unlock when we migrate to Redis 4+
gem 'faraday'

gem 'faraday-httpclient', '~> 2.0', '>= 2.0.1'
gem 'faraday-httpclient'
gem 'rubyzip'

gem 'newrelic_rpm'
gem 'rollbar'
Expand Down
4 changes: 3 additions & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ GEM
rspec-mocks (~> 3.9)
rspec-support (~> 3.9)
rspec-support (3.9.3)
rubyzip (2.3.2)
safe_yaml (1.0.5)
sidekiq (5.2.9)
connection_pool (~> 2.2, >= 2.2.2)
Expand Down Expand Up @@ -209,7 +210,7 @@ DEPENDENCIES
dotenv
factory_bot_rails
faraday
faraday-httpclient (~> 2.0, >= 2.0.1)
faraday-httpclient
foreman
newrelic_rpm
pg (~> 0.20)
Expand All @@ -219,6 +220,7 @@ DEPENDENCIES
rails (~> 6.0.3.2)
rollbar
rspec-rails
rubyzip
sidekiq (~> 5.2.9)
simplecov (~> 0.10, < 0.18)
webmock
Expand Down
13 changes: 2 additions & 11 deletions app/jobs/upvs/fetch_services_with_forms_list_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ class Upvs::FetchServicesWithFormsListJob < ApplicationJob
queue_as :upvs

def perform(url, downloader: HarvesterUtils::Downloader)
csv_file = downloader.download_file(url)
zip_file = downloader.download_file(url)
csv_file = downloader.extract_csv(zip_file)

csv_options = {
encoding: 'UTF-8',
Expand Down Expand Up @@ -36,16 +37,13 @@ def self.source
def each_row_as_attributes(csv_file, csv_options)
CSV.foreach(csv_file, csv_options) do |row|
row = row.to_h.transform_keys { |k| k.to_s.gsub(/\p{Cf}|"/, '') }
row = row.to_h.transform_keys { |k| k.to_s }

row[row.keys.first]&.sub!(/\A"/, '')
row[row.keys.last]&.sub!(/"\z/, '')

row = row.transform_values { |value| value&.gsub(/[\\"]/,'') }
row = row.transform_values { |v| v == 'NULL' ? nil : v }

check_for_row_formatting(row)

yield(
instance_id: row.fetch('IdServiceInstance'),
external_code: row.fetch('ExternalCode').presence,
Expand All @@ -63,11 +61,4 @@ def each_row_as_attributes(csv_file, csv_options)
)
end
end

# Repairs a row whose first cell swallowed several comma-separated columns.
#
# The first key is split on commas and each resulting name is assigned the
# value found at the same position in the comma-split first value, with
# backslashes and double quotes stripped. When the first key really was a
# composite (it contains a comma), the composite entry is removed afterwards.
#
# Mutates +row+ in place.
#
# @param row [Hash{String => String, nil}] parsed CSV row keyed by header name
# @return [Object, nil] the value of the removed composite entry, or nil when
#   nothing was removed (callers in this file ignore the result)
def check_for_row_formatting(row)
  first_key, first_value = row.first
  # Empty row: nothing to repair (previously raised NoMethodError on nil).
  return if first_key.nil?

  # A nil first value (e.g. a 'NULL' cell already mapped to nil) yields no
  # parts, so every split-out key is assigned nil instead of crashing.
  value_parts = first_value&.split(',') || []

  first_key.split(',').each_with_index do |key, index|
    row[key] = value_parts[index]&.gsub(/[\\"]/, '')
  end

  row.delete(first_key) if first_key.include?(',')
end
end
2 changes: 1 addition & 1 deletion app/jobs/upvs/find_services_with_forms_list_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def perform(downloader: HarvesterUtils::Downloader, fetch_job: Upvs::FetchServic
html = downloader.download(DATASET_URL)
doc = Nokogiri::HTML.parse(html)
resource_link = doc.search('.resource-item .dropdown-menu a').detect do |a|
a['href'].include?('.csv')
a['href'].include?('.zip')
end

if resource_link
Expand Down

This file was deleted.

67 changes: 65 additions & 2 deletions db/structure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4612,6 +4612,39 @@ CREATE TABLE public.schema_migrations (
);


--
-- Name: public_authority_active_edesks; Type: TABLE; Schema: upvs; Owner: -
--

CREATE TABLE upvs.public_authority_active_edesks (
id bigint NOT NULL,
cin bigint NOT NULL,
uri character varying NOT NULL,
name character varying NOT NULL,
created_at timestamp(6) without time zone NOT NULL,
updated_at timestamp(6) without time zone NOT NULL
);


--
-- Name: public_authority_active_edesks_id_seq; Type: SEQUENCE; Schema: upvs; Owner: -
--

CREATE SEQUENCE upvs.public_authority_active_edesks_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;


--
-- Name: public_authority_active_edesks_id_seq; Type: SEQUENCE OWNED BY; Schema: upvs; Owner: -
--

ALTER SEQUENCE upvs.public_authority_active_edesks_id_seq OWNED BY upvs.public_authority_active_edesks.id;


--
-- Name: public_authority_edesks; Type: TABLE; Schema: upvs; Owner: -
--
Expand Down Expand Up @@ -4657,7 +4690,7 @@ CREATE TABLE upvs.services_with_forms (
meta_is_code character varying,
name character varying,
type character varying,
institution_uri character varying,
institution_uri character varying NOT NULL,
institution_name character varying,
valid_from timestamp without time zone,
valid_to timestamp without time zone,
Expand Down Expand Up @@ -5551,6 +5584,13 @@ ALTER TABLE ONLY metais.project_versions ALTER COLUMN id SET DEFAULT nextval('me
ALTER TABLE ONLY metais.projects ALTER COLUMN id SET DEFAULT nextval('metais.projects_id_seq'::regclass);


--
-- Name: public_authority_active_edesks id; Type: DEFAULT; Schema: upvs; Owner: -
--

ALTER TABLE ONLY upvs.public_authority_active_edesks ALTER COLUMN id SET DEFAULT nextval('upvs.public_authority_active_edesks_id_seq'::regclass);


--
-- Name: public_authority_edesks id; Type: DEFAULT; Schema: upvs; Owner: -
--
Expand Down Expand Up @@ -6581,6 +6621,14 @@ ALTER TABLE ONLY public.schema_migrations
ADD CONSTRAINT schema_migrations_pkey PRIMARY KEY (version);


--
-- Name: public_authority_active_edesks public_authority_active_edesks_pkey; Type: CONSTRAINT; Schema: upvs; Owner: -
--

ALTER TABLE ONLY upvs.public_authority_active_edesks
ADD CONSTRAINT public_authority_active_edesks_pkey PRIMARY KEY (id);


--
-- Name: public_authority_edesks public_authority_edesks_pkey; Type: CONSTRAINT; Schema: upvs; Owner: -
--
Expand Down Expand Up @@ -8942,6 +8990,20 @@ CREATE INDEX "index_metais.projects_on_latest_version_id" ON metais.projects USI
CREATE INDEX "index_metais.projects_on_uuid" ON metais.projects USING btree (uuid);


--
-- Name: index_upvs.public_authority_active_edesks_on_cin; Type: INDEX; Schema: upvs; Owner: -
--

CREATE INDEX "index_upvs.public_authority_active_edesks_on_cin" ON upvs.public_authority_active_edesks USING btree (cin);


--
-- Name: index_upvs.public_authority_active_edesks_on_uri; Type: INDEX; Schema: upvs; Owner: -
--

CREATE UNIQUE INDEX "index_upvs.public_authority_active_edesks_on_uri" ON upvs.public_authority_active_edesks USING btree (uri);


--
-- Name: index_upvs.public_authority_edesks_on_cin; Type: INDEX; Schema: upvs; Owner: -
--
Expand Down Expand Up @@ -11002,6 +11064,7 @@ INSERT INTO "schema_migrations" (version) VALUES
('20220919080112'),
('20220919084308'),
('20221219105855'),
('20230919065811');
('20231106173059'),
('20231107130000');


15 changes: 15 additions & 0 deletions lib/harvester_utils/downloader.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
require 'faraday'
require 'tempfile'
require 'faraday/httpclient'
require 'zip'

module HarvesterUtils
class Downloader

DownloadError = Class.new(StandardError)
NoCSVError = Class.new(StandardError)

def self.url_exists?(url)
status = Faraday.head(url).status
Expand All @@ -31,6 +33,19 @@ def self.download_file(url)
file
end

# Extracts the first top-level CSV entry of a zip archive into a temp file.
#
# @param zip_file [String, #to_path] the zip archive to open
# @return [Tempfile] closed temp file holding the raw bytes of the CSV entry
# @raise [NoCSVError] when the archive has no entry matching '*.csv'
def self.extract_csv(zip_file)
  Zip::File.open(zip_file) do |zip|
    # glob returns an Array (possibly empty), never nil, so the presence
    # check has to be made on the first match rather than on the result.
    csv_entry = zip.glob('*.csv').first
    raise NoCSVError, "No CSV file found in the provided zip_file: #{zip_file}" if csv_entry.nil?

    file = Tempfile.new(rand(1_000_000).to_s)
    file.binmode
    # Block form closes the entry's input stream once the copy is done.
    csv_entry.get_input_stream { |stream| file.write(stream.read) }
    file.close
    file
  end
end

def self.download(url)
response = Faraday.get(url)
raise DownloadError, "Unexpected response status: #{response.status} for url: #{url}" if response.status != 200
Expand Down
Binary file added spec/fixtures/upvs/services-v1.zip
Binary file not shown.
Binary file added spec/fixtures/upvs/services-v2.zip
Binary file not shown.
Binary file added spec/fixtures/upvs/services-v3.zip
Binary file not shown.
Binary file added spec/fixtures/upvs/services-v4.zip
Binary file not shown.
15 changes: 10 additions & 5 deletions spec/jobs/upvs/fetch_services_with_forms_list_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
let(:downloader) { double }

it 'downloads and imports ServicesWithForms list in V4 format' do
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v4.csv'))
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v4.zip'))
expect(downloader).to receive(:extract_csv).with(fixture_filepath('upvs/services-v4.zip')).and_return(fixture_filepath('upvs/services-v4.csv'))
subject.perform(url, downloader: downloader)

expect(Upvs::ServiceWithForm.first).to have_attributes(
Expand All @@ -27,7 +28,8 @@
end

it 'downloads and imports ServicesWithForms list in V3 format' do
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v3.csv'))
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v3.zip'))
expect(downloader).to receive(:extract_csv).with(fixture_filepath('upvs/services-v3.zip')).and_return(fixture_filepath('upvs/services-v3.csv'))
subject.perform(url, downloader: downloader)

expect(Upvs::ServiceWithForm.first).to have_attributes(
Expand All @@ -47,7 +49,8 @@
end

it 'downloads and imports ServicesWithForms list in V2 format' do
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v2.csv'))
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v2.zip'))
expect(downloader).to receive(:extract_csv).with(fixture_filepath('upvs/services-v2.zip')).and_return(fixture_filepath('upvs/services-v2.csv'))
subject.perform(url, downloader: downloader)

expect(Upvs::ServiceWithForm.first).to have_attributes(
Expand All @@ -68,7 +71,8 @@

context 'meta_is_code and info_url attributes not nil' do
it 'downloads and imports ServicesWithForms list in V1 format' do
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v1.csv'))
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v1.zip'))
expect(downloader).to receive(:extract_csv).with(fixture_filepath('upvs/services-v1.zip')).and_return(fixture_filepath('upvs/services-v1.csv'))
subject.perform(url, downloader: downloader)

expect(Upvs::ServiceWithForm.first).to have_attributes(
Expand All @@ -93,7 +97,8 @@
create_list(:upvs_service_with_form, 10)

expect(Upvs::ServiceWithForm.count).to eq(10)
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v2.csv'))
expect(downloader).to receive(:download_file).with(url).and_return(fixture_filepath('upvs/services-v2.zip'))
expect(downloader).to receive(:extract_csv).with(fixture_filepath('upvs/services-v2.zip')).and_return(fixture_filepath('upvs/services-v2.csv'))

subject.perform(url, downloader: downloader)

Expand Down

0 comments on commit 1fba5dd

Please sign in to comment.