From 77f2a6630883e9843f81b322bbc245e2428acab6 Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Mon, 27 Sep 2021 16:09:41 -0400 Subject: [PATCH 01/10] NOREF Improve proxy endpoint This extends some settings for the proxy endpoint, allowing it to accept `OPTIONS` requests and adding more CORS headers to ensure that proper requests can exist. --- CHANGELOG.md | 1 + api/blueprints/drbUtils.py | 7 ++++++- tests/unit/test_api_utils_blueprint.py | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8a8bc0b0a..30d4de03ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ - Improve clustering stability by improving individual error handling - Handle relative links from redirects in proxy endpoint - Add `embed` flag for HTML links +- Extended settings for `utils/proxy` epndoint to be more flexible ## 2021-09-09 -- v0.9.1 ### Fixed diff --git a/api/blueprints/drbUtils.py b/api/blueprints/drbUtils.py index 509a08c871..2d68a9e92f 100644 --- a/api/blueprints/drbUtils.py +++ b/api/blueprints/drbUtils.py @@ -47,7 +47,7 @@ def totalCounts(): return APIUtils.formatResponseObject(200, 'totalCounts', totalsSummary) -@utils.route('/proxy', methods=['GET', 'POST', 'PUT', 'HEAD']) +@utils.route('/proxy', methods=['GET', 'POST', 'PUT', 'HEAD', 'OPTIONS']) @cross_origin(origins=os.environ.get('API_PROXY_CORS_ALLOWED', '*')) def getProxyResponse(): proxyUrl = request.args.get('proxy_url') @@ -93,5 +93,10 @@ def getProxyResponse(): if k.lower() not in excludedHeaders ] + headers.append(( + 'Access-Control-Allow-Origin', + os.environ.get('API_PROXY_CORS_ALLOWED', '*') + )) + proxyResp = Response(resp.content, resp.status_code, headers) return proxyResp diff --git a/tests/unit/test_api_utils_blueprint.py b/tests/unit/test_api_utils_blueprint.py index e943267db5..9ec94e894c 100644 --- a/tests/unit/test_api_utils_blueprint.py +++ b/tests/unit/test_api_utils_blueprint.py @@ -91,6 +91,8 @@ def test_getProxyResponse_direct_success(self, 
testApp, mocker): assert testAPIResponse.status_code == 200 assert testAPIResponse.response == [b'Test Content'] assert testAPIResponse.headers['Media-Type'] == 'allow' + assert testAPIResponse.headers['Access-Control-Allow-Origin'] ==\ + '*' mockHead.assert_called_once_with( 'https://www.testURL.com', From 8777f6c61e4f07452153363811f79f1475526cf5 Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Mon, 27 Sep 2021 16:22:45 -0400 Subject: [PATCH 02/10] NOREF Update dev environment --- config/development.yaml | 61 +++++++++++++++++++++++++++++------------ task-definition.json | 4 +-- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/config/development.yaml b/config/development.yaml index b87c250d5e..baa32c0308 100644 --- a/config/development.yaml +++ b/config/development.yaml @@ -1,40 +1,51 @@ +# LOGGING +LOG_LEVEL: info + # POSTGRES CONNECTION DETAILS # POSTGRES_USER, POSTGRES_PSWD, POSTGRES_ADMIN_USER and POSTGRES_ADMIN_PSWD must be configured in secrets file -# POSTGRES_HOST: '' -# POSTGRES_NAME: '' -# POSTGRES_PORT: '' +POSTGRES_HOST: sfr-new-metadata-production-cluster.cluster-cvy7z512hcjg.us-east-1.rds.amazonaws.com +POSTGRES_NAME: dcdw_qa +POSTGRES_PORT: '5432' # REDIS CONFIGURATION -# REDIS_HOST: '' -# REDIS_PORT: '' +# REDIS_HOST configured as part of ECS deployment +REDIS_PORT: '6379' # ELASTICSEARCH CONFIGURATION # ELASTICSEARCH_INDEX, ELASTICSEARCH_HOST must be configured in secrets file -# ELASTICSEARCH_PORT: '' -# ELASTICSEARCH_TIMEOUT: '' +ELASTICSEARCH_PORT: '443' +ELASTICSEARCH_TIMEOUT: '5' # RABBITMQ CONFIGURATION -# RABBIT_HOST: '' -# RABBIT_PORT: '' -OCLC_QUEUE: oclc_catalog -EPUB_QUEUE: epub_files +# RABBIT_USER and RABBIT_PSWD must be configured in secrets file +RABBIT_HOST: qa.rmq.aws.nypl.org +RABBIT_PORT: '5672' +RABBIT_VIRTUAL_HOST: /sfr +RABBIT_EXCHANGE: sfrIngestExchange +OCLC_QUEUE: sfrOCLCCatalog +OCLC_ROUTING_KEY: sfrOCLCCatalog +FILE_QUEUE: sfrS3Files +FILE_ROUTING_KEY: sfrS3Files # HATHITRUST CONFIGURATION +# 
HATHI_API_KEY and HATHI_API_SECRET must be configured as secrets HATHI_DATAFILES: https://www.hathitrust.org/filebrowser/download/244651 +HATHI_API_ROOT: https://babel.hathitrust.org/cgi/htd # OCLC CONFIGURATION # OCLC_API_KEY must be configured in secrets file +OCLC_QUERY_LIMIT: '390000' # AWS CONFIGURATION # AWS_ACCESS and AWS_SECRET must be configured in secrets file -AWS_REGION: 'us-east-1' -EPUB_BUCKET: 'sfr_files' +AWS_REGION: us-east-1 +FILE_BUCKET: drb-files-qa # NYPL BIB REPLICA DB CONNECTION -# NYPL_BIB_USER, NYPL_BIB_PSWD must be configured in secrets file -# NYPL_BIB_HOST: '' -# NYPL_BIB_NAME: '' -# NYPL_BIB_PORT: '' +# NYPL_BIB_USER and NYPL_BIB_PSWD must be configured in secrets file +NYPL_BIB_HOST: bib-service-production-rep.cvy7z512hcjg.us-east-1.rds.amazonaws.com +NYPL_BIB_NAME: bib_service_production +NYPL_BIB_PORT: '5432' # NYPL Location Code Lookup NYPL_LOCATIONS_BY_CODE: https://nypl-core-objects-mapping-qa.s3.amazonaws.com/by_sierra_location.json @@ -52,9 +63,23 @@ BARDO_CCE_API: http://sfr-bardo-copyright-development.us-east-1.elasticbeanstalk # Project MUSE MARC endpoint MUSE_MARC_URL: https://about.muse.jhu.edu/lib/metadata?format=marc&content=book&include=oa&filename=open_access_books&no_auth=1 +MUSE_CSV_URL: https://about.muse.jhu.edu/static/org/local/holdings/muse_book_metadata.csv # DOAB OAI-PMH endpoint -DOAB_OAI_URL: http://www.doabooks.org/oai? +DOAB_OAI_URL: https://directory.doabooks.org/oai/request? 
+ +# Google Books API +# GOOGLE_BOOKS_KEY must be configured as a secret + +# ContentCafe2 API +# CONTENT_CAFE_USER and CONTENT_CAFE_PSWD must be configured as secrets + +# SmartSheet API +# SMARTSHEET_API_TOKEN must be configured as a secret +SMARTSHEET_SHEET_ID: '3683038090553220' + +# Default Cover Image for OPDS2 Feed +DEFAULT_COVER_URL: https://drb-files-qa.s3.amazonaws.com/covers/default/defaultCover.png # ePub-to-Webpub Conversion Service WEBPUB_CONVERSION_URL: https://epub-to-webpub.vercel.app diff --git a/task-definition.json b/task-definition.json index cea88e719c..fd4deb559b 100644 --- a/task-definition.json +++ b/task-definition.json @@ -17,12 +17,12 @@ "essential": true, "command": [ "--process", "APIProcess", - "--environment", "qa" + "--environment", "development" ], "environment": [ { "name": "ENVIRONMENT", - "value": "qa" + "value": "development" }, { "name": "ELASTICSEARCH_HOST", From 4a38b284d1cddf2c57b96d5aeeb0f91c0f055c7c Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Mon, 27 Sep 2021 17:13:32 -0400 Subject: [PATCH 03/10] NOREF Remove duplicate CORS header --- api/blueprints/drbUtils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/api/blueprints/drbUtils.py b/api/blueprints/drbUtils.py index 2d68a9e92f..1bb8c90dbd 100644 --- a/api/blueprints/drbUtils.py +++ b/api/blueprints/drbUtils.py @@ -93,10 +93,5 @@ def getProxyResponse(): if k.lower() not in excludedHeaders ] - headers.append(( - 'Access-Control-Allow-Origin', - os.environ.get('API_PROXY_CORS_ALLOWED', '*') - )) - proxyResp = Response(resp.content, resp.status_code, headers) return proxyResp From fddffea3da8880207b6fe99bb750325a7014e487 Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Tue, 28 Sep 2021 16:57:00 -0400 Subject: [PATCH 04/10] NOREF Fix format filtering bug An issue when filtering with formats was due to a conflict with how webpub manifests must be handled. This removes that format from basic filtering and adds it back to criteria when necessary. 
This simplifies the process and ensures that the filters work conceptually as they would be expected to. --- CHANGELOG.md | 1 + tests/unit/test_api_es.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30d4de03ba..4fb2cb3521 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Handle relative links from redirects in proxy endpoint - Add `embed` flag for HTML links - Extended settings for `utils/proxy` epndoint to be more flexible +- Resolve issue with display of links when filtering by format ## 2021-09-09 -- v0.9.1 ### Fixed diff --git a/tests/unit/test_api_es.py b/tests/unit/test_api_es.py index 173d3fb65f..fc3f9e4830 100644 --- a/tests/unit/test_api_es.py +++ b/tests/unit/test_api_es.py @@ -665,14 +665,14 @@ def test_createFilterClausesAndAggregations_w_format(self, testInstance, mocker) mocker.call('exists', field='editions.formats'), mocker.call( 'terms', - editions__formats=['application/pdf', 'application/webpub+json', 'application/html+edd', 'application/x.html+edd'] + editions__formats=['application/pdf', 'application/html+edd', 'application/x.html+edd'] ) ]) mockAgg.assert_has_calls([ mocker.call('filter', exists={'field': 'editions.formats'}), mocker.call( 'filter', - terms={'editions.formats': ['application/pdf', 'application/webpub+json', 'application/html+edd', 'application/x.html+edd']} + terms={'editions.formats': ['application/pdf', 'application/html+edd', 'application/x.html+edd']} ) ]) From c0e521e2185e347f7bcf5e55bbcf911fe7541b3c Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Tue, 28 Sep 2021 16:58:50 -0400 Subject: [PATCH 05/10] NOREF Add file missed from previous commit --- api/utils.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/api/utils.py b/api/utils.py index 907166a468..da127c0835 100644 --- a/api/utils.py +++ b/api/utils.py @@ -13,11 +13,11 @@ class APIUtils(): ] FORMAT_CROSSWALK = { - 'epub_zip': 
['application/epub+zip', 'application/epub+xml', 'application/webpub+json'], - 'epub_xml': ['application/epub+zip', 'application/epub+xml', 'application/webpub+json'], + 'epub_zip': ['application/epub+zip', 'application/epub+xml'], + 'epub_xml': ['application/epub+zip', 'application/epub+xml'], 'html': ['text/html'], 'html_edd': ['application/html+edd', 'application/x.html+edd'], - 'pdf': ['application/pdf', 'application/webpub+json'], + 'pdf': ['application/pdf'], 'webpub_json': ['application/webpub+json'] } @@ -199,18 +199,13 @@ def formatEdition( itemDict['links'] = [] - validLinks = list(filter(lambda x: x.media_type in formats, item.links))\ - if formats else item.links - - if ( - len(validLinks) < 1 - or ( - formats - and len(validLinks) == 1 - and validLinks[0].media_type == 'application/webpub+json' - ) - ): - continue + if formats: + formats.append('application/webpub+json') + validLinks = list(filter( + lambda x: x.media_type in formats, item.links + )) + else: + validLinks = item.links for link in validLinks: flags = link.flags From 6d2201abc97ce7ea554ad7041e34a0e092ecaf01 Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Wed, 29 Sep 2021 13:26:00 -0400 Subject: [PATCH 06/10] NOREF Sort links by media_type Sorting links in the API by `media_type` provides an extra layer of assurance that deployments while we are transitioning to a new version of the web reader will not cause errors. It ensures that the "safest" links are displayed first, while those that provide more advanced features are displayed last. 
This can be reversed to improve performance after web reader deployment --- api/utils.py | 14 ++++++++++++++ tests/unit/test_api_utils.py | 19 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/api/utils.py b/api/utils.py index da127c0835..8b4220a4eb 100644 --- a/api/utils.py +++ b/api/utils.py @@ -230,6 +230,8 @@ def formatEdition( 'flags': flags }) + itemDict['links'].sort(key=cls.sortByMediaType) + itemDict['rights'] = [ { 'source': rights.source, @@ -260,6 +262,18 @@ def formatEdition( return editionDict + @staticmethod + def sortByMediaType(link): + scores = { + 'application/epub+xml': 1, 'application/epub+zip': 1, + 'text/html': 2, + 'application/pdf': 3, + 'application/html+edd': 4, + 'application/webpub+json': 5 + } + + return scores[link['mediaType']] + @classmethod def formatRecord(cls, record, itemsByLink): outRecord = { diff --git a/tests/unit/test_api_utils.py b/tests/unit/test_api_utils.py index 32f17ed315..66cf751daf 100644 --- a/tests/unit/test_api_utils.py +++ b/tests/unit/test_api_utils.py @@ -1,5 +1,6 @@ from hashlib import scrypt import pytest +from random import shuffle from api.utils import APIUtils @@ -547,3 +548,21 @@ def test_validatePassword_error(self): assert APIUtils.validatePassword('testError', testHash, b'testSalt')\ is False + + def test_sortByMediaType(self): + testList = [ + {'id': 2, 'mediaType': 'text/html'}, + {'id': 1, 'mediaType': 'application/epub+xml'}, + {'id': 4, 'mediaType': 'application/html+edd'}, + {'id': 1, 'mediaType': 'application/epub+zip'}, + {'id': 5, 'mediaType': 'application/webpub+json'}, + {'id': 3, 'mediaType': 'application/pdf'} + ] + + shuffle(testList) + testList.sort(key=APIUtils.sortByMediaType) + assert [i['id'] for i in testList] == [1, 1, 2, 3, 4, 5] + + shuffle(testList) + testList.sort(key=APIUtils.sortByMediaType) + assert [i['id'] for i in testList] == [1, 1, 2, 3, 4, 5] From 83095344b75dbb6d686d778b3a60b46c44b31366 Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Wed, 29 
Sep 2021 13:29:54 -0400 Subject: [PATCH 07/10] NOREF Update MDPI flags --- managers/parsers/mdpiParser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/managers/parsers/mdpiParser.py b/managers/parsers/mdpiParser.py index 0b287129ae..ed69ddff44 100644 --- a/managers/parsers/mdpiParser.py +++ b/managers/parsers/mdpiParser.py @@ -28,7 +28,7 @@ def generatePDFLinks(self, s3Root): return [ (manifestURI, {'reader': True}, 'application/webpub+json', (manifestPath, manifestJSON), None), - (pdfSourceURI, {'download': True}, self.mediaType, None, None) + (pdfSourceURI, {'download': True}, 'application/pdf', None, None) ] def generateManifest(self, sourceURI, manifestURI): From 4f69d7d57a97126289cb3a56bf870355313e8ca0 Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Wed, 29 Sep 2021 13:30:16 -0400 Subject: [PATCH 08/10] NOREF Deploy production from unique tag To eliminate the possibility that production will mistakenly grab a non-released version of the container, this deploys to a `production` tag that is only built during the production release process. --- .github/workflows/build-production.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-production.yaml b/.github/workflows/build-production.yaml index 59b558ff39..98b535d08b 100644 --- a/.github/workflows/build-production.yaml +++ b/.github/workflows/build-production.yaml @@ -32,8 +32,8 @@ jobs: run: | docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG . 
docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG - docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:latest - docker push $ECR_REGISTRY/$ECR_REPOSITORY:latest + docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:production + docker push $ECR_REGISTRY/$ECR_REPOSITORY:production - name: Force ECS Update run: | From 7659ed4d9d3aa30c0d93f9c86abdca3f507430fa Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Wed, 29 Sep 2021 13:40:47 -0400 Subject: [PATCH 09/10] NOREF Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fb2cb3521..2380e94ffc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Add `embed` flag for HTML links - Extended settings for `utils/proxy` epndoint to be more flexible - Resolve issue with display of links when filtering by format +- Release stability via distinct production tag ## 2021-09-09 -- v0.9.1 ### Fixed From d305cb2bd4e2a0883aa25e1601f99cb96faa9a50 Mon Sep 17 00:00:00 2001 From: Michael Benowitz Date: Mon, 4 Oct 2021 12:15:35 -0400 Subject: [PATCH 10/10] NOREF Bump v0.9.2 --- CHANGELOG.md | 2 +- swagger.v4.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2380e94ffc..de9c30172e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # CHANGELOG -## unreleased -- v0.9.2 +## 2021-10-04 -- v0.9.2 ### Added - Detect file types in s3 process and specify during storage process - `readerVersion` parameter for `/search`, `/work` and `/edition` endpoints to control media types returned diff --git a/swagger.v4.json b/swagger.v4.json index 92628aca21..166dcebe0f 100644 --- a/swagger.v4.json +++ b/swagger.v4.json @@ -1,7 +1,7 @@ { "swagger": "2.0", "info": { - "version": "v0.9.0", + "version": "v0.9.2", "title": "Digital Research Books Search API", "description": "RESTful API for the Digital Research Books Project" },