From 178c1ad651657899d1770a72b5aaa3feb4ecb9e2 Mon Sep 17 00:00:00 2001
From: Michael Benowitz
Date: Fri, 24 Sep 2021 17:17:12 -0400
Subject: [PATCH 1/3] SFR-1304 Add highlighting to search responses

This adds the highlighted fields from ElasticSearch to the DRB API response
for both the default DRB API format and the OPDS2 feed. This provides
information to consuming applications about which strings, in which fields,
were matched against. This is returned in a new `_meta` object in both APIs.
This object is designed to be extensible so that it can carry additional
metadata about DRB records in the future (prior to this, all metadata
pertained to the works being described by the DRB records).
---
 api/blueprints/drbOPDS2.py  | 118 +++++++++++++++++++++++++++---------
 api/blueprints/drbSearch.py |  48 +++++++++++----
 api/utils.py                |  26 +++++---
 3 files changed, 142 insertions(+), 50 deletions(-)

diff --git a/api/blueprints/drbOPDS2.py b/api/blueprints/drbOPDS2.py
index 1e0f6af757..14e10c6236 100644
--- a/api/blueprints/drbOPDS2.py
+++ b/api/blueprints/drbOPDS2.py
@@ -33,7 +33,11 @@ def newPublications():
     dbClient = DBClient(current_app.config['DB_CLIENT'])
     dbClient.createSession()

-    baseFeed = constructBaseFeed(request.full_path, 'New Publications: Digital Research Books', grouped=True)
+    baseFeed = constructBaseFeed(
+        request.full_path,
+        'New Publications: Digital Research Books',
+        grouped=True
+    )

     pubCount, newPubs = dbClient.fetchNewWorks(page=page, size=pageSize)

@@ -59,7 +63,9 @@ def opdsSearch():
     searchTerms = {'query': [], 'filter': [], 'sort': []}

     for queryField in ['keyword', 'title', 'author', 'subject']:
-        searchTerms['query'].extend([(queryField, term) for term in params.get(queryField, [])])
+        searchTerms['query'].extend([
+            (queryField, term) for term in params.get(queryField, [])
+        ])

     searchTerms['filter'] = APIUtils.extractParamPairs('filter', params)
     if params.get('showAll', None):
@@ -71,25 +77,40 @@ def opdsSearch():

     logger.info('Executing ES Query {}'.format(searchTerms))

-    searchResult = esClient.searchQuery(searchTerms, page=page, perPage=pageSize)
+    searchResult = esClient.searchQuery(
+        searchTerms, page=page, perPage=pageSize
+    )

-    resultIds = [
-        (r.uuid, [e.edition_id for e in r.meta.inner_hits.editions.hits])
-        for r in searchResult.hits
-    ]
+    results = []
+    highlights = {}
+    for res in searchResult:
+        editionIds = [e.edition_id for e in res.meta.inner_hits.editions.hits]

-    works = dbClient.fetchSearchedWorks(resultIds)
+        if res.meta.highlight:
+            highlights[res.uuid] = {
+                key: list(set(res.meta.highlight[key]))
+                for key in res.meta.highlight
+            }
+
+        results.append((res.uuid, editionIds))

-    searchFeed = constructBaseFeed(request.full_path, 'Search Results', grouped=True)
+    works = dbClient.fetchSearchedWorks(results)
+
+    searchFeed = constructBaseFeed(
+        request.full_path, 'Search Results', grouped=True
+    )

     OPDSUtils.addPagingOptions(
         searchFeed, request.full_path, searchResult.hits.total,
         page=page+1, perPage=pageSize
     )

-    addFacets(searchFeed, request.full_path, searchResult.aggregations.to_dict())
+    addFacets(
+        searchFeed, request.full_path, searchResult.aggregations.to_dict()
+    )

-    addPublications(searchFeed, works, grouped=True)
+    print(highlights)
+    addPublications(searchFeed, works, grouped=True, highlights=highlights)

     dbClient.closeSession()

@@ -107,12 +128,19 @@ def fetchPublication(uuid):

     if workRecord is None:
         return APIUtils.formatResponseObject(
-            404, 'opdsPublication', {'message': 'Unable to find work for uuid {}'.format(uuid)}
+            404,
+            'opdsPublication',
+            {'message': 'Unable to find work for uuid {}'.format(uuid)}
         )

     publication = createPublicationObject(workRecord, searchResult=False)

-    publication.addLink({'rel': 'search', 'href': '/opds/search{?query,title,subject,author}', 'type': 'application/opds+json', 'templated': True})
+    publication.addLink({
+        'rel': 'search',
+        'href': '/opds/search{?query,title,subject,author}',
+        'type': 'application/opds+json',
+        'templated': True
+    })

     dbClient.closeSession()

@@ -126,21 +154,41 @@ def constructBaseFeed(path, title, grouped=False):
     feed.addMetadata(feedMetadata)

     selfLink = Link(rel='self', href=path, type='application/opds+json')
-    searchLink = Link(rel='search', href='/opds/search{?query,title,subject,author}', type='application/opds+json', templated=True)
+    searchLink = Link(
+        rel='search',
+        href='/opds/search{?query,title,subject,author}',
+        type='application/opds+json',
+        templated=True
+    )
     altLink = Link(rel='alternative', href='/', type='text/html')

     feed.addLinks([selfLink, searchLink, altLink])

-    currentNavigation = Navigation(href=path, title=title, type='application/opds+json', rel='current')
+    currentNavigation = Navigation(
+        href=path,
+        title=title,
+        type='application/opds+json',
+        rel='current'
+    )

     navOptions = [currentNavigation]

     if path != '/opds/':
-        baseNavigation = Navigation(href='/opds', title='Home', type='application/opds+json', rel='home')
+        baseNavigation = Navigation(
+            href='/opds',
+            title='Home',
+            type='application/opds+json',
+            rel='home'
+        )
         navOptions.append(baseNavigation)

     if path != '/opds/new/':
-        newNavigation = Navigation(href='/opds/new', title='New Works', type='application/opds+json', rel='http://opds-spec.org/sort/new')
+        newNavigation = Navigation(
+            href='/opds/new',
+            title='New Works',
+            type='application/opds+json',
+            rel='http://opds-spec.org/sort/new'
+        )
         navOptions.append(newNavigation)

     if grouped is True:
@@ -153,8 +201,13 @@
     return feed


-def addPublications(feed, publications, grouped=False):
-    opdsPubs = [createPublicationObject(pub) for pub in publications]
+def addPublications(feed, publications, grouped=False, highlights={}):
+    opdsPubs = [
+        createPublicationObject(
+            pub, _meta={'highlights': highlights.get(str(pub.uuid), {})}
+        )
+        for pub in publications
+    ]

     if grouped is True:
         pubGroup = Group(metadata={'title': 'Publications'})
@@ -164,11 +217,11 @@
     feed.addPublications(opdsPubs)


-def createPublicationObject(publication, searchResult=True):
-    newPub = Publication()
-    newPub.parseWorkToPublication(publication, searchResult=searchResult)
+def createPublicationObject(publication, searchResult=True, _meta={}):
+    newPub = Publication(metadata={'_meta': _meta})
+    newPub.parseWorkToPublication(publication, searchResult=searchResult)

-    return newPub
+    return newPub


 def addFacets(feed, path, facets):
@@ -178,9 +231,12 @@
     for facet, options in reducedFacets.items():
        newFacet = Facet(metadata={'title': facet})
+
        facetOptions = [
            {
-                'href': '{}&filter={}:{}'.format(path, facet[:-1], option['value']),
+                'href': '{}&filter={}:{}'.format(
+                    path, facet[:-1], option['value']
+                ),
                'type': 'application/opds+json',
                'title': option['value'],
                'properties': {'numberOfItems': option['count']}
@@ -195,11 +251,17 @@
     opdsFacets.append(Facet(
        metadata={'title': 'Show All Editions'},
        links=[
-                {'href': '{}&showAll=true'.format(path), 'type': 'application/opds+json', 'title': 'True'},
-                {'href': '{}&showAll=false'.format(path), 'type': 'application/opds+json', 'title': 'False'}
+                {
+                    'href': '{}&showAll=true'.format(path),
+                    'type': 'application/opds+json',
+                    'title': 'True'
+                },
+                {
+                    'href': '{}&showAll=false'.format(path),
+                    'type': 'application/opds+json',
+                    'title': 'False'
+                }
            ]
     ))

     feed.addFacets(opdsFacets)
-
-
diff --git a/api/blueprints/drbSearch.py b/api/blueprints/drbSearch.py
index cab0b28d7c..cc332d4e53 100644
--- a/api/blueprints/drbSearch.py
+++ b/api/blueprints/drbSearch.py
@@ -8,6 +8,7 @@

 search = Blueprint('search', __name__, url_prefix='/search')

+
 @search.route('/', methods=['GET'])
 def standardQuery():
     esClient = ElasticClient(current_app.config['REDIS_CLIENT'])
@@ -30,37 +31,58 @@ def standardQuery():
     readerVersion = searchParams.get('readerVersion', [None])[0]\
         or current_app.config['READER_VERSION']

-    logger.info('Executing ES Query {} with filters {}'.format(searchParams, terms['filter']))
+    logger.info('Executing ES Query {} with filters {}'.format(
+        searchParams, terms['filter'])
+    )

     try:
-        searchResult = esClient.searchQuery(terms, page=searchPage, perPage=searchSize)
+        searchResult = esClient.searchQuery(
+            terms, page=searchPage, perPage=searchSize
+        )
     except ElasticClientError as e:
         return APIUtils.formatResponseObject(
             400, 'searchResponse', {'message': str(e)}
         )

-    resultIds = [
-        (r.uuid, [e.edition_id for e in r.meta.inner_hits.editions.hits])
-        for r in searchResult.hits
-    ]
+    results = []
+    for res in searchResult:
+        editionIds = [e.edition_id for e in res.meta.inner_hits.editions.hits]
+
+        try:
+            highlights = {
+                key: list(set(res.meta.highlight[key]))
+                for key in res.meta.highlight
+            }
+        except AttributeError:
+            highlights = {}
+
+        results.append((res.uuid, editionIds, highlights))

     if esClient.sortReversed is True:
-        resultIds = [r for r in reversed(resultIds)]
+        results = [r for r in reversed(results)]

     filteredFormats = [
-        mediaType for f in list(filter(lambda x: x[0] == 'format', terms['filter']))
+        mediaType for f in list(filter(
+            lambda x: x[0] == 'format', terms['filter']
+        ))
         for mediaType in APIUtils.FORMAT_CROSSWALK[f[1]]
     ]

-    logger.info('Executing DB Query for {} editions'.format(len(resultIds)))
+    logger.info('Executing DB Query for {} editions'.format(len(results)))

-    works = dbClient.fetchSearchedWorks(resultIds)
-    facets = APIUtils.formatAggregationResult(searchResult.aggregations.to_dict())
-    paging = APIUtils.formatPagingOptions(searchPage + 1, searchSize, searchResult.hits.total)
+    works = dbClient.fetchSearchedWorks(results)
+    facets = APIUtils.formatAggregationResult(
+        searchResult.aggregations.to_dict()
+    )
+    paging = APIUtils.formatPagingOptions(
+        searchPage + 1, searchSize, searchResult.hits.total
+    )

     dataBlock = {
         'totalWorks': searchResult.hits.total,
-        'works': APIUtils.formatWorkOutput(works, resultIds, formats=filteredFormats, reader=readerVersion),
+        'works': APIUtils.formatWorkOutput(
+            works, results, formats=filteredFormats, reader=readerVersion
+        ),
         'paging': paging,
         'facets': facets
     }
diff --git a/api/utils.py b/api/utils.py
index 907166a468..f87227a7e7 100644
--- a/api/utils.py
+++ b/api/utils.py
@@ -110,22 +110,24 @@ def formatWorkOutput(
         outWorks = []
         workDict = {str(work.uuid): work for work in works}

-        for workUUID, editionIds in identifiers:
+        for workUUID, editionIds, highlights in identifiers:
             work = workDict.get(workUUID, None)

             if work is None: continue

-            outWorks.append(
-                cls.formatWork(
-                    work,
-                    editionIds,
-                    showAll,
-                    formats=formats,
-                    reader=reader
-                )
+            outWork = cls.formatWork(
+                work,
+                editionIds,
+                showAll,
+                formats=formats,
+                reader=reader
             )
+            cls.addWorkMeta(outWork, highlights=highlights)
+
+            outWorks.append(outWork)
+
         return outWorks
     else:
         formattedWork = cls.formatWork(
@@ -167,6 +169,12 @@

         return workDict

+    @classmethod
+    def addWorkMeta(cls, work, **kwargs):
+        work['_meta'] = {
+            metaField: metaValue for metaField, metaValue in kwargs.items()
+        }
+
     @classmethod
     def formatEditionOutput(
         cls, edition, records=None, showAll=False, reader=None

From 771e0a4c7e6c34cc66bd966a994708b701c771e3 Mon Sep 17 00:00:00 2001
From: Michael Benowitz
Date: Fri, 24 Sep 2021 18:37:28 -0400
Subject: [PATCH 2/3] SFR-1304 Add tests and improvements

This adds unit tests for the search highlighting responses in the API. It
also makes several minor tweaks to the highlighting code to resolve
potential issues revealed by testing.
---
 api/blueprints/drbOPDS2.py              |  4 +--
 api/blueprints/drbSearch.py             |  2 +-
 tests/unit/test_api_opds_blueprint.py   | 45 ++++++++++++++++++++-----
 tests/unit/test_api_search_blueprint.py | 11 +++---
 tests/unit/test_api_utils.py            | 22 +++++++++++-
 5 files changed, 68 insertions(+), 16 deletions(-)

diff --git a/api/blueprints/drbOPDS2.py b/api/blueprints/drbOPDS2.py
index 14e10c6236..b7be600a67 100644
--- a/api/blueprints/drbOPDS2.py
+++ b/api/blueprints/drbOPDS2.py
@@ -83,7 +83,7 @@ def opdsSearch():

     results = []
     highlights = {}
-    for res in searchResult:
+    for res in searchResult.hits:
         editionIds = [e.edition_id for e in res.meta.inner_hits.editions.hits]

         if res.meta.highlight:
@@ -109,7 +109,6 @@ def opdsSearch():
         searchFeed, request.full_path, searchResult.aggregations.to_dict()
     )

-    print(highlights)
     addPublications(searchFeed, works, grouped=True, highlights=highlights)

     dbClient.closeSession()
@@ -202,6 +201,7 @@ def constructBaseFeed(path, title, grouped=False):


 def addPublications(feed, publications, grouped=False, highlights={}):
+    print(publications)
     opdsPubs = [
         createPublicationObject(
             pub, _meta={'highlights': highlights.get(str(pub.uuid), {})}
diff --git a/api/blueprints/drbSearch.py b/api/blueprints/drbSearch.py
index cc332d4e53..0868652f60 100644
--- a/api/blueprints/drbSearch.py
+++ b/api/blueprints/drbSearch.py
@@ -45,7 +45,7 @@ def standardQuery():
     )

     results = []
-    for res in searchResult:
+    for res in searchResult.hits:
         editionIds = [e.edition_id for e in res.meta.inner_hits.editions.hits]

         try:
diff --git a/tests/unit/test_api_opds_blueprint.py b/tests/unit/test_api_opds_blueprint.py
index 6ee064ae6b..6477b1610d 100644
--- a/tests/unit/test_api_opds_blueprint.py
+++ b/tests/unit/test_api_opds_blueprint.py
@@ -45,6 +45,7 @@ def __init__(self, uuid, edition_ids):
                     editions.append(ed)
                 mockMeta = mocker.MagicMock()
                 mockMeta.inner_hits.editions.hits = editions
+                mockMeta.highlight = {'field': ['highlight_{}'.format(uuid)]}
                 self.meta = mockMeta

         return FakeHit
@@ -55,9 +56,11 @@ class FakeHits:
             def __init__(self):
                 self.total = 5
                 self.hits = [
-                    FakeHit('uuid1', ['ed1', 'ed2']), FakeHit('uuid2', ['ed3']),
+                    FakeHit('uuid1', ['ed1', 'ed2']),
+                    FakeHit('uuid2', ['ed3']),
                     FakeHit('uuid3', ['ed4', 'ed5', 'ed6']),
-                    FakeHit('uuid4', ['ed7']), FakeHit('uuid5', ['ed8'])
+                    FakeHit('uuid4', ['ed7']),
+                    FakeHit('uuid5', ['ed8'])
                 ]

             def __iter__(self):
@@ -110,7 +113,9 @@ def test_newPublicationsEndpoint(self, mockUtils, opdsMocks, mocker):
             'testBaseFeed', '/new?', 3, page=1, perPage=25
         )
         opdsMocks['addPublications'].assert_called_once_with(
-            'testBaseFeed', ['pub1', 'pub2', 'pub3'], grouped=True
+            'testBaseFeed',
+            ['pub1', 'pub2', 'pub3'],
+            grouped=True
         )
         mockUtils['formatOPDS2Object'].assert_called_once_with(200, 'testBaseFeed')
@@ -172,7 +177,16 @@ def test_opdsSearch(self, mockUtils, opdsMocks, mockHits, mocker):
             'testBaseFeed', '/search?', {'aggs': []}
         )
         opdsMocks['addPublications'].assert_called_once_with(
-            'testBaseFeed', ['work1', 'work2', 'work3', 'work4', 'work5'], grouped=True
+            'testBaseFeed',
+            ['work1', 'work2', 'work3', 'work4', 'work5'],
+            grouped=True,
+            highlights={
+                'uuid1': {'field': ['highlight_uuid1']},
+                'uuid2': {'field': ['highlight_uuid2']},
+                'uuid3': {'field': ['highlight_uuid3']},
+                'uuid4': {'field': ['highlight_uuid4']},
+                'uuid5': {'field': ['highlight_uuid5']}
+            }
         )
         mockUtils['formatOPDS2Object'].assert_called_once_with(200, 'testBaseFeed')
@@ -297,10 +311,18 @@ def test_addPublications_grouped(self, opdsMocks, mocker):
         mockGroupCon = mocker.patch('api.blueprints.drbOPDS2.Group')
         mockGroupCon.return_value = mockGroup

-        addPublications(mockFeed, [1, 2, 3], grouped=True)
+        mockPubs = [mocker.MagicMock(uuid=1), mocker.MagicMock(uuid=2), mocker.MagicMock(uuid=3)]
+
+        addPublications(
+            mockFeed,
+            mockPubs,
+            grouped=True,
+            highlights={'1': 'highlights1', '2': 'highlights2', '3': 'highlights3'}
+        )

         opdsMocks['createPublicationObject'].assert_has_calls([
-            mocker.call(i) for i in range(1, 4)
+            mocker.call(mockPubs[i], _meta={'highlights': 'highlights{}'.format(i+1)})
+            for i in range(3)
         ])
         mockGroupCon.assert_called_once_with(metadata={'title': 'Publications'})
         mockGroup.addPublications.assert_called_once_with(['pub1', 'pub2', 'pub3'])
@@ -310,10 +332,17 @@ def test_addPublications_not_grouped(self, opdsMocks, mocker):
         opdsMocks['createPublicationObject'].side_effect = ['pub1', 'pub2', 'pub3']
         mockFeed = mocker.MagicMock()

-        addPublications(mockFeed, [1, 2, 3])
+        mockPubs = [mocker.MagicMock(uuid=1), mocker.MagicMock(uuid=2), mocker.MagicMock(uuid=3)]
+
+        addPublications(
+            mockFeed,
+            mockPubs,
+            highlights={'1': 'highlights1', '2': 'highlights2', '3': 'highlights3'}
+        )

         opdsMocks['createPublicationObject'].assert_has_calls([
-            mocker.call(i) for i in range(1, 4)
+            mocker.call(mockPubs[i], _meta={'highlights': 'highlights{}'.format(i+1)})
+            for i in range(3)
         ])
         mockFeed.addPublications.assert_called_once_with(['pub1', 'pub2', 'pub3'])
diff --git a/tests/unit/test_api_search_blueprint.py b/tests/unit/test_api_search_blueprint.py
index 5c525b000a..a964ba1f8b 100644
--- a/tests/unit/test_api_search_blueprint.py
+++ b/tests/unit/test_api_search_blueprint.py
@@ -31,6 +31,8 @@ def __init__(self, uuid, edition_ids):
                     editions.append(ed)
                 mockMeta = mocker.MagicMock()
                 mockMeta.inner_hits.editions.hits = editions
+                if int(uuid[-1]) % 2 != 0:
+                    mockMeta.highlight = {'field': ['highlight_{}'.format(uuid)]}
                 self.meta = mockMeta

         return FakeHit
@@ -121,10 +123,11 @@ def test_standardQuery(self, mockUtils, mockHits, testApp, mocker):
             page=0, perPage=5
         )
         testResultIds = [
-            ('uuid1', ['ed1', 'ed2']),
-            ('uuid2', ['ed3']),
-            ('uuid3', ['ed4', 'ed5', 'ed6']),
-            ('uuid4', ['ed7']), ('uuid5', ['ed8'])
+            ('uuid1', ['ed1', 'ed2'], {'field': ['highlight_uuid1']}),
+            ('uuid2', ['ed3'], {}),
+            ('uuid3', ['ed4', 'ed5', 'ed6'], {'field': ['highlight_uuid3']}),
+            ('uuid4', ['ed7'], {}),
+            ('uuid5', ['ed8'], {'field': ['highlight_uuid5']})
         ]
         mockDB.fetchSearchedWorks.assert_called_once_with(testResultIds)
         mockUtils['formatAggregationResult'].assert_called_once_with(
diff --git a/tests/unit/test_api_utils.py b/tests/unit/test_api_utils.py
index 32f17ed315..1587c64b72 100644
--- a/tests/unit/test_api_utils.py
+++ b/tests/unit/test_api_utils.py
@@ -255,12 +255,19 @@ def test_formatWorkOutput_multiple_works(self, mocker):
         mockFormat = mocker.patch.object(APIUtils, 'formatWork')
         mockFormat.side_effect = ['formattedWork1', 'formattedWork2']

+        mockAddMeta = mocker.patch.object(APIUtils, 'addWorkMeta')
+
         testWorks = [
             mocker.MagicMock(uuid='uuid1'), mocker.MagicMock(uuid='uuid2')
         ]

         outWorks = APIUtils.formatWorkOutput(
-            testWorks, [('uuid1', 1), ('uuid2', 2), ('uuid3', 3)]
+            testWorks,
+            [
+                ('uuid1', 1, 'highlight1'),
+                ('uuid2', 2, 'highlight2'),
+                ('uuid3', 3, 'highlight3')
+            ]
         )

         assert outWorks == ['formattedWork1', 'formattedWork2']
@@ -269,6 +276,11 @@ def test_formatWorkOutput_multiple_works(self, mocker):
             mocker.call(testWorks[1], 2, True, formats=None, reader=None)
         ])

+        mockAddMeta.assert_has_calls([
+            mocker.call('formattedWork1', highlights='highlight1'),
+            mocker.call('formattedWork2', highlights='highlight2')
+        ])
+
     def test_formatWork_showAll(self, testWork, mocker):
         mockFormatEdition = mocker.patch.object(APIUtils, 'formatEdition')
         mockFormatEdition.return_value = {
@@ -547,3 +559,11 @@ def test_validatePassword_error(self):

         assert APIUtils.validatePassword('testError', testHash, b'testSalt')\
             is False
+
+    def test_addWorkMeta(self):
+        testWork = {}
+
+        APIUtils.addWorkMeta(testWork, field1='value1', field2=['value2'])
+
+        assert testWork['_meta']['field1'] == 'value1'
+        assert testWork['_meta']['field2'] == ['value2']
\ No newline at end of file

From d3a92f55de390bdfed120b24a7c1dab110b94b90 Mon Sep 17 00:00:00 2001
From: Michael Benowitz
Date: Mon, 27 Sep 2021 15:08:42 -0400
Subject: [PATCH 3/3] SFR-1304 Update CHANGELOG

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a8a8bc0b0a..e047147cfc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,7 @@
 ### Added
 - Detect file types in s3 process and specify during storage process
 - `readerVersion` parameter for `/search`, `/work` and `/edition` endpoints to control media types returned
-- ElasticSearch query highlighting
+- ElasticSearch query highlighting in search response `_meta` block
 ### Fixed
 - Improve clustering stability by improving individual error handling
 - Handle relative links from redirects in proxy endpoint