Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/NYPL/drb-etl-pipeline into main
Browse files Browse the repository at this point in the history
  • Loading branch information
mwbenowitz committed Oct 4, 2021
2 parents 9509a17 + 2d65475 commit b5099a1
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 63 deletions.
118 changes: 90 additions & 28 deletions api/blueprints/drbOPDS2.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@ def newPublications():
dbClient = DBClient(current_app.config['DB_CLIENT'])
dbClient.createSession()

baseFeed = constructBaseFeed(request.full_path, 'New Publications: Digital Research Books', grouped=True)
baseFeed = constructBaseFeed(
request.full_path,
'New Publications: Digital Research Books',
grouped=True
)

pubCount, newPubs = dbClient.fetchNewWorks(page=page, size=pageSize)

Expand All @@ -59,7 +63,9 @@ def opdsSearch():

searchTerms = {'query': [], 'filter': [], 'sort': []}
for queryField in ['keyword', 'title', 'author', 'subject']:
searchTerms['query'].extend([(queryField, term) for term in params.get(queryField, [])])
searchTerms['query'].extend([
(queryField, term) for term in params.get(queryField, [])
])

searchTerms['filter'] = APIUtils.extractParamPairs('filter', params)
if params.get('showAll', None):
Expand All @@ -71,25 +77,39 @@ def opdsSearch():

logger.info('Executing ES Query {}'.format(searchTerms))

searchResult = esClient.searchQuery(searchTerms, page=page, perPage=pageSize)
searchResult = esClient.searchQuery(
searchTerms, page=page, perPage=pageSize
)

resultIds = [
(r.uuid, [e.edition_id for e in r.meta.inner_hits.editions.hits])
for r in searchResult.hits
]
results = []
highlights = {}
for res in searchResult.hits:
editionIds = [e.edition_id for e in res.meta.inner_hits.editions.hits]

works = dbClient.fetchSearchedWorks(resultIds)
if res.meta.highlight:
highlights[res.uuid] = {
key: list(set(res.meta.highlight[key]))
for key in res.meta.highlight
}

results.append((res.uuid, editionIds))

searchFeed = constructBaseFeed(request.full_path, 'Search Results', grouped=True)
works = dbClient.fetchSearchedWorks(results)

searchFeed = constructBaseFeed(
request.full_path, 'Search Results', grouped=True
)

OPDSUtils.addPagingOptions(
searchFeed, request.full_path, searchResult.hits.total,
page=page+1, perPage=pageSize
)

addFacets(searchFeed, request.full_path, searchResult.aggregations.to_dict())
addFacets(
searchFeed, request.full_path, searchResult.aggregations.to_dict()
)

addPublications(searchFeed, works, grouped=True)
addPublications(searchFeed, works, grouped=True, highlights=highlights)

dbClient.closeSession()

Expand All @@ -107,12 +127,19 @@ def fetchPublication(uuid):

if workRecord is None:
return APIUtils.formatResponseObject(
404, 'opdsPublication', {'message': 'Unable to find work for uuid {}'.format(uuid)}
404,
'opdsPublication',
{'message': 'Unable to find work for uuid {}'.format(uuid)}
)

publication = createPublicationObject(workRecord, searchResult=False)

publication.addLink({'rel': 'search', 'href': '/opds/search{?query,title,subject,author}', 'type': 'application/opds+json', 'templated': True})
publication.addLink({
'rel': 'search',
'href': '/opds/search{?query,title,subject,author}',
'type': 'application/opds+json',
'templated': True
})

dbClient.closeSession()

Expand All @@ -126,21 +153,41 @@ def constructBaseFeed(path, title, grouped=False):
feed.addMetadata(feedMetadata)

selfLink = Link(rel='self', href=path, type='application/opds+json')
searchLink = Link(rel='search', href='/opds/search{?query,title,subject,author}', type='application/opds+json', templated=True)
searchLink = Link(
rel='search',
href='/opds/search{?query,title,subject,author}',
type='application/opds+json',
templated=True
)
altLink = Link(rel='alternative', href='/', type='text/html')

feed.addLinks([selfLink, searchLink, altLink])

currentNavigation = Navigation(href=path, title=title, type='application/opds+json', rel='current')
currentNavigation = Navigation(
href=path,
title=title,
type='application/opds+json',
rel='current'
)

navOptions = [currentNavigation]

if path != '/opds/':
baseNavigation = Navigation(href='/opds', title='Home', type='application/opds+json', rel='home')
baseNavigation = Navigation(
href='/opds',
title='Home',
type='application/opds+json',
rel='home'
)
navOptions.append(baseNavigation)

if path != '/opds/new/':
newNavigation = Navigation(href='/opds/new', title='New Works', type='application/opds+json', rel='http://opds-spec.org/sort/new')
newNavigation = Navigation(
href='/opds/new',
title='New Works',
type='application/opds+json',
rel='http://opds-spec.org/sort/new'
)
navOptions.append(newNavigation)

if grouped is True:
Expand All @@ -153,8 +200,14 @@ def constructBaseFeed(path, title, grouped=False):
return feed


def addPublications(feed, publications, grouped=False):
opdsPubs = [createPublicationObject(pub) for pub in publications]
def addPublications(feed, publications, grouped=False, highlights={}):
print(publications)
opdsPubs = [
createPublicationObject(
pub, _meta={'highlights': highlights.get(str(pub.uuid), {})}
)
for pub in publications
]

if grouped is True:
pubGroup = Group(metadata={'title': 'Publications'})
Expand All @@ -164,11 +217,11 @@ def addPublications(feed, publications, grouped=False):
feed.addPublications(opdsPubs)


def createPublicationObject(publication, searchResult=True, _meta=None):
    """Build a Publication from a work record.

    Args:
        publication: The work record handed to
            ``Publication.parseWorkToPublication``.
        searchResult: Forwarded unchanged to ``parseWorkToPublication``.
        _meta: Optional dict stored under the ``'_meta'`` key of the
            publication's metadata. Defaults to a new empty dict.

    Returns:
        The populated ``Publication`` instance.
    """
    # Use a None sentinel rather than a `{}` default: a literal dict default
    # is created once and would be shared by every Publication built without
    # an explicit `_meta`.
    if _meta is None:
        _meta = {}

    newPub = Publication(metadata={'_meta': _meta})
    newPub.parseWorkToPublication(publication, searchResult=searchResult)

    return newPub


def addFacets(feed, path, facets):
Expand All @@ -178,9 +231,12 @@ def addFacets(feed, path, facets):

for facet, options in reducedFacets.items():
newFacet = Facet(metadata={'title': facet})

facetOptions = [
{
'href': '{}&filter={}:{}'.format(path, facet[:-1], option['value']),
'href': '{}&filter={}:{}'.format(
path, facet[:-1], option['value']
),
'type': 'application/opds+json',
'title': option['value'],
'properties': {'numberOfItems': option['count']}
Expand All @@ -195,11 +251,17 @@ def addFacets(feed, path, facets):
opdsFacets.append(Facet(
metadata={'title': 'Show All Editions'},
links=[
{'href': '{}&showAll=true'.format(path), 'type': 'application/opds+json', 'title': 'True'},
{'href': '{}&showAll=false'.format(path), 'type': 'application/opds+json', 'title': 'False'}
{
'href': '{}&showAll=true'.format(path),
'type': 'application/opds+json',
'title': 'True'
},
{
'href': '{}&showAll=false'.format(path),
'type': 'application/opds+json',
'title': 'False'
}
]
))

feed.addFacets(opdsFacets)


48 changes: 35 additions & 13 deletions api/blueprints/drbSearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

search = Blueprint('search', __name__, url_prefix='/search')


@search.route('/', methods=['GET'])
def standardQuery():
esClient = ElasticClient(current_app.config['REDIS_CLIENT'])
Expand All @@ -30,37 +31,58 @@ def standardQuery():
readerVersion = searchParams.get('readerVersion', [None])[0]\
or current_app.config['READER_VERSION']

logger.info('Executing ES Query {} with filters {}'.format(searchParams, terms['filter']))
logger.info('Executing ES Query {} with filters {}'.format(
searchParams, terms['filter'])
)

try:
searchResult = esClient.searchQuery(terms, page=searchPage, perPage=searchSize)
searchResult = esClient.searchQuery(
terms, page=searchPage, perPage=searchSize
)
except ElasticClientError as e:
return APIUtils.formatResponseObject(
400, 'searchResponse', {'message': str(e)}
)

resultIds = [
(r.uuid, [e.edition_id for e in r.meta.inner_hits.editions.hits])
for r in searchResult.hits
]
results = []
for res in searchResult.hits:
editionIds = [e.edition_id for e in res.meta.inner_hits.editions.hits]

try:
highlights = {
key: list(set(res.meta.highlight[key]))
for key in res.meta.highlight
}
except AttributeError:
highlights = {}

results.append((res.uuid, editionIds, highlights))

if esClient.sortReversed is True:
resultIds = [r for r in reversed(resultIds)]
results = [r for r in reversed(results)]

filteredFormats = [
mediaType for f in list(filter(lambda x: x[0] == 'format', terms['filter']))
mediaType for f in list(filter(
lambda x: x[0] == 'format', terms['filter']
))
for mediaType in APIUtils.FORMAT_CROSSWALK[f[1]]
]

logger.info('Executing DB Query for {} editions'.format(len(resultIds)))
logger.info('Executing DB Query for {} editions'.format(len(results)))

works = dbClient.fetchSearchedWorks(resultIds)
facets = APIUtils.formatAggregationResult(searchResult.aggregations.to_dict())
paging = APIUtils.formatPagingOptions(searchPage + 1, searchSize, searchResult.hits.total)
works = dbClient.fetchSearchedWorks(results)
facets = APIUtils.formatAggregationResult(
searchResult.aggregations.to_dict()
)
paging = APIUtils.formatPagingOptions(
searchPage + 1, searchSize, searchResult.hits.total
)

dataBlock = {
'totalWorks': searchResult.hits.total,
'works': APIUtils.formatWorkOutput(works, resultIds, formats=filteredFormats, reader=readerVersion),
'works': APIUtils.formatWorkOutput(
works, results, formats=filteredFormats, reader=readerVersion
),
'paging': paging,
'facets': facets
}
Expand Down
26 changes: 17 additions & 9 deletions api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,22 +110,24 @@ def formatWorkOutput(
outWorks = []
workDict = {str(work.uuid): work for work in works}

for workUUID, editionIds in identifiers:
for workUUID, editionIds, highlights in identifiers:
work = workDict.get(workUUID, None)

if work is None:
continue

outWorks.append(
cls.formatWork(
work,
editionIds,
showAll,
formats=formats,
reader=reader
)
outWork = cls.formatWork(
work,
editionIds,
showAll,
formats=formats,
reader=reader
)

cls.addWorkMeta(outWork, highlights=highlights)

outWorks.append(outWork)

return outWorks
else:
formattedWork = cls.formatWork(
Expand Down Expand Up @@ -167,6 +169,12 @@ def formatWork(cls, work, editionIds, showAll, formats=None, reader=None):

return workDict

@classmethod
def addWorkMeta(cls, work, **kwargs):
    """Attach a ``_meta`` entry to a formatted work, in place.

    Every keyword argument becomes one key/value pair of the ``_meta``
    dict; any existing ``work['_meta']`` value is overwritten.

    Args:
        work: Mapping that supports item assignment (the formatted work).
        **kwargs: Metadata fields to store, e.g. ``highlights=...``.

    Returns:
        None. ``work`` is mutated.
    """
    # dict(kwargs) makes the same shallow copy as the identity comprehension.
    work['_meta'] = dict(kwargs)

@classmethod
def formatEditionOutput(
cls, edition, records=None, showAll=False, reader=None
Expand Down
Loading

0 comments on commit b5099a1

Please sign in to comment.