Skip to content

Commit

Permalink
[ENG-6284] render tsv/csv (CenterForOpenScience#834)
Browse files Browse the repository at this point in the history
allow rendering search responses as lines of tab-separated or comma-separated values

main point:

- add simple_tsv and simple_csv renderers in trove.render
- - can be seen with query param acceptMediatype=text/tab-separated-values or acceptMediatype=text/csv
- - get default columns from static DEFAULT_TABULAR_SEARCH_COLUMN_PATHS in trove.vocab.osfmap
- allow "download" responses -- add withFileName=foo query param to get a response with Content-Disposition: attachment and a filename based on "foo"
- allow absurd page sizes

changes made along the way:

- introduce ProtoRendering as renderer output type, to better decouple rendering from view logic
- - include StreamableRendering for responses that could be streamed, like csv/tsv (though it's not currently handled any differently from SimpleRendering)
- - reshape BaseRenderer (and each existing renderer) to have a consistent call signature (and return ProtoRendering)
- - - replace trove.render.get_renderer with trove.render.get_renderer_type -- instantiate the renderer with response data
- add trove.views._responder with common logic for building a django HttpResponse for a ProtoRendering
- - consistently handles withFileName/Content-Disposition
- move some osf-specific constants to trove.vocab.osfmap for easier reuse
- pull out some abstractable logic:
- - from existing trove.render.simple_json into trove.render._simple_trovesearch (for renderers that include only the list of search results)
- - from existing tests.trove.derive._base into tests.trove._input_output_tests (for tests following the same simple input/output pattern as deriver and renderer tests)
- add tests.trove.render to cover the new renderers simple_tsv and simple_csv, as well as the existing renderers jsonapi, simple_json, jsonld, and turtle
- - minimally update existing renderers to create consistent output
  • Loading branch information
aaxelb authored Dec 23, 2024
1 parent 24bc70a commit 75ab046
Show file tree
Hide file tree
Showing 44 changed files with 2,366 additions and 636 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ xmltodict==0.12.0 # MIT
# Allows custom-rendered IDs, hiding null values, and including data in error responses
git+https://github.com/cos-forks/[email protected]+cos0

git+https://github.com/aaxelb/[email protected].09
git+https://github.com/aaxelb/[email protected].14
10 changes: 5 additions & 5 deletions share/search/index_strategy/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
CardsearchParams,
ValuesearchParams,
)
from trove.trovesearch.search_response import (
CardsearchResponse,
ValuesearchResponse,
from trove.trovesearch.search_handle import (
CardsearchHandle,
ValuesearchHandle,
)


Expand Down Expand Up @@ -219,10 +219,10 @@ def pls_stop_keeping_live(self):
def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict:
raise NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__sharev2_backcompat (either implement it or don\'t use this strategy for backcompat)')

def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse:
def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle:
raise NotImplementedError

def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle:
raise NotImplementedError

def pls_get_mappings(self) -> dict:
Expand Down
22 changes: 5 additions & 17 deletions share/search/index_strategy/_trovesearch_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,15 @@
)
from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri
from trove.vocab.namespaces import (
DCTERMS,
FOAF,
OSFMAP,
OWL,
RDF,
RDFS,
SKOS,
TROVE,
XSD,
)
from trove.vocab.osfmap import is_date_property
from trove.vocab.osfmap import (
is_date_property,
SKIPPABLE_PROPERTIES,
)


_logger = logging.getLogger(__name__)
Expand All @@ -38,16 +36,6 @@
###
# constants

SKIPPABLE_PROPERTIES = (
OSFMAP.contains, # too much, not helpful
OWL.sameAs, # handled special
)

TITLE_PROPERTIES = (DCTERMS.title,)
NAME_PROPERTIES = (FOAF.name, OSFMAP.fileName)
LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)

KEYWORD_LENGTH_MAX = 8191 # skip keyword terms that might exceed lucene's internal limit
# (see https://www.elastic.co/guide/en/elasticsearch/reference/current/ignore-above.html)
KEYWORD_MAPPING = {'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX}
Expand Down Expand Up @@ -160,7 +148,7 @@ def __post_init__(self):
self.integer_values[_walk_path].add(_walk_obj)
elif isinstance(_walk_obj, rdf.Literal):
if XSD.integer in _walk_obj.datatype_iris:
self.integer_values[_walk_path].add(_walk_obj)
self.integer_values[_walk_path].add(int(_walk_obj.unicode_value))
if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris):
self.text_values[_walk_path].add(_walk_obj)
# try for date in a date property, regardless of the above
Expand Down
54 changes: 26 additions & 28 deletions share/search/index_strategy/trove_indexcard_flats.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,20 @@
SortParam,
GLOB_PATHSTEP,
)
from trove.trovesearch.search_response import (
CardsearchResponse,
ValuesearchResponse,
from trove.trovesearch.search_handle import (
CardsearchHandle,
ValuesearchHandle,
TextMatchEvidence,
CardsearchResult,
ValuesearchResult,
PropertypathUsage,
)
from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword
from trove.vocab.osfmap import is_date_property
from trove.vocab import osfmap
from trove.vocab.namespaces import RDF, OWL
from ._trovesearch_util import (
latest_rdf_for_indexcard_pks,
GraphWalk,
TITLE_PROPERTIES,
NAME_PROPERTIES,
LABEL_PROPERTIES,
NAMELIKE_PROPERTIES,
KEYWORD_LENGTH_MAX,
)

Expand Down Expand Up @@ -288,7 +284,7 @@ def pls_handle_search__sharev2_backcompat(self, request_body=None, request_query
params=(request_queryparams or {}),
)

def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchResponse:
def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle:
_cursor = self._cardsearch_cursor(cardsearch_params)
_sort = self._cardsearch_sort(cardsearch_params.sort_list)
_query = self._cardsearch_query(
Expand All @@ -306,7 +302,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
aggs=self._cardsearch_aggs(cardsearch_params),
sort=_sort,
from_=_from_offset,
size=_cursor.page_size,
size=_cursor.bounded_page_size,
source=False, # no need to get _source; _id is enough
)
if settings.DEBUG:
Expand All @@ -318,11 +314,11 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear
)
except elasticsearch8.TransportError as error:
raise exceptions.IndexStrategyError() from error # TODO: error messaging
return self._cardsearch_response(cardsearch_params, _es8_response, _cursor)
return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor)

def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle:
_cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor)
_is_date_search = is_date_property(valuesearch_params.valuesearch_propertypath[-1])
_is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1])
_search_kwargs = dict(
query=self._cardsearch_query(
valuesearch_params.cardsearch_filter_set,
Expand All @@ -347,7 +343,7 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value
)
except elasticsearch8.TransportError as error:
raise exceptions.IndexStrategyError() from error # TODO: error messaging
return self._valuesearch_response(valuesearch_params, _es8_response, _cursor)
return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor)

###
# query implementation
Expand Down Expand Up @@ -449,7 +445,7 @@ def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: O
_nested_terms_agg = {
'field': 'nested_iri.iri_value',
# WARNING: terribly inefficient pagination (part one)
'size': cursor.start_offset + cursor.page_size + 1,
'size': cursor.start_offset + cursor.bounded_page_size + 1,
}
_iris = list(valuesearch_params.valuesearch_iris())
if _iris:
Expand Down Expand Up @@ -526,7 +522,7 @@ def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams):
}
return _aggs

def _valuesearch_response(
def _valuesearch_handle(
self,
valuesearch_params: ValuesearchParams,
es8_response: dict,
Expand All @@ -537,31 +533,33 @@ def _valuesearch_response(
_buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets']
_bucket_count = len(_buckets)
# WARNING: terribly inefficient pagination (part two)
_page_end_index = cursor.start_offset + cursor.page_size
_page_end_index = cursor.start_offset + cursor.bounded_page_size
_bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages
cursor.total_count = (
MANY_MORE
if (_bucket_count > _page_end_index) # agg includes one more, if there
else _bucket_count
)
return ValuesearchResponse(
return ValuesearchHandle(
cursor=cursor,
search_result_page=[
self._valuesearch_iri_result(_iri_bucket)
for _iri_bucket in _bucket_page
],
search_params=valuesearch_params,
)
else: # assume date
_year_buckets = (
es8_response['aggregations']['in_nested_date']
['value_at_propertypath']['count_by_year']['buckets']
)
return ValuesearchResponse(
return ValuesearchHandle(
cursor=PageCursor(len(_year_buckets)),
search_result_page=[
self._valuesearch_date_result(_year_bucket)
for _year_bucket in _year_buckets
],
search_params=valuesearch_params,
)

def _valuesearch_iri_result(self, iri_bucket):
Expand Down Expand Up @@ -664,7 +662,7 @@ def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]:
else:
raise ValueError(f'invalid date filter operator (got {search_filter.operator})')

def _cardsearch_sort(self, sort_list: tuple[SortParam]):
def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]):
if not sort_list:
return None
return [
Expand All @@ -683,12 +681,12 @@ def _cardsearch_sort(self, sort_list: tuple[SortParam]):
for _sortparam in sort_list
]

def _cardsearch_response(
def _cardsearch_handle(
self,
cardsearch_params: CardsearchParams,
es8_response: dict,
cursor: OffsetCursor,
) -> CardsearchResponse:
) -> CardsearchHandle:
_es8_total = es8_response['hits']['total']
if _es8_total['relation'] != 'eq':
cursor.total_count = MANY_MORE
Expand Down Expand Up @@ -717,11 +715,11 @@ def _cardsearch_response(
for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']:
_path = tuple(json.loads(_bucket['key']))
_relatedproperty_by_path[_path].usage_count += _bucket['doc_count']
return CardsearchResponse(
return CardsearchHandle(
cursor=cursor,
search_result_page=_results,
related_propertypath_results=_relatedproperty_list,
cardsearch_params=cardsearch_params,
search_params=cardsearch_params,
)

def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]:
Expand Down Expand Up @@ -833,7 +831,7 @@ def _inner_hits(self, *, highlight_query=None) -> dict:

def _should_skip_card(indexcard_rdf, rdfdoc):
# skip cards without some value for name/title/label
return not any(rdfdoc.q(indexcard_rdf.focus_iri, NAMELIKE_PROPERTIES))
return not any(rdfdoc.q(indexcard_rdf.focus_iri, osfmap.NAMELIKE_PROPERTIES))


def _bucketlist(agg_result: dict) -> list[str]:
Expand Down Expand Up @@ -911,17 +909,17 @@ def for_iri_at_path(cls, path: tuple[str, ...], iri: str, rdfdoc):
# TODO: don't discard language for name/title/label
name_text=frozenset(
_text.unicode_value
for _text in rdfdoc.q(iri, NAME_PROPERTIES)
for _text in rdfdoc.q(iri, osfmap.NAME_PROPERTIES)
if isinstance(_text, primitive_rdf.Literal)
),
title_text=frozenset(
_text.unicode_value
for _text in rdfdoc.q(iri, TITLE_PROPERTIES)
for _text in rdfdoc.q(iri, osfmap.TITLE_PROPERTIES)
if isinstance(_text, primitive_rdf.Literal)
),
label_text=frozenset(
_text.unicode_value
for _text in rdfdoc.q(iri, LABEL_PROPERTIES)
for _text in rdfdoc.q(iri, osfmap.LABEL_PROPERTIES)
if isinstance(_text, primitive_rdf.Literal)
),
)
Expand Down
Loading

0 comments on commit 75ab046

Please sign in to comment.