From ca95db40201f012ad9dea7b831b590d9f25e1bd1 Mon Sep 17 00:00:00 2001 From: PascalEgn Date: Tue, 6 Aug 2024 17:53:32 +0200 Subject: [PATCH] global: add pre-commit with ruff --- .github/workflows/build-and-release.yml | 1 - .gitignore | 2 +- .pre-commit-config.yaml | 17 + examples/demo_parser.py | 181 ----- inspire_query_parser/__init__.py | 7 +- inspire_query_parser/ast.py | 63 +- inspire_query_parser/config.py | 84 +- inspire_query_parser/parser.py | 605 +++++++++----- inspire_query_parser/parsing_driver.py | 65 +- inspire_query_parser/stateful_pypeg_parser.py | 25 +- .../utils/format_parse_tree.py | 27 +- inspire_query_parser/utils/visitor_utils.py | 378 ++++----- .../visitors/elastic_search_visitor.py | 702 +++++++++------- .../visitors/restructuring_visitor.py | 234 ++++-- inspire_query_parser/visitors/visitor_impl.py | 5 +- ruff.toml | 28 + run-tests.sh | 1 - setup.py | 30 +- tests/conftest.py | 21 +- tests/helpers/test_utils.py | 13 +- tests/test_elastic_search_visitor.py | 153 ++-- tests/test_format_parse_tree.py | 31 +- tests/test_parser.py | 60 +- tests/test_parser_functionality.py | 62 +- tests/test_parsing_driver.py | 31 +- tests/test_restructuring_visitor.py | 754 ++++++++++-------- tests/test_visitor_utils.py | 170 ++-- 27 files changed, 2065 insertions(+), 1685 deletions(-) create mode 100644 .pre-commit-config.yaml delete mode 100644 examples/demo_parser.py create mode 100644 ruff.toml diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 69c7afd..47787d7 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -47,4 +47,3 @@ jobs: with: user: __token__ password: ${{ secrets.pypi_password }} - diff --git a/.gitignore b/.gitignore index bea12b0..e652a7f 100644 --- a/.gitignore +++ b/.gitignore @@ -106,4 +106,4 @@ CHANGELOG .idea # vscode -.vscode \ No newline at end of file +.vscode diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2c56732 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: fix-byte-order-marker + - id: mixed-line-ending + - id: name-tests-test + args: [ --pytest-test-first ] + exclude: '^(?!factories/)' + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.6 + hooks: + - id: ruff + args: [ --fix ] diff --git a/examples/demo_parser.py b/examples/demo_parser.py deleted file mode 100644 index 7946d19..0000000 --- a/examples/demo_parser.py +++ /dev/null @@ -1,181 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of INSPIRE. -# Copyright (C) 2014-2017 CERN. -# -# INSPIRE is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# INSPIRE is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with INSPIRE. If not, see . 
-# -# In applying this license, CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization -# or submit itself to any jurisdiction. - -from __future__ import print_function, unicode_literals - -import sys - -from inspire_query_parser.parser import Query -from inspire_query_parser.stateful_pypeg_parser import StatefulParser -from inspire_query_parser.utils.format_parse_tree import emit_tree_format -from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor - - -def repl(): - """Read-Eval-Print-Loop for reading the query, printing it and its parse tree. - - Exit the loop either with an interrupt or "quit". - """ - while True: - try: - sys.stdout.write("Type in next query: \n> ") - import locale - query_str = raw_input().decode(sys.stdin.encoding or locale.getpreferredencoding(True)) - except KeyboardInterrupt: - break - - if u'quit' in query_str: - break - - print_query_and_parse_tree(query_str) - - -def print_query_and_parse_tree(query_str): - parser = StatefulParser() - print('\033[94m' + "Parsing " + '\033[1m' + query_str + "" + '\033[0m') - _, parse_tree = parser.parse(query_str, Query) - print('\033[92m' + emit_tree_format(parse_tree.accept(RestructuringVisitor())) + '\033[0m') - print("————————————————————————————————————————————————————————————————————————————————") - - -if __name__ == '__main__': - # repl() - - # Find keyword combined with other production rules - print_query_and_parse_tree(r"FIN author:'ellis'") - print_query_and_parse_tree(r"find a T.A. Aibergenov and date = 1986") - print_query_and_parse_tree(r'Find author "ellis"') - print_query_and_parse_tree(r'f author ellis') - - # Invenio like search - print_query_and_parse_tree(r"author:ellis and title:boson") - print_query_and_parse_tree(r"unknown_keyword:'bar'") - print_query_and_parse_tree(r"dotted.keyword:'bar'") - - # Boolean operator testing (And/Or) - print_query_and_parse_tree(r"author ellis and title 'boson'") - print_query_and_parse_tree(r"f a appelquist and date 1983") - print_query_and_parse_tree(r"fin a henneaux and citedby a nicolai") - print_query_and_parse_tree(r"au ellis | title 'boson'") - print_query_and_parse_tree(r"-author ellis OR title 'boson'") - print_query_and_parse_tree(r"author ellis & title 'boson'") - - # Implicit And - # Works in the case of "A B": - # 1) B KeywordQuery is of format "keyword:value" - # 2) B is a NotQuery, e.g. "title foo not title bar" - # 3) A or B KeywordQueries have a ComplexValue as value, e.g. author 'ellis' title boson - # 4) B KeywordQuery has a keyword that is a non-shortened version of INSPIRE_KEYWORDS. - print_query_and_parse_tree(r"author ellis elastic.keyword:'boson'") - print_query_and_parse_tree(r"find cn atlas not tc c") - print_query_and_parse_tree(r"author:ellis j title:'boson' reference:M.N.1") - print_query_and_parse_tree(r"author ellis title 'boson' not title higgs") - print_query_and_parse_tree(r"author ellis - title 'boson'") - - # ##### Boolean operators at terminals level #### - # 1. Boolean operators among simple values - print_query_and_parse_tree(r"author ellis, j and smith") - # 2. An and query among terminals or and "j" signifies the "journal" keyword? 
- print_query_and_parse_tree(r"f author ellis, j and patrignani and j Chin.Phys.") - # This one is ambiguous since first name "j" overlaps with journals - print_query_and_parse_tree(r"f author ellis, j and patrignani and j ellis") - # While this is clearer - print_query_and_parse_tree(r"f author ellis, j and patrignani and j, ellis") - - # Negation - print_query_and_parse_tree(r"ellis and not title 'boson'") - print_query_and_parse_tree(r"-title 'boson'") - - # Nested expressions - print_query_and_parse_tree(r"author ellis, j. and (title boson or (author /^xi$/ and title foo))") - print_query_and_parse_tree(r"author ellis, j. and not (title boson or not (author /^xi$/ and title foo))") - - # Metadata search - print_query_and_parse_tree(r'fulltext:boson and (reference:Ellis or reference "Ellis")') - print_query_and_parse_tree(r"exactauthor:M.Vanderhaeghen.1 and ac: 42") - - # Simple phrases - print_query_and_parse_tree(r'ellis') - print_query_and_parse_tree(r"'ellis'") - - # Parenthesized keyword query values (working also with SPIRES operators - doesn't on legacy) - print_query_and_parse_tree(r"author:(title ellis)") - print_query_and_parse_tree(r"author (pardo, f AND slavich) OR (author:bernreuther and not date:2017)") - - # Non trivial terminals - print_query_and_parse_tree(r"author smith and j., ellis") - print_query_and_parse_tree(r"find title Alternative the Phase-II upgrade of the ATLAS Inner Detector or na61/shine") - print_query_and_parse_tree(r"find (j phys.rev. and vol d85) or (j phys.rev.lett.,62,1825)") - print_query_and_parse_tree(r"title e-10 and -author d'hoker") - print_query_and_parse_tree(r'a pang,yi and ekström and t SU(2)') # Full-width comma unicode character - print_query_and_parse_tree(r't e(+)e(-) or e+e- Colliders') - print_query_and_parse_tree(r"title: Si-28(p(pol.),n(pol.))") - print_query_and_parse_tree(r"t Si28(p→,p→′)Si28(6−,T=1) ") - print_query_and_parse_tree(r"ti C-12(vec-p,vec-n)N-12 (g.s.,1+)") - - # Regex - print_query_and_parse_tree(r"author:/^Ellis, (J|John)$/") - print_query_and_parse_tree(r"title:/dense ([^ $]* )?matter/") - - # Nestable keywords - print_query_and_parse_tree(r"referstox:author:s.p.martin.1") - print_query_and_parse_tree(r"find a parke, s j and refersto author witten") - print_query_and_parse_tree(r"citedbyx:author:s.p.martin.1") - print_query_and_parse_tree(r"citedby:author:s.p.martin.1") - print_query_and_parse_tree(r"-refersto:recid:1374998 and citedby:(A.A.Aguilar.Arevalo.1)") - print_query_and_parse_tree(r"citedby:(author A.A.Aguilar.Arevalo.1 and not a ellis)") - print_query_and_parse_tree(r"citedby:refersto:recid:1432705") - - # Ranges - print_query_and_parse_tree(r"d 2015->2017 and cited:1->9") - - # Empty query - print_query_and_parse_tree(r"") # Nothing - print_query_and_parse_tree(r" ") # Spaces and Tab - - # G, GE, LT, LE, E queries - print_query_and_parse_tree(r"date > 2000-10 and < 2000-12") - print_query_and_parse_tree(r"date after 10/2000 and before 2000-12") - print_query_and_parse_tree(r"date >= nov 2000 and d<=2005") - print_query_and_parse_tree(r"date 1978+ + -ac 100+") - print_query_and_parse_tree(r"f a wimpenny and date = 1987") - - # Date specifiers - print_query_and_parse_tree(r"date today - 2 and title foo") - print_query_and_parse_tree(r"date this month author ellis") - print_query_and_parse_tree(r"date yesterday - 2 - ac 100") - print_query_and_parse_tree(r"date last month - 2 + ac < 50") - print_query_and_parse_tree(r"date this month - 2") - print_query_and_parse_tree(r"du > yesterday - 2") - - # Star 
queries - print_query_and_parse_tree(r"find a 'o*aigh' and t \"alge*\" and date >2013") - print_query_and_parse_tree(r"a *alge | a alge* | a o*aigh") - - # Unrecognized queries - print_query_and_parse_tree(r"title and foo") - print_query_and_parse_tree(r"title γ-radiation and and") - - # The query below doesn't work on legacy. Currently, it is recognized as a boolean query (since theory is recognized - # as a keyword). Can be useful for testing multiple parse trees generation (one with the first parse and a second - # with removing ":" character (could be one heuristic)). - # print_query_and_parse_tree(r"find t Closed string field theory: Quantum action") diff --git a/inspire_query_parser/__init__.py b/inspire_query_parser/__init__.py index 2598359..ca6c098 100644 --- a/inspire_query_parser/__init__.py +++ b/inspire_query_parser/__init__.py @@ -19,10 +19,9 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - -"""A PEG-based query parser for INSPIRE""" +"""A PEG-based query parser for INSPIRE.""" from __future__ import absolute_import, print_function -from . import config # noqa: F401 -from .parsing_driver import parse_query # noqa: F401 +from inspire_query_parser import config # noqa: F401 +from inspire_query_parser.parsing_driver import parse_query # noqa: F401 diff --git a/inspire_query_parser/ast.py b/inspire_query_parser/ast.py index 41efa83..cf82f5e 100644 --- a/inspire_query_parser/ast.py +++ b/inspire_query_parser/ast.py @@ -19,18 +19,15 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. +"""AbstractSyntaxTree classes along with their concrete ones. -""" -AbstractSyntaxTree classes along with their concrete ones. - -The module defines a generic AST element along with four AST node categories (which act as a basis for all the concrete -AST nodes) and finally, the concrete classes which represent the output of the parsing process. +The module defines a generic AST element along with four AST node +categories (which act as a basis for all the concrete AST nodes) and +finally, the concrete classes which represent the output of the parsing +process. -The generic AST node categories are: - - Leaf - - UnaryOp - - BinaryOp - - ListOp +The generic AST node categories are: - Leaf - UnaryOp - +BinaryOp - ListOp The concrete AST nodes, represent higher level (domain specific) nodes. 
""" @@ -40,18 +37,19 @@ # #### Abstract Syntax Tree classes #### class ASTElement(object): - """Root AbstractSyntaxTree node that acts as a stub for calling the Visitor's `visit` dispatcher method.""" + """Root AbstractSyntaxTree node that acts as a stub for calling the + Visitor's `visit` dispatcher method.""" + def accept(self, visitor, *args, **kwargs): return visitor.visit(self, *args, **kwargs) class Leaf(ASTElement): - def __init__(self, value=None): self.value = value def __eq__(self, other): - return type(self) == type(other) and self.value == other.value + return type(self) is type(other) and self.value == other.value def __repr__(self): return '%s(%r)' % (self.__class__.__name__, self.value) @@ -61,12 +59,11 @@ def __hash__(self): class UnaryOp(ASTElement): - def __init__(self, op): self.op = op def __eq__(self, other): - return type(self) == type(other) and self.op == other.op + return type(self) is type(other) and self.op == other.op def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.op) @@ -76,30 +73,29 @@ def __hash__(self): class BinaryOp(ASTElement): - def __init__(self, left, right): self.left = left self.right = right def __eq__(self, other): return ( - type(self) == type(other) - ) and ( - self.left == other.left - ) and ( - self.right == other.right + (type(self) is type(other)) + and (self.left == other.left) + and (self.right == other.right) ) def __repr__(self): - return "%s(%s, %s)" % (self.__class__.__name__, - repr(self.left), repr(self.right)) + return "%s(%s, %s)" % ( + self.__class__.__name__, + repr(self.left), + repr(self.right), + ) def __hash__(self): return hash((self.left, self.right)) class ListOp(ASTElement): - def __init__(self, children): try: iter(children) @@ -109,7 +105,7 @@ def __init__(self, children): self.children = children def __eq__(self, other): - return type(self) == type(other) and self.children == other.children + return type(self) is type(other) and self.children == other.children def __repr__(self): return "%s(%r)" % (self.__class__.__name__, self.children) @@ -144,15 +140,20 @@ class ValueOp(UnaryOp): class QueryWithMalformedPart(BinaryOp): - """A combination of recognized part of a query (with a parse tree) and some malformed input. + """A combination of recognized part of a query (with a parse tree) and some + malformed input. - Its left child is the recognized parse tree, while its right child has the :class:`MalformedQuery`. + Its left child is the recognized parse tree, while its right child + has the :class:`MalformedQuery`. 
""" + pass class MalformedQuery(ListOp): - """A :class:`ListOp` with children the unrecognized words of the parser's input.""" + """A :class:`ListOp` with children the unrecognized words of the parser's + input.""" + pass @@ -183,6 +184,7 @@ class Keyword(Leaf): class GenericValue(Leaf): """Represents a generic value, which might contain a wildcard.""" + WILDCARD_TOKEN = '*' def __init__(self, value, contains_wildcard=False): @@ -190,7 +192,10 @@ def __init__(self, value, contains_wildcard=False): self.contains_wildcard = contains_wildcard def __eq__(self, other): - return super(GenericValue, self).__eq__(other) and self.contains_wildcard == other.contains_wildcard + return ( + super(GenericValue, self).__eq__(other) + and self.contains_wildcard == other.contains_wildcard + ) def __hash__(self): return hash((super(GenericValue, self).__hash__(), self.contains_wildcard)) diff --git a/inspire_query_parser/config.py b/inspire_query_parser/config.py index 6601a93..8d9fba7 100644 --- a/inspire_query_parser/config.py +++ b/inspire_query_parser/config.py @@ -19,23 +19,20 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. +"""A collection of INSPIRE related keywords. -""" -A collection of INSPIRE related keywords. - -This dictionary has a twofold use. -Primarily, the parser uses its keys to generate INSPIRE related keywords (i.e. qualifiers) and secondly, provides -a normalization of the shortened keywords to their full version. +This dictionary has a twofold use. Primarily, the parser uses its keys +to generate INSPIRE related keywords (i.e. qualifiers) and secondly, +provides a normalization of the shortened keywords to their full +version. """ from __future__ import unicode_literals INSPIRE_PARSER_NONDATE_KEYWORDS = { # Abstract 'abstract': 'abstract', - # Address 'address': 'address', - # Affiliation 'affiliation': 'affiliation', 'affil': 'affiliation', @@ -43,95 +40,73 @@ 'af': 'affiliation', 'institution': 'affiliation', 'inst': 'affiliation', - # Affiliation Id 'affid': 'affiliation-id', 'affiliation-id': 'affiliation-id', - # Author 'author': 'author', 'au': 'author', 'a': 'author', 'name': 'author', - # Author-Count 'author-count': 'author-count', 'authorcount': 'author-count', 'ac': 'author-count', - # Cataloguer 'cat': 'cataloguer', - # Caption 'caption': 'caption', - # Cite, i.e. records that cite the given search term # Cite and c: SPIRES syntax while reference is INVENIO syntax 'cite': 'cite', 'c': 'cite', 'reference': 'cite', - # Citedby related 'citedby': 'citedby', # nested keyword query - # Cited by excluding self sites, e.g. citedbyexcludingselfcites:author:M.E.Peskin.1 'citedbyexcludingselfcites': 'citedbyexcludingselfcites', 'citedbyx': 'citedbyexcludingselfcites', - # Cited excluding self sites, e.g. 
citedexcludingselfcites:50+ 'citedexcludingselfcites': 'citedexcludingselfcites', 'cx': 'citedexcludingselfcites', - # Collaboration 'collaboration': 'collaboration', 'cn': 'collaboration', - # Conference number 'cnum': 'confnumber', - # Control number 'control_number': 'control_number', 'recid': 'control_number', - # Country 'country': 'country', 'cc': 'country', - # DOI 'doi': 'doi', - # ePrint 'bb': 'eprint', 'bull': 'eprint', 'eprint': 'eprint', 'arxiv': 'eprint', 'arXiv': 'eprint', - # Exact-Author 'exact-author': 'exact-author', 'exactauthor': 'exact-author', 'ea': 'exact-author', - # Experiment 'experiment': 'experiment', 'exp': 'experiment', - # Field-code 'fc': 'field-code', 'field-code': 'field-code', - # First-Author 'first-author': 'first_author', 'firstauthor': 'first_author', 'fa': 'first_author', - # Fulltext 'fulltext': 'fulltext', 'ft': 'fulltext', - # SPIRES identifiers 'irn': 'irn', - # Journal related 'coden': 'journal', 'journal': 'journal', @@ -139,34 +114,28 @@ 'published_in': 'journal', 'volume': 'volume', 'vol': 'volume', - # Keyword # keyword is Invenio style, while the rest are from SPIRES syntax. 'keyword': 'keyword', 'keywords': 'keyword', 'kw': 'keyword', 'k': 'keyword', - # Primary archive 'primarch': 'primary_arxiv_category', - # rawref 'rawref': 'rawref', - # Reference 'citation': 'reference', 'jour-vol-page': 'reference', 'jvp': 'reference', - # Refersto operator # Nested keyword query 'refersto': 'refersto', - - # Refers to excluding self cites, e.g. referstoexcludingselfcites:author:M.E.Peskin.1 + # Refers to excluding self cites, + # e.g. referstoexcludingselfcites:author:M.E.Peskin.1 # Nested keyword queries 'referstoexcludingselfcites': 'referstoexcludingselfcites', 'referstox': 'referstoexcludingselfcites', - # Report number 'reportnumber': 'reportnumber', 'report-num': 'reportnumber', @@ -174,24 +143,19 @@ 'rept': 'reportnumber', 'rn': 'reportnumber', 'r': 'reportnumber', - # Subject 'subject': 'subject', - # Title 'title': 'title', 'ti': 'title', 't': 'title', - # texkey 'texkey': 'texkeys.raw', - # Topcite, i.e. citation count # Cited used to be for Invenio style syntax while topcite for SPIRES 'cited': 'topcite', 'topcit': 'topcite', 'topcite': 'topcite', - # Type-Code 'type-code': 'type-code', 'type': 'type-code', @@ -199,7 +163,7 @@ 'ty': 'type-code', 'scl': 'type-code', 'ps': 'type-code', - 'collection': 'type-code', # Queries for this one include "collection published" only + 'collection': 'type-code',# Queries for this one include "collection published" only } INSPIRE_PARSER_DATE_KEYWORDS = { @@ -208,21 +172,17 @@ 'd': 'date', # From queries dataset, users seem to use year and date interchangeably. 
'year': 'date', - # Date added 'date-added': 'date-added', 'dadd': 'date-added', 'da': 'date-added', - # Date earliest 'date-earliest': 'date-earliest', 'de': 'date-earliest', - # Date updated 'date-updated': 'date-updated', 'dupd': 'date-updated', 'du': 'date-updated', - # Journal year 'journal-year': 'publication_info.year', 'jy': 'publication_info.year', @@ -242,13 +202,33 @@ DATE_TODAY_REGEX_PATTERN, DATE_YESTERDAY_REGEX_PATTERN, DATE_THIS_MONTH_REGEX_PATTERN, - DATE_LAST_MONTH_REGEX_PATTERN + DATE_LAST_MONTH_REGEX_PATTERN, ) MONTH_REGEX = "|".join( [ - "january", "jan", "february", "feb", "march", "mar", "april", "apr", "may", - "june", 'jun', "july", "jul", "august", "aug", - "september", "sep", "october", "oct", "november", "nov", "december", "dec" + "january", + "jan", + "february", + "feb", + "march", + "mar", + "april", + "apr", + "may", + "june", + 'jun', + "july", + "jul", + "august", + "aug", + "september", + "sep", + "october", + "oct", + "november", + "nov", + "december", + "dec", ] ) # ##### diff --git a/inspire_query_parser/parser.py b/inspire_query_parser/parser.py index f031bab..eff9fb8 100644 --- a/inspire_query_parser/parser.py +++ b/inspire_query_parser/parser.py @@ -22,34 +22,55 @@ from __future__ import print_function, unicode_literals +import datefinder import six +from pypeg2 import ( + Enum, + GrammarValueError, + K, + Keyword, + Literal, + attr, + contiguous, + maybe_some, + omit, + optional, + re, + some, + whitespace, +) -from inspire_query_parser.config import DATE_SPECIFIERS_COLLECTION -from pypeg2 import (Enum, GrammarValueError, K, Keyword, Literal, attr, - contiguous, maybe_some, omit, optional, re, some, - whitespace) +from inspire_query_parser import ast +from inspire_query_parser.config import ( + DATE_SPECIFIERS_COLLECTION, + INSPIRE_PARSER_DATE_KEYWORDS, + INSPIRE_PARSER_KEYWORDS, + INSPIRE_PARSER_NONDATE_KEYWORDS, + MONTH_REGEX, +) -from . import ast -from .config import MONTH_REGEX, INSPIRE_PARSER_KEYWORDS, INSPIRE_PARSER_DATE_KEYWORDS, INSPIRE_PARSER_NONDATE_KEYWORDS -from dateutil import parser as date_parser -import datefinder # TODO Restrict what a simple query (i.e. Value) can accept (remove LessThanOp, etc.). -# For 'date > 2013 and < 2017' probably allow LessThanOp into SimpleValueBooleanQuery. +# For 'date > 2013 and < 2017' probably allow LessThanOp into +# SimpleValueBooleanQuery. # TODO 'date > 2000-10 and < date 2000-12' parses without a malformed query. (First fix the above) # #### Parser customization #### class CaseInsensitiveKeyword(Keyword): - """Supports case insensitive keywords + """Supports case insensitive keywords. - All subtypes must declare a grammar attribute with an Enum of accepted keywords/literals. + All subtypes must declare a grammar attribute with an Enum of + accepted keywords/literals. """ + def __init__(self, keyword): """Adds lowercase keyword to the keyword table.""" try: - self.grammar + self.grammar # noqa B018 except AttributeError: - raise GrammarValueError(self.__class__.__name__ + " expects a grammar attribute (Enum).") + raise GrammarValueError( + self.__class__.__name__ + " expects a grammar attribute (Enum)." + ) keyword = keyword.lower() if keyword not in Keyword.table: @@ -63,9 +84,11 @@ def parse(cls, parser, text, pos): if match: # Check if match is is not in the grammar of the specific keyword class. 
if match.group(0).lower() not in cls.grammar: - result = text, SyntaxError(repr(match.group(0)) + " is not a member of " + repr(cls.grammar)) + result = text, SyntaxError( + repr(match.group(0)) + " is not a member of " + repr(cls.grammar) + ) else: - result = text[len(match.group(0)):], cls(match.group(0)) + result = text[len(match.group(0)) :], cls(match.group(0)) else: result = text, SyntaxError("expecting " + repr(cls.__name__)) return result @@ -85,6 +108,7 @@ def __repr__(self): class BooleanOperator(object): """Serves as the possible case for a boolean operator.""" + AND = 'and' OR = 'or' @@ -111,13 +135,16 @@ def __init__(self, left=None, right=None): class BooleanRule(ast.BinaryOp): """Represents a boolean query rule. - This means that there is a left and right node, but also the boolean operator of the rule. - Can be called by PyPeg framework either when constructing a boolean query (which supports implicit and) or when - constructing a boolean query among simple values (thus, no implicit and support). + This means that there is a left and right node, but also the boolean + operator of the rule. Can be called by PyPeg framework either when + constructing a boolean query (which supports implicit and) or when + constructing a boolean query among simple values (thus, no implicit + and support). - Note: - When a BooleanRule is created from PyPeg, the format of the arguments is an iterable, when it's created from - the custom parse method of simple value boolean query, the non-default arguments are being used. + Note: When a BooleanRule is created from PyPeg, the format of + the arguments is an iterable, when it's created from the custom + parse method of simple value boolean query, the non-default + arguments are being used. """ def __init__(self, args, bool_op=None, right=None): @@ -133,7 +160,7 @@ def __init__(self, args, bool_op=None, right=None): self.left = args[0] if len(args) == 3: - if isinstance(args[1], And) or isinstance(args[1], Or): + if isinstance(args[1], (And, Or)) : self.bool_op = args[1] else: raise ValueError("Unexpected boolean operator: " + repr(args[1])) @@ -143,13 +170,17 @@ def __init__(self, args, bool_op=None, right=None): self.right = args[len(args) - 1] def __eq__(self, other): - return super(BooleanRule, self).__eq__(other) and type(self.bool_op) == type(other.bool_op) # noqa:E721 + return super(BooleanRule, self).__eq__(other) and type(self.bool_op) is type( + other.bool_op + ) # noqa:E721 def __repr__(self): - return "%s(%r, %r, %r)" % (self.__class__.__name__, - self.left, - self.bool_op, - self.right) + return "%s(%r, %r, %r)" % ( + self.__class__.__name__, + self.left, + self.bool_op, + self.right, + ) def __hash__(self): return hash((self.left, self.bool_op, self.right)) @@ -158,28 +189,31 @@ def __hash__(self): class ListRule(ast.ListOp): def __init__(self, children): super(ListRule, self).__init__(children) + + # ######################## # #### Keywords #### class And(CIKeyword): - """ - The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking whether - terminal symbols are actually DSL keywords. - """ + """The reason for defining an Enum grammar of Keywords is for populating + the Keyword.table for checking whether terminal symbols are actually DSL + keywords.""" + regex = re.compile(r"(and|\+|&)", re.IGNORECASE) grammar = Enum(K("and"), K("+"), K("&")) def __init__(self, *args): - # Normalize different AND keywords (ignore the keyword argument that was passed). 
+ # Normalize different AND keywords + # (ignore the keyword argument that was passed). super(And, self).__init__(BooleanOperator.AND) class Or(CIKeyword): - """ - The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking whether - terminal symbols are actually DSL keywords. - """ + """The reason for defining an Enum grammar of Keywords is for populating + the Keyword.table for checking whether terminal symbols are actually DSL + keywords.""" + regex = re.compile(r"(or|\|)", re.IGNORECASE) grammar = Enum(K("or"), K("|")) @@ -189,12 +223,14 @@ def __init__(self, *args): class Not(CIKeyword): - """ - The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking whether - terminal symbols are actually DSL keywords. - """ + """The reason for defining an Enum grammar of Keywords is for populating + the Keyword.table for checking whether terminal symbols are actually DSL + keywords.""" + regex = re.compile(r"(not|-)", re.IGNORECASE) grammar = Enum(K("not"), K("-")) + + # ######################## @@ -204,7 +240,8 @@ class Whitespace(LeafRule): class InspireKeyword(LeafRule): - # InspireKeyword expects a word boundary at its end, excluding [.,] characters, since these might signify names. + # InspireKeyword expects a word boundary at its end, excluding [.,] characters, + # since these might signify names. grammar = re.compile( r"({0})(?![,.])(?=(:|\b))".format( "|".join(INSPIRE_PARSER_NONDATE_KEYWORDS.keys()) @@ -247,20 +284,26 @@ def parse(cls, parser, text, pos): class SimpleValueUnit(LeafRule): - """Represents either a terminal symbol (without parentheses) or a parenthesized SimpleValue. - - The parenthesized case (2nd option of SimpleValueUnit) accepts a SimpleValue which is the more generic case of - plaintext and in turn (its grammar) encapsulates whitespace and SimpleValueUnit recognition. + """Represents either a terminal symbol (without parentheses) or a + parenthesized SimpleValue. + The parenthesized case (2nd option of SimpleValueUnit) accepts a + SimpleValue which is the more generic case of plaintext and in turn + (its grammar) encapsulates whitespace and SimpleValueUnit + recognition. """ + token_regex = re.compile(r"[^\s:)(]+", re.UNICODE) - date_specifiers_regex = re.compile(r"({})\s*-\s*\d+".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE) + date_specifiers_regex = re.compile( + r"({})\s*-\s*\d+".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE + ) parenthesized_token_grammar = None # is set after SimpleValue definition. starts_with_colon = re.compile(r"\s*:", re.UNICODE) - """Used for recognizing whether terminal token is a keyword (i.e. followed by some whitespace and ":".""" + """Used for recognizing whether terminal token is a keyword (i.e. followed + by some whitespace and ":".""" def __init__(self, args): super(SimpleValueUnit, self).__init__() @@ -273,17 +316,20 @@ def __init__(self, args): @classmethod def parse_terminal_token(cls, parser, text): - """Parses a terminal token that doesn't contain parentheses nor colon symbol. + """Parses a terminal token that doesn't contain parentheses nor colon + symbol. - Note: - Handles a special case of tokens where a ':' is needed (for `texkey` queries). + Note: Handles a special case of tokens where a ':' is needed + (for `texkey` queries). - If we're parsing text not in parentheses, then some DSL keywords (e.g. 
And, Or, Not, defined above) should - not be recognized as terminals, thus we check if they are in the Keywords table (namespace like structure - handled by PyPeg). - This is done only when we are not parsing a parenthesized SimpleValue. + If we're parsing text not in parentheses, then some DSL keywords + (e.g. And, Or, Not, defined above) should not be recognized as + terminals, thus we check if they are in the Keywords table + (namespace like structure handled by PyPeg). This is done only + when we are not parsing a parenthesized SimpleValue. - Also, helps in supporting more implicit-and queries cases (last two checks). + Also, helps in supporting more implicit-and queries cases (last + two checks). """ token_regex = cls.token_regex @@ -291,58 +337,74 @@ def parse_terminal_token(cls, parser, text): if match: matched_token = match.group(0) - # Check if token is a DSL keyword. Disable this check in the case where the parser isn't parsing a - # parenthesized terminal. - if not parser._parsing_parenthesized_terminal and matched_token.lower() in Keyword.table: + # Check if token is a DSL keyword. Disable this check in the case where + # the parser isn't parsing a parenthesized terminal. + if ( + not parser._parsing_parenthesized_terminal + and matched_token.lower() in Keyword.table + ): return text, SyntaxError("found DSL keyword: " + matched_token) - remaining_text = text[len(matched_token):] + remaining_text = text[len(matched_token) :] - # Attempt to recognize whether current terminal is followed by a ":", which definitely signifies that - # we are parsing a keyword, and we shouldn't. + # Attempt to recognize whether current terminal is followed by a ":", + # which definitely signifies that we are parsing a keyword, + # and we shouldn't. if cls.starts_with_colon.match(remaining_text): - return text, \ - SyntaxError("parsing a keyword (token followed by \":\"): \"" + repr(matched_token) + "\"") + return text, SyntaxError( + "parsing a keyword (token followed by \":\"): \"" + + repr(matched_token) + + "\"" + ) result = remaining_text, matched_token else: - result = text, SyntaxError("expecting match on " + repr(cls.token_regex.pattern)) + result = text, SyntaxError( + "expecting match on " + repr(cls.token_regex.pattern) + ) return result @classmethod def parse(cls, parser, text, pos): """Imitates parsing a list grammar. - Specifically, this - grammar = [ - SimpleValueUnit.date_specifiers_regex, - SimpleValueUnit.token_regex, - SimpleValueUnit.parenthesized_token_grammar - ]. + Specifically, this grammar = [ + SimpleValueUnit.date_specifiers_regex, + SimpleValueUnit.token_regex, + SimpleValueUnit.parenthesized_token_grammar ]. - Parses plaintext which matches date specifiers or arxiv_identifier syntax, or is comprised of either 1) simple + Parses plaintext which matches date specifiers or + arxiv_identifier syntax, or is comprised of either 1) simple terminal (no parentheses) or 2) a parenthesized SimpleValue. - For example, "e(+)" will be parsed in two steps, first, "e" token will be recognized and then "(+)", as a - parenthesized SimpleValue. + For example, "e(+)" will be parsed in two steps, first, "e" + token will be recognized and then "(+)", as a parenthesized + SimpleValue. 
""" found = False # Attempt to parse date specifier match = cls.date_specifiers_regex.match(text) if match: - remaining_text, token, found = text[len(match.group(0)):], match.group(0), True + remaining_text, token, found = ( + text[len(match.group(0)) :], + match.group(0), + True, + ) else: # Attempt to parse a terminal token remaining_text, token = cls.parse_terminal_token(parser, text) - if type(token) != SyntaxError: + if not isinstance(token, SyntaxError): found = True else: # Attempt to parse a terminal with parentheses try: - # Enable parsing a parenthesized terminal so that we can accept {+, -, |} as terminals. + # Enable parsing a parenthesized terminal so that + # we can accept {+, -, |} as terminals. parser._parsing_parenthesized_terminal = True - remaining_text, token = parser.parse(text, cls.parenthesized_token_grammar, pos) + remaining_text, token = parser.parse( + text, cls.parenthesized_token_grammar, pos + ) found = True except SyntaxError: @@ -368,7 +430,9 @@ class SimpleValueWithColonUnit(SimpleValueUnit): class SimpleDateValueUnit(LeafRule): grammar = re.compile(r"[\d*\-\.\/]{4,10}(?=($|\s|\)))", re.UNICODE) - date_specifiers_regex = re.compile(r"({})\s*(-\s*\d+)?".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE) + date_specifiers_regex = re.compile( + r"({})\s*(-\s*\d+)?".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE + ) string_month_date_regex = re.compile(MONTH_REGEX, re.IGNORECASE) def __init__(self, args): @@ -389,7 +453,9 @@ def _parse_date_with_string_month(cls, text): remaining_text = text[date_end_index:] result = remaining_text, found_date_string except StopIteration: - result = text, SyntaxError("expecting match on " + repr(cls.string_month_date_regex.pattern)) + result = text, SyntaxError( + "expecting match on " + repr(cls.string_month_date_regex.pattern) + ) return result @classmethod @@ -399,7 +465,7 @@ def parse(cls, parser, text, pos): match = cls.date_specifiers_regex.match(text) string_month_date_match = cls.string_month_date_regex.match(text) if match: - remaining_text, token = text[len(match.group(0)):], match.group(0) + remaining_text, token = text[len(match.group(0)) :], match.group(0) elif string_month_date_match: remaining_text, token = cls._parse_date_with_string_month(text) else: @@ -411,7 +477,7 @@ def parse(cls, parser, text, pos): raise except ValueError: pass - if token and type(token) != SyntaxError: + if token and not isinstance(token, SyntaxError): result = remaining_text, cls(token) else: result = text, SyntaxError("expecting match on " + cls.__name__) @@ -431,32 +497,49 @@ def __init__(self, values): E.g. title top cross section, or title Si-28(p(pol.), n(pol.)). """ + @staticmethod - def unconsume_and_reconstruct_input(remaining_text, recognized_tokens, complex_value_idx): - """Reconstruct input in case of consuming a keyword query or a value query with ComplexValue as value. - - Un-consuming at most 3 elements and specifically (Keyword,) Whitespace and ComplexValue, while also - reconstructing parser's input text. - - Example: - Given this query "author foo t 'bar'", r would be: - r = [SimpleValueUnit("foo"), Whitespace(" "), SimpleValueUnit("t"), Whitespace(" "), - SimpleValueUnit("'bar'")] - thus after this method, r would be [SimpleValueUnit("foo"), Whitespace(" ")], while initial text will - have been reconstructed as "t 'bar' rest_of_the_text". 
+ def unconsume_and_reconstruct_input( + remaining_text, recognized_tokens, complex_value_idx + ): + """Reconstruct input in case of consuming a keyword query or a value + query with ComplexValue as value. + + Un-consuming at most 3 elements and specifically (Keyword,) + Whitespace and ComplexValue, while also reconstructing parser's + input text. + + Example: Given this query "author foo t 'bar'", r would be: + r = [SimpleValueUnit("foo"), Whitespace(" "), + SimpleValueUnit("t"), Whitespace(" "), + SimpleValueUnit("'bar'")] thus after this method, r would be + [SimpleValueUnit("foo"), Whitespace(" ")], while initial text + will have been reconstructed as "t 'bar' rest_of_the_text". """ - # Default slicing index: i.e. at most 3 elements will be unconsumed, Keyword, Whitespace and ComplexValue. + # Default slicing index: i.e. at most 3 elements will be unconsumed, Keyword, + # Whitespace and ComplexValue. slicing_start_idx = 2 - # Check whether the 3rd element from the end is an InspireKeyword. If not, a Value query with ComplexValue - # was consumed. - if not INSPIRE_PARSER_KEYWORDS.get(recognized_tokens[complex_value_idx - slicing_start_idx].value, None): + # Check whether the 3rd element from the end is an InspireKeyword. If not, + # a Value query with ComplexValue was consumed. + if not INSPIRE_PARSER_KEYWORDS.get( + recognized_tokens[complex_value_idx - slicing_start_idx].value, None + ): slicing_start_idx = 1 - reconstructed_terminals = recognized_tokens[:complex_value_idx - slicing_start_idx] + reconstructed_terminals = recognized_tokens[ + : complex_value_idx - slicing_start_idx + ] reconstructed_text = '{} {}'.format( - ''.join([token.value for token in recognized_tokens[complex_value_idx - slicing_start_idx:]]), - remaining_text + ''.join( + [ + token.value + for token in recognized_tokens[ + complex_value_idx - slicing_start_idx : + ] + ] + ), + remaining_text, ) return reconstructed_text, reconstructed_terminals @@ -465,15 +548,22 @@ def parse(cls, parser, text, pos): try: remaining_text, recognized_tokens = parser.parse(text, cls.grammar) - # Covering a case of implicit-and when one of the SimpleValue tokens is a ComplexValue. - # This means we either have a KeywordQuery or a ValueQuery with a ComplexValue. - # E.g. "author foo t 'bar'", since 'bar' is a ComplexValue, then the previous token is a keyword. + # Covering a case of implicit-and when one of the SimpleValue tokens + # is a ComplexValue. + # This means we either have a KeywordQuery or a ValueQuery + # with a ComplexValue. + # E.g. "author foo t 'bar'", since 'bar' is a ComplexValue, + # then the previous token is a keyword. # This means we have consumed a KeywordQuery (due to 'and' missing). - # Same goes for "author foo 'bar'", but in this case we have a ValueQuery with a ComplexValue. + # Same goes for "author foo 'bar'", but in this case we have a ValueQuery + # with a ComplexValue. found_complex_value = False for idx, token in enumerate(recognized_tokens): if ComplexValue.regex.match(token.value): - reconstructed_text, reconstructed_terminals = cls.unconsume_and_reconstruct_input( + ( + reconstructed_text, + reconstructed_terminals, + ) = cls.unconsume_and_reconstruct_input( remaining_text, recognized_tokens, idx ) found_complex_value = True @@ -495,15 +585,27 @@ class SimpleValue(SimpleValueGeneric): E.g. title top cross section, or title Si-28(p(pol.), n(pol.)). 
""" - grammar = contiguous([SimpleValueUnit, SimpleValueWithColonUnit], maybe_some((optional(Whitespace), some(SimpleValueUnit)))) + + grammar = contiguous( + [SimpleValueUnit, SimpleValueWithColonUnit], + maybe_some((optional(Whitespace), some(SimpleValueUnit))), + ) class SimpleDateValue(SimpleValueGeneric): grammar = contiguous(SimpleDateValueUnit, optional(Whitespace)) -SimpleValueUnit.parenthesized_token_grammar = (re.compile(r"\("), SimpleValue, re.compile(r"\)")) -SimpleDateValueUnit.parenthesized_token_grammar = (re.compile(r"\("), SimpleDateValue, re.compile(r"\)")) +SimpleValueUnit.parenthesized_token_grammar = ( + re.compile(r"\("), + SimpleValue, + re.compile(r"\)"), +) +SimpleDateValueUnit.parenthesized_token_grammar = ( + re.compile(r"\("), + SimpleDateValue, + re.compile(r"\)"), +) # ################################################## # @@ -511,16 +613,19 @@ class SimpleDateValue(SimpleValueGeneric): # ################################################## # class SimpleValueNegation(UnaryRule): """Negation accepting only SimpleValues.""" + grammar = omit(Not), attr('op', SimpleValue) class SimpleDateValueNegation(UnaryRule): """Negation accepting only SimpleValues.""" + grammar = omit(Not), attr('op', SimpleDateValue) class SimpleValueBooleanQuery(BooleanRule): - """For supporting queries like author ellis or smith and not Vanderhaeghen.""" + """For supporting queries like author ellis or smith and not + Vanderhaeghen.""" @classmethod def parse(cls, parser, text, pos): @@ -531,14 +636,16 @@ def parse(cls, parser, text, pos): text_after_left_op, left_operand = parser.parse(text, cls.grammar[0]) # Parse boolean operators - text_after_bool_op, operator = parser.parse(text_after_left_op, cls.grammar[1]) + text_after_bool_op, operator = parser.parse( + text_after_left_op, cls.grammar[1] + ) if not operator: # Implicit AND at terminals level operator = And(BooleanOperator.AND) # Parse right operand. # We don't want to eagerly recognize anything else other than a SimpleValue. - # So we attempt to recognize the more specific rules, and if we do, then we need to stop identifying this - # rule. + # So we attempt to recognize the more specific rules, and if we do, + # then we need to stop identifying this rule. parser.parse( text_after_bool_op, [ @@ -548,22 +655,24 @@ def parse(cls, parser, text, pos): SpiresDateKeywordQuery, InvenioKeywordQuery, SpiresKeywordQuery, - ] - ), + ], + ), [ RangeOp, GreaterEqualOp, LessEqualOp, GreaterThanOp, LessThanOp, - ComplexValue - ] - ] + ComplexValue, + ], + ], ) # Identified something other than a SimpleValue, stop parsing this rule. 
- result = text, SyntaxError("expected simple value related rule as right operand of a " + - cls.__name__) + result = text, SyntaxError( + "expected simple value related rule as right operand of a " + + cls.__name__ + ) except SyntaxError as e: result = text, e @@ -571,13 +680,14 @@ def parse(cls, parser, text, pos): if left_operand and operator: # Attempt to parse a right operand try: - remaining_text, right_operand = parser.parse(text_after_bool_op, cls.grammar[2]) + remaining_text, right_operand = parser.parse( + text_after_bool_op, cls.grammar[2] + ) result = remaining_text, SimpleValueBooleanQuery( - left_operand, - bool_op=operator, - right=right_operand + left_operand, bool_op=operator, right=right_operand ) - except SyntaxError as e: # Actual failure of parsing boolean query at terminals level + # Actual failure of parsing boolean query at terminals level + except SyntaxError as e: return text, e return result @@ -591,9 +701,7 @@ def parse(cls, parser, text, pos): SimpleDateValueNegation, SimpleDateValue, ], - [And, Or, None], - # Right operand options [ SimpleValueBooleanQuery, @@ -601,13 +709,19 @@ def parse(cls, parser, text, pos): SimpleValue, SimpleDateValueNegation, SimpleDateValue, - ] + ], ) class ParenthesizedSimpleValues(UnaryRule): - """Parses parenthesized simple values along with boolean operations on them.""" - grammar = omit(Literal("(")), [SimpleValueBooleanQuery, SimpleValueNegation, SimpleValue], omit(Literal(")")) + """Parses parenthesized simple values along with boolean operations on + them.""" + + grammar = ( + omit(Literal("(")), + [SimpleValueBooleanQuery, SimpleValueNegation, SimpleValue], + omit(Literal(")")), + ) @classmethod def parse(cls, parser, text, pos): @@ -620,21 +734,26 @@ def parse(cls, parser, text, pos): return text, e finally: parser._parsing_parenthesized_simple_values_expression = False + + # ######################################## # class ComplexValue(LeafRule): - """Accepting value with either single/double quotes or a regex value (/^.../$). + """Accepting value with either single/double quotes or a regex value + (/^.../$). - These values have special and different meaning for the later phases of parsing: - * Single quotes: partial text matching (text is analyzed before searched) - * Double quotes: exact text matching - * Regex: regex searches + These values have special and different meaning for the later phases + of parsing: * Single quotes: partial text matching (text is + analyzed before searched) * Double quotes: exact text matching * + Regex: regex searches E.g. t 'Millisecond pulsar velocities'. - This makes no difference for the parser and will be handled at a later parsing phase. + This makes no difference for the parser and will be handled at a + later parsing phase. """ + EXACT_VALUE_TOKEN = '"' PARTIAL_VALUE_TOKEN = '\'' REGEX_VALUE_TOKEN = '/' @@ -652,7 +771,10 @@ class GreaterThanOp(UnaryRule): Supports queries like author-count > 2000 or date after 10-2000. """ - grammar = omit(re.compile(r"after|>", re.IGNORECASE)), attr('op', [SimpleDateValue, SimpleValue]) + + grammar = omit(re.compile(r"after|>", re.IGNORECASE)), attr( + 'op', [SimpleDateValue, SimpleValue] + ) class GreaterEqualOp(UnaryRule): @@ -660,11 +782,16 @@ class GreaterEqualOp(UnaryRule): Supports queries like date >= 10-2000 or topcite 200+. 
""" + grammar = [ (omit(Literal(">=")), attr('op', [SimpleDateValue, SimpleValue])), - # Accept a number or numbers that are separated with (/ or -) followed by a "-" which should be - # followed by \s or ) or end of input so that you don't accept a value like 1-e. - (attr('op', re.compile(r"\d+([/-]\d+)*(?=\+)")), omit(re.compile(r'\+(?=\s|\)|$)'))), + # Accept a number or numbers that are separated with (/ or -) + # followed by a "-" which should be followed by \s or ) or + # end of input so that you don't accept a value like 1-e. + ( + attr('op', re.compile(r"\d+([/-]\d+)*(?=\+)")), + omit(re.compile(r'\+(?=\s|\)|$)')), + ), ] @@ -673,7 +800,10 @@ class LessThanOp(UnaryRule): Supports queries like author-count < 100 or date before 1984. """ - grammar = omit(re.compile(r"before|<", re.IGNORECASE)), attr('op', [SimpleDateValue, SimpleValue]) + + grammar = omit(re.compile(r"before|<", re.IGNORECASE)), attr( + 'op', [SimpleDateValue, SimpleValue] + ) class LessEqualOp(UnaryRule): @@ -684,8 +814,9 @@ class LessEqualOp(UnaryRule): grammar = [ (omit(Literal("<=")), attr("op", [SimpleDateValue, SimpleValue])), - # Accept a number or numbers that are separated with (/ or -) followed by a "-" which should be - # followed by \s or ) or end of input so that you don't accept a value like 1-e. + # Accept a number or numbers that are separated with (/ or -) followed + # by a "-" which should befollowed by \s or ) or end of input + # so that you don't accept a value like 1-e. ( attr("op", re.compile(r"\d+([/-]\d+)*(?=-)")), omit(re.compile(r"-(?=\s|\)|$)")), @@ -696,16 +827,17 @@ class LessEqualOp(UnaryRule): class RangeOp(BinaryRule): """Range operator mixing any type of values. - E.g. muon decay year:1983->1992 - author:"Ellis, J"->"Ellis, Qqq" - author:"Ellis, J"->Ellis, M + E.g. muon decay year:1983->1992 author:"Ellis, + J"->"Ellis, Qqq" author:"Ellis, J"->Ellis, M The non symmetrical type of values will be handled at a later phase. """ - grammar = \ - attr('left', [ComplexValue, SimpleRangeValue]), \ - omit(Literal("->")), \ - attr('right', [ComplexValue, SimpleRangeValue]) + + grammar = ( + attr('left', [ComplexValue, SimpleRangeValue]), + omit(Literal("->")), + attr('right', [ComplexValue, SimpleRangeValue]), + ) class Value(UnaryRule): @@ -713,22 +845,26 @@ class Value(UnaryRule): Serves as an encapsulation of the listed rules. """ - grammar = attr('op', [ - (optional(omit(Literal("="))), RangeOp), - GreaterEqualOp, - LessEqualOp, - GreaterThanOp, - LessThanOp, - ( - optional(omit(Literal("="))), - [ - ComplexValue, - ParenthesizedSimpleValues, - SimpleValueBooleanQuery, - SimpleValue - ] - ) - ]) + + grammar = attr( + 'op', + [ + (optional(omit(Literal("="))), RangeOp), + GreaterEqualOp, + LessEqualOp, + GreaterThanOp, + LessThanOp, + ( + optional(omit(Literal("="))), + [ + ComplexValue, + ParenthesizedSimpleValues, + SimpleValueBooleanQuery, + SimpleValue, + ], + ), + ], + ) class DateValue(UnaryRule): @@ -736,45 +872,52 @@ class DateValue(UnaryRule): Serves as an encapsulation of the listed rules. 
""" - grammar = attr('op', [ - (optional(omit(Literal("="))), RangeOp), - GreaterEqualOp, - LessEqualOp, - GreaterThanOp, - LessThanOp, - ( - optional(omit(Literal("="))), - [ - ComplexValue, - SimpleValueBooleanQuery, - SimpleDateValue - ] - ) - ]) + + grammar = attr( + 'op', + [ + (optional(omit(Literal("="))), RangeOp), + GreaterEqualOp, + LessEqualOp, + GreaterThanOp, + LessThanOp, + ( + optional(omit(Literal("="))), + [ComplexValue, SimpleValueBooleanQuery, SimpleDateValue], + ), + ], + ) + + ######################## class InvenioKeywordQuery(BinaryRule): """Keyword queries with colon separator (i.e. Invenio style). - There needs to be a distinction between Invenio and SPIRES keyword queries, so as the parser is able to recognize - any terminal as keyword for the former ones. + There needs to be a distinction between Invenio and SPIRES keyword + queries, so as the parser is able to recognize any terminal as + keyword for the former ones. - Note: - E.g. author: ellis, title: boson, or unknown_keyword: foo. + Note: E.g. author: ellis, title: boson, or unknown_keyword: foo. """ - grammar = attr('left', [[InspireKeyword, InspireDateKeyword], re.compile(r"[^\s:]+")]), \ - omit(':'), \ - attr('right', Value) + + grammar = ( + attr('left', [[InspireKeyword, InspireDateKeyword], re.compile(r"[^\s:]+")]), + omit(':'), + attr('right', Value), + ) class SpiresKeywordQuery(BinaryRule): """Keyword queries with space separator (i.e. Spires style).""" + grammar = attr('left', InspireKeyword), attr('right', Value) class SpiresDateKeywordQuery(BinaryRule): """Keyword queries with pace separator (i.e. Spires style).""" + grammar = attr('left', InspireDateKeyword), attr('right', DateValue) @@ -783,13 +926,17 @@ class SimpleQuery(UnaryRule): These are comprised of metadata queries, keywords and value queries. """ - grammar = attr('op', [ - InvenioKeywordQuery, - SpiresDateKeywordQuery, - SpiresKeywordQuery, - Value, - DateValue, - ]) + + grammar = attr( + 'op', + [ + InvenioKeywordQuery, + SpiresDateKeywordQuery, + SpiresKeywordQuery, + Value, + DateValue, + ], + ) class Statement(UnaryRule): @@ -797,26 +944,30 @@ class Statement(UnaryRule): Supports queries chaining, see its grammar for more information. """ + pass class Expression(UnaryRule): """A generic query expression. - Serves as a more restrictive rule than Statement. - This is useful for eliminating left recursion in the grammar (requirement for PEGs) when used in binary queries as - left hand side production rule. + Serves as a more restrictive rule than Statement. This is useful for + eliminating left recursion in the grammar (requirement for PEGs) + when used in binary queries as left hand side production rule. """ + pass class NotQuery(UnaryRule): """Negation query.""" + grammar = omit(Not), attr('op', Expression) class ParenthesizedQuery(UnaryRule): """Parenthesized query for denoting precedence.""" + grammar = omit(Literal('(')), attr('op', Statement), omit(Literal(')')) @@ -825,37 +976,45 @@ class NestedKeywordQuery(BinaryRule): E.g. citedby:author:hui and refersto:author:witten """ + pass -Expression.grammar = attr('op', [ - NotQuery, - NestedKeywordQuery, - ParenthesizedQuery, - SimpleQuery, -]) +Expression.grammar = attr( + 'op', + [ + NotQuery, + NestedKeywordQuery, + ParenthesizedQuery, + SimpleQuery, + ], +) -NestedKeywordQuery.grammar = \ - attr('left', [ - # Most specific regex must be higher. 
- re.compile(r'citedbyexcludingselfcites', re.IGNORECASE), - re.compile(r'citedbyx', re.IGNORECASE), - re.compile(r'citedby', re.IGNORECASE), - re.compile(r'referstoexcludingselfcites', re.IGNORECASE), - re.compile(r'referstox', re.IGNORECASE), - re.compile(r'refersto', re.IGNORECASE), - ]), \ - optional(omit(":")), \ - attr('right', Expression) +NestedKeywordQuery.grammar = ( + attr( + 'left', + [ + # Most specific regex must be higher. + re.compile(r'citedbyexcludingselfcites', re.IGNORECASE), + re.compile(r'citedbyx', re.IGNORECASE), + re.compile(r'citedby', re.IGNORECASE), + re.compile(r'referstoexcludingselfcites', re.IGNORECASE), + re.compile(r'referstox', re.IGNORECASE), + re.compile(r'refersto', re.IGNORECASE), + ], + ), + optional(omit(":")), + attr('right', Expression), +) class BooleanQuery(BooleanRule): - """Represents boolean query as a binary rule. + """Represents boolean query as a binary rule.""" - """ grammar = Expression, [And, Or, None], Statement + # ######################## @@ -864,7 +1023,9 @@ class BooleanQuery(BooleanRule): class MalformedQueryWords(ListRule): - """Represents queries that weren't recognized by the main parsing branch of Statements.""" + """Represents queries that weren't recognized by the main parsing branch of + Statements.""" + grammar = some(re.compile(r"[^\s]+", re.UNICODE)) def __init__(self, children): @@ -884,13 +1045,15 @@ def __repr__(self): class Query(ListRule): """The entry-point for the grammar. - Find keyword is ignored as the current grammar is an augmentation of SPIRES and Invenio style syntaxes. - It only serves for backward compatibility with SPIRES syntax. + Find keyword is ignored as the current grammar is an augmentation of + SPIRES and Invenio style syntaxes. It only serves for backward + compatibility with SPIRES syntax. """ + grammar = [ ( omit(optional(re.compile(r"(find|fin|fi|f)\s", re.IGNORECASE))), - (Statement, maybe_some(MalformedQueryWords)) + (Statement, maybe_some(MalformedQueryWords)), ), MalformedQueryWords, EmptyQuery, diff --git a/inspire_query_parser/parsing_driver.py b/inspire_query_parser/parsing_driver.py index 4c10299..f122211 100644 --- a/inspire_query_parser/parsing_driver.py +++ b/inspire_query_parser/parsing_driver.py @@ -19,7 +19,6 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - """This module provides the public API of INSPIRE query parser.""" from __future__ import absolute_import, print_function, unicode_literals @@ -31,32 +30,36 @@ from inspire_query_parser.parser import Query from inspire_query_parser.stateful_pypeg_parser import StatefulParser from inspire_query_parser.utils.format_parse_tree import emit_tree_format -from inspire_query_parser.visitors.elastic_search_visitor import \ - ElasticSearchVisitor -from inspire_query_parser.visitors.restructuring_visitor import \ - RestructuringVisitor +from inspire_query_parser.visitors.elastic_search_visitor import ElasticSearchVisitor +from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor logger = logging.getLogger(__name__) def parse_query(query_str): - """ - Drives the whole logic, by parsing, restructuring and finally, generating an ElasticSearch query. + """Drives the whole logic, by parsing, restructuring and finally, + generating an ElasticSearch query. 
- Args: - query_str (six.text_types): the given query to be translated to an ElasticSearch query + Args: query_str (six.text_types): the given query to be + translated to an ElasticSearch query - Returns: - six.text_types: Return an ElasticSearch query. + Returns: six.text_types: Return an ElasticSearch query. - Notes: - In case there's an error, an ElasticSearch `multi_match` query is generated with its `query` value, being the - query_str argument. + Notes: In case there's an error, an ElasticSearch `multi_match` + query is generated with its `query` value, being the query_str + argument. """ + def _generate_match_all_fields_query(): # Strip colon character (special character for ES) stripped_query_str = ' '.join(query_str.replace(':', ' ').split()) - return {'multi_match': {'query': stripped_query_str, 'fields': ['_all'], 'zero_terms_query': 'all'}} + return { + 'multi_match': { + 'query': stripped_query_str, + 'fields': ['_all'], + 'zero_terms_query': 'all', + } + } if not isinstance(query_str, six.text_type): query_str = six.text_type(query_str.decode('utf-8')) @@ -71,8 +74,13 @@ def _generate_match_all_fields_query(): unrecognized_text, parse_tree = parser.parse(query_str, Query) if unrecognized_text: # Usually, should never happen. - msg = 'Parser returned unrecognized text: "' + unrecognized_text + \ - '" for query: "' + query_str + '".' + msg = ( + 'Parser returned unrecognized text: "' + + unrecognized_text + + '" for query: "' + + query_str + + '".' + ) if query_str == unrecognized_text and parse_tree is None: # Didn't recognize anything. @@ -83,18 +91,26 @@ def _generate_match_all_fields_query(): logger.warn(msg) except SyntaxError as e: - logger.warn('Parser syntax error (' + six.text_type(e) + ') with query: "' + query_str + - '". Continuing with a match_all with the given query.') + logger.warn( + 'Parser syntax error (' + + six.text_type(e) + + ') with query: "' + + query_str + + '". Continuing with a match_all with the given query.' + ) return _generate_match_all_fields_query() - # Try-Catch-all exceptions for visitors, so that search functionality never fails for the user. + # Try-Catch-all exceptions for visitors, so that search functionality + # never fails for the user. try: restructured_parse_tree = parse_tree.accept(rst_visitor) logger.debug('Parse tree: \n' + emit_tree_format(restructured_parse_tree)) except Exception as e: logger.exception( - RestructuringVisitor.__name__ + " crashed" + (": " + six.text_type(e) + ".") if six.text_type(e) else '.' + RestructuringVisitor.__name__ + " crashed" + (": " + six.text_type(e) + ".") + if six.text_type(e) + else '.' ) return _generate_match_all_fields_query() @@ -102,12 +118,15 @@ def _generate_match_all_fields_query(): es_query = restructured_parse_tree.accept(es_visitor) except Exception as e: logger.exception( - ElasticSearchVisitor.__name__ + " crashed" + (": " + six.text_type(e) + ".") if six.text_type(e) else '.' + ElasticSearchVisitor.__name__ + " crashed" + (": " + six.text_type(e) + ".") + if six.text_type(e) + else '.' ) return _generate_match_all_fields_query() if not es_query: - # Case where an empty query was generated (i.e. date query with malformed date, e.g. "d < 200"). + # Case where an empty query was generated (i.e. date query with malformed + # date, e.g. "d < 200"). 
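+        # For instance, a malformed date query like ``d < 200`` falls through
+        # to this branch; _generate_match_all_fields_query above then yields
+        # a query of this shape (shown for illustration):
+        #
+        #   {'multi_match': {'query': 'd < 200',
+        #                    'fields': ['_all'],
+        #                    'zero_terms_query': 'all'}}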
return _generate_match_all_fields_query() return es_query diff --git a/inspire_query_parser/stateful_pypeg_parser.py b/inspire_query_parser/stateful_pypeg_parser.py index 02fbe9c..d89ac48 100644 --- a/inspire_query_parser/stateful_pypeg_parser.py +++ b/inspire_query_parser/stateful_pypeg_parser.py @@ -26,19 +26,22 @@ class StatefulParser(Parser): """Defines a stateful parser for encapsulating parsing flags functionality. - Attributes: - _parsing_parenthesized_terminal (bool): - Signifies whether the parser is trying to identify a parenthesized terminal. Used for disabling the - terminals parsing related check "stop on DSL keyword", for allowing to parse symbols such as "+", "-" which - are also DSL keywords ('and' and 'not' respectively). + Attributes: _parsing_parenthesized_terminal (bool): + Signifies whether the parser is trying to identify a parenthesized + terminal. Used for disabling the terminals parsing related + check "stop on DSL keyword", for allowing to parse symbols such as + "+", "-" which are also DSL keywords ('and' and 'not' + respectively). - _parsing_parenthesized_simple_values_expression (bool): - Signifies whether we are parsing a parenthesized simple values expression. Used for disabling the simple - values parsing related check "stop on INSPIRE keyword", for allowing parsing more expressions and not - restrict the input accepted by the parser. + _parsing_parenthesized_simple_values_expression (bool): + Signifies whether we are parsing a parenthesized simple values + expression. Used for disabling the simple values parsing related + check "stop on INSPIRE keyword", for allowing parsing more + expressions and not restrict the input accepted by the parser. - _parsing_texkey_expression (bool): - Signifies whether we are parsing a `texkey` expression which has special value in which we must accept ':'. + _parsing_texkey_expression (bool): Signifies whether we are + parsing a `texkey` expression which has special value in which we + must accept ':'. """ def __init__(self): diff --git a/inspire_query_parser/utils/format_parse_tree.py b/inspire_query_parser/utils/format_parse_tree.py index dde7db0..4784716 100644 --- a/inspire_query_parser/utils/format_parse_tree.py +++ b/inspire_query_parser/utils/format_parse_tree.py @@ -24,22 +24,20 @@ import six +from inspire_query_parser.ast import BinaryOp, Leaf, ListOp, UnaryOp from inspire_query_parser.parser import BooleanRule -from ..ast import BinaryOp, Leaf, ListOp, UnaryOp - INDENTATION = 4 def emit_tree_format(tree, verbose=False): """Returns a tree representation of a parse tree. 
-    Arguments:
-        tree: the parse tree whose tree representation is to be generated
-        verbose (bool): if True prints the parse tree to be formatted
+    Arguments:
+        tree: the parse tree whose tree representation is to be generated.
+        verbose (bool): if True, prints the parse tree to be formatted.
 
-    Returns:
-        str: tree-like representation of the parse tree
+    Returns:
+        str: tree-like representation of the parse tree.
     """
     if verbose:
         print("Converting: " + repr(tree))
@@ -65,14 +63,19 @@ def __recursive_formatter(node, level=-INDENTATION):
     new_level = INDENTATION + level
 
     if isinstance(node, Leaf):
-        value = "" if not repr(node.value) else node.__class__.__name__ \
-            + " {" + (node.value if node.value else "") + "}"
+        value = (
+            ""
+            if not repr(node.value)
+            else node.__class__.__name__
+            + " {"
+            + (node.value if node.value else "")
+            + "}"
+        )
 
         ret_str = __emit_symbol_at_level_str(value, new_level) if value != "" else ""
 
     elif isinstance(node, six.text_type):
-        value = "" if not repr(node) or repr(node) == "None" \
-            else "Text {" + node + "}"
+        value = "" if not repr(node) or repr(node) == "None" else "Text {" + node + "}"
 
         ret_str = __emit_symbol_at_level_str(value, new_level) if value != "" else ""
 
@@ -88,7 +91,7 @@ def __recursive_formatter(node, level=-INDENTATION):
         if isinstance(node, BooleanRule):
             ret_str = __emit_symbol_at_level_str(
                 node.__class__.__name__ + " {" + str(node.bool_op) + "}",
-                new_level
+                new_level,
             )
     except AttributeError:
         pass
diff --git a/inspire_query_parser/utils/visitor_utils.py b/inspire_query_parser/utils/visitor_utils.py
index 97902a3..a648e39 100644
--- a/inspire_query_parser/utils/visitor_utils.py
+++ b/inspire_query_parser/utils/visitor_utils.py
@@ -22,35 +22,39 @@
 
 from __future__ import absolute_import, unicode_literals
 
-from datetime import date
-
+import contextlib
 import json
-from dateutil.relativedelta import relativedelta
-from dateutil.parser import parse
 import re
-from unidecode import unidecode
-
-from inspire_utils.name import ParsedName
+from datetime import date
 
+from dateutil.parser import parse
+from dateutil.relativedelta import relativedelta
 from inspire_utils.date import PartialDate
+from inspire_utils.name import ParsedName
+from unidecode import unidecode
 
 from inspire_query_parser.ast import GenericValue
-from inspire_query_parser.config import (DATE_LAST_MONTH_REGEX_PATTERN,
-                                         DATE_SPECIFIERS_COLLECTION,
-                                         DATE_THIS_MONTH_REGEX_PATTERN,
-                                         DATE_TODAY_REGEX_PATTERN,
-                                         DATE_YESTERDAY_REGEX_PATTERN)
-
-
-NAME_INITIAL_FOLLOWED_BY_FIRSTNAME_WITHOUT_SPACE = re.compile(r"(\.[a-z])", re.IGNORECASE)
-QUERY_STRING_QUERY_SPECIAL_CHARACTERS = re.compile(r'\/|\+|\-|\=|\&\&|\|\||\>|\<|\!|\(|\)|\{|\}|\[|\]|\^|\"|\~|\?|\:|\\')
+from inspire_query_parser.config import (
+    DATE_LAST_MONTH_REGEX_PATTERN,
+    DATE_SPECIFIERS_COLLECTION,
+    DATE_THIS_MONTH_REGEX_PATTERN,
+    DATE_TODAY_REGEX_PATTERN,
+    DATE_YESTERDAY_REGEX_PATTERN,
+)
+
+NAME_INITIAL_FOLLOWED_BY_FIRSTNAME_WITHOUT_SPACE = re.compile(
+    r"(\.[a-z])", re.IGNORECASE
+)
+QUERY_STRING_QUERY_SPECIAL_CHARACTERS = re.compile(
+    r'\/|\+|\-|\=|\&\&|\|\||\>|\<|\!|\(|\)|\{|\}|\[|\]|\^|\"|\~|\?|\:|\\'
+)
 
 
 def retokenize_first_names(names):
     """Handle corner cases where the initial and firstname have no space.
 
-    Example:
-        For queries ``J.David`` we be split into ``J`` and ``David``.
+    Example: A query like ``J.David`` will be split into ``J`` and
+    ``David``. 
""" names_filtered = [] for name in names: @@ -70,24 +74,24 @@ def is_initial_of_a_name(name_part): def author_name_contains_fullnames(author_name): - """Recognizes whether the name contains full name parts and not initials or only lastname. + """Recognizes whether the name contains full name parts and not initials or + only lastname. - Returns: - bool: True if name has only full name parts, e.g. 'Ellis John', False otherwise. So for example, False is - returned for 'Ellis, J.' or 'Ellis'. + Returns: bool: True if name has only full name parts, e.g. + 'Ellis John', False otherwise. So for example, False is + returned for 'Ellis, J.' or 'Ellis'. """ parsed_name = ParsedName(author_name) - if len(parsed_name) == 1: - return False - elif any([is_initial_of_a_name(name_part) for name_part in parsed_name]): - return False - - return True + return not ( + len(parsed_name) == 1 + or any([is_initial_of_a_name(name_part) for name_part in parsed_name]) + ) def _name_variation_has_only_initials(name): """Detects whether the name variation consists only from initials.""" + def _is_initial(name_variation): return len(name_variation) == 1 or u'.' in name_variation @@ -99,42 +103,45 @@ def _is_initial(name_variation): def generate_minimal_name_variations(author_name): """Generate a small number of name variations. - Notes: - Unidecodes the name, so that we use its transliterated version, since this is how the field is being indexed. - - For names with more than one part, {lastname} x {non lastnames, non lastnames initial} variations. - Additionally, it generates the swapped version of those, for supporting queries like ``Mele Salvatore`` which - ``ParsedName`` parses as lastname: Salvatore and firstname: Mele. So in those cases, we need to generate both - ``Mele, Salvatore`` and ``Mele, S``. - - Wherever, the '-' is replaced by ' ', it's done because it's the way the name variations are being index, thus - we want our minimal name variations to be generated identically. This has to be done after the creation of - ParsedName, otherwise the name is parsed differently. E.g. 'Caro-Estevez' as is, it's a lastname, if we replace - the '-' with ' ', then it's a firstname and lastname. + Notes: Unidecodes the name, so that we use its transliterated + version, since this is how the field is being indexed. + + For names with more than one part, {lastname} x {non lastnames, non + lastnames initial} variations. Additionally, it generates the + swapped version of those, for supporting queries like ``Mele + Salvatore`` which ``ParsedName`` parses as lastname: Salvatore and + firstname: Mele. So in those cases, we need to generate both ``Mele, + Salvatore`` and ``Mele, S``. + + Wherever, the '-' is replaced by ' ', it's done because it's the way + the name variations are being index, thus we want our minimal name + variations to be generated identically. This has to be done after + the creation of ParsedName, otherwise the name is parsed + differently. E.g. 'Caro-Estevez' as is, it's a lastname, if we + replace the '-' with ' ', then it's a firstname and lastname. """ parsed_name = ParsedName.loads(unidecode(author_name)) if len(parsed_name) > 1: lastnames = parsed_name.last.replace('-', ' ') - non_lastnames = ' '.join( - parsed_name.first_list + parsed_name.suffix_list - ) + non_lastnames = ' '.join(parsed_name.first_list + parsed_name.suffix_list) # Strip extra whitespace added if any of middle_list and suffix_list are empty. 
non_lastnames = non_lastnames.strip().replace('-', ' ') # Adding into a set first, so as to drop identical name variations. - return list({ - name_variation.lower() - for name_variation - in [ - lastnames + ' ' + non_lastnames, - lastnames + ' ' + non_lastnames[0], - non_lastnames + ' ' + lastnames, - non_lastnames + ' ' + lastnames[0], - ] - if not _name_variation_has_only_initials(name_variation) - }) + return list( + { + name_variation.lower() + for name_variation in [ + lastnames + ' ' + non_lastnames, + lastnames + ' ' + non_lastnames[0], + non_lastnames + ' ' + lastnames, + non_lastnames + ' ' + lastnames[0], + ] + if not _name_variation_has_only_initials(name_variation) + } + ) else: return [parsed_name.dumps().replace('-', ' ').lower()] @@ -142,7 +149,8 @@ def generate_minimal_name_variations(author_name): # #### Date specifiers related utils #### ANY_PREFIX_AND_A_NUMBER = re.compile('(.+)(\d+)') -# ES query constants that provide rounding of dates on query time, according to the date "resolution" the user gave. +# ES query constants that provide rounding of dates on query time, according to the +# date "resolution" the user gave. # More here: https://www.elastic.co/guide/en/elasticsearch/reference/6.1/common-options.html#date-math ES_DATE_MATH_ROUNDING_YEAR = "||/y" ES_DATE_MATH_ROUNDING_MONTH = "||/M" @@ -152,7 +160,9 @@ def generate_minimal_name_variations(author_name): def _compile_date_regexes(date_specifier_patterns): date_specifier_regexes = {} for date_specifier in date_specifier_patterns: - date_specifier_regexes[date_specifier] = re.compile(date_specifier, re.IGNORECASE) + date_specifier_regexes[date_specifier] = re.compile( + date_specifier, re.IGNORECASE + ) return date_specifier_regexes @@ -163,35 +173,40 @@ def _compile_date_regexes(date_specifier_patterns): def register_date_conversion_handler(date_specifier_patterns): """Decorator for registering handlers that convert text dates to dates. 
- Args: - date_specifier_patterns (str): the date specifier (in regex pattern format) for which the handler is registered + Args: date_specifier_patterns (str): the date specifier (in + regex pattern format) for which the handler is registered """ def _decorator(func): global DATE_SPECIFIERS_CONVERSION_HANDLERS - DATE_SPECIFIERS_CONVERSION_HANDLERS[DATE_SPECIFIERS_REGEXES[date_specifier_patterns]] = func + DATE_SPECIFIERS_CONVERSION_HANDLERS[ + DATE_SPECIFIERS_REGEXES[date_specifier_patterns] + ] = func return func return _decorator DATE_SPECIFIERS_CONVERSION_HANDLERS = {} -"""Mapping that depending on the date-specifier (key), returns the handler that converts the textual date to date.""" +"""Mapping that depending on the date-specifier (key), returns the handler that +converts the textual date to date.""" def _extract_number_from_text(text): number = 0 # fallback in case extracting the number fails number_match = ANY_PREFIX_AND_A_NUMBER.match(text) if number_match: - try: + with contextlib.suppress(ValueError): number = int(number_match.group(2)) - except ValueError: - pass return number def _convert_date_to_string(start_date, relative_delta=None): - return str(start_date - relative_delta) if relative_delta is not None else str(start_date) + return ( + str(start_date - relative_delta) + if relative_delta is not None + else str(start_date) + ) @register_date_conversion_handler(DATE_TODAY_REGEX_PATTERN) @@ -199,7 +214,8 @@ def convert_today_date_specifier(relative_date_specifier_suffix): start_date = date.today() relative_delta = ( relativedelta(days=_extract_number_from_text(relative_date_specifier_suffix)) - if relative_date_specifier_suffix else None + if relative_date_specifier_suffix + else None ) return _convert_date_to_string(start_date, relative_delta) @@ -210,7 +226,8 @@ def convert_yesterday_date_specifier(relative_date_specifier_suffix): start_date = date.today() - relativedelta(days=1) relative_delta = ( relativedelta(days=_extract_number_from_text(relative_date_specifier_suffix)) - if relative_date_specifier_suffix else None + if relative_date_specifier_suffix + else None ) return _convert_date_to_string(start_date, relative_delta) @@ -221,7 +238,8 @@ def convert_this_month_date(relative_date_specifier_suffix): start_date = date.today() relative_delta = ( relativedelta(months=_extract_number_from_text(relative_date_specifier_suffix)) - if relative_date_specifier_suffix else None + if relative_date_specifier_suffix + else None ) return _convert_date_to_string(start_date, relative_delta) @@ -232,7 +250,8 @@ def convert_last_month_date(relative_date_specifier_suffix): start_date = date.today() - relativedelta(months=1) relative_delta = ( relativedelta(months=_extract_number_from_text(relative_date_specifier_suffix)) - if relative_date_specifier_suffix else None + if relative_date_specifier_suffix + else None ) return _convert_date_to_string(start_date, relative_delta) @@ -244,22 +263,22 @@ def convert_last_month_date(relative_date_specifier_suffix): """Contains all the dates that contain always only a year date.""" ES_RANGE_EQ_OPERATOR = 'eq' -"""Additional (internal to the parser) range operator, for handling date equality queries as ranges.""" +"""Additional (internal to the parser) range operator, for handling date +equality queries as ranges.""" def _truncate_wildcard_from_date(date_value): """Truncate wildcard from date parts. - Returns: - (str) The truncated date. + Returns: (str) The truncated date. 
-    Raises:
-        ValueError, on either unsupported date separator (currently only ' ' and '-' are supported), or if there's a
-        wildcard in the year.
+    Raises: ValueError, on either unsupported date separator
+    (currently only ' ' and '-' are supported), or if there's a
+    wildcard in the year.
 
-    Notes:
-        Either whole date part is wildcard, in which we ignore it and do a range query on the
-        remaining parts, or some numbers are wildcards, where again, we ignore this part.
+    Notes: Either the whole date part is a wildcard, in which case we
+    ignore it and do a range query on the remaining parts, or some
+    numbers are wildcards, where again, we ignore this part.
    """
    if ' ' in date_value:
        date_parts = date_value.split(' ')
@@ -278,17 +297,18 @@ def _truncate_wildcard_from_date(date_value):
 def _truncate_date_value_according_on_date_field(field, date_value):
    """Truncates date value (to year only) according to the given date field.
 
-    Args:
-        field (unicode): The field for which the date value will be used to query on.
-        date_value (str): The date value that is going to be truncated to its year.
+    Args: field (unicode): The field for which the date value will
+    be used to query on. date_value (str): The date value that is
+    going to be truncated to its year.
 
-    Returns:
-        PartialDate: The possibly truncated date, on success. None, otherwise.
+    Returns: PartialDate: The possibly truncated date, on success.
+    None, otherwise.
 
-    Notes:
-        In case the fieldname is in `ES_MAPPING_HEP_DATE_ONLY_YEAR`, then the date is normalized and then only its year
-        value is used. This is needed for ElasticSearch to be able to do comparisons on dates that have only year, which
-        fails if being queried with a date with more .
+    Notes: In case the fieldname is in
+    `ES_MAPPING_HEP_DATE_ONLY_YEAR`, then the date is normalized and
+    then only its year value is used. This is needed for
+    ElasticSearch to be able to do comparisons on dates that have only
+    year, which fails if being queried with a date with more precision.
    """
    try:
        partial_date = PartialDate.parse(date_value)
@@ -306,11 +326,10 @@ def _truncate_date_value_according_on_date_field(field, date_value):
 def _get_next_date_from_partial_date(partial_date):
    """Calculates the next date from the given partial date.
 
-    Args:
-        partial_date (inspire_utils.date.PartialDate): The partial date whose next date should be calculated.
+    Args: partial_date (inspire_utils.date.PartialDate): The partial
+    date whose next date should be calculated.
 
-    Returns:
-        PartialDate: The next date from the given partial date.
+    Returns: PartialDate: The next date from the given partial date.
    """
 
    relativedelta_arg = 'years'
@@ -323,27 +342,29 @@ def _get_next_date_from_partial_date(partial_date):
    return PartialDate.from_parts(
        next_date.year,
        next_date.month if partial_date.month else None,
-        next_date.day if partial_date.day else None
+        next_date.day if partial_date.day else None,
    )
 
 
 def _get_proper_elastic_search_date_rounding_format(partial_date):
-    """Returns the proper ES date math unit according to the "resolution" of the partial_date.
-
-    Args:
-        partial_date (PartialDate): The partial date for which the date math unit is.
-
-    Returns:
-        (str): The ES date math unit format.
-
-    Notes:
-        This is needed for supporting range queries on dates, i.e. rounding them up or down according to
-        the ES range operator.
-        For example, without this, a query like 'date > 2010-11', would return documents with date '2010-11-15', due to
-        the date value of the query being interpreted by ES as '2010-11-01 01:00:00'. 
By using the suffixes for rounding - up or down, the date value of the query is interpreted as '2010-11-30T23:59:59.999', thus not returning the - document with date '2010-11-15', as the user would expect. See: - https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-range-query.html#_date_math_and_rounding + """Returns the proper ES date math unit according to the "resolution" of + the partial_date. + + Args: partial_date (PartialDate): The partial date for which the + date math unit is. + + Returns: (str): The ES date math unit format. + + Notes: This is needed for supporting range queries on dates, + i.e. rounding them up or down according to the ES range + operator. For example, without this, a query like 'date > + 2010-11', would return documents with date '2010-11-15', due to + the date value of the query being interpreted by ES as '2010-11-01 + 01:00:00'. By using the suffixes for rounding up or down, the + date value of the query is interpreted as '2010-11-30T23:59:59.999', + thus not returning the document with date '2010-11-15', as the + user would expect. See: + https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-range-query.html#_date_math_and_rounding """ es_date_math_unit = ES_DATE_MATH_ROUNDING_YEAR @@ -355,17 +376,21 @@ def _get_proper_elastic_search_date_rounding_format(partial_date): return es_date_math_unit -def update_date_value_in_operator_value_pairs_for_fieldname(field, operator_value_pairs): - """Updates (operator, date value) pairs by normalizing the date value according to the given field. +def update_date_value_in_operator_value_pairs_for_fieldname( + field, operator_value_pairs +): + """Updates (operator, date value) pairs by normalizing the date value + according to the given field. - Args: - field (unicode): The fieldname for which the operator-value pairs are being generated. - operator_value_pairs (dict): ES range operator {'gt', 'gte', 'lt', 'lte'} along with a value. - Additionally, if the operator is ``ES_RANGE_EQ_OPERATOR``, then it is indicated that the method should - generate both a lower and an upper bound operator-value pairs, with the given date_value. + Args: field (unicode): The fieldname for which the operator- + value pairs are being generated. operator_value_pairs (dict): ES + range operator {'gt', 'gte', 'lt', 'lte'} along with a value. + Additionally, if the operator is ``ES_RANGE_EQ_OPERATOR``, then it + is indicated that the method should generate both a lower + and an upper bound operator-value pairs, with the given date_value. - Notes: - On a ``ValueError`` an empty operator_value_pairs is returned. + Notes: On a ``ValueError`` an empty operator_value_pairs is + returned. 
""" updated_operator_value_pairs = {} for operator, value in operator_value_pairs.items(): @@ -374,15 +399,24 @@ def update_date_value_in_operator_value_pairs_for_fieldname(field, operator_valu return {} if operator == ES_RANGE_EQ_OPERATOR: - updated_operator_value_pairs['gte'] = \ - modified_date.dumps() + _get_proper_elastic_search_date_rounding_format(modified_date) + updated_operator_value_pairs[ + 'gte' + ] = modified_date.dumps() + _get_proper_elastic_search_date_rounding_format( + modified_date + ) next_date = _get_next_date_from_partial_date(modified_date) - updated_operator_value_pairs['lt'] = \ - next_date.dumps() + _get_proper_elastic_search_date_rounding_format(next_date) + updated_operator_value_pairs[ + 'lt' + ] = next_date.dumps() + _get_proper_elastic_search_date_rounding_format( + next_date + ) else: - updated_operator_value_pairs[operator] = \ - modified_date.dumps() + _get_proper_elastic_search_date_rounding_format(modified_date) + updated_operator_value_pairs[ + operator + ] = modified_date.dumps() + _get_proper_elastic_search_date_rounding_format( + modified_date + ) return updated_operator_value_pairs @@ -391,22 +425,21 @@ def update_date_value_in_operator_value_pairs_for_fieldname(field, operator_valu def generate_match_query(field, value, with_operator_and): """Helper for generating a match query. - Args: - field (six.text_type): The ES field to be queried. - value (six.text_type/bool): The value of the query (bool for the case of type-code query ["core: true"]). - with_operator_and (bool): Flag that signifies whether to generate the explicit notation of the query, along - with '"operator": "and"', so that all tokens of the query value are required to match. + Args: field (six.text_type): The ES field to be queried. + value (six.text_type/bool): The value of the query (bool for the + case of type-code query ["core: true"]). with_operator_and + (bool): Flag that signifies whether to generate the explicit + notation of the query, along with '"operator": "and"', so + that all tokens of the query value are required to match. - Notes: - If value is of instance bool, then the shortened version of the match query is generated, at all times. + Notes: If value is of instance bool, then the shortened version + of the match query is generated, at all times. """ parsed_value = None - try: + # Catch all possible exceptions + # we are not interested if they will appear + with contextlib.suppress(ValueError, TypeError, AttributeError): parsed_value = json.loads(value.lower()) - except (ValueError, TypeError, AttributeError): - # Catch all possible exceptions - # we are not interested if they will appear - pass if isinstance(value, bool): return {'match': {field: value}} @@ -414,14 +447,7 @@ def generate_match_query(field, value, with_operator_and): return {'match': {field: value.lower()}} if with_operator_and: - return { - 'match': { - field: { - 'query': value, - 'operator': 'and' - } - } - } + return {'match': {field: {'query': value, 'operator': 'and'}}} return {'match': {field: value}} @@ -429,35 +455,32 @@ def generate_match_query(field, value, with_operator_and): def generate_nested_query(path, queries): """Generates nested query. - Returns: - (dict): The nested query if queries is not falsy, otherwise an empty dict. + Returns: (dict): The nested query if queries is not falsy, + otherwise an empty dict. 
""" if not queries: return {} - return { - 'nested': { - 'path': path, - 'query': queries - } - } + return {'nested': {'path': path, 'query': queries}} -def wrap_queries_in_bool_clauses_if_more_than_one(queries, - use_must_clause, - preserve_bool_semantics_if_one_clause=False): +def wrap_queries_in_bool_clauses_if_more_than_one( + queries, use_must_clause, preserve_bool_semantics_if_one_clause=False +): """Helper for wrapping a list of queries into a bool.{must, should} clause. - Args: - queries (list): List of queries to be wrapped in a bool.{must, should} clause. - use_must_clause (bool): Flag that signifies whether to use 'must' or 'should' clause. - preserve_bool_semantics_if_one_clause (bool): Flag that signifies whether to generate a bool query even if - there's only one clause. This happens to generate boolean query semantics. Usually not the case, but - useful for boolean queries support. - - Returns: - (dict): If len(queries) > 1, the bool clause, otherwise if len(queries) == 1, will return the query itself, - while finally, if len(queries) == 0, then an empty dictionary is returned. + Args: queries (list): List of queries to be wrapped in a + bool.{must, should} clause. use_must_clause (bool): Flag that + signifies whether to use 'must' or 'should' clause. + preserve_bool_semantics_if_one_clause (bool): Flag that signifies + whether to generate a bool query even if there's only one + clause. This happens to generate boolean query semantics. Usually + not the case, but useful for boolean queries support. + + Returns: (dict): If len(queries) > 1, the bool clause, otherwise + if len(queries) == 1, will return the query itself, + while finally, if len(queries) == 0, then an empty dictionary is + returned. """ if not queries: return {} @@ -467,22 +490,16 @@ def wrap_queries_in_bool_clauses_if_more_than_one(queries, if len(queries) == 1 and not preserve_bool_semantics_if_one_clause: return queries[0] - return { - 'bool': { - ('must' if use_must_clause else 'should'): queries - } - } + return {'bool': {('must' if use_must_clause else 'should'): queries}} def wrap_query_in_nested_if_field_is_nested(query, field, nested_fields): - """Helper for wrapping a query into a nested if the fields within the query are nested - - Args: - query : The query to be wrapped. - field : The field that is being queried. - nested_fields : List of fields which are nested. - Returns: - (dict): The nested query + """Helper for wrapping a query into a nested if the fields within the query + are nested. + + Args: query : The query to be wrapped. field : The field + that is being queried. nested_fields : List of fields which are + nested. Returns: (dict): The nested query """ if not field: return query @@ -500,12 +517,13 @@ def wrap_query_in_nested_if_field_is_nested(query, field, nested_fields): def escape_query_string_special_characters(value): - """ - Helper to escape reserved characters in query_string query. + """Helper to escape reserved characters in query_string query. + According do the documentation failing to escape these special characters correctly could lead to a syntax error which prevents your query from running. 
""" - value = re.sub(QUERY_STRING_QUERY_SPECIAL_CHARACTERS, - lambda char: "\\" + char.group(), value) + value = re.sub( + QUERY_STRING_QUERY_SPECIAL_CHARACTERS, lambda char: "\\" + char.group(), value + ) return value diff --git a/inspire_query_parser/visitors/elastic_search_visitor.py b/inspire_query_parser/visitors/elastic_search_visitor.py index 3f6cf42..cedc6b0 100644 --- a/inspire_query_parser/visitors/elastic_search_visitor.py +++ b/inspire_query_parser/visitors/elastic_search_visitor.py @@ -19,24 +19,22 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - -""" -This module encapsulates the ElasticSearch visitor logic, that receives the output of the parser and restructuring -visitor and converts it to an ElasticSearch query. -""" +"""This module encapsulates the ElasticSearch visitor logic, that receives the +output of the parser and restructuring visitor and converts it to an +ElasticSearch query.""" from __future__ import absolute_import, unicode_literals import logging -from pypeg2 import whitespace import re -import six from unicodedata import normalize +import six from inspire_schemas.utils import convert_old_publication_info_to_new from inspire_utils.helpers import force_list -from inspire_utils.name import normalize_name, ParsedName +from inspire_utils.name import ParsedName, normalize_name from inspire_utils.query import wrap_queries_in_bool_clauses_if_more_than_one +from pypeg2 import whitespace from inspire_query_parser import ast from inspire_query_parser.config import ( @@ -47,11 +45,11 @@ ES_RANGE_EQ_OPERATOR, _truncate_date_value_according_on_date_field, _truncate_wildcard_from_date, + escape_query_string_special_characters, generate_match_query, generate_nested_query, update_date_value_in_operator_value_pairs_for_fieldname, wrap_query_in_nested_if_field_is_nested, - escape_query_string_special_characters ) from inspire_query_parser.visitors.visitor_impl import Visitor @@ -65,9 +63,11 @@ class FieldVariations(object): class ElasticSearchVisitor(Visitor): """Converts a parse tree to an ElasticSearch query. - Notes: - The ElasticSearch query follows the 2.4 version DSL specification. + + Notes: The ElasticSearch query follows the 2.4 version DSL + specification. 
""" + # ##### Configuration ##### # ## Journal queries ## JOURNAL_FIELDS_PREFIX = 'publication_info' @@ -78,7 +78,9 @@ class ElasticSearchVisitor(Visitor): JOURNAL_ART_ID = 'artid' JOURNAL_YEAR = 'year' JOURNAL_FIELDS_MAPPING = { - JOURNAL_TITLE: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO)), + JOURNAL_TITLE: '.'.join( + (JOURNAL_FIELDS_PREFIX, JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO) + ), JOURNAL_VOLUME: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_VOLUME)), JOURNAL_PAGE_START: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_PAGE_START)), JOURNAL_ART_ID: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_ART_ID)), @@ -115,9 +117,7 @@ class ElasticSearchVisitor(Visitor): 'eprint': 'arxiv_eprints.value.raw', 'exact-author': 'authors.full_name_unicode_normalized', 'irn': 'external_system_identifiers.value.raw', - 'journal': [ - *JOURNAL_FIELDS_MAPPING.values() - ], + 'journal': [*JOURNAL_FIELDS_MAPPING.values()], 'keyword': 'keywords.value', 'refersto': 'references.record.$ref', 'reportnumber': 'report_numbers.value.fuzzy', @@ -136,13 +136,14 @@ class ElasticSearchVisitor(Visitor): 'fulltext': 'documents.attachment.content', 'citedby': { 'path': 'references.record.$ref.raw', - 'search_path': 'self.$ref.raw' - } + 'search_path': 'self.$ref.raw', + }, } """Mapping from keywords to ElasticSearch fields. - Note: - If a keyword should query multiple fields, then it's value in the mapping should be a list. This will generate - a ``multi_match`` query. Otherwise a ``match`` query is generated. + + Note: If a keyword should query multiple fields, then it's value + in the mapping should be a list. This will generate a + ``multi_match`` query. Otherwise a ``match`` query is generated. """ TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING = { 'b': ('document_type', 'book'), @@ -164,8 +165,9 @@ class ElasticSearchVisitor(Visitor): 'proceedings': ('document_type', 'proceedings'), } """Mapping from type-code query values to field and value pairs. - Note: - These are going to be used for querying (instead of the given value). + + Note: These are going to be used for querying (instead of the + given value). """ AUTHORS_NAME_VARIATIONS_FIELD = 'authors.name_variations' @@ -187,68 +189,91 @@ class ElasticSearchVisitor(Visitor): # #### Helpers #### def _get_author_or_first_author_keyword_from_fieldnames(self, fieldnames=None): - """Returns author or first_author keywords if their fields are part of the fieldnames. Defaults to author""" - return 'first_author' if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames else 'author' + """Returns author or first_author keywords if their fields are part of + the fieldnames. 
+ + Defaults to author + """ + return ( + 'first_author' + if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames + else 'author' + ) def _generate_nested_author_query(self, query, fieldnames=None): - """Generates nested query with path for authors or first_author""" - nested_path = self.FIRST_AUTHOR_NESTED_QUERY_PATH \ - if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames \ + """Generates nested query with path for authors or first_author.""" + nested_path = ( + self.FIRST_AUTHOR_NESTED_QUERY_PATH + if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames else self.AUTHORS_NESTED_QUERY_PATH + ) return generate_nested_query(nested_path, query) def _are_fieldnames_author_or_first_author(self, fieldnames): if isinstance(fieldnames, list): - return self.KEYWORD_TO_ES_FIELDNAME['author'] in fieldnames or self.KEYWORD_TO_ES_FIELDNAME[ - 'first_author'] in fieldnames - return self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames or self.KEYWORD_TO_ES_FIELDNAME[ - 'first_author'] == fieldnames + return ( + self.KEYWORD_TO_ES_FIELDNAME['author'] in fieldnames + or self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames + ) + return ( + self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames + or self.KEYWORD_TO_ES_FIELDNAME['first_author'] == fieldnames + ) - def _generate_fieldnames_if_bai_query(self, fieldnames, node_value, bai_field_variation, - query_bai_field_if_dots_in_name): + def _generate_fieldnames_if_bai_query( + self, + fieldnames, + node_value, + bai_field_variation, + query_bai_field_if_dots_in_name, + ): """Generates new fieldnames in case of BAI query. - Args: - fieldnames : names of the fields of the node. - node_value (six.text_type): The node's value (i.e. author name). - bai_field_variation (six.text_type): Which field variation to query ('search' or 'raw'). - query_bai_field_if_dots_in_name (bool): Whether to query BAI field (in addition to author's name field) - if dots exist in the name and name contains no whitespace. - Returns: - list: Fieldnames to query on, in case of BAI query or None, otherwise. - Raises: - ValueError, if ``field_variation`` is not one of ('search', 'raw'). + + Args: fieldnames : names of the fields of the node. + node_value (six.text_type): The node's value (i.e. author name). + bai_field_variation (six.text_type): Which field variation to + query ('search' or 'raw'). query_bai_field_if_dots_in_name + (bool): Whether to query BAI field (in addition to author's name + field) if dots exist in the name and name contains no + whitespace. Returns: list: Fieldnames to query on, in case + of BAI query or None, otherwise. Raises: ValueError, if + ``field_variation`` is not one of ('search', 'raw'). """ if bai_field_variation not in (FieldVariations.search, FieldVariations.raw): - raise ValueError('Non supported field variation "{}".'.format(bai_field_variation)) + raise ValueError( + 'Non supported field variation "{}".'.format(bai_field_variation) + ) keyword = self._get_author_or_first_author_keyword_from_fieldnames(fieldnames) normalized_author_name = normalize_name(node_value).strip('.') bai_fieldname = self.KEYWORD_TO_ES_FIELDNAME['{}_bai'.format(keyword)] - if self.KEYWORD_TO_ES_FIELDNAME[keyword] and \ - self.BAI_REGEX.match(node_value): + if self.KEYWORD_TO_ES_FIELDNAME[keyword] and self.BAI_REGEX.match(node_value): return [bai_fieldname + '.' 
+ bai_field_variation]
-        elif not whitespace.search(normalized_author_name) and \
-                query_bai_field_if_dots_in_name and \
-                self.KEYWORD_TO_ES_FIELDNAME[keyword] and \
-                '.' in normalized_author_name:
+        elif (
+            not whitespace.search(normalized_author_name)
+            and query_bai_field_if_dots_in_name
+            and self.KEYWORD_TO_ES_FIELDNAME[keyword]
+            and '.' in normalized_author_name
+        ):
            # Case of partial BAI, e.g. ``J.Smith``.
-            return [bai_fieldname + '.' + bai_field_variation] + \
-                force_list(self.KEYWORD_TO_ES_FIELDNAME[keyword])
+            return [bai_fieldname + '.' + bai_field_variation] + force_list(
+                self.KEYWORD_TO_ES_FIELDNAME[keyword]
+            )
 
        return None
 
    def _generate_author_query(self, fieldnames, author_name):
        """Generates a query handling specifically authors.
-        Notes:
-            There are three main cases:
-            1) ``a Smith``
-            This will just generate a ``match`` query on ``last_name``
-            2) ``a John Smith``
-            This will just generate a ``match`` query on ``last_name`` and a ``prefix`` query on ``first_name``
-            and a ``match`` query on the initial ``J``. This will return results from ``Smith, John`` and ``Smith, J``
-            but not from ``Smith, Jane``.
-            3) ``a J Smith``
-            This will just generate a ``match`` query on ``last_name`` and a match query on ``first_name.initials``.
-            Please note, cases such as ``J.D.`` have been properly handled by the tokenizer.
+
+        Notes:
+            There are three main cases:
+            1) ``a Smith``: generates a ``match`` query on ``last_name``.
+            2) ``a John Smith``: generates a ``match`` query on ``last_name``,
+               a ``prefix`` query on ``first_name`` and a ``match`` query on
+               the initial ``J``. This will return results from ``Smith,
+               John`` and ``Smith, J`` but not from ``Smith, Jane``.
+            3) ``a J Smith``: generates a ``match`` query on ``last_name``
+               and a match query on ``first_name.initials``. Please note,
+               cases such as ``J.D.`` have been properly handled by the
+               tokenizer.
        """
        parsed_name = ParsedName(author_name)
        keyword = self._get_author_or_first_author_keyword_from_fieldnames(fieldnames)
@@ -258,35 +283,37 @@ def _generate_author_query(self, fieldnames, author_name):
 
    def _generate_exact_author_query(self, author_name_or_bai):
        """Generates a term query handling authors and BAIs.
-        Notes:
-            If given value is a BAI, search for the provided value in the raw field variation of
-            `self.AUTHORS_BAI_FIELD`.
-            Otherwise, the value will be procesed in the same way as the indexed value (i.e. lowercased and normalized
-            (inspire_utils.normalize_name and then NFKC normalization).
-            E.g. Searching for 'Smith, J.' is the same as searching for: 'Smith, J', 'smith, j.', 'smith j', 'j smith',
-            'j. smith', 'J Smith', 'J. Smith'.
+
+        Notes:
+            If the given value is a BAI, search for the provided value in the
+            raw field variation of `self.AUTHORS_BAI_FIELD`. Otherwise, the
+            value will be processed in the same way as the indexed value
+            (i.e. lowercased and normalized with inspire_utils.normalize_name
+            and then NFKC normalization). E.g. searching for 'Smith, J.' is
+            the same as searching for: 'Smith, J', 'smith, j.', 'smith j',
+            'j smith', 'j. smith', 'J Smith', 'J. Smith'. 
""" if self.BAI_REGEX.match(author_name_or_bai): bai = author_name_or_bai.lower() query = self._generate_term_query( - '.'.join((self.AUTHORS_BAI_FIELD, FieldVariations.search)), - bai + '.'.join((self.AUTHORS_BAI_FIELD, FieldVariations.search)), bai ) else: author_name = normalize('NFKC', normalize_name(author_name_or_bai)).lower() query = self._generate_term_query( - self.KEYWORD_TO_ES_FIELDNAME['exact-author'], - author_name + self.KEYWORD_TO_ES_FIELDNAME['exact-author'], author_name ) return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query) def _generate_date_with_wildcard_query(self, date_value): """Helper for generating a date keyword query containing a wildcard. - Returns: - (dict): The date query containing the wildcard or an empty dict in case the date value is malformed. - The policy followed here is quite conservative on what it accepts as valid input. Look into - :meth:`inspire_query_parser.utils.visitor_utils._truncate_wildcard_from_date` for more information. + + Returns: (dict): The date query containing the wildcard or + an empty dict in case the date value is malformed. The policy + followed here is quite conservative on what it accepts as valid + input. Look into :meth:`inspire_query_parser.utils.visitor_utils + ._truncate_wildcard_from_date` for more information. """ if date_value.endswith(ast.GenericValue.WILDCARD_TOKEN): try: @@ -295,38 +322,46 @@ def _generate_date_with_wildcard_query(self, date_value): # Drop date query. return {} - return self._generate_range_queries(self.KEYWORD_TO_ES_FIELDNAME['date'], - {ES_RANGE_EQ_OPERATOR: date_value}) + return self._generate_range_queries( + self.KEYWORD_TO_ES_FIELDNAME['date'], {ES_RANGE_EQ_OPERATOR: date_value} + ) else: # Drop date query with wildcard not as suffix, e.g. 2000-1*-31 return {} def _generate_queries_for_title_symbols(self, title_field, query_value): - """Generate queries for any symbols in the title against the whitespace tokenized field of titles. - Returns: - (dict): The query or queries for the whitespace tokenized field of titles. If none such tokens exist, then - returns an empty dict. - Notes: - Splits the value stream into tokens according to whitespace. - Heuristically identifies the ones that contain symbol-indicating-characters (examples of those tokens are - "g-2", "SU(2)"). + """Generate queries for any symbols in the title against the whitespace + tokenized field of titles. + + Returns: (dict): The query or queries for the whitespace + tokenized field of titles. If none such tokens exist, then + returns an empty dict. Notes: Splits the value stream into + tokens according to whitespace. Heuristically identifies the + ones that contain symbol-indicating-characters (examples of + those tokens are "g-2", "SU(2)"). """ values_tokenized_by_whitespace = query_value.split() symbol_queries = [] for value in values_tokenized_by_whitespace: - # Heuristic: If there's a symbol-indicating-character in the value, it signifies terms that should be + # Heuristic: If there's a symbol-indicating-character in the value, + # it signifies terms that should be # queried against the whitespace-tokenized title. 
- if any(character in value for character in self.TITLE_SYMBOL_INDICATING_CHARACTER): + if any( + character in value + for character in self.TITLE_SYMBOL_INDICATING_CHARACTER + ): symbol_queries.append( generate_match_query( '.'.join([title_field, FieldVariations.search]), value, - with_operator_and=False + with_operator_and=False, ) ) - return wrap_queries_in_bool_clauses_if_more_than_one(symbol_queries, use_must_clause=True) + return wrap_queries_in_bool_clauses_if_more_than_one( + symbol_queries, use_must_clause=True + ) def _generate_title_queries(self, value): title_field = self.KEYWORD_TO_ES_FIELDNAME['title'] @@ -335,18 +370,22 @@ def _generate_title_queries(self, value): symbol_queries = self._generate_queries_for_title_symbols(title_field, value) return wrap_queries_in_bool_clauses_if_more_than_one( [element for element in (q, symbol_queries) if element], - use_must_clause=True + use_must_clause=True, ) def _generate_type_code_query(self, value): """Generate type-code queries. - Notes: - If the value of the type-code query exists in `TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING, then we - query the specified field, along with the given value according to the mapping. - See: https://github.com/inspirehep/inspire-query-parser/issues/79 - Otherwise, we query both ``document_type`` and ``publication_info``. + + Notes: If the value of the type-code query exists in + `TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING, then we query + the specified field, along with the given value according to the + mapping. See: + https://github.com/inspirehep/inspire-query-parser/issues/79 + Otherwise, we query both ``document_type`` and ``publication_info``. """ - mapping_for_value = self.TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING.get(value.lower(), None) + mapping_for_value = self.TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING.get( + value.lower(), None + ) if mapping_for_value: return generate_match_query(*mapping_for_value, with_operator_and=True) @@ -355,9 +394,13 @@ def _generate_type_code_query(self, value): 'bool': { 'minimum_should_match': 1, 'should': [ - generate_match_query('document_type', value, with_operator_and=True), - generate_match_query('publication_type', value, with_operator_and=True), - ] + generate_match_query( + 'document_type', value, with_operator_and=True + ), + generate_match_query( + 'publication_type', value, with_operator_and=True + ), + ], } } @@ -367,7 +410,9 @@ def _generate_query_string_query(self, value, fieldnames, analyze_wildcard): field_specifier, field_specifier_value = 'default_field', '_all' else: field_specifier = 'fields' - field_specifier_value = fieldnames if isinstance(fieldnames, list) else [fieldnames] + field_specifier_value = ( + fieldnames if isinstance(fieldnames, list) else [fieldnames] + ) # Can only use prefix queries on keyword, text and wildcard # fields so in journal * searches with type date need to be removed if 'publication_info.year' in field_specifier_value: @@ -376,7 +421,7 @@ def _generate_query_string_query(self, value, fieldnames, analyze_wildcard): 'query_string': { 'query': escape_query_string_special_characters(value), field_specifier: field_specifier_value, - 'default_operator': "AND" + 'default_operator': "AND", } } if analyze_wildcard: @@ -387,20 +432,9 @@ def _generate_query_string_query(self, value, fieldnames, analyze_wildcard): # TODO Move it to visitor utils and write tests for it. 
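+    # For reference, the helper below produces either
+    #   {'term': {fieldname: value}}
+    # or, when a boost is given,
+    #   {'term': {fieldname: {'value': value, 'boost': boost}}}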
def _generate_term_query(self, fieldname, value, boost=None):
        if not boost:
-            return {
-                'term': {
-                    fieldname: value
-                }
-            }
+            return {'term': {fieldname: value}}
 
-        return {
-            'term': {
-                fieldname: {
-                    'value': value,
-                    'boost': boost
-                }
-            }
-        }
+        return {'term': {fieldname: {'value': value, 'boost': boost}}}
 
    def _generate_boolean_query(self, node):
        condition_a = node.left.accept(self)
@@ -410,42 +444,48 @@ def _generate_boolean_query(self, node):
        return wrap_queries_in_bool_clauses_if_more_than_one(
            bool_body,
            use_must_clause=isinstance(node, ast.AndOp),
-            preserve_bool_semantics_if_one_clause=True
+            preserve_bool_semantics_if_one_clause=True,
        )
 
    def _generate_range_queries(self, fieldnames, operator_value_pairs):
        """Generates ElasticSearch range queries.
-        Args:
-            fieldnames (list): The fieldnames on which the search is the range query is targeted on,
-            operator_value_pairs (dict): Contains (range_operator, value) pairs.
-                The range_operator should be one of those supported by ElasticSearch (e.g. 'gt', 'lt', 'ge', 'le').
-                The value should be of type int or string.
-        Notes:
-            A bool should query with multiple range sub-queries is generated so that even if one of the multiple fields
-            is missing from a document, ElasticSearch will be able to match some records.
-            In the case of a 'date' keyword query, it updates date values after normalizing them by using
-            :meth:`inspire_query_parser.utils.visitor_utils.update_date_value_in_operator_value_pairs_for_fieldname`.
-            Additionally, in the aforementioned case, if a malformed date has been given, then the the method will
-            return an empty dictionary.
+
+        Args:
+            fieldnames (list): The fieldnames on which the range query is
+                targeted.
+            operator_value_pairs (dict): Contains (range_operator, value)
+                pairs. The range_operator should be one of those supported by
+                ElasticSearch (e.g. 'gt', 'lt', 'ge', 'le'). The value should
+                be of type int or string.
+
+        Notes:
+            A bool should query with multiple range sub-queries is generated,
+            so that even if one of the multiple fields is missing from a
+            document, ElasticSearch will be able to match some records. In the
+            case of a 'date' keyword query, it updates date values after
+            normalizing them by using :meth:`inspire_query_parser.utils.
+            visitor_utils.update_date_value_in_operator_value_pairs_for_fieldname`.
+            Additionally, in the aforementioned case, if a malformed date has
+            been given, then the method will return an empty dictionary. 
""" if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames or all( - field in [self.KEYWORD_TO_ES_FIELDNAME['date-added'], - self.KEYWORD_TO_ES_FIELDNAME['date-updated'], - self.KEYWORD_TO_ES_FIELDNAME['date-earliest']] for field in fieldnames + field + in [ + self.KEYWORD_TO_ES_FIELDNAME['date-added'], + self.KEYWORD_TO_ES_FIELDNAME['date-updated'], + self.KEYWORD_TO_ES_FIELDNAME['date-earliest'], + ] + for field in fieldnames ): range_queries = [] for fieldname in fieldnames: - updated_operator_value_pairs = update_date_value_in_operator_value_pairs_for_fieldname( - fieldname, operator_value_pairs + updated_operator_value_pairs = ( + update_date_value_in_operator_value_pairs_for_fieldname( + fieldname, operator_value_pairs + ) ) if not updated_operator_value_pairs: break # Malformed date else: - range_query = { - 'range': { - fieldname: updated_operator_value_pairs - } - } + range_query = {'range': {fieldname: updated_operator_value_pairs}} range_queries.append( generate_nested_query(self.DATE_NESTED_QUERY_PATH, range_query) @@ -453,22 +493,30 @@ def _generate_range_queries(self, fieldnames, operator_value_pairs): else range_query ) elif 'publication_info.year' in fieldnames: - range_queries = [generate_nested_query(self.DATE_NESTED_QUERY_PATH, - {'range': {fieldname: operator_value_pairs}}) - for fieldname in fieldnames] + range_queries = [ + generate_nested_query( + self.DATE_NESTED_QUERY_PATH, + {'range': {fieldname: operator_value_pairs}}, + ) + for fieldname in fieldnames + ] else: - range_queries = [{'range': {fieldname: operator_value_pairs}} - for fieldname in fieldnames] + range_queries = [ + {'range': {fieldname: operator_value_pairs}} for fieldname in fieldnames + ] - return wrap_queries_in_bool_clauses_if_more_than_one(range_queries, use_must_clause=False) + return wrap_queries_in_bool_clauses_if_more_than_one( + range_queries, use_must_clause=False + ) @staticmethod def _generate_malformed_query(data): """Generates a query on the ``_all`` field with all the query content. - Args: - data (six.text_type or list): The query in the format of ``six.text_type`` (when used from parsing driver) - or ``list`` when used from withing the ES visitor. + + Args: data (six.text_type or list): The query in the format + of ``six.text_type`` (when used from parsing driver) or + ``list`` when used from withing the ES visitor. """ if isinstance(data, six.text_type): # Remove colon character (special character for ES) @@ -476,73 +524,77 @@ def _generate_malformed_query(data): else: query_str = ' '.join([word.strip(':') for word in data.children]) - return { - 'simple_query_string': { - 'fields': ['_all'], - 'query': query_str - } - } + return {'simple_query_string': {'fields': ['_all'], 'query': query_str}} - def _preprocess_journal_query_value(self, third_journal_field, old_publication_info_values): - """Transforms the given journal query value (old publication info) to the new one. - Args: - third_journal_field (six.text_type): The final field to be used for populating the old publication info. - old_publication_info_values (six.text_type): The old publication info. It must be one of {only title, title - & volume, title & volume & artid/page_start}. - Returns: - (dict) The new publication info. + def _preprocess_journal_query_value( + self, third_journal_field, old_publication_info_values + ): + """Transforms the given journal query value (old publication info) to + the new one. + + Args: third_journal_field (six.text_type): The final field + to be used for populating the old publication info. 
+ old_publication_info_values (six.text_type): The old publication + info. It must be one of {only title, title & volume, + title & volume & artid/page_start}. Returns: (dict) The new + publication info. """ - # Prepare old publication info for :meth:`inspire_schemas.utils.convert_old_publication_info_to_new`. + # Prepare old publication info for + # :meth:`inspire_schemas.utils.convert_old_publication_info_to_new`. publication_info_keys = [ self.JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO, self.JOURNAL_VOLUME, - third_journal_field + third_journal_field, ] values_list = [ - value.strip() - for value - in old_publication_info_values.split(',') - if value + value.strip() for value in old_publication_info_values.split(',') if value ] old_publication_info = [ { key: value - for key, value - in zip(publication_info_keys, values_list) + for key, value in zip(publication_info_keys, values_list) if value } ] - # We are always assuming that the returned list will not be empty. In the situation of a journal query with no + # We are always assuming that the returned list will not be empty. + # In the situation of a journal query with no # value, a malformed query will be generated instead. - new_publication_info = convert_old_publication_info_to_new(old_publication_info)[0] + new_publication_info = convert_old_publication_info_to_new( + old_publication_info + )[0] return new_publication_info def _generate_journal_queries(self, value): """Generates ElasticSearch nested query(s). - Args: - value (string): Contains the journal_title, journal_volume and artid or start_page separated by a comma. - This value should be of type string. - Notes: - The value contains at least one of the 3 mentioned items, in this order and at most 3. - The 3rd is either the artid or the page_start and it will query the corresponding ES field for this item. - The values are then split on comma and stripped of spaces before being saved in a values list in order to - be assigned to corresponding fields. + + Args: value (string): Contains the journal_title, + journal_volume and artid or start_page separated by a comma. + This value should be of type string. Notes: The value + contains at least one of the 3 mentioned items, in this order + and at most 3. The 3rd is either the artid or the page_start + and it will query the corresponding ES field for this item. The + values are then split on comma and stripped of spaces before + being saved in a values list in order to be assigned to + corresponding fields. """ # Abstract away which is the third field, we care only for its existence. 
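+        # Illustrative (hypothetical) example: for a query value such as
+        # ``Phys.Rev.,D92,054500`` the comma-separated parts are mapped to
+        # journal_title, journal_volume and page_start/artid respectively;
+        # the title match is kept separate, while the volume and page/artid
+        # matches are wrapped in a nested query on publication_info below.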
third_journal_field = self.JOURNAL_PAGE_START - new_publication_info = self._preprocess_journal_query_value(third_journal_field, value) + new_publication_info = self._preprocess_journal_query_value( + third_journal_field, value + ) - # We always expect a journal title, otherwise query would be considered malformed, and thus this method would + # We always expect a journal title, otherwise query would + # be considered malformed, and thus this method would journal_title_query = generate_match_query( - self.JOURNAL_TITLE, - new_publication_info[self.JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO], - with_operator_and=False - ) + self.JOURNAL_TITLE, + new_publication_info[self.JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO], + with_operator_and=False, + ) queries_for_each_field = [] if self.JOURNAL_VOLUME in new_publication_info: @@ -550,7 +602,7 @@ def _generate_journal_queries(self, value): generate_match_query( self.JOURNAL_FIELDS_MAPPING[self.JOURNAL_VOLUME], new_publication_info[self.JOURNAL_VOLUME], - with_operator_and=False + with_operator_and=False, ) ) @@ -559,7 +611,7 @@ def _generate_journal_queries(self, value): generate_match_query( self.JOURNAL_FIELDS_MAPPING[self.JOURNAL_YEAR], new_publication_info[self.JOURNAL_YEAR], - with_operator_and=False + with_operator_and=False, ) ) @@ -569,33 +621,32 @@ def _generate_journal_queries(self, value): generate_match_query( self.JOURNAL_FIELDS_MAPPING[third_field], artid_or_page_start, - with_operator_and=False + with_operator_and=False, ) - for third_field - in (self.JOURNAL_PAGE_START, self.JOURNAL_ART_ID) + for third_field in (self.JOURNAL_PAGE_START, self.JOURNAL_ART_ID) ] queries_for_each_field.append( - wrap_queries_in_bool_clauses_if_more_than_one(match_queries, use_must_clause=False) + wrap_queries_in_bool_clauses_if_more_than_one( + match_queries, use_must_clause=False + ) ) nested_query = generate_nested_query( self.JOURNAL_FIELDS_PREFIX, - wrap_queries_in_bool_clauses_if_more_than_one(queries_for_each_field, use_must_clause=True) + wrap_queries_in_bool_clauses_if_more_than_one( + queries_for_each_field, use_must_clause=True + ), ) journal_queries = [journal_title_query, nested_query] - return wrap_queries_in_bool_clauses_if_more_than_one(journal_queries, use_must_clause=True) + return wrap_queries_in_bool_clauses_if_more_than_one( + journal_queries, use_must_clause=True + ) def _generate_terms_lookup(self, path, search_path, value): return { - "terms": { - search_path : { - "index" : "records-hep", - "id" : value, - "path" : path - } - } + "terms": {search_path: {"index": "records-hep", "id": value, "path": path}} } # ################ @@ -626,11 +677,7 @@ def visit_query_with_malformed_part(self, node): return query def visit_not_op(self, node): - return { - 'bool': { - 'must_not': [node.op.accept(self)] - } - } + return {'bool': {'must_not': [node.op.accept(self)]}} def visit_and_op(self, node): return self._generate_boolean_query(node) @@ -639,25 +686,37 @@ def visit_or_op(self, node): return self._generate_boolean_query(node) def visit_keyword_op(self, node): - # For this visitor, the decision on which type of ElasticSearch query to generate, relies mainly on the leaves. - # Thus, the fieldname is propagated to them, so that they generate query type, depending on their type. + # For this visitor, the decision on which type of ElasticSearch + # query to generate, relies mainly on the leaves. + # Thus, the fieldname is propagated to them, so that they + # generate query type, depending on their type. 
fieldname = node.left.accept(self) return node.right.accept(self, fieldname) def visit_range_op(self, node, fieldnames): - return self._generate_range_queries(force_list(fieldnames), {'gte': node.left.value, 'lte': node.right.value}) + return self._generate_range_queries( + force_list(fieldnames), {'gte': node.left.value, 'lte': node.right.value} + ) def visit_greater_than_op(self, node, fieldnames): - return self._generate_range_queries(force_list(fieldnames), {'gt': node.op.value}) + return self._generate_range_queries( + force_list(fieldnames), {'gt': node.op.value} + ) def visit_greater_equal_than_op(self, node, fieldnames): - return self._generate_range_queries(force_list(fieldnames), {'gte': node.op.value}) + return self._generate_range_queries( + force_list(fieldnames), {'gte': node.op.value} + ) def visit_less_than_op(self, node, fieldnames): - return self._generate_range_queries(force_list(fieldnames), {'lt': node.op.value}) + return self._generate_range_queries( + force_list(fieldnames), {'lt': node.op.value} + ) def visit_less_equal_than_op(self, node, fieldnames): - return self._generate_range_queries(force_list(fieldnames), {'lte': node.op.value}) + return self._generate_range_queries( + force_list(fieldnames), {'lte': node.op.value} + ) def visit_nested_keyword_op(self, node): # TODO Cannot be completed as of yet. # FIXME: quick and dirty implementation of refersto:recid: @@ -668,42 +727,43 @@ def visit_nested_keyword_op(self, node): # TODO Cannot be completed as of yet. return self._generate_terms_lookup( self.KEYWORD_TO_ES_FIELDNAME['citedby']['path'], self.KEYWORD_TO_ES_FIELDNAME['citedby']['search_path'], - record_id + record_id, ) - if node.left.value == 'refersto': - if right.left.value == 'control_number': - recid = right.right.value - citing_records_query = generate_match_query( - self.KEYWORD_TO_ES_FIELDNAME['refersto'], - recid, - with_operator_and=False - ) - records_with_collection_literature_query = generate_match_query( - '_collections', - 'Literature', - with_operator_and=False - ) - superseded_records_query = generate_match_query( - self.RECORD_RELATION_FIELD, - 'successor', - with_operator_and=False - ) - self_citation = generate_match_query( - "control_number", - recid, - with_operator_and=False - ) - return { - 'bool': { - 'must': [citing_records_query, records_with_collection_literature_query], - 'must_not': [superseded_records_query, self_citation] - } + if node.left.value == 'refersto' and right.left.value == 'control_number': + recid = right.right.value + citing_records_query = generate_match_query( + self.KEYWORD_TO_ES_FIELDNAME['refersto'], + recid, + with_operator_and=False, + ) + records_with_collection_literature_query = generate_match_query( + '_collections', 'Literature', with_operator_and=False + ) + superseded_records_query = generate_match_query( + self.RECORD_RELATION_FIELD, 'successor', with_operator_and=False + ) + self_citation = generate_match_query( + "control_number", recid, with_operator_and=False + ) + return { + 'bool': { + 'must': [ + citing_records_query, + records_with_collection_literature_query, + ], + 'must_not': [superseded_records_query, self_citation], } + } if right.left.value == 'author': - return generate_match_query("referenced_authors_bais", right.right.value, with_operator_and=False) + return generate_match_query( + "referenced_authors_bais", + right.right.value, + with_operator_and=False, + ) def visit_keyword(self, node): - # If no keyword is found, return the original node value (case of an unknown keyword). 
+ # If no keyword is found, return the original node value + # (case of an unknown keyword). return self.KEYWORD_TO_ES_FIELDNAME.get(node.value, node.value) def handle_value_wildcard(self, node, fieldnames=None): @@ -714,35 +774,36 @@ def handle_value_wildcard(self, node, fieldnames=None): fieldnames, node.value, bai_field_variation=FieldVariations.search, - query_bai_field_if_dots_in_name=True + query_bai_field_if_dots_in_name=True, ) query = self._generate_query_string_query( node.value, fieldnames=bai_fieldnames or fieldnames, - analyze_wildcard=True + analyze_wildcard=True, ) return self._generate_nested_author_query(query, fieldnames) query = self._generate_query_string_query( - node.value, - fieldnames=fieldnames, - analyze_wildcard=True + node.value, fieldnames=fieldnames, analyze_wildcard=True + ) + return wrap_query_in_nested_if_field_is_nested( + query, fieldnames, self.NESTED_FIELDS ) - return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS) def handle_author_query(self, node, fieldnames=None): bai_fieldnames = self._generate_fieldnames_if_bai_query( fieldnames, node.value, bai_field_variation=FieldVariations.search, - query_bai_field_if_dots_in_name=True + query_bai_field_if_dots_in_name=True, ) if bai_fieldnames: if len(bai_fieldnames) == 1: query = {"match": {bai_fieldnames[0]: node.value}} return self._generate_nested_author_query(query, fieldnames) - # Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots), - # e.g. `S.Mele`. In this case generate a partial match query. + # Not an exact BAI pattern match, but node's value looks like + # BAI (no spaces and dots), e.g. `S.Mele`. In this case generate + # a partial match query. return self.visit_partial_match_value(node, bai_fieldnames) return self._generate_author_query(fieldnames, node.value) @@ -753,11 +814,18 @@ def visit_value(self, node, fieldnames=None): if node.contains_wildcard: return self.handle_value_wildcard(node, fieldnames=fieldnames) - if fieldnames in [self.KEYWORD_TO_ES_FIELDNAME['date'], self.KEYWORD_TO_ES_FIELDNAME['date-added'], - self.KEYWORD_TO_ES_FIELDNAME['date-updated'], self.KEYWORD_TO_ES_FIELDNAME['date-earliest']]: - # Date queries with simple values are transformed into range queries, among the given and the exact + if fieldnames in [ + self.KEYWORD_TO_ES_FIELDNAME['date'], + self.KEYWORD_TO_ES_FIELDNAME['date-added'], + self.KEYWORD_TO_ES_FIELDNAME['date-updated'], + self.KEYWORD_TO_ES_FIELDNAME['date-earliest'], + ]: + # Date queries with simple values are transformed into range queries, + # among the given and the exact # next date, according to the granularity of the given date. 
- return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value}) + return self._generate_range_queries( + force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value} + ) if isinstance(fieldnames, list): if self.KEYWORD_TO_ES_FIELDNAME['journal'] == fieldnames: return self._generate_journal_queries(node.value) @@ -765,14 +833,17 @@ def visit_value(self, node, fieldnames=None): if self.KEYWORD_TO_ES_FIELDNAME['affiliation-id'] == fieldnames: match_queries = [ wrap_query_in_nested_if_field_is_nested( - generate_match_query(field, node.value, with_operator_and=False), + generate_match_query( + field, node.value, with_operator_and=False + ), field, self.NESTED_FIELDS, ) for field in fieldnames ] return wrap_queries_in_bool_clauses_if_more_than_one( - match_queries, use_must_clause=False) + match_queries, use_must_clause=False + ) return { 'multi_match': { @@ -800,34 +871,46 @@ def visit_value(self, node, fieldnames=None): query = generate_match_query( self.KEYWORD_TO_ES_FIELDNAME['affiliation'], node.value, - with_operator_and=True + with_operator_and=True, ) return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query) elif self.KEYWORD_TO_ES_FIELDNAME['eprint'] == fieldnames: - return generate_match_query(fieldnames, re.sub('ar[xX]iv:', "", node.value), with_operator_and=True) + return generate_match_query( + fieldnames, + re.sub('ar[xX]iv:', "", node.value), + with_operator_and=True, + ) elif self.KEYWORD_TO_ES_FIELDNAME['texkey'] == fieldnames: - return generate_match_query('texkeys.raw', node.value, with_operator_and=False) + return generate_match_query( + 'texkeys.raw', node.value, with_operator_and=False + ) elif fieldnames not in self.KEYWORD_TO_ES_FIELDNAME.values(): colon_value = ':'.join([fieldnames, node.value]) - given_field_query = generate_match_query(fieldnames, node.value, with_operator_and=True) + given_field_query = generate_match_query( + fieldnames, node.value, with_operator_and=True + ) if self.TEXKEY_REGEX.match(colon_value): - return generate_match_query('texkeys.raw', colon_value, with_operator_and=False) - _all_field_query = generate_match_query('_all', colon_value, with_operator_and=True) + return generate_match_query( + 'texkeys.raw', colon_value, with_operator_and=False + ) + _all_field_query = generate_match_query( + '_all', colon_value, with_operator_and=True + ) query = wrap_queries_in_bool_clauses_if_more_than_one( - [given_field_query, _all_field_query], use_must_clause=False) - return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS) + [given_field_query, _all_field_query], use_must_clause=False + ) + return wrap_query_in_nested_if_field_is_nested( + query, fieldnames, self.NESTED_FIELDS + ) return generate_match_query(fieldnames, node.value, with_operator_and=True) def visit_exact_match_value(self, node, fieldnames=None): """Generates a term query (exact search in ElasticSearch).""" - if not fieldnames: - fieldnames = ['_all'] - else: - fieldnames = force_list(fieldnames) + fieldnames = ['_all'] if not fieldnames else force_list(fieldnames) if self.KEYWORD_TO_ES_FIELDNAME['exact-author'] == fieldnames[0]: return self._generate_exact_author_query(node.value) @@ -842,14 +925,19 @@ def visit_exact_match_value(self, node, fieldnames=None): fieldnames, node.value, bai_field_variation=FieldVariations.raw, - query_bai_field_if_dots_in_name=False + query_bai_field_if_dots_in_name=False, ) if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames: exact_match_queries = [] for field in fieldnames: - 
term_query = \ - {'term': {field: _truncate_date_value_according_on_date_field(field, node.value).dumps()}} + term_query = { + 'term': { + field: _truncate_date_value_according_on_date_field( + field, node.value + ).dumps() + } + } exact_match_queries.append( generate_nested_query(self.DATE_NESTED_QUERY_PATH, term_query) @@ -858,25 +946,40 @@ def visit_exact_match_value(self, node, fieldnames=None): ) elif self._are_fieldnames_author_or_first_author(fieldnames): exact_match_queries = [ - self._generate_nested_author_query({'match_phrase': {field: node.value}}, fieldnames) + self._generate_nested_author_query( + {'match_phrase': {field: node.value}}, fieldnames + ) for field in (bai_fieldnames or fieldnames) ] else: - exact_match_queries = [{'match_phrase': {field: node.value}} for field in (bai_fieldnames or fieldnames)] - query = wrap_queries_in_bool_clauses_if_more_than_one(exact_match_queries, use_must_clause=False) - return wrap_query_in_nested_if_field_is_nested(query, fieldnames[0], self.NESTED_FIELDS) + exact_match_queries = [ + {'match_phrase': {field: node.value}} + for field in (bai_fieldnames or fieldnames) + ] + query = wrap_queries_in_bool_clauses_if_more_than_one( + exact_match_queries, use_must_clause=False + ) + return wrap_query_in_nested_if_field_is_nested( + query, fieldnames[0], self.NESTED_FIELDS + ) - return wrap_queries_in_bool_clauses_if_more_than_one(exact_match_queries, use_must_clause=False) + return wrap_queries_in_bool_clauses_if_more_than_one( + exact_match_queries, use_must_clause=False + ) def visit_partial_match_value(self, node, fieldnames=None): - """Generates a query which looks for a substring of the node's value in the given fieldname.""" + """Generates a query which looks for a substring of the node's value in + the given fieldname.""" if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames: - # Date queries with partial values are transformed into range queries, among the given and the exact + # Date queries with partial values are transformed into range queries, + # among the given and the exact # next date, according to the granularity of the given date. if node.contains_wildcard: return self._generate_date_with_wildcard_query(node.value) - return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value}) + return self._generate_range_queries( + force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value} + ) if self.KEYWORD_TO_ES_FIELDNAME['exact-author'] == fieldnames: return self._generate_exact_author_query(node.value) @@ -888,32 +991,37 @@ def visit_partial_match_value(self, node, fieldnames=None): return self._generate_journal_queries(node.value) # Add wildcard token as prefix and suffix. 
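        # Editorial sketch, not part of the patch: assuming WILDCARD_TOKEN is
        # '*', the wrapping implemented just below only adds the token where it
        # is missing:
        #     'boson'   -> '*boson*'
        #     '*boson'  -> '*boson*'
        #     'boson*'  -> '*boson*'
        #     '*boson*' -> '*boson*'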
- value = \ - ('' if node.value.startswith(ast.GenericValue.WILDCARD_TOKEN) else '*') + \ - node.value + \ - ('' if node.value.endswith(ast.GenericValue.WILDCARD_TOKEN) else '*') + value = ( + ('' if node.value.startswith(ast.GenericValue.WILDCARD_TOKEN) else '*') + + node.value + + ('' if node.value.endswith(ast.GenericValue.WILDCARD_TOKEN) else '*') + ) if self._are_fieldnames_author_or_first_author(fieldnames): bai_fieldnames = self._generate_fieldnames_if_bai_query( fieldnames, node.value, bai_field_variation=FieldVariations.search, - query_bai_field_if_dots_in_name=True + query_bai_field_if_dots_in_name=True, + ) + query = self._generate_query_string_query( + value, fieldnames=bai_fieldnames or fieldnames, analyze_wildcard=True ) - query = self._generate_query_string_query(value, fieldnames=bai_fieldnames or fieldnames, analyze_wildcard=True) return self._generate_nested_author_query(query, fieldnames) - query = self._generate_query_string_query(value, fieldnames, analyze_wildcard=True) - return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS) + query = self._generate_query_string_query( + value, fieldnames, analyze_wildcard=True + ) + return wrap_query_in_nested_if_field_is_nested( + query, fieldnames, self.NESTED_FIELDS + ) def visit_regex_value(self, node, fieldname="_all"): - query = { - 'regexp': { - fieldname: node.value - } - } + query = {'regexp': {fieldname: node.value}} if self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldname: return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query) - return wrap_query_in_nested_if_field_is_nested(query, fieldname, self.NESTED_FIELDS) + return wrap_query_in_nested_if_field_is_nested( + query, fieldname, self.NESTED_FIELDS + ) diff --git a/inspire_query_parser/visitors/restructuring_visitor.py b/inspire_query_parser/visitors/restructuring_visitor.py index bdb2225..d0effc2 100644 --- a/inspire_query_parser/visitors/restructuring_visitor.py +++ b/inspire_query_parser/visitors/restructuring_visitor.py @@ -19,10 +19,9 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - -""" -This module encapsulates the restructuring visitor logic, that receives the output of the parser and converts it to a -more compact and restructured parse tree. +"""This module encapsulates the restructuring visitor logic, that receives the +output of the parser and converts it to a more compact and restructured parse +tree. Additionally, the date specifier conversion handlers logic is defined. 
""" @@ -32,46 +31,64 @@ import logging from inspire_query_parser import ast -from inspire_query_parser.ast import (AndOp, ExactMatchValue, Keyword, - KeywordOp, NotOp, OrOp, - PartialMatchValue, - QueryWithMalformedPart, RegexValue, ValueOp) -from inspire_query_parser.parser import (And, ComplexValue, - SimpleValueBooleanQuery) -from inspire_query_parser.utils.visitor_utils import \ - DATE_SPECIFIERS_CONVERSION_HANDLERS +from inspire_query_parser.ast import ( + AndOp, + ExactMatchValue, + Keyword, + KeywordOp, + NotOp, + OrOp, + PartialMatchValue, + QueryWithMalformedPart, + RegexValue, + ValueOp, +) +from inspire_query_parser.parser import And, ComplexValue, SimpleValueBooleanQuery +from inspire_query_parser.utils.visitor_utils import DATE_SPECIFIERS_CONVERSION_HANDLERS from inspire_query_parser.visitors.visitor_impl import Visitor logger = logging.getLogger(__name__) def _restructure_if_volume_follows_journal(left, right): - """Remove volume node if it follows a journal logically in the tree hierarchy. + """Remove volume node if it follows a journal logically in the tree + hierarchy. - Args: - left (ast.ASTElement): The journal KeywordOp node. - right (ast.ASTElement): The rest of the tree to be restructured. + Args: left (ast.ASTElement): The journal KeywordOp node. right + (ast.ASTElement): The rest of the tree to be restructured. - Return: - (ast.ASTElement): The restructured tree, with the volume node removed. + Return: (ast.ASTElement): The restructured tree, with the volume + node removed. - Notes: - This happens to support queries like "journal Phys.Rev. and vol d85". Appends the value of KeywordOp with - Keyword 'volume' and discards 'volume' KeywordOp node from the tree. + Notes: This happens to support queries like "journal Phys.Rev. + and vol d85". Appends the value of KeywordOp with Keyword + 'volume' and discards 'volume' KeywordOp node from the tree. 
""" + def _get_volume_keyword_op_and_remaining_subtree(right_subtree): - if isinstance(right_subtree, NotOp) and isinstance(right_subtree.op, KeywordOp) \ - and right_subtree.op.left == Keyword('volume'): + if ( + isinstance(right_subtree, NotOp) + and isinstance(right_subtree.op, KeywordOp) + and right_subtree.op.left == Keyword('volume') + ): return None, None - elif isinstance(right_subtree, AndOp) and isinstance(right_subtree.left, NotOp) \ - and isinstance(right_subtree.left.op, KeywordOp) and right_subtree.left.op.left == Keyword('volume'): + elif ( + isinstance(right_subtree, AndOp) + and isinstance(right_subtree.left, NotOp) + and isinstance(right_subtree.left.op, KeywordOp) + and right_subtree.left.op.left == Keyword('volume') + ): return None, right_subtree.right - elif isinstance(right_subtree, KeywordOp) and right_subtree.left == Keyword('volume'): + elif isinstance(right_subtree, KeywordOp) and right_subtree.left == Keyword( + 'volume' + ): return right_subtree, None - elif isinstance(right_subtree, AndOp) and right_subtree.left.left == Keyword('volume'): + elif isinstance(right_subtree, AndOp) and right_subtree.left.left == Keyword( + 'volume' + ): return right_subtree.left, right_subtree.right journal_value = left.right.value @@ -88,14 +105,21 @@ def _get_volume_keyword_op_and_remaining_subtree(right_subtree): def _convert_simple_value_boolean_query_to_and_boolean_queries(tree, keyword): - """Chain SimpleValueBooleanQuery values into chained AndOp queries with the given current Keyword.""" + """Chain SimpleValueBooleanQuery values into chained AndOp queries with the + given current Keyword.""" def _create_operator_node(value_node): """Creates a KeywordOp or a ValueOp node.""" base_node = value_node.op if isinstance(value_node, NotOp) else value_node - updated_base_node = KeywordOp(keyword, base_node) if keyword else ValueOp(base_node) + updated_base_node = ( + KeywordOp(keyword, base_node) if keyword else ValueOp(base_node) + ) - return NotOp(updated_base_node) if isinstance(value_node, NotOp) else updated_base_node + return ( + NotOp(updated_base_node) + if isinstance(value_node, NotOp) + else updated_base_node + ) def _get_bool_op_type(bool_op): return AndOp if isinstance(bool_op, And) else OrOp @@ -119,11 +143,12 @@ def _get_bool_op_type(bool_op): class RestructuringVisitor(Visitor): - """Converts the output of the parser to a more compact and restructured parse tree. + """Converts the output of the parser to a more compact and restructured + parse tree. - Notes: - Compaction, as in removing intermediate nodes, such as Statement, Expression, etc. and restructure, as in, - breaking down a :class:`SimpleValueBooleanQuery` to chained boolean queries. + Notes: Compaction, as in removing intermediate nodes, such as + Statement, Expression, etc. and restructure, as in, breaking + down a :class:`SimpleValueBooleanQuery` to chained boolean queries. """ def _create_not_op(self, node): @@ -134,9 +159,10 @@ def visit_query(self, node): if len(result) == 1: result = result[0] - if isinstance(result, (ast.Value, ast.ExactMatchValue)) \ - or isinstance(result, ast.PartialMatchValue) \ - or isinstance(result, ast.RegexValue): + if isinstance( + result, + (ast.Value, ast.ExactMatchValue, ast.PartialMatchValue, ast.RegexValue), + ): # The only Values that can be standalone queries are the above. 
return ast.ValueOp(result) else: @@ -162,26 +188,34 @@ def visit_boolean_query(self, node): left = node.left.accept(self) right = node.right.accept(self) - is_journal_keyword_op = isinstance(left, KeywordOp) and left.left == Keyword('journal') + is_journal_keyword_op = isinstance(left, KeywordOp) and left.left == Keyword( + 'journal' + ) if is_journal_keyword_op: - journal_and_volume_conjunction = _restructure_if_volume_follows_journal(left, right) + journal_and_volume_conjunction = _restructure_if_volume_follows_journal( + left, right + ) if journal_and_volume_conjunction: return journal_and_volume_conjunction - return AndOp(left, right) if isinstance(node.bool_op, And) else OrOp(left, right) + return ( + AndOp(left, right) if isinstance(node.bool_op, And) else OrOp(left, right) + ) def visit_simple_value_boolean_query(self, node): - """ - Visits only the children of :class:`SimpleValueBooleanQuery` without substituting the actual node type. - - Notes: - Defer conversion from :class:`SimpleValueBooleanQuery` to AndOp or OrOp. - This transformation needs to occur higher in the tree, so that we don't lose the information that this is a - boolean query among terminals and thus the associative rule needs to be applied if we reached here from a - keyword query, or a conversion from :class:`SimpleValueBooleanQuery` to :class:`AndOp` or :class:`OrOp`, - otherwise. + """Visits only the children of :class:`SimpleValueBooleanQuery` without + substituting the actual node type. + + Notes: Defer conversion from + :class:`SimpleValueBooleanQuery` to AndOp or OrOp. This + transformation needs to occur higher in the tree, so that we + don't lose the information that this is a boolean query + among terminals and thus the associative rule needs to be + applied if we reached here from a keyword query, or a + conversion from :class:`SimpleValueBooleanQuery` to + :class:`AndOp` or :class:`OrOp`, otherwise. """ node.left, node.right = node.left.accept(self), node.right.accept(self) return node @@ -192,10 +226,14 @@ def visit_simple_value_negation(self, node): def visit_simple_query(self, node): node = node.op.accept(self) if isinstance(node, SimpleValueBooleanQuery): - # Case in which the node is a simple value boolean query not paired with a keyword query. e.g. 'foo and bar' - return _convert_simple_value_boolean_query_to_and_boolean_queries(node, None) + # Case in which the node is a simple value boolean query not + # paired with a keyword query. e.g. 'foo and bar' + return _convert_simple_value_boolean_query_to_and_boolean_queries( + node, None + ) elif isinstance(node, ast.Value): - # Case in which the node is a SimpleQuery(Value(...)) e.g. for a value query "Ellis" + # Case in which the node is a SimpleQuery(Value(...)) e.g. for + # a value query "Ellis" return ast.ValueOp(node) return node @@ -206,45 +244,55 @@ def visit_not_query(self, node): def visit_spires_keyword_query(self, node): """Transform a :class:`SpiresKeywordQuery` into a :class:`KeywordOp`. - Notes: - In case the value being a :class:`SimpleValueBooleanQuery`, the subtree is transformed to chained - :class:`AndOp` queries containing :class:`KeywordOp`, whose keyword is the keyword of the current node and - values, all the :class:`SimpleValueBooleanQuery` values (either :class:`SimpleValues` or - :class:`SimpleValueNegation`.) 
+ Notes: In case the value being a + :class:`SimpleValueBooleanQuery`, the subtree is transformed to + chained :class:`AndOp` queries containing + :class:`KeywordOp`, whose keyword is the keyword of the current + node and values, all the :class:`SimpleValueBooleanQuery` + values (either :class:`SimpleValues` or + :class:`SimpleValueNegation`.) """ keyword = node.left.accept(self) value = node.right.accept(self) if isinstance(value, SimpleValueBooleanQuery): - return _convert_simple_value_boolean_query_to_and_boolean_queries(value, keyword) + return _convert_simple_value_boolean_query_to_and_boolean_queries( + value, keyword + ) return KeywordOp(keyword, value) def visit_spires_date_keyword_query(self, node): """Transform a :class:`SpiresKeywordQuery` into a :class:`KeywordOp`. - Notes: - In case the value being a :class:`SimpleValueBooleanQuery`, the subtree is transformed to chained - :class:`AndOp` queries containing :class:`KeywordOp`, whose keyword is the keyword of the current node and - values, all the :class:`SimpleValueBooleanQuery` values (either :class:`SimpleValues` or - :class:`SimpleValueNegation`.) + Notes: In case the value being a + :class:`SimpleValueBooleanQuery`, the subtree is transformed to + chained :class:`AndOp` queries containing + :class:`KeywordOp`, whose keyword is the keyword of the current + node and values, all the :class:`SimpleValueBooleanQuery` + values (either :class:`SimpleValues` or + :class:`SimpleValueNegation`.) """ keyword = node.left.accept(self) value = node.right.accept(self) if isinstance(value, SimpleValueBooleanQuery): - return _convert_simple_value_boolean_query_to_and_boolean_queries(value, keyword) + return _convert_simple_value_boolean_query_to_and_boolean_queries( + value, keyword + ) return KeywordOp(keyword, value) def visit_invenio_keyword_query(self, node): """Transform an :class:`InvenioKeywordQuery` into a :class:`KeywordOp`. - Notes: - In case the value being a :class:`SimpleValueBooleanQuery`, the subtree is transformed to chained - :class:`AndOp` queries containing :class:`KeywordOp`, whose keyword is the keyword of the current node and - values, all the :class:`SimpleValueBooleanQuery` values (either :class:`SimpleValues` or - :class:`SimpleValueNegation`.) + Notes: In case the value being a + :class:`SimpleValueBooleanQuery`, the subtree is transformed to + chained :class:`AndOp` queries containing + :class:`KeywordOp`, whose keyword is the keyword of the current + node and values, all the :class:`SimpleValueBooleanQuery` + values (either :class:`SimpleValues` or + :class:`SimpleValueNegation`.) """ try: keyword = node.left.accept(self) @@ -255,7 +303,9 @@ def visit_invenio_keyword_query(self, node): value = node.right.accept(self) if isinstance(value, SimpleValueBooleanQuery): - return _convert_simple_value_boolean_query_to_and_boolean_queries(value, keyword) + return _convert_simple_value_boolean_query_to_and_boolean_queries( + value, keyword + ) return KeywordOp(keyword, value) @@ -274,7 +324,8 @@ def visit_greater_than_op(self, node): def visit_greater_equal_op(self, node): try: value = node.op.accept(self) - except AttributeError: # Case of "100+" format, where 100 is text (and not a SimpleValue). + # Case of "100+" format, where 100 is text (and not a SimpleValue). 
+        except AttributeError:
             value = ast.Value(node.op)
         return ast.GreaterEqualThanOp(value)
@@ -284,7 +335,8 @@ def visit_less_than_op(self, node):
     def visit_less_equal_op(self, node):
         try:
             value = node.op.accept(self)
-        except AttributeError:  # Case of "100-" format where 100 is text (and not a SimpleValue).
+        # Case of "100-" format where 100 is text (and not a SimpleValue).
+        except AttributeError:
             value = ast.Value(node.op)
         return ast.LessEqualThanOp(value)
@@ -299,19 +351,21 @@ def visit_empty_query(self, node):
         return ast.EmptyQuery(None)

     def visit_complex_value(self, node):
-        """Convert :class:`ComplexValue` to one of ExactMatch, PartialMatch and Regex Value nodes."""
+        """Convert :class:`ComplexValue` to one of ExactMatch, PartialMatch and
+        Regex Value nodes."""
         if node.value.startswith(ComplexValue.EXACT_VALUE_TOKEN):
             value = node.value.strip(ComplexValue.EXACT_VALUE_TOKEN)
             return ExactMatchValue(value)
         elif node.value.startswith(ComplexValue.PARTIAL_VALUE_TOKEN):
             value = node.value.strip(ComplexValue.PARTIAL_VALUE_TOKEN)
-            return PartialMatchValue(value, True if ast.GenericValue.WILDCARD_TOKEN in value else False)
+            return PartialMatchValue(value, ast.GenericValue.WILDCARD_TOKEN in value)
         elif node.value.startswith(ComplexValue.REGEX_VALUE_TOKEN):
             return RegexValue(node.value.strip(ComplexValue.REGEX_VALUE_TOKEN))
         else:
-            # Covering the case where ComplexValue supports more than ExactMatch, PartialMatch and Regex values.
+            # Covering the case where ComplexValue supports more than ExactMatch,
+            # PartialMatch and Regex values.
             msg = self.__class__.__name__ + ': Unrecognized complex value'
             try:
                 msg += ' lookahead token: "' + node.value[0] + '"'
@@ -323,15 +377,22 @@ def visit_complex_value(self, node):

     def visit_simple_value(self, node):
         # In case of date specifiers convert relative or text date to normal date.
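        # Editorial sketch, not part of the patch: assuming 'today' is one of
        # the date specifiers handled below, a value such as 'today - 7' matches
        # the 'today' regexp, the remaining ' - 7' suffix is passed to the
        # corresponding conversion handler, and a concrete date string comes
        # back wrapped in ast.Value; anything else falls through to the plain
        # text case at the end of the method.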
- for regexp, date_conversion_handler in DATE_SPECIFIERS_CONVERSION_HANDLERS.items(): + for ( + regexp, + date_conversion_handler, + ) in DATE_SPECIFIERS_CONVERSION_HANDLERS.items(): date_value = node.value regexp_match = regexp.match(node.value) if regexp_match: - relative_date_specifier_suffix = date_value.split(regexp_match.group())[1] - return ast.Value(str(date_conversion_handler(relative_date_specifier_suffix))) + relative_date_specifier_suffix = date_value.split(regexp_match.group())[ + 1 + ] + return ast.Value( + str(date_conversion_handler(relative_date_specifier_suffix)) + ) # Normal text value - return ast.Value(node.value, True if ast.GenericValue.WILDCARD_TOKEN in node.value else False) + return ast.Value(node.value, ast.GenericValue.WILDCARD_TOKEN in node.value) def visit_simple_range_value(self, node): return ast.Value(node.value) @@ -340,12 +401,19 @@ def visit_date_value(self, node): return node.op.accept(self) def visit_simple_date_value(self, node): - for regexp, date_conversion_handler in DATE_SPECIFIERS_CONVERSION_HANDLERS.items(): + for ( + regexp, + date_conversion_handler, + ) in DATE_SPECIFIERS_CONVERSION_HANDLERS.items(): date_value = node.value regexp_match = regexp.match(node.value) if regexp_match: - relative_date_specifier_suffix = date_value.split(regexp_match.group())[1] - return ast.Value(str(date_conversion_handler(relative_date_specifier_suffix))) + relative_date_specifier_suffix = date_value.split(regexp_match.group())[ + 1 + ] + return ast.Value( + str(date_conversion_handler(relative_date_specifier_suffix)) + ) # Normal text value - return ast.Value(node.value, True if ast.GenericValue.WILDCARD_TOKEN in node.value else False) + return ast.Value(node.value, ast.GenericValue.WILDCARD_TOKEN in node.value) diff --git a/inspire_query_parser/visitors/visitor_impl.py b/inspire_query_parser/visitors/visitor_impl.py index b504045..8b52bc9 100644 --- a/inspire_query_parser/visitors/visitor_impl.py +++ b/inspire_query_parser/visitors/visitor_impl.py @@ -19,10 +19,7 @@ # In applying this license, CERN does not waive the privileges and immunities # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. - -""" -Encapsulates visitor pattern logic. 
-""" +"""Encapsulates visitor pattern logic.""" from __future__ import absolute_import, unicode_literals diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..07adf74 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,28 @@ +target-version = "py311" +[lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + "I", + # flake8-tidy-imports + "TID", + # flake8-pytest-style + "PT", +] +ignore = ["B904", "B905"] + +[lint.pycodestyle] +ignore-overlong-task-comments = true + +[lint.pydocstyle] +convention = "google" diff --git a/run-tests.sh b/run-tests.sh index 93f195e..0258522 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -22,5 +22,4 @@ set -e -flake8 inspire_query_parser tests py.test tests diff --git a/setup.py b/setup.py index 9f5d753..f46b12b 100644 --- a/setup.py +++ b/setup.py @@ -22,27 +22,28 @@ """A PEG-based query parser for INSPIRE.""" -import os from setuptools import find_packages, setup - URL = 'https://github.com/inspirehep/inspire-query-parser' -readme = open('README.rst').read() +with open("README.rst") as f: + readme = f.read() setup_requires = [ 'autosemver==0.5.5', ] -install_requires = [ - 'inspire-schemas~=61.0', - 'inspire-utils~=3.0,>=3.0.0', - 'pypeg2~=2.0,>=2.15.2', - 'python-dateutil~=2.0,>=2.6.1', - 'six~=1.0,>=1.11.0', - 'datefinder~=0.7.1' -], +install_requires = ( + [ + 'inspire-schemas~=61.0', + 'inspire-utils~=3.0,>=3.0.0', + 'pypeg2~=2.0,>=2.15.2', + 'python-dateutil~=2.0,>=2.6.1', + 'six~=1.0,>=1.11.0', + 'datefinder~=0.7.1', + ], +) docs_require = [] @@ -53,13 +54,18 @@ 'pytest~=3.0,>=3.2.2', ] +dev_require = [ + "pre-commit==3.5.0", +] + extras_require = { 'docs': docs_require, 'tests': tests_require, + 'dev': dev_require, } extras_require['all'] = [] -for name, reqs in extras_require.items(): +for _name, reqs in extras_require.items(): extras_require['all'].extend(reqs) packages = find_packages(exclude=['docs']) diff --git a/tests/conftest.py b/tests/conftest.py index 3ef134e..31faee6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,15 +38,20 @@ def pytest_assertrepr_compare(op, left, right): if ( - isinstance(left, Query) and isinstance(right, Query) or - isinstance(left, KeywordOp) and isinstance(right, KeywordOp) or - isinstance(left, AndOp) and isinstance(right, AndOp) or - isinstance(left, OrOp) and isinstance(right, OrOp) + isinstance(left, Query) + and isinstance(right, Query) + or isinstance(left, KeywordOp) + and isinstance(right, KeywordOp) + or isinstance(left, AndOp) + and isinstance(right, AndOp) + or isinstance(left, OrOp) + and isinstance(right, OrOp) ) and op == "==": left_parse_tree = emit_tree_format(left).splitlines() right_parse_tree = emit_tree_format(right).splitlines() - return \ - ['that given parse trees are equal:'] \ - + left_parse_tree \ - + ['', "──────── == ────────", ''] \ + return ( + ['that given parse trees are equal:'] + + left_parse_tree + + ['', "──────── == ────────", ''] + right_parse_tree + ) diff --git a/tests/helpers/test_utils.py b/tests/helpers/test_utils.py index ee220e3..44f4859 100644 --- a/tests/helpers/test_utils.py +++ b/tests/helpers/test_utils.py @@ -47,18 +47,25 @@ def parametrize(test_configurations): if not isinstance(test_configurations, dict): __tracebackhide__ = True - pytest.fail('In parametrize test configurations parameter must be a dictionary.') + pytest.fail( + 'In parametrize test configurations parameter must be a dictionary.' 
+ ) ordered_tests_config = OrderedDict(sorted(viewitems(test_configurations))) for test_name, test_configuration in iteritems(ordered_tests_config): - ordered_tests_config[test_name] = OrderedDict(sorted(viewitems(test_configuration))) + ordered_tests_config[test_name] = OrderedDict( + sorted(viewitems(test_configuration)) + ) # Extract arg_names from a test configuration arg_names = list(iterkeys(next(itervalues(ordered_tests_config)))) # Generate list of arg_values - arg_values = [ordered_tests_config[test_config].values() for test_config in ordered_tests_config] + arg_values = [ + ordered_tests_config[test_config].values() + for test_config in ordered_tests_config + ] # Generate ids list ids = list(iterkeys(ordered_tests_config)) diff --git a/tests/test_elastic_search_visitor.py b/tests/test_elastic_search_visitor.py index 73d7138..a848f17 100644 --- a/tests/test_elastic_search_visitor.py +++ b/tests/test_elastic_search_visitor.py @@ -29,10 +29,8 @@ from inspire_query_parser import parse_query, parser from inspire_query_parser.config import ES_MUST_QUERY, ES_SHOULD_QUERY from inspire_query_parser.stateful_pypeg_parser import StatefulParser -from inspire_query_parser.visitors.elastic_search_visitor import \ - ElasticSearchVisitor -from inspire_query_parser.visitors.restructuring_visitor import \ - RestructuringVisitor +from inspire_query_parser.visitors.elastic_search_visitor import ElasticSearchVisitor +from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor def _parse_query(query_str): @@ -302,7 +300,7 @@ def test_elastic_search_visitor_find_journal_title_and_old_style_vol_simple_valu assert generated_es_query == expected_es_query -def test_elastic_search_visitor_find_journal_title_and_vol_and_artid_or_start_page_simple_value(): +def test_elastic_search_visitor_find_journal_title_and_vol_and_artid_or_start_page_simple_value(): # noqa E501 query_str = "j Phys.Lett.B,351,123" expected_es_query = { "bool": { @@ -324,7 +322,9 @@ def test_elastic_search_visitor_find_journal_title_and_vol_and_artid_or_start_pa "should": [ { "match": { - "publication_info.page_start": "123" + "publication_info.page_start": ( + "123" + ) } }, { @@ -774,11 +774,16 @@ def test_elastic_search_visitor_wildcard_journal_search(): 'query': { 'query_string': { 'query': 'Phys.Rev.*', - 'fields': ['publication_info.journal_title','publication_info.journal_volume', 'publication_info.page_start', 'publication_info.artid'], + 'fields': [ + 'publication_info.journal_title', + 'publication_info.journal_volume', + 'publication_info.page_start', + 'publication_info.artid', + ], 'default_operator': 'AND', 'analyze_wildcard': True, } - } + }, } } generated_es_query = _parse_query(query_str) @@ -862,7 +867,7 @@ def test_elastic_search_visitor_with_malformed_query(): "inspire_query_parser.visitors.elastic_search_visitor.DEFAULT_ES_OPERATOR_FOR_MALFORMED_QUERIES", ES_MUST_QUERY, ) -def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_must(): +def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_must(): # noqa E501 query_str = "subject astrophysics and: author:" expected_es_query = { "bool": { @@ -888,7 +893,7 @@ def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malfo "inspire_query_parser.visitors.elastic_search_visitor.DEFAULT_ES_OPERATOR_FOR_MALFORMED_QUERIES", ES_SHOULD_QUERY, ) -def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_should(): +def 
test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_should(): # noqa E501 query_str = "subject astrophysics and author:" expected_es_query = { "bool": { @@ -912,7 +917,7 @@ def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malfo assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_only_year_fields(): +def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_only_year_fields(): # noqa E501 query_str = "date 2000-10" expected_es_query = { "bool": { @@ -958,7 +963,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_o assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_year(): +def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_year(): # noqa E501 query_str = "date 2017-12" expected_es_query = { "bool": { @@ -1004,7 +1009,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_r assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_month(): +def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_month(): # noqa E501 query_str = "date 2017-10-31" expected_es_query = { "bool": { @@ -1062,7 +1067,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_r assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_day(): +def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_day(): # noqa E501 query_str = "date 2000-10-*" expected_es_query = { "bool": { @@ -1108,7 +1113,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_ assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month(): +def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month(): # noqa E501 query_str = "date 2015-*" expected_es_query = { "bool": { @@ -1138,7 +1143,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_ assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_as_month_part(): +def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_as_month_part(): # noqa E501 query_str = "date 2015-1*" expected_es_query = { "bool": { @@ -1168,7 +1173,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_ assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_one_query_date_multi_field_and_wildcard_infix_generates_to_all_field(): +def test_elastic_search_visitor_with_one_query_date_multi_field_and_wildcard_infix_generates_to_all_field(): # noqa E501 query_str = "date: 2017-*-12" expected_es_query = { "multi_match": { @@ -1182,7 +1187,7 @@ def test_elastic_search_visitor_with_one_query_date_multi_field_and_wildcard_inf assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_two_queries_date_multi_field_and_wildcard_infix_drops_date(): +def test_elastic_search_visitor_with_two_queries_date_multi_field_and_wildcard_infix_drops_date(): # noqa E501 query_str = "date: 2017-*-12 and title collider" expected_es_query 
= { "bool": { @@ -1203,7 +1208,7 @@ def test_elastic_search_visitor_with_two_queries_date_multi_field_and_wildcard_i assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_year_drops_date_query(): +def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_year_drops_date_query(): # noqa E501 query_str = "date 201* and title collider" expected_es_query = { "bool": { @@ -1224,7 +1229,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_ assert generated_es_query == expected_es_query -def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month_drops_date_query(): +def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month_drops_date_query(): # noqa E501 query_str = "date 2000-*-01 and title collider" expected_es_query = { "bool": { @@ -1407,8 +1412,9 @@ def test_elastic_search_visitor_with_date_multi_field_and_range_op(): def test_elastic_search_visitor_with_date_multi_field_range_within_same_year(): - # This kind of query works fine (regarding the ``publication_info.year``), since the range operator is including - # its bounds, otherwise we would get no records. + # This kind of query works fine (regarding the ``publication_info.year``), + # since the range operator is including its bounds, + # otherwise we would get no records. query_str = "date 2000-01->2000-04" expected_es_query = { "bool": { @@ -1726,7 +1732,7 @@ def test_elastic_search_visitor_handles_first_author_bai_exact_value(): assert generated_es_query == expected_es_query -def test_elastic_search_visitor_handles_partial_match_value_with_bai_value_and_partial_bai_value(): +def test_elastic_search_visitor_handles_partial_match_value_with_bai_value_and_partial_bai_value(): # noqa E501 query_str = "a 'A.Einstein.1' and a 'S.Mele'" expected_es_query = { "bool": { @@ -1813,7 +1819,7 @@ def test_elastic_search_visitor_handles_wildcard_simple_and_partial_bai_like_que assert generated_es_query == expected_es_query -def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_author_name_contains_dot_and_no_spaces(): +def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_author_name_contains_dot_and_no_spaces(): # noqa E501 query_str = "a S.Mele" expected_es_query = { "nested": { @@ -1833,7 +1839,7 @@ def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_author_n assert generated_es_query == expected_es_query -def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_first_author_name_contains_dot_and_no_spaces(): +def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_first_author_name_contains_dot_and_no_spaces(): # noqa E501 query_str = "fa S.Mele" expected_es_query = { "nested": { @@ -1856,14 +1862,14 @@ def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_first_au assert generated_es_query == expected_es_query -def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_comma_and_dot(): +def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_comma_and_dot(): # noqa E501 query_str = "a gava,e." 
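# Editorial note, not part of the patch: the regression tests around here pin
# down the BAI heuristic. A dotted, space-free value such as 'S.Mele' may also
# query the BAI field ('authors.ids.value.search'), whereas values containing a
# comma, a leading or trailing dot, or spaces, like 'gava,e.', 'mele.', '.mele'
# and 'S. Mele' below, must not.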
generated_es_query = _parse_query(query_str) assert ElasticSearchVisitor.AUTHORS_BAI_FIELD not in str(generated_es_query) -def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_comma_and_dot(): +def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_comma_and_dot(): # noqa E501 query_str = "fa gava,e." generated_es_query = _parse_query(query_str) @@ -1872,14 +1878,14 @@ def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_cont ) -def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_trailing_dot(): +def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_trailing_dot(): # noqa E501 query_str = "a mele." generated_es_query = _parse_query(query_str) assert ElasticSearchVisitor.AUTHORS_BAI_FIELD not in str(generated_es_query) -def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_trailing_dot(): +def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_trailing_dot(): # noqa E501 query_str = "fa mele." generated_es_query = _parse_query(query_str) @@ -1888,14 +1894,14 @@ def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_cont ) -def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_prefix_dot(): +def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_prefix_dot(): # noqa E501 query_str = "a .mele" generated_es_query = _parse_query(query_str) assert ElasticSearchVisitor.AUTHORS_BAI_FIELD not in str(generated_es_query) -def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_prefix_dot(): +def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_prefix_dot(): # noqa E501 query_str = "fa .mele" generated_es_query = _parse_query(query_str) @@ -1904,7 +1910,7 @@ def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_cont ) -def test_elastic_search_visitor_does_not_query_bai_field_if_name_contains_dot_and_spaces(): +def test_elastic_search_visitor_does_not_query_bai_field_if_name_contains_dot_and_spaces(): # noqa E501 query_str = "a S. Mele" bai_field = "authors.ids.value.search" @@ -1912,7 +1918,7 @@ def test_elastic_search_visitor_does_not_query_bai_field_if_name_contains_dot_an assert bai_field not in str(generated_es_query) -def test_elastic_search_visitor_does_not_query_bai_field_if_fa_name_contains_dot_and_spaces(): +def test_elastic_search_visitor_does_not_query_bai_field_if_fa_name_contains_dot_and_spaces(): # noqa E501 query_str = "fa S. 
Mele" bai_field = "first_author.ids.value.search" @@ -2006,7 +2012,7 @@ def test_elastic_search_visitor_with_word_and_symbol_containing_unicode_characte assert generated_es_query == expected_es_query -def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_document_type(): +def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_document_type(): # noqa E501 query_str = "tc c" expected_es_query = { "match": {"document_type": {"query": "conference paper", "operator": "and"}} @@ -2016,7 +2022,7 @@ def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_doc assert generated_es_query == expected_es_query -def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_publication_type(): +def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_publication_type(): # noqa E501 query_str = "tc i" expected_es_query = { "match": {"publication_type": {"query": "introductory", "operator": "and"}} @@ -2042,7 +2048,7 @@ def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_ref assert generated_es_query == expected_es_query -def test_elastic_search_visitor_type_code_with_unknown_value_searches_both_document_and_publication_type_fields(): +def test_elastic_search_visitor_type_code_with_unknown_value_searches_both_document_and_publication_type_fields(): # noqa E501 query_str = "tc note" expected_es_query = { "bool": { @@ -2058,7 +2064,7 @@ def test_elastic_search_visitor_type_code_with_unknown_value_searches_both_docum assert generated_es_query == expected_es_query -def test_elastic_search_visitor_type_code_with_known_exact_value_mapping_and_query_refereed(): +def test_elastic_search_visitor_type_code_with_known_exact_value_mapping_and_query_refereed(): # noqa E501 query_str = 'tc "p"' expected_es_query = {"match": {"refereed": True}} @@ -2066,7 +2072,7 @@ def test_elastic_search_visitor_type_code_with_known_exact_value_mapping_and_que assert generated_es_query == expected_es_query -def test_elastic_search_visitor_type_code_with_known_partial_value_mapping_and_query_refereed(): +def test_elastic_search_visitor_type_code_with_known_partial_value_mapping_and_query_refereed(): # noqa E501 query_str = "tc 'p'" expected_es_query = {"match": {"refereed": True}} @@ -2406,7 +2412,9 @@ def test_elastic_search_visitor_find_journal_with_year(): "should": [ { "match": { - "publication_info.page_start": "112" + "publication_info.page_start": ( + "112" + ) } }, { @@ -2461,7 +2469,9 @@ def test_regression_query_with_multiple_dots(): { "match": { "_all": { - "query": "references.reference.dois:10.7483/OPENDATA.CMS.ATLAS", + "query": ( + "references.reference.dois:10.7483/OPENDATA.CMS.ATLAS" + ), "operator": "and", } } @@ -2764,7 +2774,9 @@ def test_first_author_query_with_full_name(): { "match_phrase_prefix": { "first_author.first_name": { - "analyzer": "names_analyzer", + "analyzer": ( + "names_analyzer" + ), "query": "John", } } @@ -2772,7 +2784,7 @@ def test_first_author_query_with_full_name(): { "match": { "first_author.first_name": { - "analyzer": "names_initials_analyzer", + "analyzer": "names_initials_analyzer", # noqa E501 "operator": "AND", "query": "John", } @@ -3097,7 +3109,9 @@ def test_journal_title_variants_regression(): "should": [ { "match": { - "publication_info.page_start": "015" + "publication_info.page_start": ( + "015" + ) } }, { @@ -3141,12 +3155,9 @@ def test_journal_title_variants_regression_complex_journal_title(): def test_elastic_search_visitor_fulltext(): query_str = "fulltext 
FCC" - expected_es_query = {'match':{ - 'documents.attachment.content': { - 'query': 'FCC', - 'operator': 'and' - } - }} + expected_es_query = { + 'match': {'documents.attachment.content': {'query': 'FCC', 'operator': 'and'}} + } generated_es_query = _parse_query(query_str) assert expected_es_query == generated_es_query @@ -3160,18 +3171,11 @@ def test_elastic_search_visitor_fulltext_and_other_field(): 'match': { 'documents.attachment.content': { 'query': 'something', - 'operator': 'and' + 'operator': 'and', } } }, - { - 'match': { - 'titles.full_title': { - 'query': 'boson', - 'operator': 'and' - } - } - } + {'match': {'titles.full_title': {'query': 'boson', 'operator': 'and'}}}, ] } } @@ -3186,7 +3190,7 @@ def test_elastic_search_visitor_partial_match_fulltext(): 'query': '*this is a test*', 'fields': ['documents.attachment.content'], 'default_operator': 'AND', - 'analyze_wildcard': True + 'analyze_wildcard': True, } } generated_es_query = _parse_query(query_str) @@ -3200,7 +3204,7 @@ def test_elastic_search_visitor_citedby(): "self.$ref.raw": { "index": "records-hep", "id": "123456", - "path": "references.record.$ref.raw" + "path": "references.record.$ref.raw", } } } @@ -3218,18 +3222,11 @@ def test_elastic_search_visitor_complex_query(): "self.$ref.raw": { "index": "records-hep", "id": "123456", - "path": "references.record.$ref.raw" + "path": "references.record.$ref.raw", } } }, - { - "match": { - "titles.full_title": { - "query": "Test", - "operator": "and" - } - } - } + {"match": {"titles.full_title": {"query": "Test", "operator": "and"}}}, ] } } @@ -3239,11 +3236,7 @@ def test_elastic_search_visitor_complex_query(): def test_elastic_search_visitor_texkeys_regression(): query_str = "texkey Chen:2014cwa" - expected_es_query = { - "match": { - "texkeys.raw": "Chen:2014cwa" - } - } + expected_es_query = {"match": {"texkeys.raw": "Chen:2014cwa"}} generated_es_query = _parse_query(query_str) assert generated_es_query == expected_es_query @@ -3253,11 +3246,7 @@ def test_elastic_search_visitor_texkeys_regression_bool_query(): expected_es_query = { "bool": { "must": [ - { - "match": { - "texkeys.raw": "Chen:2014cwa" - } - }, + {"match": {"texkeys.raw": "Chen:2014cwa"}}, { "nested": { "path": "authors", @@ -3268,15 +3257,15 @@ def test_elastic_search_visitor_texkeys_regression_bool_query(): "match": { "authors.last_name": { "query": "Moskovic", - "operator": "AND" + "operator": "AND", } } } ] } - } + }, } - } + }, ] } } diff --git a/tests/test_format_parse_tree.py b/tests/test_format_parse_tree.py index 7d81915..e093f6d 100644 --- a/tests/test_format_parse_tree.py +++ b/tests/test_format_parse_tree.py @@ -22,18 +22,37 @@ from __future__ import absolute_import, unicode_literals -from inspire_query_parser.parser import (Expression, InvenioKeywordQuery, - Query, SimpleQuery, SimpleValue, - Statement, Value) +from inspire_query_parser.parser import ( + Expression, + InvenioKeywordQuery, + Query, + SimpleQuery, + SimpleValue, + Statement, + Value, +) from inspire_query_parser.utils.format_parse_tree import emit_tree_format def test_format_parse_tree_handles_unicode_values(): - parse_tree = Query([Statement(Expression(SimpleQuery(Value(SimpleValue('γ-radiation')))))]) + parse_tree = Query( + [Statement(Expression(SimpleQuery(Value(SimpleValue('γ-radiation')))))] + ) assert emit_tree_format(parse_tree, verbose=True) def test_format_parse_tree_handles_unicode_nodes(): - parse_tree = Query([Statement(Expression(SimpleQuery(InvenioKeywordQuery('unicode-keyword-φοο', - 
Value(SimpleValue('γ-radiation'))))))]) + parse_tree = Query( + [ + Statement( + Expression( + SimpleQuery( + InvenioKeywordQuery( + 'unicode-keyword-φοο', Value(SimpleValue('γ-radiation')) + ) + ) + ) + ) + ] + ) assert emit_tree_format(parse_tree, verbose=True) diff --git a/tests/test_parser.py b/tests/test_parser.py index 879f0b6..9c0905a 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -22,19 +22,22 @@ from __future__ import print_function, unicode_literals +from test_utils import parametrize + from inspire_query_parser.parser import SimpleValue, SimpleValueUnit from inspire_query_parser.stateful_pypeg_parser import StatefulParser -from test_utils import parametrize # Test parse terminal token -def test_that_parse_terminal_token_does_accept_keywords_if_parsing_parenthesized_terminal_flag_is_on(): +def test_that_parse_terminal_token_does_accept_keywords_if_parsing_parenthesized_terminal_flag_is_on(): # noqa E501 query_str = 'and' parser = StatefulParser() parser._parsing_parenthesized_terminal = True - returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(parser, query_str) + returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token( + parser, query_str + ) assert returned_unrecognised_text == '' assert returned_result == query_str @@ -44,7 +47,9 @@ def test_that_parse_terminal_token_does_not_accept_token_followed_by_colon(): parser = StatefulParser() - returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(parser, query_str) + returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token( + parser, query_str + ) assert isinstance(returned_result, SyntaxError) assert returned_unrecognised_text == query_str @@ -54,7 +59,9 @@ def test_that_parse_terminal_token_accepts_non_shortened_inspire_keywords(): parser = StatefulParser() - returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(parser, query_str) + returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token( + parser, query_str + ) assert returned_result == query_str assert returned_unrecognised_text == "" @@ -66,58 +73,60 @@ def test_that_parse_terminal_token_accepts_non_shortened_inspire_keywords(): 'Date specifiers arithmetic: today': { 'query_str': 'today - 2', 'unrecognized_text': '', - 'result': SimpleValueUnit('today - 2') + 'result': SimpleValueUnit('today - 2'), }, 'Date specifiers arithmetic: yesterday': { 'query_str': 'yesterday - 365', 'unrecognized_text': '', - 'result': SimpleValueUnit('yesterday - 365') + 'result': SimpleValueUnit('yesterday - 365'), }, 'Date specifiers arithmetic: this month': { 'query_str': 'this month - 1', 'unrecognized_text': '', - 'result': SimpleValueUnit('this month - 1') + 'result': SimpleValueUnit('this month - 1'), }, 'Date specifiers arithmetic: last month': { 'query_str': 'last month-1', 'unrecognized_text': '', - 'result': SimpleValueUnit('last month-1') + 'result': SimpleValueUnit('last month-1'), }, 'Date specifier w/o arithmetic (followed by a query)': { 'query_str': 'today - a', 'unrecognized_text': ' - a', - 'result': SimpleValueUnit('today') + 'result': SimpleValueUnit('today'), }, - # Basic tokens 'Simple token': { 'query_str': 'foo', 'unrecognized_text': '', - 'result': SimpleValueUnit('foo') + 'result': SimpleValueUnit('foo'), }, 'Unicode token': { 'query_str': 'γ-radiation', 'unrecognized_text': '', - 'result': SimpleValueUnit('γ-radiation') + 'result': SimpleValueUnit('γ-radiation'), }, # Tokens separated by 
whitespace, don't get recognized by SimpleValueUnit.
        'Many tokens (whitespace separated)': {
            'query_str': 'foo bar',
            'unrecognized_text': ' bar',
-            'result': SimpleValueUnit('foo')
+            'result': SimpleValueUnit('foo'),
        },
    }
)
def test_simple_value_unit_accepted_tokens(query_str, unrecognized_text, result):
    parser = StatefulParser()
-    returned_unrecognised_text, returned_result = SimpleValueUnit.parse(parser, query_str, None)
-    if type(result) != SyntaxError:
+    returned_unrecognised_text, returned_result = SimpleValueUnit.parse(
+        parser, query_str, None
+    )
+    if not isinstance(result, SyntaxError):
         assert returned_unrecognised_text == unrecognized_text
         assert returned_result == result
     else:
         assert returned_unrecognised_text == unrecognized_text
-        assert isinstance(returned_result, SyntaxError) and result.msg == result.msg
+        assert isinstance(returned_result, SyntaxError)
+        assert returned_result.msg == result.msg


 @parametrize(
@@ -125,32 +134,35 @@ def test_simple_value_unit_accepted_tokens(query_str, unrecognized_text, result)
     'Multiple whitespace-separated tokens': {
         'query_str': 'foo bar',
         'unrecognized_text': '',
-        'result': SimpleValue('foo bar')
+        'result': SimpleValue('foo bar'),
     },
     'Plaintext with parentheses': {
         'query_str': 'foo(a)',
         'unrecognized_text': '',
-        'result': SimpleValue('foo(a)')
+        'result': SimpleValue('foo(a)'),
     },
     'Plaintext with keywords (or keyword symbols +/-/|) in parentheses': {
         'query_str': '(and)',
         'unrecognized_text': '',
-        'result': SimpleValue('(and)')
+        'result': SimpleValue('(and)'),
     },
     'Plaintext with colons in the first word': {
         'query_str': 'foo:bar baz:quux',
         'unrecognized_text': 'baz:quux',
-        'result': SimpleValue('foo:bar')
+        'result': SimpleValue('foo:bar'),
     },
 }
 )
 def test_simple_value_accepted_tokens(query_str, unrecognized_text, result):
     parser = StatefulParser()
-    returned_unrecognised_text, returned_result = SimpleValue.parse(parser, query_str, None)
-    if type(result) != SyntaxError:
+    returned_unrecognised_text, returned_result = SimpleValue.parse(
+        parser, query_str, None
+    )
+    if not isinstance(result, SyntaxError):
         assert returned_unrecognised_text == unrecognized_text
         assert returned_result == result
     else:
         assert returned_unrecognised_text == unrecognized_text
-        assert isinstance(returned_result, SyntaxError) and result.msg == result.msg
+        assert isinstance(returned_result, SyntaxError)
+        assert returned_result.msg == result.msg
diff --git a/tests/test_parser_functionality.py b/tests/test_parser_functionality.py
index d0f9dec..a27c79f 100644
--- a/tests/test_parser_functionality.py
+++ b/tests/test_parser_functionality.py
@@ -24,26 +24,44 @@

 import pytest

-from inspire_query_parser.parser import (And, BooleanQuery, ComplexValue,
-                                         DateValue, EmptyQuery, Expression,
-                                         GreaterEqualOp, GreaterThanOp,
-                                         InspireDateKeyword, InspireKeyword,
-                                         InvenioKeywordQuery, LessEqualOp,
-                                         LessThanOp, MalformedQueryWords,
-                                         NestedKeywordQuery, NotQuery, Or,
-                                         ParenthesizedQuery, Query, RangeOp,
-                                         SimpleDateValue, SimpleQuery,
-                                         SimpleRangeValue, SimpleValue,
-                                         SimpleValueBooleanQuery,
-                                         SpiresDateKeywordQuery,
-                                         SpiresKeywordQuery, Statement, Value)
+from inspire_query_parser.parser import (
+    And,
+    BooleanQuery,
+    ComplexValue,
+    DateValue,
+    EmptyQuery,
+    Expression,
+    GreaterEqualOp,
+    GreaterThanOp,
+    InspireDateKeyword,
+    InspireKeyword,
+    InvenioKeywordQuery,
+    LessEqualOp,
+    LessThanOp,
+    MalformedQueryWords,
+    NestedKeywordQuery,
+    NotQuery,
+    Or,
+    ParenthesizedQuery,
+    Query,
+    RangeOp,
+    SimpleDateValue,
+    SimpleQuery,
+    SimpleRangeValue,
+    SimpleValue,
+    
SimpleValueBooleanQuery, + SpiresDateKeywordQuery, + SpiresKeywordQuery, + Statement, + Value, +) from inspire_query_parser.stateful_pypeg_parser import StatefulParser # TODO Reformat parentheses around parametrize entries @pytest.mark.parametrize( - ["query_str", "expected_parse_tree"], + ("query_str", "expected_parse_tree"), { ( "date nov 2020 12", @@ -807,7 +825,10 @@ ), ), ( - "author ellis, j. and not (title boson or not (author /^xi$/ and title foo))", + ( + "author ellis, j. and not (title boson or not (author /^xi$/ and title" + " foo))" + ), Query( [ Statement( @@ -1061,7 +1082,8 @@ ] ), ), - # Parenthesized keyword query values (working also with SPIRES operators - doesn't on legacy) + # Parenthesized keyword query values (working also with + # SPIRES operators - doesn't on legacy) ( "author:(title ellis)", Query( @@ -1173,7 +1195,10 @@ ), ), ( - "find title Alternative the Phase-II upgrade of the ATLAS Inner Detector or na61/shine", + ( + "find title Alternative the Phase-II upgrade of the ATLAS Inner" + " Detector or na61/shine" + ), Query( [ Statement( @@ -1184,7 +1209,8 @@ Value( SimpleValueBooleanQuery( SimpleValue( - "Alternative the Phase-II upgrade of the ATLAS Inner Detector" + "Alternative the Phase-II upgrade of" + " the ATLAS Inner Detector" ), Or(), SimpleValue("na61/shine"), diff --git a/tests/test_parsing_driver.py b/tests/test_parsing_driver.py index 6a8de30..ec9782f 100644 --- a/tests/test_parsing_driver.py +++ b/tests/test_parsing_driver.py @@ -31,10 +31,7 @@ def test_driver_with_simple_query(): query_str = 'subject astrophysics' expected_es_query = { "match": { - "facet_inspire_categories": { - "query": "astrophysics", - "operator": "and" - } + "facet_inspire_categories": {"query": "astrophysics", "operator": "and"} } } @@ -50,7 +47,7 @@ def test_driver_with_nothing_recognized(mocked_parser): 'multi_match': { 'query': 'unrecognized query', 'fields': ['_all'], - 'zero_terms_query': 'all' + 'zero_terms_query': 'all', } } @@ -68,7 +65,7 @@ def test_driver_with_syntax_error(mocked_parser): 'multi_match': { 'query': 'query with syntax error', 'fields': ['_all'], - 'zero_terms_query': 'all' + 'zero_terms_query': 'all', } } @@ -83,13 +80,11 @@ def test_driver_with_syntax_error(mocked_parser): def test_driver_with_rst_visitor_error(mocked_rst_visitor): query_str = 'foo' expected_es_query = { - 'multi_match': { - 'query': 'foo', - 'fields': ['_all'], - 'zero_terms_query': 'all' - } + 'multi_match': {'query': 'foo', 'fields': ['_all'], 'zero_terms_query': 'all'} } - mocked_rst_visitor.return_value.visit.side_effect = Exception('Something went wrong with visit_value') + mocked_rst_visitor.return_value.visit.side_effect = Exception( + 'Something went wrong with visit_value' + ) mocked_rst_visitor.__name__ = 'MockedRestructuringVisitor' es_query = parse_query(query_str) @@ -101,13 +96,11 @@ def test_driver_with_rst_visitor_error(mocked_rst_visitor): def test_driver_with_es_visitor_error(mocked_es_visitor): query_str = 'foo' expected_es_query = { - 'multi_match': { - 'query': 'foo', - 'fields': ['_all'], - 'zero_terms_query': 'all' - } + 'multi_match': {'query': 'foo', 'fields': ['_all'], 'zero_terms_query': 'all'} } - mocked_es_visitor.return_value.visit.side_effect = Exception('Something went wrong with visit_value') + mocked_es_visitor.return_value.visit.side_effect = Exception( + 'Something went wrong with visit_value' + ) mocked_es_visitor.__name__ = 'MockedElasticSearchVisitor' es_query = parse_query(query_str) @@ -121,7 +114,7 @@ def 
test_driver_with_es_visitor_empty_query_generates_a_query_against_all(): 'multi_match': { 'query': 'd < 200', 'fields': ['_all'], - 'zero_terms_query': 'all' + 'zero_terms_query': 'all', } } diff --git a/tests/test_restructuring_visitor.py b/tests/test_restructuring_visitor.py index 928c571..5ce337a 100644 --- a/tests/test_restructuring_visitor.py +++ b/tests/test_restructuring_visitor.py @@ -28,121 +28,141 @@ from dateutil.relativedelta import relativedelta from inspire_query_parser import parser -from inspire_query_parser.ast import (AndOp, EmptyQuery, ExactMatchValue, - GreaterEqualThanOp, GreaterThanOp, - Keyword, KeywordOp, LessEqualThanOp, - LessThanOp, MalformedQuery, - NestedKeywordOp, NotOp, OrOp, - PartialMatchValue, - QueryWithMalformedPart, RangeOp, - RegexValue, Value, ValueOp) +from inspire_query_parser.ast import ( + AndOp, + EmptyQuery, + ExactMatchValue, + GreaterEqualThanOp, + GreaterThanOp, + Keyword, + KeywordOp, + LessEqualThanOp, + LessThanOp, + MalformedQuery, + NestedKeywordOp, + NotOp, + OrOp, + PartialMatchValue, + QueryWithMalformedPart, + RangeOp, + RegexValue, + Value, + ValueOp, +) from inspire_query_parser.stateful_pypeg_parser import StatefulParser -from inspire_query_parser.visitors.restructuring_visitor import \ - RestructuringVisitor +from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor @pytest.mark.parametrize( - ['query_str', 'expected_parse_tree'], + ('query_str', 'expected_parse_tree'), [ # Find keyword combined with other production rules - ('FIN author:\'ellis\'', KeywordOp(Keyword('author'), PartialMatchValue('ellis'))), + ( + 'FIN author:\'ellis\'', + KeywordOp(Keyword('author'), PartialMatchValue('ellis')), + ), ('Find author "ellis"', KeywordOp(Keyword('author'), ExactMatchValue('ellis'))), ('f author ellis', KeywordOp(Keyword('author'), Value('ellis'))), - # Invenio like search ( 'author:ellis and title:boson', AndOp( KeywordOp(Keyword('author'), Value('ellis')), - KeywordOp(Keyword('title'), Value('boson')) - ) - ), - ('unknown_keyword:\'bar\'', KeywordOp(Keyword('unknown_keyword'), PartialMatchValue('bar'))), - ('dotted.keyword:\'bar\'', KeywordOp(Keyword('dotted.keyword'), PartialMatchValue('bar'))), - + KeywordOp(Keyword('title'), Value('boson')), + ), + ), + ( + 'unknown_keyword:\'bar\'', + KeywordOp(Keyword('unknown_keyword'), PartialMatchValue('bar')), + ), + ( + 'dotted.keyword:\'bar\'', + KeywordOp(Keyword('dotted.keyword'), PartialMatchValue('bar')), + ), # Boolean operator testing (And/Or) ( 'author ellis and title \'boson\'', AndOp( KeywordOp(Keyword('author'), Value('ellis')), - KeywordOp(Keyword('title'), PartialMatchValue('boson')) - ) - ), + KeywordOp(Keyword('title'), PartialMatchValue('boson')), + ), + ), ( 'f a appelquist and date 1983', AndOp( KeywordOp(Keyword('author'), Value('appelquist')), - KeywordOp(Keyword('date'), Value('1983')) - ) - ), + KeywordOp(Keyword('date'), Value('1983')), + ), + ), ( 'fin a henneaux and citedby a nicolai', AndOp( KeywordOp(Keyword('author'), Value('henneaux')), - NestedKeywordOp(Keyword('citedby'), KeywordOp(Keyword('author'), Value('nicolai')))) - ), + NestedKeywordOp( + Keyword('citedby'), KeywordOp(Keyword('author'), Value('nicolai')) + ), + ), + ), ( 'au ellis | title \'boson\'', OrOp( KeywordOp(Keyword('author'), Value('ellis')), - KeywordOp(Keyword('title'), PartialMatchValue('boson')) - ) - ), + KeywordOp(Keyword('title'), PartialMatchValue('boson')), + ), + ), ( '-author ellis OR title \'boson\'', OrOp( NotOp(KeywordOp(Keyword('author'), 
Value('ellis'))), - KeywordOp(Keyword('title'), PartialMatchValue('boson')) - ) - ), + KeywordOp(Keyword('title'), PartialMatchValue('boson')), + ), + ), ( 'author ellis & title \'boson\'', AndOp( KeywordOp(Keyword('author'), Value('ellis')), - KeywordOp(Keyword('title'), PartialMatchValue('boson')) - ) - ), - + KeywordOp(Keyword('title'), PartialMatchValue('boson')), + ), + ), # Implicit And ( 'author ellis elastic.keyword:\'boson\'', AndOp( KeywordOp(Keyword('author'), Value('ellis')), - KeywordOp(Keyword('elastic.keyword'), PartialMatchValue('boson')) - ) - ), + KeywordOp(Keyword('elastic.keyword'), PartialMatchValue('boson')), + ), + ), ( 'find cn atlas not tc c', AndOp( KeywordOp(Keyword('collaboration'), Value('atlas')), - NotOp(KeywordOp(Keyword('type-code'), Value('c'))) - ) - ), + NotOp(KeywordOp(Keyword('type-code'), Value('c'))), + ), + ), ( 'author:ellis j title:\'boson\' reference:M.N.1', AndOp( KeywordOp(Keyword('author'), Value('ellis j')), AndOp( KeywordOp(Keyword('title'), PartialMatchValue('boson')), - KeywordOp(Keyword('cite'), Value('M.N.1')) - ) - ) - ), + KeywordOp(Keyword('cite'), Value('M.N.1')), + ), + ), + ), ( 'author ellis - title \'boson\'', AndOp( KeywordOp(Keyword('author'), Value('ellis')), - NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))) - ) - ), + NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))), + ), + ), ( - 'topcite 2+ and skands', - AndOp( - KeywordOp(Keyword('topcite'), GreaterEqualThanOp(Value('2'))), - ValueOp(Value('skands')) - ) + 'topcite 2+ and skands', + AndOp( + KeywordOp(Keyword('topcite'), GreaterEqualThanOp(Value('2'))), + ValueOp(Value('skands')), + ), ), - # ##### Boolean operators at terminals level #### ( 'author ellis title:boson not higgs', @@ -150,21 +170,22 @@ KeywordOp(Keyword('author'), Value('ellis')), AndOp( KeywordOp(Keyword('title'), Value('boson')), - NotOp(KeywordOp(Keyword('title'), Value('higgs'))) - ) - ) - ), - + NotOp(KeywordOp(Keyword('title'), Value('higgs'))), + ), + ), + ), # Negation ( 'ellis and not title \'boson\'', AndOp( ValueOp(Value('ellis')), - NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))) - ) - ), - ('-title \'boson\'', NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson')))), - + NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))), + ), + ), + ( + '-title \'boson\'', + NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))), + ), # Nested expressions ( 'author ellis, j. and (title boson or (author /^xi$/ and title foo))', @@ -174,13 +195,16 @@ KeywordOp(Keyword('title'), Value('boson')), AndOp( KeywordOp(Keyword('author'), RegexValue('^xi$')), - KeywordOp(Keyword('title'), Value('foo')) - ) - ) - ) - ), + KeywordOp(Keyword('title'), Value('foo')), + ), + ), + ), + ), ( - 'author ellis, j. and not (title boson or not (author /^xi$/ and title foo))', + ( + 'author ellis, j. 
and not (title boson or not (author /^xi$/ and title' + ' foo))' + ), AndOp( KeywordOp(Keyword('author'), Value('ellis, j.')), NotOp( @@ -189,14 +213,13 @@ NotOp( AndOp( KeywordOp(Keyword('author'), RegexValue('^xi$')), - KeywordOp(Keyword('title'), Value('foo')) + KeywordOp(Keyword('title'), Value('foo')), ) - ) + ), ) - ) - ) - ), - + ), + ), + ), # Metadata search ( 'refersto:1347300 and (reference:Ellis or reference "Ellis")', @@ -204,41 +227,37 @@ NestedKeywordOp(Keyword('refersto'), ValueOp(Value('1347300'))), OrOp( KeywordOp(Keyword('cite'), Value('Ellis')), - KeywordOp(Keyword('cite'), ExactMatchValue('Ellis')) - ) - ) + KeywordOp(Keyword('cite'), ExactMatchValue('Ellis')), + ), + ), ), ( 'exactauthor:M.Vanderhaeghen.1 and ac: 42', AndOp( KeywordOp(Keyword('exact-author'), Value('M.Vanderhaeghen.1')), - KeywordOp(Keyword('author-count'), Value('42')) - ) + KeywordOp(Keyword('author-count'), Value('42')), + ), ), - # Simple phrases ('ellis', ValueOp(Value('ellis'))), ('\'ellis\'', ValueOp(PartialMatchValue('ellis'))), ('(ellis and smith)', AndOp(ValueOp(Value('ellis')), ValueOp(Value('smith')))), - - # Parenthesized keyword query values (working also with SPIRES operators - doesn't on legacy) - ( - 'author:(title ellis)', KeywordOp(Keyword('author'), Value('title ellis')) - ), + # Parenthesized keyword query values (working also with SPIRES operators - + # doesn't on legacy) + ('author:(title ellis)', KeywordOp(Keyword('author'), Value('title ellis'))), ( 'author (pardo, f AND slavich) OR (author:bernreuther and not date:2017)', OrOp( AndOp( KeywordOp(Keyword('author'), Value('pardo, f')), - KeywordOp(Keyword('author'), Value('slavich')) + KeywordOp(Keyword('author'), Value('slavich')), ), AndOp( KeywordOp(Keyword('author'), Value('bernreuther')), - NotOp(KeywordOp(Keyword('date'), Value('2017'))) - ) - ) - ), - + NotOp(KeywordOp(Keyword('date'), Value('2017'))), + ), + ), + ), # Non trivial terminals ( 'author smith and not j., ellis or foo', @@ -246,301 +265,363 @@ KeywordOp(Keyword('author'), Value('smith')), OrOp( NotOp(KeywordOp(Keyword('author'), Value('j., ellis'))), - KeywordOp(Keyword('author'), Value('foo')) - ) - ) - ), + KeywordOp(Keyword('author'), Value('foo')), + ), + ), + ), ( - 'find title Alternative the Phase-II upgrade of the ATLAS Inner Detector or na61/shine', + ( + 'find title Alternative the Phase-II upgrade of the ATLAS Inner' + ' Detector or na61/shine' + ), OrOp( - KeywordOp(Keyword('title'), Value('Alternative the Phase-II upgrade of the ATLAS Inner Detector')), - KeywordOp(Keyword('title'), Value('na61/shine')) - ) - ), + KeywordOp( + Keyword('title'), + Value( + 'Alternative the Phase-II upgrade of the ATLAS Inner Detector' + ), + ), + KeywordOp(Keyword('title'), Value('na61/shine')), + ), + ), ( 'find (j phys.rev. 
and vol d85) or (j phys.rev.lett.,62,1825)', OrOp( KeywordOp(Keyword('journal'), Value('phys.rev.,d85')), - KeywordOp(Keyword('journal'), Value('phys.rev.lett.,62,1825')) - ) - ), + KeywordOp(Keyword('journal'), Value('phys.rev.lett.,62,1825')), + ), + ), ( "title e-10 and -author d'hoker", AndOp( KeywordOp(Keyword('title'), Value('e-10')), - NotOp(KeywordOp(Keyword('author'), Value('d\'hoker'))) - ) - ), + NotOp(KeywordOp(Keyword('author'), Value('d\'hoker'))), + ), + ), ( 'a pang,yi and t SU(2)', AndOp( KeywordOp(Keyword('author'), Value('pang,yi')), - KeywordOp(Keyword('title'), Value('SU(2)')) - ) - ), + KeywordOp(Keyword('title'), Value('SU(2)')), + ), + ), ( 't e(+)e(-) or e+e- Colliders', OrOp( KeywordOp(Keyword('title'), Value('e(+)e(-)')), - KeywordOp(Keyword('title'), Value('e+e- Colliders')) - ) + KeywordOp(Keyword('title'), Value('e+e- Colliders')), + ), + ), + ( + 'title: Si-28(p(pol.),n(pol.))', + KeywordOp(Keyword('title'), Value('Si-28(p(pol.),n(pol.))')), + ), + ( + 't Si28(p→,p→′)Si28(6−,T=1)', + KeywordOp(Keyword('title'), Value('Si28(p→,p→′)Si28(6−,T=1)')), + ), + ( + 't C-12(vec-p,vec-n)N-12 (g.s.,1+)', + KeywordOp(Keyword('title'), Value('C-12(vec-p,vec-n)N-12 (g.s.,1+)')), ), - ('title: Si-28(p(pol.),n(pol.))', KeywordOp(Keyword('title'), Value('Si-28(p(pol.),n(pol.))'))), - ('t Si28(p→,p→′)Si28(6−,T=1)', KeywordOp(Keyword('title'), Value('Si28(p→,p→′)Si28(6−,T=1)'))), - ('t C-12(vec-p,vec-n)N-12 (g.s.,1+)', KeywordOp(Keyword('title'), Value('C-12(vec-p,vec-n)N-12 (g.s.,1+)'))), - # Regex - ('author:/^Ellis, (J|John)$/', KeywordOp(Keyword('author'), RegexValue('^Ellis, (J|John)$'))), - ('title:/dense ([^ $]* )?matter/', KeywordOp(Keyword('title'), RegexValue('dense ([^ $]* )?matter'))), - + ( + 'author:/^Ellis, (J|John)$/', + KeywordOp(Keyword('author'), RegexValue('^Ellis, (J|John)$')), + ), + ( + 'title:/dense ([^ $]* )?matter/', + KeywordOp(Keyword('title'), RegexValue('dense ([^ $]* )?matter')), + ), # Nestable keywords ( 'referstox:author:s.p.martin.1', - NestedKeywordOp(Keyword('referstox'), KeywordOp(Keyword('author'), Value('s.p.martin.1'))) - ), + NestedKeywordOp( + Keyword('referstox'), + KeywordOp(Keyword('author'), Value('s.p.martin.1')), + ), + ), ( 'find a parke, s j and refersto author witten', AndOp( KeywordOp(Keyword('author'), Value('parke, s j')), - NestedKeywordOp(Keyword('refersto'), KeywordOp(Keyword('author'), Value('witten'))) - ) - ), + NestedKeywordOp( + Keyword('refersto'), KeywordOp(Keyword('author'), Value('witten')) + ), + ), + ), ( 'citedbyx:author:s.p.martin.1', - NestedKeywordOp(Keyword('citedbyx'), KeywordOp(Keyword('author'), Value('s.p.martin.1'))) - ), + NestedKeywordOp( + Keyword('citedbyx'), KeywordOp(Keyword('author'), Value('s.p.martin.1')) + ), + ), ( 'citedby:author:s.p.martin.1', - NestedKeywordOp(Keyword('citedby'), KeywordOp(Keyword('author'), Value('s.p.martin.1'))) - ), + NestedKeywordOp( + Keyword('citedby'), KeywordOp(Keyword('author'), Value('s.p.martin.1')) + ), + ), ( '-refersto:recid:1374998 and citedby:(A.A.Aguilar.Arevalo.1)', AndOp( - NotOp(NestedKeywordOp(Keyword('refersto'), KeywordOp(Keyword('control_number'), Value('1374998')))), - NestedKeywordOp(Keyword('citedby'), ValueOp(Value('A.A.Aguilar.Arevalo.1'))) - ) - ), + NotOp( + NestedKeywordOp( + Keyword('refersto'), + KeywordOp(Keyword('control_number'), Value('1374998')), + ) + ), + NestedKeywordOp( + Keyword('citedby'), ValueOp(Value('A.A.Aguilar.Arevalo.1')) + ), + ), + ), ( 'citedby:(author A.A.Aguilar.Arevalo.1 and not a ellis)', NestedKeywordOp( 
Keyword('citedby'), AndOp( KeywordOp(Keyword('author'), Value('A.A.Aguilar.Arevalo.1')), - NotOp(KeywordOp(Keyword('author'), Value('ellis'))) - ) - ) + NotOp(KeywordOp(Keyword('author'), Value('ellis'))), + ), + ), ), ( 'citedby:refersto:recid:1432705', NestedKeywordOp( Keyword('citedby'), - NestedKeywordOp(Keyword('refersto'), KeywordOp(Keyword('control_number'), Value('1432705'))) - ) - ), - + NestedKeywordOp( + Keyword('refersto'), + KeywordOp(Keyword('control_number'), Value('1432705')), + ), + ), + ), # Ranges ( - 'd 2015->2017 and cited:1->9', - AndOp( - KeywordOp(Keyword("date"), RangeOp(Value('2015'), Value('2017'))), - KeywordOp(Keyword('topcite'), RangeOp(Value('1'), Value('9'))) - ) - ), - + 'd 2015->2017 and cited:1->9', + AndOp( + KeywordOp(Keyword("date"), RangeOp(Value('2015'), Value('2017'))), + KeywordOp(Keyword('topcite'), RangeOp(Value('1'), Value('9'))), + ), + ), # Empty query ('', EmptyQuery()), (' ', EmptyQuery()), - # G, GE, LT, LE, E queries ( - 'date > 2000-10 and date < 2000-12', - AndOp( - KeywordOp(Keyword('date'), GreaterThanOp(Value('2000-10'))), - KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))) - ) - ), + 'date > 2000-10 and date < 2000-12', + AndOp( + KeywordOp(Keyword('date'), GreaterThanOp(Value('2000-10'))), + KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))), + ), + ), ( - 'date after 10/2000 and date before 2000-12', - AndOp( - KeywordOp(Keyword('date'), GreaterThanOp(Value('10/2000'))), - KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))) - ) - ), + 'date after 10/2000 and date before 2000-12', + AndOp( + KeywordOp(Keyword('date'), GreaterThanOp(Value('10/2000'))), + KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))), + ), + ), ( 'date >= nov 2000 and d<=2005', AndOp( KeywordOp(Keyword('date'), GreaterEqualThanOp(Value('nov 2000'))), - KeywordOp(Keyword('date'), LessEqualThanOp(Value('2005'))) - ) + KeywordOp(Keyword('date'), LessEqualThanOp(Value('2005'))), + ), ), ( 'date 1978+ + -ac 100+', AndOp( - KeywordOp(Keyword('date'), GreaterEqualThanOp(Value('1978'))), - NotOp(KeywordOp(Keyword('author-count'), GreaterEqualThanOp(Value('100')))) - ) - ), + KeywordOp(Keyword('date'), GreaterEqualThanOp(Value('1978'))), + NotOp( + KeywordOp(Keyword('author-count'), GreaterEqualThanOp(Value('100'))) + ), + ), + ), ( 'f a wimpenny and date = 1987', AndOp( KeywordOp(Keyword('author'), Value('wimpenny')), - KeywordOp(Keyword('date'), Value('1987'))) - ), - + KeywordOp(Keyword('date'), Value('1987')), + ), + ), # Date specifiers ( 'date today - 2 and title foo', AndOp( - KeywordOp(Keyword('date'), Value(str(date.today() - timedelta(days=2)))), - KeywordOp(Keyword('title'), Value('foo')) - ) - ), + KeywordOp( + Keyword('date'), Value(str(date.today() - timedelta(days=2))) + ), + KeywordOp(Keyword('title'), Value('foo')), + ), + ), ( 'date today - 0 and title foo', AndOp( KeywordOp(Keyword('date'), Value(str(date.today()))), - KeywordOp(Keyword('title'), Value('foo')) - ) - ), + KeywordOp(Keyword('title'), Value('foo')), + ), + ), ( 'date today - title foo', AndOp( KeywordOp(Keyword('date'), Value(str(date.today()))), - NotOp(KeywordOp(Keyword('title'), Value('foo'))) - ) - ), + NotOp(KeywordOp(Keyword('title'), Value('foo'))), + ), + ), ( 'date this month and author ellis', AndOp( KeywordOp(Keyword('date'), Value(str(date.today()))), - KeywordOp(Keyword('author'), Value('ellis')) - ) - ), + KeywordOp(Keyword('author'), Value('ellis')), + ), + ), ( 'date this month - 3 and author ellis', AndOp( - KeywordOp(Keyword('date'), 
Value(str(date.today() - relativedelta(months=3)))), - KeywordOp(Keyword('author'), Value('ellis')) - ) - ), + KeywordOp( + Keyword('date'), Value(str(date.today() - relativedelta(months=3))) + ), + KeywordOp(Keyword('author'), Value('ellis')), + ), + ), ( 'date yesterday - 2 - ac 100', AndOp( - KeywordOp(Keyword('date'), - Value(str(date.today() - relativedelta(days=3)))), - NotOp(KeywordOp(Keyword('author-count'), Value('100'))) - ) - ), + KeywordOp( + Keyword('date'), Value(str(date.today() - relativedelta(days=3))) + ), + NotOp(KeywordOp(Keyword('author-count'), Value('100'))), + ), + ), ( pytest.param( 'date last month - 2 + ac < 50', AndOp( - KeywordOp(Keyword('date'), Value(str((date.today() - relativedelta(months=3))))), - KeywordOp(Keyword('author-count'), LessThanOp(Value('50'))) + KeywordOp( + Keyword('date'), + Value(str((date.today() - relativedelta(months=3)))), + ), + KeywordOp(Keyword('author-count'), LessThanOp(Value('50'))), + ), + marks=pytest.mark.xfail( + reason="doesn't work on 31st of the month, see INSPIR-2882" ), - marks=pytest.mark.xfail(reason="doesn't work on 31st of the month, see INSPIR-2882") ) - ), + ), ( 'du > yesterday - 2', KeywordOp( Keyword('date-updated'), - GreaterThanOp(Value(str((date.today() - relativedelta(days=3))))) - ) - ), - + GreaterThanOp(Value(str((date.today() - relativedelta(days=3))))), + ), + ), # Wildcard queries ( 'find a \'o*aigh\' and t "alge*" and date >2013', AndOp( - KeywordOp(Keyword('author'), PartialMatchValue('o*aigh', contains_wildcard=True)), + KeywordOp( + Keyword('author'), + PartialMatchValue('o*aigh', contains_wildcard=True), + ), AndOp( - KeywordOp(Keyword('title'), ExactMatchValue('alge*' - - )), - KeywordOp(Keyword('date'), GreaterThanOp(Value('2013'))) - ) - ) - ), + KeywordOp(Keyword('title'), ExactMatchValue('alge*')), + KeywordOp(Keyword('date'), GreaterThanOp(Value('2013'))), + ), + ), + ), ( 'a *alge | a alge* | a o*aigh', OrOp( KeywordOp(Keyword('author'), Value('*alge', contains_wildcard=True)), OrOp( - KeywordOp(Keyword('author'), Value('alge*', contains_wildcard=True)), - KeywordOp(Keyword('author'), Value('o*aigh', contains_wildcard=True)) - ) - ) - ), + KeywordOp( + Keyword('author'), Value('alge*', contains_wildcard=True) + ), + KeywordOp( + Keyword('author'), Value('o*aigh', contains_wildcard=True) + ), + ), + ), + ), ( 'find texkey Hirata:1992*', - KeywordOp(Keyword('texkeys.raw'), Value('Hirata:1992*', contains_wildcard=True)) + KeywordOp( + Keyword('texkeys.raw'), Value('Hirata:1992*', contains_wildcard=True) + ), ), - # Queries for implicit "and" removal ('title and foo', AndOp(ValueOp(Value('title')), ValueOp(Value('foo')))), ('author takumi doi', KeywordOp(Keyword('author'), Value('takumi doi'))), ( 'title cms and title experiment and date 2008', AndOp( - KeywordOp(Keyword('title'), Value('cms')), - AndOp( - KeywordOp(Keyword('title'), Value('experiment')), - KeywordOp(Keyword('date'), Value('2008')) - ) - ) + KeywordOp(Keyword('title'), Value('cms')), + AndOp( + KeywordOp(Keyword('title'), Value('experiment')), + KeywordOp(Keyword('date'), Value('2008')), + ), + ), ), ( 'author:witten title:foo', AndOp( KeywordOp(Keyword('author'), Value('witten')), - KeywordOp(Keyword('title'), Value('foo')) - ) + KeywordOp(Keyword('title'), Value('foo')), + ), ), - # Unrecognized queries ( 'title γ-radiation and and', QueryWithMalformedPart( KeywordOp(Keyword('title'), Value('γ-radiation')), - MalformedQuery(['and', 'and']) - ) - ), - ('find j Nucl.Phys.,A531,11', KeywordOp(Keyword('journal'), 
Value('Nucl.Phys.,A531,11'))), + MalformedQuery(['and', 'and']), + ), + ), + ( + 'find j Nucl.Phys.,A531,11', + KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A531,11')), + ), ( 'find j Nucl.Phys. and j Nucl.Phys.', AndOp( KeywordOp(Keyword('journal'), Value('Nucl.Phys.')), - KeywordOp(Keyword('journal'), Value('Nucl.Phys.')) - ) + KeywordOp(Keyword('journal'), Value('Nucl.Phys.')), + ), ), ( 'find j Nucl.Phys. and vol A351 and author ellis', AndOp( KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A351')), - KeywordOp(Keyword('author'), Value('ellis')) - ) + KeywordOp(Keyword('author'), Value('ellis')), + ), ), ( - 'find j Nucl.Phys. and vol A351 and author ellis and author smith and ea john', + ( + 'find j Nucl.Phys. and vol A351 and author ellis and author smith and' + ' ea john' + ), AndOp( KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A351')), AndOp( KeywordOp(Keyword('author'), Value('ellis')), AndOp( KeywordOp(Keyword('author'), Value('smith')), - KeywordOp(Keyword('exact-author'), Value('john')) - ) - ) - ) + KeywordOp(Keyword('exact-author'), Value('john')), + ), + ), + ), + ), + ( + 'find j Nucl.Phys. and vol A531', + KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A531')), ), - ('find j Nucl.Phys. and vol A531', KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A531'))), ( 'find j Nucl.Phys. and author ellis', AndOp( KeywordOp(Keyword('journal'), Value('Nucl.Phys.')), - KeywordOp(Keyword('author'), Value('ellis')) - ) + KeywordOp(Keyword('author'), Value('ellis')), + ), ), ( 'find author ellis and j Nucl.Phys. and vol B351 and title Collider', @@ -548,21 +629,14 @@ KeywordOp(Keyword('author'), Value('ellis')), AndOp( KeywordOp(Keyword('journal'), Value('Nucl.Phys.,B351')), - KeywordOp(Keyword('title'), Value('Collider')) - ) - ) + KeywordOp(Keyword('title'), Value('Collider')), + ), + ), ), ( - 'find author ellis and j Nucl.Phys. and vol B351 and title Collider', - AndOp( - KeywordOp(Keyword('author'), Value('ellis')), - AndOp( - KeywordOp(Keyword('journal'), Value('Nucl.Phys.,B351')), - KeywordOp(Keyword('title'), Value('Collider')) - ) - ) + 'find j Nucl.Phys. and not vol A531', + KeywordOp(Keyword('journal'), Value('Nucl.Phys.')), ), - ('find j Nucl.Phys. 
and not vol A531', KeywordOp(Keyword('journal'), Value('Nucl.Phys.'))), # regression with date keyword followed by string not containing date ( "find da Silva and j Nucl.Phys.", @@ -577,11 +651,11 @@ KeywordOp(Keyword('journal'), Value('Nucl.Phys.')), AndOp( KeywordOp(Keyword('author'), Value('ellis')), - KeywordOp(Keyword('author'), Value('john')) - ) - ) - ) - ] + KeywordOp(Keyword('author'), Value('john')), + ), + ), + ), + ], ) def test_restructuring_visitor_functionality(query_str, expected_parse_tree): print("Parsing: " + query_str) @@ -600,28 +674,27 @@ def test_foo_bar(): restructuring_visitor = RestructuringVisitor() _, parse_tree = stateful_parser.parse(query_str, parser.Query) parse_tree = parse_tree.accept(restructuring_visitor) - expected_parse_tree = AndOp(KeywordOp(Keyword('journal'), Value('Nucl.Phys.')), - KeywordOp(Keyword('author'), Value('ellis'))) + expected_parse_tree = AndOp( + KeywordOp(Keyword('journal'), Value('Nucl.Phys.')), + KeywordOp(Keyword('author'), Value('ellis')), + ) assert parse_tree == expected_parse_tree @pytest.mark.parametrize( - ['query_str', 'expected_parse_tree'], + ('query_str', 'expected_parse_tree'), [ ( 'sungtae cho or 1301.7261', - OrOp( - ValueOp(Value('sungtae cho')), - ValueOp(Value('1301.7261')) - ) + OrOp(ValueOp(Value('sungtae cho')), ValueOp(Value('1301.7261'))), ), ( 'raffaele d\'agnolo and not cn cms', AndOp( ValueOp(Value('raffaele d\'agnolo')), - NotOp(KeywordOp(Keyword('collaboration'), Value('cms'))) - ) + NotOp(KeywordOp(Keyword('collaboration'), Value('cms'))), + ), ), ('a kondrashuk', KeywordOp(Keyword('author'), Value('kondrashuk'))), ('a r.j.hill.1', KeywordOp(Keyword('author'), Value('r.j.hill.1'))), @@ -630,29 +703,31 @@ def test_foo_bar(): OrOp( KeywordOp(Keyword('author'), Value('fileviez perez,p')), KeywordOp(Keyword('author'), Value('p. f. perez')), - ) + ), ), ( 'a espinosa,jose r and not a rodriguez espinosa', AndOp( KeywordOp(Keyword('author'), Value('espinosa,jose r')), NotOp(KeywordOp(Keyword('author'), Value('rodriguez espinosa'))), - ) + ), ), ( 'a nilles,h and not tc I', AndOp( KeywordOp(Keyword('author'), Value('nilles,h')), NotOp(KeywordOp(Keyword('type-code'), Value('I'))), - ) + ), ), ( - 'a rojo,j. or rojo-chacon,j. and not collaboration pierre auger ' - 'and not collaboration auger and not t auger and tc p', + ( + 'a rojo,j. or rojo-chacon,j. 
and not collaboration pierre auger ' + 'and not collaboration auger and not t auger and tc p' + ), AndOp( OrOp( KeywordOp(Keyword('author'), Value('rojo,j.')), - KeywordOp(Keyword('author'), Value('rojo-chacon,j.')) + KeywordOp(Keyword('author'), Value('rojo-chacon,j.')), ), AndOp( NotOp(KeywordOp(Keyword('collaboration'), Value('pierre auger'))), @@ -660,45 +735,69 @@ def test_foo_bar(): NotOp(KeywordOp(Keyword('collaboration'), Value('auger'))), AndOp( NotOp(KeywordOp(Keyword('title'), Value('auger'))), - KeywordOp(Keyword('type-code'), Value('p')) - ) - ) - ) - ) + KeywordOp(Keyword('type-code'), Value('p')), + ), + ), + ), + ), ), - ('ea wu, xing gang', KeywordOp(Keyword('exact-author'), Value('wu, xing gang'))), - ('abstract: part*', KeywordOp(Keyword('abstract'), Value('part*', contains_wildcard=True))), ( - "(author:'Hiroshi Okada' OR (author:'H Okada' hep-ph) OR " - "title: 'Dark matter in supersymmetric U(1(B-L) model' OR " - "title: 'Non-Abelian discrete symmetry for flavors')", + 'ea wu, xing gang', + KeywordOp(Keyword('exact-author'), Value('wu, xing gang')), + ), + ( + 'abstract: part*', + KeywordOp(Keyword('abstract'), Value('part*', contains_wildcard=True)), + ), + ( + ( + "(author:'Hiroshi Okada' OR (author:'H Okada' hep-ph) OR " + "title: 'Dark matter in supersymmetric U(1(B-L) model' OR " + "title: 'Non-Abelian discrete symmetry for flavors')" + ), OrOp( KeywordOp(Keyword('author'), PartialMatchValue('Hiroshi Okada')), OrOp( AndOp( KeywordOp(Keyword('author'), PartialMatchValue('H Okada')), - ValueOp(Value('hep-ph')) + ValueOp(Value('hep-ph')), ), OrOp( - KeywordOp(Keyword('title'), PartialMatchValue('Dark matter in supersymmetric U(1(B-L) model')), - KeywordOp(Keyword('title'), PartialMatchValue('Non-Abelian discrete symmetry for flavors')), - ) - ) - ) + KeywordOp( + Keyword('title'), + PartialMatchValue( + 'Dark matter in supersymmetric U(1(B-L) model' + ), + ), + KeywordOp( + Keyword('title'), + PartialMatchValue( + 'Non-Abelian discrete symmetry for flavors' + ), + ), + ), + ), + ), ), ( 'author:"Takayanagi, Tadashi" or hep-th/0010101', OrOp( KeywordOp(Keyword('author'), ExactMatchValue('Takayanagi, Tadashi')), - ValueOp(Value('hep-th/0010101')) - ) + ValueOp(Value('hep-th/0010101')), + ), ), ('ea:matt visser', KeywordOp(Keyword('exact-author'), Value('matt visser'))), ( 'citedby:recid:902780', - NestedKeywordOp(Keyword('citedby'), KeywordOp(Keyword('control_number'), Value('902780'))) + NestedKeywordOp( + Keyword('citedby'), + KeywordOp(Keyword('control_number'), Value('902780')), + ), + ), + ( + 'eprint:arxiv:1706.04080', + KeywordOp(Keyword('eprint'), Value('arxiv:1706.04080')), ), - ('eprint:arxiv:1706.04080', KeywordOp(Keyword('eprint'), Value('arxiv:1706.04080'))), ('eprint:1706.04080', KeywordOp(Keyword('eprint'), Value('1706.04080'))), ( 'f a ostapchenko not olinto not haungs', @@ -706,18 +805,29 @@ def test_foo_bar(): KeywordOp(Keyword('author'), Value('ostapchenko')), AndOp( NotOp(KeywordOp(Keyword('author'), Value('olinto'))), - NotOp(KeywordOp(Keyword('author'), Value('haungs'))) - ) - ) + NotOp(KeywordOp(Keyword('author'), Value('haungs'))), + ), + ), ), ('find cc italy', KeywordOp(Keyword('country'), Value('italy'))), - ('fin date > today', KeywordOp(Keyword('date'), GreaterThanOp(Value(str(date.today()))))), - ('find r atlas-conf-*', KeywordOp(Keyword('reportnumber'), Value('atlas-conf-*', contains_wildcard=True))), + ( + 'fin date > today', + KeywordOp(Keyword('date'), GreaterThanOp(Value(str(date.today())))), + ), + ( + 'find r atlas-conf-*', + 
KeywordOp( + Keyword('reportnumber'), Value('atlas-conf-*', contains_wildcard=True) + ), + ), ( 'find caption "Diagram for the fermion flow violating process"', - KeywordOp(Keyword('caption'), ExactMatchValue('Diagram for the fermion flow violating process')) - ) - ] + KeywordOp( + Keyword('caption'), + ExactMatchValue('Diagram for the fermion flow violating process'), + ), + ), + ], ) def test_parsing_output_with_inspire_next_tests(query_str, expected_parse_tree): print("Parsing: " + query_str) @@ -730,32 +840,30 @@ def test_parsing_output_with_inspire_next_tests(query_str, expected_parse_tree): def test_convert_simple_value_boolean_query_to_and_boolean_queries(): - parse_tree = \ - parser.SimpleQuery( - parser.SpiresKeywordQuery( - parser.InspireKeyword('author'), - parser.Value( + parse_tree = parser.SimpleQuery( + parser.SpiresKeywordQuery( + parser.InspireKeyword('author'), + parser.Value( + parser.SimpleValueBooleanQuery( + parser.SimpleValue('foo'), + parser.And(), parser.SimpleValueBooleanQuery( - parser.SimpleValue('foo'), - parser.And(), - parser.SimpleValueBooleanQuery( - parser.SimpleValue('bar'), - parser.Or(), - parser.SimpleValueNegation(parser.SimpleValue('foobar')) - ) - ) + parser.SimpleValue('bar'), + parser.Or(), + parser.SimpleValueNegation(parser.SimpleValue('foobar')), + ), ) - ) + ), ) + ) - expected_parse_tree = \ - AndOp( - KeywordOp(Keyword('author'), Value('foo')), - OrOp( - KeywordOp(Keyword('author'), Value('bar')), - NotOp(KeywordOp(Keyword('author'), Value('foobar'))) - ) - ) + expected_parse_tree = AndOp( + KeywordOp(Keyword('author'), Value('foo')), + OrOp( + KeywordOp(Keyword('author'), Value('bar')), + NotOp(KeywordOp(Keyword('author'), Value('foobar'))), + ), + ) restructuring_visitor = RestructuringVisitor() parse_tree = parse_tree.accept(restructuring_visitor) diff --git a/tests/test_visitor_utils.py b/tests/test_visitor_utils.py index 3b72602..b0ce7a1 100644 --- a/tests/test_visitor_utils.py +++ b/tests/test_visitor_utils.py @@ -22,7 +22,8 @@ from __future__ import absolute_import, print_function, unicode_literals -from pytest import raises +import pytest +from test_utils import parametrize from inspire_query_parser.utils.visitor_utils import ( _truncate_wildcard_from_date, @@ -34,23 +35,18 @@ wrap_query_in_nested_if_field_is_nested, ) -from test_utils import parametrize - -@parametrize({ - 'Name with full name parts': { - 'name': 'mele salvatore', 'expected_answer': True - }, - 'Lastname only': { - 'name': 'mele', 'expected_answer': False - }, - 'Lastname, initial(Firstname)': { - 'name': 'mele s', 'expected_answer': False - }, - 'Lastname, initial(Firstname).': { - 'name': 'mele s.', 'expected_answer': False - }, -}) +@parametrize( + { + 'Name with full name parts': { + 'name': 'mele salvatore', + 'expected_answer': True, + }, + 'Lastname only': {'name': 'mele', 'expected_answer': False}, + 'Lastname, initial(Firstname)': {'name': 'mele s', 'expected_answer': False}, + 'Lastname, initial(Firstname).': {'name': 'mele s.', 'expected_answer': False}, + } +) def test_author_name_contains_fullnames(name, expected_answer): assert expected_answer == author_name_contains_fullnames(name) @@ -94,7 +90,7 @@ def test_generate_minimal_name_variations_with_dotted_initial(): assert expected_variations == set(generate_minimal_name_variations(name)) -def test_generate_minimal_name_variations_without_dotted_initial_doesnt_generate_same_variation(): +def test_generate_minimal_name_variations_without_dotted_initial_doesnt_generate_same_variation(): # noqa E501 name = 
'Oz, Y' expected_variations = { 'oz y', @@ -108,7 +104,7 @@ def test_generate_minimal_name_variations_without_dotted_initial_doesnt_generate assert expected_variations == set(result) -def test_generate_minimal_name_variations_with_initial_strips_multiple_consecutive_whitespace(): +def test_generate_minimal_name_variations_with_initial_strips_multiple_consecutive_whitespace(): # noqa E501 name = 'oz,y' expected_variations = { 'oz y', @@ -132,63 +128,61 @@ def test_generate_minimal_name_variations_with_dashed_lastname(): assert expected_variations == generate_minimal_name_variations(name) -@parametrize({ - 'Wildcard as whole day': { - 'date': '2018-01-*', 'expected_date': '2018-01' - }, - 'Wildcard as part of the day': { - 'date': '2018-01-1*', 'expected_date': '2018-01' - }, - 'Wildcard as whole day (space separated)': { - 'date': '2018 01 *', 'expected_date': '2018-01' - }, - 'Wildcard as part of the day (space separated)': { - 'date': '2018 01 1*', 'expected_date': '2018-01' - }, - - 'Wildcard as whole month': { - 'date': '2018-*', 'expected_date': '2018' - }, - 'Wildcard as part of the month': { - 'date': '2018-*', 'expected_date': '2018' - }, - 'Wildcard as whole month (space separated)': { - 'date': '2018 *', 'expected_date': '2018' - }, - 'Wildcard as part of the month (space separated)': { - 'date': '2018 1*', 'expected_date': '2018' - }, -}) +@parametrize( + { + 'Wildcard as whole day': {'date': '2018-01-*', 'expected_date': '2018-01'}, + 'Wildcard as part of the day': { + 'date': '2018-01-1*', + 'expected_date': '2018-01', + }, + 'Wildcard as whole day (space separated)': { + 'date': '2018 01 *', + 'expected_date': '2018-01', + }, + 'Wildcard as part of the day (space separated)': { + 'date': '2018 01 1*', + 'expected_date': '2018-01', + }, + 'Wildcard as whole month': {'date': '2018-*', 'expected_date': '2018'}, + 'Wildcard as part of the month': {'date': '2018-*', 'expected_date': '2018'}, + 'Wildcard as whole month (space separated)': { + 'date': '2018 *', + 'expected_date': '2018', + }, + 'Wildcard as part of the month (space separated)': { + 'date': '2018 1*', + 'expected_date': '2018', + }, + } +) def test_truncate_wildcard_from_date_with_wildcard(date, expected_date): assert _truncate_wildcard_from_date(date) == expected_date def test_truncate_wildcard_from_date_throws_on_wildcard_in_year(): date = '201*' - with raises(ValueError): + with pytest.raises(ValueError, match='Erroneous date value:'): _truncate_wildcard_from_date(date) def test_truncate_wildcard_from_date_throws_with_unsupported_separator(): date = '2018_1*' - with raises(ValueError): + with pytest.raises(ValueError, match='Erroneous date value:'): _truncate_wildcard_from_date(date) def test_generate_match_query_with_bool_value(): generated_match_query = generate_match_query('core', True, with_operator_and=True) - expected_match_query = { - 'match': { - 'core': True - } - } + expected_match_query = {'match': {'core': True}} assert generated_match_query == expected_match_query def test_generate_match_query_with_operator_and(): - generated_match_query = generate_match_query('author', 'Ellis, John', with_operator_and=True) + generated_match_query = generate_match_query( + 'author', 'Ellis, John', with_operator_and=True + ) expected_match_query = { 'match': { @@ -203,13 +197,11 @@ def test_generate_match_query_with_operator_and(): def test_generate_match_query_with_operator_and_false(): - generated_match_query = generate_match_query('document_type', 'book', with_operator_and=False) + generated_match_query = 
generate_match_query(
+        'document_type', 'book', with_operator_and=False
+    )

-    expected_match_query = {
-        'match': {
-            'document_type': 'book'
-        }
-    }
+    expected_match_query = {'match': {'document_type': 'book'}}

     assert generated_match_query == expected_match_query

@@ -220,8 +212,9 @@ def test_wrap_queries_in_bool_clauses_if_more_than_one_with_two_queries():
         {'match': {'subject': 'hep'}},
     ]

-    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
-                                                                          use_must_clause=True)
+    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+        queries, use_must_clause=True
+    )

     expected_bool_clause = {
         'bool': {
@@ -235,58 +228,54 @@

     assert generated_bool_clause == expected_bool_clause


-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_drops_bool_clause_with_flag_disabled():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_drops_bool_clause_with_flag_disabled():  # noqa: E501
     queries = [
         {'match': {'title': 'collider'}},
     ]

-    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
-                                                                          use_must_clause=True)
+    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+        queries, use_must_clause=True
+    )

     expected_bool_clause = {'match': {'title': 'collider'}}

     assert generated_bool_clause == expected_bool_clause


-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_preserves_bool_clause_with_flag_enabled():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_preserves_bool_clause_with_flag_enabled():  # noqa: E501
     queries = [
         {'match': {'title': 'collider'}},
     ]

-    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
-                                                                          use_must_clause=True,
-                                                                          preserve_bool_semantics_if_one_clause=True)
+    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+        queries, use_must_clause=True, preserve_bool_semantics_if_one_clause=True
+    )

-    expected_bool_clause = {
-        'bool': {
-            'must': [
-                {'match': {'title': 'collider'}}
-            ]
-        }
-    }
+    expected_bool_clause = {'bool': {'must': [{'match': {'title': 'collider'}}]}}

     assert generated_bool_clause == expected_bool_clause


-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_no_query_returns_empty_dict():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_no_query_returns_empty_dict():  # noqa: E501
     queries = []

-    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
-                                                                          use_must_clause=True)
+    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+        queries, use_must_clause=True
+    )

     expected_bool_clause = {}

     assert generated_bool_clause == expected_bool_clause


-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_generates_should_clause():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_generates_should_clause():  # noqa: E501
     queries = [
         {'match': {'title': 'collider'}},
     ]

-    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
-                                                                          use_must_clause=False,
-                                                                          preserve_bool_semantics_if_one_clause=True)
+    generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+        queries, use_must_clause=False, preserve_bool_semantics_if_one_clause=True
+    )

     expected_bool_clause = {
         'bool': {
@@ -322,7 +311,7 @@ def test_generate_nested_query():
                     {'match': {'journal.volume': 'D42'}},
                 ]
             }
-        }
+        },
     }
 }

     assert generated_query == expected_query

@@ -343,17 +332,18 @@ def test_generate_nested_query_returns_empty_dict_on_falsy_query():
 def
test_wrap_query_in_nested_if_field_is_nested(): query = {'match': {'title.name': 'collider'}} - generated_query = wrap_query_in_nested_if_field_is_nested(query, 'title.name', ['title']) + generated_query = wrap_query_in_nested_if_field_is_nested( + query, 'title.name', ['title'] + ) expected_query = { - 'nested': { - 'path': 'title', - 'query': {'match': {'title.name': 'collider'}} - } + 'nested': {'path': 'title', 'query': {'match': {'title.name': 'collider'}}} } assert generated_query == expected_query - generated_query_2 = wrap_query_in_nested_if_field_is_nested(query, 'title.name', ['authors']) + generated_query_2 = wrap_query_in_nested_if_field_is_nested( + query, 'title.name', ['authors'] + ) assert generated_query_2 == query
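
A note on the `# noqa: E501` markers added to the long test names above: the colon form scopes the suppression to the listed rule, whereas a bare `# noqa` silences every rule on that line. A minimal sketch of the difference (the function name here is hypothetical, not part of the test suite):

# With a colon and a code, only E501 (line-too-long) is suppressed on that
# line; any other violation there would still be reported by ruff/flake8.
def a_deliberately_long_helper_name_that_would_otherwise_trip_the_length_limit():  # noqa: E501
    return 42


x = a_deliberately_long_helper_name_that_would_otherwise_trip_the_length_limit()  # noqa
# A bare "# noqa" (no colon, no code) suppresses *all* rules on its line,
# which is usually broader than intended.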
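
The switch from `type(result) != SyntaxError` to `not isinstance(result, SyntaxError)` in test_parser.py is behavioral, not just stylistic: an exact-type comparison does not match subclasses. A self-contained illustration, using a hypothetical subclass:

class ParseError(SyntaxError):
    """Hypothetical specialization; any SyntaxError subclass behaves the same."""


err = ParseError("unbalanced parenthesis")

# Exact-type comparison does not see through inheritance...
assert type(err) is not SyntaxError
# ...while isinstance() accepts subclasses, which is what the tests want.
assert isinstance(err, SyntaxError)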
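
In test_restructuring_visitor.py, the 'date last month - 2' case is wrapped in `pytest.param(..., marks=pytest.mark.xfail(...))` so a single known-broken case can stay in the parametrize list without failing the build. A minimal sketch of the same pattern, with a made-up function under test:

import pytest


def add(a, b):
    # Hypothetical function under test.
    return a + b


@pytest.mark.parametrize(
    ("a", "b", "expected"),
    [
        (1, 2, 3),
        # Documented as broken: reported as xfail, and flagged as XPASS
        # if it unexpectedly starts passing.
        pytest.param(1, 2, 4, marks=pytest.mark.xfail(reason="known bad expectation")),
    ],
)
def test_add(a, b, expected):
    assert add(a, b) == expected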
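
Similarly, test_visitor_utils.py tightens the bare `raises(ValueError)` calls to `pytest.raises(ValueError, match='Erroneous date value:')`. The `match` argument is applied with `re.search` against the string form of the exception, so asserting on a stable prefix is enough. A sketch with a stand-in validator (not the library's actual implementation):

import pytest


def truncate_wildcard_date(date):
    # Stand-in: a wildcard in the year portion cannot be truncated away.
    if "*" in date[:4]:
        raise ValueError("Erroneous date value: %s." % date)
    return date[:4]


def test_wildcard_in_year_is_rejected():
    with pytest.raises(ValueError, match="Erroneous date value:"):
        truncate_wildcard_date("201*")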