-
Notifications
You must be signed in to change notification settings - Fork 560
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH#2256 Introduce a query optimizer concept #2257
base: 8.x
Are you sure you want to change the base?
Changes from 1 commit
66a266a
bf71d4a
94ede34
78e9d90
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,46 @@ | ||||||||
from __future__ import annotations | ||||||||
|
||||||||
""" | ||||||||
This module contains standard optimizers for SPARQL queries. | ||||||||
|
||||||||
""" | ||||||||
import re | ||||||||
from rdflib import Literal | ||||||||
from rdflib.plugins.sparql.operators import Builtin_CONTAINS, Builtin_REGEX | ||||||||
from rdflib.plugins.sparql.sparql import Query | ||||||||
from rdflib.plugins.sparql.algebra import CompValue, Join, Values, Expr | ||||||||
from typing import Any | ||||||||
|
||||||||
""" | ||||||||
An interface for optimizers that transform a query algebra into a version | ||||||||
that is, hopefully, faster to evaluate. | ||||||||
""" | ||||||||
|
||||||||
|
||||||||
class SPARQLOptimizer:
    """Base class for optimizers that rewrite a translated SPARQL query.

    Subclasses override :meth:`optimize`. The base implementation is the
    identity transformation: it hands the query back untouched.

    Instances are also callable, so an optimizer can be used wherever a plain
    ``Query -> Query`` callable is expected.
    """

    def optimize(self, query: Query) -> Query:
        """Return an optimized version of *query*.

        :param query: the query to optimize.
        :return: the query to evaluate; here, *query* itself, unchanged.
        """
        return query

    def __call__(self, query: Query) -> Query:
        """Apply :meth:`optimize` to *query* and return the result."""
        return self.optimize(query)
|
||||||||
|
||||||||
class ValuesToTheLeftOfTheJoin(SPARQLOptimizer):
    """Move a ``ToMultiSet`` operand to the left-hand side of a ``Join``.

    Whenever a ``Join`` node has a ``ToMultiSet`` node as its right operand
    (``p2``) but not as its left operand (``p1``), the two operands are
    swapped. Judging by the class name, ``ToMultiSet`` is presumably the
    algebra form of a ``VALUES`` block.

    The algebra tree is rewritten in place: :meth:`optimize` returns the very
    same ``Query`` object it was given.
    """

    # NOTE(review): a reviewer suggested that, because these methods do not use
    # any state of the class, it would be best to make them class methods, so
    # that it is clearer to users that they do not have to be concerned with
    # concurrency issues.

    def optimize(self, query: Query) -> Query:
        """Rewrite ``query.algebra`` and return *query*.

        :param query: the query whose algebra tree is rewritten in place.
        :return: the same *query* object.
        """
        query.algebra = self._optimize_node(query.algebra)
        return query

    def _optimize_node(self, cv: Any) -> Any:
        """Recursively rewrite the algebra node *cv* and return it.

        :param cv: an algebra node exposing ``name`` and the child attributes
            ``p``, ``p1`` and ``p2``. A missing child is expected to read as
            ``None`` (assumes the ``CompValue`` behaviour -- TODO confirm).
        :return: *cv* itself, after its children have been processed.
        """
        if cv.name == "Join":
            # Process both operands first, so that joins nested below this
            # one are visited even when the operands end up being swapped.
            left = self._optimize_node(cv.p1)
            right = self._optimize_node(cv.p2)
            if left.name != "ToMultiSet" and right.name == "ToMultiSet":
                left, right = right, left
            # Keyword arguments are required here: a mapping-style ``update``
            # accepts at most one positional argument.
            cv.update(p1=left, p2=right)
            return cv

        if cv.p is not None:
            cv.update(p=self._optimize_node(cv.p))
        elif cv.p1 is not None and cv.p2 is not None:
            cv.update(
                p1=self._optimize_node(cv.p1),
                p2=self._optimize_node(cv.p2),
            )
        elif cv.p1 is not None:
            cv.update(p1=self._optimize_node(cv.p1))
        return cv
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -10,9 +10,10 @@ | |||||
from rdflib.plugins.sparql.evaluate import evalQuery | ||||||
from rdflib.plugins.sparql.parser import parseQuery, parseUpdate | ||||||
from rdflib.plugins.sparql.sparql import Query | ||||||
from rdflib.plugins.sparql.optimizer import SPARQLOptimizer | ||||||
from rdflib.plugins.sparql.update import evalUpdate | ||||||
from rdflib.query import Processor, Result, UpdateProcessor | ||||||
|
||||||
from typing import List | ||||||
|
||||||
def prepareQuery(queryString, initNs={}, base=None) -> Query: | ||||||
""" | ||||||
|
@@ -63,8 +64,9 @@ def update(self, strOrQuery, initBindings={}, initNs={}): | |||||
|
||||||
|
||||||
class SPARQLProcessor(Processor): | ||||||
def __init__(self, graph, optimizers: List[SPARQLOptimizer] = None):
    """Create a processor that evaluates queries against *graph*.

    :param graph: the graph that queries are evaluated against.
    :param optimizers: optimizers applied, in order, to every query before
        it is evaluated. ``None`` (the default) means no optimizers.
    """
    # NOTE(review): a reviewer suggested accepting any callable that maps a
    # query to a query instead, e.g. by declaring
    # ``_QueryTranslatorType = Callable[[Query], Query]`` beforehand. That
    # way users can pass methods or free functions, and even have multiple
    # different translator methods on one class; the existing tests should
    # still work fine with that.
    self.graph = graph
    # Normalise ``None`` to an empty list: ``query()`` iterates over
    # ``self.optimizers`` unconditionally, and iterating ``None`` raises
    # ``TypeError``.
    self.optimizers = optimizers if optimizers is not None else []
|
||||||
def query(self, strOrQuery, initBindings={}, initNs={}, base=None, DEBUG=False): | ||||||
""" | ||||||
|
@@ -78,4 +80,8 @@ def query(self, strOrQuery, initBindings={}, initNs={}, base=None, DEBUG=False): | |||||
query = translateQuery(parsetree, base, initNs) | ||||||
else: | ||||||
query = strOrQuery | ||||||
|
||||||
for optimizer in self.optimizers: | ||||||
query = optimizer.optimize(query) | ||||||
|
||||||
return evalQuery(self.graph, query, initBindings, base) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from rdflib import Graph | ||
from rdflib.plugins.sparql.parser import * | ||
# from rdflib.plugins.sparql.processor import prepareQuery | ||
from rdflib.plugins.sparql.processor import translateQuery | ||
from rdflib.plugins.sparql.processor import parseQuery | ||
from rdflib.plugins.sparql.optimizer import ValuesToTheLeftOfTheJoin, RegexAsStringFunctionsOptimizer | ||
|
||
query_slow = """ | ||
PREFIX ex:<https://example.org/> | ||
|
||
SELECT ?x { | ||
?x ?y ?z . | ||
VALUES (?x) { | ||
(ex:1) | ||
(ex:2) | ||
(ex:3) | ||
} | ||
} | ||
""" | ||
|
||
query_fast = """ | ||
PREFIX ex:<https://example.org/> | ||
|
||
SELECT ?x { | ||
VALUES (?x) { | ||
(ex:1) | ||
(ex:2) | ||
(ex:3) | ||
} | ||
?x ?y ?z . | ||
} | ||
""" | ||
|
||
query_regex = """ | ||
PREFIX ex:<https://example.org/> | ||
|
||
SELECT ?x { | ||
?x ?y ?z . | ||
FILTER(regex("?z", "hi")) | ||
} | ||
""" | ||
|
||
query_contains = """ | ||
PREFIX ex:<https://example.org/> | ||
|
||
SELECT ?x { | ||
?x ?y ?z . | ||
FILTER(contains("?z", "hi")) | ||
} | ||
""" | ||
|
||
|
||
def test_values_to_left():
    """Optimizing the slow query must yield the algebra of the fast query."""
    slow = _prepare_query(query_slow)
    fast = _prepare_query(query_fast)

    # Compare the algebra trees, not the Query wrappers: two separately built
    # Query objects presumably compare unequal whatever they contain (no
    # ``__eq__`` is visible here), which would make this precondition vacuous.
    assert slow.algebra != fast.algebra

    optimized = ValuesToTheLeftOfTheJoin().optimize(slow)

    assert optimized.algebra == fast.algebra
|
||
|
||
def _prepare_query(str_or_query):
    """Parse *str_or_query* and translate the parse tree into a query.

    The translation is done with no base (``None``) and an empty namespace
    mapping.
    """
    return translateQuery(parseQuery(str_or_query), None, {})
|
||
|
||
if __name__ == "__main__": | ||
test_values_to_left() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While this is valuable, I think it may be better to keep it in `rdflib._contrib`, as we don't necessarily want to offer the same level of compatibility guarantees as we do for other code.