Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH#2256 Introduce a query optimizer concept #2257

Draft
wants to merge 4 commits into
base: 8.x
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions rdflib/plugins/sparql/optimizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

"""
This module contains standard optimizers for SPARQL queries.

"""
import re
from rdflib import Literal
from rdflib.plugins.sparql.operators import Builtin_CONTAINS, Builtin_REGEX
from rdflib.plugins.sparql.sparql import Query
from rdflib.plugins.sparql.algebra import CompValue, Join, Values, Expr
from typing import Any

"""
An interface for optimizers that transform a query algebra into a version
that is, hopefully, faster to evaluate.
"""


class SPARQLOptimizer:
    """Base class for SPARQL query optimizers.

    The base implementation is a no-op: it hands the query back untouched.
    Subclasses override :meth:`optimize` to rewrite the query.
    """

    def optimize(self, query: Query) -> Query:
        """Return ``query`` unchanged."""
        return query


class ValuesToTheLeftOfTheJoin(SPARQLOptimizer):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While this is valuable, I think it may be better to keep it in rdflib._contrib, as we don't necessarily want to offer the same level of compatibility guarantees as we do for other code.


def optimize(self, query: Query) -> Query:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def optimize(self, query: Query) -> Query:
@classmethod
def optimize(cls, query: Query) -> Query:

As these methods don't use the class state and are side-effect free, it is best to make them class methods; that way it is clearer to users that they don't have to be concerned with concurrency issues.

main = query.algebra
query.algebra = self._optimize_node(main)
return query

def _optimize_node(self, cv: Any) -> Any:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def _optimize_node(self, cv: Any) -> Any:
@classmethod
def _optimize_node(cls, cv: Any) -> Any:

if cv.name == "Join":
if cv.p1.name != "ToMultiSet" and "ToMultiSet" == cv.p2.name:
cv.update(p1=cv.p2, p2=cv.p1)
else:
cv.update(self._optimize_node(cv.p1), self._optimize_node(cv.p2))
return cv
elif cv.p is not None:
cv.p.update(self._optimize_node(cv.p))
elif cv.p1 is not None and cv.p2 is not None:
cv.p1.update(self._optimize_node(cv.p1))
cv.p2.update(self._optimize_node(cv.p2))
elif cv.p1 is not None:
cv.p1.update(self._optimize_node(cv.p1))
return cv
10 changes: 8 additions & 2 deletions rdflib/plugins/sparql/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from rdflib.plugins.sparql.evaluate import evalQuery
from rdflib.plugins.sparql.parser import parseQuery, parseUpdate
from rdflib.plugins.sparql.sparql import Query
from rdflib.plugins.sparql.optimizer import SPARQLOptimizer
from rdflib.plugins.sparql.update import evalUpdate
from rdflib.query import Processor, Result, UpdateProcessor

from typing import List

def prepareQuery(queryString, initNs={}, base=None) -> Query:
"""
Expand Down Expand Up @@ -63,8 +64,9 @@ def update(self, strOrQuery, initBindings={}, initNs={}):


class SPARQLProcessor(Processor):
def __init__(self, graph):
def __init__(self, graph, optimizers: List[SPARQLOptimizer] = None):
Copy link
Member

@aucampia aucampia Mar 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be best to just accept any callable that maps a query to a query:

So somewhere before this:

_QueryTranslatorType = Callable[[Query],Query]

Then:

Suggested change
def __init__(self, graph, optimizers: List[SPARQLOptimizer] = None):
def __init__(self, graph, query_translators: Optional[List[_QueryTranslatorType]] = None):

That way, users can pass methods or free functions, and even have multiple different translator methods on one class.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your tests should still work fine with that.

self.graph = graph
# ``optimizers`` defaults to None; store an empty list in that case so that
# ``query`` can always iterate over ``self.optimizers``.
self.optimizers = optimizers if optimizers is not None else []

def query(self, strOrQuery, initBindings={}, initNs={}, base=None, DEBUG=False):
"""
Expand All @@ -78,4 +80,8 @@ def query(self, strOrQuery, initBindings={}, initNs={}, base=None, DEBUG=False):
query = translateQuery(parsetree, base, initNs)
else:
query = strOrQuery

for optimizer in self.optimizers:
query = optimizer.optimize(query)

return evalQuery(self.graph, query, initBindings, base)
70 changes: 70 additions & 0 deletions test/test_sparql/test_optimizers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from rdflib import Graph
from rdflib.plugins.sparql.parser import *
# from rdflib.plugins.sparql.processor import prepareQuery
from rdflib.plugins.sparql.processor import translateQuery
from rdflib.plugins.sparql.processor import parseQuery
from rdflib.plugins.sparql.optimizer import ValuesToTheLeftOfTheJoin, RegexAsStringFunctionsOptimizer

# Query with the VALUES block written after the triple pattern; used as the
# un-optimized input in test_values_to_left.
query_slow = """
PREFIX ex:<https://example.org/>

SELECT ?x {
?x ?y ?z .
VALUES (?x) {
(ex:1)
(ex:2)
(ex:3)
}
}
"""

# Same query with the VALUES block written first; its algebra is what the
# optimized form of query_slow is compared against.
query_fast = """
PREFIX ex:<https://example.org/>

SELECT ?x {
VALUES (?x) {
(ex:1)
(ex:2)
(ex:3)
}
?x ?y ?z .
}
"""

# Query with a regex() filter; not referenced by any test visible here.
# NOTE(review): "?z" is written inside double quotes, which looks like a
# string literal rather than the variable ?z -- confirm this is intended.
query_regex = """
PREFIX ex:<https://example.org/>

SELECT ?x {
?x ?y ?z .
FILTER(regex("?z", "hi"))
}
"""

# Query with a contains() filter; not referenced by any test visible here.
# NOTE(review): same quoted "?z" as in query_regex -- confirm this is intended.
query_contains = """
PREFIX ex:<https://example.org/>

SELECT ?x {
?x ?y ?z .
FILTER(contains("?z", "hi"))
}
"""


def test_values_to_left():
    """Check that ValuesToTheLeftOfTheJoin moves a VALUES block to the left.

    ``query_slow`` and ``query_fast`` differ only in where the VALUES block is
    written. After optimization, the algebra of the former must equal the
    algebra of the latter.
    """
    slow = _prepare_query(query_slow)
    fast = _prepare_query(query_fast)

    # Precondition: the two algebra trees really differ before optimizing.
    # Compare the trees rather than the Query objects; two separately built
    # Query objects presumably compare unequal regardless of their content
    # (TODO confirm Query defines no __eq__), which would make the check vacuous.
    assert slow.algebra != fast.algebra

    optimized = ValuesToTheLeftOfTheJoin().optimize(slow)

    assert optimized.algebra == fast.algebra


def _prepare_query(str_or_query):
    """Parse ``str_or_query`` and translate the parse tree into a query.

    The translation is done with ``None`` as the base and an empty namespace
    mapping.
    """
    return translateQuery(parseQuery(str_or_query), None, {})


if __name__ == "__main__":
test_values_to_left()