From 6ab6a40aa86b81e0b0e02bf2435cbd995b2ad44c Mon Sep 17 00:00:00 2001 From: Kevin Pfeifer Date: Mon, 24 Jul 2023 21:29:32 +0200 Subject: [PATCH 1/2] add docs on how to limit associations --- en/orm.rst | 1 + en/orm/limit-associations.rst | 327 ++++++++++++++++++++++++++++++++++ en/orm/query-builder.rst | 10 +- 3 files changed, 334 insertions(+), 4 deletions(-) create mode 100644 en/orm/limit-associations.rst diff --git a/en/orm.rst b/en/orm.rst index 409aa23bd1..7708db3e63 100644 --- a/en/orm.rst +++ b/en/orm.rst @@ -118,6 +118,7 @@ More Information orm/saving-data orm/deleting-data orm/associations + orm/limit-associations orm/behaviors orm/schema-system console-commands/schema-cache diff --git a/en/orm/limit-associations.rst b/en/orm/limit-associations.rst new file mode 100644 index 0000000000..8accfec653 --- /dev/null +++ b/en/orm/limit-associations.rst @@ -0,0 +1,327 @@ +Limit contained associations per record/group +############################################# + +By default containing an association will always load all associated entities. +But in certain scenarios you only want to show the first few items sorted by e.g. ``created``. + +In general this problem is called the ``greatest-n-per-group`` and here are a few ways how you can +achieve this behavior. + +In the following scenarios lets image a database with a **Articles hasMany Abstracts** schema. + +.. _a-library-solution: + +A library solution +================== + +With `icings/partitionable `__ you have a +rather nice way to define ``partitionable`` associations like this :: + + // inside your table class initialize() method + $this + ->partitionableHasMany('TopAbstracts') + ->setClassName('Abstracts') + ->setLimit(1) + ->setSort([ + 'Abstracts.points' => 'DESC', + 'Abstracts.id' => 'ASC', + ]); + +``TopAbstracts`` can then be part of any ``contain`` query just like all other associations. + +.. tip:: + + Please make sure your DBMS is supported to do these kind of ``window`` functions in SQL. + Check the plugins requirements for further information. + + +.. _custom-solutions-on-association-level: + +Custom solutions on association level +===================================== + +Select strategy - Using a join on a grouping, max-value subquery +---------------------------------------------------------------- + +This will select the top comments via a join query that is based on the max points:: + + $this->hasOne('TopAbstracts', [ + 'className' => 'Abstracts', + 'strategy' => 'select', + 'conditions' => function (\Cake\Database\Expression\QueryExpression $exp, \Cake\ORM\Query $query) { + $query->innerJoin( + [ + 'AbstractsFilter' => $query + ->getConnection() + ->newQuery() + ->select(['article_id', 'points' => $query->func()->max('points')]) + ->from('abstracts') + ->group('article_id') + ], + [ + 'TopAbstracts.article_id = AbstractsFilter.article_id', + 'TopAbstracts.points = AbstractsFilter.points' + ] + ); + return []; + } + ]); + +this will look something like this:: + + SELECT + TopAbstracts.id AS `TopAbstracts__id`, ... + FROM + abstracts TopAbstracts + INNER JOIN ( + SELECT + article_id, (MAX(points)) AS `points` + FROM + abstracts + GROUP BY + article_id + ) + AbstractsFilter ON ( + TopAbstracts.article_id = AbstractsFilter.article_id + AND + TopAbstracts.points = AbstractsFilter.points + ) + WHERE + TopAbstracts.article_id in (1,2,3,4,5,6,7,8, ...) + + +Select strategy - Using left self-join filtering +------------------------------------------------ + +Another alternative is self-joining like so:: + + $this->hasOne('TopAbstracts', [ + 'className' => 'Abstracts', + 'strategy' => 'select', + 'conditions' => function (\Cake\Database\Expression\QueryExpression $exp, \Cake\ORM\Query $query) { + $query->leftJoin( + ['AbstractsFilter' => 'abstracts'], + [ + 'TopAbstracts.article_id = AbstractsFilter.article_id', + 'TopAbstracts.points < AbstractsFilter.points' + ]); + return $exp->add(['AbstractsFilter.id IS NULL']); + } + ]); + +This will use a self-join that filters based on the rows that don't have ``a.points < b.points``, it will look something like:: + + SELECT + TopAbstracts.id AS `TopAbstracts__id`, ... + FROM + abstracts TopAbstracts + LEFT JOIN + abstracts AbstractsFilter ON ( + TopAbstracts.article_id = AbstractsFilter.article_id + AND + TopAbstracts.points < AbstractsFilter.points + ) + WHERE + (AbstractsFilter.id IS NULL AND TopAbstracts.article_id in (1,2,3,4,5,6,7,8, ...)) + + +Join strategy - Using a subquery for the join condition +------------------------------------------------------- + +Another alternative is a subquery like so:: + + $this->hasOne('TopAbstracts', [ + 'className' => 'Abstracts', + 'foreignKey' => false, + 'conditions' => function (\Cake\Database\Expression\QueryExpression $exp, \Cake\ORM\Query $query) { + $subquery = $query + ->getConnection() + ->newQuery() + ->select(['SubTopAbstracts.id']) + ->from(['SubTopAbstracts' => 'abstracts']) + ->where(['Articles.id = SubTopAbstracts.article_id']) + ->order(['SubTopAbstracts.points' => 'DESC']) + ->limit(1); + + return $exp->add(['TopAbstracts.id' => $subquery]); + } + ]); + +This will use a correlated subquery that uses a rather specific select with simple ordering and +limiting to pick the top comment. + +Note that the foreignKey option is set to false in order to avoid an additional +`Articles.id = TopAbstracts.article_id` condition to be compiled into the join conditions. + +The query will look like this:: + + SELECT + Articles.id AS `Articles__id`, ... , + TopAbstracts.id AS `TopAbstracts__id`, ... + FROM + articles Articles + LEFT JOIN + abstracts TopAbstracts ON ( + TopAbstracts.id = ( + SELECT + SubTopAbstracts.id + FROM + abstracts SubTopAbstracts + WHERE + Articles.id = SubTopAbstracts.article_id + ORDER BY + SubTopAbstracts.points DESC + LIMIT + 1 + ) + ) + +All these 3 options will query and inject the records without any hackery, it's just not very "straightforward". + + +.. _a-manual-approach: + +A manual approach +================= + +If you want to go even deeper you can of course manually load the associated records yourself as well. + +.. _window-functions: + +Using window functions +---------------------- + +.. note:: + + Please make sure your DBMS supports `window functions `__ + +You could query all associated records in a single additional query. For example:: + + $query = $this->Articles + ->find() + ->formatResults(function(\Cake\Collection\CollectionInterface $results) { + // extract the article IDs from the results + $articleIds = array_unique($results->extract('id')->toArray()); + + // rank abstracts by points, partitioned by article + $rankedAbstracts = $this->Articles->Abstracts + ->find() + ->select(function (\Cake\ORM\Query $query) { + return [ + 'id' => 'id', + // as of CakePHP 4.1 + 'row_num' => $query + ->func() + ->rowNumber() + ->over() + ->partition(['Abstracts.article_id']) + ->order(['Abstracts.points' => 'DESC']), + // in earlier CakePHP versions instead + /* + 'row_num' => $query->newExpr(' + ROW_NUMBER() OVER ( + PARTITION BY Abstracts.article_id + ORDER BY Abstracts.points DESC + ) + '), + */ + ]; + }) + ->where([ + 'Abstracts.article_id IN' => $articleIds, + ]); + + // fetch top abstracts by ranking + $topAbstracts = $this->Articles->Abstracts + ->find() + ->innerJoin( + ['RankedAbstracts' => $rankedAbstracts], + function ( + \Cake\Database\Expression\QueryExpression $exp, + \Cake\ORM\Query $query + ) { + return [ + 'RankedAbstracts.id' => $query->identifier( + 'Abstracts.id' + ), + 'RankedAbstracts.row_num' => 1, + ]; + } + ) + ->all(); + + // inject the associated records into the results + return $results->map(function ($row) use ($topAbstracts) { + $row['top_abstract'] = $topAbstracts + ->filter(function ($value, $key) use ($row) { + return $value['article_id'] === $row['id']; + }) + ->first(); + + return $row; + }); + }) + +Note that if one wanted to limit to more than 1 result, this could easily be achieved +by testing for ``'RankedAbstracts.row_num <=' => $limit`` instead, +and injecting all matching records by using ``->toList()`` instead of ``->first()``. + +The query will look something like:: + + SELECT + Abstracts.id ... + FROM + abstracts Abstracts + INNER JOIN + ( + SELECT + id AS id, + ROW_NUMBER() OVER ( + PARTITION BY Abstracts.article_id + ORDER BY Abstracts.points DESC + ) AS row_num + FROM + abstracts Abstracts + WHERE + Abstracts.article_id IN (...) + ) + RankedAbstracts ON + RankedAbstracts.id = Abstracts.id AND + RankedAbstracts.row_num = 1 + +See also :ref:`query-window-functions` + + +Fetch for each parent record individually +========================================= + +If your DBMS is outdated and doesn't support window functions, then another +manual approach would be to issue an additional query for each article to +fetch the associated top abstract (eg. 100 articles would mean 100 additional queries!). + +.. note:: + + Usually one would try to avoid doing this, as it can very quickly perform + rather badly depending on the size of the result set. + +For example:: + + $query = $this->Articles + ->find() + ->formatResults(function(\Cake\Collection\CollectionInterface $results) { + return $results->map(function ($row) { + // query the top abstract for the current article + // and inject it into the result + $row['top_abstract'] = $this->Articles->Abstracts + ->find() + ->where(['Abstracts.article_id' => $row['id']]) + ->sort(['Abstracts.points' => 'DESC']) + ->limit(1) + ->all() + ->first(); + + return $row; + }); + }); + +See also :ref:`format-results` diff --git a/en/orm/query-builder.rst b/en/orm/query-builder.rst index da2d2a1f56..6e39d078a1 100644 --- a/en/orm/query-builder.rst +++ b/en/orm/query-builder.rst @@ -1094,8 +1094,8 @@ You can use ``identifier()`` in comparisons to aggregations too:: Collation --------------------------------- -In situations that you need to deal with accented characters, multilingual data -or case-sensitive comparisons, you can use the ``$collation`` parameter of ``IdentifierExpression`` +In situations that you need to deal with accented characters, multilingual data +or case-sensitive comparisons, you can use the ``$collation`` parameter of ``IdentifierExpression`` or ``StringExpression`` to apply a character expression to a certain collation:: use Cake\Database\Expression\IdentifierExpression; @@ -1182,7 +1182,7 @@ expression objects to add snippets of SQL to your queries:: Expression Conjuction ----------------------- -It is possible to change the conjunction used to join conditions in a query +It is possible to change the conjunction used to join conditions in a query expression using the method ``setConjunction``:: $query = $articles->find(); @@ -1651,7 +1651,7 @@ results based on the results of other queries:: $query = $articles->find(); $query->from(['matches' => $matchingComment]) ->innerJoin( - ['Articles' => 'articles'], + ['Articles' => 'articles'], ['Articles.id' => $query->identifier('matches.id') ] ); @@ -1687,6 +1687,8 @@ operations. You can use the ``epilog()`` method for this:: The ``epilog()`` method allows you to append raw SQL to the end of queries. You should never put raw user data into ``epilog()``. +.. _query-window-functions: + Window Functions ---------------- From dfcd306d8cca7c69998c457a79e03afc6b6935d2 Mon Sep 17 00:00:00 2001 From: Mark Story Date: Mon, 24 Jul 2023 21:59:13 -0700 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: ADmad --- en/orm/limit-associations.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/en/orm/limit-associations.rst b/en/orm/limit-associations.rst index 8accfec653..097232c6a9 100644 --- a/en/orm/limit-associations.rst +++ b/en/orm/limit-associations.rst @@ -7,7 +7,7 @@ But in certain scenarios you only want to show the first few items sorted by e.g In general this problem is called the ``greatest-n-per-group`` and here are a few ways how you can achieve this behavior. -In the following scenarios lets image a database with a **Articles hasMany Abstracts** schema. +In the following scenarios lets imagine a database with a **Articles hasMany Abstracts** schema. .. _a-library-solution: @@ -32,7 +32,7 @@ rather nice way to define ``partitionable`` associations like this :: .. tip:: Please make sure your DBMS is supported to do these kind of ``window`` functions in SQL. - Check the plugins requirements for further information. + Check the plugin's requirements for further information. .. _custom-solutions-on-association-level: @@ -67,7 +67,7 @@ This will select the top comments via a join query that is based on the max poin } ]); -this will look something like this:: +The generated query will look something like this:: SELECT TopAbstracts.id AS `TopAbstracts__id`, ...