RFC Add chunk method to DataList to iterate over large dataset (#8940)

This commit is contained in:
Maxime Rainville 2021-04-14 07:49:44 +12:00 committed by GitHub
parent dcdc25500b
commit 6fc25e4e96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 242 additions and 3 deletions

View File

@ -10,7 +10,7 @@ Whenever using the ORM to fetch records or navigate relationships you will recei
either [DataList](api:SilverStripe\ORM\DataList) or [RelationList](api:SilverStripe\ORM\RelationList). This object gives you the ability to iterate over each of the results or either [DataList](api:SilverStripe\ORM\DataList) or [RelationList](api:SilverStripe\ORM\RelationList). This object gives you the ability to iterate over each of the results or
modify. modify.
## Iterating over the list. ## Iterating over the list
[SS_List](api:SilverStripe\ORM\SS_List) implements `IteratorAggregate`, allowing you to loop over the instance. [SS_List](api:SilverStripe\ORM\SS_List) implements `IteratorAggregate`, allowing you to loop over the instance.
@ -32,7 +32,7 @@ Or in the template engine:
<% end_loop %> <% end_loop %>
``` ```
## Finding an item by value. ## Finding an item by value
```php ```php
// $list->find($key, $value); // $list->find($key, $value);
@ -79,6 +79,42 @@ echo $members->column('Email');
// ]; // ];
``` ```
## Iterating over a large list {#chunkedFetch}
When iterating over a DataList, all DataObjects in the list will be loaded in memory. This can consume a lot of memory when working with a large data set.
To limit the number of DataObjects loaded in memory, you can use the `chunkedFetch()` method on your DataList. In most cases, you can iterate over the results of `chunkedFetch()` the same way you would iterate over your DataList. Internally, `chunkedFetch()` will split your DataList query into smaller queries and keep running through them until it runs out of results.
```php
$members = Member::get();
foreach ($members as $member) {
echo $member->Email;
}
// This call will produce the same output, but it will use less memory and run more queries against the database
$members = Member::get()->chunkedFetch();
foreach ($members as $member) {
echo $member->Email;
}
```
`chunkedFetch()` will respect any filter or sort condition applied to the DataList. By default, `chunkedFetch()` will limit each query to 1000 results. You can explicitly set this limit by passing an integer to `chunkedFetch()`.
```php
$members = Member::get()
->filter('Email:PartialMatch', 'silverstripe.com')
->sort('Email')
->chunkedFetch(10);
foreach ($members as $member) {
echo $member->Email;
}
```
There are some limitations:
* `chunkedFetch()` will ignore any limit or offset you have applied to your DataList
* you can not "count" a chunked list or do any other call against it aside from iterating it
* while iterating over a chunked list, you can not perform any operation that would alter the order of the items.
## ArrayList ## ArrayList
[ArrayList](api:SilverStripe\ORM\ArrayList) exists to wrap a standard PHP array in the same API as a database backed list. [ArrayList](api:SilverStripe\ORM\ArrayList) exists to wrap a standard PHP array in the same API as a database backed list.

View File

@ -6,6 +6,8 @@
## New features ## New features
* [Added a `chunkedFetch()` method to `DataList`](/Developer_Guides/Model/Lists#chunkedFetch) to avoid loading large result sets in memory all at once.
### Support for silverstripe/graphql v4 {#graphql-v4} ### Support for silverstripe/graphql v4 {#graphql-v4}
The [silverstripe/graphql](http://github.com/silverstripe/silverstripe-graphql/issues) module The [silverstripe/graphql](http://github.com/silverstripe/silverstripe-graphql/issues) module

View File

@ -1286,4 +1286,44 @@ class DataList extends ViewableData implements SS_List, Filterable, Sortable, Li
{ {
throw new \BadMethodCallException("Can't alter items in a DataList using array-access"); throw new \BadMethodCallException("Can't alter items in a DataList using array-access");
} }
/**
 * Iterate over this DataList in "chunks". Instead of running a single query for the entire result set, this runs
 * a series of smaller limited queries and yields the records one at a time, which avoids loading the entire
 * result set in memory at once.
 *
 * Beware not to perform any operations on the results that might alter the return order while iterating.
 * Otherwise, you might break subsequent chunks.
 *
 * Each chunk is fetched by applying its own limit and offset to this list, so you can not define a custom limit
 * or offset when using this method.
 *
 * Because this method is a generator, `$chunkSize` is only validated once iteration starts, not when the method
 * is called.
 *
 * @param int $chunkSize Maximum number of records fetched by each query. Must be at least 1.
 * @throws InvalidArgumentException If `$chunkSize` is smaller than 1.
 * @return Generator|DataObject[]
 */
public function chunkedFetch(int $chunkSize = 1000): iterable
{
    if ($chunkSize < 1) {
        // __FUNCTION__ is used rather than __METHOD__: __METHOD__ already contains the class name, which
        // would repeat the class in the "Class::method" prefix of the message.
        throw new InvalidArgumentException(sprintf(
            '%s::%s: chunkSize must be greater than or equal to 1',
            __CLASS__,
            __FUNCTION__
        ));
    }

    $currentChunk = 0;

    // Keep fetching chunks until one of them comes back with fewer items than requested
    do {
        $chunk = $this->limit($chunkSize, $chunkSize * $currentChunk)->getIterator();

        // Count the items as they are yielded so we do not depend on the iterator being countable
        $itemsInChunk = 0;
        foreach ($chunk as $item) {
            $itemsInChunk++;
            yield $item;
        }

        $currentChunk++;
        // A full chunk means there might be more results, so another query is needed to find out
    } while ($itemsInChunk >= $chunkSize);
}
} }

View File

@ -11,10 +11,11 @@ use SilverStripe\ORM\DataQuery;
use SilverStripe\ORM\DB; use SilverStripe\ORM\DB;
use SilverStripe\ORM\Filterable; use SilverStripe\ORM\Filterable;
use SilverStripe\ORM\Filters\ExactMatchFilter; use SilverStripe\ORM\Filters\ExactMatchFilter;
use SilverStripe\ORM\Tests\DataObjectTest\DataListQueryCounter;
use SilverStripe\ORM\Tests\DataObjectTest\Fixture;
use SilverStripe\ORM\Tests\DataObjectTest\Bracket; use SilverStripe\ORM\Tests\DataObjectTest\Bracket;
use SilverStripe\ORM\Tests\DataObjectTest\EquipmentCompany; use SilverStripe\ORM\Tests\DataObjectTest\EquipmentCompany;
use SilverStripe\ORM\Tests\DataObjectTest\Fan; use SilverStripe\ORM\Tests\DataObjectTest\Fan;
use SilverStripe\ORM\Tests\DataObjectTest\Fixture;
use SilverStripe\ORM\Tests\DataObjectTest\Player; use SilverStripe\ORM\Tests\DataObjectTest\Player;
use SilverStripe\ORM\Tests\DataObjectTest\Sortable; use SilverStripe\ORM\Tests\DataObjectTest\Sortable;
use SilverStripe\ORM\Tests\DataObjectTest\Staff; use SilverStripe\ORM\Tests\DataObjectTest\Staff;
@ -1875,4 +1876,122 @@ class DataListTest extends SapphireTest
'Product B', 'Product B',
], $productTitles); ], $productTitles);
} }
/**
 * Check that chunkedFetch() returns the same records as the regular iterator for a range of chunk sizes, and
 * that each chunk size triggers the expected number of queries.
 */
public function testChunkedFetch()
{
    $expectedIDs = Team::get()->map('ID', 'ID')->toArray();
    $total = count($expectedIDs);

    // Default chunk size: a single query is expected
    $counter = new DataListQueryCounter(Team::class);
    $this->chunkTester(
        $expectedIDs,
        Team::get()->setDataQuery($counter)->chunkedFetch(),
        $counter,
        1
    );

    // Explicit chunk sizes, each paired with the number of queries it should trigger
    $scenarios = [
        // One record per chunk: one query per record, plus a final query that comes back empty
        [1, $total + 1],
        // Chunk size equal to the record count: the first chunk is full, so a second query is run
        [$total, 2],
        // Chunk size one short of the record count: the second query fetches the last record
        [$total - 1, 2],
        // Chunk size larger than the record count: the first chunk is already incomplete
        [$total + 1, 1],
    ];

    foreach ($scenarios as [$chunkSize, $expectedQueryCount]) {
        $counter = new DataListQueryCounter(Team::class);
        $this->chunkTester(
            $expectedIDs,
            Team::get()->setDataQuery($counter)->chunkedFetch($chunkSize),
            $counter,
            $expectedQueryCount
        );
    }
}
/**
 * Check that chunkedFetch() respects a filter applied to the list.
 */
public function testFilteredChunk()
{
    $counter = new DataListQueryCounter(Team::class);

    // Reference result, fetched through the regular (un-chunked) list
    $expectedIDs = Team::get()->filter('ClassName', Team::class)->map('ID', 'ID')->toArray();
    $chunked = Team::get()->setDataQuery($counter)->filter('ClassName', Team::class)->chunkedFetch();

    $this->chunkTester($expectedIDs, $chunked, $counter, 1);
}
/**
 * Check that chunkedFetch() respects a sort order applied to the list.
 */
public function testSortedChunk()
{
    $counter = new DataListQueryCounter(Team::class);

    // Reference result, fetched through the regular (un-chunked) list
    $expectedIDs = Team::get()->sort('ID', 'Desc')->map('ID', 'ID')->toArray();
    $chunked = Team::get()->setDataQuery($counter)->sort('ID', 'Desc')->chunkedFetch();

    $this->chunkTester($expectedIDs, $chunked, $counter, 1);
}
/**
 * Check that chunkedFetch() yields nothing, after a single query, when the filter is expected to match no record.
 */
public function testEmptyChunk()
{
    $counter = new DataListQueryCounter(Team::class);
    $chunked = Team::get()->setDataQuery($counter)->filter('ClassName', 'non-sense')->chunkedFetch();

    $this->chunkTester([], $chunked, $counter, 1);
}
/**
 * Check that a chunk size smaller than 1 is rejected.
 */
public function testInvalidChunkSize()
{
    $this->expectException(InvalidArgumentException::class);

    $chunked = Team::get()->chunkedFetch(0);

    // chunkedFetch() is a generator, so the exception is only thrown once we start iterating
    foreach ($chunked as $unused) {
        // Nothing to do: reaching the first item is enough to trigger the validation
    }
}
/**
 * Loop over a chunked list and make sure it yields Team records with the expected IDs, in the expected order,
 * using the expected number of queries.
 *
 * @param int[] $expectedIDs IDs the chunked list should yield, in order
 * @param iterable $chunkList Result of a chunkedFetch() call
 * @param DataListQueryCounter $dataQuery Counter that was attached to the list before chunking it
 * @param int $expectedQueryCount Number of queries the counter should have recorded after the iteration
 */
private function chunkTester(
    array $expectedIDs,
    iterable $chunkList,
    DataListQueryCounter $dataQuery,
    int $expectedQueryCount
) {
    // $expectedIDs is consumed from the front as records come in; whatever is left at the end was never yielded
    foreach ($chunkList as $team) {
        $this->assertInstanceOf(Team::class, $team, 'Chunk return the correct type of data object');

        $nextID = array_shift($expectedIDs);
        $this->assertEquals(
            $nextID,
            $team->ID,
            'chunk returns the same results in the same order as the regular iterator'
        );
    }

    $this->assertEmpty($expectedIDs, 'chunk returns all the results that the regular iterator does');

    $actualQueryCount = $dataQuery->getCount();
    $this->assertEquals($expectedQueryCount, $actualQueryCount);
}
} }

View File

@ -0,0 +1,42 @@
<?php
namespace SilverStripe\ORM\Tests\DataObjectTest;
use SilverStripe\ORM\DataQuery;
/**
 * DataQuery designed around the chunkedFetch() tests, so they can count the number of queries that get run.
 * Every call to getFinalisedQuery() is recorded as one query.
 */
class DataListQueryCounter extends DataQuery
{
    /**
     * Number of getFinalisedQuery() calls recorded so far. Only the value stored on the instance referenced by
     * `$parent` is ever updated or read.
     * @var int
     */
    private $queryCount = 0;

    /**
     * The instance that was created through the constructor. When the DataList gets cloned, the cloned
     * DataListQueryCounter keeps this reference, so every clone points back to the original counter and all
     * queries are tallied in a single place.
     * @var DataListQueryCounter
     */
    private $parent;

    /**
     * @param string $dataClass Class name forwarded as-is to the DataQuery constructor
     */
    public function __construct($dataClass)
    {
        parent::__construct($dataClass);
        $this->parent = $this;
    }

    /**
     * Record one more query on the original counter, then defer to DataQuery.
     *
     * @param mixed $queriedColumns Forwarded as-is to DataQuery::getFinalisedQuery()
     * @return mixed Whatever DataQuery::getFinalisedQuery() returns
     */
    public function getFinalisedQuery($queriedColumns = null)
    {
        $this->parent->queryCount++;

        return parent::getFinalisedQuery($queriedColumns);
    }

    /**
     * Number of queries recorded by the original counter, including the ones recorded through its clones.
     *
     * @return int
     */
    public function getCount()
    {
        $original = $this->parent;

        return $original->queryCount;
    }
}