Merge pull request #11100 from creative-commoners/pulls/5/split-summary-by-punctuation

ENH Split sentences by configurable punctuation for summary
This commit is contained in:
Guy Sartorelli 2024-01-09 09:01:44 +13:00 committed by GitHub
commit b1a1d4b951
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 8 deletions

View File

@ -37,6 +37,11 @@ class DBText extends DBString
'Summary' => 'Text',
];
/**
* Punctuation characters that mark the end of a sentence for the Summary() method
*/
private static array $summary_sentence_separators = ['.', '?', '!'];
/**
* (non-PHPdoc)
* @see DBField::requireField()
@ -130,10 +135,18 @@ class DBText extends DBString
$add = $this->defaultEllipsis();
}
// Split on sentences (don't remove period)
$sentences = array_filter(array_map(function ($str) {
return trim($str ?? '');
}, preg_split('@(?<=\.)@', $value ?? '') ?: []));
// Split on sentences (don't remove punctuation)
$summarySentenceSeparators = preg_quote(implode(static::config()->get('summary_sentence_separators')), '@');
$possibleSentences = preg_split('@(?<=[' . $summarySentenceSeparators . '])@', $value ?? '') ?: [];
$sentences = [];
foreach ($possibleSentences as $sentence) {
$sentence = trim($sentence);
if ($sentence) {
$sentences[] = $sentence;
}
}
$wordCount = count(preg_split('#\s+#u', $sentences[0] ?? '') ?: []);
// if the first sentence is too long, show only the first $maxWords words

View File

@ -282,32 +282,56 @@ class DBTextTest extends SapphireTest
/**
 * Data sets for the Summary() test, each keyed by a short description of the scenario.
 *
 * Every set lists, in order: the source text, the maximum number of words,
 * the ellipsis argument (false or a custom string) and the expected summary.
 * NOTE(review): the consuming test method's signature is outside this view —
 * confirm the argument order there. Passing false presumably selects the
 * default ellipsis character; verify against Summary().
 *
 * @return array
 */
public function providerSummary()
{
return [
[
'simple test' => [
'This is some text. It is a test',
3,
false,
'This is some…',
],
[
'custom ellipses' => [
// check custom ellipsis
'This is a test text in a longer sentence and a custom ellipsis.',
8,
'...', // regular dots instead of the ellipsis character
'This is a test text in a longer...',
],
[
'umlauts' => [
'both schön and können have umlauts',
5,
false,
'both schön and können have…',
],
[
'invalid UTF' => [
// check invalid UTF8 handling — input is an invalid UTF sequence, output should be empty string
"\xf0\x28\x8c\xbc",
50,
false,
'',
],
// The remaining sets use a 10-word limit on three four-word sentences to check
// which punctuation marks are treated as the end of a sentence
'treats period as sentence boundary' => [
'This is some text. It is a test. There are three sentences.',
10,
false,
'This is some text. It is a test.',
],
'treats exclamation mark as sentence boundary' => [
'This is some text! It is a test! There are three sentences.',
10,
false,
'This is some text! It is a test!',
],
'treats question mark as sentence boundary' => [
'This is some text? It is a test? There are three sentences.',
10,
false,
'This is some text? It is a test?',
],
'does not treat colon as sentence boundary' => [
// a colon is not a separator here, so the text is cut after ten words instead
'This is some text: It is a test: There are three sentences.',
10,
false,
'This is some text: It is a test: There are…',
],
];
}
@ -401,4 +425,15 @@ class DBTextTest extends SapphireTest
$result = $text->obj('Summary', [$words, $add])->forTemplate();
$this->assertEquals($expectedValue, $result);
}
/**
 * Checks that the punctuation Summary() treats as a sentence boundary
 * can be extended through the summary_sentence_separators config.
 */
public function testSummaryConfiguration()
{
    $input = 'This is some text: It is a test: There are three sentences.';
    $field = DBField::create_field(DBText::class, $input);

    // Before any config change a colon is not a boundary, so the text is cut after ten words
    $cutAtWordLimit = $field->Summary(10);
    $this->assertSame('This is some text: It is a test: There are…', $cutAtWordLimit);

    // After merging a colon into the separators, the summary ends on a whole sentence
    DBText::config()->merge('summary_sentence_separators', [':']);
    $cutAtSentence = $field->Summary(10);
    $this->assertSame('This is some text: It is a test:', $cutAtSentence);
}
}