ENH Split sentences by configurable punctuation for summary

Co-authored-by: Lukas Erni <le@kraftausdruck.ch>
This commit is contained in:
Guy Sartorelli 2021-12-04 18:43:36 +01:00
parent 6c69d32367
commit bf629dfabd
No known key found for this signature in database
GPG Key ID: F313E3B9504D496A
2 changed files with 56 additions and 8 deletions

View File

@ -37,6 +37,11 @@ class DBText extends DBString
'Summary' => 'Text', 'Summary' => 'Text',
]; ];
/**
 * Punctuation that marks an end of a sentence for the Summary() method.
 *
 * The value is read through static::config(), so it can be overridden or
 * extended via configuration. The entries are concatenated, escaped with
 * preg_quote() and placed inside a single regex character class, so every
 * individual character listed here acts as a separator; a multi-character
 * entry is not matched as a sequence.
 *
 * NOTE(review): the split pattern is built without the `u` modifier, so a
 * multi-byte separator may not split as intended - confirm before adding one.
 */
private static array $summary_sentence_separators = ['.', '?', '!'];
/** /**
* (non-PHPdoc) * (non-PHPdoc)
* @see DBField::requireField() * @see DBField::requireField()
@ -130,10 +135,18 @@ class DBText extends DBString
$add = $this->defaultEllipsis(); $add = $this->defaultEllipsis();
} }
// Split on sentences (don't remove period) // Split on sentences (don't remove punctuation)
$sentences = array_filter(array_map(function ($str) { $summarySentenceSeparators = preg_quote(implode(static::config()->get('summary_sentence_separators')), '@');
return trim($str ?? ''); $possibleSentences = preg_split('@(?<=[' . $summarySentenceSeparators . '])@', $value ?? '') ?: [];
}, preg_split('@(?<=\.)@', $value ?? '') ?: [])); $sentences = [];
foreach ($possibleSentences as $sentence) {
$sentence = trim($sentence);
if ($sentence) {
$sentences[] = $sentence;
}
}
$wordCount = count(preg_split('#\s+#u', $sentences[0] ?? '') ?: []); $wordCount = count(preg_split('#\s+#u', $sentences[0] ?? '') ?: []);
// if the first sentence is too long, show only the first $maxWords words // if the first sentence is too long, show only the first $maxWords words

View File

@ -282,32 +282,56 @@ class DBTextTest extends SapphireTest
public function providerSummary() public function providerSummary()
{ {
return [ return [
[ 'simple test' => [
'This is some text. It is a test', 'This is some text. It is a test',
3, 3,
false, false,
'This is some…', 'This is some…',
], ],
[ 'custom ellipses' => [
// check custom ellipsis // check custom ellipsis
'This is a test text in a longer sentence and a custom ellipsis.', 'This is a test text in a longer sentence and a custom ellipsis.',
8, 8,
'...', // regular dots instead of the ellipsis character '...', // regular dots instead of the ellipsis character
'This is a test text in a longer...', 'This is a test text in a longer...',
], ],
[ 'umlauts' => [
'both schön and können have umlauts', 'both schön and können have umlauts',
5, 5,
false, false,
'both schön and können have…', 'both schön and können have…',
], ],
[ 'invalid UTF' => [
// check invalid UTF8 handling — input is an invalid UTF sequence, output should be empty string // check invalid UTF8 handling — input is an invalid UTF sequence, output should be empty string
"\xf0\x28\x8c\xbc", "\xf0\x28\x8c\xbc",
50, 50,
false, false,
'', '',
], ],
'treats period as sentence boundary' => [
'This is some text. It is a test. There are three sentences.',
10,
false,
'This is some text. It is a test.',
],
'treats exclamation mark as sentence boundary' => [
'This is some text! It is a test! There are three sentences.',
10,
false,
'This is some text! It is a test!',
],
'treats question mark as sentence boundary' => [
'This is some text? It is a test? There are three sentences.',
10,
false,
'This is some text? It is a test?',
],
'does not treat colon as sentence boundary' => [
'This is some text: It is a test: There are three sentences.',
10,
false,
'This is some text: It is a test: There are…',
],
]; ];
} }
@ -401,4 +425,15 @@ class DBTextTest extends SapphireTest
$result = $text->obj('Summary', [$words, $add])->forTemplate(); $result = $text->obj('Summary', [$words, $add])->forTemplate();
$this->assertEquals($expectedValue, $result); $this->assertEquals($expectedValue, $result);
} }
/**
 * Checks that Summary() honours the `summary_sentence_separators` config:
 * a colon only ends a sentence once it has been merged into that setting.
 */
public function testSummaryConfiguration()
{
    $content = 'This is some text: It is a test: There are three sentences.';
    $field = DBField::create_field(DBText::class, $content);

    // Default separators: the colon is not a sentence boundary
    $defaultSummary = $field->Summary(10);
    $this->assertSame('This is some text: It is a test: There are…', $defaultSummary);

    // Once the colon is merged into the configured separators it is a boundary
    DBText::config()->merge('summary_sentence_separators', [':']);
    $configuredSummary = $field->Summary(10);
    $this->assertSame('This is some text: It is a test:', $configuredSummary);
}
} }