Fix OS X issue with Convert::html2raw, HTMLText::FirstSentence, HTMLText::Summary and Text::FirstSentence.

Use unicode modifier for regular expressions that deal with whitespace.
Added unit-tests to ensure no invalud utf-8 gets generated by these methods.
This commit is contained in:
Roman Schmid 2017-07-06 16:16:39 +02:00
parent 3a7261ac6e
commit a6db16b229
6 changed files with 143 additions and 12 deletions

View File

@ -331,10 +331,10 @@ class Convert {
// Expand hyperlinks
if(!$preserveLinks && !$config['PreserveLinks']) {
$data = preg_replace_callback('/<a[^>]*href\s*=\s*"([^"]*)">(.*?)<\/a>/i', function($matches) {
$data = preg_replace_callback('/<a[^>]*href\s*=\s*"([^"]*)">(.*?)<\/a>/ui', function($matches) {
return Convert::html2raw($matches[2]) . "[$matches[1]]";
}, $data);
$data = preg_replace_callback('/<a[^>]*href\s*=\s*([^ ]*)>(.*?)<\/a>/i', function($matches) {
$data = preg_replace_callback('/<a[^>]*href\s*=\s*([^ ]*)>(.*?)<\/a>/ui', function($matches) {
return Convert::html2raw($matches[2]) . "[$matches[1]]";
}, $data);
}
@ -347,13 +347,13 @@ class Convert {
// Compress whitespace
if($config['CompressWhitespace']) {
$data = preg_replace("/\s+/", " ", $data);
$data = preg_replace("/\s+/u", " ", $data);
}
// Parse newline tags
$data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
$data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
$data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
$data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
$data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
$data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
$data = preg_replace("/\n\n\n+/", "\n\n", $data);
$data = preg_replace("/<[Bb][Rr]([^A-Za-z0-9>][^>]*)?> */", "\n", $data);

View File

@ -140,8 +140,8 @@ class HTMLText extends Text {
/* See if we can pull a paragraph out*/
// Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph
$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}', '', $this->value);
if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}', $str, $matches)) $str = $matches[2];
$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}u', '', $this->value);
if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}u', $str, $matches)) $str = $matches[2];
/* If _that_ failed, just use the whole text */
if (!$str) $str = $this->value;
@ -155,7 +155,7 @@ class HTMLText extends Text {
/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for
* whitespace normalization) */
$words = preg_split('/\s+/', $str);
$words = preg_split('/\s+/u', $str);
if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words);
/* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't
@ -183,7 +183,7 @@ class HTMLText extends Text {
$paragraph = $this->Summary(-1);
/* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */
$words = preg_split('/\s+/', $paragraph);
$words = preg_split('/\s+/u', $paragraph);
foreach ($words as $i => $word) {
if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
return implode(' ', array_slice($words, 0, $i+1));
@ -270,7 +270,7 @@ class HTMLText extends Text {
// If it's just one or two tags on its own (and not the above) it's empty.
// This might be <p></p> or <h1></h1> or whatever.
if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/', $value)) {
if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/u', $value)) {
return false;
}

View File

@ -97,7 +97,7 @@ class Text extends StringField {
$paragraph = Convert::xml2raw( $this->RAW() );
if( !$paragraph ) return "";
$words = preg_split('/\s+/', $paragraph);
$words = preg_split('/\s+/u', $paragraph);
foreach ($words as $i => $word) {
if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
return implode(' ', array_slice($words, 0, $i+1));

View File

@ -10,6 +10,24 @@ class ConvertTest extends SapphireTest {
protected $usesDatabase = false;
private $previousLocaleSetting = null;
public function setUp()
{
parent::setUp();
// clear the previous locale setting
$this->previousLocaleSetting = null;
}
public function tearDown()
{
parent::tearDown();
// If a test sets the locale, reset it on teardown
if ($this->previousLocaleSetting) {
setlocale(LC_CTYPE, $this->previousLocaleSetting);
}
}
/**
* Tests {@link Convert::raw2att()}
*/
@ -396,4 +414,28 @@ XML
Convert::base64url_decode(Convert::base64url_encode($data))
);
}
public function testValidUtf8()
{
// Install a UTF-8 locale
$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
$localeInstalled = false;
foreach ($locales as $locale) {
if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
break;
}
}
// If the system doesn't have any of the UTF-8 locales, exit early
if ($localeInstalled === false) {
$this->markTestIncomplete('Unable to run this test because of missing locale!');
return;
}
$problematicText = html_entity_decode('<p>This is a&nbsp;Test with non-breaking&nbsp;space!</p>', ENT_COMPAT, 'UTF-8');
$this->assertTrue(mb_check_encoding(Convert::html2raw($problematicText), 'UTF-8'));
}
}

View File

@ -5,6 +5,24 @@
*/
class HTMLTextTest extends SapphireTest {
private $previousLocaleSetting = null;
public function setUp()
{
parent::setUp();
// clear the previous locale setting
$this->previousLocaleSetting = null;
}
public function tearDown()
{
parent::tearDown();
// If a test sets the locale, reset it on teardown
if ($this->previousLocaleSetting) {
setlocale(LC_CTYPE, $this->previousLocaleSetting);
}
}
/**
* Test {@link HTMLText->LimitCharacters()}
*/
@ -314,4 +332,31 @@ class HTMLTextTest extends SapphireTest {
ShortcodeParser::set_active('default');
}
public function testValidUtf8()
{
// Install a UTF-8 locale
$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
$localeInstalled = false;
foreach ($locales as $locale) {
if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
break;
}
}
// If the system doesn't have any of the UTF-8 locales, exit early
if ($localeInstalled === false) {
$this->markTestIncomplete('Unable to run this test because of missing locale!');
return;
}
$problematicText = html_entity_decode('<p>This is a&nbsp;Test with non-breaking&nbsp;space!</p>', ENT_COMPAT, 'UTF-8');
$textObj = new HTMLText('Test');
$textObj->setValue($problematicText);
$this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8'));
$this->assertTrue(mb_check_encoding($textObj->Summary(), 'UTF-8'));
}
}

View File

@ -5,6 +5,24 @@
*/
class TextTest extends SapphireTest {
private $previousLocaleSetting = null;
public function setUp()
{
parent::setUp();
// clear the previous locale setting
$this->previousLocaleSetting = null;
}
public function tearDown()
{
parent::tearDown();
// If a test sets the locale, reset it on teardown
if ($this->previousLocaleSetting) {
setlocale(LC_CTYPE, $this->previousLocaleSetting);
}
}
/**
* Test {@link Text->LimitCharacters()}
*/
@ -240,4 +258,30 @@ class TextTest extends SapphireTest {
$data = DBField::create_field('Text', '"this is a test"');
$this->assertEquals($data->ATT(), '&quot;this is a test&quot;');
}
public function testValidUtf8()
{
// Install a UTF-8 locale
$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
$localeInstalled = false;
foreach ($locales as $locale) {
if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
break;
}
}
// If the system doesn't have any of the UTF-8 locales, exit early
if ($localeInstalled === false) {
$this->markTestIncomplete('Unable to run this test because of missing locale!');
return;
}
$problematicText = html_entity_decode('This is a&nbsp;Test with non-breaking&nbsp;space!', ENT_COMPAT, 'UTF-8');
$textObj = new Text('Test');
$textObj->setValue($problematicText);
$this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8'));
}
}