From a6db16b2298738e1ef1329329cbef7c6b33f993e Mon Sep 17 00:00:00 2001 From: Roman Schmid Date: Thu, 6 Jul 2017 16:16:39 +0200 Subject: [PATCH] Fix OS X issue with `Convert::html2raw`, `HTMLText::FirstSentence`, `HTMLText::Summary` and `Text::FirstSentence`. Use unicode modifier for regular expressions that deal with whitespace. Added unit-tests to ensure no invalud utf-8 gets generated by these methods. --- core/Convert.php | 12 +++++----- model/fieldtypes/HTMLText.php | 10 ++++---- model/fieldtypes/Text.php | 2 +- tests/core/ConvertTest.php | 42 ++++++++++++++++++++++++++++++++ tests/model/HTMLTextTest.php | 45 +++++++++++++++++++++++++++++++++++ tests/model/TextTest.php | 44 ++++++++++++++++++++++++++++++++++ 6 files changed, 143 insertions(+), 12 deletions(-) diff --git a/core/Convert.php b/core/Convert.php index be4dde8d8..66708f2dc 100644 --- a/core/Convert.php +++ b/core/Convert.php @@ -331,10 +331,10 @@ class Convert { // Expand hyperlinks if(!$preserveLinks && !$config['PreserveLinks']) { - $data = preg_replace_callback('/]*href\s*=\s*"([^"]*)">(.*?)<\/a>/i', function($matches) { + $data = preg_replace_callback('/]*href\s*=\s*"([^"]*)">(.*?)<\/a>/ui', function($matches) { return Convert::html2raw($matches[2]) . "[$matches[1]]"; }, $data); - $data = preg_replace_callback('/]*href\s*=\s*([^ ]*)>(.*?)<\/a>/i', function($matches) { + $data = preg_replace_callback('/]*href\s*=\s*([^ ]*)>(.*?)<\/a>/ui', function($matches) { return Convert::html2raw($matches[2]) . "[$matches[1]]"; }, $data); } @@ -347,13 +347,13 @@ class Convert { // Compress whitespace if($config['CompressWhitespace']) { - $data = preg_replace("/\s+/", " ", $data); + $data = preg_replace("/\s+/u", " ", $data); } // Parse newline tags - $data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data); - $data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data); - $data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data); + $data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data); + $data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data); + $data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data); $data = preg_replace("/\n\n\n+/", "\n\n", $data); $data = preg_replace("/<[Bb][Rr]([^A-Za-z0-9>][^>]*)?> */", "\n", $data); diff --git a/model/fieldtypes/HTMLText.php b/model/fieldtypes/HTMLText.php index a93be690f..db1bad2f8 100644 --- a/model/fieldtypes/HTMLText.php +++ b/model/fieldtypes/HTMLText.php @@ -140,8 +140,8 @@ class HTMLText extends Text { /* See if we can pull a paragraph out*/ // Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph - $str = preg_replace('{^\s*(<.+?>)*]*>}', '', $this->value); - if (preg_match('{]*)?>(.*[A-Za-z]+.*)

}', $str, $matches)) $str = $matches[2]; + $str = preg_replace('{^\s*(<.+?>)*]*>}u', '', $this->value); + if (preg_match('{]*)?>(.*[A-Za-z]+.*)

}u', $str, $matches)) $str = $matches[2]; /* If _that_ failed, just use the whole text */ if (!$str) $str = $this->value; @@ -155,7 +155,7 @@ class HTMLText extends Text { /* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for * whitespace normalization) */ - $words = preg_split('/\s+/', $str); + $words = preg_split('/\s+/u', $str); if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words); /* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't @@ -183,7 +183,7 @@ class HTMLText extends Text { $paragraph = $this->Summary(-1); /* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */ - $words = preg_split('/\s+/', $paragraph); + $words = preg_split('/\s+/u', $paragraph); foreach ($words as $i => $word) { if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) { return implode(' ', array_slice($words, 0, $i+1)); @@ -270,7 +270,7 @@ class HTMLText extends Text { // If it's just one or two tags on its own (and not the above) it's empty. // This might be

or

or whatever. - if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/', $value)) { + if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/u', $value)) { return false; } diff --git a/model/fieldtypes/Text.php b/model/fieldtypes/Text.php index 4a1eb4577..7dbc2ed61 100644 --- a/model/fieldtypes/Text.php +++ b/model/fieldtypes/Text.php @@ -97,7 +97,7 @@ class Text extends StringField { $paragraph = Convert::xml2raw( $this->RAW() ); if( !$paragraph ) return ""; - $words = preg_split('/\s+/', $paragraph); + $words = preg_split('/\s+/u', $paragraph); foreach ($words as $i => $word) { if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) { return implode(' ', array_slice($words, 0, $i+1)); diff --git a/tests/core/ConvertTest.php b/tests/core/ConvertTest.php index d30e4efdc..a67047cb3 100644 --- a/tests/core/ConvertTest.php +++ b/tests/core/ConvertTest.php @@ -10,6 +10,24 @@ class ConvertTest extends SapphireTest { protected $usesDatabase = false; + private $previousLocaleSetting = null; + + public function setUp() + { + parent::setUp(); + // clear the previous locale setting + $this->previousLocaleSetting = null; + } + + public function tearDown() + { + parent::tearDown(); + // If a test sets the locale, reset it on teardown + if ($this->previousLocaleSetting) { + setlocale(LC_CTYPE, $this->previousLocaleSetting); + } + } + /** * Tests {@link Convert::raw2att()} */ @@ -396,4 +414,28 @@ XML Convert::base64url_decode(Convert::base64url_encode($data)) ); } + + public function testValidUtf8() + { + // Install a UTF-8 locale + $this->previousLocaleSetting = setlocale(LC_CTYPE, 0); + + $locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8'); + $localeInstalled = false; + foreach ($locales as $locale) { + if ($localeInstalled = setlocale(LC_CTYPE, $locale)) { + break; + } + } + + // If the system doesn't have any of the UTF-8 locales, exit early + if ($localeInstalled === false) { + $this->markTestIncomplete('Unable to run this test because of missing locale!'); + return; + } + + $problematicText = html_entity_decode('

This is a Test with non-breaking space!

', ENT_COMPAT, 'UTF-8'); + + $this->assertTrue(mb_check_encoding(Convert::html2raw($problematicText), 'UTF-8')); + } } diff --git a/tests/model/HTMLTextTest.php b/tests/model/HTMLTextTest.php index 8a5be4116..06734b6d9 100644 --- a/tests/model/HTMLTextTest.php +++ b/tests/model/HTMLTextTest.php @@ -5,6 +5,24 @@ */ class HTMLTextTest extends SapphireTest { + private $previousLocaleSetting = null; + + public function setUp() + { + parent::setUp(); + // clear the previous locale setting + $this->previousLocaleSetting = null; + } + + public function tearDown() + { + parent::tearDown(); + // If a test sets the locale, reset it on teardown + if ($this->previousLocaleSetting) { + setlocale(LC_CTYPE, $this->previousLocaleSetting); + } + } + /** * Test {@link HTMLText->LimitCharacters()} */ @@ -314,4 +332,31 @@ class HTMLTextTest extends SapphireTest { ShortcodeParser::set_active('default'); } + + public function testValidUtf8() + { + // Install a UTF-8 locale + $this->previousLocaleSetting = setlocale(LC_CTYPE, 0); + $locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8'); + $localeInstalled = false; + foreach ($locales as $locale) { + if ($localeInstalled = setlocale(LC_CTYPE, $locale)) { + break; + } + } + + // If the system doesn't have any of the UTF-8 locales, exit early + if ($localeInstalled === false) { + $this->markTestIncomplete('Unable to run this test because of missing locale!'); + return; + } + + $problematicText = html_entity_decode('

This is a Test with non-breaking space!

', ENT_COMPAT, 'UTF-8'); + + $textObj = new HTMLText('Test'); + $textObj->setValue($problematicText); + + $this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8')); + $this->assertTrue(mb_check_encoding($textObj->Summary(), 'UTF-8')); + } } diff --git a/tests/model/TextTest.php b/tests/model/TextTest.php index aad2de28a..34831af82 100644 --- a/tests/model/TextTest.php +++ b/tests/model/TextTest.php @@ -5,6 +5,24 @@ */ class TextTest extends SapphireTest { + private $previousLocaleSetting = null; + + public function setUp() + { + parent::setUp(); + // clear the previous locale setting + $this->previousLocaleSetting = null; + } + + public function tearDown() + { + parent::tearDown(); + // If a test sets the locale, reset it on teardown + if ($this->previousLocaleSetting) { + setlocale(LC_CTYPE, $this->previousLocaleSetting); + } + } + /** * Test {@link Text->LimitCharacters()} */ @@ -240,4 +258,30 @@ class TextTest extends SapphireTest { $data = DBField::create_field('Text', '"this is a test"'); $this->assertEquals($data->ATT(), '"this is a test"'); } + + public function testValidUtf8() + { + // Install a UTF-8 locale + $this->previousLocaleSetting = setlocale(LC_CTYPE, 0); + $locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8'); + $localeInstalled = false; + foreach ($locales as $locale) { + if ($localeInstalled = setlocale(LC_CTYPE, $locale)) { + break; + } + } + + // If the system doesn't have any of the UTF-8 locales, exit early + if ($localeInstalled === false) { + $this->markTestIncomplete('Unable to run this test because of missing locale!'); + return; + } + + $problematicText = html_entity_decode('This is a Test with non-breaking space!', ENT_COMPAT, 'UTF-8'); + + $textObj = new Text('Test'); + $textObj->setValue($problematicText); + + $this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8')); + } }