Fix OS X issue with Convert::html2raw, HTMLText::FirstSentence, HTMLText::Summary and Text::FirstSentence.

Use the Unicode (`u`) modifier for regular expressions that deal with whitespace. Added unit tests to ensure no invalid UTF-8 gets generated by these methods.
2024-10-22 12:05:37 +00:00 · 2017-07-06 16:16:39 +02:00 · 2017-07-06 16:16:39 +02:00 · a6db16b229
commit a6db16b229
parent 3a7261ac6e
6 changed files with 143 additions and 12 deletions
--- a/core/Convert.php
+++ b/core/Convert.php
@ -331,10 +331,10 @@ class Convert {

 		// Expand hyperlinks
 		if(!$preserveLinks && !$config['PreserveLinks']) {
-			$data = preg_replace_callback('/<a[^>]*href\s*=\s*"([^"]*)">(.*?)<\/a>/i', function($matches) {
+			$data = preg_replace_callback('/<a[^>]*href\s*=\s*"([^"]*)">(.*?)<\/a>/ui', function($matches) {
 				return Convert::html2raw($matches[2]) . "[$matches[1]]";
 			}, $data);
-			$data = preg_replace_callback('/<a[^>]*href\s*=\s*([^ ]*)>(.*?)<\/a>/i', function($matches) {
+			$data = preg_replace_callback('/<a[^>]*href\s*=\s*([^ ]*)>(.*?)<\/a>/ui', function($matches) {
 				return Convert::html2raw($matches[2]) . "[$matches[1]]";
 			}, $data);
 		}
@ -347,13 +347,13 @@ class Convert {

 		// Compress whitespace
 		if($config['CompressWhitespace']) {
-			$data = preg_replace("/\s+/", " ", $data);
+			$data = preg_replace("/\s+/u", " ", $data);
 		}

 		// Parse newline tags
-		$data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
-		$data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
-		$data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
+		$data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
+		$data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
+		$data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
 		$data = preg_replace("/\n\n\n+/", "\n\n", $data);

 		$data = preg_replace("/<[Bb][Rr]([^A-Za-z0-9>][^>]*)?> */", "\n", $data);
--- a/model/fieldtypes/HTMLText.php
+++ b/model/fieldtypes/HTMLText.php
@ -140,8 +140,8 @@ class HTMLText extends Text {
 			/* See if we can pull a paragraph out*/

 			// Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph
-			$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}', '', $this->value);
-			if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}', $str, $matches)) $str = $matches[2];
+			$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}u', '', $this->value);
+			if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}u', $str, $matches)) $str = $matches[2];

 			/* If _that_ failed, just use the whole text */
 			if (!$str) $str = $this->value;
@ -155,7 +155,7 @@ class HTMLText extends Text {

 		/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for
 		 * whitespace normalization) */
-		$words = preg_split('/\s+/', $str);
+		$words = preg_split('/\s+/u', $str);
 		if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words);

 		/* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't
@ -183,7 +183,7 @@ class HTMLText extends Text {
 		$paragraph = $this->Summary(-1);

 		/* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */
-		$words = preg_split('/\s+/', $paragraph);
+		$words = preg_split('/\s+/u', $paragraph);
 		foreach ($words as $i => $word) {
 			if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
 				return implode(' ', array_slice($words, 0, $i+1));
@ -270,7 +270,7 @@ class HTMLText extends Text {

 		// If it's just one or two tags on its own (and not the above) it's empty.
 		// This might be <p></p> or <h1></h1> or whatever.
-		if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/', $value)) {
+		if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/u', $value)) {
 			return false;
 		}

--- a/model/fieldtypes/Text.php
+++ b/model/fieldtypes/Text.php
@ -97,7 +97,7 @@ class Text extends StringField {
 		$paragraph = Convert::xml2raw( $this->RAW() );
 		if( !$paragraph ) return "";

-		$words = preg_split('/\s+/', $paragraph);
+		$words = preg_split('/\s+/u', $paragraph);
 		foreach ($words as $i => $word) {
 			if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
 				return implode(' ', array_slice($words, 0, $i+1));
--- a/tests/core/ConvertTest.php
+++ b/tests/core/ConvertTest.php
@ -10,6 +10,24 @@ class ConvertTest extends SapphireTest {

 	protected $usesDatabase = false;

+	private $previousLocaleSetting = null;
+
+	public function setUp()
+	{
+		parent::setUp();
+		// clear the previous locale setting
+		$this->previousLocaleSetting = null;
+	}
+
+	public function tearDown()
+	{
+		parent::tearDown();
+		// If a test sets the locale, reset it on teardown
+		if ($this->previousLocaleSetting) {
+			setlocale(LC_CTYPE, $this->previousLocaleSetting);
+		}
+	}
+
 	/**
 	 * Tests {@link Convert::raw2att()}
 	 */
@ -396,4 +414,28 @@ XML
 			Convert::base64url_decode(Convert::base64url_encode($data))
 		);
 	}
+
+	public function testValidUtf8()
+	{
+		// Install a UTF-8 locale
+		$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
+
+		$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
+		$localeInstalled = false;
+		foreach ($locales as $locale) {
+			if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
+				break;
+			}
+		}
+
+		// If the system doesn't have any of the UTF-8 locales, exit early
+		if ($localeInstalled === false) {
+			$this->markTestIncomplete('Unable to run this test because of missing locale!');
+			return;
+		}
+
+		$problematicText = html_entity_decode('<p>This is a&nbsp;Test with non-breaking&nbsp;space!</p>', ENT_COMPAT, 'UTF-8');
+
+		$this->assertTrue(mb_check_encoding(Convert::html2raw($problematicText), 'UTF-8'));
+	}
 }
--- a/tests/model/HTMLTextTest.php
+++ b/tests/model/HTMLTextTest.php
@ -5,6 +5,24 @@
 */
 class HTMLTextTest extends SapphireTest {

+	private $previousLocaleSetting = null;
+
+	public function setUp()
+	{
+		parent::setUp();
+		// clear the previous locale setting
+		$this->previousLocaleSetting = null;
+	}
+
+	public function tearDown()
+	{
+		parent::tearDown();
+		// If a test sets the locale, reset it on teardown
+		if ($this->previousLocaleSetting) {
+			setlocale(LC_CTYPE, $this->previousLocaleSetting);
+		}
+	}
+
 	/**
 	 * Test {@link HTMLText->LimitCharacters()}
 	 */
@ -314,4 +332,31 @@ class HTMLTextTest extends SapphireTest {

 		ShortcodeParser::set_active('default');
 	}
+
+	public function testValidUtf8()
+	{
+		// Install a UTF-8 locale
+		$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
+		$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
+		$localeInstalled = false;
+		foreach ($locales as $locale) {
+			if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
+				break;
+			}
+		}
+
+		// If the system doesn't have any of the UTF-8 locales, exit early
+		if ($localeInstalled === false) {
+			$this->markTestIncomplete('Unable to run this test because of missing locale!');
+			return;
+		}
+
+		$problematicText = html_entity_decode('<p>This is a&nbsp;Test with non-breaking&nbsp;space!</p>', ENT_COMPAT, 'UTF-8');
+
+		$textObj = new HTMLText('Test');
+		$textObj->setValue($problematicText);
+
+		$this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8'));
+		$this->assertTrue(mb_check_encoding($textObj->Summary(), 'UTF-8'));
+	}
 }
--- a/tests/model/TextTest.php
+++ b/tests/model/TextTest.php
@ -5,6 +5,24 @@
 */
 class TextTest extends SapphireTest {

+	private $previousLocaleSetting = null;
+
+	public function setUp()
+	{
+		parent::setUp();
+		// clear the previous locale setting
+		$this->previousLocaleSetting = null;
+	}
+
+	public function tearDown()
+	{
+		parent::tearDown();
+		// If a test sets the locale, reset it on teardown
+		if ($this->previousLocaleSetting) {
+			setlocale(LC_CTYPE, $this->previousLocaleSetting);
+		}
+	}
+
 	/**
 	 * Test {@link Text->LimitCharacters()}
 	 */
@ -240,4 +258,30 @@ class TextTest extends SapphireTest {
 		$data = DBField::create_field('Text', '"this is a test"');
 		$this->assertEquals($data->ATT(), '&quot;this is a test&quot;');
 	}
+
+	public function testValidUtf8()
+	{
+		// Install a UTF-8 locale
+		$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
+		$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
+		$localeInstalled = false;
+		foreach ($locales as $locale) {
+			if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
+				break;
+			}
+		}
+
+		// If the system doesn't have any of the UTF-8 locales, exit early
+		if ($localeInstalled === false) {
+			$this->markTestIncomplete('Unable to run this test because of missing locale!');
+			return;
+		}
+
+		$problematicText = html_entity_decode('This is a&nbsp;Test with non-breaking&nbsp;space!', ENT_COMPAT, 'UTF-8');
+
+		$textObj = new Text('Test');
+		$textObj->setValue($problematicText);
+
+		$this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8'));
+	}
 }