From a6db16b2298738e1ef1329329cbef7c6b33f993e Mon Sep 17 00:00:00 2001
From: Roman Schmid <bummzack@gmail.com>
Date: Thu, 6 Jul 2017 16:16:39 +0200
Subject: [PATCH] Fix OS X issue with `Convert::html2raw`,
 `HTMLText::FirstSentence`, `HTMLText::Summary` and `Text::FirstSentence`. Use
 unicode modifier for regular expressions that deal with whitespace. Added
 unit-tests to ensure no invalud utf-8 gets generated by these methods.

---
 core/Convert.php              | 12 +++++-----
 model/fieldtypes/HTMLText.php | 10 ++++----
 model/fieldtypes/Text.php     |  2 +-
 tests/core/ConvertTest.php    | 42 ++++++++++++++++++++++++++++++++
 tests/model/HTMLTextTest.php  | 45 +++++++++++++++++++++++++++++++++++
 tests/model/TextTest.php      | 44 ++++++++++++++++++++++++++++++++++
 6 files changed, 143 insertions(+), 12 deletions(-)
diff --git a/core/Convert.php b/core/Convert.php
index be4dde8d8..66708f2dc 100644
--- a/core/Convert.php
+++ b/core/Convert.php
@@ -331,10 +331,10 @@ class Convert {
 
 		// Expand hyperlinks
 		if(!$preserveLinks && !$config['PreserveLinks']) {
-			$data = preg_replace_callback('/<a[^>]*href\s*=\s*"([^"]*)">(.*?)<\/a>/i', function($matches) {
+			$data = preg_replace_callback('/<a[^>]*href\s*=\s*"([^"]*)">(.*?)<\/a>/ui', function($matches) {
 				return Convert::html2raw($matches[2]) . "[$matches[1]]";
 			}, $data);
-			$data = preg_replace_callback('/<a[^>]*href\s*=\s*([^ ]*)>(.*?)<\/a>/i', function($matches) {
+			$data = preg_replace_callback('/<a[^>]*href\s*=\s*([^ ]*)>(.*?)<\/a>/ui', function($matches) {
 				return Convert::html2raw($matches[2]) . "[$matches[1]]";
 			}, $data);
 		}
@@ -347,13 +347,13 @@ class Convert {
 
 		// Compress whitespace
 		if($config['CompressWhitespace']) {
-			$data = preg_replace("/\s+/", " ", $data);
+			$data = preg_replace("/\s+/u", " ", $data);
 		}
 
 		// Parse newline tags
-		$data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
-		$data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
-		$data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */", "\n\n", $data);
+		$data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
+		$data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
+		$data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data);
 		$data = preg_replace("/\n\n\n+/", "\n\n", $data);
 
 		$data = preg_replace("/<[Bb][Rr]([^A-Za-z0-9>][^>]*)?> */", "\n", $data);
diff --git a/model/fieldtypes/HTMLText.php b/model/fieldtypes/HTMLText.php
index a93be690f..db1bad2f8 100644
--- a/model/fieldtypes/HTMLText.php
+++ b/model/fieldtypes/HTMLText.php
@@ -140,8 +140,8 @@ class HTMLText extends Text {
 			/* See if we can pull a paragraph out*/
 
 			// Strip out any images in case there's one at the beginning. Not doing this will return a blank paragraph
-			$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}', '', $this->value);
-			if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}', $str, $matches)) $str = $matches[2];
+			$str = preg_replace('{^\s*(<.+?>)*<img[^>]*>}u', '', $this->value);
+			if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}u', $str, $matches)) $str = $matches[2];
 
 			/* If _that_ failed, just use the whole text */
 			if (!$str) $str = $this->value;
@@ -155,7 +155,7 @@ class HTMLText extends Text {
 
 		/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for
 		 * whitespace normalization) */
-		$words = preg_split('/\s+/', $str);
+		$words = preg_split('/\s+/u', $str);
 		if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words);
 
 		/* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't
@@ -183,7 +183,7 @@ class HTMLText extends Text {
 		$paragraph = $this->Summary(-1);
 
 		/* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */
-		$words = preg_split('/\s+/', $paragraph);
+		$words = preg_split('/\s+/u', $paragraph);
 		foreach ($words as $i => $word) {
 			if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
 				return implode(' ', array_slice($words, 0, $i+1));
@@ -270,7 +270,7 @@ class HTMLText extends Text {
 
 		// If it's just one or two tags on its own (and not the above) it's empty.
 		// This might be <p></p> or <h1></h1> or whatever.
-		if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/', $value)) {
+		if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/u', $value)) {
 			return false;
 		}
 
diff --git a/model/fieldtypes/Text.php b/model/fieldtypes/Text.php
index 4a1eb4577..7dbc2ed61 100644
--- a/model/fieldtypes/Text.php
+++ b/model/fieldtypes/Text.php
@@ -97,7 +97,7 @@ class Text extends StringField {
 		$paragraph = Convert::xml2raw( $this->RAW() );
 		if( !$paragraph ) return "";
 
-		$words = preg_split('/\s+/', $paragraph);
+		$words = preg_split('/\s+/u', $paragraph);
 		foreach ($words as $i => $word) {
 			if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
 				return implode(' ', array_slice($words, 0, $i+1));
diff --git a/tests/core/ConvertTest.php b/tests/core/ConvertTest.php
index d30e4efdc..a67047cb3 100644
--- a/tests/core/ConvertTest.php
+++ b/tests/core/ConvertTest.php
@@ -10,6 +10,24 @@ class ConvertTest extends SapphireTest {
 
 	protected $usesDatabase = false;
 
+	private $previousLocaleSetting = null;
+
+	public function setUp()
+	{
+		parent::setUp();
+		// clear the previous locale setting
+		$this->previousLocaleSetting = null;
+	}
+
+	public function tearDown()
+	{
+		parent::tearDown();
+		// If a test sets the locale, reset it on teardown
+		if ($this->previousLocaleSetting) {
+			setlocale(LC_CTYPE, $this->previousLocaleSetting);
+		}
+	}
+
 	/**
 	 * Tests {@link Convert::raw2att()}
 	 */
@@ -396,4 +414,28 @@ XML
 			Convert::base64url_decode(Convert::base64url_encode($data))
 		);
 	}
+
+	public function testValidUtf8()
+	{
+		// Install a UTF-8 locale
+		$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
+
+		$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
+		$localeInstalled = false;
+		foreach ($locales as $locale) {
+			if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
+				break;
+			}
+		}
+
+		// If the system doesn't have any of the UTF-8 locales, exit early
+		if ($localeInstalled === false) {
+			$this->markTestIncomplete('Unable to run this test because of missing locale!');
+			return;
+		}
+
+		$problematicText = html_entity_decode('<p>This is a&nbsp;Test with non-breaking&nbsp;space!</p>', ENT_COMPAT, 'UTF-8');
+
+		$this->assertTrue(mb_check_encoding(Convert::html2raw($problematicText), 'UTF-8'));
+	}
 }
diff --git a/tests/model/HTMLTextTest.php b/tests/model/HTMLTextTest.php
index 8a5be4116..06734b6d9 100644
--- a/tests/model/HTMLTextTest.php
+++ b/tests/model/HTMLTextTest.php
@@ -5,6 +5,24 @@
  */
 class HTMLTextTest extends SapphireTest {
 
+	private $previousLocaleSetting = null;
+
+	public function setUp()
+	{
+		parent::setUp();
+		// clear the previous locale setting
+		$this->previousLocaleSetting = null;
+	}
+
+	public function tearDown()
+	{
+		parent::tearDown();
+		// If a test sets the locale, reset it on teardown
+		if ($this->previousLocaleSetting) {
+			setlocale(LC_CTYPE, $this->previousLocaleSetting);
+		}
+	}
+
 	/**
 	 * Test {@link HTMLText->LimitCharacters()}
 	 */
@@ -314,4 +332,31 @@ class HTMLTextTest extends SapphireTest {
 
 		ShortcodeParser::set_active('default');
 	}
+
+	public function testValidUtf8()
+	{
+		// Install a UTF-8 locale
+		$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
+		$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
+		$localeInstalled = false;
+		foreach ($locales as $locale) {
+			if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
+				break;
+			}
+		}
+
+		// If the system doesn't have any of the UTF-8 locales, exit early
+		if ($localeInstalled === false) {
+			$this->markTestIncomplete('Unable to run this test because of missing locale!');
+			return;
+		}
+
+		$problematicText = html_entity_decode('<p>This is a&nbsp;Test with non-breaking&nbsp;space!</p>', ENT_COMPAT, 'UTF-8');
+
+		$textObj = new HTMLText('Test');
+		$textObj->setValue($problematicText);
+
+		$this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8'));
+		$this->assertTrue(mb_check_encoding($textObj->Summary(), 'UTF-8'));
+	}
 }
diff --git a/tests/model/TextTest.php b/tests/model/TextTest.php
index aad2de28a..34831af82 100644
--- a/tests/model/TextTest.php
+++ b/tests/model/TextTest.php
@@ -5,6 +5,24 @@
  */
 class TextTest extends SapphireTest {
 
+	private $previousLocaleSetting = null;
+
+	public function setUp()
+	{
+		parent::setUp();
+		// clear the previous locale setting
+		$this->previousLocaleSetting = null;
+	}
+
+	public function tearDown()
+	{
+		parent::tearDown();
+		// If a test sets the locale, reset it on teardown
+		if ($this->previousLocaleSetting) {
+			setlocale(LC_CTYPE, $this->previousLocaleSetting);
+		}
+	}
+
 	/**
 	 * Test {@link Text->LimitCharacters()}
 	 */
@@ -240,4 +258,30 @@ class TextTest extends SapphireTest {
 		$data = DBField::create_field('Text', '"this is a test"');
 		$this->assertEquals($data->ATT(), '&quot;this is a test&quot;');
 	}
+
+	public function testValidUtf8()
+	{
+		// Install a UTF-8 locale
+		$this->previousLocaleSetting = setlocale(LC_CTYPE, 0);
+		$locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8');
+		$localeInstalled = false;
+		foreach ($locales as $locale) {
+			if ($localeInstalled = setlocale(LC_CTYPE, $locale)) {
+				break;
+			}
+		}
+
+		// If the system doesn't have any of the UTF-8 locales, exit early
+		if ($localeInstalled === false) {
+			$this->markTestIncomplete('Unable to run this test because of missing locale!');
+			return;
+		}
+
+		$problematicText = html_entity_decode('This is a&nbsp;Test with non-breaking&nbsp;space!', ENT_COMPAT, 'UTF-8');
+
+		$textObj = new Text('Test');
+		$textObj->setValue($problematicText);
+
+		$this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8'));
+	}
 }