]*)?>(.*[A-Za-z]+.*)
}', $str, $matches)) $str = $matches[2]; + $str = preg_replace('{^\s*(<.+?>)*]*>}u', '', $this->value); + if (preg_match('{]*)?>(.*[A-Za-z]+.*)
}u', $str, $matches)) $str = $matches[2]; /* If _that_ failed, just use the whole text */ if (!$str) $str = $this->value; @@ -155,7 +155,7 @@ class HTMLText extends Text { /* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for * whitespace normalization) */ - $words = preg_split('/\s+/', $str); + $words = preg_split('/\s+/u', $str); if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words); /* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't @@ -183,7 +183,7 @@ class HTMLText extends Text { $paragraph = $this->Summary(-1); /* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */ - $words = preg_split('/\s+/', $paragraph); + $words = preg_split('/\s+/u', $paragraph); foreach ($words as $i => $word) { if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) { return implode(' ', array_slice($words, 0, $i+1)); @@ -270,7 +270,7 @@ class HTMLText extends Text { // If it's just one or two tags on its own (and not the above) it's empty. // This might be or or whatever. - if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/', $value)) { + if(preg_match('/^[\\s]*(<[^>]+>[\\s]*){1,2}$/u', $value)) { return false; } diff --git a/model/fieldtypes/Text.php b/model/fieldtypes/Text.php index 4a1eb4577..7dbc2ed61 100644 --- a/model/fieldtypes/Text.php +++ b/model/fieldtypes/Text.php @@ -97,7 +97,7 @@ class Text extends StringField { $paragraph = Convert::xml2raw( $this->RAW() ); if( !$paragraph ) return ""; - $words = preg_split('/\s+/', $paragraph); + $words = preg_split('/\s+/u', $paragraph); foreach ($words as $i => $word) { if (preg_match('/(!|\?|\.)$/', $word) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) { return implode(' ', array_slice($words, 0, $i+1)); diff --git a/tests/core/ConvertTest.php b/tests/core/ConvertTest.php index d30e4efdc..a67047cb3 100644 --- a/tests/core/ConvertTest.php +++ b/tests/core/ConvertTest.php @@ -10,6 +10,24 @@ class ConvertTest extends SapphireTest { protected $usesDatabase = false; + private $previousLocaleSetting = null; + + public function setUp() + { + parent::setUp(); + // clear the previous locale setting + $this->previousLocaleSetting = null; + } + + public function tearDown() + { + parent::tearDown(); + // If a test sets the locale, reset it on teardown + if ($this->previousLocaleSetting) { + setlocale(LC_CTYPE, $this->previousLocaleSetting); + } + } + /** * Tests {@link Convert::raw2att()} */ @@ -396,4 +414,28 @@ XML Convert::base64url_decode(Convert::base64url_encode($data)) ); } + + public function testValidUtf8() + { + // Install a UTF-8 locale + $this->previousLocaleSetting = setlocale(LC_CTYPE, 0); + + $locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8'); + $localeInstalled = false; + foreach ($locales as $locale) { + if ($localeInstalled = setlocale(LC_CTYPE, $locale)) { + break; + } + } + + // If the system doesn't have any of the UTF-8 locales, exit early + if ($localeInstalled === false) { + $this->markTestIncomplete('Unable to run this test because of missing locale!'); + return; + } + + $problematicText = html_entity_decode('This is a Test with non-breaking space!
', ENT_COMPAT, 'UTF-8'); + + $this->assertTrue(mb_check_encoding(Convert::html2raw($problematicText), 'UTF-8')); + } } diff --git a/tests/model/HTMLTextTest.php b/tests/model/HTMLTextTest.php index 8a5be4116..06734b6d9 100644 --- a/tests/model/HTMLTextTest.php +++ b/tests/model/HTMLTextTest.php @@ -5,6 +5,24 @@ */ class HTMLTextTest extends SapphireTest { + private $previousLocaleSetting = null; + + public function setUp() + { + parent::setUp(); + // clear the previous locale setting + $this->previousLocaleSetting = null; + } + + public function tearDown() + { + parent::tearDown(); + // If a test sets the locale, reset it on teardown + if ($this->previousLocaleSetting) { + setlocale(LC_CTYPE, $this->previousLocaleSetting); + } + } + /** * Test {@link HTMLText->LimitCharacters()} */ @@ -314,4 +332,31 @@ class HTMLTextTest extends SapphireTest { ShortcodeParser::set_active('default'); } + + public function testValidUtf8() + { + // Install a UTF-8 locale + $this->previousLocaleSetting = setlocale(LC_CTYPE, 0); + $locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8'); + $localeInstalled = false; + foreach ($locales as $locale) { + if ($localeInstalled = setlocale(LC_CTYPE, $locale)) { + break; + } + } + + // If the system doesn't have any of the UTF-8 locales, exit early + if ($localeInstalled === false) { + $this->markTestIncomplete('Unable to run this test because of missing locale!'); + return; + } + + $problematicText = html_entity_decode('This is a Test with non-breaking space!
', ENT_COMPAT, 'UTF-8'); + + $textObj = new HTMLText('Test'); + $textObj->setValue($problematicText); + + $this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8')); + $this->assertTrue(mb_check_encoding($textObj->Summary(), 'UTF-8')); + } } diff --git a/tests/model/TextTest.php b/tests/model/TextTest.php index aad2de28a..34831af82 100644 --- a/tests/model/TextTest.php +++ b/tests/model/TextTest.php @@ -5,6 +5,24 @@ */ class TextTest extends SapphireTest { + private $previousLocaleSetting = null; + + public function setUp() + { + parent::setUp(); + // clear the previous locale setting + $this->previousLocaleSetting = null; + } + + public function tearDown() + { + parent::tearDown(); + // If a test sets the locale, reset it on teardown + if ($this->previousLocaleSetting) { + setlocale(LC_CTYPE, $this->previousLocaleSetting); + } + } + /** * Test {@link Text->LimitCharacters()} */ @@ -240,4 +258,30 @@ class TextTest extends SapphireTest { $data = DBField::create_field('Text', '"this is a test"'); $this->assertEquals($data->ATT(), '"this is a test"'); } + + public function testValidUtf8() + { + // Install a UTF-8 locale + $this->previousLocaleSetting = setlocale(LC_CTYPE, 0); + $locales = array('en_US.UTF-8', 'en_NZ.UTF-8', 'de_DE.UTF-8'); + $localeInstalled = false; + foreach ($locales as $locale) { + if ($localeInstalled = setlocale(LC_CTYPE, $locale)) { + break; + } + } + + // If the system doesn't have any of the UTF-8 locales, exit early + if ($localeInstalled === false) { + $this->markTestIncomplete('Unable to run this test because of missing locale!'); + return; + } + + $problematicText = html_entity_decode('This is a Test with non-breaking space!', ENT_COMPAT, 'UTF-8'); + + $textObj = new Text('Test'); + $textObj->setValue($problematicText); + + $this->assertTrue(mb_check_encoding($textObj->FirstSentence(), 'UTF-8')); + } }