From 9b27a4c1be23fdfcdcac84597c337b80718443ba Mon Sep 17 00:00:00 2001 From: Ingo Schommer Date: Mon, 14 Nov 2011 12:26:51 +0100 Subject: [PATCH] ENHANCEMENT More flexible URL filtering through new URLSegmentFilter API. Support for multibyte URL segments through URLPathFilter::$default_allow_multibyte. Abstraction from Convert::raw2url() (and SiteTree->generateURLSegment()) --- core/Convert.php | 14 +-- docs/en/topics/i18n.md | 14 +++ filesystem/FileNameFilter.php | 2 + model/URLSegmentFilter.php | 137 ++++++++++++++++++++++++ tests/core/ConvertTest.php | 2 +- tests/filesystem/FileNameFilterTest.php | 2 + tests/model/URLSegmentFilterTest.php | 35 ++++++ 7 files changed, 194 insertions(+), 12 deletions(-) create mode 100644 model/URLSegmentFilter.php create mode 100644 tests/model/URLSegmentFilterTest.php diff --git a/core/Convert.php b/core/Convert.php index e1df2d254..0be48ac7d 100644 --- a/core/Convert.php +++ b/core/Convert.php @@ -349,21 +349,13 @@ class Convert { /** * Convert a string (normally a title) to a string suitable for using in - * urls and other html attributes + * urls and other html attributes. Uses {@link URLSegmentFilter}. * * @param string - * * @return string */ public static function raw2url($title) { - $t = (function_exists('mb_strtolower')) ? 
mb_strtolower($title) : strtolower($title); - $t = Object::create('Transliterator')->toASCII($t); - $t = str_replace('&','-and-',$t); - $t = str_replace('&','-and-',$t); - $t = ereg_replace('[^A-Za-z0-9]+','-',$t); - $t = ereg_replace('-+','-',$t); - $t = trim($t, '-'); - - return $t; + $f = Object::create('URLSegmentFilter'); + return $f->filter($title); } } \ No newline at end of file diff --git a/docs/en/topics/i18n.md b/docs/en/topics/i18n.md index feea71608..86505b47b 100644 --- a/docs/en/topics/i18n.md +++ b/docs/en/topics/i18n.md @@ -75,6 +75,20 @@ This means all formats are defined in [http://framework.zend.com/manual/en/zend.date.constants.html#zend.date.constants.selfdefinedformats](ISO date format), not PHP's built-in [date()](http://nz.php.net/manual/en/function.date.php). +### i18n in URLs + +By default, URLs for pages in SilverStripe (the `SiteTree->URLSegment` property) +are automatically reduced to the allowed subset of ASCII characters. +If characters outside this subset are added, they are either removed or (if possible) "transliterated". +This describes the process of converting from one character set to another +while keeping characters recognizable. For example, vowels with French accents +are replaced with their base characters, `pâté` becomes `pate`. + +In order to allow for so-called "multibyte" characters outside of the ASCII subset, +enable multibyte support in the underlying class: `URLSegmentFilter::$default_allow_multibyte = true` + +Please refer to [W3C: Introduction to IDN and IRI](http://www.w3.org/International/articles/idn-and-iri/) for more details. + ### i18n in Form Fields Date- and time related form fields support i18n ([api:DateField], [api:TimeField], [api:DatetimeField]). 
diff --git a/filesystem/FileNameFilter.php b/filesystem/FileNameFilter.php index b4662d234..1669ff371 100644 --- a/filesystem/FileNameFilter.php +++ b/filesystem/FileNameFilter.php @@ -23,6 +23,8 @@ * FileNameFilter::$default_use_transliterator = false; * FileNameFilter::$default_replacements = array(); * + * + * See {@link URLSegmentFilter} for a more generic implementation. */ class FileNameFilter { diff --git a/model/URLSegmentFilter.php b/model/URLSegmentFilter.php new file mode 100644 index 000000000..b3cd7f428 --- /dev/null +++ b/model/URLSegmentFilter.php @@ -0,0 +1,137 @@ + '-and-', + '/&/u' => '-and-', + '/\s/u' => '-', // replace each whitespace character with a dash + '/_/u' => '-', // underscores to dashes + '/[^A-Za-z0-9+.-]+/u' => '', // remove non-ASCII chars, only allow alphanumeric plus plus sign, dash and dot + '/[\-]{2,}/u' => '-', // collapse duplicate dashes into one + '/^[\.\-_]/u' => '', // Remove a single leading dot, dash or underscore + ); + + /** + * Doesn't try to replace or transliterate non-ASCII characters. + * Useful for character sets that have little overlap with ASCII (e.g. far eastern), + * as well as better search engine optimization for URLs. + * @see http://www.ietf.org/rfc/rfc3987 + * + * @var boolean + */ + static $default_allow_multibyte = false; + + /** + * @var Array See {@link setReplacements()} + */ + public $replacements = array(); + + /** + * Note: Depending on the applied replacement rules, this method might result in an empty string. + * + * @param String URL path (without domain or query parameters), in utf8 encoding + * @return String A filtered path compatible with RFC 3986 + */ + function filter($name) { + if(!$this->getAllowMultibyte()) { + // Only transliterate when no multibyte support is requested + $transliterator = $this->getTransliterator(); + if($transliterator) $name = $transliterator->toASCII($name); + } + + $name = (function_exists('mb_strtolower')) ? 
mb_strtolower($name) : strtolower($name); + $replacements = $this->getReplacements(); + if($this->getAllowMultibyte()) { + // unset automated removal of non-ASCII characters, and don't try to transliterate + if(isset($replacements['/[^A-Za-z0-9+.-]+/u'])) unset($replacements['/[^A-Za-z0-9+.-]+/u']); + } + foreach($replacements as $regex => $replace) { + $name = preg_replace($regex, $replace, $name); + } + + return $name; + } + + /** + * @param Array Map of find/replace used for preg_replace(). + */ + function setReplacements($r) { + $this->replacements = $r; + } + + /** + * @return Array + */ + function getReplacements() { + return ($this->replacements) ? $this->replacements : self::$default_replacements; + } + + /** + * @var Transliterator + */ + protected $transliterator; + + /** + * @return Transliterator|NULL + */ + function getTransliterator() { + if($this->transliterator === null && self::$default_use_transliterator) { + $this->transliterator = Object::create('Transliterator'); + } + return $this->transliterator; + } + + /** + * @param Transliterator|FALSE + */ + function setTransliterator($t) { + $this->transliterator = $t; + } + + /** + * @var boolean + */ + protected $allowMultibyte; + + /** + * @param boolean + */ + function setAllowMultibyte($bool) { + $this->allowMultibyte = $bool; + } + + /** + * @return boolean + */ + function getAllowMultibyte() { + return ($this->allowMultibyte !== null) ? 
$this->allowMultibyte : self::$default_allow_multibyte; + } +} \ No newline at end of file diff --git a/tests/core/ConvertTest.php b/tests/core/ConvertTest.php index 6a271fe56..bdaa60daa 100644 --- a/tests/core/ConvertTest.php +++ b/tests/core/ConvertTest.php @@ -104,7 +104,7 @@ class ConvertTest extends SapphireTest { $this->assertEquals('foo', Convert::raw2url('foo')); $this->assertEquals('foo-and-bar', Convert::raw2url('foo & bar')); $this->assertEquals('foo-and-bar', Convert::raw2url('foo & bar!')); - $this->assertEquals('foo-s-bar-2', Convert::raw2url('foo\'s [bar] (2)')); + $this->assertEquals('foos-bar-2', Convert::raw2url('foo\'s [bar] (2)')); } } \ No newline at end of file diff --git a/tests/filesystem/FileNameFilterTest.php b/tests/filesystem/FileNameFilterTest.php index 5a4039a69..daf5717c4 100644 --- a/tests/filesystem/FileNameFilterTest.php +++ b/tests/filesystem/FileNameFilterTest.php @@ -8,6 +8,7 @@ class FileNameFilterTest extends SapphireTest { function testFilter() { $name = 'Brötchen für allë-mit_Unterstrich!.jpg'; $filter = new FileNameFilter(); + $filter->setTransliterator(false); $this->assertEquals( 'Brtchen-fr-all-mit-Unterstrich.jpg', $filter->filter($name) @@ -27,6 +28,7 @@ class FileNameFilterTest extends SapphireTest { function testFilterWithCustomRules() { $name = 'Brötchen für allë-mit_Unterstrich!.jpg'; $filter = new FileNameFilter(); + $filter->setTransliterator(false); $filter->setReplacements(array('/[\s-]/' => '_')); $this->assertEquals( 'Brötchen__für_allë_mit_Unterstrich!.jpg', diff --git a/tests/model/URLSegmentFilterTest.php b/tests/model/URLSegmentFilterTest.php new file mode 100644 index 000000000..0d6c673a1 --- /dev/null +++ b/tests/model/URLSegmentFilterTest.php @@ -0,0 +1,35 @@ +setAllowMultibyte(false); + $this->assertEquals( + 'john-and-spencer', + $f->filter('John & Spencer') + ); + } + + function testTransliteratesNonAsciiUrls() { + $f = new URLSegmentFilter(); + $f->setAllowMultibyte(false); + $this->assertEquals( + 
'broetchen', + $f->filter('Brötchen') + ); + } + + function testRetainsNonAsciiUrlsWithAllowMultiByteOption() { + $f = new URLSegmentFilter(); + $f->setAllowMultibyte(true); + $this->assertEquals( + 'brötchen', + $f->filter('Brötchen') + ); + } + +} \ No newline at end of file