ENHANCEMENT More flexible URL filtering through new URLSegmentFilter API. Support for multibyte URL segments through URLSegmentFilter::$default_allow_multibyte. Abstraction from Convert::raw2url() (and SiteTree->generateURLSegment())

This commit is contained in:
Ingo Schommer 2011-11-14 12:26:51 +01:00
parent 4a2fe9877d
commit 9b27a4c1be
7 changed files with 194 additions and 12 deletions

View File

@ -349,21 +349,13 @@ class Convert {
/**
 * Convert a string (normally a title) to a string suitable for using in
 * urls and other html attributes. Uses {@link URLSegmentFilter}.
 *
 * All filtering rules (lowercasing, transliteration, character replacement)
 * live in the filter class; this method only delegates to it.
 *
 * @param string $title Raw text to convert
 *
 * @return string The value returned by URLSegmentFilter->filter()
 */
public static function raw2url($title) {
    $f = Object::create('URLSegmentFilter');
    return $f->filter($title);
}
}

View File

@ -75,6 +75,20 @@ This means all formats are defined in
[ISO date format](http://framework.zend.com/manual/en/zend.date.constants.html#zend.date.constants.selfdefinedformats),
not PHP's built-in [date()](http://nz.php.net/manual/en/function.date.php).
### i18n in URLs
By default, URLs for pages in SilverStripe (the `SiteTree->URLSegment` property)
are automatically reduced to the allowed subset of ASCII characters.
If characters outside this subset are added, they are either removed or (if possible) "transliterated".
This describes the process of converting from one character set to another
while keeping characters recognizable. For example, vowels with French accents
are replaced with their base characters, `pâté` becomes `pate`.
In order to allow for so-called "multibyte" characters outside of the ASCII subset,
limit the character filtering in the underlying class: `URLSegmentFilter::$default_allow_multibyte = true`.
Please refer to [W3C: Introduction to IDN and IRI](http://www.w3.org/International/articles/idn-and-iri/) for more details.
### i18n in Form Fields
Date- and time related form fields support i18n ([api:DateField], [api:TimeField], [api:DatetimeField]).

View File

@ -23,6 +23,8 @@
* FileNameFilter::$default_use_transliterator = false;
* FileNameFilter::$default_replacements = array();
* </code>
*
* See {@link URLSegmentFilter} for a more generic implementation.
*/
class FileNameFilter {

137
model/URLSegmentFilter.php Normal file
View File

@ -0,0 +1,137 @@
<?php
/**
* @package sapphire
* @subpackage model
*/
/**
 * Filter certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
 * Uses {@link Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
 * Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
 *
 * Caution: Should not be used on full URIs with domains or query parameters.
 * In order to retain forward slashes in a path, each individual segment needs to be filtered individually.
 *
 * See {@link FileNameFilter} for similar implementation for filesystem-based URLs.
 */
class URLSegmentFilter {

    /**
     * Necessary to support {@link Object::create()}
     */
    public function __construct() {}

    /**
     * Whether a {@link Transliterator} is created on demand by {@link getTransliterator()}
     * when none has been set on the instance.
     *
     * @var Boolean
     */
    public static $default_use_transliterator = true;

    /**
     * Ordered map of regular expression => replacement, applied one after another
     * by {@link filter()} when no instance-specific rules are set.
     *
     * @var Array See {@link setReplacements()}.
     */
    public static $default_replacements = array(
        '/&amp;/u' => '-and-',
        '/&/u' => '-and-',
        '/\s/u' => '-', // whitespace to dashes
        '/_/u' => '-', // underscores to dashes
        '/[^A-Za-z0-9+.-]+/u' => '', // remove everything except ASCII alphanumerics, plus, dot and dash
        '/[\-]{2,}/u' => '-', // collapse duplicate dashes
        '/^[\.\-_]+/u' => '', // remove all leading dots, dashes or underscores
        '/[\-]+$/u' => '', // remove all trailing dashes
    );

    /**
     * Doesn't try to replace or transliterate non-ASCII characters.
     * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
     * as well as better search engine optimization for URLs.
     * @see http://www.ietf.org/rfc/rfc3987
     *
     * @var boolean
     */
    public static $default_allow_multibyte = false;

    /**
     * Instance-specific rules. When empty, {@link $default_replacements} are used instead.
     *
     * @var Array See {@link setReplacements()}
     */
    public $replacements = array();

    /**
     * @var Transliterator
     */
    protected $transliterator;

    /**
     * NULL means "not set on this instance", in which case {@link $default_allow_multibyte} applies.
     *
     * @var boolean
     */
    protected $allowMultibyte;

    /**
     * Lowercases the input, optionally transliterates it to ASCII, then applies the
     * replacement rules in their defined order.
     *
     * Note: Depending on the applied replacement rules, this method might result in an empty string.
     *
     * @param String URL path (without domain or query parameters), in utf8 encoding
     * @return String A filtered path compatible with RFC 3986
     */
    public function filter($name) {
        $allowMultibyte = $this->getAllowMultibyte();

        if(!$allowMultibyte) {
            // Only transliterate when no multibyte support is requested
            $transliterator = $this->getTransliterator();
            if($transliterator) $name = $transliterator->toASCII($name);
        }

        // Input is documented as UTF-8, so don't depend on the configured mbstring internal encoding
        $name = (function_exists('mb_strtolower')) ? mb_strtolower($name, 'UTF-8') : strtolower($name);

        $replacements = $this->getReplacements();
        if($allowMultibyte) {
            // unset automated removal of non-ASCII characters
            unset($replacements['/[^A-Za-z0-9+.-]+/u']);
        }

        foreach($replacements as $regex => $replace) {
            $name = preg_replace($regex, $replace, $name);
            // preg_replace() returns NULL on failure (e.g. invalid UTF-8 with the "u" modifier).
            // Stop here with an empty segment rather than feeding NULL into the remaining rules.
            if($name === null) return '';
        }

        return $name;
    }

    /**
     * @param Array Map of find/replace used for preg_replace().
     */
    public function setReplacements($r) {
        $this->replacements = $r;
    }

    /**
     * Returns the instance rules, or the static defaults when no (or an empty) set was provided.
     *
     * @return Array
     */
    public function getReplacements() {
        return ($this->replacements) ? $this->replacements : self::$default_replacements;
    }

    /**
     * Lazily creates a {@link Transliterator} when none has been set and
     * {@link $default_use_transliterator} is enabled. A value of FALSE set through
     * {@link setTransliterator()} is returned unchanged, which disables transliteration.
     *
     * @return Transliterator|NULL|FALSE
     */
    public function getTransliterator() {
        if($this->transliterator === null && self::$default_use_transliterator) {
            $this->transliterator = Object::create('Transliterator');
        }
        return $this->transliterator;
    }

    /**
     * @param Transliterator|FALSE
     */
    public function setTransliterator($t) {
        $this->transliterator = $t;
    }

    /**
     * @param boolean
     */
    public function setAllowMultibyte($bool) {
        $this->allowMultibyte = $bool;
    }

    /**
     * @return boolean The instance setting if one was made, otherwise {@link $default_allow_multibyte}
     */
    public function getAllowMultibyte() {
        return ($this->allowMultibyte !== null) ? $this->allowMultibyte : self::$default_allow_multibyte;
    }
}

View File

@ -104,7 +104,7 @@ class ConvertTest extends SapphireTest {
$this->assertEquals('foo', Convert::raw2url('foo'));
$this->assertEquals('foo-and-bar', Convert::raw2url('foo & bar'));
$this->assertEquals('foo-and-bar', Convert::raw2url('foo &amp; bar!'));
$this->assertEquals('foo-s-bar-2', Convert::raw2url('foo\'s [bar] (2)'));
$this->assertEquals('foos-bar-2', Convert::raw2url('foo\'s [bar] (2)'));
}
}

View File

@ -8,6 +8,7 @@ class FileNameFilterTest extends SapphireTest {
function testFilter() {
$name = 'Brötchen für allë-mit_Unterstrich!.jpg';
$filter = new FileNameFilter();
$filter->setTransliterator(false);
$this->assertEquals(
'Brtchen-fr-all-mit-Unterstrich.jpg',
$filter->filter($name)
@ -27,6 +28,7 @@ class FileNameFilterTest extends SapphireTest {
function testFilterWithCustomRules() {
$name = 'Brötchen für allë-mit_Unterstrich!.jpg';
$filter = new FileNameFilter();
$filter->setTransliterator(false);
$filter->setReplacements(array('/[\s-]/' => '_'));
$this->assertEquals(
'Brötchen__für_allë_mit_Unterstrich!.jpg',

View File

@ -0,0 +1,35 @@
<?php
/**
* @package sapphire
* @subpackage tests
*/
class URLSegmentFilterTest extends SapphireTest {

    /**
     * An ampersand surrounded by whitespace is expected to come out as "-and-".
     */
    function testReplacesCommonEnglishSymbols() {
        $filter = new URLSegmentFilter();
        $filter->setAllowMultibyte(false);

        $actual = $filter->filter('John & Spencer');

        $this->assertEquals('john-and-spencer', $actual);
    }

    /**
     * With multibyte support switched off, an umlaut is expected to be
     * converted to an ASCII representation.
     */
    function testTransliteratesNonAsciiUrls() {
        $filter = new URLSegmentFilter();
        $filter->setAllowMultibyte(false);

        $actual = $filter->filter('Brötchen');

        $this->assertEquals('broetchen', $actual);
    }

    /**
     * With multibyte support switched on, the umlaut is expected to be kept
     * and only the case changes.
     */
    function testRetainsNonAsciiUrlsWithAllowMultiByteOption() {
        $filter = new URLSegmentFilter();
        $filter->setAllowMultibyte(true);

        $actual = $filter->filter('Brötchen');

        $this->assertEquals('brötchen', $actual);
    }
}