From 9b27a4c1be23fdfcdcac84597c337b80718443ba Mon Sep 17 00:00:00 2001
From: Ingo Schommer <ingo@silverstripe.com>
Date: Mon, 14 Nov 2011 12:26:51 +0100
Subject: [PATCH] ENHANCEMENT More flexible URL filtering through new
 URLSegmentFilter API.  Support for multibyte URL segments through
 URLSegmentFilter::$default_allow_multibyte. Abstraction from Convert::raw2url()
 (and SiteTree->generateURLSegment())

---
 core/Convert.php                        |  14 +--
 docs/en/topics/i18n.md                  |  14 +++
 filesystem/FileNameFilter.php           |   2 +
 model/URLSegmentFilter.php              | 137 ++++++++++++++++++++++++
 tests/core/ConvertTest.php              |   2 +-
 tests/filesystem/FileNameFilterTest.php |   2 +
 tests/model/URLSegmentFilterTest.php    |  35 ++++++
 7 files changed, 194 insertions(+), 12 deletions(-)
 create mode 100644 model/URLSegmentFilter.php
 create mode 100644 tests/model/URLSegmentFilterTest.php
diff --git a/core/Convert.php b/core/Convert.php
index e1df2d254..0be48ac7d 100644
--- a/core/Convert.php
+++ b/core/Convert.php
@@ -349,21 +349,13 @@ class Convert {
 	
 	/**
 	 * Convert a string (normally a title) to a string suitable for using in
-	 * urls and other html attributes 
+	 * urls and other html attributes. Uses {@link URLSegmentFilter}.
 	 *
 	 * @param string 
-	 *
 	 * @return string
 	 */
 	public static function raw2url($title) {
-		$t = (function_exists('mb_strtolower')) ? mb_strtolower($title) : strtolower($title);
-		$t = Object::create('Transliterator')->toASCII($t);
-		$t = str_replace('&amp;','-and-',$t);
-		$t = str_replace('&','-and-',$t);
-		$t = ereg_replace('[^A-Za-z0-9]+','-',$t);
-		$t = ereg_replace('-+','-',$t);
-		$t = trim($t, '-');
-		
-		return $t;
+		$f = Object::create('URLSegmentFilter');
+		return $f->filter($title);
 	}
 }
\ No newline at end of file
diff --git a/docs/en/topics/i18n.md b/docs/en/topics/i18n.md
index feea71608..86505b47b 100644
--- a/docs/en/topics/i18n.md
+++ b/docs/en/topics/i18n.md
@@ -75,6 +75,20 @@ This means all formats are defined in
 [http://framework.zend.com/manual/en/zend.date.constants.html#zend.date.constants.selfdefinedformats](ISO date format),
 not PHP's built-in [date()](http://nz.php.net/manual/en/function.date.php).
 
+### i18n in URLs
+
+By default, URLs for pages in SilverStripe (the `SiteTree->URLSegment` property) 
+are automatically reduced to the allowed subset of ASCII characters. 
+If characters outside this subset are added, they are either removed or (if possible) "transliterated".
+Transliteration is the process of converting from one character set to another
+while keeping characters recognizable. For example, vowels with French accents 
+are replaced with their base characters, `pâté` becomes `pate`.
+
+In order to allow for so called "multibyte" characters outside of the ASCII subset,
+turn off transliteration and ASCII-only filtering in the underlying class: `URLSegmentFilter::$default_allow_multibyte = true`
+
+Please refer to [W3C: Introduction to IDN and IRI](http://www.w3.org/International/articles/idn-and-iri/) for more details.
+
 ### i18n in Form Fields
 
 Date- and time related form fields support i18n ([api:DateField], [api:TimeField], [api:DatetimeField]).
diff --git a/filesystem/FileNameFilter.php b/filesystem/FileNameFilter.php
index b4662d234..1669ff371 100644
--- a/filesystem/FileNameFilter.php
+++ b/filesystem/FileNameFilter.php
@@ -23,6 +23,8 @@
  * FileNameFilter::$default_use_transliterator = false;
  * FileNameFilter::$default_replacements = array();
  * </code>
+ * 
+ * See {@link URLSegmentFilter} for a more generic implementation.
  */
 class FileNameFilter {
 	
diff --git a/model/URLSegmentFilter.php b/model/URLSegmentFilter.php
new file mode 100644
index 000000000..b3cd7f428
--- /dev/null
+++ b/model/URLSegmentFilter.php
@@ -0,0 +1,137 @@
+<?php
+/**
+ * @package sapphire
+ * @subpackage model
+ */
+
+/**
+ * Filter certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
+ * Uses {@link Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
+ * Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
+ * 
+ * Caution: Should not be used on full URIs with domains or query parameters.
+ * In order to retain forward slashes in a path, each individual segment needs to be filtered individually.
+ * 
+ * See {@link FileNameFilter} for similar implementation for filesystem-based URLs.
+ */
+class URLSegmentFilter {
+	
+	/**
+	 * Necessary to support {@link Object::create()}
+	 */
+	function __construct() {}
+	
+	/**
+	 * @var Boolean Whether {@link getTransliterator()} lazily creates a {@link Transliterator} when none was set.
+	 */
+	static $default_use_transliterator = true;
+	
+	/**
+	 * @var Array Map of regex => replacement, applied in order by {@link filter()} unless overridden via {@link setReplacements()}.
+	 */
+	static $default_replacements = array(
+		'/&amp;/u' => '-and-',
+		'/&/u' => '-and-',
+		'/\s/u' => '-', // each whitespace character becomes a dash
+		'/_/u' => '-', // underscores to dashes
+		'/[^A-Za-z0-9+.-]+/u' => '', // remove everything except ASCII alphanumerics, plus sign, dot and dash (filter() drops this rule in multibyte mode)
+		'/[\-]{2,}/u' => '-', // collapse runs of dashes into a single dash
+		'/^[\.\-_]/u' => '', // Remove one leading dot, dash or underscore (no quantifier, so only the first character is stripped)
+	);
+	
+	/**
+	 * When true, non-ASCII characters are neither transliterated nor stripped by the default ASCII-only rule.
+	 * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
+	 * as well as better search engine optimization for URLs.
+	 * @see http://www.ietf.org/rfc/rfc3987
+	 * 
+	 * @var boolean
+	 */
+	static $default_allow_multibyte = false;
+	
+	/**
+	 * @var Array Instance-level rules; when empty, {@link $default_replacements} is used. See {@link setReplacements()}
+	 */
+	public $replacements = array();
+	
+	/**
+	 * Note: Depending on the applied replacement rules, this method might result in an empty string. 
+	 * 
+	 * @param String A single URL segment (no domain or query parameters; slashes are removed by the default ASCII-only rule), in utf8 encoding
+	 * @return String The lowercased segment after transliteration (unless multibyte is allowed) and all replacement rules
+	 */
+	function filter($name) {
+		if(!$this->getAllowMultibyte()) {
+			// Only transliterate when no multibyte support is requested
+			$transliterator = $this->getTransliterator();
+			if($transliterator) $name = $transliterator->toASCII($name);
+		}
+		
+		$name = (function_exists('mb_strtolower')) ? mb_strtolower($name) : strtolower($name); // NOTE(review): no explicit encoding is passed, so this relies on the mbstring internal encoding being UTF-8 - confirm
+		$replacements = $this->getReplacements();
+		if($this->getAllowMultibyte()) {
+			// drop the default ASCII-only rule so non-ASCII characters survive; NOTE(review): this also keeps ASCII punctuation that rule would have removed
+			if(isset($replacements['/[^A-Za-z0-9+.-]+/u'])) unset($replacements['/[^A-Za-z0-9+.-]+/u']);
+		}
+		foreach($replacements as $regex => $replace) {
+			$name = preg_replace($regex, $replace, $name);
+		}
+		
+		return $name;
+	}
+	
+	/**
+	 * @param Array Map of regex pattern => replacement used for preg_replace(); an empty array falls back to the defaults.
+	 */
+	function setReplacements($r) {
+		$this->replacements = $r;
+	}
+	
+	/**
+	 * @return Array The instance rules if non-empty, otherwise {@link $default_replacements}
+	 */
+	function getReplacements() {
+		return ($this->replacements) ? $this->replacements : self::$default_replacements;
+	}
+		
+	/**
+	 * @var Transliterator
+	 */
+	protected $transliterator;
+	
+	/**
+	 * @return Transliterator|FALSE|NULL Lazily created when unset and {@link $default_use_transliterator} is true; FALSE if disabled via {@link setTransliterator()}
+	 */
+	function getTransliterator() {
+		if($this->transliterator === null && self::$default_use_transliterator) {
+			$this->transliterator = Object::create('Transliterator');
+		} 
+		return $this->transliterator;
+	}
+	
+	/**
+	 * @param Transliterator|FALSE Pass FALSE to disable transliteration for this instance
+	 */
+	function setTransliterator($t) {
+		$this->transliterator = $t;
+	}
+	
+	/**
+	 * @var boolean NULL (unset) means {@link $default_allow_multibyte} applies
+	 */
+	protected $allowMultibyte;
+	
+	/**
+	 * @param boolean
+	 */
+	function setAllowMultibyte($bool) {
+		$this->allowMultibyte = $bool;
+	}
+	
+	/**
+	 * @return boolean The instance setting if one was set, otherwise {@link $default_allow_multibyte}
+	 */
+	function getAllowMultibyte() {
+		return ($this->allowMultibyte !== null) ? $this->allowMultibyte : self::$default_allow_multibyte;
+	}
+}
\ No newline at end of file
diff --git a/tests/core/ConvertTest.php b/tests/core/ConvertTest.php
index 6a271fe56..bdaa60daa 100644
--- a/tests/core/ConvertTest.php
+++ b/tests/core/ConvertTest.php
@@ -104,7 +104,7 @@ class ConvertTest extends SapphireTest {
 		$this->assertEquals('foo', Convert::raw2url('foo'));
 		$this->assertEquals('foo-and-bar', Convert::raw2url('foo & bar'));
 		$this->assertEquals('foo-and-bar', Convert::raw2url('foo &amp; bar!'));
-		$this->assertEquals('foo-s-bar-2', Convert::raw2url('foo\'s [bar] (2)'));
+		$this->assertEquals('foos-bar-2', Convert::raw2url('foo\'s [bar] (2)'));
 	}
 
 }
\ No newline at end of file
diff --git a/tests/filesystem/FileNameFilterTest.php b/tests/filesystem/FileNameFilterTest.php
index 5a4039a69..daf5717c4 100644
--- a/tests/filesystem/FileNameFilterTest.php
+++ b/tests/filesystem/FileNameFilterTest.php
@@ -8,6 +8,7 @@ class FileNameFilterTest extends SapphireTest {
 	function testFilter() {
 		$name = 'Brötchen  für allë-mit_Unterstrich!.jpg';
 		$filter = new FileNameFilter();
+		$filter->setTransliterator(false);
 		$this->assertEquals(
 			'Brtchen-fr-all-mit-Unterstrich.jpg', 
 			$filter->filter($name)
@@ -27,6 +28,7 @@ class FileNameFilterTest extends SapphireTest {
 	function testFilterWithCustomRules() {
 		$name = 'Brötchen  für allë-mit_Unterstrich!.jpg';
 		$filter = new FileNameFilter();
+		$filter->setTransliterator(false);
 		$filter->setReplacements(array('/[\s-]/' => '_'));
 		$this->assertEquals(
 			'Brötchen__für_allë_mit_Unterstrich!.jpg', 
diff --git a/tests/model/URLSegmentFilterTest.php b/tests/model/URLSegmentFilterTest.php
new file mode 100644
index 000000000..0d6c673a1
--- /dev/null
+++ b/tests/model/URLSegmentFilterTest.php
@@ -0,0 +1,35 @@
+<?php
+/**
+ * @package sapphire
+ * @subpackage tests
+ */
+class URLSegmentFilterTest extends SapphireTest {
+	
+	function testReplacesCommonEnglishSymbols() { // "&" becomes "-and-" and the result is lowercased
+		$f = new URLSegmentFilter();
+		$f->setAllowMultibyte(false);
+		$this->assertEquals(
+			'john-and-spencer', 
+			$f->filter('John & Spencer')
+		);
+	}
+	
+	function testTransliteratesNonAsciiUrls() { // ASCII-only mode: the umlaut is expected to be transliterated to "oe"
+		$f = new URLSegmentFilter();
+		$f->setAllowMultibyte(false);
+		$this->assertEquals(
+			'broetchen', 
+			$f->filter('Brötchen')
+		);
+	}
+	
+	function testRetainsNonAsciiUrlsWithAllowMultiByteOption() { // multibyte mode: the umlaut is kept, only case is lowered
+		$f = new URLSegmentFilter();
+		$f->setAllowMultibyte(true);
+		$this->assertEquals(
+			'brötchen', 
+			$f->filter('Brötchen')
+		);
+	}
+	
+}
\ No newline at end of file