<?php
/**
 * @package framework
 * @subpackage model
 */

/**
 * Filter certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
 * Uses {@link SS_Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
 * Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
 *
 * Caution: Should not be used on full URIs with domains or query parameters.
 * In order to retain forward slashes in a path, each individual segment needs to be filtered individually.
 *
 * See {@link FileNameFilter} for similar implementation for filesystem-based URLs.
 */
class URLSegmentFilter extends Object {

	/**
	 * Whether {@link getTransliterator()} lazily creates an {@link SS_Transliterator}
	 * when none has been set on the instance.
	 *
	 * @var Boolean
	 */
	static $default_use_transliterator = true;

	/**
	 * Ordered map of regular expression => replacement, applied in sequence by {@link filter()}
	 * when no instance-specific replacements are set.
	 *
	 * @var Array See {@link setReplacements()}.
	 */
	static $default_replacements = array(
		'/&amp;/u' => '-and-', // HTML-encoded ampersand; must run before the plain "&" rule
		'/&/u' => '-and-',
		'/\s/u' => '-', // remove whitespace
		'/_/u' => '-', // underscores to dashes
		'/[^A-Za-z0-9+.-]+/u' => '', // remove non-ASCII chars, only allow alphanumeric plus dash and dot
		'/[\-]{2,}/u' => '-', // remove duplicate dashes
		'/^[\.\-_]+/u' => '', // Remove all leading dots, dashes or underscores
	);

	/**
	 * Doesn't try to replace or transliterate non-ASCII filters.
	 * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
	 * as well as better search engine optimization for URLs.
	 * @see http://www.ietf.org/rfc/rfc3987
	 *
	 * @var boolean
	 */
	static $default_allow_multibyte = false;

	/**
	 * Instance-specific replacement map. When empty, {@link getReplacements()}
	 * falls back to {@link $default_replacements}.
	 *
	 * @var Array See {@link setReplacements()}
	 */
	public $replacements = array();

	/**
	 * @var SS_Transliterator
	 */
	protected $transliterator;

	/**
	 * Per-instance override for multibyte handling; NULL means "use {@link $default_allow_multibyte}".
	 *
	 * @var boolean
	 */
	protected $allowMultibyte;

	/**
	 * Lowercases the given segment and applies the configured replacement rules to it.
	 * Without multibyte support the segment is transliterated to ASCII first (if a
	 * transliterator is available); with multibyte support the "remove non-ASCII chars"
	 * rule is skipped and the result is percent-encoded instead.
	 *
	 * Note: Depending on the applied replacement rules, this method might result in an empty string.
	 *
	 * @param String URL path (without domain or query parameters), in utf8 encoding
	 * @return String A filtered path compatible with RFC 3986
	 */
	public function filter($name) {
		$allowMultibyte = $this->getAllowMultibyte();

		if(!$allowMultibyte) {
			// Only transliterate when no multibyte support is requested
			$transliterator = $this->getTransliterator();
			if($transliterator) $name = $transliterator->toASCII($name);
		}

		// Input is documented as UTF-8, so state the encoding explicitly
		// instead of depending on the global mb_internal_encoding() setting.
		$name = mb_strtolower($name, 'UTF-8');

		$replacements = $this->getReplacements();

		// Unset automated removal of non-ASCII characters, and don't try to transliterate.
		// unset() on a missing key is a no-op, so no isset() guard is needed.
		if($allowMultibyte) unset($replacements['/[^A-Za-z0-9+.-]+/u']);

		foreach($replacements as $regex => $replace) {
			$name = preg_replace($regex, $replace, $name);
		}

		// Multibyte URLs require percent encoding to comply to RFC 3986.
		// Without this setting, the "remove non-ASCII chars" regex takes care of that.
		if($allowMultibyte) $name = rawurlencode($name);

		return $name;
	}

	/**
	 * @param Array Map of find/replace used for preg_replace().
	 */
	public function setReplacements($r) {
		$this->replacements = $r;
	}

	/**
	 * Returns the instance replacements if any are set (non-empty),
	 * otherwise {@link $default_replacements}.
	 *
	 * @return Array
	 */
	public function getReplacements() {
		return ($this->replacements) ? $this->replacements : self::$default_replacements;
	}

	/**
	 * Returns the transliterator for this filter, creating a default one on first use
	 * when none was set and {@link $default_use_transliterator} is enabled.
	 * A value passed to {@link setTransliterator()} (including FALSE) is returned unchanged.
	 *
	 * @return SS_Transliterator|NULL
	 */
	public function getTransliterator() {
		if($this->transliterator === null && self::$default_use_transliterator) {
			$this->transliterator = SS_Transliterator::create();
		}
		return $this->transliterator;
	}

	/**
	 * @param SS_Transliterator|FALSE
	 */
	public function setTransliterator($t) {
		$this->transliterator = $t;
	}

	/**
	 * @param boolean
	 */
	public function setAllowMultibyte($bool) {
		$this->allowMultibyte = $bool;
	}

	/**
	 * Returns the instance setting if one was set, otherwise {@link $default_allow_multibyte}.
	 *
	 * @return boolean
	 */
	public function getAllowMultibyte() {
		return ($this->allowMultibyte !== null) ? $this->allowMultibyte : self::$default_allow_multibyte;
	}

}