2011-11-14 12:26:51 +01:00
<?php
/**
2012-04-12 18:02:46 +12:00
 * @package framework
2011-11-14 12:26:51 +01:00
 * @subpackage model
*/
/**
 * Filters certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
2012-06-15 15:54:47 +12:00
 * Uses {@link SS_Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
2011-11-14 12:26:51 +01:00
 * Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
2014-08-15 18:53:05 +12:00
*
2011-11-14 12:26:51 +01:00
 * Caution: Should not be used on full URIs with domains or query parameters.
 * To retain forward slashes in a path, each segment needs to be filtered separately.
2014-08-15 18:53:05 +12:00
*
2011-11-14 12:26:51 +01:00
 * See {@link FileNameFilter} for a similar implementation for filesystem-based URLs.
*/
2017-11-30 17:49:46 +00:00
class URLSegmentFilter extends SS_Object {

    /**
     * Whether {@link getTransliterator()} lazily creates an {@link SS_Transliterator}
     * when no transliterator has been assigned to the instance.
     *
     * @config
     * @var Boolean
     */
    private static $default_use_transliterator = true;

    /**
     * Ordered map of regular expression => replacement string.
     * {@link filter()} applies the entries one after another, so the order is significant:
     * later rules clean up what earlier rules produce (e.g. duplicate dashes).
     *
     * @config
     * @var Array See {@link setReplacements()}.
     */
    private static $default_replacements = array(
        // HTML-encoded ampersand. Must be listed before the bare "&" rule, otherwise
        // "&amp;" would be turned into "-and-amp;" first.
        '/&amp;/u' => '-and-',
        '/&/u' => '-and-',
        '/\s|\+/u' => '-', // remove whitespace/plus
        '/[_.]+/u' => '-', // underscores and dots to dashes
        // NOTE: filter() removes this rule by its exact key when multibyte is allowed;
        // keep the key in sync with the one used there.
        '/[^A-Za-z0-9\-]+/u' => '', // remove non-ASCII chars, only allow alphanumeric and dashes
        // remove forward slashes, question marks, equal signs and hashes in case
        // multibyte is allowed (and non-ASCII chars aren't removed)
        '/[\/\?=#]+/u' => '-',
        '/[\-]{2,}/u' => '-', // remove duplicate dashes
        '/^[\-]+/u' => '', // Remove all leading dashes
        '/[\-]+$/u' => '' // Remove all trailing dashes
    );

    /**
     * Doesn't try to replace or transliterate non-ASCII characters.
     * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
     * as well as better search engine optimization for URLs.
     * @see http://www.ietf.org/rfc/rfc3987
     *
     * @config
     * @var boolean
     */
    private static $default_allow_multibyte = false;

    /**
     * Instance-specific replacement map. When empty, {@link getReplacements()}
     * falls back to the configured defaults.
     *
     * @var Array See {@link setReplacements()}
     */
    public $replacements = array();

    /**
     * Turns an arbitrary string into a URL segment.
     *
     * Steps, in order:
     *  1. Transliterate to ASCII (skipped when multibyte is allowed or no transliterator is available).
     *  2. Lowercase the string.
     *  3. Apply every replacement rule from {@link getReplacements()} in sequence.
     *  4. Percent-encode the result when multibyte is allowed.
     *
     * Note: Depending on the applied replacement rules, this method might result in an empty string.
     *
     * @param String URL path (without domain or query parameters), in utf8 encoding
     * @return String A filtered path compatible with RFC 3986
     */
    public function filter($name) {
        // Read the setting once so every step below acts on the same value.
        $allowMultibyte = $this->getAllowMultibyte();

        if (!$allowMultibyte) {
            // Only transliterate when no multibyte support is requested
            $transliterator = $this->getTransliterator();
            if ($transliterator) {
                $name = $transliterator->toASCII($name);
            }
        }

        // The input is documented as UTF-8 and all rules use the /u modifier, so pass the
        // encoding explicitly instead of depending on the global mb_internal_encoding().
        $name = mb_strtolower($name, 'UTF-8');

        $replacements = $this->getReplacements();

        if ($allowMultibyte) {
            // Unset automated removal of non-ASCII characters, and don't try to transliterate.
            // unset() on a missing key is a no-op, so custom maps without this rule are fine.
            unset($replacements['/[^A-Za-z0-9\-]+/u']);
        }

        foreach ($replacements as $regex => $replace) {
            $name = preg_replace($regex, $replace, $name);
        }

        // Multibyte URLs require percent encoding to comply to RFC 3986.
        // Without this setting, the "remove non-ASCII chars" regex takes care of that.
        if ($allowMultibyte) {
            $name = rawurlencode($name);
        }

        return $name;
    }

    /**
     * Overrides the replacement rules for this instance.
     *
     * @param Array Map of find/replace used for preg_replace().
     */
    public function setReplacements($r) {
        $this->replacements = $r;
    }

    /**
     * Returns the instance-specific rules if any are set (non-empty),
     * otherwise the configured `default_replacements` cast to an array.
     *
     * @return Array
     */
    public function getReplacements() {
        if ($this->replacements) {
            return $this->replacements;
        }

        return (array) $this->config()->default_replacements;
    }

    /**
     * @var SS_Transliterator
     */
    protected $transliterator;

    /**
     * Returns the transliterator for this instance, creating one on first access when
     * none has been set and the `default_use_transliterator` config is truthy.
     * A value assigned through {@link setTransliterator()} (including FALSE) is returned as-is,
     * because only an unset (NULL) property triggers the lazy creation.
     *
     * @return SS_Transliterator|NULL
     */
    public function getTransliterator() {
        $needsDefault = ($this->transliterator === null);

        if ($needsDefault && $this->config()->default_use_transliterator) {
            $this->transliterator = SS_Transliterator::create();
        }

        return $this->transliterator;
    }

    /**
     * Sets the transliterator used by {@link filter()}; pass FALSE to disable transliteration.
     *
     * @param SS_Transliterator|FALSE
     */
    public function setTransliterator($t) {
        $this->transliterator = $t;
    }

    /**
     * Instance-level override for the multibyte setting; NULL means "use the configured default".
     *
     * @var boolean
     */
    protected $allowMultibyte;

    /**
     * @param boolean
     */
    public function setAllowMultibyte($bool) {
        $this->allowMultibyte = $bool;
    }

    /**
     * Returns the instance-level setting when one has been set (anything other than NULL),
     * otherwise the configured `default_allow_multibyte`.
     *
     * @return boolean
     */
    public function getAllowMultibyte() {
        if ($this->allowMultibyte !== null) {
            return $this->allowMultibyte;
        }

        return $this->config()->default_allow_multibyte;
    }
}