<?php
/**
 * @package framework
 * @subpackage model
 */

/**
 * Filter certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
 * Uses {@link SS_Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
 * Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
 *
 * Caution: Should not be used on full URIs with domains or query parameters.
 * In order to retain forward slashes in a path, each segment needs to be filtered individually.
 *
 * See {@link FileNameFilter} for a similar implementation for filesystem-based URLs.
 */
class URLSegmentFilter extends Object {

    /**
     * Whether {@link getTransliterator()} should lazily create an {@link SS_Transliterator}
     * when none has been set on the instance.
     *
     * @config
     * @var boolean
     */
    private static $default_use_transliterator = true;

    /**
     * Default map of regular expression => replacement, applied in declaration order by
     * {@link filter()} when no instance-specific replacements have been set.
     * Order matters: the HTML-escaped ampersand must be handled before the bare ampersand,
     * and the dash clean-up rules must run last.
     *
     * @config
     * @var array See {@link setReplacements()}.
     */
    private static $default_replacements = array(
        // HTML-escaped ampersand; must precede the bare "&" rule below
        '/&amp;/u' => '-and-',
        '/&/u' => '-and-',
        // remove whitespace/plus
        '/\s|\+/u' => '-',
        // underscores and dots to dashes
        '/[_.]+/u' => '-',
        // remove non-ASCII chars, only allow alphanumeric and dashes
        // (filter() looks this rule up by its exact key when multibyte is allowed)
        '/[^A-Za-z0-9\-]+/u' => '',
        // remove forward slashes, question marks, equal signs and hashes
        // in case multibyte is allowed (and non-ASCII chars aren't removed)
        '/[\/\?=#]+/u' => '-',
        // remove duplicate dashes
        '/[\-]{2,}/u' => '-',
        // remove all leading dashes
        '/^[\-]+/u' => '',
        // remove all trailing dashes
        '/[\-]+$/u' => ''
    );

    /**
     * Doesn't try to replace or transliterate non-ASCII characters.
     * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
     * as well as better search engine optimization for URLs.
     * @see http://www.ietf.org/rfc/rfc3987
     *
     * @config
     * @var boolean
     */
    private static $default_allow_multibyte = false;

    /**
     * Instance-specific replacement map. When empty, the configured
     * default replacements are used instead.
     *
     * @var array See {@link setReplacements()}
     */
    public $replacements = array();

    /**
     * Instance-specific transliterator. NULL means "not set yet"; see {@link getTransliterator()}.
     *
     * @var SS_Transliterator
     */
    protected $transliterator;

    /**
     * Instance-specific multibyte flag. NULL means "fall back to the configured default".
     *
     * @var boolean
     */
    protected $allowMultibyte;

    /**
     * Converts a string into a URL segment by transliterating (unless multibyte is allowed),
     * lowercasing, and applying every replacement rule in order.
     *
     * Note: Depending on the applied replacement rules, this method might result in an empty string.
     *
     * @param string $name URL path (without domain or query parameters), in utf8 encoding
     * @return string A filtered path compatible with RFC 3986
     */
    public function filter($name) {
        $allowMultibyte = $this->getAllowMultibyte();

        if(!$allowMultibyte) {
            // Only transliterate when no multibyte support is requested
            $transliterator = $this->getTransliterator();
            if($transliterator) {
                $name = $transliterator->toASCII($name);
            }
        }

        // Input is documented as UTF-8 and all rules use the /u modifier, so lowercase
        // as UTF-8 explicitly instead of relying on the mbstring internal encoding.
        $name = mb_strtolower($name, 'UTF-8');

        $replacements = $this->getReplacements();

        if($allowMultibyte) {
            // Drop the automated removal of non-ASCII characters.
            // unset() is a no-op when a custom rule set does not contain this key.
            unset($replacements['/[^A-Za-z0-9\-]+/u']);
        }

        foreach($replacements as $regex => $replace) {
            $name = preg_replace($regex, $replace, $name);
        }

        // Multibyte URLs require percent encoding to comply to RFC 3986.
        // Without this setting, the "remove non-ASCII chars" regex takes care of that.
        if($allowMultibyte) {
            $name = rawurlencode($name);
        }

        return $name;
    }

    /**
     * @param array $r Map of find/replace used for preg_replace().
     */
    public function setReplacements($r) {
        $this->replacements = $r;
    }

    /**
     * Returns the instance-specific replacements if any were set (non-empty),
     * otherwise the configured default replacements.
     *
     * @return array
     */
    public function getReplacements() {
        return ($this->replacements)
            ? $this->replacements
            : (array)$this->config()->default_replacements;
    }

    /**
     * Returns the transliterator set on this instance. If none was set (NULL) and the
     * default_use_transliterator config is truthy, one is created and remembered.
     * A value of FALSE set through {@link setTransliterator()} is returned as-is.
     *
     * @return SS_Transliterator|NULL
     */
    public function getTransliterator() {
        if($this->transliterator === null && $this->config()->default_use_transliterator) {
            $this->transliterator = SS_Transliterator::create();
        }
        return $this->transliterator;
    }

    /**
     * @param SS_Transliterator|FALSE $t Transliterator to use, or FALSE for none.
     */
    public function setTransliterator($t) {
        $this->transliterator = $t;
    }

    /**
     * @param boolean $bool
     */
    public function setAllowMultibyte($bool) {
        $this->allowMultibyte = $bool;
    }

    /**
     * Returns the instance-specific flag when one was set, otherwise the
     * default_allow_multibyte config value.
     *
     * @return boolean
     */
    public function getAllowMultibyte() {
        return ($this->allowMultibyte !== null)
            ? $this->allowMultibyte
            : $this->config()->default_allow_multibyte;
    }
}