silverstripe-framework/src/View/Parsers/URLSegmentFilter.php

164 lines
4.9 KiB
PHP
Raw Normal View History

<?php
namespace SilverStripe\View\Parsers;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Injector\Injectable;
/**
 * Filter certain characters from "URL segments" (also called "slugs"), for nicer (more SEO-friendly) URLs.
 * Uses {@link Transliterator} to convert non-ASCII characters to meaningful ASCII representations.
 * Use {@link $default_allow_multibyte} to allow a broader range of characters without transliteration.
 *
 * Caution: Should not be used on full URIs with domains or query parameters.
 * In order to retain forward slashes in a path, each segment needs to be filtered separately.
 *
 * See {@link FileNameFilter} for a similar implementation for filesystem-based URLs.
 */
class URLSegmentFilter implements FilterInterface
{
    use Configurable;
    use Injectable;

    /**
     * Whether {@link getTransliterator()} creates a {@link Transliterator} on demand
     * when none has been set explicitly.
     *
     * @config
     * @var bool
     */
    private static $default_use_transliterator = true;

    /**
     * Ordered map of regular expression => replacement, applied in sequence by {@link filter()}
     * whenever no instance-specific replacements have been set.
     *
     * @config
     * @var array See {@link setReplacements()}.
     */
    private static $default_replacements = [
        '/&amp;/u' => '-and-',
        '/&/u' => '-and-',
        '/\s|\+/u' => '-', // remove whitespace/plus
        '/[_.]+/u' => '-', // underscores and dots to dashes
        '/[^A-Za-z0-9\-]+/u' => '', // remove non-ASCII chars, only allow alphanumeric and dashes
        '/[\/\?=#:]+/u' => '-', // remove forward slashes, question marks, equal signs, hashes and colons in case multibyte is allowed (and non-ASCII chars aren't removed)
        '/[\-]{2,}/u' => '-', // remove duplicate dashes
        '/^[\-]+/u' => '', // Remove all leading dashes
        '/[\-]+$/u' => '' // Remove all trailing dashes
    ];

    /**
     * Doesn't try to replace or transliterate non-ASCII characters.
     * Useful for character sets that have little overlap with ASCII (e.g. far eastern),
     * as well as better search engine optimization for URLs.
     * @see http://www.ietf.org/rfc/rfc3987
     *
     * @config
     * @var bool
     */
    private static $default_allow_multibyte = false;

    /**
     * Instance-specific replacements; when empty, the configured defaults are used.
     *
     * @var array See {@link setReplacements()}
     */
    public $replacements = [];

    /**
     * @var Transliterator
     */
    protected $transliterator;

    /**
     * Instance-level override for multibyte support; null means "use the configured default".
     *
     * @var bool|null
     */
    protected $allowMultibyte;

    /**
     * Filters a URL segment: optionally transliterates it to ASCII, lowercases it,
     * applies the replacement rules in order and, when multibyte characters are
     * allowed, percent-encodes the result.
     *
     * Note: Depending on the applied replacement rules, this method might result in an empty string.
     * An empty string is also returned when a replacement pattern fails (for example on input
     * that is not valid UTF-8), because preg_replace() reports such failures as null.
     *
     * @param string $name URL path (without domain or query parameters), in utf8 encoding
     * @return string A filtered path compatible with RFC 3986
     */
    public function filter($name)
    {
        $allowMultibyte = $this->getAllowMultibyte();

        if (!$allowMultibyte) {
            // Only transliterate when no multibyte support is requested
            $transliterator = $this->getTransliterator();
            if ($transliterator) {
                $name = $transliterator->toASCII($name);
            }
        }

        // Input is documented as UTF-8, so don't depend on the ambient mb_internal_encoding()
        $name = mb_strtolower($name ?? '', 'UTF-8');

        $replacements = $this->getReplacements();
        if ($allowMultibyte) {
            // Unset automated removal of non-ASCII characters (unset() is a no-op if the rule is absent)
            unset($replacements['/[^A-Za-z0-9\-]+/u']);
        }

        foreach ($replacements as $regex => $replace) {
            // preg_replace() returns null on failure; normalise so the result is always a string
            $name = preg_replace($regex, $replace ?? '', $name) ?? '';
        }

        // Multibyte URLs require percent encoding to comply to RFC 3986.
        // Without this setting, the "remove non-ASCII chars" regex takes care of that.
        if ($allowMultibyte) {
            $name = rawurlencode($name);
        }

        return $name;
    }

    /**
     * Sets instance-specific replacements, taking precedence over the configured defaults.
     *
     * @param string[] $replacements Map of find/replace used for preg_replace().
     * @return $this
     */
    public function setReplacements($replacements)
    {
        $this->replacements = $replacements;
        return $this;
    }

    /**
     * Returns the instance-specific replacements, or the configured
     * `default_replacements` when none (or an empty set) have been assigned.
     *
     * @return string[]
     */
    public function getReplacements()
    {
        return $this->replacements ?: (array)$this->config()->get('default_replacements');
    }

    /**
     * Returns the transliterator, creating one via Transliterator::create() on first use
     * when none has been set and `default_use_transliterator` is enabled.
     *
     * @return Transliterator|null
     */
    public function getTransliterator()
    {
        if ($this->transliterator === null && $this->config()->get('default_use_transliterator')) {
            $this->transliterator = Transliterator::create();
        }
        return $this->transliterator;
    }

    /**
     * @param Transliterator $transliterator
     * @return $this
     */
    public function setTransliterator($transliterator)
    {
        $this->transliterator = $transliterator;
        return $this;
    }

    /**
     * Overrides the configured `default_allow_multibyte` for this instance.
     *
     * @param bool $bool
     * @return $this
     */
    public function setAllowMultibyte($bool)
    {
        $this->allowMultibyte = $bool;
        return $this;
    }

    /**
     * Returns the instance-level setting if one was assigned, otherwise the
     * configured `default_allow_multibyte`.
     *
     * @return bool
     */
    public function getAllowMultibyte()
    {
        if ($this->allowMultibyte !== null) {
            return $this->allowMultibyte;
        }
        return $this->config()->get('default_allow_multibyte');
    }
}