silverstripe-framework/src/Core/Convert.php

<?php

namespace SilverStripe\Core;

use InvalidArgumentException;
use SimpleXMLElement;
use SilverStripe\Dev\Deprecation;
use SilverStripe\ORM\DB;
use SilverStripe\View\Parsers\URLSegmentFilter;

/**
 * Library of conversion functions, implemented as static methods.
 *
 * The methods are all of the form (format)2(format), where the format is one of
 *
 *  raw: A UTF8 string
 *  attr: A UTF8 string suitable for inclusion in an HTML attribute
 *  js: A UTF8 string suitable for inclusion in a double-quoted javascript string.
 *
 *  array: A PHP associative array
 *  json: JavaScript object notation
 *
 *  html: HTML source suitable for use in a page or email
 *  text: Plain-text content, suitable for display to a user as-is, or insertion in a plaintext email.
 *
 * Objects of type {@link ViewableData} can have an "escaping type",
 * which determines if they are automatically escaped before output by {@link SSViewer}.
 */
class Convert
{
    /**
     * Escape a value so that it can be placed inside an XML attribute.
     *
     * This is a thin wrapper: the value is handed to {@link Convert::raw2xml()}
     * and its result is returned as-is.
     *
     * Warning: Does not escape array keys
     *
     * @param array|string $val String to escape, or array of strings
     * @return array|string Escaped value, in the same shape as the input
     */
    public static function raw2att($val)
    {
        $escaped = self::raw2xml($val);

        return $escaped;
    }

    /**
     * Escape a value so that it can be placed inside an HTML attribute.
     *
     * Delegates to {@link Convert::raw2att()} and returns its result unchanged.
     *
     * Warning: Does not escape array keys
     *
     * @param string|array $val String to escape, or array of strings
     * @return array|string Escaped value, in the same shape as the input
     */
    public static function raw2htmlatt($val)
    {
        $escaped = self::raw2att($val);

        return $escaped;
    }

    /**
     * Convert a value to be suitable for an HTML name attribute.
     *
     * Scalars are escaped with {@link Convert::raw2att()}; arrays are walked
     * recursively and every value is converted, keeping the original keys.
     * No characters are removed or substituted beyond that escaping.
     *
     * Warning: Does not escape array keys
     *
     * @see http://www.w3.org/TR/REC-html40/types.html#type-cdata
     *
     * @param array|string $val String to escape, or array of strings
     *
     * @return array|string
     */
    public static function raw2htmlname($val)
    {
        if (!is_array($val)) {
            return self::raw2att($val);
        }

        $names = [];
        foreach ($val as $key => $item) {
            $names[$key] = self::raw2htmlname($item);
        }

        return $names;
    }

    /**
     * Convert a value to be suitable for an HTML ID attribute.
     *
     * Every run of characters outside `a-z A-Z 0-9 - _ : .` becomes a single
     * underscore, consecutive underscores are collapsed into one, and leading
     * and trailing underscores are trimmed. Arrays are processed recursively.
     *
     * Warning: Does not escape array keys
     *
     * @see http://www.w3.org/TR/REC-html40/types.html#type-cdata
     *
     * @param array|string $val String to escape, or array of strings
     *
     * @return array|string
     */
    public static function raw2htmlid($val)
    {
        if (!is_array($val)) {
            // Step 1: swap each run of unsupported characters for an underscore
            $underscored = preg_replace('/[^a-zA-Z0-9\-_:.]+/', '_', $val ?? '') ?? '';
            // Step 2: squash repeated underscores (including pre-existing ones)
            $collapsed = preg_replace('/_+/', '_', $underscored) ?? '';

            return trim($collapsed, '_');
        }

        $ids = [];
        foreach ($val as $key => $item) {
            $ids[$key] = self::raw2htmlid($item);
        }

        return $ids;
    }

    /**
     * Escape text for safe inclusion in XML.
     *
     * Strings go through htmlspecialchars() with ENT_QUOTES, so both single and
     * double quotes are encoded along with `<`, `>` and `&`. Arrays are escaped
     * value-by-value, recursively; null is treated as an empty string.
     *
     * Warning: Does not escape array keys
     *
     * @see http://www.w3.org/TR/REC-xml/#dt-escape
     * @param array|string $val String to escape, or array of strings
     * @return array|string
     */
    public static function raw2xml($val)
    {
        if (!is_array($val)) {
            return htmlspecialchars($val ?? '', ENT_QUOTES, 'UTF-8');
        }

        $escaped = [];
        foreach ($val as $key => $item) {
            $escaped[$key] = self::raw2xml($item);
        }

        return $escaped;
    }

    /**
     * Escape text for inclusion in a quoted Javascript string.
     *
     * Backslashes, both quote styles and CR/LF are backslash-escaped, while
     * `<`, `>` and `&` are rewritten as `\x..` hex escapes. Arrays are handled
     * recursively, value by value.
     *
     * Warning: Does not escape array keys
     *
     * @param array|string $val String to escape, or array of strings
     * @return array|string
     */
    public static function raw2js($val)
    {
        if (!is_array($val)) {
            // Order matters: the backslash is replaced first so that the
            // backslashes introduced by the later replacements are not doubled.
            $escapes = [
                "\\" => "\\\\",
                '"' => '\"',
                "\n" => '\n',
                "\r" => '\r',
                "'" => "\\'",
                // Characters such as <, >, and & can interfere when the script is embedded
                '<' => "\\x3c",
                '>' => "\\x3e",
                '&' => "\\x26",
            ];

            return str_replace(array_keys($escapes), array_values($escapes), $val ?? '');
        }

        $escaped = [];
        foreach ($val as $key => $item) {
            $escaped[$key] = self::raw2js($item);
        }

        return $escaped;
    }

    /**
     * Encode a value as a JSON string via json_encode(), after emitting a
     * deprecation notice. A bitmask of JSON_* constants may be supplied and is
     * forwarded to json_encode(); null is treated as 0.
     *
     * @deprecated 4.4.0 Use json_encode() instead
     * @param  mixed $val     Value to be encoded
     * @param  int   $options Optional bitmask of JSON constants
     * @return string           JSON encoded string
     */
    public static function raw2json($val, $options = 0)
    {
        Deprecation::notice('4.4.0', 'Use json_encode() instead');

        $flags = $options ?? 0;
        $encoded = json_encode($val, $flags);

        return $encoded;
    }

    /**
     * Encode an array as a JSON string via json_encode(), after emitting a
     * deprecation notice. The options bitmask is forwarded to json_encode();
     * null is treated as 0.
     *
     * @deprecated 4.4.0 Use json_encode() instead
     * @param  array  $val     Array to convert
     * @param  int    $options Optional bitmask of JSON constants
     * @return string          JSON encoded string
     */
    public static function array2json($val, $options = 0)
    {
        Deprecation::notice('4.4.0', 'Use json_encode() instead');

        $flags = $options ?? 0;
        $encoded = json_encode($val, $flags);

        return $encoded;
    }

    /**
     * Encode a value (or a list of values) for use in SQL by delegating to the
     * current database connection obtained from DB::get_conn().
     *
     * Arrays are processed recursively and keep their keys; the $quoted flag is
     * passed down unchanged to every nested value.
     *
     * Warning: Does not encode array keys
     *
     * @param mixed|array $val Input value, or list of values as an array
     * @param boolean $quoted When true the connection's quoteString() is used,
     * otherwise (the default) its escapeString() is used.
     * @return string|array Encoded value in the same format as the input
     */
    public static function raw2sql($val, $quoted = false)
    {
        if (!is_array($val)) {
            $connection = DB::get_conn();

            return $quoted
                ? $connection->quoteString($val)
                : $connection->escapeString($val);
        }

        $encoded = [];
        foreach ($val as $key => $item) {
            $encoded[$key] = self::raw2sql($item, $quoted);
        }

        return $encoded;
    }

    /**
     * Encode a SQL symbolic identifier (or list of identifiers), such as a
     * database, table, or column name. The work is delegated to the current
     * database connection's escapeIdentifier(), which also receives the
     * separator used between multiple identifiers (e.g. ".").
     *
     * @param string|array $identifier The identifier to escape. E.g. 'SiteTree.Title' or list of identifiers
     * to be joined via the separator.
     * @param string $separator The string that delimits subsequent identifiers
     * @return string The escaped identifier. E.g. '"SiteTree"."Title"'
     */
    public static function symbol2sql($identifier, $separator = '.')
    {
        $connection = DB::get_conn();

        return $connection->escapeIdentifier($identifier, $separator);
    }

    /**
     * Convert XML to raw text.
     *
     * Arrays are converted recursively. A string containing a `<` is treated
     * as markup and routed through {@link Convert::html2raw()}; any other
     * string simply has its entities decoded with html_entity_decode().
     *
     * Warning: Does not decode array keys
     *
     * @uses html2raw()
     * @param mixed $val
     * @return array|string
     */
    public static function xml2raw($val)
    {
        if (!is_array($val)) {
            // Anything that looks like it contains tags needs the heavier html2raw conversion
            $containsMarkup = strpos($val ?? '', '<') !== false;

            return $containsMarkup
                ? self::html2raw($val)
                : html_entity_decode($val ?? '', ENT_QUOTES, 'UTF-8');
        }

        $decoded = [];
        foreach ($val as $key => $item) {
            $decoded[$key] = self::xml2raw($item);
        }

        return $decoded;
    }

    /**
     * Decode a JSON string with json_decode() in its default (object) mode,
     * after emitting a deprecation notice. Null input is decoded as an empty
     * string.
     *
     * @deprecated 4.4.0 Use json_decode() instead
     * @param string $val
     * @return mixed Whatever json_decode() returns for the input
     */
    public static function json2obj($val)
    {
        Deprecation::notice('4.4.0', 'Use json_decode() instead');

        $json = $val ?? '';
        $decoded = json_decode($json);

        return $decoded;
    }

    /**
     * Decode a JSON string with json_decode() in associative-array mode,
     * after emitting a deprecation notice. Null input is decoded as an empty
     * string.
     *
     * @deprecated 4.4.0 Use json_decode($val, true) instead
     * @param string $val JSON string to convert
     * @return mixed Whatever json_decode($val, true) returns for the input
     */
    public static function json2array($val)
    {
        Deprecation::notice('4.4.0', 'Use json_decode() instead');

        $json = $val ?? '';
        $decoded = json_decode($json, true);

        return $decoded;
    }

    /**
     * Converts an XML string to a PHP array
     * See http://phpsecurity.readthedocs.org/en/latest/Injection-Attacks.html#xml-external-entity-injection
     *
     * All `<!ENTITY ...>` declarations are stripped from the input before it is
     * parsed, and the input is rejected if one is still present afterwards.
     *
     * @uses recursiveXMLToArray()
     * @param string $val
     * @param boolean $disableDoctypes When true, input containing `<!DOCTYPE` is rejected with an
     * exception. false by default.
     * @param boolean $disableExternals Does nothing because xml entities are removed
     * @deprecated 4.11.0 Use a dedicated XML library instead
     * @return array
     * @throws InvalidArgumentException If a doctype is present while disabled, or an entity
     * declaration survives stripping. The SimpleXMLElement constructor may additionally throw
     * when the remaining XML cannot be parsed.
     */
    public static function xml2array($val, $disableDoctypes = false, $disableExternals = false)
    {
        Deprecation::notice('4.11.0', 'Use a dedicated XML library instead');

        $source = $val ?? '';

        // Optionally refuse documents that declare a doctype at all
        $hasDoctype = strpos($source, '<!DOCTYPE') !== false;
        if ($disableDoctypes && $hasDoctype) {
            throw new InvalidArgumentException('XML Doctype parsing disabled');
        }

        // CVE-2021-41559 Ensure entities are removed due to their inherent security risk via
        // XXE attacks and quadratic blowup attacks, and also lack of consistent support
        $stripped = preg_replace('/(?s)<!ENTITY.*?>/', '', $source);

        // An <!ENTITY> that survives the stripping can only come from a maliciously
        // crafted XML document e.g. <!ENTITY><!<!ENTITY>ENTITY ext SYSTEM "http://evil.com">
        $entityRemains = strpos($stripped ?? '', '<!ENTITY') !== false;
        if ($entityRemains) {
            throw new InvalidArgumentException('Malicious XML entity detected');
        }

        // This will throw an exception if the XML contains references to any internal entities
        // that were defined in an <!ENTITY /> before it was removed
        $document = new SimpleXMLElement($stripped ?? '');

        return self::recursiveXMLToArray($document);
    }

    /**
     * Recursively turn a parsed XML node into plain PHP arrays and strings.
     * Do not call this function directly, please use {@link Convert::xml2array()}
     *
     * A SimpleXMLElement is expanded with get_object_vars(); each resulting
     * entry is converted recursively. Truthy attributes of the element are
     * collected as strings under the '@' key of the result. An element that
     * expands to nothing, and any non-array value, is returned cast to string.
     *
     * @param SimpleXMLElement $xml
     *
     * @return mixed
     */
    protected static function recursiveXMLToArray($xml)
    {
        $element = null;
        if ($xml instanceof SimpleXMLElement) {
            // Collect attributes first; $attributeMap stays undefined when there are none
            foreach ($xml->attributes() as $name => $attribute) {
                if ($attribute) {
                    $attributeMap[$name] = (string) $attribute;
                }
            }
            // Keep the element itself so it can be stringified if it has no properties
            $element = $xml;
            $xml = get_object_vars($xml);
        }

        if (!is_array($xml)) {
            return (string) $xml;
        }

        // for CDATA
        if (count($xml ?? []) === 0) {
            return (string)$element;
        }

        $result = [];
        foreach ($xml as $key => $value) {
            $result[$key] = self::recursiveXMLToArray($value);
        }

        // Attributes
        if (isset($attributeMap)) {
            $result['@'] = $attributeMap;
        }

        return $result;
    }

    /**
     * Create a link if the string is a valid URL
     *
     * The URL is HTML-escaped before being placed in the `href` attribute and
     * the link text. This matters because `$-_` inside the character class is
     * a range (0x24-0x5F), so characters such as `<`, `>`, `&` and `'` are
     * accepted by the pattern and would otherwise be emitted as raw markup.
     * Entities that are already present are not encoded a second time.
     *
     * Strings that do not match are returned untouched (and unescaped).
     *
     * @param string $string The string to linkify
     * @return string A link to the URL if string is a URL
     */
    public static function linkIfMatch($string)
    {
        if (!preg_match('/^[a-z+]+\:\/\/[a-zA-Z0-9$-_.+?&=!*\'()%]+$/', $string ?? '')) {
            return $string;
        }

        // double_encode = false keeps callers that pass pre-escaped URLs working
        $escaped = htmlspecialchars($string ?? '', ENT_QUOTES, 'UTF-8', false);

        return "<a style=\"white-space: nowrap\" href=\"{$escaped}\">{$escaped}</a>";
    }

    /**
     * Simple conversion of HTML to plaintext.
     *
     * The conversion is a fixed sequence of regular-expression passes, and the
     * order of the passes matters: style/script removal, bold markers, link
     * expansion, image alt text, whitespace compression, block-level tags to
     * newlines, entity decoding, and finally removal of the remaining tags.
     *
     * @param string $data Input data
     * @param bool $preserveLinks When true, links are not expanded and `<a>` tags are kept in the
     * output. Has the same effect as the 'PreserveLinks' config key; either one enables it.
     * @param int $wordWrap When non-zero, the trimmed result is passed through wordwrap() with this width
     * @param array|null $config Optional overrides merged over the defaults. Recognised keys:
     * 'PreserveLinks' (default false), 'ReplaceBoldAsterisk' (default true),
     * 'CompressWhitespace' (default true), 'ReplaceImagesWithAlt' (default true)
     * @return string
     */
    public static function html2raw($data, $preserveLinks = false, $wordWrap = 0, $config = null)
    {
        $defaultConfig = [
            'PreserveLinks' => false,
            'ReplaceBoldAsterisk' => true,
            'CompressWhitespace' => true,
            'ReplaceImagesWithAlt' => true,
        ];
        // Caller-supplied keys win over the defaults
        if (isset($config)) {
            $config = array_merge($defaultConfig, $config);
        } else {
            $config = $defaultConfig;
        }

        // Drop <style> and <script> elements together with their contents
        $data = preg_replace("/<style([^A-Za-z0-9>][^>]*)?>.*?<\/style[^>]*>/is", '', $data ?? '');
        $data = preg_replace("/<script([^A-Za-z0-9>][^>]*)?>.*?<\/script[^>]*>/is", '', $data ?? '');

        // Turn opening and closing <strong>/<b> tags into asterisks
        if ($config['ReplaceBoldAsterisk']) {
            $data = preg_replace('%<(strong|b)( [^>]*)?>|</(strong|b)>%i', '*', $data ?? '');
        }

        // Expand hyperlinks to "link text[href]". The link text is itself converted
        // recursively with the default options. The first pass handles double-quoted
        // href values, the second pass handles unquoted ones.
        if (!$preserveLinks && !$config['PreserveLinks']) {
            $data = preg_replace_callback('/<a[^>]*href\s*=\s*"([^"]*)">(.*?)<\/a>/ui', function ($matches) {
                return Convert::html2raw($matches[2]) . "[$matches[1]]";
            }, $data ?? '');
            $data = preg_replace_callback('/<a[^>]*href\s*=\s*([^ ]*)>(.*?)<\/a>/ui', function ($matches) {
                return Convert::html2raw($matches[2]) . "[$matches[1]]";
            }, $data ?? '');
        }

        // Replace images with their alt text, padded with a space on each side
        // (quoted alt values first, then unquoted ones)
        if ($config['ReplaceImagesWithAlt']) {
            $data = preg_replace('/<img[^>]*alt *= *"([^"]*)"[^>]*>/i', ' \\1 ', $data ?? '');
            $data = preg_replace('/<img[^>]*alt *= *([^ ]*)[^>]*>/i', ' \\1 ', $data ?? '');
        }

        // Compress whitespace: every run of whitespace (including newlines) becomes one space.
        // This runs before the newline-producing passes below so their newlines survive.
        if ($config['CompressWhitespace']) {
            $data = preg_replace("/\s+/u", ' ', $data ?? '');
        }

        // Parse newline tags: opening headings, paragraphs and divs start a new paragraph
        $data = preg_replace("/\s*<[Hh][1-6]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data ?? '');
        $data = preg_replace("/\s*<[Pp]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data ?? '');
        $data = preg_replace("/\s*<[Dd][Ii][Vv]([^A-Za-z0-9>][^>]*)?> */u", "\n\n", $data ?? '');
        // Never leave more than one blank line in a row
        $data = preg_replace("/\n\n\n+/", "\n\n", $data ?? '');

        // <br> and <tr> become a single newline, a closing </td> becomes four spaces,
        // and a closing </p> ends the paragraph with a blank line
        $data = preg_replace('/<[Bb][Rr]([^A-Za-z0-9>][^>]*)?> */', "\n", $data ?? '');
        $data = preg_replace('/<[Tt][Rr]([^A-Za-z0-9>][^>]*)?> */', "\n", $data ?? '');
        $data = preg_replace("/<\/[Tt][Dd]([^A-Za-z0-9>][^>]*)?> */", '    ', $data ?? '');
        $data = preg_replace('/<\/p>/i', "\n\n", $data ?? '');

        // Replace HTML entities
        $data = html_entity_decode($data ?? '', ENT_QUOTES, 'UTF-8');

        // Remove all remaining tags (but optionally keep links).
        // Note this runs after entity decoding, so text that was entity-encoded
        // to look like a tag is removed here as well.
        //
        // strip_tags seemed to be restricting the length of the output
        // arbitrarily. This essentially does the same thing.
        if (!$preserveLinks && !$config['PreserveLinks']) {
            $data = preg_replace('/<\/?[^>]*>/', '', $data ?? '');
        } else {
            // Links are being preserved: strip everything except <a> tags
            $data = strip_tags($data ?? '', '<a>');
        }

        // Wrap
        if ($wordWrap) {
            $data = wordwrap(trim($data ?? ''), $wordWrap ?? 0);
        }
        return trim($data ?? '');
    }

    /**
     * Percent-encode a value for use inside a mailto: link.
     *
     * There are no real specifications on correctly encoding mailto-links,
     * but this seems to be compatible with most of the user-agents.
     * Does nearly the same as rawurlencode(), but only a fixed set of
     * characters is encoded: newline, `?`, `=`, space, `(`, `)`, `&`, `@`,
     * `"`, `'` and `;`.
     * Please only encode the values, not the whole url, e.g.
     * "mailto:test@test.com?subject=" . Convert::raw2mailto($subject)
     *
     * @param $data string
     * @return string
     * @see http://www.ietf.org/rfc/rfc1738.txt
     */
    public static function raw2mailto($data)
    {
        $encodings = [
            "\n" => '%0A',
            '?' => '%3F',
            '=' => '%3D',
            ' ' => '%20',
            '(' => '%28',
            ')' => '%29',
            '&' => '%26',
            '@' => '%40',
            '"' => '%22',
            '\'' => '%27',
            ';' => '%3B',
        ];

        return str_ireplace(array_keys($encodings), array_values($encodings), $data ?? '');
    }

    /**
     * Convert a string (normally a title) to a string suitable for using in
     * urls and other html attributes, by passing it through the filter()
     * method of a freshly created {@link URLSegmentFilter}.
     *
     * @param string $title
     * @return string
     */
    public static function raw2url($title)
    {
        $filter = URLSegmentFilter::create();
        $segment = $filter->filter($title);

        return $segment;
    }

    /**
     * Normalises newline sequences to a single, consistent sequence.
     *
     * Every match of the PCRE `\R` newline class (matched in UTF-8 mode) is
     * replaced with $nl.
     *
     * @param string $data Text containing potentially mixed formats of newline
     * sequences including \r, \r\n, \n, or unicode newline characters
     * @param string $nl The newline sequence to normalise to. Defaults to that
     * specified by the current OS
     * @return string
     */
    public static function nl2os($data, $nl = PHP_EOL)
    {
        $newline = $nl ?? '';
        $text = $data ?? '';

        return preg_replace('~\R~u', $newline, $text);
    }

    /**
     * Encode a value into a string that can be used as part of a filename.
     * All string data must be UTF-8 encoded.
     *
     * The value is JSON-encoded, then base64-encoded; `+` and `/` are swapped
     * for `~` and `_`, and trailing `=` padding is removed.
     *
     * @param mixed $val Value to be encoded
     * @return string
     */
    public static function base64url_encode($val)
    {
        $json = json_encode($val) ?? '';
        $base64 = base64_encode($json);
        $filenameSafe = strtr($base64, '+/', '~_');

        return rtrim($filenameSafe, '=');
    }

    /**
     * Decode a value that was encoded with Convert::base64url_encode.
     *
     * Reverses the encoding steps: `~` and `_` are mapped back to `+` and `/`,
     * the `=` padding that the encoder stripped is restored, and the result is
     * base64-decoded and then JSON-decoded into arrays/scalars.
     *
     * @param string $val Value to be decoded
     * @return mixed Original value, or null when the input cannot be decoded
     */
    public static function base64url_decode($val)
    {
        $base64 = strtr($val ?? '', '~_', '+/');

        // Restore padding up to a multiple of four characters. (str_pad() takes
        // the desired *total* length, so the pad count is computed explicitly.)
        $remainder = strlen($base64) % 4;
        if ($remainder !== 0) {
            $base64 .= str_repeat('=', 4 - $remainder);
        }

        // base64_decode() signals failure with false rather than null
        $json = base64_decode($base64);
        if ($json === false) {
            return null;
        }

        return json_decode($json, true);
    }

    /**
     * Converts upper camel case names to lower camel case,
     * with leading upper case characters replaced with lower case.
     * Tries to retain word case.
     *
     * Examples:
     * - ID => id
     * - IDField => idField
     * - iDField => iDField
     *
     * @param $str
     * @return string
     */
    public static function upperCamelToLowerCamel($str)
    {
        $subject = $str ?? '';
        $parts = null;

        // Several leading capitals followed by lowercase text: lowercase all
        // of the leading capitals except the last one, which starts the next word.
        if (preg_match('/(^[A-Z]{1,})([A-Z]{1})([a-z]+.*)/', $subject, $parts)) {
            return strtolower($parts[1] ?? '') . $parts[2] . $parts[3];
        }

        // Exactly one leading capital followed by lowercase text: lowercase it.
        if (preg_match('/(^[A-Z]{1})([a-z]+.*)/', $subject, $parts)) {
            return strtolower($parts[1] ?? '') . $parts[2];
        }

        // Nothing but capitals: lowercase the whole string.
        if (preg_match('/^[A-Z]+$/', $subject)) {
            return strtolower($subject);
        }

        // No leading capital: hand the input back untouched.
        return $str;
    }

    /**
     * Turn a memory string, such as 512M into an actual number of bytes.
     * Preserves integer values like "1024" or "-1"
     *
     * The first unit letter found (one of b, k, m, g, t, p, e, z, y, in any
     * case) selects the power of 1024 to multiply by. Input without any
     * numeric part (e.g. "" or "M") is treated as 0 instead of raising a
     * TypeError on PHP 8.
     *
     * @param string $memString A memory limit string, such as "64M"
     * @return int
     */
    public static function memstring2bytes($memString)
    {
        // Remove non-unit characters from the size
        $unit = (string) preg_replace('/[^bkmgtpezy]/i', '', $memString ?? '');
        // Remove non-numeric characters from the size. The explicit float cast
        // keeps arithmetic safe when nothing numeric is left ('' => 0.0).
        $size = (float) preg_replace('/[^0-9\.\-]/', '', $memString ?? '');

        if ($unit === '') {
            return (int) round($size);
        }

        // The position of the unit in the ordered string is the power
        // of magnitude to multiply a kilobyte by
        $power = stripos('bkmgtpezy', $unit[0]);

        return (int) round($size * pow(1024, $power));
    }

    /**
     * Format a byte count as a short memory string such as "2K" or "1.5M".
     *
     * The unit is chosen from the base-1024 logarithm of the value; when the
     * computed index has no matching unit, megabytes ('M') are used.
     *
     * @param float $bytes
     * @param int $decimal decimal precision
     * @return string
     */
    public static function bytes2memstring($bytes, $decimal = 0)
    {
        $units = ['B','K','M','G','T','P','E','Z','Y'];

        // Index of the unit = whole powers of 1024 contained in the value
        $magnitude = (int)floor(log($bytes ?? 0.0, 1024));
        if (!array_key_exists($magnitude, $units)) {
            $magnitude = 2;
        }

        // Scale the value down to the chosen unit and round it
        $divisor = pow(1024, $magnitude);
        $rounded = round($bytes / $divisor, $decimal ?? 0);

        return $rounded . $units[$magnitude];
    }

    /**
     * Convert slashes in relative or absolute filesystem path. Defaults to DIRECTORY_SEPARATOR
     *
     * Both forward slashes and backslashes are replaced with $separator. With
     * $multiple enabled, a run of consecutive slashes is replaced by a single
     * separator; otherwise each slash is replaced individually.
     *
     * @param string $path
     * @param string $separator
     * @param bool $multiple Collapses multiple slashes or not
     * @return string
     */
    public static function slashes($path, $separator = DIRECTORY_SEPARATOR, $multiple = true)
    {
        $replacement = $separator ?? '';
        $subject = $path ?? '';

        if (!$multiple) {
            return str_replace(['/', '\\'], $replacement, $subject);
        }

        return preg_replace('#[/\\\\]+#', $replacement, $subject);
    }
}