351 lines
8.5 KiB
PHP
351 lines
8.5 KiB
PHP
<?php
|
|
/**
 * Author: Nil Portugués Calderó <contact@nilportugues.com>
 * Date: 6/26/14
 * Time: 12:10 AM.
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
|
|
|
|
namespace NilPortugues\Sql\QueryFormatter\Tokenizer;
|
|
|
|
use NilPortugues\Sql\QueryFormatter\Helper\Token;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\Boundary;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\Comment;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\Numeral;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\Quoted;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\Reserved;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\LiteralString;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\UserDefined;
|
|
use NilPortugues\Sql\QueryFormatter\Tokenizer\Parser\WhiteSpace;
|
|
|
|
/**
 * Splits a SQL string into a flat list of tokens.
 *
 * Every token is a two-element array indexed by the TOKEN_TYPE and TOKEN_VALUE
 * constants. The actual matching is delegated to the static parser helpers of
 * the Parser namespace: they receive this instance and are expected to publish
 * their result through setNextToken() (their code is not part of this file).
 */
class Tokenizer
{
    // Token kinds stored under the TOKEN_TYPE index of a token.
    const TOKEN_TYPE_WHITESPACE = 0;
    const TOKEN_TYPE_WORD = 1;
    const TOKEN_TYPE_QUOTE = 2;
    const TOKEN_TYPE_BACK_TICK_QUOTE = 3;
    const TOKEN_TYPE_RESERVED = 4;
    const TOKEN_TYPE_RESERVED_TOP_LEVEL = 5;
    const TOKEN_TYPE_RESERVED_NEWLINE = 6;
    const TOKEN_TYPE_BOUNDARY = 7;
    const TOKEN_TYPE_COMMENT = 8;
    const TOKEN_TYPE_BLOCK_COMMENT = 9;
    const TOKEN_TYPE_NUMBER = 10;
    const TOKEN_TYPE_ERROR = 11;
    const TOKEN_TYPE_VARIABLE = 12;

    // Indexes of the two fields inside a token array.
    const TOKEN_TYPE = 0;
    const TOKEN_VALUE = 1;

    /**
     * Alternation group built from Token::$boundaries.
     *
     * @var string
     */
    protected $regexBoundaries;

    /**
     * Alternation group built from Token::$reserved.
     *
     * @var string
     */
    protected $regexReserved;

    /**
     * Alternation group built from Token::$reservedNewLine, with literal spaces replaced by \s+.
     *
     * @var string
     */
    protected $regexReservedNewLine;

    /**
     * Alternation group built from Token::$reservedTopLevel, with literal spaces replaced by \s+.
     *
     * @var string
     */
    protected $regexReservedTopLevel;

    /**
     * Alternation group built from Token::$functions.
     *
     * @var string
     */
    protected $regexFunction;

    /**
     * Length of the input prefix used as cache key. Only inputs at least this
     * long are looked up, and only tokens shorter than this are stored.
     *
     * @var int
     */
    protected $maxCacheKeySize = 15;

    /**
     * Parsed tokens keyed by the input prefix they were parsed from.
     *
     * @var array
     */
    protected $tokenCache = [];

    /**
     * Token produced by the parser chain for the current position.
     *
     * @var array
     */
    protected $nextToken = [];

    /**
     * Number of bytes of the input that still have to be tokenized.
     *
     * @var int
     */
    protected $currentStringLength = 0;

    /**
     * Value of $currentStringLength before the most recent token was consumed.
     *
     * @var int
     */
    protected $oldStringLength = 0;

    /**
     * The token consumed last; an empty string until the first token was read.
     *
     * @var array|string
     */
    protected $previousToken = '';

    /**
     * Byte length of the value of the token consumed last.
     *
     * @var int
     */
    protected $tokenLength = 0;

    /**
     * Tokens collected by the current tokenize() run.
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * Builds all the regular expressions needed to tokenize the input.
     *
     * Side effect: the static Token::$reserved list is rewritten in place,
     * ordered by descending word length (presumably so that longer reserved
     * words are tried first inside the alternation).
     */
    public function __construct()
    {
        // The words become array keys here, so duplicated entries collapse into one.
        $reservedMap = \array_combine(Token::$reserved, \array_map('strlen', Token::$reserved));
        \arsort($reservedMap);
        Token::$reserved = \array_keys($reservedMap);

        $this->regexFunction = $this->initRegex(Token::$functions);
        $this->regexBoundaries = $this->initRegex(Token::$boundaries);
        $this->regexReserved = $this->initRegex(Token::$reserved);

        // Multi-word entries may be separated by any run of whitespace in the input.
        $this->regexReservedTopLevel = \str_replace(' ', '\\s+', $this->initRegex(Token::$reservedTopLevel));
        $this->regexReservedNewLine = \str_replace(' ', '\\s+', $this->initRegex(Token::$reservedNewLine));
    }

    /**
     * Turns a list of literal strings into a single regex alternation group.
     *
     * @param string[] $variable Literal strings; each one is escaped with quoteRegex().
     *
     * @return string The group, e.g. "(a|b|c)"; "()" for an empty list.
     */
    protected function initRegex($variable)
    {
        return '('.\implode('|', \array_map([$this, 'quoteRegex'], $variable)).')';
    }

    /**
     * Takes a SQL string and breaks it into tokens.
     * Each token is an array holding its type under self::TOKEN_TYPE and its
     * text under self::TOKEN_VALUE.
     *
     * @param string $string
     *
     * @return array The tokens in input order; an empty array for an empty string.
     */
    public function tokenize($string)
    {
        if (\strlen($string) > 0) {
            return $this->processTokens($string);
        }

        return [];
    }

    /**
     * Consumes the input token by token until nothing is left or no progress is made.
     *
     * The loop ends as soon as a pass fails to shorten the input. Because the
     * loop condition is ">= 0", one further pass still runs after the input
     * has been consumed completely; whatever the parser chain yields for the
     * empty remainder is appended as the final entry.
     *
     * @param string $string
     *
     * @return array
     */
    protected function processTokens($string)
    {
        $this->tokens = [];
        $this->previousToken = '';
        $this->currentStringLength = \strlen($string);
        // Start one above the real length so that the first pass counts as progress.
        $this->oldStringLength = \strlen($string) + 1;

        while ($this->currentStringLength >= 0) {
            if ($this->oldStringLength <= $this->currentStringLength) {
                break;
            }
            $string = $this->processOneToken($string);
        }

        return $this->tokens;
    }

    /**
     * Reads one token from the start of the input, records it and updates the length bookkeeping.
     *
     * @param string $string The input that is still to be tokenized.
     *
     * @return string The input with the consumed token removed from its start.
     */
    protected function processOneToken($string)
    {
        $token = $this->getToken($string, $this->currentStringLength, $this->previousToken);
        $this->tokens[] = $token;
        // A token without a value counts as zero-length, which ends the loop in processTokens().
        $this->tokenLength = $this->tokenValueLength($token);
        $this->previousToken = $token;

        $this->oldStringLength = $this->currentStringLength;
        $this->currentStringLength -= $this->tokenLength;

        return \substr($string, $this->tokenLength);
    }

    /**
     * Returns the token at the start of the input, served from the cache when possible.
     *
     * NOTE: a cached token is returned without looking at $previousToken.
     *
     * @param string       $string
     * @param int          $currentStringLength
     * @param array|string $previousToken       The token read before this one; '' for the first token.
     *
     * @return array|mixed
     */
    protected function getToken($string, $currentStringLength, $previousToken)
    {
        $cacheKey = $this->useTokenCache($string, $currentStringLength);
        if (!empty($cacheKey) && isset($this->tokenCache[$cacheKey])) {
            return $this->getNextTokenFromCache($cacheKey);
        }

        return $this->getNextTokenFromString($string, $previousToken, $cacheKey);
    }

    /**
     * Derives the cache key for the current position.
     *
     * @param string $string
     * @param int    $currentStringLength
     *
     * @return string The first $maxCacheKeySize bytes of the input, or '' when the
     *                remaining input is shorter than that (no caching).
     */
    protected function useTokenCache($string, $currentStringLength)
    {
        $cacheKey = '';

        if ($currentStringLength >= $this->maxCacheKeySize) {
            $cacheKey = \substr($string, 0, $this->maxCacheKeySize);
        }

        return $cacheKey;
    }

    /**
     * Returns the token stored under the given cache key.
     *
     * @param string $cacheKey Must exist in the cache; callers check this beforehand.
     *
     * @return mixed
     */
    protected function getNextTokenFromCache($cacheKey)
    {
        return $this->tokenCache[$cacheKey];
    }

    /**
     * Get the next token and the token type and store it in cache.
     *
     * The token is stored only when a cache key is available and the token
     * value is shorter than $maxCacheKeySize.
     *
     * @param string       $string
     * @param array|string $token    The previously read token ('' before the first one).
     * @param string       $cacheKey
     *
     * @return array
     */
    protected function getNextTokenFromString($string, $token, $cacheKey)
    {
        $token = $this->parseNextToken($string, $token);

        if ($cacheKey && $this->tokenValueLength($token) < $this->maxCacheKeySize) {
            $this->tokenCache[$cacheKey] = $token;
        }

        return $token;
    }

    /**
     * Return the next token and token type in a SQL string.
     * Quoted strings, comments, reserved words, whitespace, and punctuation are all their own tokens.
     *
     * Every parser helper is invoked in a fixed order with this instance; the
     * result is whatever ends up in $nextToken afterwards.
     *
     * @param string            $string   The SQL string
     * @param array|string|null $previous The result of the previous parseNextToken() call
     *
     * @return array The token, indexed by self::TOKEN_TYPE and self::TOKEN_VALUE.
     */
    protected function parseNextToken($string, $previous = null)
    {
        $matches = [];
        $this->nextToken = [];

        WhiteSpace::isWhiteSpace($this, $string, $matches);
        Comment::isComment($this, $string);
        Quoted::isQuoted($this, $string);
        UserDefined::isUserDefinedVariable($this, $string);
        Numeral::isNumeral($this, $string, $matches);
        Boundary::isBoundary($this, $string, $matches);
        Reserved::isReserved($this, $string, $previous);
        LiteralString::isFunction($this, $string, $matches);
        LiteralString::getNonReservedString($this, $string, $matches);

        return $this->nextToken;
    }

    /**
     * @return array The token currently held for the position being parsed.
     */
    public function getNextToken()
    {
        return $this->nextToken;
    }

    /**
     * @param array $nextToken
     *
     * @return $this
     */
    public function setNextToken($nextToken)
    {
        $this->nextToken = $nextToken;

        return $this;
    }

    /**
     * @return string
     */
    public function getRegexBoundaries()
    {
        return $this->regexBoundaries;
    }

    /**
     * @return string
     */
    public function getRegexFunction()
    {
        return $this->regexFunction;
    }

    /**
     * @return string
     */
    public function getRegexReserved()
    {
        return $this->regexReserved;
    }

    /**
     * @return string
     */
    public function getRegexReservedNewLine()
    {
        return $this->regexReservedNewLine;
    }

    /**
     * @return string
     */
    public function getRegexReservedTopLevel()
    {
        return $this->regexReservedTopLevel;
    }

    /**
     * Helper function for building regular expressions for reserved words and boundary characters.
     *
     * @param string $string
     *
     * @return string The string with regex metacharacters and the "/" delimiter escaped.
     */
    protected function quoteRegex($string)
    {
        return \preg_quote($string, '/');
    }

    /**
     * Byte length of a token's value, or 0 when the token carries no value.
     *
     * Guards against the parser chain not producing a token (an empty array),
     * which would otherwise trigger an undefined-index error.
     *
     * @param mixed $token
     *
     * @return int
     */
    private function tokenValueLength($token)
    {
        return isset($token[self::TOKEN_VALUE]) ? \strlen($token[self::TOKEN_VALUE]) : 0;
    }
}
|