Skip to content

Commit

Permalink
Revert "Match using non-capturing groups only"
Browse files Browse the repository at this point in the history
  • Loading branch information
mvorisek committed Sep 5, 2024
1 parent 82e0827 commit 9ed8bdb
Showing 1 changed file with 12 additions and 58 deletions.
70 changes: 12 additions & 58 deletions src/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,15 @@

namespace Doctrine\SqlFormatter;

use function array_flip;
use function array_key_last;
use function array_keys;
use function array_map;
use function array_pop;
use function assert;
use function count;
use function implode;
use function is_int;
use function is_string;
use function preg_match;
use function preg_quote;
use function preg_replace;
use function preg_replace_callback;
use function reset;
use function str_replace;
use function str_starts_with;
Expand Down Expand Up @@ -726,12 +722,8 @@ final class Tokenizer
'YEARWEEK',
];

/**
* Regular expression for tokenizing.
*
* @var array{string, array<int, string>}
*/
private readonly array $tokenizeRegex;
/** Regular expression for tokenizing. */
private readonly string $tokenizeRegex;

/**
* Punctuation that can be used as a boundary between other tokens
Expand Down Expand Up @@ -767,8 +759,7 @@ final class Tokenizer
/**
* Prepares the tokenizing regular expression for this instance.
*
* NOTE(review): this listing carries two assignments to $this->tokenizeRegex, a property that is
* declared readonly. They look like the before and after sides of the revert rather than code
* meant to run together - confirm which one the target revision keeps.
*/
public function __construct()
{
// per-token-type patterns, combined into one pattern by makeTokenizeRegex()
$regexes = $this->makeTokenizeRegexes();
$regex = $this->makeTokenizeRegex($regexes);
// stores the [pattern, group index => group name] pair returned by removeNamedGroupsFromRegex()
$this->tokenizeRegex = $this->removeNamedGroupsFromRegex($regex);
// stores the combined pattern string as returned by makeTokenizeRegex()
$this->tokenizeRegex = $this->makeTokenizeRegex($regexes);
}

/**
Expand Down Expand Up @@ -897,47 +888,7 @@ private function makeTokenizeRegex(array $regexes): string
$parts[] = '(?<t_' . $type . '>' . $regex . ')';
}

return '~\G(?>' . implode('|', $parts) . ')~';
}

/**
* Rewrites a regex so that its named groups become plain numbered groups and reports which
* numeric index each former name corresponds to.
*
* Works around slow PCRE named groups - https://github.com/php/php-src/issues/14423.
*
* Revert/remove once PHP 8.3 support is dropped.
*
* @param string $regex Delimited pattern; its first character is treated as the delimiter and it
*                      must end with that delimiter followed only by word characters (modifiers).
*
* @return array{string, array<int, string>} The rewritten pattern, then a map from numeric group
*                                           index to the name that group carried in $regex.
*/
private function removeNamedGroupsFromRegex(string $regex): array
{
// make sure the original regex cannot be satisfied and append one unnamed capturing group
// to discover all named capturing groups
//
// The probe keeps the delimiter ($1) and the modifiers ($3), wraps the original body ($2) behind
// "(?=1)0" - the next character would have to be both "1" and "0" - and adds "()" as a second
// alternative, so matching against '' succeeds through that trailing empty group.
$regexForAnalysis = preg_replace('~^(.)(.*)(\1\w*$)~s', '$1(?:(?=1)0$2)|()$3', $regex);
assert($regexForAnalysis !== null);
preg_match($regexForAnalysis, '', $matches);
// at least the whole match plus the appended group are expected
assert(count($matches) > 1);

// Relies on preg_match listing a named group under its name key directly before its numeric
// key: whenever the previous key was a string, the current key is that group's index.
$regexIndexToName = [];
$prevK = null;
foreach (array_keys($matches) as $k) {
if (is_string($prevK)) {
$regexIndexToName[$k] = $prevK;
}

$prevK = $k;
}

// remap named groups to unnamed
// $namedGroupsRegex is an alternation of every discovered group name, quoted for the "~" delimiter
$namedGroupsRegex = '(?:' . implode('|', array_map(static fn ($v) => preg_quote($v, '~'), $regexIndexToName)) . ')';
// drop the "?<name>" that directly follows "(", which leaves an ordinary capturing group
$regexWithoutNamedGroups = preg_replace('~(?<=\()\?<' . $namedGroupsRegex . '>~', '', $regex);
assert($regexWithoutNamedGroups !== null);
// inside "(?&name)" replace "&name" with the numeric index of that group (name => index via array_flip)
$regexWithoutNamedGroups = preg_replace_callback(
'~(?<=\(\?)&' . $namedGroupsRegex . '(?=\))~',
static fn ($matches) => array_flip($regexIndexToName)[substr($matches[0], 1)],
$regexWithoutNamedGroups,
);
assert($regexWithoutNamedGroups !== null);

return [$regexWithoutNamedGroups, $regexIndexToName];
return '~\G(?:' . implode('|', $parts) . ')~';
}

/**
Expand All @@ -956,14 +907,17 @@ public function tokenize(string $string): Cursor

while ($offset < strlen($string)) {
// Get the next token and the token type
preg_match($tokenizeRegex[0], $upper, $matches, 0, $offset);
preg_match($tokenizeRegex, $upper, $matches, 0, $offset);
assert(($matches[0] ?? '') !== '');

$lastMatchesNamedGroup = $tokenizeRegex[1][array_key_last($matches)];
assert(str_starts_with($lastMatchesNamedGroup, 't_'));
while (is_int($lastMatchesKey = array_key_last($matches))) {
array_pop($matches);
}

assert(str_starts_with($lastMatchesKey, 't_'));

/** @var Token::TOKEN_TYPE_* $tokenType */
$tokenType = (int) substr($lastMatchesNamedGroup, 2);
$tokenType = (int) substr($lastMatchesKey, 2);

$token = new Token($tokenType, substr($string, $offset, strlen($matches[0])));

Expand Down

0 comments on commit 9ed8bdb

Please sign in to comment.