Skip to content

Commit

Permalink
Merge pull request #561 from MoonE/utfstring-character-array
Browse files Browse the repository at this point in the history
Use character array for utfstring
  • Loading branch information
MauricioFauth authored Aug 26, 2024
2 parents f2bafbb + 51225c5 commit 6bbea5b
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 97 deletions.
11 changes: 6 additions & 5 deletions psalm-baseline.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1107,11 +1107,6 @@
<code><![CDATA[isset(self::$translator)]]></code>
</RedundantPropertyInitializationCheck>
</file>
<file src="src/UtfString.php">
<PossiblyUnusedProperty>
<code><![CDATA[$byteLen]]></code>
</PossiblyUnusedProperty>
</file>
<file src="src/Utils/BufferedQuery.php">
<PossiblyNullOperand>
<code><![CDATA[$this->status]]></code>
Expand Down Expand Up @@ -1587,6 +1582,12 @@
* }]]></code>
</InvalidReturnType>
</file>
<file src="tests/UtfStringSerializer.php">
<PossiblyUnusedMethod>
<code><![CDATA[serialize]]></code>
<code><![CDATA[unserialize]]></code>
</PossiblyUnusedMethod>
</file>
<file src="tests/Utils/BufferedQueryTest.php">
<PossiblyUnusedMethod>
<code><![CDATA[extractProvider]]></code>
Expand Down
6 changes: 5 additions & 1 deletion src/Tools/TestGenerator.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
use PhpMyAdmin\SqlParser\Exceptions\ParserException;
use PhpMyAdmin\SqlParser\Lexer;
use PhpMyAdmin\SqlParser\Parser;
use PhpMyAdmin\SqlParser\Tests\UtfStringSerializer;
use PhpMyAdmin\SqlParser\Token;
use PhpMyAdmin\SqlParser\UtfString;

use function dirname;
use function file_exists;
Expand Down Expand Up @@ -168,7 +170,9 @@ public static function build(

// unset mode, reset to default every time, to be sure
Context::setMode();
$serializer = new CustomJsonSerializer();
$serializer = new CustomJsonSerializer(null, [
UtfString::class => new UtfStringSerializer(),
]);
// Writing test's data.
$encoded = $serializer->serialize($test);

Expand Down
83 changes: 13 additions & 70 deletions src/UtfString.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,10 @@
use Exception;
use Stringable;

use function count;
use function implode;
use function mb_check_encoding;
use function mb_strlen;
use function mb_substr;
use function ord;
use function strlen;
use function substr;
use function mb_str_split;

/**
* Implementation for UTF-8 strings.
Expand All @@ -32,44 +30,19 @@
class UtfString implements ArrayAccess, Stringable
{
/**
* The raw, multi-byte string.
*/
public string $str = '';

/**
* The index of current byte.
*
* For ASCII strings, the byte index is equal to the character index.
*/
public int $byteIdx = 0;

/**
* The index of current character.
* The multi-byte characters.
*
* For non-ASCII strings, some characters occupy more than one byte and
* the character index will have a lower value than the byte index.
*/
public int $charIdx = 0;

/**
* The length of the string (in bytes).
* @var list<string>
*/
public int $byteLen = 0;

/**
* The length of the string (in characters).
*/
public int $charLen = 0;
public array $characters;

/** @param string $str the string */
public function __construct(string $str)
{
$this->str = $str;
$this->byteLen = mb_strlen($str, '8bit');
if (! mb_check_encoding($str, 'UTF-8')) {
$this->charLen = 0;
if (mb_check_encoding($str, 'UTF-8')) {
$this->characters = mb_str_split($str, 1, 'UTF-8');
} else {
$this->charLen = mb_strlen($str, 'UTF-8');
$this->characters = [];
}
}

Expand All @@ -80,7 +53,7 @@ public function __construct(string $str)
*/
public function offsetExists(mixed $offset): bool
{
return ($offset >= 0) && ($offset < $this->charLen);
return $offset >= 0 && $offset < count($this->characters);
}

/**
Expand All @@ -90,37 +63,7 @@ public function offsetExists(mixed $offset): bool
*/
public function offsetGet(mixed $offset): string|null
{
// This function moves the internal byte and character pointer to the requested offset.
// This function is part of hot code so the aim is to do the following
// operations as efficiently as possible.
// UTF-8 character encoding is a variable length encoding that encodes Unicode
// characters in 1-4 bytes. Thus we fetch 4 bytes from the current offset and then use mb_substr
// to get the first UTF-8 character in it. We then use strlen to get the character's size in bytes.
if (($offset < 0) || ($offset >= $this->charLen)) {
return null;
}

$delta = $offset - $this->charIdx;

if ($delta > 0) {
// Fast forwarding.
$this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta));
$this->charIdx += $delta;
} elseif ($delta < 0) {
// Rewinding.
while ($delta++ < 0) {
// We rewind byte by byte and only count characters that are not continuation bytes,
// i.e. ASCII characters and first octets of multibyte characters
do {
$byte = ord($this->str[--$this->byteIdx]);
} while (($byte >= 128) && ($byte < 192));

--$this->charIdx;
}
}

// Fetch the first Unicode character within the next 4 bytes in the string.
return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1);
return $this->characters[$offset] ?? null;
}

/**
Expand Down Expand Up @@ -153,14 +96,14 @@ public function offsetUnset(mixed $offset): void
*/
public function length(): int
{
return $this->charLen;
return count($this->characters);
}

/**
* Returns the contained string.
*/
public function __toString(): string
{
return $this->str;
return implode('', $this->characters);
}
}
5 changes: 4 additions & 1 deletion tests/TestCase.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
use PhpMyAdmin\SqlParser\Token;
use PhpMyAdmin\SqlParser\TokensList;
use PhpMyAdmin\SqlParser\Tools\CustomJsonSerializer;
use PhpMyAdmin\SqlParser\UtfString;
use PHPUnit\Framework\TestCase as BaseTestCase;

use function file_get_contents;
Expand Down Expand Up @@ -95,7 +96,9 @@ public function getData(string $name): array
$serializedData = file_get_contents('tests/data/' . $name . '.out');
$this->assertIsString($serializedData);

$serializer = new CustomJsonSerializer();
$serializer = new CustomJsonSerializer(null, [
UtfString::class => new UtfStringSerializer(),
]);
$data = $serializer->unserialize($serializedData);

$this->assertIsArray($data);
Expand Down
28 changes: 28 additions & 0 deletions tests/UtfStringSerializer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?php

declare(strict_types=1);

namespace PhpMyAdmin\SqlParser\Tests;

use PhpMyAdmin\SqlParser\UtfString;

class UtfStringSerializer
{
    /**
     * Turns a UtfString into a plain array holding its text under the "str" key.
     *
     * @return array<string,string>
     * @psalm-return array{str: string}
     */
    public function serialize(UtfString $str): array
    {
        $payload = [];
        $payload['str'] = (string) $str;

        return $payload;
    }

    /**
     * Builds a UtfString from the text stored under the "str" key of the given array.
     *
     * @param array<string,string> $data
     * @psalm-param array{str: string} $data
     */
    public function unserialize(array $data): UtfString
    {
        $text = $data['str'];

        return new UtfString($text);
    }
}
6 changes: 1 addition & 5 deletions tests/data/lexer/lexUtf8.out
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
"errors": [],
"str": {
"@type": "PhpMyAdmin\\SqlParser\\UtfString",
"str": "select * from école",
"byteIdx": 19,
"charIdx": 18,
"byteLen": 20,
"charLen": 19
"str": "select * from école"
},
"len": 19,
"last": 19,
Expand Down
6 changes: 1 addition & 5 deletions tests/data/parser/parseCreateProcedure3.out
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
"errors": [],
"str": {
"@type": "PhpMyAdmin\\SqlParser\\UtfString",
"str": "DELIMITER $$\nCREATE DEFINER=`user`@`localhost` PROCEDURE `multiDBqueryRun_V1`(IN `query` TEXT, IN `table_name_var` VARCHAR(255), IN `columns_used_var` TEXT, IN `where_text_var` TEXT, IN `separator_value_var` VARCHAR(255)) COMMENT 'Query: SingleDB → MultiDB (All DBs) + run it' NOT DETERMINISTIC MODIFIES SQL DATA SQL SECURITY INVOKER BEGIN\nSET @TABLE_NAME = table_name_var;\nSET @WHERE_TEXT = where_text_var;\nSET @COLUMNS_USED = columns_used_var;\nSET @MULTIDB_QUERY = CONCAT('SELECT \"$MULTIDB\" FROM `$MULTIDB`.', @TABLE_NAME, @WHERE_TEXT);\n\n-- EXECUTION --\nCREATE TEMPORARY TABLE `MULTIDB_TEMP_DB_TBL_COLS` AS\nSELECT * FROM (\n SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME\n FROM INFORMATION_SCHEMA.COLUMNS\n WHERE\n TABLE_SCHEMA NOT IN('mysql', 'test', 'tmp', 'information_schema', 'sys', 'performance_schema') AND\n TABLE_NAME = @TABLE_NAME AND\n FIND_IN_SET(COLUMN_NAME, @COLUMNS_USED)\n) tbl\nGROUP BY\n TABLE_SCHEMA,\n TABLE_NAME;\n\nSELECT GROUP_CONCAT(REPLACE(@MULTIDB_QUERY, '$MULTIDB', CONCAT('', TABLE_SCHEMA, '')) SEPARATOR \"\\nUNION ALL\\n\")\nINTO @stmt_sql\nFROM `MULTIDB_TEMP_DB_TBL_COLS`;\n\nPREPARE stmt FROM @stmt_sql;\nEXECUTE stmt;\nDEALLOCATE PREPARE stmt;\nEND",
"byteIdx": 1174,
"charIdx": 1172,
"byteLen": 1175,
"charLen": 1173
"str": "DELIMITER $$\nCREATE DEFINER=`user`@`localhost` PROCEDURE `multiDBqueryRun_V1`(IN `query` TEXT, IN `table_name_var` VARCHAR(255), IN `columns_used_var` TEXT, IN `where_text_var` TEXT, IN `separator_value_var` VARCHAR(255)) COMMENT 'Query: SingleDB → MultiDB (All DBs) + run it' NOT DETERMINISTIC MODIFIES SQL DATA SQL SECURITY INVOKER BEGIN\nSET @TABLE_NAME = table_name_var;\nSET @WHERE_TEXT = where_text_var;\nSET @COLUMNS_USED = columns_used_var;\nSET @MULTIDB_QUERY = CONCAT('SELECT \"$MULTIDB\" FROM `$MULTIDB`.', @TABLE_NAME, @WHERE_TEXT);\n\n-- EXECUTION --\nCREATE TEMPORARY TABLE `MULTIDB_TEMP_DB_TBL_COLS` AS\nSELECT * FROM (\n SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME\n FROM INFORMATION_SCHEMA.COLUMNS\n WHERE\n TABLE_SCHEMA NOT IN('mysql', 'test', 'tmp', 'information_schema', 'sys', 'performance_schema') AND\n TABLE_NAME = @TABLE_NAME AND\n FIND_IN_SET(COLUMN_NAME, @COLUMNS_USED)\n) tbl\nGROUP BY\n TABLE_SCHEMA,\n TABLE_NAME;\n\nSELECT GROUP_CONCAT(REPLACE(@MULTIDB_QUERY, '$MULTIDB', CONCAT('', TABLE_SCHEMA, '')) SEPARATOR \"\\nUNION ALL\\n\")\nINTO @stmt_sql\nFROM `MULTIDB_TEMP_DB_TBL_COLS`;\n\nPREPARE stmt FROM @stmt_sql;\nEXECUTE stmt;\nDEALLOCATE PREPARE stmt;\nEND"
},
"len": 1173,
"last": 1173,
Expand Down
6 changes: 1 addition & 5 deletions tests/data/parser/parseCreateProcedure4.out
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,7 @@
"errors": [],
"str": {
"@type": "PhpMyAdmin\\SqlParser\\UtfString",
"str": "DELIMITER $$\nCREATE DEFINER=`user`@`localhost` PROCEDURE `multiDBqueryRun_V12`(IN `query` TEXT, IN `table_name_var` VARCHAR(255), IN `columns_used_var` TEXT, IN `where_text_var` TEXT, IN `separator_value_var` VARCHAR(255)) COMMENT 'Query: SingleDB → MultiDB (All DBs) + run it' NOT DETERMINISTIC MODIFIES SQL DATA SQL SECURITY INVOKER BEGIN\nSET @TABLE_NAME = table_name_var;\nSET @WHERE_TEXT = where_text_var;\nSET @COLUMNS_USED = columns_used_var;\nSET @MULTIDB_QUERY = CONCAT('SELECT \"$MULTIDB\" FROM `$MULTIDB`.', @TABLE_NAME, @WHERE_TEXT);\n\n-- EXECUTION --\nCREATE TEMPORARY TABLE `MULTIDB_TEMP_DB_TBL_COLS` AS\nSELECT * FROM (\n SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME\n FROM INFORMATION_SCHEMA.COLUMNS\n WHERE\n TABLE_SCHEMA NOT IN('mysql', 'test', 'tmp', 'information_schema', 'sys', 'performance_schema') AND\n TABLE_NAME = @TABLE_NAME AND\n FIND_IN_SET(COLUMN_NAME, @COLUMNS_USED)\n) tbl\nGROUP BY\n TABLE_SCHEMA,\n TABLE_NAME;\n\nSELECT GROUP_CONCAT(REPLACE(@MULTIDB_QUERY, '$MULTIDB', CONCAT('', TABLE_SCHEMA, '')) SEPARATOR \"\\nUNION ALL\\n\")\nINTO @stmt_sql\nFROM `MULTIDB_TEMP_DB_TBL_COLS`;\n\nPREPARE stmt FROM @stmt_sql;\nEXECUTE stmt;\nDEALLOCATE PREPARE stmt;\nEND$$\n\n--\n-- Functions\n--\nDELIMITER $$\nCREATE DEFINER=`root`@`localhost` FUNCTION `attrParentShiftIds` (`parent_id` TEXT, `option_id_shift` INT, `option_value_id_shift` INT) RETURNS TEXT CHARSET utf8mb4 COLLATE utf8mb4_unicode_520_ci DETERMINISTIC READS SQL DATA SQL SECURITY INVOKER BEGIN\n DECLARE i INT UNSIGNED DEFAULT 0;\n DECLARE pair_count INT UNSIGNED;\n DECLARE result TEXT DEFAULT '';\n\n DECLARE pair VARCHAR(255) DEFAULT '';\n DECLARE oid INT DEFAULT '';\n DECLARE vid INT DEFAULT '';\n\n SET pair_count = substrCount(parent_id, ',') + 1;\n\n WHILE i < pair_count DO\n SET result = CONCAT(result, IF(i <= 0, '', ','));\n\n SET pair = split(parent_id, ',', i + 1);\n\n SET oid = split(pair, '-', 1) + option_id_shift;\n SET vid = split(pair, '-', 2) + option_value_id_shift;\n\n SET 
pair = CONCAT(oid, '-', vid);\n SET result = CONCAT(result, pair);\n\n SET i = i + 1;\n END WHILE;\n\n RETURN result;\nEND$$\n\nDELIMITER $$\nCREATE DEFINER=`user`@`localhost` FUNCTION `split` (`string` TEXT, `delim` TEXT, `n` INT) RETURNS TEXT CHARSET utf8mb4 COLLATE utf8mb4_unicode_520_ci DETERMINISTIC SQL SECURITY INVOKER RETURN IF(\n (LENGTH(string) - LENGTH(REPLACE(string, delim, ''))) / LENGTH(delim) < n - 1,\n NULL,\n SUBSTRING_INDEX(SUBSTRING_INDEX(string, delim, n), delim, -1)\n)$$\n\nDELIMITER $$\nCREATE DEFINER=`root`@`localhost` FUNCTION `substrCount` (`s` VARCHAR(255), `ss` VARCHAR(255)) RETURNS TINYINT(3) UNSIGNED DETERMINISTIC READS SQL DATA SQL SECURITY INVOKER BEGIN\nDECLARE COUNT TINYINT(3) UNSIGNED;\nDECLARE OFFSET_I TINYINT(3) UNSIGNED;\nDECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET s = NULL;\n\nSET COUNT = 0;\nSET OFFSET_I = 1;\n\nREPEAT\nIF NOT ISNULL(s) AND OFFSET_I > 0 THEN\nSET OFFSET_I = LOCATE(ss, s, OFFSET_I);\nIF OFFSET_I > 0 THEN\nSET COUNT = COUNT + 1;\nSET OFFSET_I = OFFSET_I + 1;\nEND IF;\nEND IF;\nUNTIL ISNULL(s) OR OFFSET_I = 0 END REPEAT;\n\nRETURN COUNT;\nEND$$\n\nDELIMITER ;\n\n",
"byteIdx": 3084,
"charIdx": 3082,
"byteLen": 3085,
"charLen": 3083
"str": "DELIMITER $$\nCREATE DEFINER=`user`@`localhost` PROCEDURE `multiDBqueryRun_V12`(IN `query` TEXT, IN `table_name_var` VARCHAR(255), IN `columns_used_var` TEXT, IN `where_text_var` TEXT, IN `separator_value_var` VARCHAR(255)) COMMENT 'Query: SingleDB → MultiDB (All DBs) + run it' NOT DETERMINISTIC MODIFIES SQL DATA SQL SECURITY INVOKER BEGIN\nSET @TABLE_NAME = table_name_var;\nSET @WHERE_TEXT = where_text_var;\nSET @COLUMNS_USED = columns_used_var;\nSET @MULTIDB_QUERY = CONCAT('SELECT \"$MULTIDB\" FROM `$MULTIDB`.', @TABLE_NAME, @WHERE_TEXT);\n\n-- EXECUTION --\nCREATE TEMPORARY TABLE `MULTIDB_TEMP_DB_TBL_COLS` AS\nSELECT * FROM (\n SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME\n FROM INFORMATION_SCHEMA.COLUMNS\n WHERE\n TABLE_SCHEMA NOT IN('mysql', 'test', 'tmp', 'information_schema', 'sys', 'performance_schema') AND\n TABLE_NAME = @TABLE_NAME AND\n FIND_IN_SET(COLUMN_NAME, @COLUMNS_USED)\n) tbl\nGROUP BY\n TABLE_SCHEMA,\n TABLE_NAME;\n\nSELECT GROUP_CONCAT(REPLACE(@MULTIDB_QUERY, '$MULTIDB', CONCAT('', TABLE_SCHEMA, '')) SEPARATOR \"\\nUNION ALL\\n\")\nINTO @stmt_sql\nFROM `MULTIDB_TEMP_DB_TBL_COLS`;\n\nPREPARE stmt FROM @stmt_sql;\nEXECUTE stmt;\nDEALLOCATE PREPARE stmt;\nEND$$\n\n--\n-- Functions\n--\nDELIMITER $$\nCREATE DEFINER=`root`@`localhost` FUNCTION `attrParentShiftIds` (`parent_id` TEXT, `option_id_shift` INT, `option_value_id_shift` INT) RETURNS TEXT CHARSET utf8mb4 COLLATE utf8mb4_unicode_520_ci DETERMINISTIC READS SQL DATA SQL SECURITY INVOKER BEGIN\n DECLARE i INT UNSIGNED DEFAULT 0;\n DECLARE pair_count INT UNSIGNED;\n DECLARE result TEXT DEFAULT '';\n\n DECLARE pair VARCHAR(255) DEFAULT '';\n DECLARE oid INT DEFAULT '';\n DECLARE vid INT DEFAULT '';\n\n SET pair_count = substrCount(parent_id, ',') + 1;\n\n WHILE i < pair_count DO\n SET result = CONCAT(result, IF(i <= 0, '', ','));\n\n SET pair = split(parent_id, ',', i + 1);\n\n SET oid = split(pair, '-', 1) + option_id_shift;\n SET vid = split(pair, '-', 2) + option_value_id_shift;\n\n SET 
pair = CONCAT(oid, '-', vid);\n SET result = CONCAT(result, pair);\n\n SET i = i + 1;\n END WHILE;\n\n RETURN result;\nEND$$\n\nDELIMITER $$\nCREATE DEFINER=`user`@`localhost` FUNCTION `split` (`string` TEXT, `delim` TEXT, `n` INT) RETURNS TEXT CHARSET utf8mb4 COLLATE utf8mb4_unicode_520_ci DETERMINISTIC SQL SECURITY INVOKER RETURN IF(\n (LENGTH(string) - LENGTH(REPLACE(string, delim, ''))) / LENGTH(delim) < n - 1,\n NULL,\n SUBSTRING_INDEX(SUBSTRING_INDEX(string, delim, n), delim, -1)\n)$$\n\nDELIMITER $$\nCREATE DEFINER=`root`@`localhost` FUNCTION `substrCount` (`s` VARCHAR(255), `ss` VARCHAR(255)) RETURNS TINYINT(3) UNSIGNED DETERMINISTIC READS SQL DATA SQL SECURITY INVOKER BEGIN\nDECLARE COUNT TINYINT(3) UNSIGNED;\nDECLARE OFFSET_I TINYINT(3) UNSIGNED;\nDECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET s = NULL;\n\nSET COUNT = 0;\nSET OFFSET_I = 1;\n\nREPEAT\nIF NOT ISNULL(s) AND OFFSET_I > 0 THEN\nSET OFFSET_I = LOCATE(ss, s, OFFSET_I);\nIF OFFSET_I > 0 THEN\nSET COUNT = COUNT + 1;\nSET OFFSET_I = OFFSET_I + 1;\nEND IF;\nEND IF;\nUNTIL ISNULL(s) OR OFFSET_I = 0 END REPEAT;\n\nRETURN COUNT;\nEND$$\n\nDELIMITER ;\n\n"
},
"len": 3083,
"last": 3083,
Expand Down
Loading

0 comments on commit 6bbea5b

Please sign in to comment.