Skip to content

Commit

Permalink
Merge pull request #135 from Masterminds/tokenizer-performance
Browse files (browse the repository at this point in the history)
Tokenizer performance
  • Loading branch information
goetas authored Sep 1, 2017
2 parents e965886 + 5dca3fc commit b8afbae
Show file tree
Hide file tree
Showing 4 changed files with 6,469 additions and 29 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ script:
after_script:
- bash -c 'if [ "$TRAVIS_PHP_VERSION" == "5.6" ] ; then wget https://scrutinizer-ci.com/ocular.phar; fi;'
- bash -c 'if [ "$TRAVIS_PHP_VERSION" == "5.6" ] ; then php ocular.phar code-coverage:upload --format=php-clover coverage.xml; fi;'
- php test/benchmark/run.php 50
65 changes: 36 additions & 29 deletions src/HTML5/Parser/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,8 @@ public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HT
*/
public function parse()
{
    // Main tokenizer loop: consume one chunk of data per iteration and keep
    // going for as long as $this->carryOn stays truthy.
    //
    // The scanner position used to be read into a local on every pass, but
    // that value was never used, so the dead assignment (and the extra
    // scanner call it cost per iteration) has been removed.
    do {
        $this->consumeData();

        // FIXME: Add infinite loop protection.
    } while ($this->carryOn);
}
Expand Down Expand Up @@ -145,7 +142,8 @@ protected function consumeData()
*/
protected function characterData()
{
if ($this->scanner->current() === false) {
$tok = $this->scanner->current();
if ($tok === false) {
return false;
}
switch ($this->textMode) {
Expand All @@ -154,7 +152,6 @@ protected function characterData()
case Elements::TEXT_RCDATA:
return $this->rcdata();
default:
$tok = $this->scanner->current();
if (strspn($tok, "<&")) {
return false;
}
Expand Down Expand Up @@ -408,24 +405,26 @@ protected function isTagEnd(&$selfClose)
if ($tok == '/') {
$this->scanner->next();
$this->scanner->whitespace();
if ($this->scanner->current() == '>') {
$tok = $this->scanner->current();

if ($tok == '>') {
$selfClose = true;
return true;
}
if ($this->scanner->current() === false) {
if ($tok === false) {
$this->parseError("Unexpected EOF inside of tag.");
return true;
}
// Basically, we skip the / token and go on.
// See 8.2.4.43.
$this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current());
$this->parseError("Unexpected '%s' inside of a tag.", $tok);
return false;
}

if ($this->scanner->current() == '>') {
if ($tok == '>') {
return true;
}
if ($this->scanner->current() === false) {
if ($tok === false) {
$this->parseError("Unexpected EOF inside of tag.");
return true;
}
Expand Down Expand Up @@ -541,15 +540,21 @@ protected function quotedAttributeValue($quote)
{
$stoplist = "\f" . $quote;
$val = '';
$tok = $this->scanner->current();
while (strspn($tok, $stoplist) == 0 && $tok !== false) {
if ($tok == '&') {
$val .= $this->decodeCharacterReference(true);
$tok = $this->scanner->current();

while (true) {
$tokens = $this->scanner->charsUntil($stoplist.'&');
if ($tokens !== false) {
$val .= $tokens;
} else {
$val .= $tok;
$tok = $this->scanner->next();
break;
}

$tok = $this->scanner->current();
if ($tok == '&') {
$val .= $this->decodeCharacterReference(true, $tok);
continue;
}
break;
}
$this->scanner->next();
return $val;
Expand Down Expand Up @@ -591,18 +596,18 @@ protected function unquotedAttributeValue()
*/
protected function bogusComment($leading = '')
{

// TODO: This can be done more efficiently when the
// scanner exposes a readUntil() method.
$comment = $leading;
$tokens = $this->scanner->charsUntil('>');
if ($tokens !== false) {
$comment .= $tokens;
}
$tok = $this->scanner->current();
do {
if ($tok !== false) {
$comment .= $tok;
$tok = $this->scanner->next();
} while ($tok !== false && $tok != '>');
}

$this->flushBuffer();
$this->events->comment($comment . $tok);
$this->events->comment($comment);
$this->scanner->next();

return true;
Expand Down Expand Up @@ -646,15 +651,17 @@ protected function comment()
*/
protected function isCommentEnd()
{
$tok = $this->scanner->current();

// EOF
if ($this->scanner->current() === false) {
if ($tok === false) {
// Hit the end.
$this->parseError("Unexpected EOF in a comment.");
return true;
}

// If it doesn't start with -, not the end.
if ($this->scanner->current() != '-') {
if ($tok != '-') {
return false;
}

Expand Down Expand Up @@ -737,7 +744,6 @@ protected function doctype()

$pub = strtoupper($this->scanner->getAsciiAlpha());
$white = strlen($this->scanner->whitespace());
$tok = $this->scanner->current();

// Get ID, and flag it as pub or system.
if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
Expand Down Expand Up @@ -938,10 +944,11 @@ protected function sequenceMatches($sequence, $caseSensitive = true)
$len = strlen($sequence);
$buffer = '';
for ($i = 0; $i < $len; ++ $i) {
$buffer .= $this->scanner->current();
$tok = $this->scanner->current();
$buffer .= $tok;

// EOF. Rewind and let the caller handle it.
if ($this->scanner->current() === false) {
if ($tok === false) {
$this->scanner->unconsume($i);
return false;
}
Expand Down
Loading

0 comments on commit b8afbae

Please sign in to comment.