From 4b89421f30298bfe9947ae3d406267609d7e2779 Mon Sep 17 00:00:00 2001 From: Timm Friebe Date: Sun, 26 Jan 2025 11:45:09 +0100 Subject: [PATCH 1/7] Add support for heredoc (and its nowdoc variant) --- src/main/php/lang/ast/Tokens.class.php | 19 +++++++++++++++++++ .../ast/unittest/parse/LiteralsTest.class.php | 13 +++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/main/php/lang/ast/Tokens.class.php b/src/main/php/lang/ast/Tokens.class.php index bc4303a..5d76465 100755 --- a/src/main/php/lang/ast/Tokens.class.php +++ b/src/main/php/lang/ast/Tokens.class.php @@ -179,6 +179,25 @@ public function iterator($language) { continue; } null === $t || $offset-= strlen($t); + } else if ('<' === $token) { + $t= $next(self::DELIMITERS); + if ('<' === $t) { + $t= $next(self::DELIMITERS); + if ('<' === $t) { + $label= $next("\r\n"); + $end= trim($label, '"\''); + $string= "<<<{$label}"; + do { + $token= $next("\r\n"); + if ("\n" === $token) $line++; + } while (strncmp($end, $token, strlen($end)) && $string.= $token); + $string.= $end; + yield new Token($language->symbol('(literal)'), 'heredoc', $string, $line); + $offset--; + continue; + } + } + $offset-= 2; } // Handle combined operators. First, ensure we have enough bytes in our buffer diff --git a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php index 1eb9273..fff5c5d 100755 --- a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php +++ b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php @@ -86,4 +86,17 @@ public function dangling_comma_in_key_value_map($declaration) { $pair= [new Literal('"key"', self::LINE), new Literal('"value"', self::LINE)]; $this->assertParsed([new ArrayLiteral([$pair], self::LINE)], $declaration); } + + #[Test, Values(['EOD', '"EOD"', "'EOD'"])] + public function heredoc($label) { + $nowdoc= ( + "<<<{$label}\n". + "Line 1\n". + "Line 2\n". + "\n". + "Line 4\n". 
+ "EOD" + ); + $this->assertParsed([new Literal($nowdoc, self::LINE + 5)], $nowdoc.';'); + } } \ No newline at end of file From 34481d60fa3fb606beac5d6f26b3a97e25a5fba7 Mon Sep 17 00:00:00 2001 From: Timm Friebe Date: Sun, 26 Jan 2025 11:53:14 +0100 Subject: [PATCH 2/7] Fix generic syntax --- src/main/php/lang/ast/Tokens.class.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/php/lang/ast/Tokens.class.php b/src/main/php/lang/ast/Tokens.class.php index 5d76465..944f6a0 100755 --- a/src/main/php/lang/ast/Tokens.class.php +++ b/src/main/php/lang/ast/Tokens.class.php @@ -182,8 +182,8 @@ public function iterator($language) { } else if ('<' === $token) { $t= $next(self::DELIMITERS); if ('<' === $t) { - $t= $next(self::DELIMITERS); - if ('<' === $t) { + $n= $next(self::DELIMITERS); + if ('<' === $n) { $label= $next("\r\n"); $end= trim($label, '"\''); $string= "<<<{$label}"; @@ -196,8 +196,9 @@ public function iterator($language) { $offset--; continue; } + $offset-= strlen($n); } - $offset-= 2; + $offset-= strlen($t); } // Handle combined operators. 
First, ensure we have enough bytes in our buffer From a4bf34665e8f3134dc2e28190de44f43b08ad259 Mon Sep 17 00:00:00 2001 From: Timm Friebe Date: Sun, 26 Jan 2025 12:05:06 +0100 Subject: [PATCH 3/7] Implement indentation in heredoc --- src/main/php/lang/ast/Tokens.class.php | 18 ++++++++++++------ .../php/lang/ast/unittest/TokensTest.class.php | 5 +++++ .../ast/unittest/parse/LiteralsTest.class.php | 13 +++++++++++++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/main/php/lang/ast/Tokens.class.php b/src/main/php/lang/ast/Tokens.class.php index 944f6a0..89dcf21 100755 --- a/src/main/php/lang/ast/Tokens.class.php +++ b/src/main/php/lang/ast/Tokens.class.php @@ -186,15 +186,21 @@ public function iterator($language) { if ('<' === $n) { $label= $next("\r\n"); $end= trim($label, '"\''); + $l= strlen($end); $string= "<<<{$label}"; do { $token= $next("\r\n"); - if ("\n" === $token) $line++; - } while (strncmp($end, $token, strlen($end)) && $string.= $token); - $string.= $end; - yield new Token($language->symbol('(literal)'), 'heredoc', $string, $line); - $offset--; - continue; + if ("\n" === $token) { + $line++; + } else if (0 === substr_compare($token, $end, $p= strspn($token, ' '), $l)) { + $string.= substr($token, 0, $p + $l); + $offset-= strlen($token) - $p - $l; + yield new Token($language->symbol('(literal)'), 'heredoc', $string, $line); + continue 2; + } + $string.= $token; + } while (null !== $token); + throw new FormatException('Unclosed heredoc literal starting at line '.$line); } $offset-= strlen($n); } diff --git a/src/test/php/lang/ast/unittest/TokensTest.class.php b/src/test/php/lang/ast/unittest/TokensTest.class.php index 267b45b..ca403c7 100755 --- a/src/test/php/lang/ast/unittest/TokensTest.class.php +++ b/src/test/php/lang/ast/unittest/TokensTest.class.php @@ -43,6 +43,11 @@ public function unclosed_string_literals($input) { (new Tokens($input))->iterator($this->language)->current(); } + #[Test, Expect(class: FormatException::class, 
message: '/Unclosed heredoc literal/'), Values(['<<iterator($this->language)->current(); + } + #[Test, Values(['0', '1', '1_000_000_000'])] public function integer_literal($input) { $this->assertTokens([['integer' => str_replace('_', '', $input)]], new Tokens($input)); diff --git a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php index fff5c5d..2f5fa91 100755 --- a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php +++ b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php @@ -99,4 +99,17 @@ public function heredoc($label) { ); $this->assertParsed([new Literal($nowdoc, self::LINE + 5)], $nowdoc.';'); } + + #[Test] + public function heredoc_indentation() { + $nowdoc= ( + "<<assertParsed([new Literal($nowdoc, self::LINE + 5)], $nowdoc.';'); + } } \ No newline at end of file From 6a204ee44056224de4888c0a393a07204e93267d Mon Sep 17 00:00:00 2001 From: Timm Friebe Date: Sun, 26 Jan 2025 12:18:08 +0100 Subject: [PATCH 4/7] Make line number consistent with multiline strings --- src/main/php/lang/ast/Tokens.class.php | 26 +++++++++---------- .../ast/unittest/parse/LiteralsTest.class.php | 4 +-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/php/lang/ast/Tokens.class.php b/src/main/php/lang/ast/Tokens.class.php index 89dcf21..d63d47e 100755 --- a/src/main/php/lang/ast/Tokens.class.php +++ b/src/main/php/lang/ast/Tokens.class.php @@ -188,19 +188,19 @@ public function iterator($language) { $end= trim($label, '"\''); $l= strlen($end); $string= "<<<{$label}"; - do { - $token= $next("\r\n"); - if ("\n" === $token) { - $line++; - } else if (0 === substr_compare($token, $end, $p= strspn($token, ' '), $l)) { - $string.= substr($token, 0, $p + $l); - $offset-= strlen($token) - $p - $l; - yield new Token($language->symbol('(literal)'), 'heredoc', $string, $line); - continue 2; - } - $string.= $token; - } while (null !== $token); - throw new FormatException('Unclosed 
heredoc literal starting at line '.$line); + + heredoc: $token= $next("\r\n"); + if (0 === substr_compare($token, $end, $p= strspn($token, ' '), $l)) { + $p+= $l; + $offset-= strlen($token) - $p; + yield new Token($language->symbol('(literal)'), 'heredoc', $string.substr($token, 0, $p), $line); + $line+= substr_count($string, "\n"); + continue; + } else if (null === $token) { + throw new FormatException('Unclosed heredoc literal starting at line '.$line); + } + $string.= $token; + goto heredoc; } $offset-= strlen($n); } diff --git a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php index 2f5fa91..607d910 100755 --- a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php +++ b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php @@ -97,7 +97,7 @@ public function heredoc($label) { "Line 4\n". "EOD" ); - $this->assertParsed([new Literal($nowdoc, self::LINE + 5)], $nowdoc.';'); + $this->assertParsed([new Literal($nowdoc, self::LINE)], $nowdoc.';'); } #[Test] @@ -110,6 +110,6 @@ public function heredoc_indentation() { " Line 4\n". 
" EOD" ); - $this->assertParsed([new Literal($nowdoc, self::LINE + 5)], $nowdoc.';'); + $this->assertParsed([new Literal($nowdoc, self::LINE)], $nowdoc.';'); } } \ No newline at end of file From fc246faf772bdc004b484d48f8ce3dd33507cdfd Mon Sep 17 00:00:00 2001 From: Timm Friebe Date: Sun, 26 Jan 2025 12:21:57 +0100 Subject: [PATCH 5/7] Test line numbers --- .../ast/unittest/parse/LiteralsTest.class.php | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php index 607d910..cfc231d 100755 --- a/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php +++ b/src/test/php/lang/ast/unittest/parse/LiteralsTest.class.php @@ -112,4 +112,30 @@ public function heredoc_indentation() { ); $this->assertParsed([new Literal($nowdoc, self::LINE)], $nowdoc.';'); } + + #[Test] + public function line_number_after_multiline_string() { + $string= ( + "'\n". + " ...\n". + "'" + ); + $this->assertParsed( + [new Literal($string, self::LINE), new Literal('null', self::LINE + 3)], + $string.";\nnull;" + ); + } + + #[Test] + public function line_number_after_heredoc() { + $nowdoc= ( + "<<assertParsed( + [new Literal($nowdoc, self::LINE), new Literal('null', self::LINE + 3)], + $nowdoc.";\nnull;" + ); + } } \ No newline at end of file From 013abc8a8d0cb3473f3116b6a4ebefc9717e1385 Mon Sep 17 00:00:00 2001 From: Timm Friebe Date: Sun, 26 Jan 2025 13:25:57 +0100 Subject: [PATCH 6/7] Simplify heredoc implementation --- src/main/php/lang/ast/Tokens.class.php | 87 +++++++++++--------------- 1 file changed, 38 insertions(+), 49 deletions(-) diff --git a/src/main/php/lang/ast/Tokens.class.php b/src/main/php/lang/ast/Tokens.class.php index d63d47e..4152029 100755 --- a/src/main/php/lang/ast/Tokens.class.php +++ b/src/main/php/lang/ast/Tokens.class.php @@ -12,7 +12,7 @@ class Tokens { const DELIMITERS = " \r\n\t'\$\"`=,;.:?!(){}[]#+-*/|&^@%~<>"; const OPERATORS = [ 
- '<' => ['<=>', '<<=', '<=', '<<', '<>', ' ['<=>', '<<=', '<<<', '<=', '<<', '<>', '' => ['>>=', '>=', '>>'], '=' => ['===', '=>', '=='], '!' => ['!==', '!='], @@ -22,7 +22,7 @@ class Tokens { '+' => ['+=', '++'], '-' => ['-=', '--', '->'], '*' => ['**=', '*=', '**'], - '/' => ['/='], + '/' => ['/=', '//', '/*'], '~' => ['~='], '%' => ['%='], '?' => ['?->', '??=', '?:', '??'], @@ -106,10 +106,10 @@ public function iterator($language) { $end= '\\'.$token; do { $chunk= $next($end); - if (null === $chunk) { - throw new FormatException('Unclosed string literal starting at line '.$line); - } else if ('\\' === $chunk) { + if ('\\' === $chunk) { $string.= $chunk.$next($end); + } else if (null === $chunk) { + throw new FormatException('Unclosed string literal starting at line '.$line); } else { $string.= $chunk; } @@ -162,49 +162,6 @@ public function iterator($language) { goto number; } $offset-= strlen($t); - } else if ('/' === $token) { - $t= $next(self::DELIMITERS); - if ('/' === $t) { - yield new Token(null, 'comment', '//'.$next("\r\n"), $line); - continue; - } else if ('*' === $t) { - $comment= ''; - do { - $chunk= $next('/'); - $comment.= $chunk; - } while (null !== $chunk && '*' !== $chunk[strlen($chunk) - 1]); - $comment.= $next('/'); - yield new Token(null, '*' === $comment[0] ? 
'apidoc' : 'comment', '/*'.$comment, $line); - $line+= substr_count($comment, "\n"); - continue; - } - null === $t || $offset-= strlen($t); - } else if ('<' === $token) { - $t= $next(self::DELIMITERS); - if ('<' === $t) { - $n= $next(self::DELIMITERS); - if ('<' === $n) { - $label= $next("\r\n"); - $end= trim($label, '"\''); - $l= strlen($end); - $string= "<<<{$label}"; - - heredoc: $token= $next("\r\n"); - if (0 === substr_compare($token, $end, $p= strspn($token, ' '), $l)) { - $p+= $l; - $offset-= strlen($token) - $p; - yield new Token($language->symbol('(literal)'), 'heredoc', $string.substr($token, 0, $p), $line); - $line+= substr_count($string, "\n"); - continue; - } else if (null === $token) { - throw new FormatException('Unclosed heredoc literal starting at line '.$line); - } - $string.= $token; - goto heredoc; - } - $offset-= strlen($n); - } - $offset-= strlen($t); } // Handle combined operators. First, ensure we have enough bytes in our buffer @@ -225,7 +182,39 @@ public function iterator($language) { $offset+= strlen($token); } - yield new Token($language->symbol($token), 'operator', $token, $line); + // Distinguish single- and multiline comments as well as heredoc from operators + if ('//' === $token) { + yield new Token(null, 'comment', '//'.$next("\r\n"), $line); + } else if ('/*' === $token) { + $comment= ''; + do { + $chunk= $next('/'); + $comment.= $chunk; + } while (null !== $chunk && '*' !== $chunk[strlen($chunk) - 1]); + $comment.= $next('/'); + yield new Token(null, '*' === $comment[0] ? 
'apidoc' : 'comment', '/*'.$comment, $line); + $line+= substr_count($comment, "\n"); + } else if ('<<<' === $token) { + $label= $next("\r\n"); + $end= trim($label, '"\''); + $l= strlen($end); + $string= "<<<{$label}"; + + heredoc: $token= $next("\r\n"); + if (0 === substr_compare($token, $end, $p= strspn($token, ' '), $l)) { + $p+= $l; + $offset-= strlen($token) - $p; + yield new Token($language->symbol('(literal)'), 'heredoc', $string.substr($token, 0, $p), $line); + $line+= substr_count($string, "\n"); + continue; + } else if (null === $token) { + throw new FormatException('Unclosed heredoc literal starting at line '.$line); + } + $string.= $token; + goto heredoc; + } else { + yield new Token($language->symbol($token), 'operator', $token, $line); + } } else { yield new Token($language->symbols[$token] ?? $language->symbol('(name)'), 'name', $token, $line); } From af447f3b70c7a3f90f5b406ae5729c19f26eb8ef Mon Sep 17 00:00:00 2001 From: Timm Friebe Date: Sun, 26 Jan 2025 13:37:54 +0100 Subject: [PATCH 7/7] Improve performance for single-character operators --- src/main/php/lang/ast/Tokens.class.php | 67 +++++++++++++------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/src/main/php/lang/ast/Tokens.class.php b/src/main/php/lang/ast/Tokens.class.php index 4152029..c8b278f 100755 --- a/src/main/php/lang/ast/Tokens.class.php +++ b/src/main/php/lang/ast/Tokens.class.php @@ -166,13 +166,13 @@ public function iterator($language) { // Handle combined operators. First, ensure we have enough bytes in our buffer // Our longest operator is 3 characters, hardcode this here. 
- if (self::OPERATORS[$token]) { + if ($combined= self::OPERATORS[$token]) { $offset--; while ($offset + 3 > $length && $this->in->available()) { $buffer.= $this->in->read(8192); $length= strlen($buffer); } - foreach (self::OPERATORS[$token] as $operator) { + foreach ($combined as $operator) { if ($offset + strlen($operator) > $length) continue; if (0 === substr_compare($buffer, $operator, $offset, strlen($operator))) { $token= $operator; @@ -180,41 +180,42 @@ public function iterator($language) { } } $offset+= strlen($token); - } - - // Distinguish single- and multiline comments as well as heredoc from operators - if ('//' === $token) { - yield new Token(null, 'comment', '//'.$next("\r\n"), $line); - } else if ('/*' === $token) { - $comment= ''; - do { - $chunk= $next('/'); - $comment.= $chunk; - } while (null !== $chunk && '*' !== $chunk[strlen($chunk) - 1]); - $comment.= $next('/'); - yield new Token(null, '*' === $comment[0] ? 'apidoc' : 'comment', '/*'.$comment, $line); - $line+= substr_count($comment, "\n"); - } else if ('<<<' === $token) { - $label= $next("\r\n"); - $end= trim($label, '"\''); - $l= strlen($end); - $string= "<<<{$label}"; - heredoc: $token= $next("\r\n"); - if (0 === substr_compare($token, $end, $p= strspn($token, ' '), $l)) { - $p+= $l; - $offset-= strlen($token) - $p; - yield new Token($language->symbol('(literal)'), 'heredoc', $string.substr($token, 0, $p), $line); - $line+= substr_count($string, "\n"); + // Distinguish single- and multiline comments as well as heredoc from operators + if ('//' === $token) { + yield new Token(null, 'comment', '//'.$next("\r\n"), $line); + continue; + } else if ('/*' === $token) { + $comment= ''; + do { + $chunk= $next('/'); + $comment.= $chunk; + } while (null !== $chunk && '*' !== $chunk[strlen($chunk) - 1]); + $comment.= $next('/'); + yield new Token(null, '*' === $comment[0] ? 
'apidoc' : 'comment', '/*'.$comment, $line); + $line+= substr_count($comment, "\n"); continue; - } else if (null === $token) { - throw new FormatException('Unclosed heredoc literal starting at line '.$line); + } else if ('<<<' === $token) { + $label= $next("\r\n"); + $end= trim($label, '"\''); + $l= strlen($end); + $string= "<<<{$label}"; + + heredoc: $token= $next("\r\n"); + if (0 === substr_compare($token, $end, $p= strspn($token, ' '), $l)) { + $p+= $l; + $offset-= strlen($token) - $p; + yield new Token($language->symbol('(literal)'), 'heredoc', $string.substr($token, 0, $p), $line); + $line+= substr_count($string, "\n"); + continue; + } else if (null === $token) { + throw new FormatException('Unclosed heredoc literal starting at line '.$line); + } + $string.= $token; + goto heredoc; } - $string.= $token; - goto heredoc; - } else { - yield new Token($language->symbol($token), 'operator', $token, $line); } + yield new Token($language->symbol($token), 'operator', $token, $line); } else { yield new Token($language->symbols[$token] ?? $language->symbol('(name)'), 'name', $token, $line); }