diff --git a/lib/prism/translation/parser.rb b/lib/prism/translation/parser.rb index 4f28afa86f..6b417be423 100644 --- a/lib/prism/translation/parser.rb +++ b/lib/prism/translation/parser.rb @@ -51,7 +51,7 @@ def parse(source_buffer) source = source_buffer.source offset_cache = build_offset_cache(source) - result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache) + result = unwrap(Prism.parse(source, **prism_options), offset_cache) build_ast(result.value, offset_cache) ensure @@ -64,7 +64,7 @@ def parse_with_comments(source_buffer) source = source_buffer.source offset_cache = build_offset_cache(source) - result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache) + result = unwrap(Prism.parse(source, **prism_options), offset_cache) [ build_ast(result.value, offset_cache), @@ -83,7 +83,7 @@ def tokenize(source_buffer, recover = false) offset_cache = build_offset_cache(source) result = begin - unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache) + unwrap(Prism.parse_lex(source, **prism_options), offset_cache) rescue ::Parser::SyntaxError raise if !recover end @@ -285,6 +285,20 @@ def build_range(location, offset_cache) ) end + # Options for how prism should parse/lex the source. + def prism_options + options = { + filepath: @source_buffer.name, + version: convert_for_prism(version), + partial_script: true, + } + # The parser gem always encodes to UTF-8, unless it is binary. + # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107 + options[:encoding] = false if @source_buffer.source.encoding != Encoding::BINARY + + options + end + # Converts the version format handled by Parser to the format handled by Prism. def convert_for_prism(version) case version diff --git a/test/prism/fixtures/encoding_binary.txt b/test/prism/fixtures/encoding_binary.txt new file mode 100644 index 0000000000..f3dfc85abd --- /dev/null +++ b/test/prism/fixtures/encoding_binary.txt @@ -0,0 +1,9 @@ +# encoding: binary + +"\xcd" + +:"\xcd" + +/#{"\xcd"}/ + +%W[\xC0] diff --git a/test/prism/fixtures/encoding_euc_jp.txt b/test/prism/fixtures/encoding_euc_jp.txt new file mode 100644 index 0000000000..bbee76eae5 --- /dev/null +++ b/test/prism/fixtures/encoding_euc_jp.txt @@ -0,0 +1,6 @@ +# encoding: euc-jp + +# \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp +"\x8E\x01" + +%W["\x8E\x01"] diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index 4ba38bd0c0..2e9211e70d 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -17,6 +17,18 @@ # First, opt in to every AST feature. Parser::Builders::Default.modernize +# The parser gem rejects some strings that would most likely lead to errors +# in consumers due to encoding problems. RuboCop however monkey-patches this +# method out in order to accept such code. +# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/builders/default.rb#L2289-L2295 +Parser::Builders::Default.prepend( + Module.new { + def string_value(token) + value(token) + end + } +) + # Modify the source map == check so that it doesn't check against the node # itself so we don't get into a recursive loop. Parser::Source::Map.prepend( diff --git a/test/prism/ruby/ruby_parser_test.rb b/test/prism/ruby/ruby_parser_test.rb index 1aa0f540cc..1d530dd13b 100644 --- a/test/prism/ruby/ruby_parser_test.rb +++ b/test/prism/ruby/ruby_parser_test.rb @@ -26,6 +26,7 @@ def ==(other) module Prism class RubyParserTest < TestCase todos = [ + "encoding_euc_jp.txt", "newline_terminated.txt", "regex_char_width.txt", "seattlerb/bug169.txt", diff --git a/test/prism/snapshots/encoding_binary.txt b/test/prism/snapshots/encoding_binary.txt new file mode 100644 index 0000000000..ad5449c1f1 --- /dev/null +++ b/test/prism/snapshots/encoding_binary.txt @@ -0,0 +1,49 @@ +@ ProgramNode (location: (3,0)-(9,8)) +├── flags: ∅ +├── locals: [] +└── statements: + @ StatementsNode (location: (3,0)-(9,8)) + ├── flags: ∅ + └── body: (length: 4) + ├── @ StringNode (location: (3,0)-(3,6)) + │ ├── flags: newline + │ ├── opening_loc: (3,0)-(3,1) = "\"" + │ ├── content_loc: (3,1)-(3,5) = "\\xcd" + │ ├── closing_loc: (3,5)-(3,6) = "\"" + │ └── unescaped: "\xCD" + ├── @ SymbolNode (location: (5,0)-(5,7)) + │ ├── flags: newline, static_literal + │ ├── opening_loc: (5,0)-(5,2) = ":\"" + │ ├── value_loc: (5,2)-(5,6) = "\\xcd" + │ ├── closing_loc: (5,6)-(5,7) = "\"" + │ └── unescaped: "\xCD" + ├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11)) + │ ├── flags: newline, static_literal + │ ├── opening_loc: (7,0)-(7,1) = "/" + │ ├── parts: (length: 1) + │ │ └── @ EmbeddedStatementsNode (location: (7,1)-(7,10)) + │ │ ├── flags: ∅ + │ │ ├── opening_loc: (7,1)-(7,3) = "\#{" + │ │ ├── statements: + │ │ │ @ StatementsNode (location: (7,3)-(7,9)) + │ │ │ ├── flags: ∅ + │ │ │ └── body: (length: 1) + │ │ │ └── @ StringNode (location: (7,3)-(7,9)) + │ │ │ ├── flags: static_literal, frozen + │ │ │ ├── opening_loc: (7,3)-(7,4) = "\"" + │ │ │ ├── content_loc: (7,4)-(7,8) = "\\xcd" + │ │ │ ├── closing_loc: (7,8)-(7,9) = "\"" + │ │ │ └── unescaped: "\xCD" + │ │ └── closing_loc: (7,9)-(7,10) = "}" + │ └── closing_loc: (7,10)-(7,11) = "/" + └── @ ArrayNode (location: (9,0)-(9,8)) + ├── flags: newline + ├── elements: (length: 1) + │ └── @ StringNode (location: (9,3)-(9,7)) + │ ├── flags: ∅ + │ ├── opening_loc: ∅ + │ ├── content_loc: (9,3)-(9,7) = "\\xC0" + │ ├── closing_loc: ∅ + │ └── unescaped: "\xC0" + ├── opening_loc: (9,0)-(9,3) = "%W[" + └── closing_loc: (9,7)-(9,8) = "]" diff --git a/test/prism/snapshots/encoding_euc_jp.txt b/test/prism/snapshots/encoding_euc_jp.txt new file mode 100644 index 0000000000..934a1fdb7e --- /dev/null +++ b/test/prism/snapshots/encoding_euc_jp.txt @@ -0,0 +1,24 @@ +@ ProgramNode (location: (4,0)-(6,14)) +├── flags: ∅ +├── locals: [] +└── statements: + @ StatementsNode (location: (4,0)-(6,14)) + ├── flags: ∅ + └── body: (length: 2) + ├── @ StringNode (location: (4,0)-(4,10)) + │ ├── flags: newline + │ ├── opening_loc: (4,0)-(4,1) = "\"" + │ ├── content_loc: (4,1)-(4,9) = "\\x8E\\x01" + │ ├── closing_loc: (4,9)-(4,10) = "\"" + │ └── unescaped: "\x8E\x01" + └── @ ArrayNode (location: (6,0)-(6,14)) + ├── flags: newline + ├── elements: (length: 1) + │ └── @ StringNode (location: (6,3)-(6,13)) + │ ├── flags: ∅ + │ ├── opening_loc: ∅ + │ ├── content_loc: (6,3)-(6,13) = "\"\\x8E\\x01\"" + │ ├── closing_loc: ∅ + │ └── unescaped: "\"\x8E\x01\"" + ├── opening_loc: (6,0)-(6,3) = "%W[" + └── closing_loc: (6,13)-(6,14) = "]" diff --git a/test/prism/snippets_test.rb b/test/prism/snippets_test.rb index 26847da184..66802c5dc3 100644 --- a/test/prism/snippets_test.rb +++ b/test/prism/snippets_test.rb @@ -5,6 +5,7 @@ module Prism class SnippetsTest < TestCase except = [ + "encoding_binary.txt", "newline_terminated.txt", "seattlerb/begin_rescue_else_ensure_no_bodies.txt", "seattlerb/case_in.txt",