Commit faf7ade: Check if a file can be compared exactly
tomasr8 committed Oct 29, 2024
1 parent 1ef8c7a commit faf7ade
38 changes: 27 additions & 11 deletions Lib/test/test_tokenize.py
@@ -1,4 +1,5 @@
 import os
+import re
 import token
 import tokenize
 import unittest
@@ -1803,7 +1804,7 @@ def test_backslash_continuation(self):
         u.prev_row = 2
         u.add_whitespace((4, 4))
         self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
-        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n', compare_tokens_only=True)
+        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')
 
     def test_iter_compat(self):
         u = tokenize.Untokenizer()
@@ -1819,18 +1820,34 @@ def test_iter_compat(self):
         self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
 
 
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\s*\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):
 
-    def check_roundtrip(self, f, *, compare_tokens_only=False):
+    def check_roundtrip(self, f):
         """
         Test roundtrip for `untokenize`. `f` is an open file or a string.
         The source code in f is tokenized to both 5- and 2-tuples.
         Both sequences are converted back to source code via
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
-        If `compare_tokens_only` is False, the exact output of `untokenize`
-        is compared against the original source code.
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
 
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
@@ -1855,12 +1872,12 @@ def check_roundtrip(self, f, *, compare_tokens_only=False):
         tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
-        # Compare the exact output
-        if not compare_tokens_only:
-            readline = iter(code.splitlines(keepends=True)).__next__
-            # The BOM does not produce a token so there is no way to preserve it
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
             code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
-            self.assertEqual(code_without_bom, tokenize.untokenize(tokenize.tokenize(readline)))
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
 
     def check_line_extraction(self, f):
         if isinstance(f, str):
@@ -2011,8 +2028,7 @@ def test_random_files(self):
                 print('tokenize', testfile)
             with open(testfile, 'rb') as f:
                 with self.subTest(file=testfile):
-                    compare_tokens_only = os.path.basename(testfile) == "test_traceback.py"  # Ambiguous backslash continuation
-                    self.check_roundtrip(f, compare_tokens_only=compare_tokens_only)
+                    self.check_roundtrip(f)
                     self.check_line_extraction(f)
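
A note on what the new check guards against: as the docstring of
contains_ambiguous_backslash above explains, a backslash alone on its own
line produces no token, so untokenize cannot know its indentation. A minimal
sketch of the effect, using only the standard library (the roundtripped bytes
named in the comments are illustrative, not guaranteed):

    import io
    import tokenize

    # A lone backslash inside parentheses is legal but leaves no token behind.
    source = b"a = (1\n    \\\n)\n"

    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    roundtripped = tokenize.untokenize(tokens)

    # untokenize has to re-synthesize the line joining and cannot recover the
    # indentation of the backslash-only line, so the bytes differ
    # (typically b"a = (1\\\n\\\n)\n" here).
    assert roundtripped != source

    # The 2-tuple token streams still agree, which is the weaker property
    # check_roundtrip keeps verifying for such ambiguous files.
    tok2 = [t[:2] for t in tokenize.tokenize(io.BytesIO(source).readline)]
    tok2_rt = [t[:2] for t in tokenize.tokenize(io.BytesIO(roundtripped).readline)]
    assert tok2 == tok2_rt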

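On the docstring's "5- and 2-tuples": untokenize has an exact mode (full
5-tuple tokens with positions) and a compat mode (2-tuples of type and
string), and check_roundtrip exercises both. A small sketch of the
difference (the byte string in the comment is illustrative):

    import io
    import tokenize

    source = b"x = 1\n"
    tokens5 = list(tokenize.tokenize(io.BytesIO(source).readline))
    tokens2 = [t[:2] for t in tokens5]

    # With positions available, untokenize can reproduce this source exactly.
    assert tokenize.untokenize(tokens5) == source

    # From (type, string) pairs alone it must guess spacing, so the output can
    # differ byte-for-byte (e.g. b"x =1 \n") while still tokenizing back to the
    # same 2-tuple stream, which is what the roundtrip test compares.
    approx = tokenize.untokenize(tokens2)
    retok2 = [t[:2] for t in tokenize.tokenize(io.BytesIO(approx).readline)]
    assert retok2 == tokens2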

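The regex and the BOM handling in the new exact-comparison branch can also be
checked in isolation; the pattern is copied from the diff, and the sample
inputs are made-up test strings:

    import re

    # Pattern from the commit: a line holding only a backslash (with optional
    # surrounding whitespace) between two newlines.
    pattern = re.compile(br'\n\s*\\\s*\r?\n')

    # A backslash alone on its own line is flagged as ambiguous...
    assert pattern.search(b"a = (1\n    \\\n)\n") is not None
    # ...while an ordinary continuation at the end of a code line is not.
    assert pattern.search(b"x = 1 + \\\n    2\n") is None

    # The UTF-8 BOM produces no token, so the exact comparison strips it from
    # the original bytes first, as check_roundtrip now does.
    code = b"\xef\xbb\xbfprint('hi')\n"
    assert code.removeprefix(b"\xef\xbb\xbf") == b"print('hi')\n"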