Commit faf7ade: Check if a file can be compared exactly
tomasr8 committed Oct 29, 2024
1 parent 1ef8c7a commit faf7ade
38 changes: 27 additions & 11 deletions Lib/test/test_tokenize.py
@@ -1,4 +1,5 @@
 import os
+import re
 import token
 import tokenize
 import unittest
@@ -1803,7 +1804,7 @@ def test_backslash_continuation(self):
         u.prev_row = 2
         u.add_whitespace((4, 4))
         self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
-        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n', compare_tokens_only=True)
+        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')
 
     def test_iter_compat(self):
         u = tokenize.Untokenizer()
@@ -1819,18 +1820,34 @@ def test_iter_compat(self):
         self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
 
 
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\s*\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):
 
-    def check_roundtrip(self, f, *, compare_tokens_only=False):
+    def check_roundtrip(self, f):
         """
         Test roundtrip for `untokenize`. `f` is an open file or a string.
         The source code in f is tokenized to both 5- and 2-tuples.
         Both sequences are converted back to source code via
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
-        If `compare_tokens_only` is False, the exact output of `untokenize`
-        is compared against the original source code.
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
 
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
@@ -1855,12 +1872,12 @@ def check_roundtrip(self, f, *, compare_tokens_only=False):
         tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
-        # Compare the exact output
-        if not compare_tokens_only:
-            readline = iter(code.splitlines(keepends=True)).__next__
-            # The BOM does not produce a token so there is no way to preserve it
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
             code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
-            self.assertEqual(code_without_bom, tokenize.untokenize(tokenize.tokenize(readline)))
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
 
     def check_line_extraction(self, f):
         if isinstance(f, str):
@@ -2011,8 +2028,7 @@ def test_random_files(self):
                 print('tokenize', testfile)
             with open(testfile, 'rb') as f:
                 with self.subTest(file=testfile):
-                    compare_tokens_only = os.path.basename(testfile) == "test_traceback.py"  # Ambiguous backslash continuation
-                    self.check_roundtrip(f, compare_tokens_only=compare_tokens_only)
+                    self.check_roundtrip(f)
                     self.check_line_extraction(f)
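
A note on what the new check guards against: as the docstring of
contains_ambiguous_backslash above explains, a backslash alone on its own
line produces no token, so untokenize cannot know its indentation. A minimal
sketch of the effect, using only the standard library (the roundtripped bytes
named in the comments are illustrative, not guaranteed):

    import io
    import tokenize

    # A lone backslash inside parentheses is legal but leaves no token behind.
    source = b"a = (1\n    \\\n)\n"

    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    roundtripped = tokenize.untokenize(tokens)

    # untokenize has to re-synthesize the line joining and cannot recover the
    # indentation of the backslash-only line, so the bytes differ
    # (typically b"a = (1\\\n\\\n)\n" here).
    assert roundtripped != source

    # The 2-tuple token streams still agree, which is the weaker property
    # check_roundtrip keeps verifying for such ambiguous files.
    tok2 = [t[:2] for t in tokenize.tokenize(io.BytesIO(source).readline)]
    tok2_rt = [t[:2] for t in tokenize.tokenize(io.BytesIO(roundtripped).readline)]
    assert tok2 == tok2_rt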

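On the docstring's "5- and 2-tuples": untokenize has an exact mode (full
5-tuple tokens with positions) and a compat mode (2-tuples of type and
string), and check_roundtrip exercises both. A small sketch of the
difference (the byte string in the comment is illustrative):

    import io
    import tokenize

    source = b"x = 1\n"
    tokens5 = list(tokenize.tokenize(io.BytesIO(source).readline))
    tokens2 = [t[:2] for t in tokens5]

    # With positions available, untokenize can reproduce this source exactly.
    assert tokenize.untokenize(tokens5) == source

    # From (type, string) pairs alone it must guess spacing, so the output can
    # differ byte-for-byte (e.g. b"x =1 \n") while still tokenizing back to the
    # same 2-tuple stream, which is what the roundtrip test compares.
    approx = tokenize.untokenize(tokens2)
    retok2 = [t[:2] for t in tokenize.tokenize(io.BytesIO(approx).readline)]
    assert retok2 == tokens2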

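The regex and the BOM handling in the new exact-comparison branch can also be
checked in isolation; the pattern is copied from the diff, and the sample
inputs are made-up test strings:

    import re

    # Pattern from the commit: a line holding only a backslash (with optional
    # surrounding whitespace) between two newlines.
    pattern = re.compile(br'\n\s*\\\s*\r?\n')

    # A backslash alone on its own line is flagged as ambiguous...
    assert pattern.search(b"a = (1\n    \\\n)\n") is not None
    # ...while an ordinary continuation at the end of a code line is not.
    assert pattern.search(b"x = 1 + \\\n    2\n") is None

    # The UTF-8 BOM produces no token, so the exact comparison strips it from
    # the original bytes first, as check_roundtrip now does.
    code = b"\xef\xbb\xbfprint('hi')\n"
    assert code.removeprefix(b"\xef\xbb\xbf") == b"print('hi')\n"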