From d4c3f07f3f6e4e80df1e673c61f5547708a7e392 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 11 Jan 2022 18:23:00 -0500 Subject: [PATCH 1/2] Entropy scanning fixups The loosey-goosey "base64-anything" matching introduced in 3.0.0 turned out to be problematic because we found several examples of things that looked like base64 encodings but weren't and which generated new issues thanks to the extended combined alphabet. Simple tweaks looked problematic (read "nondeterministic") going forward. This fix is a little bit more invasive. * We scan for BASE64 and BASE64URL encodings separately (using the same flakey expressions as pre-3.0.0 did) so they cannot bleed into each other. The cost of this is that we now make 3 (vs 2) regex scans of input text. A number of unit tests required revision to account for the additional calls. * To offset this cost, we further adjust the regex expressions to match a minimum of 20 characters, offloading much of the work from function `util.find_strings_by_regex()` which spent most of its time throwing away short strings. Nothing in the tartufo codebase allows users to supply a different minimum length; I added a warning for folks who might be building other code on top of tartufo. * Additionally, `scanner.scan_entropy()` used to split lines into words and scan the words individually. Since the supplied regexes will do this by themselves anyway, we eliminate the `line.split()` step and eliminate a pass over the input text. This required fixes to a number of unit tests where we generally make fewer calls on longer targets. * But wait! There's more! Many strings are both base64 and base64url, and we don't want to generate duplicate (or overlapping) issues. That resulted in the following two changes... * `scanner.evaluate_entropy_string()` now returns an optional issue instead of being a generator (that returned either 1 or 0 issues). This allows its only caller to inspect the return before passing it back to higher-level callers. 
* `scanner.scan_entropy()` now jumps through deduplication hoops: - results from both base64 and base64url scans are combined in a set to eliminate strict duplicates; as a side effect, it is possible that lines with multiple issues (in the same line) may return the issues in a different order. - target strings are evaluated from longest to shortest, and we no longer report an issue for a substring if we already reported an issue on the longer string containing it. We do not strictly test that the substring is located within the longer string, so a line that contained "bad-string even-more-bad-string" would not generate an issue for the initial "bad-string" even though it appears before the subsequent "even-more-bad-string" (which would get an issue). - We also test for duplicate hex encoding issues (because a hex string is also a base64 string) -- this could not happen with default sensitivity (backstory omitted) but is nice to close off. My testing shows variations in the noise level when tartufo scans itself, but it seems like the extra overhead should be approximately balanced by the efficiency gains and we'll see a wash. Time will tell. Note it is *still* possible to get new issues (not reported by 2.x) if a string begins or ends with `-` or `_` and the old string didn't generate an issue but the new longer string does. 
--- tartufo/scanner.py | 38 ++++++++++++----- tartufo/util.py | 6 +++ tests/test_base_scanner.py | 83 +++++++++++++++++++++++++++++++------- tests/test_util.py | 6 +-- 4 files changed, 105 insertions(+), 28 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index dfb7df36..02676bff 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -36,8 +36,9 @@ Scope, ) -BASE64_REGEX = re.compile(r"[A-Z0-9=+/_-]+", re.IGNORECASE) -HEX_REGEX = re.compile(r"[0-9A-F]+", re.IGNORECASE) +BASE64_REGEX = re.compile(r"[A-Z0-9=+/]{20,}", re.IGNORECASE) +BASE64URL_REGEX = re.compile(r"[A-Z0-9=_-]{20,}", re.IGNORECASE) +HEX_REGEX = re.compile(r"[0-9A-F]{20,}", re.IGNORECASE) class Issue: @@ -561,15 +562,29 @@ def scan_entropy( """ for line in chunk.contents.split("\n"): - for word in line.split(): - for string in util.find_strings_by_regex(word, BASE64_REGEX): - yield from self.evaluate_entropy_string( + # Using a set eliminates exact duplicates, which are common + base64_strings = set(util.find_strings_by_regex(line, BASE64_REGEX)) + base64_strings.update(util.find_strings_by_regex(line, BASE64URL_REGEX)) + # Don't report a string which is a substring of a string we already reported + reported_strings: List[str] = [] + for string in sorted(base64_strings, key=len, reverse=True): + if all((string not in already for already in reported_strings)): + report = self.evaluate_entropy_string( chunk, line, string, self.b64_entropy_limit ) - for string in util.find_strings_by_regex(word, HEX_REGEX): - yield from self.evaluate_entropy_string( + if report is not None: + reported_strings.append(string) + yield report + # Hex alphabet is subset of base64, so check duplicates here too; + # but it's easier because hex string will never be substring of another + # hex string. 
+ for string in util.find_strings_by_regex(line, HEX_REGEX): + if all((string not in already for already in reported_strings)): + report = self.evaluate_entropy_string( chunk, line, string, self.hex_entropy_limit ) + if report is not None: + yield report def evaluate_entropy_string( self, @@ -577,22 +592,23 @@ def evaluate_entropy_string( line: str, string: str, min_entropy_score: float, - ) -> Generator[Issue, None, None]: + ) -> Optional[Issue]: """Check entropy string using entropy characters and score. :param chunk: The chunk of data to check :param line: Source line containing string of interest :param string: String to check :param min_entropy_score: Minimum entropy score to flag - :return: Generator of issues flagged + :return: Issue, if one is detected; otherwise None """ if not self.signature_is_excluded(string, chunk.file_path): entropy_score = self.calculate_entropy(string) if entropy_score > min_entropy_score: if self.entropy_string_is_excluded(string, line, chunk.file_path): self.logger.debug("line containing entropy was excluded: %s", line) - else: - yield Issue(types.IssueType.Entropy, string, chunk) + return None + return Issue(types.IssueType.Entropy, string, chunk) + return None def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: """Scan a chunk of data for matches against the configured regexes. diff --git a/tartufo/util.py b/tartufo/util.py index 2237c5cb..8bb19ae5 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -209,6 +209,12 @@ def find_strings_by_regex( :param text: The text string to be analyzed :param regex: A pattern which matches all character sequences of interest :param threshold: The minimum acceptable length of a matching string + + WARNING: If you are passing one of the scanner.py regex variables, note + they are compiled with a minimum length of 20, and using a lower threshold + here will not have the intended effect. 
Nothing in this codebase passes a + threshold parameter, but external users might need to adjust or use their + own expressions. """ for match in regex.finditer(text): diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index 2fe2fd2a..2f370319 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -444,7 +444,20 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = True issues = list(self.scanner.scan_entropy(self.chunk)) mock_calculate.assert_not_called() @@ -459,7 +472,7 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = True issues = list(self.scanner.scan_entropy(self.chunk)) mock_calculate.assert_not_called() @@ -474,7 +487,20 @@ def test_issues_are_created_for_high_entropy_b64_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = False mock_calculate.return_value = 9.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -491,7 +517,7 @@ def test_issues_are_created_for_high_entropy_hex_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = False 
mock_calculate.return_value = 9.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -510,7 +536,7 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_entropy.return_value = True mock_signature.return_value = False mock_calculate.return_value = 9.0 @@ -528,7 +554,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_entropy.return_value = True mock_signature.return_value = False mock_calculate.return_value = 9.0 @@ -544,7 +583,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = False mock_calculate.return_value = 1.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -559,7 +611,7 @@ def test_issues_are_not_created_for_low_entropy_hex_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = False mock_calculate.return_value = 1.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -604,12 +656,15 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff( list(self.scanner.scan_entropy(self.chunk)) mock_strings.assert_has_calls( ( - mock.call("foo", 
scanner.BASE64_REGEX), - mock.call("foo", scanner.HEX_REGEX), - mock.call("bar", scanner.BASE64_REGEX), - mock.call("bar", scanner.HEX_REGEX), - mock.call("asdfqwer", scanner.BASE64_REGEX), - mock.call("asdfqwer", scanner.HEX_REGEX), + mock.call(" foo bar", scanner.BASE64_REGEX), + mock.call(" foo bar", scanner.BASE64URL_REGEX), + mock.call(" foo bar", scanner.HEX_REGEX), + mock.call(" asdfqwer", scanner.BASE64_REGEX), + mock.call(" asdfqwer", scanner.BASE64URL_REGEX), + mock.call(" asdfqwer", scanner.HEX_REGEX), + mock.call(" ", scanner.BASE64_REGEX), + mock.call(" ", scanner.BASE64URL_REGEX), + mock.call(" ", scanner.HEX_REGEX), ) ) diff --git a/tests/test_util.py b/tests/test_util.py index 59aee19d..f9577fa5 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -427,14 +427,14 @@ def test_find_strings_by_regex_recognizes_base64url(self): """ strings = list( - util.find_strings_by_regex(sample_input, scanner.BASE64_REGEX, 20) + util.find_strings_by_regex(sample_input, scanner.BASE64URL_REGEX, 20) ) self.assertEqual(strings, ["111111111-ffffCCCC=="]) def test_find_strings_by_regex_recognizes_mutant_base64(self): sample_input = """ - +111111111-ffffCCCC= Can't mix + and - but both are in regex + +111111111-ffffCCCC= Can't mix + and - and substrings are too short 111111111111111111111== Not a valid length but we don't care ==111111111111111111 = Is supposed to be end only but we don't care """ @@ -444,5 +444,5 @@ def test_find_strings_by_regex_recognizes_mutant_base64(self): ) self.assertEqual( strings, - ["+111111111-ffffCCCC=", "111111111111111111111==", "==111111111111111111"], + ["111111111111111111111==", "==111111111111111111"], ) From 90cd64e1a62b7b926953f7e93becf853133338d5 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 11 Jan 2022 20:04:38 -0500 Subject: [PATCH 2/2] CHANGELOG updated --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d59874f0..f1bf0244 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -1,3 +1,13 @@ +v3.0.x - TBD +------------ + +Bug fixes: + +* [315](https://github.com/godaddy/tartufo/issues/315) - Rework entropy scanning + to reduce number of new issues generated for text that passed 2.x scans without + issues. Some corner cases may remain that will require exclusions to silence. + See [319](https://github.com/godaddy/tartufo/pull/319) for details. + v3.0.0 - 5 January 2022 -----------------------