From d4c3f07f3f6e4e80df1e673c61f5547708a7e392 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 11 Jan 2022 18:23:00 -0500 Subject: [PATCH 1/2] Entropy scanning fixups The loosey-goosey "base64-anything" matching introduced in 3.0.0 turned out to be problematic because we found several examples of things that looked like base64 encodings but weren't and which generated new issues thanks to the extended combined alphabet. Simple tweaks looked problematic (read "nondeterministic") going forward. This fix is a little bit more invasive. * We scan for BASE64 and BASE64URL encodings separately (using the same flakey expressions as pre-3.0.0 did) so they cannot bleed into each other. The cost of this is that we now make 3 (vs 2) regex scans of input text. A number of unit tests required revision to account for the additional calls. * To offset this cost, we further adjust the regex expressions to match a minimum of 20 characters, offloading much of the work from function `util.find_strings_by_regex()` which spent most of its time throwing away short strings. Nothing in the tartufo codebase allows users to supply a different minimum length; I added a warning for folks who might be building other code on top of tartufo. * Additionally, `scanner.scan_entropy()` used to split lines into words and scan the words individually. Since the supplied regexes will do this by themselves anyway, we eliminate the `line.split()` step and eliminate a pass over the input text. This required fixes to a number of unit tests where we generally make fewer calls on longer targets. * But wait! There's more! Many strings are both base64 and base64url, and we don't want to generate duplicate (or overlapping) issues. That resulted in the following two changes... * `scanner.evaluate_entropy_string()` now returns an optional issue instead of being a generator (that returned either 1 or 0 issues). This allows its only caller to inspect the return before passing it back to higher-level callers. 
* `scanner.scan_entropy()` now jumps through deduplication hoops: - results from both base64 and base64url scans are combined in a set to eliminate strict duplicates; as a side effect, it is possible that lines with multiple issues (in the same line) may return the issues in a different order. - target strings are evaluated from longest to shortest, and we no longer report an issue for a substring if we already reported an issue on the longer string containing it. We do not strictly test that the substring is located within the longer string, so a line that contained "bad-string even-more-bad-string" would not generate an issue for the initial "bad-string" even though it appears before the subsequent "even-more-bad-string" (which would get an issue). - We also test for duplicate hex encoding issues (because a hex string is also a base64 string) -- this could not happen with default sensitivity (backstory omitted) but is nice to close off. My testing shows variations in the noise level when tartufo scans itself, but it seems like the extra overhead should be approximately balanced by the efficiency gains and we'll see a wash. Time will tell. Note it is *still* possible to get new issues (not reported by 2.x) if a string begins or ends with `-` or `_` and the old string didn't generate an issue but the new longer string does. 
--- tartufo/scanner.py | 38 ++++++++++++----- tartufo/util.py | 6 +++ tests/test_base_scanner.py | 83 +++++++++++++++++++++++++++++++------- tests/test_util.py | 6 +-- 4 files changed, 105 insertions(+), 28 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index dfb7df36..02676bff 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -36,8 +36,9 @@ Scope, ) -BASE64_REGEX = re.compile(r"[A-Z0-9=+/_-]+", re.IGNORECASE) -HEX_REGEX = re.compile(r"[0-9A-F]+", re.IGNORECASE) +BASE64_REGEX = re.compile(r"[A-Z0-9=+/]{20,}", re.IGNORECASE) +BASE64URL_REGEX = re.compile(r"[A-Z0-9=_-]{20,}", re.IGNORECASE) +HEX_REGEX = re.compile(r"[0-9A-F]{20,}", re.IGNORECASE) class Issue: @@ -561,15 +562,29 @@ def scan_entropy( """ for line in chunk.contents.split("\n"): - for word in line.split(): - for string in util.find_strings_by_regex(word, BASE64_REGEX): - yield from self.evaluate_entropy_string( + # Using a set eliminates exact duplicates, which are common + base64_strings = set(util.find_strings_by_regex(line, BASE64_REGEX)) + base64_strings.update(util.find_strings_by_regex(line, BASE64URL_REGEX)) + # Don't report a string which is a substring of a string we already reported + reported_strings: List[str] = [] + for string in sorted(base64_strings, key=len, reverse=True): + if all((string not in already for already in reported_strings)): + report = self.evaluate_entropy_string( chunk, line, string, self.b64_entropy_limit ) - for string in util.find_strings_by_regex(word, HEX_REGEX): - yield from self.evaluate_entropy_string( + if report is not None: + reported_strings.append(string) + yield report + # Hex alphabet is subset of base64, so check duplicates here too; + # but it's easier because hex string will never be substring of another + # hex string. 
+ for string in util.find_strings_by_regex(line, HEX_REGEX): + if all((string not in already for already in reported_strings)): + report = self.evaluate_entropy_string( chunk, line, string, self.hex_entropy_limit ) + if report is not None: + yield report def evaluate_entropy_string( self, @@ -577,22 +592,23 @@ def evaluate_entropy_string( line: str, string: str, min_entropy_score: float, - ) -> Generator[Issue, None, None]: + ) -> Optional[Issue]: """Check entropy string using entropy characters and score. :param chunk: The chunk of data to check :param line: Source line containing string of interest :param string: String to check :param min_entropy_score: Minimum entropy score to flag - :return: Generator of issues flagged + :return: Issue, if one is detected; otherwise None """ if not self.signature_is_excluded(string, chunk.file_path): entropy_score = self.calculate_entropy(string) if entropy_score > min_entropy_score: if self.entropy_string_is_excluded(string, line, chunk.file_path): self.logger.debug("line containing entropy was excluded: %s", line) - else: - yield Issue(types.IssueType.Entropy, string, chunk) + return None + return Issue(types.IssueType.Entropy, string, chunk) + return None def scan_regex(self, chunk: types.Chunk) -> Generator[Issue, None, None]: """Scan a chunk of data for matches against the configured regexes. diff --git a/tartufo/util.py b/tartufo/util.py index 2237c5cb..8bb19ae5 100644 --- a/tartufo/util.py +++ b/tartufo/util.py @@ -209,6 +209,12 @@ def find_strings_by_regex( :param text: The text string to be analyzed :param regex: A pattern which matches all character sequences of interest :param threshold: The minimum acceptable length of a matching string + + WARNING: If you are passing one of the scanner.py regex variables, note + they are compiled with a minimum length of 20, and using a lower threshold + here will not have the intended effect. 
Nothing in this codebase passes a + threshold parameter, but external users might need to adjust or use their + own expressions. """ for match in regex.finditer(text): diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index 2fe2fd2a..2f370319 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -444,7 +444,20 @@ def test_issues_are_not_created_for_b64_string_excluded_signatures( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = True issues = list(self.scanner.scan_entropy(self.chunk)) mock_calculate.assert_not_called() @@ -459,7 +472,7 @@ def test_issues_are_not_created_for_hex_string_excluded_signatures( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = True issues = list(self.scanner.scan_entropy(self.chunk)) mock_calculate.assert_not_called() @@ -474,7 +487,20 @@ def test_issues_are_created_for_high_entropy_b64_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = False mock_calculate.return_value = 9.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -491,7 +517,7 @@ def test_issues_are_created_for_high_entropy_hex_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = False 
mock_calculate.return_value = 9.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -510,7 +536,7 @@ def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_ex mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_entropy.return_value = True mock_signature.return_value = False mock_calculate.return_value = 9.0 @@ -528,7 +554,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_exc mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_entropy.return_value = True mock_signature.return_value = False mock_calculate.return_value = 9.0 @@ -544,7 +583,20 @@ def test_issues_are_not_created_for_low_entropy_b64_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_strings.side_effect = ( + ["foo"], + ["foo"], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + ) mock_signature.return_value = False mock_calculate.return_value = 1.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -559,7 +611,7 @@ def test_issues_are_not_created_for_low_entropy_hex_strings( mock_signature: mock.MagicMock, mock_calculate: mock.MagicMock, ): - mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_strings.side_effect = ([], [], ["foo"], [], [], [], [], [], [], [], [], []) mock_signature.return_value = False mock_calculate.return_value = 1.0 issues = list(self.scanner.scan_entropy(self.chunk)) @@ -604,12 +656,15 @@ def test_scan_entropy_find_b64_strings_for_every_word_in_diff( list(self.scanner.scan_entropy(self.chunk)) mock_strings.assert_has_calls( ( - mock.call("foo", 
scanner.BASE64_REGEX), - mock.call("foo", scanner.HEX_REGEX), - mock.call("bar", scanner.BASE64_REGEX), - mock.call("bar", scanner.HEX_REGEX), - mock.call("asdfqwer", scanner.BASE64_REGEX), - mock.call("asdfqwer", scanner.HEX_REGEX), + mock.call(" foo bar", scanner.BASE64_REGEX), + mock.call(" foo bar", scanner.BASE64URL_REGEX), + mock.call(" foo bar", scanner.HEX_REGEX), + mock.call(" asdfqwer", scanner.BASE64_REGEX), + mock.call(" asdfqwer", scanner.BASE64URL_REGEX), + mock.call(" asdfqwer", scanner.HEX_REGEX), + mock.call(" ", scanner.BASE64_REGEX), + mock.call(" ", scanner.BASE64URL_REGEX), + mock.call(" ", scanner.HEX_REGEX), ) ) diff --git a/tests/test_util.py b/tests/test_util.py index 59aee19d..f9577fa5 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -427,14 +427,14 @@ def test_find_strings_by_regex_recognizes_base64url(self): """ strings = list( - util.find_strings_by_regex(sample_input, scanner.BASE64_REGEX, 20) + util.find_strings_by_regex(sample_input, scanner.BASE64URL_REGEX, 20) ) self.assertEqual(strings, ["111111111-ffffCCCC=="]) def test_find_strings_by_regex_recognizes_mutant_base64(self): sample_input = """ - +111111111-ffffCCCC= Can't mix + and - but both are in regex + +111111111-ffffCCCC= Can't mix + and - and substrings are too short 111111111111111111111== Not a valid length but we don't care ==111111111111111111 = Is supposed to be end only but we don't care """ @@ -444,5 +444,5 @@ def test_find_strings_by_regex_recognizes_mutant_base64(self): ) self.assertEqual( strings, - ["+111111111-ffffCCCC=", "111111111111111111111==", "==111111111111111111"], + ["111111111111111111111==", "==111111111111111111"], ) From 90cd64e1a62b7b926953f7e93becf853133338d5 Mon Sep 17 00:00:00 2001 From: Scott Bailey Date: Tue, 11 Jan 2022 20:04:38 -0500 Subject: [PATCH 2/2] CHANGELOG updated --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d59874f0..f1bf0244 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -1,3 +1,13 @@ +v3.0.x - TBD +------------ + +Bug fixes: + +* [315](https://github.com/godaddy/tartufo/issues/315) - Rework entropy scanning + to reduce number of new issues generated for text that passed 2.x scans without + issues. Some corner cases may remain that will require exclusions to silence. + See [319](https://github.com/godaddy/tartufo/pull/319) for details. + v3.0.0 - 5 January 2022 -----------------------