From 9f1403cf5c52021075f92b03cf6d4e2bd07799ea Mon Sep 17 00:00:00 2001 From: Edvard Rejthar Date: Wed, 13 Nov 2019 11:56:38 +0100 Subject: [PATCH] better github crash report filter+unique fix Signed-off-by: Edvard Rejthar --- README.md | 10 ++++++---- convey/__main__.py | 3 +-- convey/config.py | 11 ++++++----- convey/identifier.py | 13 ++++++++----- convey/informer.py | 4 ++-- convey/parser.py | 2 +- convey/types.py | 4 ++-- setup.py | 2 +- 8 files changed, 27 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index df6e3d1..a4dc72f 100644 --- a/README.md +++ b/README.md @@ -177,13 +177,13 @@ Some of the field types we are able to auto-detect: * **ip** – standard IPv4 / IPv6 addresses * **cidr** – CIDR notation, ex: 127.0.0.1/32 -* **portIP** – IPv4 in the form 1.2.3.4.port -* **anyIP** – IPv4 garbled in the form `any text 1.2.3.4 any text` +* **port_ip** – IPv4 in the form 1.2.3.4.port +* **any_ip** – IPv4 garbled in the form `any text 1.2.3.4 any text` * **hostname** – or FQDN; 2nd or 3rd domain name * **url** – URL starting with http/https * **asn** – AS Number * **base64** – text encoded with base64 -* **wrongURL** – URL that has been deactivated by replacing certain chars, ex: "hxxp://example[.]com" +* **wrong_url** – URL that has been deactivated by replacing certain chars, ex: "hxxp://example[.]com" ### Overview of all methods: @@ -257,7 +257,8 @@ Should there be multiple ways of using your generator, you may decorate with `Pi ```python3 from convey import PickMethod -PickMethod("all") + +@PickMethod("all") class any_method(PickMethod): def all(x): ''' All of them. ''' @@ -279,6 +280,7 @@ If you need a direct user entry before each processing, import `PickInput` and m ```python3 from convey import PickInput + @PickInput def time_format(val, format="%H:%M"): ''' This text will be displayed to the user. diff --git a/convey/__main__.py b/convey/__main__.py index 7676491..f39b4b9 100644 --- a/convey/__main__.py +++ b/convey/__main__.py @@ -2,7 +2,6 @@ import bdb import pdb import sys -from urllib.parse import quote __doc__ = """Convey – CSV swiss knife brought by CSIRT.cz""" __author__ = "Edvard Rejthar, CSIRT.CZ" @@ -43,7 +42,7 @@ def main(): print(f"Convey crashed at {value} on {traceback.format_exc().splitlines()[-3].strip()}") if Config.get("github_crash_submit"): body = f"```bash\n{traceback.format_exc()}```\n\n```json5\n{tb.tb_next.tb_frame.f_locals}\n```" - Config.github_issue(f"crash: {value}", quote(body)) + Config.github_issue(f"crash: {value}", body) class WebServer: diff --git a/convey/config.py b/convey/config.py index 678334a..ef22eec 100644 --- a/convey/config.py +++ b/convey/config.py @@ -10,6 +10,7 @@ from shutil import copy from subprocess import Popen, PIPE, call from time import sleep +from urllib.parse import quote from appdirs import user_config_dir @@ -22,7 +23,8 @@ handlers.append(fileHandler) except PermissionError: fileHandler = None - print("Cannot create convey.log here at " + str(Path(".").absolute())) + print("Cannot create convey.log here at " + str(Path(".").absolute()) + " – change directory please.") + quit() except FileNotFoundError: # FileNotFoundError emitted when we are in a directory whose inode exists no more print("Current working directory doesn't exist.") quit() @@ -313,10 +315,9 @@ def get(key, section='CONVEY', get=None): @staticmethod def github_issue(title, body): - url = f"https://github.com/CZ-NIC/convey/issues/new?" \ - f"title={title}&body={body}" + url = f"https://github.com/CZ-NIC/convey/issues/new?title={quote(title)}&body={quote(body)}" webbrowser.open(url) - input("\nPlease submit a Github issue at https://github.com/CZ-NIC/convey/issues/new" + input(f"\nPlease submit a Github issue at {url}" "\nTrying to open issue tracker in a browser...") @staticmethod @@ -342,7 +343,7 @@ def get_cache_dir(): return Config.cache_dir @staticmethod - def edit_configuration(flags = 3): + def edit_configuration(flags=3): if flags & 2: app = Popen(['xdg-open', Config.path], stdout=PIPE, stderr=PIPE) elif flags & 1: diff --git a/convey/identifier.py b/convey/identifier.py index aa855cd..64774bd 100644 --- a/convey/identifier.py +++ b/convey/identifier.py @@ -144,12 +144,15 @@ def guess_dialect(sample): # header detection l = [line.strip() for line in sample] - header_to_rows_similarity = mean([SequenceMatcher(None, l[0], it).ratio() for it in l[1:]]) - if len(l[1:]) > 1: - rows_similarity = mean([SequenceMatcher(None, *comb).ratio() for comb in itertools.combinations(l[1:], 2)]) - has_header = rows_similarity > header_to_rows_similarity + 0.1 # it seems that first line differs -> header + if len(l[1:]) > 0: + header_to_rows_similarity = mean([SequenceMatcher(None, l[0], it).ratio() for it in l[1:]]) + if len(l[1:]) > 1: + rows_similarity = mean([SequenceMatcher(None, *comb).ratio() for comb in itertools.combinations(l[1:], 2)]) + has_header = rows_similarity > header_to_rows_similarity + 0.1 # it seems that first line differs -> header + else: + has_header = header_to_rows_similarity < 0.5 else: - has_header = header_to_rows_similarity < 0.5 + has_header = False try: s = sample[1] # we dont take header (there is no empty column for sure) diff --git a/convey/informer.py b/convey/informer.py index 2195984..0744719 100644 --- a/convey/informer.py +++ b/convey/informer.py @@ -37,9 +37,9 @@ def sout_info(self, clear=True, full=False): if self.csv.has_header is not None: l.append("header: " + ("used" if self.csv.has_header else "not used")) if self.csv.settings["filter"]: - l.append("Filter: " + ", ".join([f"{self.csv.fields[f]}({val})" for f, val in self.csv.settings["filter"]])) + l.append("Filter: " + ", ".join([f"{self.csv.fields[f].name}({val})" for f, val in self.csv.settings["filter"]])) if self.csv.settings["unique"]: - l.append("Unique col: " + ", ".join([self.csv.fields[f] for f in self.csv.settings["unique"]])) + l.append("Unique col: " + ", ".join([self.csv.fields[f].name for f in self.csv.settings["unique"]])) if self.csv.settings["split"]: l.append("Split by: {}".format(self.csv.fields[self.csv.settings["split"]])) diff --git a/convey/parser.py b/convey/parser.py index e06f04c..2a6f31b 100644 --- a/convey/parser.py +++ b/convey/parser.py @@ -824,7 +824,7 @@ def get_samples(self, max_samples=inf, supposed_type=None, target_type=None): return res def compute_preview(self, source_line): - if Config.get("compute_preview"): + if Config.get("compute_preview") and self.source_field: c = source_line[self.source_field.col_i] if c is None: # source column has not yet been resolved because of column resorting diff --git a/convey/types.py b/convey/types.py index 2395932..c15e04d 100644 --- a/convey/types.py +++ b/convey/types.py @@ -940,8 +940,8 @@ def add(start, target, disable_s): # formatting mark to put one cluster below another, not aside if flags & 16: # looks nicer in a presentation l.extend([ - "spf -> timestamp[style=invis]", - "formatted_time -> plaintext[style=invis]", + #"spf -> timestamp[style=invis]", + #"formatted_time -> plaintext[style=invis]", ]) else: # looks nicer in README.md l.extend([ diff --git a/setup.py b/setup.py index b9341c7..a5a7995 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( name='convey', - version="1.2rc8", + version="1.2", packages=['convey'], author='Edvard Rejthar', author_email='edvard.rejthar@nic.cz',