diff --git a/README.md b/README.md index 72724d2..24c2990 100644 --- a/README.md +++ b/README.md @@ -86,23 +86,41 @@ $ sha256sum -c discogs_*_CHECKSUM.txt Run `run.py` to convert the dump files to csv. +There are two run modes: + +1. You can point it to a directory where the discogs dump files are + and use one or multiple `--export` options to indicate which files to process: + ```sh # ensure the virtual environment is active (.discogsenv) $ python3 run.py \ --bz2 \ # compresses resulting csv files --apicounts \ # provides more accurate progress counts --export artist --export label --export master --export release \ + --output csv-dir # folder where to output the csv files dump-dir \ # folder where the data dumps are - csv-dir # folder where to output the csv files +``` + +2. You can specify the individual files instead: + +```sh +# ensure the virtual environment is active +(.discogsenv) $ python3 run.py \ + --bz2 \ # compresses resulting csv files + --apicounts \ # provides more accurate progress counts + --output csv-dir # folder where to output the csv files + path/to/discogs_20200806_artist.xml.gz path/to/discogs_20200806_labels.xml.gz ``` `run.py` takes the following arguments: - `--export`: the types of dump files to export: "artist", "label", "master", "release. It matches the names of the dump files, e.g. "discogs_20200806_*artist*s.xml.gz" + Not needed if the individual files are specified. - `--bz2`: Compresses output csv files using bz2 compression library. - `--limit=`: Limits export to some number of entities - `--apicounts`: Makes progress report more accurate by getting total amounts from Discogs API. 
+- `--output`: the folder where to store the csv files; default is the current directory The exporter provides progress information in real time: diff --git a/discogsxml2db/exporter.py b/discogsxml2db/exporter.py index 9a23c7c..f62f4b4 100644 --- a/discogsxml2db/exporter.py +++ b/discogsxml2db/exporter.py @@ -37,7 +37,7 @@ def _write_rows(writer, entity, name): class EntityCsvExporter(object): """Read a Discogs dump XML file and exports SQL table records as CSV. """ - def __init__(self, entity, in_dir, out_dir, + def __init__(self, entity, in_file_or_dir, out_dir, limit=None, bz2=True, dry_run=False, debug=False, max_hint=None, verbose=False): self.entity = entity @@ -45,8 +45,11 @@ def __init__(self, entity, in_dir, out_dir, self.max_hint = max_hint self.verbose = verbose - lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity) - self.pattern = os.path.join(in_dir, lookup) + if os.path.isfile(in_file_or_dir): + self.pattern = in_file_or_dir + else: + lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity) + self.pattern = os.path.join(in_file_or_dir, lookup) # where and how the exporter will write to self.out_dir = out_dir @@ -287,8 +290,7 @@ def write_track_artists(self, writer, release): def main(arguments): - in_base = arguments['INPUT'] - out_base = arguments['OUTPUT'] or '.' + out_base = arguments['--output'] or '.' 
limit = int(arguments['--limit']) if arguments['--limit'] else None bz2_on = arguments['--bz2'] debug = arguments['--debug'] @@ -312,14 +314,38 @@ def main(arguments): except Exception: pass - for entity in arguments['--export']: - expected_count = rough_counts['{}s'.format(entity)] - exporter = _exporters[entity]( - in_base, - out_base, - limit=limit, - bz2=bz2_on, - debug=debug, - max_hint=min(expected_count, limit or expected_count), - dry_run=dry_run) - exporter.export() + if arguments["INPUT_DIR"] and os.path.isdir(arguments["INPUT_DIR"]): + # use --export to select the entities + in_base = arguments['INPUT_DIR'] + for entity in arguments['--export']: + expected_count = rough_counts['{}s'.format(entity)] + exporter = _exporters[entity]( + in_base, + out_base, + limit=limit, + bz2=bz2_on, + debug=debug, + max_hint=min(expected_count, limit or expected_count), + dry_run=dry_run) + exporter.export() + elif arguments[""] or os.path.isfile(arguments["INPUT_DIR"]): + files = [] + if arguments[""]: + files = arguments[""] + else: + files = [arguments["INPUT_DIR"]] + for in_file in files: + for entity in _exporters: + # discogs files are named discogs_{date}_{entity}s.xml + if f"_{entity}" in in_file: + expected_count = rough_counts['{}s'.format(entity)] + exporter = _exporters[entity]( + in_file, + out_base, + limit=limit, + bz2=bz2_on, + debug=debug, + max_hint=min(expected_count, limit or expected_count), + dry_run=dry_run) + exporter.export() + break diff --git a/run.py b/run.py index 7f88f04..76f7277 100644 --- a/run.py +++ b/run.py @@ -1,15 +1,18 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- """Usage: - run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] INPUT [OUTPUT] [--export=]... + run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] ... + run.py [--bz2] [--dry-run] [--limit=] [--debug] [--apicounts] [--output=] INPUT_DIR [--export=]... Options: --bz2 Compress output files using bz2 compression library. 
- --limit= Limit export to some number of entities - --export= Limit export to some entities (repeatable) + --limit= Limit export to some number of entities (all otherwise) + --export= Limit export to some entities (repeatable). + Entity is one of: artist, label, master, release. --debug Turn on debugging prints --apicounts Check entities counts with Discogs API - --dry-run Do not write + --dry-run Do not write csv files. + --output= Where to write the csv files. Defaults to current dir. """ import sys @@ -20,4 +23,6 @@ if __name__ == '__main__': arguments = docopt(__doc__, version='Discogs-to-SQL exporter') + if arguments["--debug"]: + print(arguments) sys.exit(main(arguments)) diff --git a/tests/test_extract.py b/tests/test_extract.py index af2532e..285c33f 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -76,8 +76,8 @@ def _check_counts(self, entity, tmp_path): # - export=label arguments = { - "INPUT": self._samples_folder, - "OUTPUT": tmp_path, + "INPUT_DIR": self._samples_folder, + "--output": tmp_path, "--export": [entity], "--limit": None, "--bz2": False,