Skip to content

Commit

Permalink
Merge pull request #120 from philipmat/112_specify_files_directly
Browse files Browse the repository at this point in the history
Allows specifying dump files individually
  • Loading branch information
philipmat authored Sep 8, 2020
2 parents cd0929a + 654a7de commit 3ba6ba9
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 23 deletions.
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,23 +86,41 @@ $ sha256sum -c discogs_*_CHECKSUM.txt

Run `run.py` to convert the dump files to csv.

There are two run modes:

1. You can point it to a directory where the discogs dump files are
and use one or multiple `--export` options to indicate which files to process:

```sh
# ensure the virtual environment is active
(.discogsenv) $ python3 run.py \
--bz2 \ # compresses resulting csv files
--apicounts \ # provides more accurate progress counts
--export artist --export label --export master --export release \
--output csv-dir \ # folder where to output the csv files
dump-dir \ # folder where the data dumps are
csv-dir # folder where to output the csv files
```

2. You can specify the individual files instead:

```sh
# ensure the virtual environment is active
(.discogsenv) $ python3 run.py \
--bz2 \ # compresses resulting csv files
--apicounts \ # provides more accurate progress counts
--output csv-dir \ # folder where to output the csv files
path/to/discogs_20200806_artist.xml.gz path/to/discogs_20200806_labels.xml.gz
```

`run.py` takes the following arguments:

- `--export`: the types of dump files to export: "artist", "label", "master", "release".
It matches the names of the dump files, e.g. "discogs_20200806_*artist*s.xml.gz".
Not needed if the individual files are specified.
- `--bz2`: Compresses output csv files using bz2 compression library.
- `--limit=<lines>`: Limits export to some number of entities
- `--apicounts`: Makes progress report more accurate by getting total amounts from Discogs API.
- `--output`: the folder where to store the csv files; defaults to the current directory.

The exporter provides progress information in real time:

Expand Down
58 changes: 42 additions & 16 deletions discogsxml2db/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,19 @@ def _write_rows(writer, entity, name):
class EntityCsvExporter(object):
"""Read a Discogs dump XML file and exports SQL table records as CSV.
"""
def __init__(self, entity, in_dir, out_dir,
def __init__(self, entity, in_file_or_dir, out_dir,
limit=None, bz2=True,
dry_run=False, debug=False, max_hint=None, verbose=False):
self.entity = entity
self.parser = _parsers[entity]()
self.max_hint = max_hint
self.verbose = verbose

lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity)
self.pattern = os.path.join(in_dir, lookup)
if os.path.isfile(in_file_or_dir):
self.pattern = in_file_or_dir
else:
lookup = 'discogs_[0-9]*_{}s.xml*'.format(entity)
self.pattern = os.path.join(in_file_or_dir, lookup)

# where and how the exporter will write to
self.out_dir = out_dir
Expand Down Expand Up @@ -287,8 +290,7 @@ def write_track_artists(self, writer, release):


def main(arguments):
in_base = arguments['INPUT']
out_base = arguments['OUTPUT'] or '.'
out_base = arguments['--output'] or '.'
limit = int(arguments['--limit']) if arguments['--limit'] else None
bz2_on = arguments['--bz2']
debug = arguments['--debug']
Expand All @@ -312,14 +314,38 @@ def main(arguments):
except Exception:
pass

for entity in arguments['--export']:
expected_count = rough_counts['{}s'.format(entity)]
exporter = _exporters[entity](
in_base,
out_base,
limit=limit,
bz2=bz2_on,
debug=debug,
max_hint=min(expected_count, limit or expected_count),
dry_run=dry_run)
exporter.export()
if arguments["INPUT_DIR"] and os.path.isdir(arguments["INPUT_DIR"]):
# use --export to select the entities
in_base = arguments['INPUT_DIR']
for entity in arguments['--export']:
expected_count = rough_counts['{}s'.format(entity)]
exporter = _exporters[entity](
in_base,
out_base,
limit=limit,
bz2=bz2_on,
debug=debug,
max_hint=min(expected_count, limit or expected_count),
dry_run=dry_run)
exporter.export()
elif arguments["<INPUT_FILE>"] or os.path.isfile(arguments["INPUT_DIR"]):
files = []
if arguments["<INPUT_FILE>"]:
files = arguments["<INPUT_FILE>"]
else:
files = [arguments["INPUT_DIR"]]
for in_file in files:
for entity in _exporters:
# discogs files are named discogs_{date}_{entity}s.xml
if f"_{entity}" in in_file:
expected_count = rough_counts['{}s'.format(entity)]
exporter = _exporters[entity](
in_file,
out_base,
limit=limit,
bz2=bz2_on,
debug=debug,
max_hint=min(expected_count, limit or expected_count),
dry_run=dry_run)
exporter.export()
break
13 changes: 9 additions & 4 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Usage:
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] INPUT [OUTPUT] [--export=<entity>]...
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] [--output=<dir>] <INPUT_FILE> <INPUT_FILE>...
run.py [--bz2] [--dry-run] [--limit=<lines>] [--debug] [--apicounts] [--output=<dir>] INPUT_DIR [--export=<entity>]...
Options:
--bz2 Compress output files using bz2 compression library.
--limit=<lines> Limit export to some number of entities
--export=<entity> Limit export to some entities (repeatable)
--limit=<lines> Limit export to some number of entities (all otherwise)
--export=<entity> Limit export to some entities (repeatable).
Entity is one of: artist, label, master, release.
--debug Turn on debugging prints
--apicounts Check entities counts with Discogs API
--dry-run Do not write
--dry-run Do not write csv files.
--output=<dir> Where to write the csv files. Defaults to current dir.
"""
import sys
Expand All @@ -20,4 +23,6 @@

if __name__ == '__main__':
arguments = docopt(__doc__, version='Discogs-to-SQL exporter')
if arguments["--debug"]:
print(arguments)
sys.exit(main(arguments))
4 changes: 2 additions & 2 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ def _check_counts(self, entity, tmp_path):
# - export=label

arguments = {
"INPUT": self._samples_folder,
"OUTPUT": tmp_path,
"INPUT_DIR": self._samples_folder,
"--output": tmp_path,
"--export": [entity],
"--limit": None,
"--bz2": False,
Expand Down

0 comments on commit 3ba6ba9

Please sign in to comment.