Skip to content

Commit

Permalink
Merge pull request #114 from philipmat/dotnet_parser
Browse files Browse the repository at this point in the history
.NET Core based parser - 2x speedup
  • Loading branch information
philipmat authored Sep 8, 2020
2 parents c765fb4 + 2632f89 commit cd0929a
Show file tree
Hide file tree
Showing 32 changed files with 2,092 additions and 7 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/dotnet.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# This workflow restores, builds, tests, and publishes the .NET Core parser, then uploads the published binaries as build artifacts
# For more information see: https://docs.github.com/actions

name: DotNet Build

on: [ push, pull_request ]
#  push:
#    branches: [ develop ]
#  pull_request:
#    branches: [ develop ]

jobs:
  build:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Applies to `run:` steps only; `uses:` steps (e.g. upload-artifact) are not affected.
        working-directory: ./alternatives/dotnet

    steps:
    - uses: actions/checkout@v2
    - name: Setup .NET Core
      uses: actions/setup-dotnet@v1
      with:
        dotnet-version: 3.1.x
    - name: Install dependencies
      run: |
        pwd
        dotnet restore
    - name: Build
      run: dotnet build --configuration Release --no-restore
    - name: Test
      run: dotnet test --no-restore --verbosity normal

    # Publishes one self-contained, single-file build per target runtime
    # into ./artifacts/ (relative to the working directory above).
    - name: Publish
      run: |
        dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-linux -r linux-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true
        dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-osx -r osx-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true
        dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-win -r win-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true

    # Artifact paths are built from github.workspace instead of a hard-coded
    # /home/runner/work/<repo>/<repo> prefix so the uploads keep working in
    # forks and after a repository rename.
    - name: Upload build artifacts - linux
      uses: actions/upload-artifact@v2
      with:
        name: discogsxml2db-linux-x64
        path: ${{ github.workspace }}/alternatives/dotnet/artifacts/discogs-linux
    - name: Upload build artifacts - macOS
      uses: actions/upload-artifact@v2
      with:
        name: discogsxml2db-osx-x64
        path: ${{ github.workspace }}/alternatives/dotnet/artifacts/discogs-osx
    - name: Upload build artifacts - Win
      uses: actions/upload-artifact@v2
      with:
        name: discogsxml2db-win-x64
        path: ${{ github.workspace }}/alternatives/dotnet/artifacts/discogs-win
28 changes: 26 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# discogs-xml2db v2.0
# discogs-xml2db v2

discogs-xml2db is a python program for importing [discogs data dumps](https://data.discogs.com/)
into several databases.

Version 2.0 is a rewrite of the original *discogs-xml2db*
Version 2 is a rewrite of the original *discogs-xml2db*
(referred in here as the *classic* version).
It is based on a [branch by RedApple](https://github.com/redapple/discogs-xml2db)
and it is several times faster.
Expand All @@ -12,6 +12,30 @@ Currently supports MySQL and PostgreSQL as target databases.
Instructions for importing into MongoDB, though these are untested.
Let us know how it goes!

## Experimental version

In parallel to the original Python codebase, we're working on a parser/exporter
that's even faster. This is a complete rewrite in C# and initial results are highly
promising:

| File | Record Count | Python | C# |
| --- | ---: | :---: | :---: |
| discogs_20200806_artists.xml.gz | 7,046,615 | 6:22 | 2:35 |
| discogs_20200806_labels.xml.gz | 1,571,873 | 1:15 | 0:22 |
| discogs_20200806_masters.xml.gz | 1,734,371 | 3:56 | 1:57 |
| discogs_20200806_releases.xml.gz | 12,867,980 | 1:45:16 | 42:38 |

If you're interested in testing this version, read more about it
in the [.NET Parser README](./alternatives/dotnet/README.md) or grab
the appropriate binaries from the
[Releases page](https://github.com/philipmat/discogs-xml2db/releases).

While this version does not yet have complete feature parity with the Python
version, the core export-to-csv is there, and it will likely replace the
Python version eventually.

![DotNet Build](https://github.com/philipmat/discogs-xml2db/workflows/DotNet%20Build/badge.svg)

## Running discogs-xml2db

![Build Status - develop](https://github.com/philipmat/discogs-xml2db/workflows/Python%20build%20check/badge.svg?branch=develop)
Expand Down
57 changes: 57 additions & 0 deletions alternatives/dotnet/.vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
// VS Code debug configurations for the discogs .NET parser (launches discogs.dll or attaches to a running process).
// Use IntelliSense to find out which attributes exist for C# debugging
// Use hover for the description of the existing attributes
// For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md
"version": "0.2.0",
"configurations": [
{
"name": ".NET Core Launch (console)",
"type": "coreclr",
"request": "launch",
// Runs the "build" task before launching.
"preLaunchTask": "build",
// If you have changed target frameworks, make sure to update the program path.
// "program": "${workspaceFolder}/dotnet.sln",
"program": "${workspaceFolder}/discogs/bin/Debug/netcoreapp3.1/discogs.dll",
// "--verbose" is always passed; the run option and the input file are picked
// at launch time from the "runOptions" and "testFiles" lists under "inputs".
"args": ["--verbose", "${input:runOptions}", "${input:testFiles}"],
"cwd": "${workspaceFolder}",
// For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console
"console": "internalConsole",
"stopAtEntry": false
},
{
"name": ".NET Core Attach",
"type": "coreclr",
"request": "attach",
"processId": "${command:pickProcess}"
}
],
"inputs": [
{
// Command-line option offered at launch; the empty entry passes an empty argument.
"id": "runOptions",
"description": "What options",
"type": "pickString",
"options": [
"",
"--dry-run",
"--gz",
]
},
{
// Input file offered at launch.
// NOTE(review): every non-empty entry is an absolute path under /Users/af59986/,
// i.e. specific to one developer's machine - adjust to your local checkout before use.
"id": "testFiles",
"description": "What file to process?",
"type": "pickString",
"options": [
"",
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/artist.xml",
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/label.xml",
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/master.xml",
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/release.xml",
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_artists.xml.gz",
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_labels.xml.gz",
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_masters.xml.gz",
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_releases.xml.gz",
"/Users/af59986/Dev/tmp/discogs/discogs_20200806_labels.xml.gz",
]
}
]
}
42 changes: 42 additions & 0 deletions alternatives/dotnet/.vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "build",
"command": "dotnet",
"type": "process",
"args": [
"build",
"${workspaceFolder}/dotnet.sln",
"/property:GenerateFullPaths=true",
"/consoleloggerparameters:NoSummary"
],
"problemMatcher": "$msCompile"
},
{
"label": "publish",
"command": "dotnet",
"type": "process",
"args": [
"publish",
"${workspaceFolder}/discogs/discogs.csproj",
"/property:GenerateFullPaths=true",
"/consoleloggerparameters:NoSummary"
],
"problemMatcher": "$msCompile"
},
{
"label": "watch",
"command": "dotnet",
"type": "process",
"args": [
"watch",
"run",
"${workspaceFolder}/discogs/discogs.csproj",
"/property:GenerateFullPaths=true",
"/consoleloggerparameters:NoSummary"
],
"problemMatcher": "$msCompile"
}
]
}
66 changes: 66 additions & 0 deletions alternatives/dotnet/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Discogs .NET Parser

This alternative `discogsxml2db` is written in C# and runs on Microsoft .NET Core.

It provides a significant speedup over the python version:

| File | Record Count | Python | C# |
| --- | ---: | :---: | :---: |
| discogs_20200806_artists.xml.gz | 7,046,615 | 6:22 | 2:35 |
| discogs_20200806_labels.xml.gz | 1,571,873 | 1:15 | 0:22 |
| discogs_20200806_masters.xml.gz | 1,734,371 | 3:56 | 1:57 |
| discogs_20200806_releases.xml.gz | 12,867,980 | 1:45:16 | 42:38 |

## Features

**Done**:

- parsing all four discogs dumps, both *.xml* and *.xml.gz*;
- exporting to csv and compressed csv. Produces the exact same
files that the Python version does;
- displaying progress of import/export process;
- "dry runs": only parsing the files and displaying counts,
not producing any csv files;

**TODO**:

- option to track progress display against the most recently reported
discogs record counts (`--api-counts` argument);
- option to import the resulting csv files into the database;
this process is currently manual or done through the python DB-specific
scripts;
- option to specify the output folder for csv files;

## Installing

Unlike the Python version, this version requires no installation.

Simply download the archive appropriate for your platform. Unzip,
and you should have 2 files: a `discogs` executable (or `discogs.exe` on
Windows) and a "discogs.pdb" support file.

That's it.

## Running

Executing `discogs` without any parameters or passing `--help` will
output a list of available arguments:

```text
Usage: discogs [options] [files...]
Options:
--dry-run Parse the files, output counts, but don't write any actual files
--verbose More verbose output
--gz Compress output files (gzip)
files... Path to discogs_[date]_[type].xml, or .xml.gz files.
Can specify multiple files.
```

To export one or more discogs xml files to csv, simply pass them as parameters
to the executable: `discogs /tmp/discogs_20200806_artists.xml.gz /tmp/discogs_20200806_labels.xml.gz`.

Currently, the program exports the csv files in the same folder as each of the
original xml files. If you would like the csv files to be compressed to `.csv.gz`,
pass the `--gz` argument: `discogs --gz /tmp/discogs_20200806_artists.xml.gz`.
114 changes: 114 additions & 0 deletions alternatives/dotnet/discogs/CsvExporter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Threading.Tasks;

namespace discogs
{
/// <summary>
/// Sink for parsed records of type <typeparamref name="T"/>.
/// Implementations own disposable resources and must be disposed by the caller.
/// </summary>
/// <typeparam name="T">
/// Record type being exported; it must be CSV-exportable and have a public
/// parameterless constructor.
/// </typeparam>
public interface IExporter<T> : IDisposable where T : IExportToCsv, new()
{
    /// <summary>
    /// Signals that no more records will be exported.
    /// </summary>
    /// <param name="finalCount">Total number of records that were processed.</param>
    /// <returns>A task that completes once the exporter has finished.</returns>
    Task CompleteExportAsync(int finalCount);

    /// <summary>
    /// Exports a single record.
    /// </summary>
    /// <param name="value">The record to export.</param>
    /// <returns>A task that completes once the record has been handed off.</returns>
    Task ExportAsync(T value);
}

/// <summary>
/// <see cref="IExporter{T}"/> that writes records to CSV files, optionally gzip-compressed.
/// One output file is created per entry in the scheme returned by
/// <c>T.GetCsvExportScheme()</c>; the entry's key becomes the file name and its
/// value is written as the first line of the file.
/// </summary>
/// <typeparam name="T">Record type being exported.</typeparam>
public class CsvExporter<T> : IExporter<T>
    where T : IExportToCsv, new()
{
    /// <summary>Buffer size (1 MiB) used for the underlying file streams.</summary>
    private const int BufferSize = 1024 * 1024;

    private readonly string _typeName;

    /// <summary>Open writers keyed by stream name, together with the path each one writes to.</summary>
    private readonly Dictionary<string, (string FilePath, StreamWriter FileStream)> _csvStreams;

    private bool disposedValue;

    /// <summary>
    /// Creates the output files in <paramref name="outPutDirectory"/> and writes their first line.
    /// </summary>
    /// <param name="outPutDirectory">Directory in which the CSV files are created.</param>
    /// <param name="compress">When <c>true</c>, files are gzip-compressed and named <c>*.csv.gz</c>.</param>
    /// <param name="verbose">Currently unused; kept for call-site compatibility.</param>
    public CsvExporter(string outPutDirectory, bool compress = false, bool verbose = false)
    {
        _typeName = typeof(T).Name.Split('.')[^1];
        _csvStreams = GetCsvFilesFor(outPutDirectory, compress);
    }

    /// <summary>
    /// Flushes and closes every output file, then reports the record count and
    /// the list of written files on the console.
    /// </summary>
    /// <param name="finalCount">Number of records to report.</param>
    public async Task CompleteExportAsync(int finalCount)
    {
        var csvFileNames = string.Join("; ", _csvStreams.Select(kvp => kvp.Value.FilePath));
        foreach (var kvp in _csvStreams)
        {
            await kvp.Value.FileStream.FlushAsync();
            kvp.Value.FileStream.Close();
        }
        Console.WriteLine($"Found {finalCount:n0} {_typeName}s. Wrote them to {csvFileNames}.");
    }

    /// <summary>
    /// Writes every row produced by <paramref name="value"/> to the file of the stream it names.
    /// </summary>
    /// <param name="value">Record whose rows are written.</param>
    /// <exception cref="KeyNotFoundException">
    /// A row names a stream that is not part of the export scheme.
    /// </exception>
    public async Task ExportAsync(T value)
    {
        IEnumerable<(string StreamName, string[] Row)> csvExports = value.ExportToCsv();
        foreach (var (streamName, row) in csvExports)
        {
            await _csvStreams[streamName].FileStream.WriteLineAsync(CsvExtensions.ToCsv(row));
        }
    }

    /// <summary>
    /// Opens one writer per entry of the export scheme and writes the scheme's
    /// value for that entry as the first line.
    /// </summary>
    /// <remarks>
    /// If any file cannot be opened or written, every writer opened so far is
    /// disposed before the exception propagates. The constructor is the only
    /// caller, so nobody else would ever get the chance to release them.
    /// </remarks>
    private static Dictionary<string, (string FilePath, StreamWriter FileStream)> GetCsvFilesFor(string outPutDirectory, bool compress)
    {
        var obj = new T();
        IReadOnlyDictionary<string, string[]> files = obj.GetCsvExportScheme();
        var extension = compress ? "csv.gz" : "csv";
        var csvFiles = new Dictionary<string, (string FilePath, StreamWriter FileStream)>(files.Count);

        try
        {
            foreach (var kvp in files)
            {
                var csvFile = Path.Combine(outPutDirectory, $"{kvp.Key}.{extension}");
                StreamWriter stream = OpenCsvWriter(csvFile, compress);

                // Register the writer before touching it so the cleanup below
                // also covers a failing first-line write.
                csvFiles.Add(kvp.Key, (csvFile, stream));
                stream.WriteLine(CsvExtensions.ToCsv(kvp.Value));
            }
        }
        catch
        {
            foreach (var (_, stream) in csvFiles.Values)
            {
                stream.Dispose();
            }
            throw;
        }

        return csvFiles;
    }

    /// <summary>
    /// Opens a UTF-8 writer for <paramref name="csvFile"/>, wrapping it in a gzip
    /// stream when <paramref name="compress"/> is set.
    /// </summary>
    private static StreamWriter OpenCsvWriter(string csvFile, bool compress)
    {
        if (!compress)
        {
            return new StreamWriter(csvFile, append: false, encoding: System.Text.Encoding.UTF8, bufferSize: BufferSize);
        }

        FileStream fs = File.Create(csvFile, bufferSize: BufferSize);
        try
        {
            // leaveOpen: false => disposing the writer disposes the gzip stream and the file.
            var gzStream = new GZipStream(fs, CompressionMode.Compress, leaveOpen: false);
            return new StreamWriter(gzStream, encoding: System.Text.Encoding.UTF8);
        }
        catch
        {
            // Nothing owns the file handle yet, so release it here.
            fs.Dispose();
            throw;
        }
    }

    /// <summary>Disposes all open writers.</summary>
    public void Dispose()
    {
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the writers held by this exporter. Safe to call more than once.
    /// </summary>
    /// <param name="disposing">
    /// <c>true</c> when called from <see cref="Dispose()"/>; managed writers are
    /// only disposed in that case.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        if (!disposedValue)
        {
            if (disposing)
            {
                foreach (var kvp in _csvStreams)
                {
                    var (_, stream) = kvp.Value;
                    stream.Dispose();
                }
            }

            disposedValue = true;
        }
    }
}
}
Loading

0 comments on commit cd0929a

Please sign in to comment.