-
Notifications
You must be signed in to change notification settings - Fork 76
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #114 from philipmat/dotnet_parser
.NET Core based parser - 2x speedup
- Loading branch information
Showing
32 changed files
with
2,092 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# This workflow restores, builds, tests, and publishes the .NET Core parser, then uploads the per-platform artifacts | ||
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions | ||
|
||
name: DotNet Build | ||
|
||
on: [ push, pull_request ] | ||
# push: | ||
# branches: [ develop ] | ||
# pull_request: | ||
# branches: [ develop ] | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
defaults: | ||
run: | ||
working-directory: ./alternatives/dotnet | ||
|
||
steps: | ||
- uses: actions/checkout@v2 | ||
- name: Setup .NET Core | ||
uses: actions/setup-dotnet@v1 | ||
with: | ||
dotnet-version: 3.1.x | ||
- name: Install dependencies | ||
run: | | ||
pwd | ||
dotnet restore | ||
- name: Build | ||
run: dotnet build --configuration Release --no-restore | ||
- name: Test | ||
run: dotnet test --no-restore --verbosity normal | ||
|
||
- name: Publish | ||
run: | | ||
dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-linux -r linux-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true | ||
dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-osx -r osx-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true | ||
dotnet publish discogs/discogs.csproj -o ./artifacts/discogs-win -r win-x64 --self-contained true -p:PublishSingleFile=true -p:PublishTrimmed=true | ||
- name: Upload build artifacts - linux | ||
uses: actions/upload-artifact@v2 | ||
with: | ||
name: discogsxml2db-linux-x64 | ||
path: /home/runner/work/discogs-xml2db/discogs-xml2db/alternatives/dotnet/artifacts/discogs-linux | ||
- name: Upload build artifacts - macOS | ||
uses: actions/upload-artifact@v2 | ||
with: | ||
name: discogsxml2db-osx-x64 | ||
path: /home/runner/work/discogs-xml2db/discogs-xml2db/alternatives/dotnet/artifacts/discogs-osx | ||
- name: Upload build artifacts - Win | ||
uses: actions/upload-artifact@v2 | ||
with: | ||
name: discogsxml2db-win-x64 | ||
path: /home/runner/work/discogs-xml2db/discogs-xml2db/alternatives/dotnet/artifacts/discogs-win |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
{ | ||
// Use IntelliSense to find out which attributes exist for C# debugging | ||
// Use hover for the description of the existing attributes | ||
// For further information visit https://github.com/OmniSharp/omnisharp-vscode/blob/master/debugger-launchjson.md | ||
"version": "0.2.0", | ||
"configurations": [ | ||
{ | ||
"name": ".NET Core Launch (console)", | ||
"type": "coreclr", | ||
"request": "launch", | ||
"preLaunchTask": "build", | ||
// If you have changed target frameworks, make sure to update the program path. | ||
// "program": "${workspaceFolder}/dotnet.sln", | ||
"program": "${workspaceFolder}/discogs/bin/Debug/netcoreapp3.1/discogs.dll", | ||
"args": ["--verbose", "${input:runOptions}", "${input:testFiles}"], | ||
"cwd": "${workspaceFolder}", | ||
// For more information about the 'console' field, see https://aka.ms/VSCode-CS-LaunchJson-Console | ||
"console": "internalConsole", | ||
"stopAtEntry": false | ||
}, | ||
{ | ||
"name": ".NET Core Attach", | ||
"type": "coreclr", | ||
"request": "attach", | ||
"processId": "${command:pickProcess}" | ||
} | ||
], | ||
"inputs": [ | ||
{ | ||
"id": "runOptions", | ||
"description": "What options", | ||
"type": "pickString", | ||
"options": [ | ||
"", | ||
"--dry-run", | ||
"--gz", | ||
] | ||
}, | ||
{ | ||
"id": "testFiles", | ||
"description": "What file to process?", | ||
"type": "pickString", | ||
"options": [ | ||
"", | ||
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/artist.xml", | ||
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/label.xml", | ||
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/master.xml", | ||
"/Users/af59986/Dev/discogs-xml2db/alternatives/dotnet/tests/Resources/release.xml", | ||
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_artists.xml.gz", | ||
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_labels.xml.gz", | ||
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_masters.xml.gz", | ||
"/Users/af59986/Dev/discogs-xml2db/tests/samples/discogs_20200806_releases.xml.gz", | ||
"/Users/af59986/Dev/tmp/discogs/discogs_20200806_labels.xml.gz", | ||
] | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
{ | ||
"version": "2.0.0", | ||
"tasks": [ | ||
{ | ||
"label": "build", | ||
"command": "dotnet", | ||
"type": "process", | ||
"args": [ | ||
"build", | ||
"${workspaceFolder}/dotnet.sln", | ||
"/property:GenerateFullPaths=true", | ||
"/consoleloggerparameters:NoSummary" | ||
], | ||
"problemMatcher": "$msCompile" | ||
}, | ||
{ | ||
"label": "publish", | ||
"command": "dotnet", | ||
"type": "process", | ||
"args": [ | ||
"publish", | ||
"${workspaceFolder}/discogs/discogs.csproj", | ||
"/property:GenerateFullPaths=true", | ||
"/consoleloggerparameters:NoSummary" | ||
], | ||
"problemMatcher": "$msCompile" | ||
}, | ||
{ | ||
"label": "watch", | ||
"command": "dotnet", | ||
"type": "process", | ||
"args": [ | ||
"watch", | ||
"run", | ||
"${workspaceFolder}/discogs/discogs.csproj", | ||
"/property:GenerateFullPaths=true", | ||
"/consoleloggerparameters:NoSummary" | ||
], | ||
"problemMatcher": "$msCompile" | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Discogs .NET Parser | ||
|
||
This alternative `discogsxml2db` is written in C# and runs on Microsoft .NET Core. | ||
|
||
It provides a significant speedup over the python version: | ||
|
||
| File | Record Count | Python | C# | | ||
| --- | ---: | :---: | :---: | | ||
| discogs_20200806_artists.xml.gz | 7,046,615 | 6:22 | 2:35 | | ||
| discogs_20200806_labels.xml.gz | 1,571,873 | 1:15 | 0:22 | | ||
| discogs_20200806_masters.xml.gz | 1,734,371 | 3:56 | 1:57 | | ||
| discogs_20200806_releases.xml.gz | 12,867,980 | 1:45:16 | 42:38 | | ||
|
||
## Features | ||
|
||
**Done**: | ||
|
||
- parsing all four discogs dumps, both *.xml* and *.xml.gz*; | ||
- exporting to csv and compressed csv. Produces the exact same | ||
files that the Python version does; | ||
- displaying progress of import/export process; | ||
- "dry runs": only parsing the files and displaying counts, | ||
not producing any csv files; | ||
|
||
**TODO**: | ||
|
||
- option to track progress display against the most recently reported | ||
discogs record counts (`--api-counts` argument); | ||
- option to import the resulting csv files into the database; | ||
this process is currently manual or done through the python DB-specific | ||
scripts; | ||
- option to specify the output folder for csv files; | ||
|
||
## Installing | ||
|
||
Unlike the Python version, this version requires no installation. | ||
|
||
Simply download the archive appropriate for your platform. Unzip, | ||
and you should have 2 files: a `discogs` executable (or `discogs.exe` on | ||
Windows) and a `discogs.pdb` support file. | ||
|
||
That's it. | ||
|
||
## Running | ||
|
||
Executing `discogs` without any parameters or passing `--help` will | ||
output a list of available arguments: | ||
|
||
```text | ||
Usage: discogs [options] [files...] | ||
Options: | ||
--dry-run Parse the files, output counts, but don't write any actual files | ||
--verbose More verbose output | ||
--gz Compress output files (gzip) | ||
files... Path to discogs_[date]_[type].xml, or .xml.gz files. | ||
Can specify multiple files. | ||
``` | ||
|
||
To export one or more discogs xml files to csv, simply pass them as parameters | ||
to the executable: `discogs /tmp/discogs_20200806_artists.xml.gz /tmp/discogs_20200806_labels.xml.gz`. | ||
|
||
Currently, the program exports the csv files in the same folder as each of the | ||
original xml files. If you would like the csv files to be compressed to `.csv.gz`, | ||
pass the `--gz` argument: `discogs --gz /tmp/discogs_20200806_artists.xml.gz`. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.IO.Compression; | ||
using System.Linq; | ||
using System.Threading.Tasks; | ||
|
||
namespace discogs | ||
{ | ||
/// <summary>
/// Contract for a disposable sink that receives values of type
/// <typeparamref name="T"/> one at a time and is told when the run is over.
/// </summary>
/// <typeparam name="T">
/// Exported record type; must implement <c>IExportToCsv</c> and expose a
/// public parameterless constructor.
/// </typeparam>
public interface IExporter<T> : IDisposable where T : IExportToCsv, new()
{
    /// <summary>Exports a single value.</summary>
    /// <param name="value">The value to export.</param>
    /// <returns>A task that completes when the value has been handed off.</returns>
    Task ExportAsync(T value);

    /// <summary>Signals that no more values will be exported.</summary>
    /// <param name="finalCount">
    /// The final count supplied by the caller (CsvExporter prints it in its
    /// summary message).
    /// </param>
    /// <returns>A task that completes when the export has been finalized.</returns>
    Task CompleteExportAsync(int finalCount);
}
|
||
/// <summary>
/// Exports values of type <typeparamref name="T"/> to CSV files: one file per
/// stream name returned by <c>T.GetCsvExportScheme()</c>, each starting with the
/// header row the scheme provides for that stream. Files are optionally
/// gzip-compressed.
/// </summary>
/// <typeparam name="T">
/// Record type to export; a throw-away instance is created with <c>new T()</c>
/// to obtain the export scheme.
/// </typeparam>
public class CsvExporter<T> : IExporter<T>
    where T : IExportToCsv, new()
{
    // Buffer size (1 MiB) used for the underlying file stream / writer.
    private const int BufferSize = 1024 * 1024;
    private readonly string _typeName;
    private readonly Dictionary<string, (string FilePath, StreamWriter FileStream)> _csvStreams;
    private bool disposedValue;

    /// <summary>
    /// Creates (or overwrites) the CSV files for <typeparamref name="T"/> in
    /// <paramref name="outPutDirectory"/> and writes their header rows.
    /// </summary>
    /// <param name="outPutDirectory">Directory the files are created in.</param>
    /// <param name="compress">
    /// When true, files are named <c>*.csv.gz</c> and written through a gzip stream;
    /// otherwise they are plain <c>*.csv</c>.
    /// </param>
    /// <param name="verbose">Currently unused; kept so existing callers keep compiling.</param>
    public CsvExporter(string outPutDirectory, bool compress = false, bool verbose = false)
    {
        // Type.Name is already the simple (namespace-less) name, so no splitting is needed.
        _typeName = typeof(T).Name;
        _csvStreams = GetCsvFilesFor(outPutDirectory, compress);
    }

    /// <summary>
    /// Flushes and closes every CSV stream, then prints a one-line summary to the console.
    /// </summary>
    /// <param name="finalCount">Number of records reported in the summary message.</param>
    public async Task CompleteExportAsync(int finalCount)
    {
        var csvFileNames = string.Join("; ", _csvStreams.Select(kvp => kvp.Value.FilePath));
        foreach (var (_, writer) in _csvStreams.Values)
        {
            await writer.FlushAsync();
            writer.Close();
        }
        Console.WriteLine($"Found {finalCount:n0} {_typeName}s. Wrote them to {csvFileNames}.");
    }

    /// <summary>
    /// Writes every row produced by <c>value.ExportToCsv()</c> to the stream it names.
    /// </summary>
    /// <param name="value">The record to export.</param>
    /// <exception cref="KeyNotFoundException">
    /// A row names a stream that is not part of the export scheme.
    /// </exception>
    public async Task ExportAsync(T value)
    {
        IEnumerable<(string StreamName, string[] Row)> csvExports = value.ExportToCsv();
        foreach (var (streamName, row) in csvExports)
        {
            await _csvStreams[streamName].FileStream.WriteLineAsync(CsvExtensions.ToCsv(row));
        }
    }

    /// <summary>
    /// Opens one writer per stream in the export scheme and writes its header row.
    /// If anything fails part-way, the writers opened so far are disposed before
    /// the exception is rethrown, so no file handles are leaked.
    /// </summary>
    private static Dictionary<string, (string FilePath, StreamWriter FileStream)> GetCsvFilesFor(string outPutDirectory, bool compress)
    {
        IReadOnlyDictionary<string, string[]> files = new T().GetCsvExportScheme();
        var extension = compress ? "csv.gz" : "csv";
        var csvFiles = new Dictionary<string, (string FilePath, StreamWriter FileStream)>(files.Count);

        try
        {
            foreach (var kvp in files)
            {
                var csvFile = Path.Combine(outPutDirectory, $"{kvp.Key}.{extension}");
                var writer = OpenWriter(csvFile, compress);
                // Register before writing so the writer is cleaned up if the header write fails.
                csvFiles.Add(kvp.Key, (csvFile, writer));
                writer.WriteLine(CsvExtensions.ToCsv(kvp.Value));
            }
        }
        catch
        {
            foreach (var (_, writer) in csvFiles.Values)
            {
                try
                {
                    writer.Dispose();
                }
                catch (IOException)
                {
                    // Best-effort cleanup: the original failure is the one rethrown below.
                }
            }
            throw;
        }

        return csvFiles;
    }

    /// <summary>
    /// Creates the UTF-8 writer for a single CSV file, gzip-wrapped when requested.
    /// </summary>
    private static StreamWriter OpenWriter(string csvFile, bool compress)
    {
        if (!compress)
        {
            return new StreamWriter(csvFile, append: false, encoding: System.Text.Encoding.UTF8, bufferSize: BufferSize);
        }

        var fs = File.Create(csvFile, bufferSize: BufferSize);
        try
        {
            // leaveOpen: false => disposing the writer disposes the gzip stream and the file.
            var gzStream = new GZipStream(fs, CompressionMode.Compress, leaveOpen: false);
            return new StreamWriter(gzStream, encoding: System.Text.Encoding.UTF8);
        }
        catch
        {
            // Nothing owns the file stream yet, so release it here.
            fs.Dispose();
            throw;
        }
    }

    /// <summary>Disposes all CSV writers owned by this exporter.</summary>
    public void Dispose()
    {
        // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the writers on the first call with <paramref name="disposing"/> true;
    /// subsequent calls are no-ops.
    /// </summary>
    /// <param name="disposing">True when called from <see cref="Dispose()"/>.</param>
    protected virtual void Dispose(bool disposing)
    {
        if (disposedValue)
        {
            return;
        }

        if (disposing)
        {
            // Safe even after CompleteExportAsync closed them: StreamWriter.Dispose is idempotent.
            foreach (var (_, stream) in _csvStreams.Values)
            {
                stream.Dispose();
            }
        }

        disposedValue = true;
    }
}
} |
Oops, something went wrong.