From 0b63c6290ea9673b9ffb1b2b9f8fe59675d23940 Mon Sep 17 00:00:00 2001 From: Thomas Nind Date: Wed, 23 Jun 2021 08:20:11 +0100 Subject: [PATCH 1/3] Added class shell for jsonl reader --- .../Modules/Attachers/JSONLAttacher.cs | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs diff --git a/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs b/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs new file mode 100644 index 0000000000..75cc3e1b2c --- /dev/null +++ b/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs @@ -0,0 +1,59 @@ +using Newtonsoft.Json; +using Rdmp.Core.Curation.Data; +using Rdmp.Core.DataFlowPipeline; +using Rdmp.Core.DataLoad.Engine.Attachers; +using Rdmp.Core.DataLoad.Engine.Job; +using ReusableLibraryCode.Checks; +using ReusableLibraryCode.Progress; +using System.IO; + +namespace Rdmp.Core.DataLoad.Modules.Attachers +{ + public class JsonLAttacher : Attacher + { + [DemandsInitialization("The root table in RAW that will be loaded by this Attacher", Mandatory = true)] + public TableInfo RootTable { get; set; } + + [DemandsInitialization("Pattern to match files in forLoading in. Defaults to *.json", DefaultValue = "*.json", Mandatory = true)] + public string FilePattern { get; set; } = "*.json"; + + [DemandsInitialization(@"Map for sub attributes to tables. Format is MyProp=MyTable,MyProp2=MyTable2. +If not specified then properties must exactly match table names")] + public string AttributeTableMap { get; set; } + + public JsonLAttacher():base(true) + { + + } + + public override ExitCodeType Attach(IDataLoadJob job, GracefulCancellationToken cancellationToken) + { + foreach(var file in LoadDirectory.ForLoading.GetFiles(FilePattern)) + { + using (var sr = new StreamReader(file.FullName)) + { + var jsonReader = new JsonTextReader(sr) + { + SupportMultipleContent = true // This is important! + }; + + while (jsonReader.Read()) + { + // load tables + } + } + } + + + return ExitCodeType.Success; + } + + public override void Check(ICheckNotifier notifier) + { + } + + public override void LoadCompletedSoDispose(ExitCodeType exitCode, IDataLoadEventListener postLoadEventListener) + { + } + } +} From 0879229bc9c8b7d86ecf31b384681e935b3aa4be Mon Sep 17 00:00:00 2001 From: Thomas Nind Date: Wed, 23 Jun 2021 11:05:02 +0100 Subject: [PATCH 2/3] Added test for jsonlattacher (does not pass) --- .../Engine/Integration/JsonLAttacherTests.cs | 125 ++++++++++++++++++ .../Modules/Attachers/JSONLAttacher.cs | 7 +- 2 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs diff --git a/Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs b/Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs new file mode 100644 index 0000000000..6bb9856585 --- /dev/null +++ b/Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs @@ -0,0 +1,125 @@ +using FAnsi; +using FAnsi.Discovery; +using NUnit.Framework; +using Rdmp.Core.Curation; +using Rdmp.Core.Curation.Data; +using Rdmp.Core.DataFlowPipeline; +using Rdmp.Core.DataLoad; +using Rdmp.Core.DataLoad.Engine.Job; +using Rdmp.Core.DataLoad.Modules.Attachers; +using ReusableLibraryCode.Progress; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Tests.Common; +using TypeGuesser; + +namespace Rdmp.Core.Tests.DataLoad.Engine.Integration +{ + class JsonLAttacherTests : DatabaseTests + { + private LoadDirectory LoadDirectory; + DirectoryInfo parentDir; + private DiscoveredDatabase _database; + + [SetUp] + protected override void SetUp() + { + base.SetUp(); + + var workingDir = new DirectoryInfo(TestContext.CurrentContext.TestDirectory); + parentDir = workingDir.CreateSubdirectory("FlatFileAttacherTests"); + + DirectoryInfo toCleanup = parentDir.GetDirectories().SingleOrDefault(d => d.Name.Equals("Test_CSV_Attachment")); + if (toCleanup != null) + toCleanup.Delete(true); + + LoadDirectory = LoadDirectory.CreateDirectoryStructure(parentDir, "JsonLAttacherTests"); + + // create a separate builder for setting an initial catalog on (need to figure out how best to stop child classes changing ServerICan... as this then causes TearDown to fail) + _database = GetCleanedServer(DatabaseType.MicrosoftSQLServer); + + using (var con = _database.Server.GetConnection()) + { + con.Open(); + + var cmdCreateTable = _database.Server.GetCommand("CREATE Table " + _database.GetRuntimeName() + "..Bob([name] [varchar](500),[name2] [varchar](500))", con); + cmdCreateTable.ExecuteNonQuery(); + } + } + + [Test] + public void SimpleJsonl_TwoTables() + { + string json = @" + {""name"": ""Gilbert"", ""wins"": [[""straight"", ""7♣""], [""one pair"", ""10♥""]]} +{""name"": ""Alexa"", ""wins"": [[""two pair"", ""4♠""], [""two pair"", ""9♠""]]} +{ ""name"": ""May"", ""wins"": []} +{ ""name"": ""Deloise"", ""wins"": [[""three of a kind"", ""5♣""]]} +"; + + string filename = Path.Combine(LoadDirectory.ForLoading.FullName, "some.jsonl"); + File.WriteAllText(filename, json); + + var attacher = new JsonLAttacher(); + attacher.Initialize(LoadDirectory, _database); + attacher.FilePattern = "some.jsonl"; + + var player = _database.CreateTable("Player", new[] { + new DatabaseColumnRequest("name", new DatabaseTypeRequest(typeof(string), 10)){IsPrimaryKey = true} + }); + + // TODO: this is not really what the JSON above shows! its an array, maybe instead load this as an array JSON type in the single table? + var wins = _database.CreateTable("Wins", new[] { + new DatabaseColumnRequest("name", new DatabaseTypeRequest(typeof(string), 10)), + new DatabaseColumnRequest("winType", new DatabaseTypeRequest(typeof(string), 10)), + new DatabaseColumnRequest("card", new DatabaseTypeRequest(typeof(string), 2)) + }); + + Import(player, out ITableInfo playerTi,out _); + Import(wins); + + attacher.RootTable = playerTi; + + //other cases (i.e. correct separator) + attacher.Attach(new ThrowImmediatelyDataLoadJob(), new GracefulCancellationToken()); + + Assert.AreEqual(4, player.GetRowCount()); + Assert.AreEqual(5, wins.GetRowCount()); + + using (var con = _database.Server.GetConnection()) + { + con.Open(); + using (var r = _database.Server.GetCommand("Select * from Player", con).ExecuteReader()) + { + Assert.IsTrue(r.Read()); + Assert.AreEqual("Gilbert", r["name"]); + Assert.IsTrue(r.Read()); + Assert.AreEqual("Alexa", r["name"]); + Assert.IsTrue(r.Read()); + Assert.AreEqual("May", r["name"]); + Assert.IsTrue(r.Read()); + Assert.AreEqual("Deloise", r["name"]); + } + + using (var r = _database.Server.GetCommand("Select * from Wins", con).ExecuteReader()) + { + Assert.IsTrue(r.Read()); + Assert.AreEqual("Gilbert", r["name"]); + Assert.AreEqual("straight", r["winType"]); + Assert.AreEqual("7♣", r["card"]); + + // todo the other values + } + } + + attacher.LoadCompletedSoDispose(ExitCodeType.Success, new ThrowImmediatelyDataLoadEventListener()); + + File.Delete(filename); + } + + } +} diff --git a/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs b/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs index 75cc3e1b2c..d5d93d0a61 100644 --- a/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs +++ b/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs @@ -1,4 +1,5 @@ -using Newtonsoft.Json; +using FAnsi.Discovery; +using Newtonsoft.Json; using Rdmp.Core.Curation.Data; using Rdmp.Core.DataFlowPipeline; using Rdmp.Core.DataLoad.Engine.Attachers; @@ -12,7 +13,7 @@ namespace Rdmp.Core.DataLoad.Modules.Attachers public class JsonLAttacher : Attacher { [DemandsInitialization("The root table in RAW that will be loaded by this Attacher", Mandatory = true)] - public TableInfo RootTable { get; set; } + public ITableInfo RootTable { get; set; } [DemandsInitialization("Pattern to match files in forLoading in. Defaults to *.json", DefaultValue = "*.json", Mandatory = true)] public string FilePattern { get; set; } = "*.json"; @@ -39,6 +40,8 @@ public override ExitCodeType Attach(IDataLoadJob job, GracefulCancellationToken while (jsonReader.Read()) { + DiscoveredTable t; + t.Insert() // load tables } } From 4e1dd080ee68ff325d2024af42a371ed80e229c1 Mon Sep 17 00:00:00 2001 From: Thomas Nind Date: Tue, 29 Jun 2021 12:10:20 +0100 Subject: [PATCH 3/3] Fixed test to match simpler expectations --- .../Engine/Integration/JsonLAttacherTests.cs | 34 +++++-------------- .../Modules/Attachers/JSONLAttacher.cs | 32 +++++++++++++++-- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs b/Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs index 6bb9856585..3a75518046 100644 --- a/Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs +++ b/Rdmp.Core.Tests/DataLoad/Engine/Integration/JsonLAttacherTests.cs @@ -37,7 +37,7 @@ protected override void SetUp() if (toCleanup != null) toCleanup.Delete(true); - LoadDirectory = LoadDirectory.CreateDirectoryStructure(parentDir, "JsonLAttacherTests"); + LoadDirectory = LoadDirectory.CreateDirectoryStructure(parentDir, "JsonLAttacherTests",true); // create a separate builder for setting an initial catalog on (need to figure out how best to stop child classes changing ServerICan... as this then causes TearDown to fail) _database = GetCleanedServer(DatabaseType.MicrosoftSQLServer); @@ -52,7 +52,7 @@ protected override void SetUp() } [Test] - public void SimpleJsonl_TwoTables() + public void SimpleJsonl_OneTable() { string json = @" {""name"": ""Gilbert"", ""wins"": [[""straight"", ""7♣""], [""one pair"", ""10♥""]]} @@ -69,50 +69,32 @@ public void SimpleJsonl_TwoTables() attacher.FilePattern = "some.jsonl"; var player = _database.CreateTable("Player", new[] { - new DatabaseColumnRequest("name", new DatabaseTypeRequest(typeof(string), 10)){IsPrimaryKey = true} - }); - - // TODO: this is not really what the JSON above shows! its an array, maybe instead load this as an array JSON type in the single table? - var wins = _database.CreateTable("Wins", new[] { - new DatabaseColumnRequest("name", new DatabaseTypeRequest(typeof(string), 10)), - new DatabaseColumnRequest("winType", new DatabaseTypeRequest(typeof(string), 10)), - new DatabaseColumnRequest("card", new DatabaseTypeRequest(typeof(string), 2)) + new DatabaseColumnRequest("name", new DatabaseTypeRequest(typeof(string), 10)){IsPrimaryKey = true}, + new DatabaseColumnRequest("wins", new DatabaseTypeRequest(typeof(string), int.MaxValue)) }); Import(player, out ITableInfo playerTi,out _); - Import(wins); attacher.RootTable = playerTi; - + //other cases (i.e. correct separator) attacher.Attach(new ThrowImmediatelyDataLoadJob(), new GracefulCancellationToken()); Assert.AreEqual(4, player.GetRowCount()); - Assert.AreEqual(5, wins.GetRowCount()); using (var con = _database.Server.GetConnection()) { con.Open(); - using (var r = _database.Server.GetCommand("Select * from Player", con).ExecuteReader()) + using (var r = _database.Server.GetCommand("Select * from Player order by name", con).ExecuteReader()) { - Assert.IsTrue(r.Read()); - Assert.AreEqual("Gilbert", r["name"]); Assert.IsTrue(r.Read()); Assert.AreEqual("Alexa", r["name"]); Assert.IsTrue(r.Read()); - Assert.AreEqual("May", r["name"]); - Assert.IsTrue(r.Read()); Assert.AreEqual("Deloise", r["name"]); - } - - using (var r = _database.Server.GetCommand("Select * from Wins", con).ExecuteReader()) - { Assert.IsTrue(r.Read()); Assert.AreEqual("Gilbert", r["name"]); - Assert.AreEqual("straight", r["winType"]); - Assert.AreEqual("7♣", r["card"]); - - // todo the other values + Assert.IsTrue(r.Read()); + Assert.AreEqual("May", r["name"]); } } diff --git a/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs b/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs index d5d93d0a61..9be3975378 100644 --- a/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs +++ b/Rdmp.Core/DataLoad/Modules/Attachers/JSONLAttacher.cs @@ -1,11 +1,14 @@ using FAnsi.Discovery; using Newtonsoft.Json; using Rdmp.Core.Curation.Data; +using Rdmp.Core.Curation.Data.DataLoad; using Rdmp.Core.DataFlowPipeline; using Rdmp.Core.DataLoad.Engine.Attachers; using Rdmp.Core.DataLoad.Engine.Job; using ReusableLibraryCode.Checks; using ReusableLibraryCode.Progress; +using System; +using System.Collections.Generic; using System.IO; namespace Rdmp.Core.DataLoad.Modules.Attachers @@ -38,11 +41,34 @@ public override ExitCodeType Attach(IDataLoadJob job, GracefulCancellationToken SupportMultipleContent = true // This is important! }; + Dictionary vals = new Dictionary(); + + var tblName = RootTable.GetRuntimeName(LoadStage.AdjustRaw, job?.Configuration?.DatabaseNamer); + var tbl = _dbInfo.ExpectTable(tblName); + + if(!tbl.Exists()) + { + throw new Exception($"Expected table {tbl.GetFullyQualifiedName()} was not found in RAW databse"); + } + while (jsonReader.Read()) { - DiscoveredTable t; - t.Insert() - // load tables + if(jsonReader.TokenType == JsonToken.PropertyName) + { + vals.Add(jsonReader.Value.ToString(), null); + + } + if(jsonReader.TokenType == JsonToken.String) + { + if(vals.ContainsKey(jsonReader.Path)) + vals[jsonReader.Path] = jsonReader.Value; + } + if (jsonReader.TokenType == JsonToken.EndObject) + { + // load tables + tbl.Insert(vals); + vals.Clear(); + } } } }