diff --git a/.github/workflows/full.yml b/.github/workflows/full.yml index 7505211f..0751f2b0 100644 --- a/.github/workflows/full.yml +++ b/.github/workflows/full.yml @@ -1,8 +1,8 @@ name: 'Full Workflow' env: - VERSION: 4.5.4 - ASM_VERSION: 4.0.0 + VERSION: 4.6.0 + ASM_VERSION: 4.6.0 on: push: @@ -32,39 +32,6 @@ jobs: - name: 'test on ${{ matrix.os }}' run: dotnet test src/Parquet.sln -c release - #run-benchmarks: - # runs-on: ${{ matrix.os }} - # strategy: - # matrix: - # os: [ubuntu-latest, windows-latest, macos-latest] - # fail-fast: false - # steps: - # - uses: actions/checkout@v3 - # - name: Setup .NET - # uses: actions/setup-dotnet@v3 - # with: - # dotnet-version: | - # 3.1.x - # 6.0.x - # 7.0.x - # - name: 'Write Performance' - # run: dotnet run -c release -- write - # working-directory: src/Parquet.PerfRunner - # - name: 'Prep' - # run: mv results ${{ matrix.os }} - # working-directory: src/Parquet.PerfRunner/BenchmarkDotNet.Artifacts - - # - name: debug - # run: ls -R - # working-directory: src/Parquet.PerfRunner - - # - uses: actions/upload-artifact@v3 - # name: Collect Results - # with: - # name: benchmarks - # path: | - # src/Parquet.PerfRunner/BenchmarkDotNet.Artifacts/${{ matrix.os }}/ - build: diff --git a/docs/README.md b/docs/README.md index 89373a8c..5932f697 100644 --- a/docs/README.md +++ b/docs/README.md @@ -7,15 +7,22 @@ **Fully portable, managed** .NET library to 📖read and ✍️write [Apache Parquet](https://parquet.apache.org/) files. Targets `.NET 7`, `.NET 6.0`, `.NET Core 3.1`, `.NET Standard 2.1` and `.NET Standard 2.0`. -Runs everywhere .NET runs Linux, MacOS, Windows, iOS, Android, Tizen, Xbox, PS4, Raspberry Pi, Samsung TVs and much more. +Whether you want to build apps for Linux, MacOS, Windows, iOS, Android, Tizen, Xbox, PS4, Raspberry Pi, Samsung TVs or much more, Parquet.NET has you covered. -## Quick Start +## Why + +Parquet is a great format for storing and processing large amounts of data, but it can be tricky to use with .NET. 
That's why this library is here to help. It's a pure library that doesn't need any external dependencies, and it's super fast - faster than Python and Java, and other C# solutions. It's also native to .NET, so you don't have to deal with any wrappers or adapters that might slow you down or limit your options. + +This library is the best option for parquet files in .NET. It has a simple and intuitive API, supports all the parquet features you need, and handles complex scenarios with ease. -Why should I use this? I think you shouldn't. Go away and look at better alternatives, like [PyArrow](https://arrow.apache.org/docs/python/) that does it much better in Python. Also I'd rather you use [Apache Spark](https://spark.apache.org/) with native support for Parquet and other commercial alternatives. Seriously. Comparing to those, this library is just pure shite, developed in spare time by one person. Despite that, it's a de facto standard for .NET when it comes to reading and writing Parquet files. Why? Because: +Also it: -- It has zero dependencies - pure library that just works. -- It's really fast. Faster than Python and Java implementations. -- It's .NET native. Designed to utilise .NET and made for .NET developers. +- Has zero dependencies - pure library that just works. +- Really fast. Faster than Python and Java, and alternative C# implementations out there. It's often even faster than native C++ implementations. +- .NET native. Designed to utilise .NET and made for .NET developers, not the other way around. +- Not a "wrapper" that forces you to fit in. It's the other way around - forces parquet to fit into .NET. + +## Quick Start Parquet is designed to handle *complex data in bulk*. It's *column-oriented* meaning that data is physically stored in columns rather than rows. This is very important for big data systems if you want to process only a subset of columns - reading just the right columns is extremely efficient. 
@@ -54,7 +61,7 @@ var data = Enumerable.Range(0, 1_000_000).Select(i => new Record { Now, to write these to a file in say `/mnt/storage/data.parquet` you can use the following **line** of code: ```csharp -await ParquetConvert.SerializeAsync(data, "/mnt/storage/data.parquet"); +await ParquetSerializer.SerializeAsync(data, "/mnt/storage/data.parquet"); ``` That's pretty much it! You can [customise many things](serialisation.md) in addition to the magical magic process, but if you are a really lazy person that will do just fine for today. @@ -134,7 +141,6 @@ using(Stream fs = System.IO.File.OpenWrite("/mnt/storage/data.parquet")) { await groupWriter.WriteColumnAsync(column1); await groupWriter.WriteColumnAsync(column2); await groupWriter.WriteColumnAsync(column3); - } } } @@ -147,7 +153,7 @@ What's going on?:?: 3. Row group is like a data partition inside the file. In this example we have just one, but you can create more if there are too many values that are hard to fit in computer memory. 4. Three calls to row group writer write out the columns. Note that those are performed sequentially, and in the same order as schema defines them. -Read more on writing [here](writing.md). +Read more on writing [here](writing.md) which also includes guides on writing [nested types](nested_types.md) such as lists, maps, and structs. ### 📖Reading Data @@ -158,7 +164,7 @@ Reading data also has three different approaches, so I'm going to unwrap them he Provided that you have written the data, or just have some external data with the same structure as above, you can read those by simply doing the following: ```csharp -Record[] data2 = await ParquetConvert.DeserializeAsync("/mnt/storage/data.parquet"); +IList data = await ParquetSerializer.DeserializeAsync("/mnt/storage/data.parquet"); ``` This will give us an array with one million class instances similar to this: @@ -216,15 +222,25 @@ This is what's happening: If you have a choice, then the choice is easy - use Low Level API. 
They are the fastest and the most flexible. But what if you for some reason don't have a choice? Then think about this: -| Feature | 🚤Class Serialisation | 🌛Table API | ⚙️Low Level API | -| --------------------- | ---------------------- | ---------------- | ---------------- | -| Performance | high | very low | very high | -| Developer Convenience | feels like C# (great!) | feels like Excel | close to Parquet | -| Row based access | easy | easy | hard | -| Column based access | hard | hard | easy | +| Feature | 🚤Class Serialisation | 🌛Table API | ⚙️Low Level API | +| --------------------- | -------------------- | ---------------- | -------------------------- | +| Performance | high | very low | very high | +| Developer Convenience | C# native | feels like Excel | close to Parquet internals | +| Row based access | easy | easy | hard | +| Column based access | C# native | hard | easy | ## Contributing -Any contributions are welcome, in any form. Documentation, code, tests, donations or anything else. I don't like processes so anything goes. If you happen to get interested in parquet development, there are some [interesting links](parquet-getting-started-md). +Any contributions are welcome, in any form. Documentation, code, tests, donations or anything else. I don't like processes so anything goes. If you happen to get interested in parquet development, there are some [interesting links](parquet-getting-started.md). + +## Special Thanks + +Without these tools development would be really painful. + +- [Visual Studio Community](https://visualstudio.microsoft.com/vs/community/) - free IDE from Microsoft. The best in class C# and C++ development tool. It's worth using Windows just because Visual Studio exists there. +- [JetBrains Rider](https://www.jetbrains.com/rider/) - for their cross-platform C# IDE, which has some great features. +- [IntelliJ IDEA](https://www.jetbrains.com/idea/) - the best Python, Scala and Java IDE. 
+- [LINQPad](https://www.linqpad.net/) - extremely powerful C# REPL with unique visualisation features, IL decompiler, expression tree visualiser, benchmarking, charting and so on. Again it's worth having Windows just for this tool. Please support the author and purchase it. +- [Benchmarkdotnet](https://benchmarkdotnet.org/) - the best cross-platform tool that can microbenchmark C# code. This library is faster than native ones only thanks for this. diff --git a/docs/complex-types.md b/docs/complex-types.md deleted file mode 100644 index 7a5c71c7..00000000 --- a/docs/complex-types.md +++ /dev/null @@ -1,43 +0,0 @@ -# Complex Types - -Please read [getting started with Parquet](parquet-getting-started.md) to better understand Parquet internals. - -## Arrays - -Arrays *aka repeatable fields* is a basis for understanding how more complex data structures work in Parquet. - -`DataColumn` in Parquet can contain not just a single but multiple values. Sometimes they are called repeated fields (because the data type value repeats) or arrays. In order to create a schema for a repeatable field, let's say of type `int` you could use one of two forms: - -```csharp -var field = new DataField>("items"); -``` -To check if the field is repeated you can always test `.IsArray` Boolean flag. - -Parquet columns are flat, so in order to store an array in the array which can only keep simple elements and not other arrays, you would *flatten* them. For instance to store two elements: - -- `[1, 2, 3]` -- `[4, 5]` - -in a flat array, it will look like `[1, 2, 3, 4, 5]`. And that's exactly how parquet stores them. Now, the problem starts when you want to read the values back. Is this `[1, 2]` and `[3, 4, 5]` or `[1]` and `[2, 3, 4, 5]`? There's no way to know without an extra information. Therefore, parquet also stores that extra information an an extra column per data column, which is called *repetition levels*. 
In the previous example, our array of arrays will expand into the following two columns: - -| # | Data Column | Repetition Levels Column | -| ---- | ----------- | ------------------------ | -| 0 | 1 | 0 | -| 1 | 2 | 1 | -| 2 | 3 | 1 | -| 3 | 4 | 0 | -| 4 | 5 | 1 | - -In other words - it is the level at which we have to create a new list for the current value. In other words, the repetition level can be seen as a marker of when to start a new list and at which level. - -To represent this in C# code: - -```csharp -var field = new DataField>("items"); -var column = new DataColumn( - field, - new int[] { 1, 2, 3, 4, 5 }, - new int[] { 0, 1, 1, 0, 1 }); -``` - - diff --git a/docs/img/struct-path.png b/docs/img/struct-path.png new file mode 100644 index 00000000..e66a746c Binary files /dev/null and b/docs/img/struct-path.png differ diff --git a/docs/legacy_serialisation.md b/docs/legacy_serialisation.md index 7c8ae45a..508dbf02 100644 --- a/docs/legacy_serialisation.md +++ b/docs/legacy_serialisation.md @@ -1,5 +1,7 @@ # Class Serialisation +> This document refers to legacy serialisation, which is still in the library, but is marked as obsolete and will be removed by the end of 2023. No new features will be added and you should [migrate](serialisation.md). + Parquet library is generally extremely flexible in terms of supporting internals of the Apache Parquet format and allows you to do whatever the low level API allow to. However, in many cases writing boilerplate code is not suitable if you are working with business objects and just want to serialise them into a parquet file. Class serialisation is **really fast** as it generates [MSIL](https://en.wikipedia.org/wiki/Common_Intermediate_Language) on the fly. That means there is a tiny bit of delay when serialising a first entity, which in most cases is negligible. 
Once the class is serialised at least once, further operations become blazingly fast (around *x40* speed improvement comparing to reflection on relatively large amounts of data (~5 million records)). diff --git a/docs/nested_types.md b/docs/nested_types.md new file mode 100644 index 00000000..b4072fe7 --- /dev/null +++ b/docs/nested_types.md @@ -0,0 +1,242 @@ +# Nested Types + +Optionally (but not required) read [getting started with Parquet](parquet-getting-started.md) to better understand Parquet internals. + +## Structs + +Structures are the easiest to understand. A structure is simply a container with extra fields i.e. table inside a table cell. From parquet's point of view, there is no difference between a struct's column and top-level column, they are absolutely identical. + +Structures are mostly used to logically separate entities and simplify naming for a user. To demonstrate, let's say you have the following very simple class hierarchy: + +```mermaid +classDiagram + direction RL + Root "1" *-- "1" Address + class Root { + +string name + } + class Address { + +string line1 + +string postcode + } +``` + + + +In tabular form, it can be represented like this + +| name | address.line1 | address.postcode | +| -------- | ------------- | ---------------- | +| (column) | (column) | (column) | + +which is also identical to + +| name | address | +| ------ | ------------------------------------------- | +| Column | **line1** (column) \| **postcode** (column) | + +Each table still has 3 physical columns, they are just named differently. 
+ +To make schema for this, we'll use `StructField` which accepts other fields as children: + +```csharp +var schema = new ParquetSchema( + new DataField("name"), + new StructField("address", + new DataField("line1"), + new DataField("postcode") + )); +``` + +To write data, we use plain columns: + +```csharp +using var ms = new MemoryStream(); +using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms)) { + ParquetRowGroupWriter rgw = writer.CreateRowGroup(); + + await rgw.WriteColumnAsync( + new DataColumn(new DataField("name"), new[] { "Joe" })); + + await rgw.WriteColumnAsync( + new DataColumn(new DataField("line1"), new[] { "Amazonland" })); + + await rgw.WriteColumnAsync( + new DataColumn(new DataField("postcode"), new[] { "AAABBB" })); +} + +``` + +To read back, again, the data is in plain columns: + +```csharp + ms.Position = 0; + +using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { + using ParquetRowGroupReader rg = reader.OpenRowGroupReader(0); + + DataField[] dataFields = reader.Schema.GetDataFields(); + + DataColumn name = await rg.ReadColumnAsync(dataFields[0]); + DataColumn line1 = await rg.ReadColumnAsync(dataFields[1]); + DataColumn postcode = await rg.ReadColumnAsync(dataFields[2]); + + Assert.Equal(new[] { "Joe" }, name.Data); + Assert.Equal(new[] { "Amazonland" }, line1.Data); + Assert.Equal(new[] { "AAABBB" }, postcode.Data); +} +``` + +Note that the only indication that this is a part of struct is `Path` property in the read schema containing struct name: + +![](img/struct-path.png) + +## Lists and Arrays + +Arrays *aka repeatable fields* is a basis for understanding how more complex data structures work in Parquet. + +`DataColumn` in Parquet can contain not just a single but multiple values. Sometimes they are called repeated fields (because the data type value repeats) or arrays. 
In order to create a schema for a repeatable field, let's say of type `int` you could use one of two forms: + +```csharp +var field = new DataField>("items"); +``` +To check if the field is repeated you can always test `.IsArray` Boolean flag. + +Parquet columns are flat, so in order to store an array in the array which can only keep simple elements and not other arrays, you would *flatten* them. For instance to store two elements: + +- `[1, 2, 3]` +- `[4, 5]` + +in a flat array, it will look like `[1, 2, 3, 4, 5]`. And that's exactly how parquet stores them. Now, the problem starts when you want to read the values back. Is this `[1, 2]` and `[3, 4, 5]` or `[1]` and `[2, 3, 4, 5]`? There's no way to know without an extra information. Therefore, parquet also stores that extra information an an extra column per data column, which is called *repetition levels*. In the previous example, our array of arrays will expand into the following two columns: + +| # | Data Column | Repetition Levels Column | +| ---- | ----------- | ------------------------ | +| 0 | 1 | 0 | +| 1 | 2 | 1 | +| 2 | 3 | 1 | +| 3 | 4 | 0 | +| 4 | 5 | 1 | + +In other words - it is the level at which we have to create a new list for the current value. In other words, the repetition level can be seen as a marker of when to start a new list and at which level. + +To represent this in C# code: + +```csharp +var field = new DataField>("items"); +var column = new DataColumn( + field, + new int[] { 1, 2, 3, 4, 5 }, + new int[] { 0, 1, 1, 0, 1 }); +``` + +### Lists + +Although arrays are useful, most of the systems write lists of data using `List` type. Unlike arrays, which can only contain primitive types, lists can contain anything. The most common use of lists is lists of structs. + +### Lists of Structs + +To demonstrate, I'll come back to the beginning of this document, and slightly change the relationship. 
Now our `Root` class does not just contain `Address` structure, but a list of address structures: + +```mermaid +classDiagram + direction RL + Root "1" *-- "*" Address : list of + class Root { + +string name + } + class Address { + +string line1 + +string postcode + } +``` + +And we'd like to save the following data: + +```json +[ + { + "name": "Joe", + "addresses": [ + { + "line1": "Amazonland", + "postcode": "AAABBB" + }, + { + "line1": "Disneyland", + "postcode": "CCCDDD" + } + ] + }, + { + "name": "Bob", + "addresses": [ + { + "line1": "Cryptoland", + "postcode": "EEEFFF" + } + ] + } +] +``` + +Knowing how structs and arrays are serialised, we can flatten this hierarchy to the following form so that it can be saved to Parquet: + +| name | RL | addresses.list.element.line1 | RL | addresses.list.element.postcode | RL | +| ---- | ---- | ---------------------------- | ---- | ------------------------------- | ---- | +| Joe | | Amazonland | 0 | AAABBB | 0 | +| Bob | | Disneyland | 1 | CCCDDD | 1 | +| | | Cryptoland | 0 | EEEFFF | 0 | + +where **RL** column indicated *repetition levels* of the column to the left. + +`name` does not have any repetition levels as it's a normal plain simple column. + +`line1` is a part of a list and it has a slightly longer name than usual. This is because of parquet [naming conventions for lists](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists). List must **always** annotate a **3-level** structure: + +- Outer group, which is a name of your business property (`addresses`) + - Middle level, always called **list** annotates repeatable field. + - A field always called **element** that annotates the list contents. When lists contain a primitive type, this field is that type. In our case it's a structure called **element** containing two fields - `line1` and `postcode`. + +If it feels complicated, it **IS**! Therefore general recommendation would be to use plain columns whenever possible. 
Nested types in parquet carry both mental and performance overhead 🤯 + +Moving on, let's declare a schema for this: + +```csharp +var nameField = new DataField("name"); +var line1Field = new DataField("line1"); +var postcodeField = new DataField("postcode"); + +var schema = new ParquetSchema( + nameField, + new ListField("addresses", + new StructField(ListField.ElementName, + line1Field, + postcodeField))); + +``` + +One thing to note - `ListField` automatically assumes there will be an internal *middle level* called **list** so it's omitted from the schema declaration. + +The struct is called `"element"` which is what `ListField.ElementName` constant is equal to. Theoretically you can name it anything you want, but common convention is recommended to be followed. + +And the final thing is to create data for those 3 columns with their repetition levels: + +```csharp +var nameCol = new DataColumn(nameField, new string[] { "Joe", "Bob" }); +var line1Col = new DataColumn(line1Field, new[] { "Amazonland", "Disneyland", "Cryptoland" }, new[] { 0, 1, 0 }); +var postcodeCol = new DataColumn(postcodeField, new[] { "AAABBB", "CCCDDD", "EEEFFF" }, new[] { 0, 1, 0 }); +``` + +Congrats, you have saved your first list! + +You might have noticed that list schema allows you to specify any `Field` - and that's 100% correct. Lists can contain any element type, including other lists! The idea of saving lists of lists is identical to the above. + +For more examples or just to run the above, please refer to unit tests in this project. + +## Maps + +Maps are stored as lists of structures, where each structure has two elements - key and value. Theoretically you don't need maps at all, it's just a hint to programming language to deserialise it in a more convenient way. + +## I'm Fed Up + +Yeah? Then just use [class serialisation](serialisation.md). 
diff --git a/docs/schema.md b/docs/schema.md index cc768be5..b8e30b71 100644 --- a/docs/schema.md +++ b/docs/schema.md @@ -1,6 +1,6 @@ # Declaring Schema -Due to the fact that Parquet is s strong typed format you need to declare a schema before writing any data. +Parquet is a format that stores data in a structured way. It has different types for different kinds of data, like numbers, strings, dates and so on. This means that you have to tell Parquet what type each column of your data is before you can write it to a file. This is called declaring a schema. Declaring a schema helps Parquet to compress and read your data more efficiently. Schema can be defined by creating an instance of `ParquetSchema` class and passing a collection of `Field`. Various helper methods on both `DataSet` and `ParquetSchema` exist to simplify the schema declaration, but we are going to be more specific on this page. diff --git a/docs/serialisation.md b/docs/serialisation.md index ac184aa0..bec9fb3b 100644 --- a/docs/serialisation.md +++ b/docs/serialisation.md @@ -6,7 +6,7 @@ Parquet library is generally extremely flexible in terms of supporting internals Class serialisation is **really fast** as internally it generates [compiled expression trees](https://learn.microsoft.com/en-US/dotnet/csharp/programming-guide/concepts/expression-trees/) on the fly. That means there is a tiny bit of delay when serialising a first entity, which in most cases is negligible. Once the class is serialised at least once, further operations become blazingly fast (around *x40* speed improvement comparing to reflection on relatively large amounts of data (~5 million records)). -Class serialisation philosophy is trying to simply mimic .NET's built-in **json** serialisation infrastructure in order to ease in learning path and reuse as much existing code as possible. +Class serialisation philosophy is based on the idea that we don't need to reinvent the wheel when it comes to converting objects to and from JSON. 
Instead of creating our own custom serialisers and deserialisers, we can leverage the existing JSON infrastructure that .NET provides. This way, we can save time and effort, and also make our code more consistent and compatible with other .NET applications that use JSON. ## Quick Start @@ -52,20 +52,249 @@ Serialisation tries to fit into C# ecosystem like a ninja 🥷, including custom - [`JsonPropertyName`](https://learn.microsoft.com/en-us/dotnet/api/system.text.json.serialization.jsonpropertynameattribute?view=net-7.0) - changes mapping of column name to property name. - [`JsonIgnore`](https://learn.microsoft.com/en-us/dotnet/api/system.text.json.serialization.jsonignoreattribute?view=net-7.0) - ignores property when reading or writing. -## Non-Trivial Types +## Nested Types -You can also serialize more complex types supported by the Parquet format. +You can also serialize [more complex types](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types) supported by the Parquet format. Sometimes you might want to store more complex data in your parquet files, like lists or maps. These are called *nested types* and they can be useful for organizing your information. However, they also come with a trade-off: they make your code slower and use more CPU resources. That's why you should only use them when you really need them and not just because they look cool. Simple columns are faster and easier to work with, so stick to them whenever you can. + +> If you would like to use low-level API for complex types, there is a [guide](nested_types.md) available too. + +### Structures + +Structures are just class members of a class and are completely transparent. For instance, `AddressBookEntry` class may contain a structure called `Address`: + +```csharp +class Address { + public string? Country { get; set; } + + public string? City { get; set; } +} + +class AddressBookEntry { + public string? FirstName { get; set; } + + public string? 
LastName { get; set; } + + public Address? Address { get; set; } +} +``` + +Populated with the following fake data: + +```csharp +var data = Enumerable.Range(0, 1_000_000).Select(i => new AddressBookEntry { + FirstName = "Joe", + LastName = "Bloggs", + Address = new Address() { + Country = "UK", + City = "Unknown" + } + }).ToList(); +``` + +You can serialise/deserialise those using the same `ParquetSerializer.SerializeAsync` / `ParquetSerializer.DeserializeAsync` methods. It does understand subclasses and will magically traverse inside them. ### Lists +One of the cool things about lists is that Parquet can handle any kind of data structure in a list. You can have a list of atoms, like `1, 2, 3`, or a list of lists, `[[1, 2], [3, 4], [5, 6]]`, or even a list of structures. Parquet.Net is awesome like that! + +For instance, a simple `MovementHistory` class with `Id` and list of `ParentIds` looking like the following: + +```csharp +class MovementHistoryCompressed { + public int? PersonId { get; set; } + + public List? 
ParentIds { get; set; } +} +``` + +Is totally fine to serialise/deserialise: + +```csharp +var data = Enumerable.Range(0, 100).Select(i => new MovementHistoryCompressed { + PersonId = i, + ParentIds = Enumerable.Range(i, 4).ToList() +}).ToList(); + +await ParquetSerializer.SerializeAsync(data, "c:\\tmp\\lat.parquet"); +``` + + + + Reading it in `Spark` produces the following schema + +``` +root + |-- PersonId: integer (nullable = true) + |-- ParentIds: array (nullable = true) + | |-- element: integer (containsNull = true) +``` + +and data: + +``` ++--------+---------------+ +|PersonId|ParentIds | ++--------+---------------+ +|0 |[0, 1, 2, 3] | +|1 |[1, 2, 3, 4] | +|2 |[2, 3, 4, 5] | +|3 |[3, 4, 5, 6] | +|4 |[4, 5, 6, 7] | +|5 |[5, 6, 7, 8] | +|6 |[6, 7, 8, 9] | +|7 |[7, 8, 9, 10] | +|8 |[8, 9, 10, 11] | +|9 |[9, 10, 11, 12]| ++--------+---------------+ +``` + +Or as a more complicate example, here is a list of structures (classes in C#): + +```csharp +class Address { + public string? Country { get; set; } + + public string? City { get; set; } +} + +class MovementHistory { + public int? PersonId { get; set; } + public string? Comments { get; set; } + + public List
? Addresses { get; set; } +} + + var data = Enumerable.Range(0, 1_000).Select(i => new MovementHistory { + PersonId = i, + Comments = i % 2 == 0 ? "none" : null, + Addresses = Enumerable.Range(0, 4).Select(a => new Address { + City = "Birmingham", + Country = "United Kingdom" + }).ToList() + }).ToList(); + +await ParquetSerializer.SerializeAsync(data, "c:\\tmp\\ls.parquet"); +``` + +that by reading from Spark produced the following schema + +``` +root + |-- PersonId: integer (nullable = true) + |-- Comments: string (nullable = true) + |-- Addresses: array (nullable = true) + | |-- element: struct (containsNull = true) + | | |-- Country: string (nullable = true) + | | |-- City: string (nullable = true) +``` + +and data + +``` ++--------+--------+--------------------+ +|PersonId|Comments| Addresses| ++--------+--------+--------------------+ +| 0| none|[{United Kingdom,...| +| 1| null|[{United Kingdom,...| +| 2| none|[{United Kingdom,...| +| 3| null|[{United Kingdom,...| +| 4| none|[{United Kingdom,...| +| 5| null|[{United Kingdom,...| +| 6| none|[{United Kingdom,...| +| 7| null|[{United Kingdom,...| +| 8| none|[{United Kingdom,...| +| 9| null|[{United Kingdom,...| ++--------+--------+--------------------+ +``` ### Maps (Dictionaries) +Maps are useful constructs if you need to serialize key-value pairs where each row can have different amount of keys. For example, if you want to store the names and hobbies of your friends, you can use a map like this: + +```json +{"Alice": ["reading", "cooking", "gardening"], "Bob": ["gaming", "coding", "sleeping"], "Charlie": ["traveling"]} +``` + +Notice how Alice has three hobbies, Bob has two and Charlie has only one. A map allows you to handle this variability without wasting space or creating empty values. Of course, you could also use a list, but then you would have to remember the order of the elements and deal with missing data. A map makes your life easier by letting you access the values by their keys. 
+ +In this library, maps are represented as an instance of generic `IDictionary` type. + +To give you a minimal example, let's say we have the following class with two properties: `Id` and `Tags`. The `Id` property is an integer that can be used to identify a row or an item in a collection. The `Tags` property is a dictionary of strings that can store arbitrary key-value pairs. For example, the `Tags` property can be used to store metadata or attributes of the item: + +```csharp +class IdWithTags { + public int Id { get; set; } + + public Dictionary? Tags { get; set; } +} +``` + + + +You can easily use `ParquetSerializer` to work with this class: + +```csharp +var data = Enumerable.Range(0, 10).Select(i => new IdWithTags { + Id = i, + Tags = new Dictionary { + ["id"] = i.ToString(), + ["gen"] = DateTime.UtcNow.ToString() + }}).ToList(); + +await ParquetSerializer.SerializeAsync(data, "c:\\tmp\\map.parquet"); +``` + +When read by Spark, the schema looks like the following: + +``` +root + |-- Id: integer (nullable = true) + |-- Tags: map (nullable = true) + | |-- key: string + | |-- value: string (valueContainsNull = true) + +``` + +And the data: + +``` ++---+-------------------------------------+ +|Id |Tags | ++---+-------------------------------------+ +|0 |{id -> 0, gen -> 17/03/2023 13:06:04}| +|1 |{id -> 1, gen -> 17/03/2023 13:06:04}| +|2 |{id -> 2, gen -> 17/03/2023 13:06:04}| +|3 |{id -> 3, gen -> 17/03/2023 13:06:04}| +|4 |{id -> 4, gen -> 17/03/2023 13:06:04}| +|5 |{id -> 5, gen -> 17/03/2023 13:06:04}| +|6 |{id -> 6, gen -> 17/03/2023 13:06:04}| +|7 |{id -> 7, gen -> 17/03/2023 13:06:04}| +|8 |{id -> 8, gen -> 17/03/2023 13:06:04}| +|9 |{id -> 9, gen -> 17/03/2023 13:06:04}| ++---+-------------------------------------+ +``` + + + +### Supported Collection Types + +Similar to JSON [supported collection types](https://learn.microsoft.com/en-us/dotnet/standard/serialization/system-text-json/supported-collection-types?pivots=dotnet-7-0), here are collections 
Parquet.Net currently supports: + +| Type | Serialization | Deserialization | +| ------------------------------------------------------------ | ------------- | --------------- | +| [Single-dimensional array](https://learn.microsoft.com/en-us/dotnet/csharp/programming-guide/arrays/single-dimensional-arrays) `**` | ❌ | ❌ | +| [Muti-dimensional arrays](https://learn.microsoft.com/en-us/dotnet/csharp/programming-guide/arrays/multidimensional-arrays) `*` | ❌ | ❌ | +| [`IList`](https://learn.microsoft.com/en-us/dotnet/api/system.collections.generic.ilist-1?view=net-7.0) | ✔️ | ❌`**` | +| [`List`](https://learn.microsoft.com/en-us/dotnet/api/system.collections.generic.ilist-1?view=net-7.0) | ✔️ | ✔️ | +| [`IDictionary`](https://learn.microsoft.com/en-us/dotnet/api/system.collections.generic.idictionary-2?view=net-7.0) `**` | ❌ | ❌ | +| [`Dictionary`](https://learn.microsoft.com/en-us/dotnet/api/system.collections.generic.dictionary-2?view=net-7.0) | ✔️ | ✔️ | +`*` Technically impossible. +`**` Technically possible, but not implemented yet. ## FAQ **Q.** Can I specify schema for serialisation/deserialisation. -**A.** No. Your class definition is the schema, so you don't need to supply it separately. +**A.** If you're using a class-based approach to define your data model, you don't have to worry about providing a schema separately. The class definition itself is the schema, meaning it specifies the fields and types of your data. This makes it easier to write and maintain your code, since you only have to define your data model once and use it everywhere. diff --git a/docs/writing.md b/docs/writing.md index d667c301..815fd5f3 100644 --- a/docs/writing.md +++ b/docs/writing.md @@ -125,4 +125,4 @@ using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { # Complex Types -To write complex types (arrays, lists, maps, structs) read [this guide](complex-types.md). +To write complex types (arrays, lists, maps, structs) read [this guide](nested_types.md). 
diff --git a/src/Parquet.PerfRunner/Benchmarks/WriteBenchmark.cs b/src/Parquet.PerfRunner/Benchmarks/WriteBenchmark.cs index e6721664..30924e85 100644 --- a/src/Parquet.PerfRunner/Benchmarks/WriteBenchmark.cs +++ b/src/Parquet.PerfRunner/Benchmarks/WriteBenchmark.cs @@ -15,7 +15,9 @@ public class WriteBenchmark : BenchmarkBase { //[Params(typeof(int), typeof(int?), typeof(double), typeof(double?))] [Params(typeof(string))] +#pragma warning disable CS8618 // Non-nullable field must contain a non-null value when exiting constructor. Consider declaring as nullable. public Type DataType; +#pragma warning restore CS8618 // Non-nullable field must contain a non-null value when exiting constructor. Consider declaring as nullable. private DataField? _f; private DataColumn? _c; @@ -24,12 +26,14 @@ public class WriteBenchmark : BenchmarkBase { private Array? _ar; [GlobalSetup] - public async Task SetupAsync() { + public Task SetupAsync() { _f = new DataField("test", DataType!); _ar = CreateTestData(DataType); _c = new DataColumn(_f, _ar); _psc = new Column(DataType!, "test"); + + return Task.CompletedTask; } [Benchmark] @@ -45,7 +49,7 @@ public async Task ParquetNet() { public void ParquetSharp() { using var ms = new MemoryStream(); using var writer = new ManagedOutputStream(ms); - using var fileWriter = new ParquetFileWriter(writer, new[] { _psc }); + using var fileWriter = new ParquetFileWriter(writer, new[] { _psc! 
}); using RowGroupWriter rowGroup = fileWriter.AppendRowGroup(); if(DataType == typeof(int)) { diff --git a/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj b/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj index b6d041c1..26ac86eb 100644 --- a/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj +++ b/src/Parquet.PerfRunner/Parquet.PerfRunner.csproj @@ -8,7 +8,7 @@ - + diff --git a/src/Parquet.Test/DocRef.cs b/src/Parquet.Test/DocRef.cs index 1cbcdca7..43d93532 100644 --- a/src/Parquet.Test/DocRef.cs +++ b/src/Parquet.Test/DocRef.cs @@ -20,21 +20,6 @@ class Record { public class DocTest { - //[Fact] - public async Task Write1() { - var data = Enumerable.Range(0, 1_000_000).Select(i => new Record { - Timestamp = DateTime.UtcNow.AddSeconds(i), - EventName = i % 2 == 0 ? "on" : "off", - MeterValue = i - }).ToList(); - - System.IO.File.Delete("c:\\tmp\\data.parquet"); - await ParquetConvert.SerializeAsync(data, "c:\\tmp\\data.parquet"); - - Record[] data2 = await ParquetConvert.DeserializeAsync("c:\\tmp\\data.parquet"); - Assert.NotNull(data2); - } - //[Fact] public async Task Write2() { var table = new Table( diff --git a/src/Parquet.Test/Extensions/SpanExtensionsTest.cs b/src/Parquet.Test/Extensions/SpanExtensionsTest.cs index 945efe01..714b9003 100644 --- a/src/Parquet.Test/Extensions/SpanExtensionsTest.cs +++ b/src/Parquet.Test/Extensions/SpanExtensionsTest.cs @@ -8,7 +8,7 @@ public class SpanExtensionsTest { public void StringMinMax() { ReadOnlySpan span = new string[] { "one", "two", "three" }.AsSpan(); - span.MinMax(out string min, out string max); + span.MinMax(out string? min, out string? 
max); Assert.Equal("one", min); Assert.Equal("two", max); diff --git a/src/Parquet.Test/Extensions/TypeExtensionsTest.cs b/src/Parquet.Test/Extensions/TypeExtensionsTest.cs index ea6d2452..c9014159 100644 --- a/src/Parquet.Test/Extensions/TypeExtensionsTest.cs +++ b/src/Parquet.Test/Extensions/TypeExtensionsTest.cs @@ -8,30 +8,30 @@ namespace Parquet.Test.Extensions { public class TypeExtensionsTest { [Fact] public void String_array_is_enumerable() { - Assert.True(typeof(string[]).TryExtractEnumerableType(out Type? et)); + Assert.True(typeof(string[]).TryExtractIEnumerableType(out Type? et)); Assert.Equal(typeof(string), et); } [Fact] public void String_is_not_enumerable() { - Assert.False(typeof(string).TryExtractEnumerableType(out Type? et)); + Assert.False(typeof(string).TryExtractIEnumerableType(out Type? et)); } [Fact] public void StringIenumerable_is_enumerable() { - Assert.True(typeof(IEnumerable).TryExtractEnumerableType(out Type? et)); + Assert.True(typeof(IEnumerable).TryExtractIEnumerableType(out Type? et)); Assert.Equal(typeof(string), et); } [Fact] public void Nullable_element_is_not_stripped() { - Assert.True(typeof(IEnumerable).TryExtractEnumerableType(out Type? et)); + Assert.True(typeof(IEnumerable).TryExtractIEnumerableType(out Type? et)); Assert.Equal(typeof(int?), et); } [Fact] public void ListOfT_is_ienumerable() { - Assert.True(typeof(List).TryExtractEnumerableType(out Type? baseType)); + Assert.True(typeof(List).TryExtractIEnumerableType(out Type? 
baseType)); Assert.Equal(typeof(int), baseType); } } diff --git a/src/Parquet.Test/Integration/IntegrationBase.cs b/src/Parquet.Test/Integration/IntegrationBase.cs new file mode 100644 index 00000000..71f03e73 --- /dev/null +++ b/src/Parquet.Test/Integration/IntegrationBase.cs @@ -0,0 +1,67 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Text; + +namespace Parquet.Test.Integration { + public class IntegrationBase : TestBase { + private readonly string _toolsPath; + private readonly string _toolsJarPath; + private readonly string _javaExecName; + + public IntegrationBase() { + _toolsPath = Path.GetFullPath(Path.Combine("..", "..", "..", "..", "..", "tools")); + _toolsJarPath = Path.Combine(_toolsPath, "parquet-tools-1.9.0.jar"); + + _javaExecName = Environment.OSVersion.Platform == PlatformID.Win32NT + ? "java.exe" + : "java"; + } + + private string? ExecJavaAndGetOutput(string arguments) { + var psi = new ProcessStartInfo { + FileName = _javaExecName, + Arguments = arguments, + UseShellExecute = false, + RedirectStandardOutput = true, + RedirectStandardError = true, + CreateNoWindow = true + }; + + var proc = new Process { StartInfo = psi }; + + if(!proc.Start()) + return null; + + var so = new StringBuilder(); + var se = new StringBuilder(); + + while(!proc.StandardOutput.EndOfStream) { + string? line = proc.StandardOutput.ReadLine(); + if(line != null) { + so.AppendLine(line); + } + } + + while(!proc.StandardError.EndOfStream) { + string? line = proc.StandardError.ReadLine(); + if(line != null) { + se.AppendLine(line); + } + } + + proc.WaitForExit(); + + if(proc.ExitCode != 0) { + throw new Exception("process existed with code " + proc.ExitCode + ", error: " + se.ToString()); + } + + return so.ToString().Trim(); + } + + protected string? 
ExecMrCat(string testFileName) { + return ExecJavaAndGetOutput($"-jar \"{_toolsJarPath}\" cat -j \"{testFileName}\""); + } + } +} diff --git a/src/Parquet.Test/Integration/ParquetSerializerTest.cs b/src/Parquet.Test/Integration/ParquetSerializerTest.cs new file mode 100644 index 00000000..96bbb8af --- /dev/null +++ b/src/Parquet.Test/Integration/ParquetSerializerTest.cs @@ -0,0 +1,9 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Parquet.Test.Integration { + public class ParquetSerializerTest { + + } +} diff --git a/src/Parquet.Test/Integration/ParquetMrIntegrationTest.cs b/src/Parquet.Test/Integration/TablesTest.cs similarity index 63% rename from src/Parquet.Test/Integration/ParquetMrIntegrationTest.cs rename to src/Parquet.Test/Integration/TablesTest.cs index bb91674e..5c9f37c6 100644 --- a/src/Parquet.Test/Integration/ParquetMrIntegrationTest.cs +++ b/src/Parquet.Test/Integration/TablesTest.cs @@ -15,21 +15,9 @@ namespace Parquet.Test.Integration { /// This class does some fairly basic integration tests by compring results with parquet-mr using parquet-tools jar package. /// You must have java available in PATH. /// - public class ParquetMrIntegrationTest : TestBase { - private readonly string _toolsPath; - private readonly string _toolsJarPath; - private readonly string _javaExecName; - - public ParquetMrIntegrationTest() { - _toolsPath = Path.GetFullPath(Path.Combine("..", "..", "..", "..", "..", "tools")); - _toolsJarPath = Path.Combine(_toolsPath, "parquet-tools-1.9.0.jar"); - - _javaExecName = Environment.OSVersion.Platform == PlatformID.Win32NT - ? "java.exe" - : "java"; - } + public class TablesTest : IntegrationBase { - private async Task CompareWithMr(Table t, Func jsonPreprocessor = null) { + private async Task CompareWithMr(Table t, Func? 
jsonPreprocessor = null) { string testFileName = Path.GetFullPath("temp.parquet"); if(F.Exists(testFileName)) @@ -51,7 +39,7 @@ private async Task CompareWithMr(Table t, Func jsonPreprocessor Assert.Equal(t.ToString("j"), t2.ToString("j"), ignoreLineEndingDifferences: true); string myJson = t.ToString("j"); - string mrJson = ExecAndGetOutput(_javaExecName, $"-jar \"{_toolsJarPath}\" cat -j \"{testFileName}\""); + string? mrJson = ExecMrCat(testFileName); if(jsonPreprocessor != null) { myJson = jsonPreprocessor(myJson); @@ -60,46 +48,7 @@ private async Task CompareWithMr(Table t, Func jsonPreprocessor Assert.Equal(myJson, mrJson); } - private static string? ExecAndGetOutput(string fileName, string arguments) { - var psi = new ProcessStartInfo { - FileName = fileName, - Arguments = arguments, - UseShellExecute = false, - RedirectStandardOutput = true, - RedirectStandardError = true, - CreateNoWindow = true - }; - - var proc = new Process { StartInfo = psi }; - - if(!proc.Start()) - return null; - - var so = new StringBuilder(); - var se = new StringBuilder(); - - while(!proc.StandardOutput.EndOfStream) { - string? line = proc.StandardOutput.ReadLine(); - if(line != null) { - so.AppendLine(line); - } - } - - while(!proc.StandardError.EndOfStream) { - string? 
line = proc.StandardError.ReadLine(); - if(line != null) { - se.AppendLine(line); - } - } - - proc.WaitForExit(); - if(proc.ExitCode != 0) { - throw new Exception("process existed with code " + proc.ExitCode + ", error: " + se.ToString()); - } - - return so.ToString().Trim(); - } [Fact] public async Task Integers_all_types() { @@ -109,7 +58,7 @@ public async Task Integers_all_types() { //generate fake data for(int i = 0; i < 1000; i++) { - table.Add(new Row((sbyte)(i % 127 - 255), (byte)(i % 255), (short)i, (ushort)i, i, (long)i)); + table.Add(new Row((sbyte)((i % 127) - 255), (byte)(i % 255), (short)i, (ushort)i, i, (long)i)); } await CompareWithMr(table); diff --git a/src/Parquet.Test/ListTest.cs b/src/Parquet.Test/ListTest.cs deleted file mode 100644 index 0edb583b..00000000 --- a/src/Parquet.Test/ListTest.cs +++ /dev/null @@ -1,55 +0,0 @@ -using System.Threading.Tasks; -using Parquet.Data; -using Parquet.Schema; -using Xunit; - -namespace Parquet.Test { - public class ListTest : TestBase { - [Fact] - public async Task List_of_structures_writes_reads() { - var idsch = new DataField("id"); - var cnamech = new DataField("name"); - var ccountrych = new DataField("country"); - - var schema = new ParquetSchema( - idsch, - new ListField("cities", - new StructField("element", - cnamech, - ccountrych))); - - var id = new DataColumn(idsch, new int[] { 1 }); - var cname = new DataColumn(cnamech, new[] { "London", "New York" }, new[] { 0, 1 }); - var ccountry = new DataColumn(ccountrych, new[] { "UK", "US" }, new[] { 0, 1 }); - - await WriteReadSingleRowGroup(schema, new[] { id, cname, ccountry }); - } - - [Fact] - public async Task List_of_elements_with_some_items_empty_reads_file() { - /* - list data: - - 1: [1, 2, 3] - - 2: [] - - 3: [1, 2, 3] - - 4: [] - */ - - using(ParquetReader reader = await ParquetReader.CreateAsync(OpenTestFile("list_empty_alt.parquet"))) { - using(ParquetRowGroupReader groupReader = reader.OpenRowGroupReader(0)) { - Assert.Equal(4, 
groupReader.RowCount); - DataField[] fs = reader.Schema.GetDataFields(); - - DataColumn id = await groupReader.ReadColumnAsync(fs[0]); - Assert.Equal(4, id.Data.Length); - Assert.False(id.HasRepetitions); - - DataColumn list = await groupReader.ReadColumnAsync(fs[1]); - Assert.Equal(8, list.Data.Length); - Assert.Equal(new int[] { 0, 1, 1, 0, 0, 1, 1, 0 }, list.RepetitionLevels); - } - } - - } - } -} \ No newline at end of file diff --git a/src/Parquet.Test/Parquet.Test.csproj b/src/Parquet.Test/Parquet.Test.csproj index 0d0d408b..ba36a143 100644 --- a/src/Parquet.Test/Parquet.Test.csproj +++ b/src/Parquet.Test/Parquet.Test.csproj @@ -21,14 +21,17 @@ - + - + runtime; build; native; contentfiles; analyzers; buildtransitive all + + + diff --git a/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs b/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs index 970ffb16..e5e1cc47 100644 --- a/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs +++ b/src/Parquet.Test/ParquetReaderOnTestFilesTest.cs @@ -1,4 +1,5 @@ using Parquet.Data; +using Parquet.Rows; using Parquet.Schema; using System; using System.IO; @@ -125,5 +126,12 @@ public async Task Read_delta_binary_packed() { Assert.Equal(200, bw1.Count); } } + + [Fact] + public async Task Read_col_names_with_trailing_dots() { + using Stream s = OpenTestFile("trailing_dot_col_name.parquet"); + Table tbl = await ParquetReader.ReadTableFromStreamAsync(s); + Assert.NotNull(tbl); + } } } \ No newline at end of file diff --git a/src/Parquet.Test/ParquetReaderTest.cs b/src/Parquet.Test/ParquetReaderTest.cs index 1dce5023..e07acf08 100644 --- a/src/Parquet.Test/ParquetReaderTest.cs +++ b/src/Parquet.Test/ParquetReaderTest.cs @@ -16,17 +16,17 @@ public class ParquetReaderTest : TestBase { [Fact] public async Task Opening_small_file_fails() { - await Assert.ThrowsAsync(async () => await ParquetReader.CreateAsync("small".ToMemoryStream())); + await Assert.ThrowsAsync(async () => await ParquetReader.CreateAsync("small".ToMemoryStream()!)); } 
[Fact] public async Task Opening_file_without_proper_head_fails() { - await Assert.ThrowsAsync(async () => await ParquetReader.CreateAsync("PAR2dataPAR1".ToMemoryStream())); + await Assert.ThrowsAsync(async () => await ParquetReader.CreateAsync("PAR2dataPAR1".ToMemoryStream()!)); } [Fact] public async Task Opening_file_without_proper_tail_fails() { - await Assert.ThrowsAsync(async () => await ParquetReader.CreateAsync("PAR1dataPAR2".ToMemoryStream())); + await Assert.ThrowsAsync(async () => await ParquetReader.CreateAsync("PAR1dataPAR2".ToMemoryStream()!)); } [Fact] diff --git a/src/Parquet.Test/ParquetWriterTest.cs b/src/Parquet.Test/ParquetWriterTest.cs index ce50c186..598d054e 100644 --- a/src/Parquet.Test/ParquetWriterTest.cs +++ b/src/Parquet.Test/ParquetWriterTest.cs @@ -69,6 +69,7 @@ public async Task Write_in_small_row_groups() { } } +#if NET7_0_OR_GREATER [Fact] public async Task Write_in_small_row_groups_write_only_stream() { //write to a write-only stream that does not implement the Position property @@ -123,6 +124,8 @@ public async Task Write_in_small_row_groups_write_only_stream() { //run the work and ensure that nothing throws await reader; } +#endif + [Fact] public async Task Append_to_file_reads_all_data() { @@ -207,7 +210,7 @@ public async Task Writes_only_beginning_of_array() { //read back using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { - Assert.Equal(3, reader.ThriftMetadata.Num_rows); + Assert.Equal(3, reader.ThriftMetadata!.Num_rows); using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) { Assert.Equal(new int[] { 1, 2, 3 }, (await rg.ReadColumnAsync(id)).Data); @@ -229,7 +232,7 @@ public async Task Writes_only_end_of_array() { //read back using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { - Assert.Equal(3, reader.ThriftMetadata.Num_rows); + Assert.Equal(3, reader.ThriftMetadata!.Num_rows); using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) { Assert.Equal(new int[] { 2, 3, 4 }, (await 
rg.ReadColumnAsync(id)).Data); @@ -251,7 +254,7 @@ public async Task FileMetadata_sets_num_rows_on_file_and_row_group() { //read back using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { - Assert.Equal(4, reader.ThriftMetadata.Num_rows); + Assert.Equal(4, reader.ThriftMetadata!.Num_rows); using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) { Assert.Equal(4, rg.RowCount); @@ -277,7 +280,7 @@ public async Task FileMetadata_sets_num_rows_on_file_and_row_group_multiple_row_ //read back using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { - Assert.Equal(6, reader.ThriftMetadata.Num_rows); + Assert.Equal(6, reader.ThriftMetadata!.Num_rows); using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) { Assert.Equal(4, rg.RowCount); diff --git a/src/Parquet.Test/PrimitiveTypesTest.cs b/src/Parquet.Test/PrimitiveTypesTest.cs index 19f895d3..38a31731 100644 --- a/src/Parquet.Test/PrimitiveTypesTest.cs +++ b/src/Parquet.Test/PrimitiveTypesTest.cs @@ -18,10 +18,10 @@ public async Task Write_loads_of_booleans_all_true(int count) { data[i] = true; } - DataColumn read = await WriteReadSingleColumn(id, new DataColumn(id, data)); + DataColumn? read = await WriteReadSingleColumn(id, new DataColumn(id, data)); for(int i = 0; i < count; i++) { - Assert.True((bool)read.Data.GetValue(i), $"got FALSE at position {i}"); + Assert.True((bool)read!.Data.GetValue(i)!, $"got FALSE at position {i}"); } } @@ -36,9 +36,9 @@ public async Task Write_bunch_of_uints(uint count) { data[i] = uint.MaxValue - i; } - DataColumn read = await WriteReadSingleColumn(id, new DataColumn(id, data)); + DataColumn? 
read = await WriteReadSingleColumn(id, new DataColumn(id, data)); for(uint i = 0; i < count; i++) { - uint result = (uint)read.Data.GetValue(i); + uint result = (uint)read!.Data.GetValue(i)!; Assert.Equal(uint.MaxValue - i, result); } } @@ -53,9 +53,9 @@ public async Task Write_bunch_of_ulongs(ulong count) { data[i] = ulong.MaxValue - i; } - DataColumn read = await WriteReadSingleColumn(id, new DataColumn(id, data)); + DataColumn? read = await WriteReadSingleColumn(id, new DataColumn(id, data)); for(uint i = 0; i < count; i++) { - ulong result = (ulong)read.Data.GetValue(i); + ulong result = (ulong)read!.Data.GetValue(i)!; Assert.Equal(ulong.MaxValue - i, result); } } diff --git a/src/Parquet.Test/Reader/ParquetCsvComparison.cs b/src/Parquet.Test/Reader/ParquetCsvComparison.cs index 11727a49..7c6e5f75 100644 --- a/src/Parquet.Test/Reader/ParquetCsvComparison.cs +++ b/src/Parquet.Test/Reader/ParquetCsvComparison.cs @@ -68,7 +68,7 @@ private void Compare(DataColumn[] parquet, DataColumn[] csv, Type[] columnTypes) } else if(clrType == typeof(byte[])) { byte[] pva = (byte[])pv; - byte[] cva = (byte[])cv; + byte[] cva = (byte[])cv!; if(pva.Length != cva.Length) errors.Add($"expected length {cva.Length} but was {pva.Length} in column {pc.Field.Name}, value #{ri}"); @@ -149,7 +149,7 @@ private DataColumn[] ReadCsv(string name) { //compose result return - columnNames.Select((n, i) => new DataColumn(new DataField(n), columns[i].ToArray())) + columnNames!.Select((n, i) => new DataColumn(new DataField(n), columns[i].ToArray())) .ToArray(); } diff --git a/src/Parquet.Test/Reader/UseCultureAttribute.cs b/src/Parquet.Test/Reader/UseCultureAttribute.cs index 79a17c2e..6e88ecf6 100644 --- a/src/Parquet.Test/Reader/UseCultureAttribute.cs +++ b/src/Parquet.Test/Reader/UseCultureAttribute.cs @@ -14,8 +14,8 @@ public class UseCultureAttribute : BeforeAfterTestAttribute { readonly Lazy culture; readonly Lazy uiCulture; - CultureInfo originalCulture; - CultureInfo originalUICulture; + 
CultureInfo? originalCulture; + CultureInfo? originalUICulture; /// /// Replaces the culture and UI culture of the current thread with @@ -75,8 +75,10 @@ public override void Before(MethodInfo methodUnderTest) { /// /// The method under test public override void After(MethodInfo methodUnderTest) { - Thread.CurrentThread.CurrentCulture = originalCulture; - Thread.CurrentThread.CurrentUICulture = originalUICulture; + if(originalCulture != null) + Thread.CurrentThread.CurrentCulture = originalCulture; + if(originalUICulture != null) + Thread.CurrentThread.CurrentUICulture = originalUICulture; CultureInfo.CurrentCulture.ClearCachedData(); CultureInfo.CurrentUICulture.ClearCachedData(); diff --git a/src/Parquet.Test/Rows/LazyColumnEnumeratorTest.cs b/src/Parquet.Test/Rows/LazyColumnEnumeratorTest.cs index ef848a98..b353f202 100644 --- a/src/Parquet.Test/Rows/LazyColumnEnumeratorTest.cs +++ b/src/Parquet.Test/Rows/LazyColumnEnumeratorTest.cs @@ -9,28 +9,21 @@ public class LazyColumnEnumeratorTest { [Fact] public void Two_level_rep_levels() { //prepare columns with two items, each item has two inline items - var dc = new DataColumn(new DataField("openingHours") { MaxRepetitionLevel = 2 }, - new[] - { - 1, 2, 3, 4, - 5, 6, - - 7, 8, 9, - 10, 11, 12, 13 - - }, - null, - 1, - new[] - { - 0, 2, 2, 2, - 1, 2, - - 0, 2, 2, - 1, 2, 2, 2 - }, - 2 - ); + var dc = new DataColumn( + new DataField("openingHours") { MaxRepetitionLevel = 2, MaxDefinitionLevel = 1 }, + new[] { + 1, 2, 3, 4, + 5, 6, + + 7, 8, 9, + 10, 11, 12, 13 }, + null, + new[] { + 0, 2, 2, 2, + 1, 2, + + 0, 2, 2, + 1, 2, 2, 2 }); var e = new LazyColumnEnumerator(dc); @@ -51,20 +44,15 @@ public void Two_level_rep_levels() { [Fact] public void Simple_array() { - var dc = new DataColumn(new DataField("ids") { MaxRepetitionLevel = 1 }, - new[] - { - 1, 2, 3, 4, - 5, 6 - }, - null, - 1, - new[] - { - 0, 1, 1, 1, - 0, 1 - }, - 2); + var dc = new DataColumn( + new DataField("ids") { MaxRepetitionLevel = 1, 
MaxDefinitionLevel = 1 }, + new[] { + 1, 2, 3, 4, + 5, 6 }, + null, + new[] { + 0, 1, 1, 1, + 0, 1 }); var e = new LazyColumnEnumerator(dc); @@ -78,22 +66,18 @@ public void Simple_array() { [Fact] public void Empty_list() { - var dc = new DataColumn(new DataField("ids") { MaxRepetitionLevel = 1 }, - new int?[] - { - 1, 2, - null, - 5, 6 - }, - null, - 1, - new[] - { - 0, 1, - 0, - 0, 1 - }, - 2); + var dc = new DataColumn( + new DataField("ids") { MaxRepetitionLevel = 1, MaxDefinitionLevel = 1 }, + new int?[] { + 1, 2, + null, + 5, 6 }, + null, + new[] { + 0, 1, + 0, + 0, 1 + }); var e = new LazyColumnEnumerator(dc); diff --git a/src/Parquet.Test/Rows/RowsModelTest.cs b/src/Parquet.Test/Rows/RowsModelTest.cs index f481ca00..6ff47a35 100644 --- a/src/Parquet.Test/Rows/RowsModelTest.cs +++ b/src/Parquet.Test/Rows/RowsModelTest.cs @@ -426,7 +426,7 @@ public async Task List_of_lists_read_write_structures() { }); Table t1 = await WriteReadAsync(t); - Assert.Equal(3, ((object[])t1[0][1]).Length); + Assert.Equal(3, ((object[])t1[0]![1]!).Length); Assert.Equal(t.ToString(), t1.ToString(), ignoreLineEndingDifferences: true); } @@ -503,10 +503,10 @@ public async Task Special_read_all_legacy_decimals(string parquetFile) { Table ds = await ReadTestFileAsTableAsync(parquetFile); Row row = ds[0]; - Assert.Equal(1, (int)row[0]); - Assert.Equal(1.2m, (decimal)row[1], 2); + Assert.Equal(1, (int)row[0]!); + Assert.Equal(1.2m, (decimal)row[1]!, 2); Assert.Null(row[2]); - Assert.Equal(-1m, (decimal)row[3], 2); + Assert.Equal(-1m, (decimal)row[3]!, 2); } [Fact] diff --git a/src/Parquet.Test/Schema/FieldPathTest.cs b/src/Parquet.Test/Schema/FieldPathTest.cs new file mode 100644 index 00000000..a8013f5a --- /dev/null +++ b/src/Parquet.Test/Schema/FieldPathTest.cs @@ -0,0 +1,35 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Parquet.Schema; +using Xunit; + +namespace Parquet.Test.Schema { + public class FieldPathTest { + [Fact] + public void 
Construct_from_perfect_string() { + var p = new FieldPath("ID"); + + Assert.Single(p.ToList()); + Assert.Equal("ID", p.ToList()[0]); + } + + [Fact] + public void Construct_from_name_with_dot_inside() { + var p = new FieldPath("ID."); + + Assert.Single(p.ToList()); + Assert.Equal("ID.", p.ToList()[0]); + } + + [Fact] + public void Equal_simple_same() { + Assert.Equal(new FieldPath("id"), new FieldPath("id")); + } + + [Fact] + public void Equal_simple_not_same() { + Assert.NotEqual(new FieldPath("id1"), new FieldPath("id")); + } + } +} diff --git a/src/Parquet.Test/Schema/SchemaTest.cs b/src/Parquet.Test/Schema/SchemaTest.cs index a931ab94..e2d0b39a 100644 --- a/src/Parquet.Test/Schema/SchemaTest.cs +++ b/src/Parquet.Test/Schema/SchemaTest.cs @@ -85,7 +85,7 @@ public void Generic_dictionaries_are_not_allowed() { [Fact] public void Invalid_dictionary_declaration() { - Assert.Throws(() => new DataField>("d")); + Assert.Throws(() => new DataField>("d")); } [Fact] @@ -180,9 +180,9 @@ public void Lists_are_not_equal_by_item() { [Fact] public void List_maintains_path_prefix() { var list = new ListField("List", new DataField("id")); - list.PathPrefix = "Parent"; + list.PathPrefix = new FieldPath("Parent"); - Assert.Equal("Parent.List.list.id", list.Item.Path); + Assert.Equal(new FieldPath("Parent", "List", "list", "id"), list.Item.Path); } [Fact] @@ -225,17 +225,20 @@ public void List_of_structures_valid_levels() { var nameField = new DataField("name"); var schema = new ParquetSchema( - new DataField("id"), + new DataField("topLevelId"), new ListField("structs", new StructField("mystruct", idField, nameField))); + Assert.Equal(0, schema[0].MaxRepetitionLevel); + Assert.Equal(0, schema[0].MaxDefinitionLevel); + Assert.Equal(1, idField.MaxRepetitionLevel); - Assert.Equal(2, idField.MaxDefinitionLevel); + Assert.Equal(3, idField.MaxDefinitionLevel); // optional list + optional group + optional struct + required field Assert.Equal(1, nameField.MaxRepetitionLevel); - 
Assert.Equal(3, nameField.MaxDefinitionLevel); + Assert.Equal(4, nameField.MaxDefinitionLevel); } [Theory] @@ -335,6 +338,5 @@ public void SystemTypeToThriftMapping(Type t, TT expectedTT, CT? expectedCT) { Assert.Equal(expectedTT, foundTT); Assert.Equal(expectedCT, foundCT); } - } } \ No newline at end of file diff --git a/src/Parquet.Test/Serialisation/DremelAssemblerTest.cs b/src/Parquet.Test/Serialisation/DremelAssemblerTest.cs new file mode 100644 index 00000000..f1e6245c --- /dev/null +++ b/src/Parquet.Test/Serialisation/DremelAssemblerTest.cs @@ -0,0 +1,133 @@ +using System; +using System.Collections.Generic; +using System.Text.Json; +using Parquet.Serialization; +using Parquet.Serialization.Dremel; +using Parquet.Test.Serialisation.Paper; +using Parquet.Test.Xunit; +using Xunit; + +namespace Parquet.Test.Serialisation { + public class DremelAssemblerTest { + + private readonly Assembler _asm; + + public DremelAssemblerTest() { + _asm = new Assembler(typeof(Document).GetParquetSchema(true)); + } + + [Fact] + public void Totals_Has6Assemblers() { + Assert.Equal(6, _asm.FieldAssemblers.Count); + } + + [Fact] + public void Field_1_DocId() { + var docs = new List { new Document(), new Document() }; + _asm.FieldAssemblers[0].Assemble(docs, Document.RawColumns[0]); + Assert.Equal(10, docs[0].DocId); + Assert.Equal(20, docs[1].DocId); + } + + [Fact] + public void Field_2_Links_Backward() { + var docs = new List { new Document(), new Document() }; + _asm.FieldAssemblers[1].Assemble(docs, Document.RawColumns[1]); + Assert.NotNull(docs[0].Links); + Assert.Null(docs[0].Links!.Backward); + Assert.Null(docs[0].Links!.Forward); + + + Assert.NotNull(docs[1].Links); + Assert.Equal(new long[] { 10, 30 }, docs[1].Links!.Backward!); + Assert.Null(docs[1].Links!.Forward); + } + + [Fact] + public void Field_3_Links_Forward() { + var docs = new List { new Document(), new Document() }; + _asm.FieldAssemblers[2].Assemble(docs, Document.RawColumns[2]); + 
Assert.NotNull(docs[0].Links); + Assert.NotNull(docs[1].Links); + + Assert.Equal(new long[] { 20, 40, 60 }, docs[0].Links!.Forward!); + Assert.Equal(new long[] { 80 }, docs[1].Links!.Forward!); + } + + [Fact] + public void Field_4_Name_Language_Code() { + var docs = new List { new Document(), new Document() }; + _asm.FieldAssemblers[3].Assemble(docs, Document.RawColumns[3]); + + // assert + + // Name + Assert.NotNull(docs[0].Name); + Assert.NotNull(docs[1].Name); + + // Name count is 3, regardless of the null value + Assert.Equal(3, docs[0].Name!.Count); + Assert.Single(docs[1].Name!); + + // Language count + Assert.Equal(2, docs[0].Name![0].Language!.Count); + + // language values + Assert.Equal("en-us", docs[0].Name![0].Language![0].Code); + Assert.Equal("en", docs[0].Name![0].Language![1].Code); + Assert.Equal("en-gb", docs[0].Name![2].Language![0].Code); + } + + [Fact] + public void Field_5_Name_Language_Country() { + var docs = new List { new Document(), new Document() }; + _asm.FieldAssemblers[4].Assemble(docs, Document.RawColumns[4]); + + // assert + + // Name + Assert.NotNull(docs[0].Name); + Assert.NotNull(docs[1].Name); + + Assert.Equal(3, docs[0].Name!.Count); + + // Language count + Assert.Equal(2, docs[0].Name![0].Language!.Count); + + // language values + Assert.Equal("us", docs[0].Name![0].Language![0].Country); + Assert.Null(docs[0].Name![0].Language![1].Country); + Assert.Equal("gb", docs[0].Name![2].Language![0].Country); + } + + [Fact] + public void Field_6_Name_Url() { + var docs = new List { new Document(), new Document() }; + _asm.FieldAssemblers[5].Assemble(docs, Document.RawColumns[5]); + + // assert + + // Name + Assert.NotNull(docs[0].Name); + Assert.NotNull(docs[1].Name); + + Assert.Equal(3, docs[0].Name!.Count); + Assert.Single(docs[1].Name!); + } + + [Fact] + public void FullReassembly() { + var docs = new List { new Document(), new Document() }; + + for(int i = 0; i < Document.RawColumns.Length; i++) { + try { + 
_asm.FieldAssemblers[i].Assemble(docs, Document.RawColumns[i]); + } catch(Exception ex) { + throw new InvalidOperationException("failure on " + _asm.FieldAssemblers[i].Field, ex); + } + } + + XAssert.JsonEquivalent(Document.Both, docs); + } + } +} diff --git a/src/Parquet.Test/Serialisation/DremelStriperTest.cs b/src/Parquet.Test/Serialisation/DremelStriperTest.cs new file mode 100644 index 00000000..b7847432 --- /dev/null +++ b/src/Parquet.Test/Serialisation/DremelStriperTest.cs @@ -0,0 +1,135 @@ +using System.Collections.Generic; +using Parquet.Schema; +using Parquet.Serialization; +using Parquet.Serialization.Dremel; +using Parquet.Test.Serialisation.Paper; +using Xunit; + +namespace Parquet.Test.Serialisation { + + /// + /// These tests validate repetition and definition levels are generated correctly according to the main + /// Dremel Paper written by google. + /// Link to original paper: https://research.google/pubs/pub36632/ + /// + public class DremelStriperTest { + + private readonly Striper _striper; + + public DremelStriperTest() { + _striper = new Striper(typeof(Document).GetParquetSchema(false)); + } + + [Fact] + public void Schema_AllLevels() { + // check schema + ParquetSchema schema = typeof(Document).GetParquetSchema(false); + + // DocId + Field docId = schema[0]; + Assert.Equal(new FieldPath("DocId"), docId.Path); + Assert.Equal(0, docId.MaxRepetitionLevel); + Assert.Equal(0, docId.MaxDefinitionLevel); + + // Links + Field links = schema[1]; + Assert.Equal(new FieldPath("Links"), links.Path); + Assert.Equal(0, links.MaxRepetitionLevel); + Assert.Equal(1, links.MaxDefinitionLevel); + + // Links.Backward + Field lBack = schema[1].Children[0]; + Assert.Equal(new FieldPath("Links", "Backward"), lBack.Path); + Assert.Equal(1, lBack.MaxRepetitionLevel); + Assert.Equal(2, lBack.MaxDefinitionLevel); + + // Links.Forward + Field lForw = schema[1].Children[1]; + Assert.Equal(new FieldPath("Links", "Forward"), lForw.Path); + Assert.Equal(1, 
lForw.MaxRepetitionLevel); + Assert.Equal(2, lForw.MaxDefinitionLevel); + + // Name.Language.Code + Field nlCode = schema[2].NaturalChildren[0].NaturalChildren[0]; + Assert.Equal(new FieldPath("Name", "list", "element", "Language", "list", "element", "Code"), nlCode.Path); + Assert.Equal(2, nlCode.MaxRepetitionLevel); + Assert.Equal(7, nlCode.MaxDefinitionLevel); + + // Name.Language.Country + Field nlCountry = schema[2].NaturalChildren[0].NaturalChildren[1]; + Assert.Equal(new FieldPath("Name", "list", "element", "Language", "list", "element", "Country"), nlCountry.Path); + Assert.Equal(2, nlCountry.MaxRepetitionLevel); + Assert.Equal(7, nlCountry.MaxDefinitionLevel); + + // Name.Url + Assert.Equal(new FieldPath("Name", "list", "element", "Url"), schema[2].Children[0].Children[1].Path); + Assert.Equal(1, schema[2].Children[0].Children[1].MaxRepetitionLevel); + Assert.Equal(4, schema[2].Children[0].Children[1].MaxDefinitionLevel); + } + + + [Fact] + public void Totals_Has6Stripers() { + + Assert.Equal(6, _striper.FieldStripers.Count); + } + + [Fact] + public void Field_1_DocId() { + // DocId (field 1 of 6) + FieldStriper striper = _striper.FieldStripers[0]; + ShreddedColumn col = striper.Stripe(striper.Field, Document.Both); + Assert.Equal(new long[] { 10, 20 }, col.Data); + Assert.Null(col.RepetitionLevels); + Assert.Null(col.DefinitionLevels); + } + + [Fact] + public void Field_2_Links_Backward() { + + // Links.Backward (field 2 of 6) + FieldStriper striper = _striper.FieldStripers[1]; + ShreddedColumn col = striper.Stripe(striper.Field, Document.Both); + Assert.Equal(new long[] { 10, 30 }, col.Data); + Assert.Equal(new int[] { 0, 0, 1 }, col.RepetitionLevels!); + Assert.Equal(new int[] { 1, 2, 2 }, col.DefinitionLevels!); + } + + // Links.Forward (field 3 of 6) + [Fact] + public void Field_3_Links_Forward() { + FieldStriper striper = _striper.FieldStripers[2]; + ShreddedColumn col = striper.Stripe(striper.Field, Document.Both); + Assert.Equal(new long[] { 20, 
40, 60, 80 }, col.Data); + Assert.Equal(new int[] { 0, 1, 1, 0 }, col.RepetitionLevels!); + Assert.Equal(new int[] { 2, 2, 2, 2 }, col.DefinitionLevels!); + } + + [Fact] + public void Field_4_Name_Language_Code() { + FieldStriper striper = _striper.FieldStripers[3]; + ShreddedColumn col = striper.Stripe(striper.Field, Document.Both); + Assert.Equal(new string[] { "en-us", "en", "en-gb" }, col.Data); + Assert.Equal(new int[] { 0, 2, 1, 1, 0 }, col.RepetitionLevels!); + Assert.Equal(new int[] { 7, 7, 4, 7, 4 }, col.DefinitionLevels!); + } + + [Fact] + public void Field_5_Name_Language_Country() { + FieldStriper striper = _striper.FieldStripers[4]; + ShreddedColumn col = striper.Stripe(striper.Field, Document.Both); + Assert.Equal(new string[] { "us", "gb" }, col.Data); + Assert.Equal(new int[] { 0, 2, 1, 1, 0 }, col.RepetitionLevels!); + Assert.Equal(new int[] { 7, 6, 4, 7, 4 }, col.DefinitionLevels!); + } + + [Fact] + public void Field_6_Name_Url() { + FieldStriper striper = _striper.FieldStripers[5]; + ShreddedColumn col = striper.Stripe(striper.Field, Document.Both); + Assert.Equal(new string[] { "http://A", "http://B", "http://C" }, col.Data); + Assert.Equal(new int[] { 0, 1, 1, 0 }, col.RepetitionLevels!); + Assert.Equal(new int[] { 4, 4, 3, 4 }, col.DefinitionLevels!); + } + } +} diff --git a/src/Parquet.Test/Serialisation/InheritedPropertiesTest.cs b/src/Parquet.Test/Serialisation/InheritedPropertiesTest.cs index 87901b0c..69b56dff 100644 --- a/src/Parquet.Test/Serialisation/InheritedPropertiesTest.cs +++ b/src/Parquet.Test/Serialisation/InheritedPropertiesTest.cs @@ -4,6 +4,7 @@ using Parquet.Schema; using Parquet.Serialization; using Xunit; +using System; namespace Parquet.Test.Serialisation { public class InheritedPropertiesTest : TestBase { @@ -21,7 +22,7 @@ private InheritedClass[] GenerateRecordsToSerialize() { return recordsToSerialize; } - [Fact] + [Obsolete, Fact] public async Task Serialize_class_with_inherited_properties() { InheritedClass[] 
recordsToSerialize = GenerateRecordsToSerialize(); ParquetSchema schema = typeof(InheritedClass).GetParquetSchema(true); diff --git a/src/Parquet.Test/Serialisation/Paper/Document.cs b/src/Parquet.Test/Serialisation/Paper/Document.cs new file mode 100644 index 00000000..54e9da35 --- /dev/null +++ b/src/Parquet.Test/Serialisation/Paper/Document.cs @@ -0,0 +1,105 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Parquet.Data; +using Parquet.Schema; + +namespace Parquet.Test.Serialisation.Paper { + + class Document { + public long DocId { get; set; } + + public Links? Links { get; set; } + + public List? Name { get; set; } + + public static Document R1 => new() { + DocId = 10, + Links = new Links { + Forward = new List { 20, 40, 60 } + }, + Name = new List { + new Name { + Language = new List { + new Language { + Code = "en-us", + Country = "us" + }, + new Language { + Code = "en" + } + }, + Url = "http://A" + }, + new Name { + Url = "http://B" + }, + new Name { + Language = new List { + new Language { + Code = "en-gb", + Country = "gb" + } + } + } + } + }; + + public static Document R2 => new() { + DocId = 20, + Links = new Links { + Backward = new List { 10, 30 }, + Forward = new List { 80 } + }, + Name = new List { + new Name { + Url = "http://C" + } + } + }; + + public static List Both => new List{ R1, R2 }; + + public static DataColumn[] RawColumns { + get { + return new DataColumn[] { + new DataColumn(new DataField("DocId"), + new long[] { 10, 20 }, + (List?)null, + null, + false), + + new DataColumn(new DataField("Backward"), + new long[] { 10, 30 }, + new List { 1, 2, 2 }, + new() { 0, 0, 1 }, + false), + + new DataColumn(new DataField("Forward"), + new long[] { 20, 40, 60, 80 }, + new List { 2, 2, 2, 2 }, + new() { 0, 1, 1, 0 }, + false), + + new DataColumn(new DataField("Code"), + new string[] { "en-us", "en", "en-gb" }, + new List { 7, 7, 2, 7, 2 }, + new() { 0, 2, 1, 1, 0 }, + false), + + new DataColumn(new 
DataField("Country"), + new string[] { "us", "gb" }, + new List { 7, 6, 4, 7, 4 }, + new() { 0, 2, 1, 1, 0 }, + false), + + new DataColumn(new DataField("Url"), + new string[] { "http://A", "http://B", "http://C" }, + new List { 4, 4, 3, 4 }, + new() { 0, 1, 1, 0 }, + false) + }; + } + } + } +} diff --git a/src/Parquet.Test/Serialisation/Paper/Language.cs b/src/Parquet.Test/Serialisation/Paper/Language.cs new file mode 100644 index 00000000..a18b5ebe --- /dev/null +++ b/src/Parquet.Test/Serialisation/Paper/Language.cs @@ -0,0 +1,11 @@ +namespace Parquet.Test.Serialisation.Paper { + #region [ Paper ] + + class Language { + public string? Code { get; set; } + + public string? Country { get; set; } + } + + #endregion +} diff --git a/src/Parquet.Test/Serialisation/Paper/Links.cs b/src/Parquet.Test/Serialisation/Paper/Links.cs new file mode 100644 index 00000000..2790e731 --- /dev/null +++ b/src/Parquet.Test/Serialisation/Paper/Links.cs @@ -0,0 +1,13 @@ +using System.Collections.Generic; + +namespace Parquet.Test.Serialisation.Paper { + #region [ Paper ] + + class Links { + public List? Backward { get; set; } + + public List? Forward { get; set; } + } + + #endregion +} diff --git a/src/Parquet.Test/Serialisation/Paper/Name.cs b/src/Parquet.Test/Serialisation/Paper/Name.cs new file mode 100644 index 00000000..a0016a99 --- /dev/null +++ b/src/Parquet.Test/Serialisation/Paper/Name.cs @@ -0,0 +1,14 @@ +using System.Collections.Generic; + +namespace Parquet.Test.Serialisation.Paper { + #region [ Paper ] + + class Name { + + public List? Language { get; set; } + + public string? 
Url { get; set; } + } + + #endregion +} diff --git a/src/Parquet.Test/Serialisation/ParquetConvertTest.cs b/src/Parquet.Test/Serialisation/ParquetConvertTest.cs index 13ced2a7..af3d04ec 100644 --- a/src/Parquet.Test/Serialisation/ParquetConvertTest.cs +++ b/src/Parquet.Test/Serialisation/ParquetConvertTest.cs @@ -10,6 +10,8 @@ using Xunit; namespace Parquet.Test.Serialisation { + + [Obsolete] public class ParquetConvertTest : TestBase { [Fact] @@ -711,6 +713,9 @@ public class SimpleStructureWithFewProperties { public int Id { get; set; } public string? Name { get; set; } } + +#pragma warning disable CS0618 // Type or member is obsolete + public class StructureWithIgnoredProperties { public int Id { get; set; } public string? Name { get; set; } @@ -734,6 +739,8 @@ public class StructureWithIgnoredProperties { [ParquetIgnore] public decimal? NullableDecimal { get; set; } } +#pragma warning restore CS0618 // Type or member is obsolete + public class SimpleRenamed { public int Id { get; set; } @@ -778,10 +785,10 @@ public class SimpleMap { public DateTime Timestamp { get; set; } [JsonPropertyName("ip")] - public string IpAddress { get; set; } + public string? IpAddress { get; set; } [JsonPropertyName("tags")] - public Dictionary Tags { get; set; } + public Dictionary? 
Tags { get; set; } } } } \ No newline at end of file diff --git a/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs b/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs index 63d67e4f..e7891e7a 100644 --- a/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs +++ b/src/Parquet.Test/Serialisation/ParquetSerializerTest.cs @@ -1,35 +1,25 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.IO; using System.Linq; -using System.Text; using System.Threading.Tasks; using Parquet.Serialization; +using Parquet.Test.Xunit; using Xunit; namespace Parquet.Test.Serialisation { public class ParquetSerializerTest { - class Record : IEquatable { + class Record { public DateTime Timestamp { get; set; } public string? EventName { get; set; } public double MeterValue { get; set; } - - public bool Equals(Record? other) { - if(other == null) - return false; - - return Timestamp == other.Timestamp && - EventName == other.EventName && - MeterValue == other.MeterValue; - } } [Fact] - public async Task SerializeDeserializeRecord() { + public async Task Atomics_Simplest_Serde() { - var data = Enumerable.Range(0, 1_000_000).Select(i => new Record { + var data = Enumerable.Range(0, 1_000).Select(i => new Record { Timestamp = DateTime.UtcNow.AddSeconds(i), EventName = i % 2 == 0 ? "on" : "off", MeterValue = i @@ -41,7 +31,176 @@ public async Task SerializeDeserializeRecord() { ms.Position = 0; IList data2 = await ParquetSerializer.DeserializeAsync(ms); - Assert.Equal(data2, data); + Assert.Equivalent(data2, data); + } + + class NullableRecord : Record { + public int? ParentId { get; set; } + } + + [Fact] + public async Task Atomics_Nullable_Serde() { + + var data = Enumerable.Range(0, 1_000).Select(i => new NullableRecord { + Timestamp = DateTime.UtcNow.AddSeconds(i), + EventName = i % 2 == 0 ? "on" : "off", + MeterValue = i, + ParentId = (i % 4 == 0) ? 
null : i + }).ToList(); + + using var ms = new MemoryStream(); + await ParquetSerializer.SerializeAsync(data, ms); + + ms.Position = 0; + IList data2 = await ParquetSerializer.DeserializeAsync(ms); + + Assert.Equivalent(data2, data); + } + + class Address { + public string? Country { get; set; } + + public string? City { get; set; } + } + + class AddressBookEntry { + public string? FirstName { get; set; } + + public string? LastName { get; set; } + + public Address? Address { get; set; } + } + + [Fact] + public async Task Struct_Serde() { + + var data = Enumerable.Range(0, 1_000).Select(i => new AddressBookEntry { + FirstName = "Joe", + LastName = "Bloggs", + Address = new Address() { + Country = "UK", + City = "Unknown" + } + }).ToList(); + + using var ms = new MemoryStream(); + await ParquetSerializer.SerializeAsync(data, ms); + + ms.Position = 0; + IList data2 = await ParquetSerializer.DeserializeAsync(ms); + + Assert.Equivalent(data2, data); + } + + class MovementHistory { + public int? PersonId { get; set; } + + public string? Comments { get; set; } + + public List
? Addresses { get; set; } + } + + [Fact] + public async Task Struct_WithNullProps_Serde() { + + var data = Enumerable.Range(0, 1_000).Select(i => new AddressBookEntry { + FirstName = "Joe", + LastName = "Bloggs" + // Address is null + }).ToList(); + + using var ms = new MemoryStream(); + await ParquetSerializer.SerializeAsync(data, ms); + + ms.Position = 0; + IList data2 = await ParquetSerializer.DeserializeAsync(ms); + + Assert.Equivalent(data2, data); + } + + [Fact] + public async Task List_Structs_Serde() { + var data = Enumerable.Range(0, 1_000).Select(i => new MovementHistory { + PersonId = i, + Comments = i % 2 == 0 ? "none" : null, + Addresses = Enumerable.Range(0, 4).Select(a => new Address { + City = "Birmingham", + Country = "United Kingdom" + }).ToList() + }).ToList(); + + // serialise + using var ms = new MemoryStream(); + await ParquetSerializer.SerializeAsync(data, ms); + //await ParquetSerializer.SerializeAsync(data, "c:\\tmp\\ls.parquet"); + + // deserialise + ms.Position = 0; + IList data2 = await ParquetSerializer.DeserializeAsync(ms); + + // assert + XAssert.JsonEquivalent(data, data2); + + } + + class MovementHistoryCompressed { + public int? PersonId { get; set; } + + public List? ParentIds { get; set; } + } + + [Fact] + public async Task List_Atomics_Serde() { + + var data = Enumerable.Range(0, 100).Select(i => new MovementHistoryCompressed { + PersonId = i, + ParentIds = Enumerable.Range(i, 4).ToList() + }).ToList(); + + // serialise + using var ms = new MemoryStream(); + await ParquetSerializer.SerializeAsync(data, ms); + //await ParquetSerializer.SerializeAsync(data, "c:\\tmp\\lat.parquet"); + + // deserialise + ms.Position = 0; + IList data2 = await ParquetSerializer.DeserializeAsync(ms); + + // assert + Assert.Equivalent(data, data2); + + } + + class IdWithTags { + public int Id { get; set; } + + public Dictionary? 
Tags { get; set; } + } + + + [Fact] + public async Task Map_Simple_Serde() { + var data = Enumerable.Range(0, 10).Select(i => new IdWithTags { + Id = i, + Tags = new Dictionary { + ["id"] = i.ToString(), + ["gen"] = DateTime.UtcNow.ToString() + }}).ToList(); + + var t = new Dictionary(); + + // serialise + using var ms = new MemoryStream(); + await ParquetSerializer.SerializeAsync(data, ms); + //await ParquetSerializer.SerializeAsync(data, "c:\\tmp\\map.parquet"); + + // deserialise + ms.Position = 0; + IList data2 = await ParquetSerializer.DeserializeAsync(ms); + + // assert + XAssert.JsonEquivalent(data, data2); + } } } diff --git a/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs b/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs index aaab5014..c9f3dddd 100644 --- a/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs +++ b/src/Parquet.Test/Serialisation/SchemaReflectorTest.cs @@ -1,3 +1,4 @@ +using System; using System.Collections.Generic; using System.Text.Json.Serialization; using Parquet.Schema; @@ -119,6 +120,8 @@ public void AliasedProperties() { ), schema); } +#pragma warning disable CS0618 // Type or member is obsolete + class IgnoredPoco { public int NotIgnored { get; set; } @@ -129,6 +132,8 @@ class IgnoredPoco { [JsonIgnore] public int Ignored2 { get; set; } } +#pragma warning restore CS0618 // Type or member is obsolete + [Fact] public void IgnoredProperties() { @@ -154,5 +159,51 @@ public void SimpleMap() { new DataField("Key"), new DataField("Value"))), schema); } + + class StructMemberPoco { + public string? FirstName { get; set; } + + public string? LastName { get; set; } + } + + class StructMasterPoco { + public int Id { get; set; } + + public StructMemberPoco? 
Name { get; set; } + } + + [Fact] + public void SimpleStruct() { + ParquetSchema schema = typeof(StructMasterPoco).GetParquetSchema(true); + + Assert.Equal(new ParquetSchema( + new DataField("Id"), + new StructField("Name", + new DataField("FirstName"), + new DataField("LastName") + )), schema); + } + + class ListOfStructsPoco { + public int Id { get; set; } + + public List? Members { get; set; } + } + + [Fact] + public void ListOfStructs() { + ParquetSchema actualSchema = typeof(ListOfStructsPoco).GetParquetSchema(true); + + var expectedSchema = new ParquetSchema( + new DataField("Id"), + new ListField("Members", + new StructField("element", + new DataField("FirstName"), + new DataField("LastName")))); + + Assert.True( + expectedSchema.Equals(actualSchema), + expectedSchema.GetNotEqualsMessage(actualSchema, "expected", "actual")); + } } } \ No newline at end of file diff --git a/src/Parquet.Test/StatisticsTest.cs b/src/Parquet.Test/StatisticsTest.cs index 868d9985..b94a6c6b 100644 --- a/src/Parquet.Test/StatisticsTest.cs +++ b/src/Parquet.Test/StatisticsTest.cs @@ -129,9 +129,9 @@ public async Task Distinct_stat_for_basic_data_types(string name) { var id = new DataField("id", test!.Type!); - DataColumn rc = await WriteReadSingleColumn(id, new DataColumn(id, test!.Data!)); + DataColumn? 
rc = await WriteReadSingleColumn(id, new DataColumn(id, test!.Data!)); - Assert.Equal(test.Data!.Length, rc.CalculateRowCount()); + Assert.Equal(test.Data!.Length, rc!.CalculateRowCount()); //Assert.Equal(test.DistinctCount, rc.Statistics.DistinctCount); Assert.Equal(test.NullCount, rc.Statistics.NullCount); Assert.Equal(test.Min, rc.Statistics.MinValue); diff --git a/src/Parquet.Test/StructureTest.cs b/src/Parquet.Test/StructureTest.cs deleted file mode 100644 index 529f4f06..00000000 --- a/src/Parquet.Test/StructureTest.cs +++ /dev/null @@ -1,33 +0,0 @@ -using System.IO; -using System.Threading.Tasks; -using Parquet.Data; -using Parquet.Schema; -using Xunit; - -namespace Parquet.Test { - public class StructureTest : TestBase { - [Fact] - public async Task Simple_structure_write_read() { - var schema = new ParquetSchema( - new DataField("name"), - new StructField("address", - new DataField("line1"), - new DataField("postcode") - )); - - var ms = new MemoryStream(); - await ms.WriteSingleRowGroupParquetFileAsync(schema, - new DataColumn(new DataField("name"), new[] { "Hazel" }), - new DataColumn(new DataField("line1"), new[] { "woods" }), - new DataColumn(new DataField("postcode"), new[] { "postcode" })); - ms.Position = 0; - - // out Schema readSchema, out DataColumn[] readColumns - (ParquetSchema readSchema, DataColumn[] readColumns) = await ms.ReadSingleRowGroupParquetFile(); - - Assert.Equal("Hazel", readColumns[0].Data.GetValue(0)); - Assert.Equal("woods", readColumns[1].Data.GetValue(0)); - Assert.Equal("postcode", readColumns[2].Data.GetValue(0)); - } - } -} \ No newline at end of file diff --git a/src/Parquet.Test/TestBase.cs b/src/Parquet.Test/TestBase.cs index a5fbc815..d5e5bb65 100644 --- a/src/Parquet.Test/TestBase.cs +++ b/src/Parquet.Test/TestBase.cs @@ -15,14 +15,14 @@ protected Stream OpenTestFile(string name) { return F.OpenRead("./data/" + name); } + [Obsolete] protected async Task ConvertSerialiseDeserialise(IEnumerable instances) where T : new() 
{ - using(var ms = new MemoryStream()) { - ParquetSchema s = await ParquetConvert.SerializeAsync(instances, ms); + using var ms = new MemoryStream(); + ParquetSchema s = await ParquetConvert.SerializeAsync(instances, ms); - ms.Position = 0; + ms.Position = 0; - return await ParquetConvert.DeserializeAsync(ms); - } + return await ParquetConvert.DeserializeAsync(ms); } protected async Task ReadTestFileAsTableAsync(string name) { @@ -48,46 +48,39 @@ protected async Task
WriteReadAsync(Table table, bool saveLocal = false) } } - protected async Task WriteReadSingleColumn(DataField field, DataColumn dataColumn) { - using(var ms = new MemoryStream()) { - // write with built-in extension method - await ms.WriteSingleRowGroupParquetFileAsync(new ParquetSchema(field), dataColumn); - ms.Position = 0; - - //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray()); + protected async Task WriteReadSingleColumn(DataField field, DataColumn dataColumn) { + using var ms = new MemoryStream(); + // write with built-in extension method + await ms.WriteSingleRowGroupParquetFileAsync(new ParquetSchema(field), dataColumn); + ms.Position = 0; - // read first gow group and first column - using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { - if(reader.RowGroupCount == 0) - return null; - ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0); + //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray()); - return await rgReader.ReadColumnAsync(field); - } + // read first row group and first column + using ParquetReader reader = await ParquetReader.CreateAsync(ms); + if(reader.RowGroupCount == 0) + return null; + ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0); - - } + return await rgReader.ReadColumnAsync(field); } protected async Task> WriteReadSingleRowGroup( ParquetSchema schema, DataColumn[] columns) { ParquetSchema readSchema; - using(var ms = new MemoryStream()) { - await ms.WriteSingleRowGroupParquetFileAsync(schema, columns); - ms.Position = 0; + using var ms = new MemoryStream(); + await ms.WriteSingleRowGroupParquetFileAsync(schema, columns); + ms.Position = 0; - //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray()); + //System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray()); - using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { - readSchema = reader.Schema; + using ParquetReader reader = await ParquetReader.CreateAsync(ms); + readSchema = reader.Schema; 
- using(ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0)) { - return Tuple.Create(await columns.Select(c => - rgReader.ReadColumnAsync(c.Field)) - .SequentialWhenAll(), readSchema); - } - } - } + using ParquetRowGroupReader rgReader = reader.OpenRowGroupReader(0); + return Tuple.Create(await columns.Select(c => + rgReader.ReadColumnAsync(c.Field)) + .SequentialWhenAll(), readSchema); } protected async Task WriteReadSingle(DataField field, object? value, CompressionMethod compressionMethod = CompressionMethod.None) { @@ -100,13 +93,12 @@ protected async Task WriteReadSingle(DataField field, object? value, Com using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(field), ms)) { writer.CompressionMethod = compressionMethod; - using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) { - Array dataArray = Array.CreateInstance(field.ClrNullableIfHasNullsType, 1); - dataArray.SetValue(value, 0); - var column = new DataColumn(field, dataArray); + using ParquetRowGroupWriter rg = writer.CreateRowGroup(); + Array dataArray = Array.CreateInstance(field.ClrNullableIfHasNullsType, 1); + dataArray.SetValue(value, 0); + var column = new DataColumn(field, dataArray); - await rg.WriteColumnAsync(column); - } + await rg.WriteColumnAsync(column); } data = ms.ToArray(); @@ -116,13 +108,11 @@ protected async Task WriteReadSingle(DataField field, object? 
value, Com // read back single value ms.Position = 0; - using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { - using(ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0)) { - DataColumn column = await rowGroupReader.ReadColumnAsync(field); + using ParquetReader reader = await ParquetReader.CreateAsync(ms); + using ParquetRowGroupReader rowGroupReader = reader.OpenRowGroupReader(0); + DataColumn column = await rowGroupReader.ReadColumnAsync(field); - return column.Data.GetValue(0); - } - } + return column.Data.GetValue(0)!; } } } diff --git a/src/Parquet.Test/Types/ListTest.cs b/src/Parquet.Test/Types/ListTest.cs new file mode 100644 index 00000000..dee317d9 --- /dev/null +++ b/src/Parquet.Test/Types/ListTest.cs @@ -0,0 +1,55 @@ +using System.Threading.Tasks; +using Parquet.Data; +using Parquet.Schema; +using Xunit; + +namespace Parquet.Test.Types { + public class ListTest : TestBase { + [Fact] + public async Task List_of_structures_writes_reads() { + var nameField = new DataField("name"); + var line1Field = new DataField("line1"); + var postcodeField = new DataField("postcode"); + + var schema = new ParquetSchema( + nameField, + new ListField("addresses", + new StructField(ListField.ElementName, + line1Field, + postcodeField))); + + var nameCol = new DataColumn(nameField, new string[] { "Joe", "Bob" }); + var line1Col = new DataColumn(line1Field, new[] { "Amazonland", "Disneyland", "Cryptoland" }, new[] { 0, 1, 0 }); + var postcodeCol = new DataColumn(postcodeField, new[] { "AAABBB", "CCCDDD", "EEEFFF" }, new[] { 0, 1, 0 }); + + await WriteReadSingleRowGroup(schema, new[] { nameCol, line1Col, postcodeCol }); + } + + [Fact] + public async Task List_of_elements_with_some_items_empty_reads_file() { + /* + list data: + - 1: [1, 2, 3] + - 2: [] + - 3: [1, 2, 3] + - 4: [] + */ + + using(ParquetReader reader = await ParquetReader.CreateAsync(OpenTestFile("list_empty_alt.parquet"))) + + using(ParquetRowGroupReader groupReader = 
reader.OpenRowGroupReader(0)) { + Assert.Equal(4, groupReader.RowCount); + DataField[] fs = reader.Schema.GetDataFields(); + + DataColumn id = await groupReader.ReadColumnAsync(fs[0]); + Assert.Equal(4, id.Data.Length); + Assert.False(id.HasRepetitions); + + DataColumn list = await groupReader.ReadColumnAsync(fs[1]); + Assert.Equal(8, list.Data.Length); + Assert.Equal(new int[] { 0, 1, 1, 0, 0, 1, 1, 0 }, list.RepetitionLevels); + } + + } + } +} \ No newline at end of file diff --git a/src/Parquet.Test/Types/RepeatableFieldsTest.cs b/src/Parquet.Test/Types/RepeatableFieldsTest.cs index 3d546e3e..6c42fd78 100644 --- a/src/Parquet.Test/Types/RepeatableFieldsTest.cs +++ b/src/Parquet.Test/Types/RepeatableFieldsTest.cs @@ -17,11 +17,11 @@ public async Task Simple_repeated_field_write_read() { new int[] { 0, 1, 1, 0, 1 }); // act - DataColumn rc = await WriteReadSingleColumn(field, column); + DataColumn? rc = await WriteReadSingleColumn(field, column); // assert - Assert.Equal(new int[] { 1, 2, 3, 4, 5 }, rc.Data); - Assert.Equal(new int[] { 0, 1, 1, 0, 1 }, rc.RepetitionLevels); + Assert.Equal(new int[] { 1, 2, 3, 4, 5 }, rc!.Data); + Assert.Equal(new int[] { 0, 1, 1, 0, 1 }, rc!.RepetitionLevels); } } } diff --git a/src/Parquet.Test/Types/StructureTest.cs b/src/Parquet.Test/Types/StructureTest.cs new file mode 100644 index 00000000..07e031b1 --- /dev/null +++ b/src/Parquet.Test/Types/StructureTest.cs @@ -0,0 +1,54 @@ +using System.IO; +using System.Threading.Tasks; +using Parquet.Data; +using Parquet.Schema; +using Xunit; + +namespace Parquet.Test.Types { + public class StructureTest : TestBase { + + /// + /// This method is used in documentation, keep formatting clear + /// + /// + [Fact] + public async Task Simple_structure_write_read() { + var schema = new ParquetSchema( + new DataField("name"), + new StructField("address", + new DataField("line1"), + new DataField("postcode") + )); + + using var ms = new MemoryStream(); + using(ParquetWriter writer = await 
ParquetWriter.CreateAsync(schema, ms)) { + ParquetRowGroupWriter rgw = writer.CreateRowGroup(); + + await rgw.WriteColumnAsync( + new DataColumn((DataField)schema[0], new[] { "Joe" })); + + await rgw.WriteColumnAsync( + new DataColumn((DataField)schema[1].NaturalChildren[0], new[] { "Amazonland" })); + + await rgw.WriteColumnAsync( + new DataColumn((DataField)schema[1].NaturalChildren[1], new[] { "AAABBB" })); + } + + ms.Position = 0; + + using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) { + using ParquetRowGroupReader rg = reader.OpenRowGroupReader(0); + + DataField[] dataFields = reader.Schema.GetDataFields(); + + DataColumn name = await rg.ReadColumnAsync(dataFields[0]); + DataColumn line1 = await rg.ReadColumnAsync(dataFields[1]); + DataColumn postcode = await rg.ReadColumnAsync(dataFields[2]); + + Assert.Equal(new[] { "Joe" }, name.Data); + Assert.Equal(new[] { "Amazonland" }, line1.Data); + Assert.Equal(new[] { "AAABBB" }, postcode.Data); + } + } + } +} \ No newline at end of file diff --git a/src/Parquet.Test/Xunit/XAssert.cs b/src/Parquet.Test/Xunit/XAssert.cs new file mode 100644 index 00000000..a55c6c16 --- /dev/null +++ b/src/Parquet.Test/Xunit/XAssert.cs @@ -0,0 +1,13 @@ +using System.Text.Json; +using Xunit; + +namespace Parquet.Test.Xunit { + public static class XAssert { + public static void JsonEquivalent(object? expected, object? 
actual) { + string expectedJson = JsonSerializer.Serialize(expected); + string actualJson = JsonSerializer.Serialize(actual); + + Assert.Equal(expectedJson, actualJson); + } + } +} diff --git a/src/Parquet.Test/data/trailing_dot_col_name.parquet b/src/Parquet.Test/data/trailing_dot_col_name.parquet new file mode 100644 index 00000000..b76d3731 Binary files /dev/null and b/src/Parquet.Test/data/trailing_dot_col_name.parquet differ diff --git a/src/Parquet/Data/DataColumn.cs b/src/Parquet/Data/DataColumn.cs index d453a789..f0ff2907 100644 --- a/src/Parquet/Data/DataColumn.cs +++ b/src/Parquet/Data/DataColumn.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Linq; using Parquet.Extensions; using Parquet.Schema; @@ -12,11 +13,6 @@ public class DataColumn { private readonly int _offset; private readonly int _count = -1; - private DataColumn(DataField field) { - Field = field ?? throw new ArgumentNullException(nameof(field)); - Data = Array.Empty(); - } - /// /// /// @@ -35,7 +31,8 @@ public DataColumn(DataField field, Array data, int[]? repetitionLevels = null) /// /// /// - public DataColumn(DataField field, Array data, int offset, int count, int[]? repetitionLevels = null) : this(field) { + public DataColumn(DataField field, Array data, int offset, int count, int[]? repetitionLevels = null) { + Field = field ?? throw new ArgumentNullException(nameof(field)); Data = data ?? throw new ArgumentNullException(nameof(data)); _offset = offset; _count = count; @@ -45,20 +42,42 @@ public DataColumn(DataField field, Array data, int offset, int count, int[]? rep internal DataColumn(DataField field, Array definedData, - Span definitionLevels, int maxDefinitionLevel, - int[]? repetitionLevels, int maxRepetitionLevel) : this(field) { + Span definitionLevels, + int[]? repetitionLevels, + bool unpackDefinitions = true) { + Field = field ?? throw new ArgumentNullException(nameof(field)); Data = definedData; // 1. 
Apply definitions - if(definitionLevels != null) { - Data = field.UnpackDefinitions(Data, definitionLevels, maxDefinitionLevel); + if(unpackDefinitions) { + if(definitionLevels != null) { + Data = field.UnpackDefinitions(Data, definitionLevels); + } + } else { + if(definitionLevels != null) { + DefinitionLevels = definitionLevels.ToArray(); + } } // 2. Apply repetitions RepetitionLevels = repetitionLevels; } + /// + /// Convenience used by Dremel algorithm + /// + internal DataColumn(DataField df, Array values, List? dls, List? rls, bool unpackDefinitions = true) { + Field = df ?? throw new ArgumentNullException(nameof(df)); + Data = values; + if(dls != null && unpackDefinitions) { + Data = df.UnpackDefinitions(Data, dls.ToArray().AsSpan()); + } else { + DefinitionLevels = dls?.ToArray(); + } + RepetitionLevels = rls?.ToArray(); + } + /// /// Column data where definition levels are already applied /// @@ -84,16 +103,17 @@ internal DataColumn(DataField field, /// /// When T is invalid type public Span AsSpan(int? offset = null, int? count = null) { - if(Data is not T[] ar) - throw new InvalidOperationException($"data is not castable to {typeof(T)}[]"); - - return ar.AsSpan(offset ?? Offset, count ?? Count); + return Data is not T[] ar + ? throw new InvalidOperationException($"data is not castable to {typeof(T)}[]") + : ar.AsSpan(offset ?? Offset, count ?? Count); } + internal int[]? DefinitionLevels { get; } + /// /// Repetition levels if any. /// - public int[]? RepetitionLevels { get; private set; } + public int[]? 
RepetitionLevels { get; } /// /// Data field @@ -116,21 +136,12 @@ internal int CalculateNullCount() { return Data.CalculateNullCountFast(Offset, Count); } - internal void PackDefinitions(Span definitions, + internal static void PackDefinitions(Span definitions, Array data, int dataOffset, int dataCount, Array packedData, - int maxDefinitionLevel) { - - data.PackNullsFast(dataOffset, dataCount, packedData, definitions, maxDefinitionLevel); - } - - internal long CalculateRowCount() { - if(Field.MaxRepetitionLevel > 0) { - return RepetitionLevels?.Count(rl => rl == 0) ?? 0; - } + int maxDefinitionLevel) => data.PackNullsFast(dataOffset, dataCount, packedData, definitions, maxDefinitionLevel); - return Count; - } + internal long CalculateRowCount() => Field.MaxRepetitionLevel > 0 ? RepetitionLevels?.Count(rl => rl == 0) ?? 0 : Count; /// /// pretty print diff --git a/src/Parquet/Encodings/SchemaEncoder.cs b/src/Parquet/Encodings/SchemaEncoder.cs index 3af164cc..aa088720 100644 --- a/src/Parquet/Encodings/SchemaEncoder.cs +++ b/src/Parquet/Encodings/SchemaEncoder.cs @@ -178,33 +178,37 @@ static bool TryBuildList(List schema, } Thrift.SchemaElement tseList = schema[index]; - field = ListField.CreateWithNoItem(tseList.Name); + field = ListField.CreateWithNoItem(tseList.Name, tseList.Repetition_type != FieldRepetitionType.REQUIRED); //https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules Thrift.SchemaElement tseRepeated = schema[index + 1]; - //Rule 1. If the repeated field is not a group, then its type is the element type and elements are required. - //not implemented + // Rule 1. If the repeated field is not a group, then its type is the element type and elements are required. + // todo: not implemented - //Rule 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required. - //not implemented + // Rule 2. 
If the repeated field is a group with multiple fields, then its type is the element type and elements are required. + // todo: not implemented - //Rule 3. f the repeated field is a group with one field and is named either array or uses - //the LIST-annotated group's name with _tuple appended then the repeated type is the element - //type and elements are required. + // Rule 3. If the repeated field is a group with one field and is named either "array" or uses + // the "LIST"-annotated group's name with "_tuple" appended then the repeated type is the element + // type and elements are required. + // todo: not implemented fully, only "array" // "group with one field and is named either array": if(tseList.Num_children == 1 && tseRepeated.Name == "array") { - field.Path = tseList.Name; + field.Path = new FieldPath(tseList.Name); index += 1; //only skip this element ownedChildren = 1; return true; } + // Normal "modern" LIST: //as we are skipping elements set path hint - field.Path = new FieldPath(tseList.Name, schema[index + 1].Name); + Thrift.SchemaElement tseRepeatedGroup = schema[index + 1]; + field.Path = new FieldPath(tseList.Name, tseRepeatedGroup.Name); + field.GroupSchemaElement = tseRepeatedGroup; index += 2; //skip this element and child container - ownedChildren = 1; //we should get this element assigned back + ownedChildren = 1; //we should get this element assigned back return true; } @@ -230,8 +234,11 @@ static bool TryBuildMap(List schema, //followed by a key and a value, but we declared them as owned - var map = new MapField(root.Name); - map.Path = new FieldPath(root.Name, tseContainer.Name); + var map = new MapField(root.Name) { + Path = new FieldPath(root.Name, tseContainer.Name), + IsNullable = root.Repetition_type != FieldRepetitionType.REQUIRED, + GroupSchemaElement = tseContainer + }; index += 1; ownedChildren = 2; @@ -253,6 +260,7 @@ static bool TryBuildStruct(List schema, index++; ownedChildren = container.Num_children; //make then owned to 
receive in .Assign() field = StructField.CreateWithNoElements(container.Name); + field.IsNullable = container.Repetition_type != FieldRepetitionType.REQUIRED; return true; } @@ -397,7 +405,7 @@ private static void Encode(ListField listField, Thrift.SchemaElement parent, ILi //add list container var root = new Thrift.SchemaElement(listField.Name) { Converted_type = Thrift.ConvertedType.LIST, - Repetition_type = Thrift.FieldRepetitionType.OPTIONAL, + Repetition_type = listField.IsNullable ? Thrift.FieldRepetitionType.OPTIONAL : Thrift.FieldRepetitionType.REQUIRED, Num_children = 1 //field container below }; container.Add(root); diff --git a/src/Parquet/Extensions/ExpressionExtensions.cs b/src/Parquet/Extensions/ExpressionExtensions.cs new file mode 100644 index 00000000..cf0c5d8d --- /dev/null +++ b/src/Parquet/Extensions/ExpressionExtensions.cs @@ -0,0 +1,127 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq.Expressions; +using System.Reflection; + +namespace Parquet.Extensions { + static class ExpressionExtensions { + public static Expression Loop(this Expression iteration, + Expression collection, + Type elementType, + ParameterExpression element) { + + Type enumeratorGenericType = typeof(IEnumerator<>).MakeGenericType(elementType); + Type enumerableGenericType = typeof(IEnumerable<>).MakeGenericType(elementType); + + ParameterExpression enumeratorVar = Expression.Variable(enumeratorGenericType); + MethodCallExpression getEnumeratorCall = Expression.Call(collection, + enumerableGenericType.GetMethod(nameof(IEnumerable.GetEnumerator))!); + MethodCallExpression moveNextCall = Expression.Call(enumeratorVar, + typeof(IEnumerator).GetMethod(nameof(IEnumerator.MoveNext))!); + LabelTarget loopBreakLabel = Expression.Label(); + + // doc: Expression.Loop is an infinite loop that can be exited with "break" + LoopExpression loop = Expression.Loop( + Expression.IfThenElse( + + // test + Expression.Equal(moveNextCall, 
Expression.Constant(true)), + + // if true + Expression.Block( + //new[] { classElementVar }, + + // get class element into loopVar + Expression.Assign(element, Expression.Property(enumeratorVar, nameof(IEnumerator.Current))), + + iteration), + + // if false + Expression.Break(loopBreakLabel) + ), loopBreakLabel); + + return Expression.Block( + new[] { enumeratorVar, element }, + + // get enumerator from class collection + Expression.Assign(enumeratorVar, getEnumeratorCall), + + // loop over classes + loop); + } + + public static Expression ForLoop(this Expression fromVar, Expression toVar, + ParameterExpression iVar, Expression body) { + + LabelTarget loopBreakLabel = Expression.Label(); + return Expression.Block( + new[] { iVar }, + Expression.Assign(iVar, fromVar), + + Expression.Loop( + Expression.IfThenElse( + Expression.LessThan(iVar, toVar), + + Expression.Block( + body, + Expression.PostIncrementAssign(iVar)), + + Expression.Break(loopBreakLabel)), + loopBreakLabel) + ); + } + + /// + /// Calls + /// + public static Expression ClearArray(this ParameterExpression array, Expression? fromIndexVar = null) { + + Expression from = fromIndexVar ?? Expression.Constant(0); + Expression length = Expression.Property(array, nameof(Array.Length)); + if(fromIndexVar != null) length = Expression.Subtract(length, fromIndexVar); + + return Expression.Call( + typeof(Array).GetMethod( + nameof(Array.Clear), + BindingFlags.Static | BindingFlags.Public, + null, + new[] { typeof(Array), typeof(int), typeof(int) }, + null)!, + array, from, length); + } + + public static Expression CollectionCount(this Expression collection, Type collectionType) { + return Expression.Property(collection, nameof(IReadOnlyCollection.Count)); + } + + public static Expression CollectionAdd(this Expression collection, Type collectionType, Expression element, Type elementType) { + + MethodInfo? 
method = collectionType.GetMethod(nameof(IList.Add), new[] { elementType }); + + if(method == null) + throw new NotSupportedException($"can't find {nameof(IList.Add)} method"); + + return Expression.Call( + collection, + method, + element); + } + + public static Expression IsNull(this Expression nullableVar) { + return Expression.Equal(nullableVar, Expression.Constant(null)); + } + + /// + /// Gets internal property "DebugView" which is normally only available in Visual Studio debugging session + /// + /// + public static string GetPseudoCode(this Expression expression) { + PropertyInfo? propertyInfo = typeof(Expression).GetProperty("DebugView", BindingFlags.Instance | BindingFlags.NonPublic); + if(propertyInfo == null) + return string.Empty; + + return (string)propertyInfo.GetValue(expression)!; + } + } +} diff --git a/src/Parquet/Extensions/TypeExtensions.cs b/src/Parquet/Extensions/TypeExtensions.cs index c6ade117..f519626b 100644 --- a/src/Parquet/Extensions/TypeExtensions.cs +++ b/src/Parquet/Extensions/TypeExtensions.cs @@ -2,6 +2,7 @@ using System.Collections.Generic; using System.Collections; using System.Reflection; +using System.Linq; namespace Parquet { static class TypeExtensions { @@ -20,7 +21,7 @@ public static IList CreateGenericList(this Type t) { /// /// /// - public static bool TryExtractEnumerableType(this Type t, out Type? baseType) { + public static bool TryExtractIEnumerableType(this Type t, out Type? baseType) { if(typeof(byte[]) == t) { //it's a special case to avoid confustion between byte arrays and repeatable bytes baseType = null; @@ -30,13 +31,13 @@ public static bool TryExtractEnumerableType(this Type t, out Type? 
baseType) { TypeInfo ti = t.GetTypeInfo(); Type[] args = ti.GenericTypeArguments; - if(args.Length == 1) { + if(args.Length > 0) { //check derived interfaces foreach(Type interfaceType in ti.ImplementedInterfaces) { TypeInfo iti = interfaceType.GetTypeInfo(); if(iti.IsGenericType && iti.GetGenericTypeDefinition() == typeof(IEnumerable<>)) { - baseType = ti.GenericTypeArguments[0]; + baseType = iti.GenericTypeArguments[0]; return true; } } @@ -57,10 +58,30 @@ public static bool TryExtractEnumerableType(this Type t, out Type? baseType) { return false; } - public static bool TryExtractDictionaryType(this Type t, out Type? keyType, out Type? valueType) { - TypeInfo ti = t.GetTypeInfo(); + public static Type ExtractElementTypeFromEnumerableType(this Type t) { + if(t.TryExtractIEnumerableType(out Type? iet)) + return iet!; + + throw new ArgumentException($"type {t} is not single-element generic enumerable", nameof(t)); + + } - if(ti.IsGenericType && ti.GetGenericTypeDefinition().GetTypeInfo().IsAssignableFrom(typeof(Dictionary<,>).GetTypeInfo())) { + public static MethodInfo GetGenericListAddMethod(this Type listType) { + Type elementType = listType.ExtractElementTypeFromEnumerableType(); + Type genericListType = typeof(List<>).MakeGenericType(elementType); + MethodInfo? method = genericListType.GetMethod(nameof(IList.Add)); + return method ?? throw new InvalidOperationException("method not present"); + } + + public static bool IsGenericIDictionary(this Type t) { + return t.IsGenericType && + (t.GetGenericTypeDefinition() == typeof(IDictionary<,>) || + t.GetInterfaces().Any(x => x.IsGenericType && typeof(IDictionary<,>) == x.GetGenericTypeDefinition())); + } + + public static bool TryExtractDictionaryType(this Type t, out Type? keyType, out Type? 
valueType) { + if(t.IsGenericIDictionary()) { + TypeInfo ti = t.GetTypeInfo(); keyType = ti.GenericTypeArguments[0]; valueType = ti.GenericTypeArguments[1]; return true; diff --git a/src/Parquet/File/DataColumnReader.cs b/src/Parquet/File/DataColumnReader.cs index 79eae88f..b11ae5a7 100644 --- a/src/Parquet/File/DataColumnReader.cs +++ b/src/Parquet/File/DataColumnReader.cs @@ -18,10 +18,8 @@ class DataColumnReader { private readonly Thrift.ColumnChunk _thriftColumnChunk; private readonly Thrift.SchemaElement? _thriftSchemaElement; private readonly ThriftFooter _footer; - private readonly ParquetOptions? _options; + private readonly ParquetOptions _options; private readonly ThriftStream _thriftStream; - private readonly int _maxRepetitionLevel; - private readonly int _maxDefinitionLevel; public DataColumnReader( DataField dataField, @@ -35,12 +33,13 @@ public DataColumnReader( _footer = footer ?? throw new ArgumentNullException(nameof(footer)); _options = parquetOptions ?? throw new ArgumentNullException(nameof(parquetOptions)); + if(!dataField.IsAttachedToSchema) { + throw new ArgumentException( + $"Field [{dataField}] is not attached to any schema. You need to construct a schema passing in this field first.", + nameof(dataField)); + } + _thriftStream = new ThriftStream(inputStream); - _footer.GetLevels(_thriftColumnChunk, out int mrl, out int mdl); - _dataField.MaxRepetitionLevel = mrl; - _dataField.MaxDefinitionLevel = mdl; - _maxRepetitionLevel = mrl; - _maxDefinitionLevel = mdl; _thriftSchemaElement = _footer.GetSchemaElement(_thriftColumnChunk); } @@ -71,7 +70,7 @@ public async Task ReadAsync(CancellationToken cancellationToken = de } // all the data is available here! 
- DataColumn column = pc.Unpack(_maxDefinitionLevel, _maxRepetitionLevel); + DataColumn column = pc.Unpack(_options.UnpackDefinitions); if(_thriftColumnChunk.Meta_data.Statistics != null) { @@ -166,24 +165,26 @@ private async Task ReadDataPageAsync(Thrift.PageHeader ph, PackedColumn pc, long int dataUsed = 0; int nonNullValueCount = ph.Data_page_header.Num_values; - if(_maxRepetitionLevel > 0) { + if(_dataField.MaxRepetitionLevel > 0) { //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too. int levelsRead = ReadLevels( - bytes.AsSpan(), _maxRepetitionLevel, + bytes.AsSpan(), _dataField.MaxRepetitionLevel, pc.GetWriteableRepetitionLevelSpan(), ph.Data_page_header.Num_values, null, out int usedLength); pc.MarkRepetitionLevels(levelsRead); dataUsed += usedLength; } - if(_maxDefinitionLevel > 0) { + if(_dataField.MaxDefinitionLevel > 0) { int levelsRead = ReadLevels( - bytes.AsSpan().Slice(dataUsed), _maxDefinitionLevel, + bytes.AsSpan().Slice(dataUsed), _dataField.MaxDefinitionLevel, pc.GetWriteableDefinitionLevelSpan(), ph.Data_page_header.Num_values, null, out int usedLength); dataUsed += usedLength; - pc.MarkDefinitionLevels(levelsRead, ph.Data_page_header.__isset.statistics ? -1 : _maxDefinitionLevel, out int nullCount); + pc.MarkDefinitionLevels(levelsRead, + ph.Data_page_header.__isset.statistics ? 
-1 : _dataField.MaxDefinitionLevel, + out int nullCount); if(ph.Data_page_header.__isset.statistics) { nonNullValueCount -= (int)ph.Data_page_header.Statistics!.Null_count; @@ -207,18 +208,18 @@ private async Task ReadDataPageV2Async(Thrift.PageHeader ph, PackedColumn pc, lo using IronCompress.IronCompressResult bytes = await ReadPageDataV2Async(ph); int dataUsed = 0; - if(_maxRepetitionLevel > 0) { + if(_dataField.MaxRepetitionLevel > 0) { //todo: use rented buffers, but be aware that rented length can be more than requested so underlying logic relying on array length must be fixed too. int levelsRead = ReadLevels(bytes.AsSpan(), - _maxRepetitionLevel, pc.GetWriteableRepetitionLevelSpan(), + _dataField.MaxRepetitionLevel, pc.GetWriteableRepetitionLevelSpan(), ph.Data_page_header_v2.Num_values, ph.Data_page_header_v2.Repetition_levels_byte_length, out int usedLength); dataUsed += usedLength; pc.MarkRepetitionLevels(levelsRead); } - if(_maxDefinitionLevel > 0) { + if(_dataField.MaxDefinitionLevel > 0) { int levelsRead = ReadLevels(bytes.AsSpan().Slice(dataUsed), - _maxDefinitionLevel, pc.GetWriteableDefinitionLevelSpan(), + _dataField.MaxDefinitionLevel, pc.GetWriteableDefinitionLevelSpan(), ph.Data_page_header_v2.Num_values, ph.Data_page_header_v2.Definition_levels_byte_length, out int usedLength); dataUsed += usedLength; pc.MarkDefinitionLevels(levelsRead, -1, out _); diff --git a/src/Parquet/File/DataColumnWriter.cs b/src/Parquet/File/DataColumnWriter.cs index 397519d3..63b43b0a 100644 --- a/src/Parquet/File/DataColumnWriter.cs +++ b/src/Parquet/File/DataColumnWriter.cs @@ -44,10 +44,9 @@ public DataColumnWriter( Thrift.ColumnChunk chunk = _footer.CreateColumnChunk( _compressionMethod, _stream, _schemaElement.Type, fullPath, column.Count); - _footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel); + //_footer.GetLevels(chunk, out int maxRepetitionLevel, out int maxDefinitionLevel); ColumnSizes columnSizes = await WriteColumnAsync(column, 
_schemaElement, - maxRepetitionLevel, maxDefinitionLevel, cancellationToken); //generate stats for column chunk chunk.Meta_data.Statistics = column.Statistics.ToThriftStatistics(_schemaElement); @@ -97,9 +96,14 @@ private async Task CompressAndWriteAsync( private async Task WriteColumnAsync(DataColumn column, Thrift.SchemaElement tse, - int maxRepetitionLevel, - int maxDefinitionLevel, CancellationToken cancellationToken = default) { + + if(!column.Field.IsAttachedToSchema) { + throw new ArgumentException( + $"Field [{column.Field}] is not attached to any schema. You need to construct a schema passing in this field first.", + nameof(column)); + } + var r = new ColumnSizes(); /* @@ -109,7 +113,7 @@ private async Task WriteColumnAsync(DataColumn column, */ using var pc = new PackedColumn(column); - pc.Pack(maxDefinitionLevel, _options.UseDictionaryEncoding, _options.DictionaryEncodingThreshold); + pc.Pack(column.Field.MaxDefinitionLevel, _options.UseDictionaryEncoding, _options.DictionaryEncodingThreshold); // dictionary page if(pc.HasDictionary) { @@ -129,10 +133,10 @@ private async Task WriteColumnAsync(DataColumn column, using(MemoryStream ms = _rmsMgr.GetStream()) { Thrift.PageHeader ph = _footer.CreateDataPage(column.Count, pc.HasDictionary); if(pc.HasRepetitionLevels) { - WriteLevels(ms, pc.RepetitionLevels!, pc.RepetitionLevels!.Length, maxRepetitionLevel); + WriteLevels(ms, pc.RepetitionLevels!, pc.RepetitionLevels!.Length, column.Field.MaxRepetitionLevel); } if(pc.HasDefinitionLevels) { - WriteLevels(ms, pc.DefinitionLevels!, column.Count, maxDefinitionLevel); + WriteLevels(ms, pc.DefinitionLevels!, column.Count, column.Field.MaxDefinitionLevel); } if(pc.HasDictionary) { diff --git a/src/Parquet/File/PackedColumn.cs b/src/Parquet/File/PackedColumn.cs index d98de759..60d1336b 100644 --- a/src/Parquet/File/PackedColumn.cs +++ b/src/Parquet/File/PackedColumn.cs @@ -97,9 +97,7 @@ public void MarkRepetitionLevels(int count) { } public Span 
GetWriteableDefinitionLevelSpan() { - if(_definitionLevels == null) { - _definitionLevels = IntPool.Rent(_plainData.Length + 8); - } + _definitionLevels ??= IntPool.Rent(_plainData.Length + 8); return _definitionLevels.AsSpan(_definitionOffset); } @@ -156,7 +154,7 @@ public void Pack(int maxDefinitionLevel, bool useDictionaryEncoding, double dict Array packedData = _column.Field.CreateArray(_column.Count - nullCount); _definitionLevels = IntPool.Rent(_column.Count); - _column.PackDefinitions(_definitionLevels.AsSpan(0, _column.Count), + DataColumn.PackDefinitions(_definitionLevels.AsSpan(0, _column.Count), _plainData!, _plainDataOffset, _plainDataCount, packedData, maxDefinitionLevel); @@ -202,7 +200,7 @@ public void UnpackCheckpoint() { } } - public DataColumn Unpack(int maxDefinitionLevel, int maxRepetitionLevel) { + public DataColumn Unpack(bool unpackDefinitions) { UnpackCheckpoint(); @@ -210,8 +208,9 @@ public DataColumn Unpack(int maxDefinitionLevel, int maxRepetitionLevel) { throw new InvalidOperationException("no plain data"); return new DataColumn(_field, _plainData, - DefinitionLevels == null ? null : DefinitionLevels.AsSpan(0, _definitionOffset).ToArray(), maxDefinitionLevel, - RepetitionLevels == null ? 
null : RepetitionLevels.AsSpan(0, _repetitionOffset).ToArray(), maxRepetitionLevel); + DefinitionLevels?.AsSpan(0, _definitionOffset).ToArray(), + RepetitionLevels?.AsSpan(0, _repetitionOffset).ToArray(), + unpackDefinitions); } public void Dispose() { diff --git a/src/Parquet/File/ThriftFooter.cs b/src/Parquet/File/ThriftFooter.cs index 9e44309b..b22be1ac 100644 --- a/src/Parquet/File/ThriftFooter.cs +++ b/src/Parquet/File/ThriftFooter.cs @@ -12,10 +12,10 @@ class ThriftFooter { private readonly Thrift.FileMetaData _fileMeta; private readonly ThriftSchemaTree _tree; - internal static ThriftFooter Empty => new ThriftFooter(); + internal static ThriftFooter Empty => new(); internal ThriftFooter() { - _fileMeta= new Thrift.FileMetaData(); + _fileMeta = new Thrift.FileMetaData(); _tree= new ThriftSchemaTree(); } @@ -92,64 +92,20 @@ public FieldPath GetPath(Thrift.SchemaElement schemaElement) { return new FieldPath(path); } - // could use value tuple, would that nuget ref be ok to bring in? - readonly Dictionary> _memoizedLevels = new Dictionary>(); - - public void GetLevels(Thrift.ColumnChunk columnChunk, out int maxRepetitionLevel, out int maxDefinitionLevel) { - maxRepetitionLevel = 0; - maxDefinitionLevel = 0; - - int i = 0; - List path = columnChunk.Meta_data.Path_in_schema; - - var comparer = new StringListComparer(path); - if(_memoizedLevels.TryGetValue(comparer, out Tuple? 
t)) { - maxRepetitionLevel = t.Item1; - maxDefinitionLevel = t.Item2; - return; - } - - int fieldCount = _fileMeta.Schema.Count; - - foreach(string pp in path) { - while(i < fieldCount) { - SchemaElement schemaElement = _fileMeta.Schema[i]; - if(string.CompareOrdinal(schemaElement.Name, pp) == 0) { - Thrift.SchemaElement se = schemaElement; - - bool repeated = (se.__isset.repetition_type && se.Repetition_type == Thrift.FieldRepetitionType.REPEATED); - bool defined = (se.Repetition_type == Thrift.FieldRepetitionType.REQUIRED); - - if(repeated) - maxRepetitionLevel += 1; - if(!defined) - maxDefinitionLevel += 1; - - break; - } - - i++; - } - } - - _memoizedLevels.Add(comparer, Tuple.Create(maxRepetitionLevel, maxDefinitionLevel)); - } - public Thrift.SchemaElement[] GetWriteableSchema() { return _fileMeta.Schema.Where(tse => tse.__isset.type).ToArray(); } public Thrift.RowGroup AddRowGroup() { var rg = new Thrift.RowGroup(); - if(_fileMeta.Row_groups == null) - _fileMeta.Row_groups = new List(); + _fileMeta.Row_groups ??= new List(); _fileMeta.Row_groups.Add(rg); return rg; } public Thrift.ColumnChunk CreateColumnChunk(CompressionMethod compression, System.IO.Stream output, Thrift.Type columnType, FieldPath path, int valuesCount) { - Thrift.CompressionCodec codec = (Thrift.CompressionCodec)(int)compression; + CompressionCodec codec = (Thrift.CompressionCodec)(int)compression; var chunk = new Thrift.ColumnChunk(); long startPos = output.Position; @@ -170,19 +126,17 @@ public Thrift.ColumnChunk CreateColumnChunk(CompressionMethod compression, Syste return chunk; } - public Thrift.PageHeader CreateDataPage(int valueCount, bool isDictionary) { - var ph = new Thrift.PageHeader(Thrift.PageType.DATA_PAGE, 0, 0); - ph.Data_page_header = new Thrift.DataPageHeader { - Encoding = isDictionary ? 
Thrift.Encoding.PLAIN_DICTIONARY : Thrift.Encoding.PLAIN, - Definition_level_encoding = Thrift.Encoding.RLE, - Repetition_level_encoding = Thrift.Encoding.RLE, - Num_values = valueCount, - Statistics = new Thrift.Statistics() + public Thrift.PageHeader CreateDataPage(int valueCount, bool isDictionary) => + new Thrift.PageHeader(Thrift.PageType.DATA_PAGE, 0, 0) { + Data_page_header = new Thrift.DataPageHeader { + Encoding = isDictionary ? Thrift.Encoding.PLAIN_DICTIONARY : Thrift.Encoding.PLAIN, + Definition_level_encoding = Thrift.Encoding.RLE, + Repetition_level_encoding = Thrift.Encoding.RLE, + Num_values = valueCount, + Statistics = new Thrift.Statistics() + } }; - return ph; - } - public Thrift.PageHeader CreateDictionaryPage(int numValues) { var ph = new Thrift.PageHeader(Thrift.PageType.DICTIONARY_PAGE, 0, 0); ph.Dictionary_page_header = new DictionaryPageHeader { @@ -227,18 +181,6 @@ private void CreateModelSchema(FieldPath? path, IList container, int chil } } - private void ThrowNoHandler(Thrift.SchemaElement tse) { - string? ct = tse.__isset.converted_type - ? $" ({tse.Converted_type})" - : null; - - string t = tse.__isset.type - ? 
$"'{tse.Type}'" - : ""; - - throw new NotSupportedException($"cannot find data type handler for schema element '{tse.Name}' (type: {t}{ct})"); - } - #endregion #region [ Convertion from Model Schema ] @@ -249,7 +191,7 @@ public Thrift.FileMetaData CreateThriftSchema(ParquetSchema schema) { meta.Schema = new List(); meta.Row_groups = new List(); - Thrift.SchemaElement root = AddRoot(meta.Schema); + Thrift.SchemaElement root = ThriftFooter.AddRoot(meta.Schema); foreach(Field se in schema.Fields) { SchemaEncoder.Encode(se, root, meta.Schema); } @@ -258,7 +200,7 @@ public Thrift.FileMetaData CreateThriftSchema(ParquetSchema schema) { } - private Thrift.SchemaElement AddRoot(IList container) { + private static Thrift.SchemaElement AddRoot(IList container) { var root = new Thrift.SchemaElement("root"); container.Add(root); return root; diff --git a/src/Parquet/Globals.cs b/src/Parquet/Globals.cs index 1aa0e8b9..c9ac4eeb 100644 --- a/src/Parquet/Globals.cs +++ b/src/Parquet/Globals.cs @@ -16,5 +16,8 @@ public static class Globals { /// https://docs.github.com/en/actions/learn-github-actions/variables /// public static readonly string GithubSha = "${GITHUB_SHA}"; + + internal const string DataTypeEnumObsolete = "Please resort to using System.Type overloads. Will be removed in v6."; + internal const string ParquetConvertObsolete = "ParquetConvert was an experimental project and is not obsolete. Consider switching to ParquetSerializer which supports all data types, including nested ones, and is just superior. 
ParquetConvert will be removed in v6."; } } diff --git a/src/Parquet/Parquet.csproj b/src/Parquet/Parquet.csproj index 4bb78105..571dd28d 100644 --- a/src/Parquet/Parquet.csproj +++ b/src/Parquet/Parquet.csproj @@ -47,7 +47,7 @@ - + diff --git a/src/Parquet/ParquetConvert.cs b/src/Parquet/ParquetConvert.cs index 399518b4..6b7ed749 100644 --- a/src/Parquet/ParquetConvert.cs +++ b/src/Parquet/ParquetConvert.cs @@ -6,7 +6,6 @@ using System.Threading.Tasks; using Parquet.Data; using Parquet.Extensions; -using Parquet.File; using Parquet.Schema; using Parquet.Serialization; @@ -14,6 +13,7 @@ namespace Parquet { /// /// High-level object oriented API for Apache Parquet /// + [Obsolete(Globals.ParquetConvertObsolete)] public static class ParquetConvert { /// /// Serialises a collection of classes into a Parquet stream diff --git a/src/Parquet/ParquetOptions.cs b/src/Parquet/ParquetOptions.cs index 0ff08d96..aeb55073 100644 --- a/src/Parquet/ParquetOptions.cs +++ b/src/Parquet/ParquetOptions.cs @@ -1,4 +1,5 @@ using System; +using System.Data; namespace Parquet { /// @@ -27,5 +28,9 @@ public class ParquetOptions { /// public double DictionaryEncodingThreshold { get; set; } = 0.8; + /// + /// When set (default) contains values with defnition levels applied. + /// + internal bool UnpackDefinitions { get; set; } = true; } } diff --git a/src/Parquet/ParquetRowGroupReader.cs b/src/Parquet/ParquetRowGroupReader.cs index 1a8be783..ab67b6d9 100644 --- a/src/Parquet/ParquetRowGroupReader.cs +++ b/src/Parquet/ParquetRowGroupReader.cs @@ -17,7 +17,7 @@ public class ParquetRowGroupReader : IDisposable { private readonly Stream _stream; private readonly ThriftStream _thriftStream; private readonly ParquetOptions? 
_parquetOptions; - private readonly Dictionary _pathToChunk = new Dictionary(); + private readonly Dictionary _pathToChunk = new(); internal ParquetRowGroupReader( Thrift.RowGroup rowGroup, diff --git a/src/Parquet/Rows/RowValidator.cs b/src/Parquet/Rows/RowValidator.cs index 7061c9ca..471f05a5 100644 --- a/src/Parquet/Rows/RowValidator.cs +++ b/src/Parquet/Rows/RowValidator.cs @@ -39,7 +39,7 @@ public static void Validate(Row row, IReadOnlyList fields) { } private static void ValidateMap(MapField mf, object? value) { - if(value == null || !value.GetType().TryExtractEnumerableType(out Type? elementType)) + if(value == null || !value.GetType().TryExtractIEnumerableType(out Type? elementType)) throw new ArgumentException($"map must be a collection, but found {value?.GetType()}"); if(elementType != typeof(Row)) @@ -54,7 +54,7 @@ private static void ValidateMap(MapField mf, object? value) { private static void ValidateList(ListField lf, object? value) { Type? elementType = null; - bool isEnumerable = value?.GetType().TryExtractEnumerableType(out elementType) ?? false; + bool isEnumerable = value?.GetType().TryExtractIEnumerableType(out elementType) ?? 
false; //value must be an enumeration of items if(!isEnumerable) diff --git a/src/Parquet/Rows/RowsToDataColumnsConverter.cs b/src/Parquet/Rows/RowsToDataColumnsConverter.cs index 7b37a9f7..d61efbfb 100644 --- a/src/Parquet/Rows/RowsToDataColumnsConverter.cs +++ b/src/Parquet/Rows/RowsToDataColumnsConverter.cs @@ -9,7 +9,7 @@ namespace Parquet.Rows { class RowsToDataColumnsConverter { private readonly ParquetSchema _schema; private readonly IReadOnlyCollection _rows; - private readonly Dictionary _pathToDataColumn = new Dictionary(); + private readonly Dictionary _pathToDataColumn = new(); public RowsToDataColumnsConverter(ParquetSchema schema, IReadOnlyCollection rows) { _schema = schema; diff --git a/src/Parquet/Schema/DataField.cs b/src/Parquet/Schema/DataField.cs index c1a1887c..2b4a2ad9 100644 --- a/src/Parquet/Schema/DataField.cs +++ b/src/Parquet/Schema/DataField.cs @@ -15,14 +15,14 @@ public class DataField : Field { /// /// Parquet data type of this element /// - [Obsolete] + [Obsolete(Globals.DataTypeEnumObsolete)] public DataType DataType { get; } /// /// When true, this element is allowed to have nulls. Bad naming, probably should be something like IsNullable. /// Changes property accordingly. /// - public bool IsNullable { + public override bool IsNullable { get => _isNullable; internal set { _isNullable = value; ClrNullableIfHasNullsType = value ? ClrType.GetNullable() : ClrType; @@ -112,12 +112,22 @@ internal override FieldPath? PathPrefix { set => Path = value + new FieldPath(Name); } - /// - /// see - /// + internal bool IsAttachedToSchema { get; set; } = false; + internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) { - MaxRepetitionLevel = parentRepetitionLevel + (IsArray ? 1 : 0); - MaxDefinitionLevel = parentDefinitionLevel + (IsNullable ? 
1 : 0); + MaxRepetitionLevel = parentRepetitionLevel; + if(IsArray) + MaxRepetitionLevel++; + + MaxDefinitionLevel = parentDefinitionLevel; + + // can't be both array and nullable + if(IsArray) + MaxDefinitionLevel++; + else if(IsNullable) + MaxDefinitionLevel++; + + IsAttachedToSchema = true; } /// @@ -127,10 +137,10 @@ internal override void PropagateLevels(int parentRepetitionLevel, int parentDefi /// internal Array CreateArray(int length) => Array.CreateInstance(ClrType, length); - internal Array UnpackDefinitions(Array definedData, Span definitionLevels, int maxDefinitionLevel) { + internal Array UnpackDefinitions(Array definedData, Span definitionLevels) { if(IsNullable) { Array result = Array.CreateInstance(ClrNullableIfHasNullsType, definitionLevels.Length); - definedData.UnpackNullsFast(definitionLevels, maxDefinitionLevel, result); + definedData.UnpackNullsFast(definitionLevels, MaxDefinitionLevel, result); return result; } else { return definedData; @@ -167,11 +177,11 @@ private static void Discover(Type t, out Type baseType, out bool isArray, out bo isNullable = false; //throw a useful hint - if(t.TryExtractDictionaryType(out Type? dKey, out Type? dValue)) { - throw new ArgumentException($"cannot declare a dictionary this way, please use {nameof(MapField)}."); + if(t.IsGenericIDictionary()) { + throw new NotSupportedException($"cannot declare a dictionary this way, please use {nameof(MapField)}."); } - if(t.TryExtractEnumerableType(out Type? enumItemType)) { + if(t.TryExtractIEnumerableType(out Type? enumItemType)) { baseType = enumItemType!; isArray = true; } diff --git a/src/Parquet/Schema/DataType.cs b/src/Parquet/Schema/DataType.cs index 34d2a70a..bc24792e 100644 --- a/src/Parquet/Schema/DataType.cs +++ b/src/Parquet/Schema/DataType.cs @@ -4,7 +4,7 @@ namespace Parquet.Schema { /// /// List of supported data types /// - [Obsolete("Please remove references to this enum and use System.Type where appropriate. 
WIll be removed in the next major release.")] + [Obsolete(Globals.DataTypeEnumObsolete)] public enum DataType { /// /// Type is not specified, shouldn't be used. diff --git a/src/Parquet/Schema/Field.cs b/src/Parquet/Schema/Field.cs index 1bb3561e..bf59bbe3 100644 --- a/src/Parquet/Schema/Field.cs +++ b/src/Parquet/Schema/Field.cs @@ -1,4 +1,6 @@ using System; +using System.Collections.Generic; +using System.Linq; namespace Parquet.Schema { @@ -23,6 +25,26 @@ public abstract class Field { /// public FieldPath Path { get; internal set; } + /// + /// Original nullability + /// + public virtual bool IsNullable { get; internal set; } = false; + + internal List GetNaturalChildPath(List path) { + if(SchemaType == SchemaType.List) { + // element.list.element.child + return path.Skip(3).ToList(); + } + + if(SchemaType == SchemaType.Map) { + // element.key_value.key|value + return path.Skip(2).ToList(); + } + + // element.child + return path.Skip(1).ToList(); + } + /// /// Max repetition level /// @@ -78,6 +100,21 @@ internal virtual void Assign(Field field) { //only used by some schema fields internally to help construct a field hierarchy } + /// + /// Get child fields, which only makes sense for complex types + /// + internal virtual Field[] Children { get; } = Array.Empty(); + + internal virtual Field[] NaturalChildren { + get { + if(SchemaType == SchemaType.List) { + return Children[0].Children; + } + + return Children; + } + } + internal bool Equals(Thrift.SchemaElement tse) { if(ReferenceEquals(tse, null)) return false; @@ -88,7 +125,7 @@ internal bool Equals(Thrift.SchemaElement tse) { #endregion /// - public override string ToString() => $"{Path} ({SchemaType})"; + public override string ToString() => $"{Path} ({SchemaType}, RL: {MaxRepetitionLevel}, DL: {MaxDefinitionLevel})"; /// /// Basic equality check diff --git a/src/Parquet/Schema/FieldPath.cs b/src/Parquet/Schema/FieldPath.cs index 7a5a8e99..173d7937 100644 --- a/src/Parquet/Schema/FieldPath.cs +++ 
b/src/Parquet/Schema/FieldPath.cs @@ -10,25 +10,14 @@ namespace Parquet.Schema { /// public sealed class FieldPath : IEquatable { - /// - /// Path separator - /// - public const string Separator = "."; - private readonly List _parts; - private string _str; /// - /// Construct path from raw string (unsafe!) + /// Construct path single part, which becomes the first part in the result path. /// - /// - public FieldPath(string path) { - _str = path ?? throw new ArgumentNullException(nameof(path)); -#if NETSTANDARD2_0 - _parts = path.Split(new[] { Separator[0] }, StringSplitOptions.RemoveEmptyEntries).ToList(); -#else - _parts = path.Split(Separator, StringSplitOptions.RemoveEmptyEntries).ToList(); -#endif + /// + public FieldPath(string firstPart) { + _parts = new List { firstPart ?? throw new ArgumentNullException(nameof(firstPart)) }; } /// @@ -38,8 +27,6 @@ public FieldPath(IEnumerable parts) { _parts = parts .Where(i => !string.IsNullOrEmpty(i)) .ToList(); - _str = string.Join(Separator, _parts); - } /// @@ -59,7 +46,6 @@ public void Append(string value) { return; _parts.Add(value); - _str = string.Join(Separator, _parts); } /// @@ -73,6 +59,11 @@ public void Append(string value) { /// public string? FirstPart => _parts.Count == 0 ? null : _parts[0]; + /// + /// Gets part by index + /// + public string this[int i] => _parts[i]; + /// /// Number of elements in path /// @@ -85,19 +76,38 @@ public bool Equals(FieldPath? 
other) { if(ReferenceEquals(this, other)) return true; - return _str.Equals(other?._str); + if(other?._parts.Count != _parts.Count) + return false; + + for(int i = 0; i < _parts.Count; i++) { + if(_parts[i] != other._parts[i]) return false; + } + return true; } - /// - /// Hash code of string path - /// - public override int GetHashCode() => _str.GetHashCode(); + /// + public override int GetHashCode() { + int hash = 19; + unchecked { + foreach(string part in _parts) { + hash = (hash * 31) + part.GetHashCode(); + } + } + return hash; + } + + /// + public override bool Equals(object? obj) { + if(obj is not FieldPath fp) return false; + + return Equals(fp); + } /// /// String repr /// - public override string ToString() => _str; + public override string ToString() => string.Join("/", _parts); /// /// Combines two paths safely @@ -110,15 +120,5 @@ public bool Equals(FieldPath? other) { parts.AddRange(right._parts); return new FieldPath(parts); } - - /// - /// String repr - /// - public static implicit operator string(FieldPath p) => p._str; - - /// - /// Unsafe path constructor - /// - public static implicit operator FieldPath(string s) => new FieldPath(s); } } diff --git a/src/Parquet/Schema/ListField.cs b/src/Parquet/Schema/ListField.cs index 405a1bff..bdf6e7c2 100644 --- a/src/Parquet/Schema/ListField.cs +++ b/src/Parquet/Schema/ListField.cs @@ -9,6 +9,11 @@ public class ListField : Field { private bool _itemAssigned = false; + /// + /// Name of the element item for schema definition. 
+ /// + public const string ElementName = "element"; + /// /// Default container name for a list /// @@ -21,6 +26,13 @@ public class ListField : Field { /// public Field Item { get; internal set; } + private ListField(string name) : base(name, SchemaType.List) { + ContainerName = "list"; + Item = new DataField("invalid"); + IsNullable = true; // lists are always nullable + } + + /// /// Creates a new instance of /// @@ -43,7 +55,7 @@ public ListField(string name, Field item, string containerName = DefaultContaine /// When set, uses this property to get the list's data. When not set, uses the property that matches the name parameter. /// Container name /// Element name - [Obsolete] + [Obsolete(Globals.DataTypeEnumObsolete)] public ListField(string name, DataType dataType, bool hasNulls = true, string? propertyName = null, string containerName = "list", string? elementName = null) : this(name) { Item = new DataField(elementName ?? name, dataType, hasNulls, false, propertyName ?? name); _itemAssigned = true; @@ -68,40 +80,40 @@ public ListField(string name, _itemAssigned = true; ContainerName = containerName; PathPrefix = null; - } - - private ListField(string name) : base(name, SchemaType.List) { - ContainerName = "list"; - Item = new DataField("invalid"); + IsNullable = true; // lists are always nullable } internal override FieldPath? PathPrefix { set { - Path = value + Name + ContainerName; + Path = value + new FieldPath(Name, ContainerName); Item.PathPrefix = Path; } } + internal override Field[] Children => new Field[] { Item }; + + internal Thrift.SchemaElement? 
GroupSchemaElement { get; set; } = null; + internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) { - int rl = parentRepetitionLevel; - int dl = parentDefinitionLevel; - //"container" is optional, therefore +1 to DL - dl += 1; + // both get + MaxDefinitionLevel = parentDefinitionLevel; + MaxRepetitionLevel = parentRepetitionLevel + 1; // because it's repeated ;) - //"list" is repeated, both get +1 - rl += 1; - dl += 1; + if(IsNullable) { + MaxDefinitionLevel++; + } - MaxRepetitionLevel = rl; - MaxDefinitionLevel = dl; + if(GroupSchemaElement == null || GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED) { + MaxDefinitionLevel++; + } //push to child item - Item.PropagateLevels(rl, dl); + Item.PropagateLevels(MaxRepetitionLevel, MaxDefinitionLevel); } - internal static ListField CreateWithNoItem(string name) { - return new ListField(name); + internal static ListField CreateWithNoItem(string name, bool isNullable) { + return new ListField(name) { IsNullable = isNullable }; } internal override void Assign(Field field) { diff --git a/src/Parquet/Schema/MapField.cs b/src/Parquet/Schema/MapField.cs index ece1f58b..1c6f8a24 100644 --- a/src/Parquet/Schema/MapField.cs +++ b/src/Parquet/Schema/MapField.cs @@ -32,14 +32,16 @@ public MapField(string name, Field keyField, Field valueField) Value = valueField; _keyAssigned = _valueAssigned = true; - Path = name.AddPath(ContainerName); + Path = new FieldPath(name, ContainerName); Key.PathPrefix = Path; Value.PathPrefix = Path; + IsNullable = true; } internal MapField(string name) : base(name, SchemaType.Map) { Key = Value = new DataField("invalid"); + IsNullable = true; } internal override void Assign(Field se) { @@ -56,26 +58,32 @@ internal override void Assign(Field se) { internal override FieldPath? 
PathPrefix { set { - Path = value + Name + ContainerName; + Path = value + new FieldPath(Name, ContainerName); Key.PathPrefix = Path; Value.PathPrefix = Path; } } + internal override Field[] Children => new Field[] { Key, Value }; + + internal Thrift.SchemaElement? GroupSchemaElement { get; set; } = null; + internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) { - int rl = parentRepetitionLevel; - int dl = parentDefinitionLevel; - //"container" is optional and adds on 1 DL - dl += 1; + MaxDefinitionLevel = parentDefinitionLevel; + MaxRepetitionLevel = parentRepetitionLevel + 1; // because map is actually a list of key-values - //"key_value" is repeated therefore it adds on 1 RL + 1 DL - rl += 1; - dl += 1; + if(IsNullable) { + MaxDefinitionLevel++; + } + + if(GroupSchemaElement == null || GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED) { + MaxDefinitionLevel++; + } //push to children - Key.PropagateLevels(rl, dl); - Value.PropagateLevels(rl, dl); + Key.PropagateLevels(MaxRepetitionLevel, MaxDefinitionLevel); + Value.PropagateLevels(MaxRepetitionLevel, MaxDefinitionLevel); } /// diff --git a/src/Parquet/Schema/Schema.cs b/src/Parquet/Schema/Schema.cs index af60c419..072f9d2e 100644 --- a/src/Parquet/Schema/Schema.cs +++ b/src/Parquet/Schema/Schema.cs @@ -47,7 +47,7 @@ private ParquetSchema(List fields) { _fields = fields; - //set levels now, after schema is constructeds + //set levels now, after schema is constructed foreach(Field field in fields) { field.PropagateLevels(0, 0); } @@ -105,10 +105,6 @@ void traverse(IEnumerable fields) { return result.ToArray(); } - internal DataField? FindDataField(string path) { - return GetDataFields().FirstOrDefault(f => f.Path == path); - } - /// /// Indicates whether the current object is equal to another object of the same type. 
/// diff --git a/src/Parquet/Schema/StructField.cs b/src/Parquet/Schema/StructField.cs index d898ec22..bdf59be8 100644 --- a/src/Parquet/Schema/StructField.cs +++ b/src/Parquet/Schema/StructField.cs @@ -8,7 +8,11 @@ namespace Parquet.Schema { /// Represents a structure i.e. a container for other fields. /// public class StructField : Field, IEquatable { - private readonly List _fields = new List(); + private readonly List _fields = new(); + + private StructField(string name) : base(name, SchemaType.Struct) { + IsNullable = true; + } /// /// Creates a new structure field @@ -16,33 +20,40 @@ public class StructField : Field, IEquatable { /// Structure name /// List of elements public StructField(string name, params Field[] elements) : this(name) { - if(elements == null || elements.Length == 0) throw new ArgumentException($"structure '{name}' requires at least one element"); + if(elements == null || elements.Length == 0) + throw new ArgumentException($"structure '{name}' requires at least one element"); //path for structures has no weirdnes, yay! - foreach(Field field in elements) _fields.Add(field); + foreach(Field field in elements) + _fields.Add(field); - Path = name; + Path = new FieldPath(name); PathPrefix = null; } internal override FieldPath? 
PathPrefix { set { - Path = value + Name; + Path = value + new FieldPath(Name); foreach(Field field in _fields) field.PathPrefix = Path; } } - internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) { - //struct is a container, it doesn't have any levels + internal override Field[] Children => Fields.ToArray(); // make a copy - foreach(Field f in Fields) f.PropagateLevels(parentRepetitionLevel, parentDefinitionLevel); - } + internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) { + //struct is a container, it doesn't have any repetition levels - private StructField(string name) : base(name, SchemaType.Struct) { + MaxRepetitionLevel = parentRepetitionLevel; + MaxDefinitionLevel = parentDefinitionLevel; + if(IsNullable) + MaxDefinitionLevel++; + foreach(Field f in Fields) { + f.PropagateLevels(MaxRepetitionLevel, MaxDefinitionLevel); + } } internal static StructField CreateWithNoElements(string name) { diff --git a/src/Parquet/Serialization/ClrBridge.cs b/src/Parquet/Serialization/ClrBridge.cs index fd65ee45..4cb2aab3 100644 --- a/src/Parquet/Serialization/ClrBridge.cs +++ b/src/Parquet/Serialization/ClrBridge.cs @@ -24,7 +24,7 @@ public DataColumn BuildColumn(DataField field, Array classInstances, int classIn Type? prop = PropertyHelpers.GetDeclaredPropertyFromClassType(_classType, field)?.PropertyType; if(prop == null) throw new InvalidOperationException("cannot get property type"); - bool underlyingTypeIsEnumerable = prop.TryExtractEnumerableType(out _); + bool underlyingTypeIsEnumerable = prop.TryExtractIEnumerableType(out _); List? repLevelsList = field.IsArray || underlyingTypeIsEnumerable ? 
new List() : null; object result = populateList(classInstances, resultList, repLevelsList, field.MaxRepetitionLevel); diff --git a/src/Parquet/Serialization/Dremel/Assembler.cs b/src/Parquet/Serialization/Dremel/Assembler.cs new file mode 100644 index 00000000..7885a467 --- /dev/null +++ b/src/Parquet/Serialization/Dremel/Assembler.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Parquet.Schema; + +namespace Parquet.Serialization.Dremel { + class Assembler { + public Assembler(ParquetSchema schema) { + Schema = schema; + + FieldAssemblers = schema + .GetDataFields() + .Select(df => Compile(schema, df)) + .ToList(); + } + + private static FieldAssembler Compile(ParquetSchema schema, DataField df) { + try { + return new FieldAssemblerCompiler(schema, df).Compile(); + } catch(Exception ex) { + throw new FieldAccessException($"failed to compile '{df}'", ex); + } + } + + public ParquetSchema Schema { get; } + + public List> FieldAssemblers { get; } + } +} diff --git a/src/Parquet/Serialization/Dremel/FieldAssembler.cs b/src/Parquet/Serialization/Dremel/FieldAssembler.cs new file mode 100644 index 00000000..38a02210 --- /dev/null +++ b/src/Parquet/Serialization/Dremel/FieldAssembler.cs @@ -0,0 +1,17 @@ +using System; +using System.Collections.Generic; +using System.Linq.Expressions; +using Parquet.Data; +using Parquet.Schema; + +namespace Parquet.Serialization.Dremel { + class FieldAssembler : FieldWorker { + + public FieldAssembler(DataField field, Action, DataColumn> assembler, Expression expression, Expression iterationExpression) + : base(field, expression, iterationExpression) { + Assemble = assembler; + } + + public Action, DataColumn> Assemble { get; } + } +} diff --git a/src/Parquet/Serialization/Dremel/FieldAssemblerCompiler.cs b/src/Parquet/Serialization/Dremel/FieldAssemblerCompiler.cs new file mode 100644 index 00000000..7ba26ca4 --- /dev/null +++ b/src/Parquet/Serialization/Dremel/FieldAssemblerCompiler.cs @@ 
-0,0 +1,382 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Linq.Expressions; +using System.Reflection; +using Parquet.Data; +using Parquet.Extensions; +using Parquet.Schema; + +namespace Parquet.Serialization.Dremel { + class FieldAssemblerCompiler { + + private static readonly Expression Zero = Expression.Constant(0); + private static readonly Expression One = Expression.Constant(1); + + private readonly ParquetSchema _schema; + private readonly DataField _df; + + private readonly ParameterExpression _dcParam = Expression.Parameter(typeof(DataColumn), "dc"); + + private readonly ParameterExpression _classElementVar = Expression.Variable(typeof(TClass), "curr"); + +#if DEBUG + private readonly MethodInfo _injectLevelDebugMethod; +#endif + + + #region [ Data Pointers ] + + private readonly ParameterExpression _dataIdxVar = Expression.Variable(typeof(int), "dataIdx"); + private readonly ParameterExpression _dataVar; + private readonly ParameterExpression _dataElementVar; + + private readonly ParameterExpression _rlIdxVar = Expression.Variable(typeof(int), "rlIdx"); + private readonly ParameterExpression _rlVar = Expression.Variable(typeof(int), "rl"); + + private readonly ParameterExpression _dlIdxVar = Expression.Variable(typeof(int), "dlIdx"); + private readonly ParameterExpression _dlVar = Expression.Variable(typeof(int), "dl"); + + private readonly ParameterExpression _hasData = Expression.Variable(typeof(bool), "hasData"); + + #endregion + + // Repetition State Machine. + // Stores collection indexes for each repetition level on where to operate on. + // Size of the machine is Max Repetition Level (level 0 is a special case that clears entire machine). 
+ private readonly ParameterExpression _rsmVar = Expression.Variable(typeof(int[]), "rsm"); + + private readonly bool _hasReps; + private readonly bool _hasDefs; + + + public FieldAssemblerCompiler(ParquetSchema schema, DataField df) { + _schema = schema; + _df = df; + + // expecting non-nullable elements, because definition levels are handled by this algorithm + _dataVar = Expression.Variable(_df.ClrType.MakeArrayType(), "data"); + _dataElementVar = Expression.Variable(_df.ClrType, "dataElement"); + +#if DEBUG + _injectLevelDebugMethod = GetType().GetMethod(nameof(InjectLevelDebug), BindingFlags.NonPublic | BindingFlags.Static)!; +#endif + _hasReps = df.MaxRepetitionLevel > 0; + _hasDefs = df.MaxDefinitionLevel > 0; + } + + private Expression GetDataLength() { + return Expression.Property(Expression.Property(_dcParam, nameof(DataColumn.Data)), nameof(Array.Length)); + } + + + private Expression GetRlLength() { + return Expression.Property(Expression.Property(_dcParam, nameof(DataColumn.RepetitionLevels)), nameof(Array.Length)); + } + + private Expression GetDLLength() { + return Expression.Property(Expression.Property(_dcParam, nameof(DataColumn.DefinitionLevels)), nameof(Array.Length)); + } + + private Expression GetRLAt(Expression index) { + return Expression.ArrayAccess(Expression.Property(_dcParam, nameof(DataColumn.RepetitionLevels)), index); + } + + private Expression GetDLAt(Expression index) { + return Expression.ArrayAccess(Expression.Property(_dcParam, nameof(DataColumn.DefinitionLevels)), index); + } + + + private Expression GetCurrentRLOr0() { + return _hasReps + ? 
Expression.Condition( + Expression.LessThan(_rlIdxVar, GetRlLength()), + GetRLAt(_rlIdxVar), + Expression.Constant(0)) + : Expression.Constant(0); + } + + private Expression TakeCurrentValuesAndAdvance() { + + Expression dataIdxLessThanExpr = Expression.LessThan(_dataIdxVar, GetDataLength()); + + return Expression.IfThenElse( + // _dataIdxVar < dcParam.Data.Length || _dlIdxVar < dcParam.DefinitionLevels.Length + _df.MaxDefinitionLevel > 0 + ? Expression.Or( + Expression.LessThan(_dlIdxVar, GetDLLength()), + dataIdxLessThanExpr) + : dataIdxLessThanExpr, + + Expression.Block( + + // get definition level value: _dlVar = _dcParam.DefinitionLevels[dlIdxVar]; + _df.MaxDefinitionLevel > 0 + ? Expression.Assign(_dlVar, GetDLAt(Expression.PostIncrementAssign(_dlIdxVar))) + : Expression.Empty(), + + // get array value, but only if definiton level is right + // _dataElementVar = _dataVar[_dataIdxVar]; + _df.MaxDefinitionLevel > 0 + ? Expression.IfThen( + Expression.Equal(_dlVar, Expression.Constant(_df.MaxDefinitionLevel)), + Expression.Assign(_dataElementVar, Expression.ArrayAccess(_dataVar, Expression.PostIncrementAssign(_dataIdxVar)))) + : Expression.Assign(_dataElementVar, Expression.ArrayAccess(_dataVar, Expression.PostIncrementAssign(_dataIdxVar))), + + // get repetition level value: rlVar = dcParam.RepetitionLevels[rlIndexVar]; + _df.MaxRepetitionLevel > 0 + ? 
Expression.Assign(_rlVar, GetRLAt(Expression.PostIncrementAssign(_rlIdxVar))) + : Expression.Empty(), + + // flag = true + Expression.Assign(_hasData, Expression.Constant(true)) + ), + + // flag = false + Expression.Assign(_hasData, Expression.Constant(false))); + } + + private static void Discover(Field field, out bool isRepeated) { + isRepeated = + field.SchemaType == SchemaType.List || + field.SchemaType == SchemaType.Map || + (field.SchemaType == SchemaType.Data && field is DataField rdf && rdf.IsArray); + } + + private static void InjectLevelDebug(string levelPropertyName, + object value, int dataIdx, + int dl, int rl, + int dlDepth, int rlDepth, + int[] rsm) { + Console.WriteLine("debug"); + } + + /// + /// Transitions RSM for current RL iteration + /// + private Expression TransitionRSM() { + return Expression.IfThenElse( + Expression.Equal(_rlVar, Zero), + _rsmVar.ClearArray(), + + Expression.Block( + // +1 to current RL + Expression.PostIncrementAssign(Expression.ArrayAccess(_rsmVar, Expression.Subtract(_rlVar, One))), + // zero out the rest of the elements on the right + _rsmVar.ClearArray(_rlVar))); + } + + private Expression GetCollectionElement(Expression collection, int rlDepth, + Type collectionType, Type elementType) { + ParameterExpression indexVar = Expression.Variable(typeof(int), "index"); + ParameterExpression resultElementVar = Expression.Variable(elementType, "resultElement"); + Expression downcastedCollection = Expression.Convert(collection, collectionType); + return Expression.Block( + new[] { indexVar, resultElementVar }, + + // C#: index = rsm[dlDepth - 1] + Expression.Assign(indexVar, Expression.ArrayAccess(_rsmVar, Expression.Constant(rlDepth - 1))), + + Expression.IfThenElse( + Expression.LessThanOrEqual(downcastedCollection.CollectionCount(collectionType), indexVar), + + Expression.Block( + Expression.Assign(resultElementVar, Expression.New(elementType)), + downcastedCollection.CollectionAdd(collectionType, resultElementVar, 
elementType)), + + Expression.Assign(resultElementVar, Expression.Property(downcastedCollection, "Item", indexVar)) + ), + + resultElementVar); + } + + private static void ReplaceIDictionaryTypes(Type t, out Type dictionaryType, out Type elementType) { + if(!t.TryExtractDictionaryType(out Type? keyType, out Type? valueType)) { + throw new NotSupportedException($"{t} is not a dictionary"); + } + + dictionaryType = typeof(ParquetDictionary<,>).MakeGenericType(keyType!, valueType!); + elementType = typeof(ParquetDictionary<,>.ParquetDictionaryElement).MakeGenericType(keyType!, valueType!); + } + + private Expression InjectLevel(Expression rootVar, Type rootType, Field[] levelFields, List path) { + + string currentPathPart = path.First(); + Field? field = levelFields.FirstOrDefault(x => x.Name == currentPathPart); + if(field == null) + throw new NotSupportedException($"field '{currentPathPart}' not found"); + + int dlDepth = field.MaxDefinitionLevel; + int rlDepth = field.MaxRepetitionLevel; + + Discover(field, out bool isRepeated); + bool isAtomic = path.Count == 1; + string levelPropertyName = field.ClrPropName ?? 
field.Name; + Expression levelProperty = Expression.Property(rootVar, levelPropertyName); + Type levelPropertyType = rootType.GetProperty(levelPropertyName)!.PropertyType; + + Expression iteration = Expression.Empty(); + + if(isRepeated) { + Expression rsmAccess = Expression.ArrayAccess(_rsmVar, Expression.Constant(rlDepth - 1)); + + Type levelPropertyElementType; + if(levelPropertyType.IsGenericIDictionary()) { + ReplaceIDictionaryTypes(levelPropertyType, out levelPropertyType, out levelPropertyElementType); + } else { + levelPropertyElementType = levelPropertyType.ExtractElementTypeFromEnumerableType(); + } + + Expression leafExpr; + + if(isAtomic) { + // add element to collection - end here + leafExpr = Expression.Call(levelProperty, + levelPropertyType.GetMethod(nameof(IList.Add))!, + Expression.Convert(_dataElementVar, levelPropertyElementType)); + + } else { + + // Map is also repeated type, but key and value cannot be constructed independently. + + ParameterExpression collectionElementVar = Expression.Variable(levelPropertyElementType, "collectionElement"); + leafExpr = Expression.Block( + new[] { collectionElementVar }, + + Expression.Assign(collectionElementVar, + GetCollectionElement(levelProperty, rlDepth, levelPropertyType, levelPropertyElementType)), + + // keep traversing the tree + InjectLevel(collectionElementVar, levelPropertyElementType, + field.NaturalChildren, field.GetNaturalChildPath(path)) + + ); + } + + iteration = leafExpr; + + } else { + if(isAtomic) { + + // C#: dlDepth <= _dlVar? 
+ iteration = + Expression.IfThen( + Expression.Equal(Expression.Constant(dlDepth), _dlVar), + // levelProperty = (levelPropertyType)_dataElementVar + // conversion compensates for nullable types and maybe implicit conversions + Expression.Assign(levelProperty, Expression.Convert(_dataElementVar, levelPropertyType)) + ); + } else { + ParameterExpression deepVar = Expression.Variable(levelPropertyType); + + iteration = Expression.Block( + new[] { deepVar }, + + Expression.Assign(deepVar, levelProperty), + + InjectLevel(deepVar, levelPropertyType, + field.NaturalChildren, + field.GetNaturalChildPath(path))); + } + } + + if(!isAtomic || isRepeated) { + + iteration = Expression.IfThen( + // C#: dlDepth <= _dlVar? + Expression.LessThanOrEqual(Expression.Constant(dlDepth), _dlVar), + + Expression.Block( + Expression.IfThen( + Expression.Equal(levelProperty, Expression.Constant(null)), + Expression.Assign(levelProperty, Expression.New(levelPropertyType))), + + iteration)); + } + + return Expression.Block( +#if DEBUG + Expression.Call(_injectLevelDebugMethod, + Expression.Constant(levelPropertyName), + Expression.Convert(_dataElementVar, typeof(object)), + _dataIdxVar, + _dlVar, + _rlVar, + Expression.Constant(dlDepth), + Expression.Constant(rlDepth), + _rsmVar), +#endif + + iteration + ); + + + } + + private Expression InjectColumn() { + LabelTarget rlBreakLabel = Expression.Label(); + + // process current value tuple (_dataVar, _dlVar, _rlVar) + Expression body = + InjectLevel(_classElementVar, typeof(TClass), _schema.Fields.ToArray(), _df.Path.ToList()); + + return Expression.Block( + + Expression.Loop(Expression.Block( + TakeCurrentValuesAndAdvance(), + + // break out if no values available + Expression.IfThen(Expression.IsFalse(_hasData), Expression.Break(rlBreakLabel)), + + _hasReps + ? TransitionRSM() + : Expression.Empty(), + + // only proceed when value if defined (if definition levels are used) + //_df.MaxDefinitionLevel > 0 + // ? 
Expression.IfThen(Expression.Equal(_dlVar, Expression.Constant(_df.MaxDefinitionLevel)), body) + // : body, + body, + + + // be careful to check for NEXT RL, not the current one + // repeat until RL == 0 (always zero for non-repeated fields so we are OK here in any situation) + Expression.IfThen( + Expression.Equal(GetCurrentRLOr0(), Expression.Constant(0)), + Expression.Break(rlBreakLabel)) + + ), rlBreakLabel)); + } + + public FieldAssembler Compile() { + + ParameterExpression classesParam = Expression.Parameter(typeof(IEnumerable), "classes"); + + Expression iteration = InjectColumn(); + + BlockExpression block = Expression.Block( + new[] { _classElementVar, _dataVar, _dataIdxVar, _dataElementVar, _dlIdxVar, _dlVar, _rlIdxVar, _rlVar, _hasData, _rsmVar }, + + // initialise array vars + Expression.Assign(_dataVar, + Expression.Convert(Expression.Property(_dcParam, nameof(DataColumn.Data)), _df.ClrType.MakeArrayType())), + Expression.Assign(_dataIdxVar, Expression.Property(_dcParam, nameof(DataColumn.Offset))), + Expression.Assign(_rlIdxVar, Expression.Constant(0)), + Expression.Assign(_rlVar, Expression.Constant(0)), + Expression.Assign(_dlIdxVar, Expression.Constant(0)), + Expression.Assign(_dlVar, Expression.Constant(0)), + + // allocate state machine + Expression.Assign(_rsmVar, Expression.NewArrayBounds(typeof(int), Expression.Constant(_df.MaxRepetitionLevel))), + + iteration.Loop(classesParam, typeof(TClass), _classElementVar) + ); + + return new FieldAssembler(_df, + Expression.Lambda, DataColumn>>(block, classesParam, _dcParam).Compile(), + block, iteration); + + } + } +} diff --git a/src/Parquet/Serialization/Dremel/FieldStriper.cs b/src/Parquet/Serialization/Dremel/FieldStriper.cs new file mode 100644 index 00000000..59b79325 --- /dev/null +++ b/src/Parquet/Serialization/Dremel/FieldStriper.cs @@ -0,0 +1,17 @@ +using System; +using System.Collections.Generic; +using System.Linq.Expressions; +using Parquet.Schema; + +namespace Parquet.Serialization.Dremel 
{ + class FieldStriper : FieldWorker { + + public FieldStriper(DataField field, Func, ShreddedColumn> striper, + Expression expression, Expression iterationExpression) + : base(field, expression, iterationExpression) { + Stripe = striper; + } + + public Func, ShreddedColumn> Stripe { get; } + } +} diff --git a/src/Parquet/Serialization/Dremel/FieldStriperCompiler.cs b/src/Parquet/Serialization/Dremel/FieldStriperCompiler.cs new file mode 100644 index 00000000..5fe4dbaf --- /dev/null +++ b/src/Parquet/Serialization/Dremel/FieldStriperCompiler.cs @@ -0,0 +1,258 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Linq.Expressions; +using System.Reflection; +using Parquet.Extensions; +using Parquet.Schema; + +namespace Parquet.Serialization.Dremel { + class FieldStriperCompiler { + + private static readonly MethodInfo LevelsAddMethod = typeof(List).GetMethod(nameof(IList.Add))!; + private readonly MethodInfo _valuesListAddMethod; + + private readonly ParquetSchema _schema; + private readonly DataField _df; + + // input parameters + private readonly ParameterExpression _dfParam = Expression.Parameter(typeof(DataField), "df"); + private readonly ParameterExpression _classesParam = Expression.Parameter(typeof(IEnumerable), "classes"); + private static readonly ConstructorInfo ShreddedColumnConstructor = + typeof(ShreddedColumn).GetConstructor(BindingFlags.Instance | BindingFlags.Public, null, + CallingConventions.HasThis, + new[] { typeof(Array), typeof(List), typeof(List) }, + null)!; + + // create lists for values, definition levels and repetition levels + private readonly Type _valuesListType; + private readonly ParameterExpression _valuesVar; + private readonly ParameterExpression _dlsVar; + private readonly ParameterExpression _rlsVar; + + // currently iterated class element + private readonly ParameterExpression _classElementVar = Expression.Variable(typeof(TClass), "curr"); + + public 
FieldStriperCompiler(ParquetSchema schema, DataField df) { + + _schema = schema; + _df = df; + + // + _valuesListType = typeof(List<>).MakeGenericType(df.ClrType); + _valuesVar = Expression.Variable(_valuesListType, "values"); + _dlsVar = Expression.Variable(typeof(List), "dls"); + _rlsVar = Expression.Variable(typeof(List), "rls"); + + // + _valuesListAddMethod = typeof(List<>).MakeGenericType(_df.ClrType).GetMethod(nameof(IList.Add))!; + } + + private static void Discover(Field field, out bool isRepeated) { + isRepeated = + field.SchemaType == SchemaType.List || + field.SchemaType == SchemaType.Map || + (field.SchemaType == SchemaType.Data && field is DataField rdf && rdf.IsArray); + } + + /// + /// + /// + /// + /// Definition level if value is defined. For optional atoms that are null it must be -1. + /// + /// + /// Value is atomic i.e. having real data value and not just RLs and DLs + /// + private Expression WriteValue(ParameterExpression valueVar, + int dl, Expression currentRlVar, + ParameterExpression isLeaf, bool isAtomic) { + + if(isAtomic) { + if(_df.IsNullable) { + + Expression getNonNullValue = _df.ClrNullableIfHasNullsType.IsSystemNullable() + ? Expression.Property(valueVar, "Value") + : valueVar; + + return Expression.IfThenElse( + // value == null? 
+ Expression.Equal(valueVar, Expression.Constant(null)), + + // only need RL and DL-1 + Expression.Block( + Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl - 1)), + Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar)), + + // everything, but value must be non-null + Expression.Block( + Expression.Call(_valuesVar, _valuesListAddMethod, getNonNullValue), + Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)), + Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar))); + + } else { + // required atomics are simple - add value, RL and DL as is + return Expression.Block( + Expression.Call(_valuesVar, _valuesListAddMethod, valueVar), + Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)), + Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar)); + } + } + + // non-atomics still need RL and DL dumped + return Expression.Block( + Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)), + Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar)); + + } + + private Expression WriteMissingValue(int dl, Expression currentRlVar) { + return Expression.Block( + Expression.Call(_dlsVar, LevelsAddMethod, Expression.Constant(dl)), + Expression.Call(_rlsVar, LevelsAddMethod, currentRlVar)); + } + + private Expression WhileBody(Expression element, bool isAtomic, int dl, ParameterExpression currentRlVar, ParameterExpression seenFieldsVar, Field field, int rlDepth, Type elementType, List path) { + string suffix = field.Path.ToString().Replace(".", "_"); + ParameterExpression chRepetitionLevelVar = Expression.Variable(typeof(int), $"chRepetitionLevel_{suffix}"); + ParameterExpression valueVar = Expression.Variable(elementType, $"value_{suffix}"); + ParameterExpression isLeafVar = Expression.Variable(typeof(bool), $"isLeaf_{suffix}"); + return Expression.Block( + new[] { chRepetitionLevelVar, valueVar, isLeafVar }, + + // L8 + Expression.Assign(chRepetitionLevelVar, currentRlVar), + + // L9-13 + Expression.IfThenElse( + // if 
seenFields.Contains(field.Path) + Expression.Call(seenFieldsVar, typeof(HashSet).GetMethod("Contains")!, Expression.Constant(field.Path.ToString())), + + // chRepetitionLevelVar = treeDepth + Expression.Assign(chRepetitionLevelVar, Expression.Constant(rlDepth)), + + // seenFields.Add(field.Path) + Expression.Call(seenFieldsVar, typeof(HashSet).GetMethod("Add")!, Expression.Constant(field.Path.ToString())) + ), + + // L14- + Expression.Assign(valueVar, element), + + isAtomic + ? Expression.Assign(isLeafVar, Expression.Constant(true)) + : Expression.Assign(isLeafVar, elementType.IsValueType ? Expression.Constant(false) : valueVar.IsNull()), + + Expression.IfThenElse( + Expression.IsTrue(isLeafVar), + WriteValue(valueVar, dl, chRepetitionLevelVar, isLeafVar, isAtomic), + isAtomic + ? Expression.Empty() + : DissectRecord(valueVar, field.NaturalChildren, field.GetNaturalChildPath(path), elementType, rlDepth, chRepetitionLevelVar) + ) + + ); + } + + private static Type ExtractElementTypeFromEnumerableType(Type t) { + if(t.TryExtractDictionaryType(out Type? keyType, out Type? valueType)) + return typeof(KeyValuePair<,>).MakeGenericType(keyType!, valueType!); + + if(t.TryExtractIEnumerableType(out Type? iet)) + return iet!; + + throw new ArgumentException($"type {t} is not single-element generic enumerable", nameof(t)); + } + + + private Expression DissectRecord( + Expression rootVar, + Field[] levelFields, + List path, + Type rootType, + int rlDepth, + ParameterExpression currentRlVar) { + + // walk schema, not class instance + // this means value must be propagated down the tree, even if it's not present + + string currentPathPart = path.First(); + Field? 
field = levelFields.FirstOrDefault(x => x.Name == currentPathPart); + if(field == null) + throw new NotSupportedException($"field '{currentPathPart}' not found"); + int dl = field.MaxDefinitionLevel; + + FieldStriperCompiler.Discover(field, out bool isRepeated); + bool isAtomic = path.Count == 1; + if(isRepeated) + rlDepth += 1; + + // -- + + // while "decoder" + + string levelPropertyName = field.ClrPropName ?? field.Name; + Expression levelProperty = Expression.Property(rootVar, levelPropertyName); + Type levelPropertyType = rootType.GetProperty(levelPropertyName)!.PropertyType; + ParameterExpression seenFieldsVar = Expression.Variable(typeof(HashSet), $"seenFieldsVar_{levelPropertyName}"); + + Expression extraBody; + if(isRepeated) { + Type elementType = ExtractElementTypeFromEnumerableType(levelPropertyType); + Expression collection = levelProperty; + ParameterExpression element = Expression.Variable(elementType, "element"); + Expression elementProcessor = WhileBody(element, isAtomic, dl, currentRlVar, seenFieldsVar, field, rlDepth, elementType, path); + extraBody = elementProcessor.Loop(collection, elementType, element); + + // todo: if levelProperty (collection) is null, we need extra iteration with null value (which rep and def level?) + // we do this iteration with non-collection condition below, so need to be done for collection as well. 
+ extraBody = Expression.IfThenElse( + Expression.Equal(levelProperty, Expression.Constant(null)), + WriteMissingValue(dl - 1, currentRlVar), + extraBody); + } else { + Expression element = levelProperty; + extraBody = WhileBody(element, isAtomic, dl, currentRlVar, seenFieldsVar, field, rlDepth, levelPropertyType, path); + } + + return Expression.Block( + new[] { seenFieldsVar }, + Expression.Assign(seenFieldsVar, Expression.New(typeof(HashSet))), + extraBody); + } + + public FieldStriper Compile() { + + ParameterExpression currentRl = Expression.Variable(typeof(int), "currentRl"); + + Expression iteration = DissectRecord(_classElementVar, _schema.Fields.ToArray(), _df.Path.ToList(), typeof(TClass), 0, currentRl); + Expression iterationLoop = iteration.Loop(_classesParam, typeof(TClass), _classElementVar); + + BlockExpression block = Expression.Block( + new[] { _valuesVar, _dlsVar, _rlsVar, _classElementVar, currentRl }, + + Expression.Assign(currentRl, Expression.Constant(0)), + + // init 3 building blocks + Expression.Block( + Expression.Assign(_valuesVar, Expression.New(_valuesListType)), + Expression.Assign(_dlsVar, Expression.New(typeof(List))), + Expression.Assign(_rlsVar, Expression.New(typeof(List)))), + + iterationLoop, + + // result: use triple to construct ShreddedColumn and return (last element in the block) + Expression.New(ShreddedColumnConstructor, + Expression.Call(_valuesVar, _valuesListType.GetMethod("ToArray")!), + _df.MaxDefinitionLevel == 0 ? Expression.Convert(Expression.Constant(null), typeof(List)) : _dlsVar, + _df.MaxRepetitionLevel == 0 ? 
Expression.Convert(Expression.Constant(null), typeof(List<int>)) : _rlsVar)
+                );
+
+            Func<DataField, IEnumerable<TClass>, ShreddedColumn> lambda = Expression
+                .Lambda<Func<DataField, IEnumerable<TClass>, ShreddedColumn>>(block, _dfParam, _classesParam)
+                .Compile();
+
+            return new FieldStriper<TClass>(_df, lambda, block, iteration);
+        }
+    }
+}
diff --git a/src/Parquet/Serialization/Dremel/FieldWorker.cs b/src/Parquet/Serialization/Dremel/FieldWorker.cs
new file mode 100644
index 00000000..58dac102
--- /dev/null
+++ b/src/Parquet/Serialization/Dremel/FieldWorker.cs
@@ -0,0 +1,20 @@
+using System.Linq.Expressions;
+using Parquet.Schema;
+
+namespace Parquet.Serialization.Dremel {
+    class FieldWorker<TClass> {
+        public DataField Field { get; }
+
+        public Expression Expression { get; }
+
+        public Expression IterationExpression { get; }
+
+        public FieldWorker(DataField field, Expression expression, Expression iterationExpression) {
+            Field = field;
+            Expression = expression;
+            IterationExpression = iterationExpression;
+        }
+
+        public override string ToString() => Field.ToString();
+    }
+}
diff --git a/src/Parquet/Serialization/Dremel/ParquetDictionary.cs b/src/Parquet/Serialization/Dremel/ParquetDictionary.cs
new file mode 100644
index 00000000..4c4c8e28
--- /dev/null
+++ b/src/Parquet/Serialization/Dremel/ParquetDictionary.cs
@@ -0,0 +1,63 @@
+using System.Collections.Generic;
+
+namespace Parquet.Serialization.Dremel {
+    class ParquetDictionary<TKey, TValue> : Dictionary<TKey, TValue>, IList<ParquetDictionary<TKey, TValue>.ParquetDictionaryElement>
+        where TKey : notnull {
+
+        private readonly List<ParquetDictionaryElement> _list = new();
+
+        #region [ IList Overrides ]
+        public ParquetDictionary<TKey, TValue>.ParquetDictionaryElement this[int index] {
+            get => _list[index];
+            set => _list[index] = value;
+        }
+
+        public bool IsReadOnly => false;
+
+        public void Add(ParquetDictionary<TKey, TValue>.ParquetDictionaryElement item) {
+            item.Parent = this;
+            _list.Add(item);
+        }
+
+        public bool Contains(ParquetDictionary<TKey, TValue>.ParquetDictionaryElement item) =>
+            _list.Contains(item);
+        public void CopyTo(ParquetDictionary<TKey, TValue>.ParquetDictionaryElement[] array, int arrayIndex) =>
_list.CopyTo(array, arrayIndex);
+        public int IndexOf(ParquetDictionary<TKey, TValue>.ParquetDictionaryElement item) =>
+            _list.IndexOf(item);
+        public void Insert(int index, ParquetDictionary<TKey, TValue>.ParquetDictionaryElement item) =>
+            _list.Insert(index, item);
+        public bool Remove(ParquetDictionary<TKey, TValue>.ParquetDictionaryElement item) =>
+            _list.Remove(item);
+        public void RemoveAt(int index) =>
+            _list.RemoveAt(index);
+        IEnumerator<ParquetDictionary<TKey, TValue>.ParquetDictionaryElement> IEnumerable<ParquetDictionary<TKey, TValue>.ParquetDictionaryElement>.GetEnumerator() =>
+            _list.GetEnumerator();
+
+        #endregion
+
+        public new int Count => _list.Count;
+
+        public class ParquetDictionaryElement {
+
+            private TValue? _value;
+
+            public ParquetDictionary<TKey, TValue>? Parent;
+
+            public TKey? Key { get; set; }
+
+            public TValue? Value {
+                get => _value;
+                set {
+                    _value = value;
+
+                    if(Parent != null && Key != null) {
+                        ((Dictionary<TKey, TValue>)Parent)[Key] = value!;
+                    }
+                }
+            }
+        }
+
+
+    }
+}
diff --git a/src/Parquet/Serialization/Dremel/ShreddedColumn.cs b/src/Parquet/Serialization/Dremel/ShreddedColumn.cs
new file mode 100644
index 00000000..700c8a02
--- /dev/null
+++ b/src/Parquet/Serialization/Dremel/ShreddedColumn.cs
@@ -0,0 +1,18 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Parquet.Serialization.Dremel {
+    class ShreddedColumn {
+
+        public ShreddedColumn(Array data, List<int>? definitionLevels, List<int>? repetitionLevels) {
+            Data = data;
+            DefinitionLevels = definitionLevels;
+            RepetitionLevels = repetitionLevels;
+        }
+
+        public Array Data;
+        public List<int>? DefinitionLevels { get; set; }
+        public List<int>?
RepetitionLevels { get; set; }
+    }
+}
diff --git a/src/Parquet/Serialization/Dremel/Striper.cs b/src/Parquet/Serialization/Dremel/Striper.cs
new file mode 100644
index 00000000..ada2749a
--- /dev/null
+++ b/src/Parquet/Serialization/Dremel/Striper.cs
@@ -0,0 +1,32 @@
+using System.Collections.Generic;
+using System.Linq;
+using Parquet.Schema;
+
+namespace Parquet.Serialization.Dremel {
+    /// <summary>
+    /// Not a stripper
+    /// </summary>
+    class Striper<TClass> {
+
+        public Striper(ParquetSchema schema) {
+            Schema = schema;
+
+            FieldStripers = schema
+                .GetDataFields()
+                .Select(CreateStriper)
+                .ToList();
+        }
+
+        public ParquetSchema Schema { get; }
+
+        public IReadOnlyList<FieldStriper<TClass>> FieldStripers { get; }
+
+        private FieldStriper<TClass> CreateStriper(DataField df) {
+            return new FieldStriperCompiler<TClass>(Schema, df).Compile();
+        }
+
+        public static Striper<TClass> Create() {
+            return new Striper<TClass>(typeof(TClass).GetParquetSchema(false));
+        }
+    }
+}
diff --git a/src/Parquet/Serialization/MSILGenerator.cs b/src/Parquet/Serialization/MSILGenerator.cs
index 7808c79a..6d1fa355 100644
--- a/src/Parquet/Serialization/MSILGenerator.cs
+++ b/src/Parquet/Serialization/MSILGenerator.cs
@@ -94,7 +94,7 @@ private void GenerateCollector(ILGenerator il, Type classType, Type?
prop = PropertyHelpers.GetDeclaredPropertyFromClassType(classType, f)?.PropertyType; if(prop == null) throw new InvalidOperationException("cannot get property"); - bool underlyingTypeIsEnumerable = prop.TryExtractEnumerableType(out _); + bool underlyingTypeIsEnumerable = prop.TryExtractIEnumerableType(out _); if(f.IsArray || underlyingTypeIsEnumerable) { if(addRepLevelMethod == null) @@ -132,7 +132,8 @@ private void GenerateCollector(ILGenerator il, Type classType, //get current value, converting if necessary il.Emit(Ldloc, currentElement.LocalIndex); il.Emit(Callvirt, getValueMethod); - if(conversion != null) conversion.Emit(il); + if(conversion != null) + conversion.Emit(il); il.Emit(Stloc, item.LocalIndex); //store in destination list @@ -193,7 +194,7 @@ private void GenerateAssigner(ILGenerator il, Type classType, DataField field, Type? prop = PropertyHelpers.GetDeclaredPropertyFromClassType(classType, field)?.PropertyType; if(prop == null) throw new InvalidOperationException("cannot get property type"); - bool underlyingTypeIsEnumerable = prop.TryExtractEnumerableType(out _); + bool underlyingTypeIsEnumerable = prop.TryExtractIEnumerableType(out _); if(field.IsArray || underlyingTypeIsEnumerable) { LocalBuilder repItem = il.DeclareLocal(typeof(int)); LocalBuilder dce = il.DeclareLocal(typeof(DataColumnEnumerator)); diff --git a/src/Parquet/Serialization/ParquetSerializer.cs b/src/Parquet/Serialization/ParquetSerializer.cs index ed1501a5..f67c850a 100644 --- a/src/Parquet/Serialization/ParquetSerializer.cs +++ b/src/Parquet/Serialization/ParquetSerializer.cs @@ -1,211 +1,71 @@ using System; -using System.Collections; using System.Collections.Generic; using System.IO; -using System.Linq.Expressions; +using System.Linq; using System.Threading; using System.Threading.Tasks; using Parquet.Data; +using Parquet.Extensions; using Parquet.Schema; +using Parquet.Serialization.Dremel; namespace Parquet.Serialization { /// - /// High-level object serialisation V2. 
Internal only while being worked on. - /// Comes as a rewrite of ParquetConvert/ClrBridge/MSILGenerator - /// TODO: - /// - lists - /// - maps - /// - structs - /// - append to file + /// High-level object serialisation. + /// Comes as a rewrite of ParquetConvert/ClrBridge/MSILGenerator and supports nested types as well. /// - internal static class ParquetSerializer { - - private static Expression LogDebug(string s) { - return Expression.Call(typeof(Console).GetMethod("WriteLine", new[] { typeof(string) })!, Expression.Constant(s)); - } - - private static Func, Array> CreateCollectionExpression(Type listElementType, FieldPath clrPath) { - - if(clrPath.Length > 1) - throw new NotImplementedException(); - - Type listType = typeof(List<>).MakeGenericType(listElementType); - - ParameterExpression classesParam = Expression.Parameter(typeof(IEnumerable), "classes"); - ParameterExpression resultVar = Expression.Variable(listType, "values"); - - // loop over collection - ParameterExpression enumeratorVar = Expression.Variable(typeof(IEnumerator), "enumerator"); - MethodCallExpression getEnumeratorCall = Expression.Call(classesParam, - typeof(IEnumerable).GetMethod(nameof(IEnumerable.GetEnumerator))!); - MethodCallExpression moveNextCall = Expression.Call(enumeratorVar, - typeof(IEnumerator).GetMethod(nameof(IEnumerator.MoveNext))!); - ParameterExpression classElementVar = Expression.Variable(typeof(TClass), "curr"); - LabelTarget loopBreakLabel = Expression.Label("loopBreak"); - ParameterExpression classPropertyVar = Expression.Variable(listElementType, "currProp"); - - - // doc: Expression.Loop is an infinite loop that can be exited with "break" - LoopExpression loop = Expression.Loop( - Expression.IfThenElse( - - // test - Expression.Equal(moveNextCall, Expression.Constant(true)), - - // if true - Expression.Block( - new[] { classElementVar, classPropertyVar }, - - // get class element into loopVar - Expression.Assign(classElementVar, Expression.Property(enumeratorVar, 
nameof(IEnumerator.Current))), - - // get value of the property - Expression.Assign(classPropertyVar, Expression.Property(classElementVar, clrPath.FirstPart!)), - - // add propVar to the result list - Expression.Call(resultVar, listType.GetMethod(nameof(IList.Add))!, classPropertyVar) - ), - - // if false - Expression.Break(loopBreakLabel) - ), loopBreakLabel); - - // final assembly - BlockExpression block = Expression.Block( - new[] { resultVar, enumeratorVar }, - - // create list instance directly in the batch - Expression.Assign(resultVar, Expression.New(listType)), - - // get enumerator from class collection - Expression.Assign(enumeratorVar, getEnumeratorCall), - - // loop over classes - loop, - - // doc: When the block expression is executed, it returns the value of the last expression in the block. - Expression.Call(resultVar, listType.GetMethod("ToArray")!) - ); - - return Expression.Lambda, Array>>(block, classesParam).Compile(); - } - - private static Action, DataColumn> CreateColumnInjectionExpression(DataField df, FieldPath clrPath) { - - if(clrPath.Length > 1) - throw new NotImplementedException(); - - bool isDictionary = typeof(TClass) == typeof(Dictionary); - - ParameterExpression classesParam = Expression.Parameter(typeof(IEnumerable), "classes"); - ParameterExpression dcParam = Expression.Parameter(typeof(DataColumn), "dc"); - - // loop over collection of classes - ParameterExpression enumeratorVar = Expression.Variable(typeof(IEnumerator), "enumerator"); - MethodCallExpression getEnumeratorCall = Expression.Call(classesParam, - typeof(IEnumerable).GetMethod(nameof(IEnumerable.GetEnumerator))!); - MethodCallExpression moveNextCall = Expression.Call(enumeratorVar, - typeof(IEnumerator).GetMethod(nameof(IEnumerator.MoveNext))!); - ParameterExpression classInstanceVar = Expression.Variable(typeof(TClass), "curr"); - LabelTarget loopBreakLabel = Expression.Label("loopBreak"); - - ParameterExpression arrayElementVar = 
Expression.Variable(df.ClrNullableIfHasNullsType, "currProp"); - ParameterExpression arrayVar = Expression.Variable(df.ClrNullableIfHasNullsType.MakeArrayType(), "data"); - ParameterExpression arrayIndexVar = Expression.Variable(typeof(int), "dataIdx"); - - - LoopExpression loop = Expression.Loop( - Expression.IfThenElse( - - // test - Expression.Equal(moveNextCall, Expression.Constant(true)), - - // if true - Expression.Block( - // the variables are scoped to this block, do not redefine variables from the outer block! - new[] { classInstanceVar, arrayElementVar }, - - // get class element into loopVar - Expression.Assign(classInstanceVar, Expression.Property(enumeratorVar, nameof(IEnumerator.Current))), - - // get array element value - Expression.Assign(arrayElementVar, - Expression.ArrayAccess( - arrayVar, - Expression.PostIncrementAssign(arrayIndexVar))), - - - // assign value to class property - Expression.Assign( - Expression.Property(classInstanceVar, clrPath.FirstPart!), - arrayElementVar - ) - ), - - // if false - Expression.Break(loopBreakLabel) - - ), - loopBreakLabel); - - // final assembly - - BlockExpression block = Expression.Block( - new[] { enumeratorVar, arrayVar, arrayIndexVar, }, - - // get enumerator from class collection - Expression.Assign(enumeratorVar, getEnumeratorCall), - - // initialise array vars - Expression.Assign(arrayVar, - Expression.Convert(Expression.Property(dcParam, nameof(DataColumn.Data)), df.ClrNullableIfHasNullsType.MakeArrayType())), - Expression.Assign(arrayIndexVar, Expression.Property(dcParam, nameof(DataColumn.Offset))), - - // loop over classes - loop); - - return Expression.Lambda, DataColumn>>(block, classesParam, dcParam).Compile(); - } - - - private static DataColumn CreateDataColumn(DataField df, IEnumerable classes) { - // we need to collect instance field into 2 collections: - // 1. Actual list of values (including nulls, as DataColumn will pack them on serialization into definition levels) - // 2. 
Repetition levels (for complex types only) - - // create destination list - Type valueType = df.ClrNullableIfHasNullsType; - - // now extract the values - Func, Array> cx = CreateCollectionExpression(valueType, df.Path); - Array data = cx(classes); - return new DataColumn(df, data); - } - + public static class ParquetSerializer { + + /// + /// Serialize + /// + /// + /// + /// + /// + /// + /// + /// public static async Task SerializeAsync(IEnumerable objectInstances, Stream destination, ParquetSerializerOptions? options = null, CancellationToken cancellationToken = default) { - ParquetSchema schema = typeof(T).GetParquetSchema(false); - DataField[] dataFields = schema.GetDataFields(); + Striper striper = new Striper(typeof(T).GetParquetSchema(false)); - using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, destination)) { - using ParquetRowGroupWriter rg = writer.CreateRowGroup(); + using(ParquetWriter writer = await ParquetWriter.CreateAsync(striper.Schema, destination, null, false, cancellationToken)) { - foreach(DataField df in dataFields) { + if(options != null) { + writer.CompressionMethod = options.CompressionMethod; + writer.CompressionLevel = options.CompressionLevel; + } - if(df.MaxRepetitionLevel > 0) - throw new NotImplementedException("complex types are not implemented yet"); + using ParquetRowGroupWriter rg = writer.CreateRowGroup(); - DataColumn dc = CreateDataColumn(df, objectInstances); - await rg.WriteColumnAsync(dc, cancellationToken); + foreach(FieldStriper fs in striper.FieldStripers) { + DataColumn dc; + try { + ShreddedColumn sc = fs.Stripe(fs.Field, objectInstances); + dc = new DataColumn(fs.Field, sc.Data, sc.DefinitionLevels, sc.RepetitionLevels); + await rg.WriteColumnAsync(dc, cancellationToken); + } catch(Exception ex) { + throw new ApplicationException($"failed to serialise data column '{fs.Field.Path}'", ex); + } } } - return schema; + return striper.Schema; } + /// + /// Serialise + /// + /// + /// + /// + /// + /// + 
/// public static async Task SerializeAsync(IEnumerable objectInstances, string filePath, ParquetSerializerOptions? options = null, CancellationToken cancellationToken = default) { @@ -213,28 +73,39 @@ public static async Task SerializeAsync(IEnumerable objectI return await SerializeAsync(objectInstances, fs, options, cancellationToken); } + /// + /// Deserialise + /// + /// + /// + /// + /// + /// public static async Task> DeserializeAsync(Stream source, CancellationToken cancellationToken = default) where T : new() { + + Assembler asm = new Assembler(typeof(T).GetParquetSchema(true)); var result = new List(); - using ParquetReader reader = await ParquetReader.CreateAsync(source); - DataField[] dataFields = reader.Schema.GetDataFields(); + using ParquetReader reader = await ParquetReader.CreateAsync(source, new ParquetOptions { UnpackDefinitions = false }); for(int rgi = 0; rgi < reader.RowGroupCount; rgi++) { using ParquetRowGroupReader rg = reader.OpenRowGroupReader(rgi); // add more empty class instances to the result int prevRowCount = result.Count; for(int i = 0; i < rg.RowCount; i++) { - result.Add(new T()); + var ne = new T(); + result.Add(ne); } - - foreach(DataField df in dataFields) { - // todo: check if destination type contain this property? 
- DataColumn dc = await rg.ReadColumnAsync(df, cancellationToken); - Action, DataColumn> xtree = CreateColumnInjectionExpression(df, df.Path); - xtree(result, dc); + foreach(FieldAssembler fasm in asm.FieldAssemblers) { + DataColumn dc = await rg.ReadColumnAsync(fasm.Field, cancellationToken); + try { + fasm.Assemble(result.Skip(prevRowCount), dc); + } catch(Exception ex) { + throw new InvalidOperationException($"failed to deserialise column '{fasm.Field.Path}', pseude code: ['{fasm.IterationExpression.GetPseudoCode()}']", ex); + } } } diff --git a/src/Parquet/Serialization/ParquetSerializerOptions.cs b/src/Parquet/Serialization/ParquetSerializerOptions.cs index bc6b3a3d..3439e5c6 100644 --- a/src/Parquet/Serialization/ParquetSerializerOptions.cs +++ b/src/Parquet/Serialization/ParquetSerializerOptions.cs @@ -1,12 +1,20 @@ -using System; -using System.Collections.Generic; -using System.IO.Compression; -using System.Text; +using System.IO.Compression; namespace Parquet.Serialization { - internal class ParquetSerializerOptions { + /// + /// Parquet serializer options + /// + public class ParquetSerializerOptions { + /// + /// Page compression method + /// public CompressionMethod CompressionMethod { get; set; } = CompressionMethod.Snappy; + + /// + /// Page compression level + /// + public CompressionLevel CompressionLevel = CompressionLevel.Optimal; } } diff --git a/src/Parquet/Serialization/TypeExtensions.cs b/src/Parquet/Serialization/TypeExtensions.cs index 97815687..cec7723f 100644 --- a/src/Parquet/Serialization/TypeExtensions.cs +++ b/src/Parquet/Serialization/TypeExtensions.cs @@ -43,10 +43,10 @@ private static List FindProperties(Type t, bool forWriting) { : props.Where(p => p.CanRead).ToList(); } - private static Field ConstructDataField(string name, string propertyName, Type t, PropertyInfo pi) { + private static Field ConstructDataField(string name, string propertyName, Type t, PropertyInfo? 
pi) { var r = new DataField(name, t, propertyName: propertyName); - ParquetColumnAttribute? columnAttr = pi.GetCustomAttribute(); + ParquetColumnAttribute? columnAttr = pi?.GetCustomAttribute(); if(columnAttr != null) { if(columnAttr.UseListField) { @@ -95,40 +95,58 @@ private static MapField ConstructMapField(string name, string propertyName, return mf; } - /// - /// Makes field from property. - /// - /// - /// - /// or complex field (recursively scans class). Can return null if property is explicitly marked to be ignored. - /// + private static ListField ConstructListField(string name, string propertyName, + Type elementType, + bool forWriting) { + + return new ListField(name, MakeField(elementType, ListField.ElementName, propertyName, null, forWriting)!); + } + private static Field? MakeField(PropertyInfo pi, bool forWriting) { if(ShouldIgnore(pi)) return null; Type t = pi.PropertyType; string name = GetColumnName(pi); + string propertyName = pi.Name; + + return MakeField(t, name, propertyName, pi, forWriting); + } + + /// + /// Makes field from property. + /// + /// Type of property + /// Parquet file column name + /// Class property name + /// Optional that can be used to get attribute metadata. + /// + /// or complex field (recursively scans class). Can return null if property is explicitly marked to be ignored. + /// + private static Field MakeField(Type t, string columnName, string propertyName, + PropertyInfo? pi, + bool forWriting) { Type bt = t.IsNullable() ? t.GetNonNullable() : t; - if(bt.TryExtractEnumerableType(out Type? bti)) { + if(!bt.IsGenericIDictionary() && bt.TryExtractIEnumerableType(out Type? bti)) { bt = bti!; } if(SchemaEncoder.IsSupported(bt)) { - return ConstructDataField(name, pi.Name, t, pi); + return ConstructDataField(columnName, propertyName, t, pi); } else if(t.TryExtractDictionaryType(out Type? tKey, out Type? 
tValue)) { - return ConstructMapField(name, pi.Name, tKey!, tValue!, forWriting); - } else if(t.TryExtractEnumerableType(out Type? elementType)) { - throw new NotImplementedException("lists are not implemented yet"); - } else if(t.IsClass) { - // must be a struct then! + return ConstructMapField(columnName, propertyName, tKey!, tValue!, forWriting); + } else if(t.TryExtractIEnumerableType(out Type? elementType)) { + return ConstructListField(columnName, propertyName, elementType!, forWriting); + } else if(t.IsClass || t.IsValueType) { + // must be a struct then (c# class or c# struct)! List props = FindProperties(t, forWriting); Field[] fields = props.Select(p => MakeField(p, forWriting)).Where(f => f != null).Select(f => f!).ToArray(); if(fields.Length == 0) - return null; + throw new InvalidOperationException($"property '{propertyName}' has no fields"); - return new StructField(name, fields); + return new StructField(columnName, fields); } throw new NotImplementedException();