Skip to content

Commit

Permalink
moved parquet-tool to use cobra cli (#578)
Browse files Browse the repository at this point in the history
Added find command to search for values in parquet files
  • Loading branch information
thorfour authored Oct 31, 2023
1 parent 8af34fd commit bd48c2e
Show file tree
Hide file tree
Showing 8 changed files with 379 additions and 53 deletions.
56 changes: 56 additions & 0 deletions cmd/parquet-tool/cmd/dump.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package cmd

import (
"fmt"
"os"
"strings"

"github.com/dustin/go-humanize"
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
)

var dumpCmd = &cobra.Command{
Use: "dump",
Example: "parquet-tool dump <file.parquet>",
Short: "dump the database",
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
return dump(args[0])
},
}

func dump(file string) error {
pf, err := openParquetFile(file)
if err != nil {
return err
}
fmt.Println("schema:", pf.Schema())
meta := pf.Metadata()
fmt.Println("Num Rows:", meta.NumRows)

for i, rg := range meta.RowGroups {
fmt.Println("\t Row group:", i)
fmt.Println("\t\t Row Count:", rg.NumRows)
fmt.Println("\t\t Row size:", humanize.Bytes(uint64(rg.TotalByteSize)))
fmt.Println("\t\t Columns:")
table := tablewriter.NewWriter(os.Stdout)
table.SetHeader([]string{"Col", "Type", "NumVal", "Encoding", "TotalCompressedSize", "TotalUncompressedSize", "Compression", "%"})
for _, ds := range rg.Columns {
table.Append(
[]string{
strings.Join(ds.MetaData.PathInSchema, "/"),
ds.MetaData.Type.String(),
fmt.Sprintf("%d", ds.MetaData.NumValues),
fmt.Sprintf("%s", ds.MetaData.Encoding),
humanize.Bytes(uint64(ds.MetaData.TotalCompressedSize)),
humanize.Bytes(uint64(ds.MetaData.TotalUncompressedSize)),
fmt.Sprintf("%.2f", float64(ds.MetaData.TotalUncompressedSize-ds.MetaData.TotalCompressedSize)/float64(ds.MetaData.TotalCompressedSize)*100),
fmt.Sprintf("%.2f", float64(ds.MetaData.TotalCompressedSize)/float64(rg.TotalByteSize)*100),
})
}
table.Render()
}

return nil
}
153 changes: 153 additions & 0 deletions cmd/parquet-tool/cmd/find.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
package cmd

import (
"fmt"
"io/fs"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/charmbracelet/lipgloss"
"github.com/charmbracelet/lipgloss/table"
"github.com/parquet-go/parquet-go"
"github.com/spf13/cobra"
)

var findCmd = &cobra.Command{
Use: "find",
Example: "parquet-tool find timestamp=1698684986287 </path/to/file.parquet or directory>",
Short: "Find value(s) in a parquet file",
Args: cobra.MinimumNArgs(2),
RunE: func(cmd *cobra.Command, args []string) error {
return findAll(args[1], args[0])
},
}

var HeaderStyle = lipgloss.NewStyle().
Bold(true).
Foreground(lipgloss.Color("#FAFAFA")).
Background(lipgloss.Color("#7D56F4"))

var EvenRowStyle = lipgloss.NewStyle().
Bold(false).
Foreground(lipgloss.Color("#FAFAFA"))

var OddRowStyle = lipgloss.NewStyle().
Bold(false).
Foreground(lipgloss.Color("#a6a4a4"))

func parseColumnArg(columnArg string) (string, string, error) {
splits := strings.Split(columnArg, "=")
if len(splits) != 2 {
return "", "", fmt.Errorf("invalid column argument: %s; expected format of <column>=<value>", columnArg)
}

return splits[0], splits[1], nil
}

func findAll(fileOrDir, column string) error {
info, err := os.Stat(fileOrDir)
if err != nil {
return err
}

t := table.New().
Border(lipgloss.NormalBorder()).
BorderStyle(lipgloss.NewStyle().Foreground(lipgloss.Color("99"))).
StyleFunc(func(row, col int) lipgloss.Style {
switch {
case row == 0:
return HeaderStyle
case row%2 == 0:
return EvenRowStyle
default:
return OddRowStyle
}
}).
Headers("FILE", "ROW GROUP", "MIN", "MAX")
defer fmt.Println(t)

if !info.IsDir() {
return find(fileOrDir, column, t)
}

return filepath.WalkDir(fileOrDir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}

if d.IsDir() {
return nil
}

return find(path, column, t)
})
}

func find(file, column string, t *table.Table) error {
pf, err := openParquetFile(file)
if err != nil {
return err
}

// TODO: would be nice to support humand readable timestamps; and parse them into int64s
column, val, err := parseColumnArg(column)
if err != nil {
return err
}

for i, rg := range pf.RowGroups() {
schema := rg.Schema()
for j, field := range schema.Fields() {
if field.Name() != column {
continue
}

v, err := getValue(val, field.Type().Kind())
if err != nil {
return err
}

// Check the min max values of each column
index := rg.ColumnChunks()[j].ColumnIndex()
for k := 0; k < index.NumPages(); k++ {

if compare(index.MinValue(k), v) <= 0 &&
compare(index.MaxValue(k), v) >= 0 {
t.Row(file, fmt.Sprint(i), fmt.Sprint(index.MinValue(k)), fmt.Sprint(index.MaxValue(k)))
}
}
}
}

return nil
}

func getValue(val string, kind parquet.Kind) (parquet.Value, error) {
switch kind {
case parquet.Int64:
i, err := strconv.ParseInt(val, 10, 64)
if err != nil {
return parquet.Value{}, err
}

return parquet.ValueOf(i), nil
case parquet.Int96:
fallthrough
case parquet.Boolean:
fallthrough
case parquet.Int32:
fallthrough
case parquet.Float:
fallthrough
case parquet.Double:
fallthrough
case parquet.ByteArray:
fallthrough
case parquet.FixedLenByteArray:
fallthrough
default:
return parquet.Value{}, fmt.Errorf("unsupported kind: %T", kind)
}
}
28 changes: 28 additions & 0 deletions cmd/parquet-tool/cmd/root.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package cmd

import (
"fmt"
"os"

"github.com/spf13/cobra"
)

var rootCmd = &cobra.Command{
Use: "parquet-tool",
Short: "Explort parquet files",
Run: func(cmd *cobra.Command, args []string) {
cmd.Help()
},
}

func Execute() {
if err := rootCmd.Execute(); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
}

func init() {
rootCmd.AddCommand(dumpCmd)
rootCmd.AddCommand(findCmd)
}
56 changes: 56 additions & 0 deletions cmd/parquet-tool/cmd/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package cmd

import (
"fmt"
"os"

"github.com/parquet-go/parquet-go"
)

func openParquetFile(file string) (*parquet.File, error) {
f, err := os.Open(file)
if err != nil {
return nil, err
}
defer f.Close()
stats, err := f.Stat()
if err != nil {
return nil, err
}
pf, err := parquet.OpenFile(f, stats.Size())
if err != nil {
return nil, err
}

return pf, nil
}

func compare(v1, v2 parquet.Value) int {
if v1.IsNull() {
if v2.IsNull() {
return 0
}
return 1
}

if v2.IsNull() {
return -1
}

switch v1.Kind() {
case parquet.Int32:
return parquet.Int32Type.Compare(v1, v2)
case parquet.Int64:
return parquet.Int64Type.Compare(v1, v2)
case parquet.Float:
return parquet.FloatType.Compare(v1, v2)
case parquet.Double:
return parquet.DoubleType.Compare(v1, v2)
case parquet.ByteArray, parquet.FixedLenByteArray:
return parquet.ByteArrayType.Compare(v1, v2)
case parquet.Boolean:
return parquet.BooleanType.Compare(v1, v2)
default:
panic(fmt.Sprintf("unsupported value comparison: %v", v1.Kind()))
}
}
29 changes: 29 additions & 0 deletions cmd/parquet-tool/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
module github.com/polarsignals/frostdb/cmd/parquet-tool

go 1.21

require (
github.com/charmbracelet/lipgloss v0.9.1
github.com/dustin/go-humanize v1.0.1
github.com/olekukonko/tablewriter v0.0.5
github.com/parquet-go/parquet-go v0.19.0
github.com/spf13/cobra v1.7.0
)

require (
github.com/andybalholm/brotli v1.0.5 // indirect
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
github.com/google/uuid v1.3.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mattn/go-isatty v0.0.18 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/muesli/reflow v0.3.0 // indirect
github.com/muesli/termenv v0.15.2 // indirect
github.com/pierrec/lz4/v4 v4.1.18 // indirect
github.com/rivo/uniseg v0.2.0 // indirect
github.com/segmentio/encoding v0.3.6 // indirect
github.com/spf13/pflag v1.0.5 // indirect
golang.org/x/sys v0.12.0 // indirect
)
54 changes: 54 additions & 0 deletions cmd/parquet-tool/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/charmbracelet/lipgloss v0.9.1 h1:PNyd3jvaJbg4jRHKWXnCj1akQm4rh8dbEzN1p/u1KWg=
github.com/charmbracelet/lipgloss v0.9.1/go.mod h1:1mPmG4cxScwUQALAAnacHaigiiHB9Pmr+v1VEawJl6I=
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I=
github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY=
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/mattn/go-isatty v0.0.18 h1:DOKFKCQ7FNG2L1rbrmstDN4QVRdS89Nkh85u68Uwp98=
github.com/mattn/go-isatty v0.0.18/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U=
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s=
github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8=
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
github.com/parquet-go/parquet-go v0.19.0 h1:xtHOBIE0/8CRhmf06V1GJ7q3qARY2/kXiSweFlscwUQ=
github.com/parquet-go/parquet-go v0.19.0/go.mod h1:6pu/Ca02WRyWyF6jbY1KceESGBZMsRMSijjLbajXaG8=
github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ=
github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/segmentio/asm v1.1.3/go.mod h1:Ld3L4ZXGNcSLRg4JBsZ3//1+f/TjYl0Mzen/DQy1EJg=
github.com/segmentio/encoding v0.3.6 h1:E6lVLyDPseWEulBmCmAKPanDd3jiyGDo5gMcugCRwZQ=
github.com/segmentio/encoding v0.3.6/go.mod h1:n0JeuIqEQrQoPDGsjo8UNd1iA0U8d8+oHAA4E3G3OxM=
github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
golang.org/x/sys v0.0.0-20211110154304-99a53858aa08/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
Loading

0 comments on commit bd48c2e

Please sign in to comment.