-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
moved parquet-tool to use cobra cli (#578)
Added find command to search for values in parquet files
- Loading branch information
Showing
8 changed files
with
379 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
package cmd | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
"strings" | ||
|
||
"github.com/dustin/go-humanize" | ||
"github.com/olekukonko/tablewriter" | ||
"github.com/spf13/cobra" | ||
) | ||
|
||
var dumpCmd = &cobra.Command{ | ||
Use: "dump", | ||
Example: "parquet-tool dump <file.parquet>", | ||
Short: "dump the database", | ||
Args: cobra.ExactArgs(1), | ||
RunE: func(cmd *cobra.Command, args []string) error { | ||
return dump(args[0]) | ||
}, | ||
} | ||
|
||
func dump(file string) error { | ||
pf, err := openParquetFile(file) | ||
if err != nil { | ||
return err | ||
} | ||
fmt.Println("schema:", pf.Schema()) | ||
meta := pf.Metadata() | ||
fmt.Println("Num Rows:", meta.NumRows) | ||
|
||
for i, rg := range meta.RowGroups { | ||
fmt.Println("\t Row group:", i) | ||
fmt.Println("\t\t Row Count:", rg.NumRows) | ||
fmt.Println("\t\t Row size:", humanize.Bytes(uint64(rg.TotalByteSize))) | ||
fmt.Println("\t\t Columns:") | ||
table := tablewriter.NewWriter(os.Stdout) | ||
table.SetHeader([]string{"Col", "Type", "NumVal", "Encoding", "TotalCompressedSize", "TotalUncompressedSize", "Compression", "%"}) | ||
for _, ds := range rg.Columns { | ||
table.Append( | ||
[]string{ | ||
strings.Join(ds.MetaData.PathInSchema, "/"), | ||
ds.MetaData.Type.String(), | ||
fmt.Sprintf("%d", ds.MetaData.NumValues), | ||
fmt.Sprintf("%s", ds.MetaData.Encoding), | ||
humanize.Bytes(uint64(ds.MetaData.TotalCompressedSize)), | ||
humanize.Bytes(uint64(ds.MetaData.TotalUncompressedSize)), | ||
fmt.Sprintf("%.2f", float64(ds.MetaData.TotalUncompressedSize-ds.MetaData.TotalCompressedSize)/float64(ds.MetaData.TotalCompressedSize)*100), | ||
fmt.Sprintf("%.2f", float64(ds.MetaData.TotalCompressedSize)/float64(rg.TotalByteSize)*100), | ||
}) | ||
} | ||
table.Render() | ||
} | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
package cmd | ||
|
||
import ( | ||
"fmt" | ||
"io/fs" | ||
"os" | ||
"path/filepath" | ||
"strconv" | ||
"strings" | ||
|
||
"github.com/charmbracelet/lipgloss" | ||
"github.com/charmbracelet/lipgloss/table" | ||
"github.com/parquet-go/parquet-go" | ||
"github.com/spf13/cobra" | ||
) | ||
|
||
var findCmd = &cobra.Command{ | ||
Use: "find", | ||
Example: "parquet-tool find timestamp=1698684986287 </path/to/file.parquet or directory>", | ||
Short: "Find value(s) in a parquet file", | ||
Args: cobra.MinimumNArgs(2), | ||
RunE: func(cmd *cobra.Command, args []string) error { | ||
return findAll(args[1], args[0]) | ||
}, | ||
} | ||
|
||
var HeaderStyle = lipgloss.NewStyle(). | ||
Bold(true). | ||
Foreground(lipgloss.Color("#FAFAFA")). | ||
Background(lipgloss.Color("#7D56F4")) | ||
|
||
var EvenRowStyle = lipgloss.NewStyle(). | ||
Bold(false). | ||
Foreground(lipgloss.Color("#FAFAFA")) | ||
|
||
var OddRowStyle = lipgloss.NewStyle(). | ||
Bold(false). | ||
Foreground(lipgloss.Color("#a6a4a4")) | ||
|
||
func parseColumnArg(columnArg string) (string, string, error) { | ||
splits := strings.Split(columnArg, "=") | ||
if len(splits) != 2 { | ||
return "", "", fmt.Errorf("invalid column argument: %s; expected format of <column>=<value>", columnArg) | ||
} | ||
|
||
return splits[0], splits[1], nil | ||
} | ||
|
||
func findAll(fileOrDir, column string) error { | ||
info, err := os.Stat(fileOrDir) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
t := table.New(). | ||
Border(lipgloss.NormalBorder()). | ||
BorderStyle(lipgloss.NewStyle().Foreground(lipgloss.Color("99"))). | ||
StyleFunc(func(row, col int) lipgloss.Style { | ||
switch { | ||
case row == 0: | ||
return HeaderStyle | ||
case row%2 == 0: | ||
return EvenRowStyle | ||
default: | ||
return OddRowStyle | ||
} | ||
}). | ||
Headers("FILE", "ROW GROUP", "MIN", "MAX") | ||
defer fmt.Println(t) | ||
|
||
if !info.IsDir() { | ||
return find(fileOrDir, column, t) | ||
} | ||
|
||
return filepath.WalkDir(fileOrDir, func(path string, d fs.DirEntry, err error) error { | ||
if err != nil { | ||
return err | ||
} | ||
|
||
if d.IsDir() { | ||
return nil | ||
} | ||
|
||
return find(path, column, t) | ||
}) | ||
} | ||
|
||
func find(file, column string, t *table.Table) error { | ||
pf, err := openParquetFile(file) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// TODO: would be nice to support humand readable timestamps; and parse them into int64s | ||
column, val, err := parseColumnArg(column) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for i, rg := range pf.RowGroups() { | ||
schema := rg.Schema() | ||
for j, field := range schema.Fields() { | ||
if field.Name() != column { | ||
continue | ||
} | ||
|
||
v, err := getValue(val, field.Type().Kind()) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// Check the min max values of each column | ||
index := rg.ColumnChunks()[j].ColumnIndex() | ||
for k := 0; k < index.NumPages(); k++ { | ||
|
||
if compare(index.MinValue(k), v) <= 0 && | ||
compare(index.MaxValue(k), v) >= 0 { | ||
t.Row(file, fmt.Sprint(i), fmt.Sprint(index.MinValue(k)), fmt.Sprint(index.MaxValue(k))) | ||
} | ||
} | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func getValue(val string, kind parquet.Kind) (parquet.Value, error) { | ||
switch kind { | ||
case parquet.Int64: | ||
i, err := strconv.ParseInt(val, 10, 64) | ||
if err != nil { | ||
return parquet.Value{}, err | ||
} | ||
|
||
return parquet.ValueOf(i), nil | ||
case parquet.Int96: | ||
fallthrough | ||
case parquet.Boolean: | ||
fallthrough | ||
case parquet.Int32: | ||
fallthrough | ||
case parquet.Float: | ||
fallthrough | ||
case parquet.Double: | ||
fallthrough | ||
case parquet.ByteArray: | ||
fallthrough | ||
case parquet.FixedLenByteArray: | ||
fallthrough | ||
default: | ||
return parquet.Value{}, fmt.Errorf("unsupported kind: %T", kind) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
package cmd | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
|
||
"github.com/spf13/cobra" | ||
) | ||
|
||
var rootCmd = &cobra.Command{ | ||
Use: "parquet-tool", | ||
Short: "Explort parquet files", | ||
Run: func(cmd *cobra.Command, args []string) { | ||
cmd.Help() | ||
}, | ||
} | ||
|
||
func Execute() { | ||
if err := rootCmd.Execute(); err != nil { | ||
fmt.Fprintln(os.Stderr, err) | ||
os.Exit(1) | ||
} | ||
} | ||
|
||
func init() { | ||
rootCmd.AddCommand(dumpCmd) | ||
rootCmd.AddCommand(findCmd) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
package cmd | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
|
||
"github.com/parquet-go/parquet-go" | ||
) | ||
|
||
func openParquetFile(file string) (*parquet.File, error) { | ||
f, err := os.Open(file) | ||
if err != nil { | ||
return nil, err | ||
} | ||
defer f.Close() | ||
stats, err := f.Stat() | ||
if err != nil { | ||
return nil, err | ||
} | ||
pf, err := parquet.OpenFile(f, stats.Size()) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return pf, nil | ||
} | ||
|
||
func compare(v1, v2 parquet.Value) int { | ||
if v1.IsNull() { | ||
if v2.IsNull() { | ||
return 0 | ||
} | ||
return 1 | ||
} | ||
|
||
if v2.IsNull() { | ||
return -1 | ||
} | ||
|
||
switch v1.Kind() { | ||
case parquet.Int32: | ||
return parquet.Int32Type.Compare(v1, v2) | ||
case parquet.Int64: | ||
return parquet.Int64Type.Compare(v1, v2) | ||
case parquet.Float: | ||
return parquet.FloatType.Compare(v1, v2) | ||
case parquet.Double: | ||
return parquet.DoubleType.Compare(v1, v2) | ||
case parquet.ByteArray, parquet.FixedLenByteArray: | ||
return parquet.ByteArrayType.Compare(v1, v2) | ||
case parquet.Boolean: | ||
return parquet.BooleanType.Compare(v1, v2) | ||
default: | ||
panic(fmt.Sprintf("unsupported value comparison: %v", v1.Kind())) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
module github.com/polarsignals/frostdb/cmd/parquet-tool | ||
|
||
go 1.21 | ||
|
||
require ( | ||
github.com/charmbracelet/lipgloss v0.9.1 | ||
github.com/dustin/go-humanize v1.0.1 | ||
github.com/olekukonko/tablewriter v0.0.5 | ||
github.com/parquet-go/parquet-go v0.19.0 | ||
github.com/spf13/cobra v1.7.0 | ||
) | ||
|
||
require ( | ||
github.com/andybalholm/brotli v1.0.5 // indirect | ||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect | ||
github.com/google/uuid v1.3.0 // indirect | ||
github.com/inconshreveable/mousetrap v1.1.0 // indirect | ||
github.com/klauspost/compress v1.16.7 // indirect | ||
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect | ||
github.com/mattn/go-isatty v0.0.18 // indirect | ||
github.com/mattn/go-runewidth v0.0.15 // indirect | ||
github.com/muesli/reflow v0.3.0 // indirect | ||
github.com/muesli/termenv v0.15.2 // indirect | ||
github.com/pierrec/lz4/v4 v4.1.18 // indirect | ||
github.com/rivo/uniseg v0.2.0 // indirect | ||
github.com/segmentio/encoding v0.3.6 // indirect | ||
github.com/spf13/pflag v1.0.5 // indirect | ||
golang.org/x/sys v0.12.0 // indirect | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs= | ||
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= | ||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= | ||
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= | ||
github.com/charmbracelet/lipgloss v0.9.1 h1:PNyd3jvaJbg4jRHKWXnCj1akQm4rh8dbEzN1p/u1KWg= | ||
github.com/charmbracelet/lipgloss v0.9.1/go.mod h1:1mPmG4cxScwUQALAAnacHaigiiHB9Pmr+v1VEawJl6I= | ||
github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= | ||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= | ||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= | ||
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= | ||
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= | ||
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= | ||
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg= | ||
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= | ||
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= | ||
github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= | ||
github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= | ||
github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= | ||
github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= | ||
github.com/mattn/go-isatty v0.0.18 h1:DOKFKCQ7FNG2L1rbrmstDN4QVRdS89Nkh85u68Uwp98= | ||
github.com/mattn/go-isatty v0.0.18/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= | ||
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= | ||
github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= | ||
github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZgg3U= | ||
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= | ||
github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= | ||
github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= | ||
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= | ||
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= | ||
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= | ||
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= | ||
github.com/parquet-go/parquet-go v0.19.0 h1:xtHOBIE0/8CRhmf06V1GJ7q3qARY2/kXiSweFlscwUQ= | ||
github.com/parquet-go/parquet-go v0.19.0/go.mod h1:6pu/Ca02WRyWyF6jbY1KceESGBZMsRMSijjLbajXaG8= | ||
github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ= | ||
github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= | ||
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= | ||
github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= | ||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= | ||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= | ||
github.com/segmentio/asm v1.1.3/go.mod h1:Ld3L4ZXGNcSLRg4JBsZ3//1+f/TjYl0Mzen/DQy1EJg= | ||
github.com/segmentio/encoding v0.3.6 h1:E6lVLyDPseWEulBmCmAKPanDd3jiyGDo5gMcugCRwZQ= | ||
github.com/segmentio/encoding v0.3.6/go.mod h1:n0JeuIqEQrQoPDGsjo8UNd1iA0U8d8+oHAA4E3G3OxM= | ||
github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I= | ||
github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0= | ||
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= | ||
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= | ||
golang.org/x/sys v0.0.0-20211110154304-99a53858aa08/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o= | ||
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= | ||
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= | ||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= |
Oops, something went wrong.