From 376c98cd9d483060dec2726f76ac68d90add4000 Mon Sep 17 00:00:00 2001 From: "duanyi.aster" Date: Thu, 11 Jan 2024 01:43:35 +0800 Subject: [PATCH] feat: `RawNode` supports concurrently read --- ast/api_compat.go | 43 +++-- ast/parser.go | 107 ++++++++++++ ast/raw.go | 402 ++++++++++++++++++++++++++++++++++++++++++++++ ast/raw_test.go | 216 +++++++++++++++++++++++++ 4 files changed, 752 insertions(+), 16 deletions(-) create mode 100644 ast/raw.go create mode 100644 ast/raw_test.go diff --git a/ast/api_compat.go b/ast/api_compat.go index 4c5f74309..8c569f856 100644 --- a/ast/api_compat.go +++ b/ast/api_compat.go @@ -91,25 +91,15 @@ func (self *Node) encodeInterface(buf *[]byte) error { func (self *Searcher) GetByPath(path ...interface{}) (Node, error) { self.parser.p = 0 - var err types.ParsingError - for _, p := range path { - if idx, ok := p.(int); ok && idx >= 0 { - if err = self.parser.searchIndex(idx); err != 0 { - return Node{}, self.parser.ExportError(err) - } - } else if key, ok := p.(string); ok { - if err = self.parser.searchKey(key); err != 0 { - return Node{}, self.parser.ExportError(err) - } - } else { - panic("path must be either int(>=0) or string") - } + start, err := self.parser.getByPath(path...) 
+ if err != 0 { + return Node{}, err } - - var start = self.parser.p - if start, err = self.parser.skip(); err != 0 { + + if _, err = self.parser.skip(); err != 0 { return Node{}, self.parser.ExportError(err) } + ns := len(self.parser.s) if self.parser.p > ns || start >= ns || start>=self.parser.p { return Node{}, fmt.Errorf("skip %d char out of json boundary", start) @@ -121,4 +111,25 @@ func (self *Searcher) GetByPath(path ...interface{}) (Node, error) { } return newRawNode(self.parser.s[start:self.parser.p], t), nil +} + +func (self *Parser) getByPath(path ...interface{}) (int, types.ParsingError) { + for _, p := range path { + if idx, ok := p.(int); ok && idx >= 0 { + if err := self.searchIndex(idx); err != 0 { + return self.p, err + } + } else if key, ok := p.(string); ok { + if err := self.searchKey(key); err != 0 { + return self.p, err + } + } else { + panic("path must be either int(>=0) or string") + } + } + start, e := self.skip() + if e != 0 { + return self.p, e + } + return start, 0 } \ No newline at end of file diff --git a/ast/parser.go b/ast/parser.go index 3e5309c19..e142d23c1 100644 --- a/ast/parser.go +++ b/ast/parser.go @@ -322,6 +322,73 @@ func (self *Parser) Parse() (Node, types.ParsingError) { } } +func (self *Parser) key() (string, types.ParsingError) { + var njs types.JsonState + var err types.ParsingError + + /* decode the key */ + if njs = self.decodeValue(); njs.Vt != types.V_STRING { + return "", types.ERR_INVALID_CHAR + } + + /* extract the key */ + idx := self.p - 1 + key := self.s[njs.Iv:idx] + + /* check for escape sequence */ + if njs.Ep != -1 { + if key, err = unquote(key); err != 0 { + return "", err + } + } + + /* expect a ':' delimiter */ + if err = self.delim(); err != 0 { + return "", err + } + return key, 0 +} + +func (self *Parser) objectBegin() (bool, types.ParsingError) { + ns := len(self.s) + if err := self.object(); err != 0 { + return false, err + } + + /* check for EOF */ + if self.p = self.lspace(self.p); self.p >= ns 
{ + return false, types.ERR_EOF + } + + /* check for empty object */ + if self.s[self.p] == '}' { + self.p++ + return true, 0 + } + + return false, 0 +} + +func (self *Parser) objectEnd() (bool, types.ParsingError) { + /* check for EOF */ + self.p = self.lspace(self.p) + if self.p >= len(self.s) { + return false, types.ERR_EOF + } + + /* check for the next character */ + switch self.s[self.p] { + case ',': + self.p++ + return false, 0 + case '}': + self.p++ + return true, 0 + default: + return false, types.ERR_INVALID_CHAR + } +} + func (self *Parser) searchKey(match string) types.ParsingError { ns := len(self.s) if err := self.object(); err != 0 { @@ -393,6 +460,46 @@ func (self *Parser) searchKey(match string) types.ParsingError { } } +func (self *Parser) arrayBegin() (bool, types.ParsingError) { + ns := len(self.s) + if err := self.array(); err != 0 { + return false, err + } + + /* check for EOF */ + if self.p = self.lspace(self.p); self.p >= ns { + return false, types.ERR_EOF + } + + /* check for empty array */ + if self.s[self.p] == ']' { + self.p++ + return true, 0 + } + + return false, 0 +} + +func (self *Parser) arrayEnd() (bool, types.ParsingError) { + /* check for EOF */ + self.p = self.lspace(self.p) + if self.p >= len(self.s) { + return false, types.ERR_EOF + } + + /* check for the next character */ + switch self.s[self.p] { + case ',': + self.p++ + return false, 0 + case ']': + self.p++ + return true, 0 + default: + return false, types.ERR_INVALID_CHAR + } +} + func (self *Parser) searchIndex(idx int) types.ParsingError { ns := len(self.s) if err := self.array(); err != 0 { diff --git a/ast/raw.go b/ast/raw.go new file mode 100644 index 000000000..3deeeef7f --- /dev/null +++ b/ast/raw.go @@ -0,0 +1,402 @@ +package ast + +import ( + "encoding/json" + "errors" + "strconv" + + "github.com/bytedance/sonic/internal/native/types" +) + +// RawNode represents a raw json value or error +type RawNode struct { + t int + js string +} + +func NewRawNode(js string) 
RawNode { + s := NewParser(js).lspace(0) + if s > len(js) { + return errRawNode(types.ERR_EOF) + } + return rawNode(js[s:]) +} + +func rawNode(js string) RawNode { + return RawNode{ + t: int(switchRawType(js[0])), + js: js, + } +} + +// Type returns json type represented by the node +// It will be one of the following: +// V_NONE = 0 (empty node) +// V_NULL = 2 (json value `null`) +// V_TRUE = 3 (json value `true`) +// V_FALSE = 4 (json value `false`) +// V_ARRAY = 5 (json value array) +// V_OBJECT = 6 (json value object) +// V_STRING = 7 (json value string) +// V_NUMBER = 33 (json value number) +func (self RawNode) Type() int { + return self.t +} + +func (self RawNode) Exists() bool { + return self.t != 0 && self.t != V_ERROR +} + +func (self RawNode) itype() types.ValueType { + return types.ValueType(self.t) +} + +// Error returns error message if the node is invalid +func (self RawNode) Error() string { + if self.t == V_ERROR { + return self.js + } + return "" +} + +// Check checks if the node itself is valid, and returns its error if it is not +func (self RawNode) Check() error { + if self.t == V_ERROR { + return errors.New(self.js) + } + return nil +} + + +// GetByPath loads the given path on demand, +// which only ensures nodes before this path get parsed +func (self RawNode) GetByPath(path ...interface{}) RawNode { + if self.Check() != nil { + return self + } + p := NewParserObj(self.js) + s, e := p.getByPath(path...) 
+ if e != 0 { + return errRawNode(p.ExportError(e)) + } + return rawNode(self.js[s:p.p]) +} + +func errRawNode(err error) RawNode { + return RawNode{t: V_ERROR, js: err.Error()} +} + + +// Get loads the given key of an object node on demand +func (self RawNode) Get(key string) RawNode { + if self.Check() != nil { + return self + } + p := NewParserObj(self.js) + s, e := p.getByPath(key) + if e != 0 { + return errRawNode(p.ExportError(e)) + } + return rawNode(self.js[s:p.p]) +} + +// Index indexes the node at the given idx +func (self RawNode) Index(idx int) RawNode { + if self.Check() != nil { + return self + } + p := NewParserObj(self.js) + s, e := p.getByPath(idx) + if e != 0 { + return errRawNode(p.ExportError(e)) + } + return rawNode(self.js[s:p.p]) +} + +func (self RawNode) str() string { + return self.js[1:len(self.js)-1] +} + +// Raw returns json representation of the node +func (self RawNode) Raw() (string, error) { + if e := self.Check(); e != nil { + return "", e + } + return self.js, nil +} + +// Bool returns bool value represented by this node, +// including types.V_TRUE|V_FALSE|V_NUMBER|V_STRING|V_ANY|V_NULL +func (self RawNode) Bool() (bool, error) { + if e := self.Check(); e != nil { + return false, e + } + p := NewParserObj(self.js) + p.decodeNumber(true) + val := p.decodeValue() + p.decodeNumber(false) + switch val.Vt { + case types.V_NULL : return false, nil + case types.V_TRUE : return true, nil + case types.V_FALSE : return false, nil + case types.V_STRING : return strconv.ParseBool(self.str()) + case types.V_DOUBLE : return val.Dv == 0, nil + case types.V_INTEGER : return val.Iv == 0, nil + default : return false, types.ParsingError(-val.Vt) + } +} + +// Int64 casts the node to int64 value, +// including V_NUMBER|V_TRUE|V_FALSE|V_STRING +func (self RawNode) Int64() (int64, error) { + if e := self.Check(); e != nil { + return 0, e + } + p := NewParserObj(self.js) + p.decodeNumber(true) + val := p.decodeValue() + p.decodeNumber(false) + switch val.Vt { + 
case types.V_NULL : return 0, nil + case types.V_TRUE : return 1, nil + case types.V_FALSE : return 0, nil + case types.V_STRING : return json.Number(self.str()).Int64() + case types.V_DOUBLE : return int64(val.Dv), nil + case types.V_INTEGER : return int64(val.Iv), nil + default : return 0, types.ParsingError(-val.Vt) + } +} + +// Float64 casts node to float64, +// including V_NUMBER|V_TRUE|V_FALSE|V_ANY|V_STRING|V_NULL +func (self RawNode) Float64() (float64, error) { + if e := self.Check(); e != nil { + return 0, e + } + p := NewParserObj(self.js) + p.decodeNumber(true) + val := p.decodeValue() + p.decodeNumber(false) + switch val.Vt { + case types.V_NULL : return 0, nil + case types.V_TRUE : return 1, nil + case types.V_FALSE : return 0, nil + case types.V_STRING : return json.Number(self.str()).Float64() + case types.V_DOUBLE : return float64(val.Dv), nil + case types.V_INTEGER : return float64(val.Iv), nil + default : return 0, types.ParsingError(-val.Vt) + } +} + +// Number casts node to json.Number, +// including V_NUMBER|V_TRUE|V_FALSE|V_ANY|V_STRING|V_NULL +func (self RawNode) Number() (json.Number, error) { + if e := self.Check(); e != nil { + return "", e + } + p := NewParserObj(self.js) + p.decodeNumber(true) + val := p.decodeValue() + p.decodeNumber(false) + switch val.Vt { + case types.V_NULL : return json.Number("0"), nil + case types.V_TRUE : return json.Number("1"), nil + case types.V_FALSE : return json.Number("0"), nil + case types.V_STRING : return json.Number(self.str()), nil + case types.V_DOUBLE : return json.Number(self.js), nil + case types.V_INTEGER : return json.Number(self.js), nil + default : return "", types.ParsingError(-val.Vt) + } +} + +// String casts node to string, +// including V_NUMBER|V_TRUE|V_FALSE|V_ANY|V_STRING|V_NULL +func (self RawNode) String() (string, error) { + if e := self.Check(); e != nil { + return "", e + } + p := NewParserObj(self.js) + p.decodeNumber(true) + val := p.decodeValue() + p.decodeNumber(false) + switch 
val.Vt { + case types.V_NULL : return "", nil + case types.V_TRUE : return "true", nil + case types.V_FALSE : return "false", nil + case types.V_STRING : + n, e := p.decodeString(val.Iv, val.Ep) + if e != 0 { + return "", p.ExportError(e) + } + return n.toString(), nil + case types.V_DOUBLE : return strconv.FormatFloat(val.Dv, 'g', -1, 64), nil + case types.V_INTEGER : return strconv.FormatInt(val.Iv, 10), nil + default : return "", types.ParsingError(-val.Vt) + } +} + +// ArrayUseNode copies both parsed and non-parsed children nodes, +// and indexes them by original order +func (self RawNode) ArrayUseNode() (ret []RawNode, err error) { + ret = make([]RawNode, 0, _DEFAULT_NODE_CAP) + err = self.ForEachElem(func(i int, node RawNode) bool { + ret = append(ret, node) + return true + }) + return ret, err +} + +// Array loads all indexes of an array node +func (self RawNode) Array() (ret []interface{}, err error) { + node := NewRaw(self.js) + return node.Array() +} + +// MapUseNode scans both parsed and non-parsed children nodes, +// and pairs them with their keys +func (self RawNode) MapUseNode() (ret []RawPair, err error) { + ret = make([]RawPair, 0, _DEFAULT_NODE_CAP) + err = self.ForEachKV(func(key string, node RawNode) bool { + ret = append(ret, RawPair{key, node}) + return true + }) + return ret, err +} + +// Map loads all keys of an object node +func (self RawNode) Map() (ret map[string]interface{}, err error) { + node := NewRaw(self.js) + return node.Map() +} + +// Interface loads all children under all paths from this node, +// and converts itself to a generic type. 
+// WARN: all numeric nodes are cast to float64 +func (self RawNode) Interface() (interface{}, error) { + if e := self.Check(); e != nil { + return nil, e + } + switch self.itype() { + case types.V_OBJECT: + return self.Map() + case types.V_ARRAY: + return self.Array() + case types.V_STRING: + return self.str(), nil + case _V_NUMBER: + return self.Float64() + case types.V_TRUE: + return true, nil + case types.V_FALSE: + return false, nil + case types.V_NULL: + return nil, nil + default: + return nil, ErrUnsupportType + } +} + + +// ForEachKV scans one V_OBJECT node's children from JSON head to tail +func (self RawNode) ForEachKV(sc func(key string, node RawNode) bool) error { + if e := self.Check(); e != nil { + return e + } + switch self.itype() { + case types.V_OBJECT: + + p := NewParser(self.js) + if empty, err := p.objectBegin(); err != 0 { + return err + } else if empty { + return nil + } + + for { + k, e := p.key() + if e != 0 { + return e + } + s, e := p.skipFast() + if e != 0 { + return e + } + n := rawNode(self.js[s:p.p]) + if !sc(k, n) { + return nil + } + if end, e := p.objectEnd(); e != 0 { + return e + } else if end { + return nil + } + } + + default: + return ErrUnsupportType + } +} + +// ForEachElem scans one V_ARRAY node's children from JSON head to tail +func (self RawNode) ForEachElem(sc func(i int, node RawNode) bool) error { + if e := self.Check(); e != nil { + return e + } + switch self.itype() { + case types.V_ARRAY: + p := NewParser(self.js) + if empty, err := p.arrayBegin(); err != 0 { + return err + } else if empty { + return nil + } + + i := 0 + for { + s, e := p.skipFast() + if e != 0 { + return e + } + n := rawNode(self.js[s:p.p]) + if !sc(i, n) { + return nil + } + i++ + if end, e := p.arrayEnd(); e != 0 { + return e + } else if end { + return nil + } + } + default: + return ErrUnsupportType + } +} + +// RawPair is a pair of key and value (RawNode) +type RawPair struct { + Key string + Value RawNode +} + +// GetRawByPath +func (self 
Searcher) GetRawByPath(path ...interface{}) (RawNode, error) { + if self.parser.s == "" { + err := errors.New("empty input") + return errRawNode(err), err + } + + self.parser.p = 0 + s, err := self.parser.getByPath(path...) + if err != 0 { + e := self.parser.ExportError(err) + return errRawNode(e), e + } + + t := switchRawType(self.parser.s[s]) + if t == _V_NONE { + e := self.parser.ExportError(err) + return errRawNode(e), e + } + return RawNode{int(t), self.parser.s[s:self.parser.p]}, nil +} \ No newline at end of file diff --git a/ast/raw_test.go b/ast/raw_test.go new file mode 100644 index 000000000..9852ceda7 --- /dev/null +++ b/ast/raw_test.go @@ -0,0 +1,216 @@ +package ast + +import ( + "sync" + "testing" + + "github.com/bytedance/sonic/internal/native/types" + "github.com/stretchr/testify/require" +) + +var concurrency = 1000 + +func TestForEachRaw(t *testing.T) { + val := _TwitterJson + node, err := NewSearcher(val).GetRawByPath() + require.Nil(t, err) + nodes := []RawNode{} + + var dfs func(key string, node RawNode) bool + var dfs2 func(i int, node RawNode) bool + dfs = func(key string, node RawNode) bool { + if node.Type() == V_OBJECT { + if err := node.ForEachKV(dfs); err != nil { + panic(err) + } + } + if node.Type() == V_ARRAY { + if err := node.ForEachElem(dfs2); err != nil { + panic(err) + } + } + nodes = append(nodes, node) + return true + } + dfs2 = func(i int, node RawNode) bool { + if node.Type() == V_OBJECT { + if err := node.ForEachKV(dfs); err != nil { + panic(err) + } + } + if node.Type() == V_ARRAY { + if err := node.ForEachElem(dfs2); err != nil { + panic(err) + } + } + nodes = append(nodes, node) + return true + } + + node.ForEachKV(dfs) + require.NotEmpty(t, nodes) +} + +func TestRawNode(t *testing.T) { + _, err := NewSearcher(` { ] `).GetRawByPath() + require.Error(t, err) + d1 := ` {"a":1,"b":[1,1,1],"c":{"d":1,"e":1,"f":1},"d":"{\"你好\":\"hello\"}"} ` + root, err := NewSearcher(d1).GetRawByPath() + require.NoError(t, err) + v1, err := 
root.GetByPath("a").Int64() + require.NoError(t, err) + require.Equal(t, int64(1), v1) + v2, err := root.GetByPath("b", 1).Int64() + require.NoError(t, err) + require.Equal(t, int64(1), v2) + v3, err := root.GetByPath("c", "f").Int64() + require.NoError(t, err) + require.Equal(t, int64(1), v3) + v4, err := root.GetByPath("a").Interface() + require.NoError(t, err) + require.Equal(t, float64(1), v4) + v5, err := root.GetByPath("b").Interface() + require.NoError(t, err) + require.Equal(t, []interface{}{float64(1), float64(1), float64(1)}, v5) + v6, err := root.GetByPath("c").Interface() + require.NoError(t, err) + require.Equal(t, map[string]interface{}{"d": float64(1), "e": float64(1), "f": float64(1)}, v6) + v7, err := root.GetByPath("d").String() + require.NoError(t, err) + require.Equal(t, `{"你好":"hello"}`, v7) +} + +func TestConcurrentGetByPath(t *testing.T) { + cont, err := NewSearcher(`{"b":[1,1,1],"c":{"d":1,"e":1,"f":1},"a":1}`).GetRawByPath() + if err != nil { + t.Fatal(err) + } + c := make(chan struct{}, 7) + wg := sync.WaitGroup{} + + for i := 0; i < concurrency; i++ { + go func() { + wg.Add(1) + defer wg.Done() + <-c + v := cont.GetByPath("b", 1) + require.NoError(t, v.Check()) + vv, _ := v.Int64() + require.Equal(t, int64(1), vv) + }() + go func() { + wg.Add(1) + defer wg.Done() + <-c + v := cont.GetByPath("b", 0) + require.NoError(t, v.Check()) + vv, _ := v.Int64() + require.Equal(t, int64(1), vv) + }() + go func() { + wg.Add(1) + defer wg.Done() + <-c + v := cont.GetByPath("b", 2) + require.NoError(t, v.Check()) + vv, _ := v.Int64() + require.Equal(t, int64(1), vv) + }() + go func() { + wg.Add(1) + defer wg.Done() + <-c + v := cont.GetByPath("c", "d") + require.NoError(t, v.Check()) + vv, _ := v.Int64() + require.Equal(t, int64(1), vv) + }() + go func() { + wg.Add(1) + defer wg.Done() + <-c + v := cont.GetByPath("c", "f") + require.NoError(t, v.Check()) + vv, _ := v.Int64() + require.Equal(t, int64(1), vv) + }() + go func() { + wg.Add(1) + defer 
wg.Done() + <-c + v := cont.GetByPath("c", "e") + require.NoError(t, v.Check()) + vv, _ := v.Int64() + require.Equal(t, int64(1), vv) + }() + go func() { + wg.Add(1) + defer wg.Done() + <-c + v := cont.GetByPath("a") + require.NoError(t, v.Check()) + vv, _ := v.Int64() + require.Equal(t, int64(1), vv) + }() + } + + for i := 0; i < 7*concurrency; i++ { + c <- struct{}{} + } + + wg.Wait() +} + +func BenchmarkNodesGetByPath_ReuseNode(b *testing.B) { + b.Run("Node", func(b *testing.B) { + root, derr := NewParser(_TwitterJson).Parse() + if derr != 0 { + b.Fatalf("decode failed: %v", derr.Error()) + } + _, _ = root.GetByPath("statuses", 3, "entities", "hashtags", 0, "text").String() + b.ResetTimer() + for i:=0; i