From 801a82a3af6e60d688e12238dec22bd5c9ea7e10 Mon Sep 17 00:00:00 2001 From: Maximilian Hoffman Date: Wed, 27 Mar 2024 11:54:15 -0700 Subject: [PATCH] Statistics for multiple branches (#7558) * starter * prog * flush out statsdb interface fork, and noms implementation * more progress on tests * StatsIO tests passing * pretty close, thread canceller still broken * check context dropped inside refresh loop * tidy * nil panic * dolt harness setup imports setup statistics * lot of test fixes * bump * fmt * fix providers race * don't load in create database hook * bump * bump * fix integration tests * fix more bugs * fix drop * more test fixes * cleanup * missing updateActive * simplify bucket merging * nick comments * bump * fix bad merge * tidy and edits * [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh * bump test fixes * bump * fix bats * bump * [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh * redo create db to account for variety of edge cases * try path.join --------- Co-authored-by: max-hoffman --- go/Godeps/LICENSES | 33 + go/cmd/dolt/commands/engine/sqlengine.go | 7 +- go/go.mod | 3 +- go/go.sum | 5 +- go/libraries/doltcore/dbfactory/file.go | 4 + go/libraries/doltcore/doltdb/doltdb.go | 22 +- go/libraries/doltcore/migrate/environment.go | 12 +- go/libraries/doltcore/ref/ref.go | 2 +- go/libraries/doltcore/ref/stats_ref.go | 9 +- go/libraries/doltcore/schema/statistic.go | 3 - .../doltcore/sqle/cluster/initdbhook.go | 4 +- go/libraries/doltcore/sqle/database.go | 10 +- .../doltcore/sqle/database_provider.go | 14 +- .../doltcore/sqle/dprocedures/stats_funcs.go | 9 +- go/libraries/doltcore/sqle/dsess/variables.go | 1 + .../doltcore/sqle/dtables/statistics_table.go | 202 ++---- .../sqle/enginetest/dolt_engine_test.go | 35 +- .../doltcore/sqle/enginetest/dolt_harness.go | 30 +- .../sqle/enginetest/privilege_test.go | 3 +- .../doltcore/sqle/enginetest/stats_queries.go | 199 +++++- .../sqle/logictest/dolt/doltharness.go | 5 +- .../doltcore/sqle/stats/auto_refresh.go | 356 ----------- .../doltcore/sqle/stats/stats_provider.go | 583 ------------------ go/libraries/doltcore/sqle/stats/write.go | 211 ------- .../doltcore/sqle/statsnoms/database.go | 317 ++++++++++ go/libraries/doltcore/sqle/statsnoms/iter.go | 175 ++++++ .../sqle/{stats/read.go => statsnoms/load.go} | 46 +- go/libraries/doltcore/sqle/statsnoms/write.go | 144 +++++ .../doltcore/sqle/statspro/analyze.go | 241 ++++++++ .../doltcore/sqle/statspro/auto_refresh.go | 244 ++++++++ .../doltcore/sqle/statspro/configure.go | 148 +++++ .../doltcore/sqle/statspro/dolt_stats.go | 172 ++++++ .../sqle/{stats => statspro}/initdbhook.go | 48 +- .../doltcore/sqle/statspro/interface.go | 64 ++ .../doltcore/sqle/statspro/stats_provider.go | 326 ++++++++++ .../sqle/{stats => statspro}/update.go | 92 ++- .../sqle/{stats => statspro}/update_test.go | 2 +- .../doltcore/sqle/system_variables.go | 11 +- go/store/prolly/tuple_mutable_map.go | 7 + integration-tests/bats/stats.bats | 21 +- 40 files changed, 2322 insertions(+), 1498 deletions(-) delete mode 100644 go/libraries/doltcore/sqle/stats/auto_refresh.go delete mode 100644 go/libraries/doltcore/sqle/stats/stats_provider.go delete mode 100644 go/libraries/doltcore/sqle/stats/write.go create mode 100644 go/libraries/doltcore/sqle/statsnoms/database.go create mode 100644 go/libraries/doltcore/sqle/statsnoms/iter.go rename go/libraries/doltcore/sqle/{stats/read.go => statsnoms/load.go} (87%) create mode 100644 
go/libraries/doltcore/sqle/statsnoms/write.go create mode 100644 go/libraries/doltcore/sqle/statspro/analyze.go create mode 100644 go/libraries/doltcore/sqle/statspro/auto_refresh.go create mode 100644 go/libraries/doltcore/sqle/statspro/configure.go create mode 100644 go/libraries/doltcore/sqle/statspro/dolt_stats.go rename go/libraries/doltcore/sqle/{stats => statspro}/initdbhook.go (52%) create mode 100644 go/libraries/doltcore/sqle/statspro/interface.go create mode 100644 go/libraries/doltcore/sqle/statspro/stats_provider.go rename go/libraries/doltcore/sqle/{stats => statspro}/update.go (83%) rename go/libraries/doltcore/sqle/{stats => statspro}/update_test.go (99%) diff --git a/go/Godeps/LICENSES b/go/Godeps/LICENSES index 1d553dfb53a..57ae6a25e5f 100644 --- a/go/Godeps/LICENSES +++ b/go/Godeps/LICENSES @@ -11957,6 +11957,39 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. = LICENSE 3565fbf999a10a748647f3a2f7ff9f5dfcf1af7502a30f860ef0bf98 = ================================================================================ +================================================================================ += gopkg.in/errgo.v2 licensed under: = + +Copyright © 2013, Roger Peppe +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of this project nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ += LICENSE fdb54eb3c3cf061a91aac42ab8e6578c3c69de803c2becb0d86810a5 = +================================================================================ + ================================================================================ = gopkg.in/square/go-jose.v2 licensed under: = diff --git a/go/cmd/dolt/commands/engine/sqlengine.go b/go/cmd/dolt/commands/engine/sqlengine.go index 5584efef4c5..b6c855dc3f2 100644 --- a/go/cmd/dolt/commands/engine/sqlengine.go +++ b/go/cmd/dolt/commands/engine/sqlengine.go @@ -41,7 +41,8 @@ import ( "github.com/dolthub/dolt/go/libraries/doltcore/sqle/cluster" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/mysql_file_handler" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/stats" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statsnoms" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" "github.com/dolthub/dolt/go/libraries/utils/config" "github.com/dolthub/dolt/go/store/types" ) @@ -180,7 +181,7 @@ func NewSqlEngine( "authentication_dolt_jwt": NewAuthenticateDoltJWTPlugin(config.JwksConfig), }) - statsPro := stats.NewProvider() + statsPro := statspro.NewProvider(pro, statsnoms.NewNomsStatsFactory(mrEnv.RemoteDialProvider())) engine.Analyzer.Catalog.StatsProvider = statsPro engine.Analyzer.ExecBuilder = rowexec.DefaultBuilder @@ -192,7 +193,7 @@ func NewSqlEngine( // configuring stats depends on sessionBuilder // sessionBuilder needs ref to statsProv - if err = statsPro.Configure(ctx, sqlEngine.NewDefaultContext, bThreads, pro, dbs); err != nil { + if err = statsPro.Configure(ctx, sqlEngine.NewDefaultContext, bThreads, dbs); err != nil { fmt.Fprintln(cli.CliErr, err) } diff --git a/go/go.mod b/go/go.mod index 472462f19d1..8a753f4ecae 100644 --- a/go/go.mod +++ b/go/go.mod @@ -57,7 +57,7 @@ require ( github.com/cespare/xxhash v1.1.0 github.com/creasty/defaults v1.6.0 github.com/dolthub/flatbuffers/v23 v23.3.3-dh.2 - github.com/dolthub/go-mysql-server v0.18.1-0.20240326173717-f57423646998 + github.com/dolthub/go-mysql-server v0.18.1-0.20240326223629-0f6489fffde3 github.com/dolthub/swiss v0.1.0 github.com/goccy/go-json v0.10.2 github.com/google/go-github/v57 v57.0.0 @@ -87,6 +87,7 @@ require ( golang.org/x/exp v0.0.0-20230522175609-2e198f4a06a1 golang.org/x/text v0.14.0 gonum.org/v1/plot v0.11.0 + gopkg.in/errgo.v2 v2.1.0 gopkg.in/yaml.v3 v3.0.1 ) diff --git a/go/go.sum b/go/go.sum index 729386f7e30..e39315c062c 100644 --- a/go/go.sum +++ b/go/go.sum @@ -183,8 +183,8 @@ github.com/dolthub/fslock v0.0.3 h1:iLMpUIvJKMKm92+N1fmHVdxJP5NdyDK5bK7z7Ba2s2U= github.com/dolthub/fslock v0.0.3/go.mod h1:QWql+P17oAAMLnL4HGB5tiovtDuAjdDTPbuqx7bYfa0= github.com/dolthub/go-icu-regex v0.0.0-20230524105445-af7e7991c97e h1:kPsT4a47cw1+y/N5SSCkma7FhAPw7KeGmD6c9PBZW9Y= github.com/dolthub/go-icu-regex v0.0.0-20230524105445-af7e7991c97e/go.mod h1:KPUcpx070QOfJK1gNe0zx4pA5sicIK1GMikIGLKC168= -github.com/dolthub/go-mysql-server v0.18.1-0.20240326173717-f57423646998 h1:mW5rmY0D0DSek6/UW0uu3B1h84Fttk+Y9Dee1abLAM8= -github.com/dolthub/go-mysql-server v0.18.1-0.20240326173717-f57423646998/go.mod h1:99WjL4v4Ozkona7YowHl20+1B5DPhDbVZnf7WP+TX1U= +github.com/dolthub/go-mysql-server v0.18.1-0.20240326223629-0f6489fffde3 h1:TePc9iACeSLxXkXuf/kYusW6I3SeSq49ebKPccJVNKg= +github.com/dolthub/go-mysql-server v0.18.1-0.20240326223629-0f6489fffde3/go.mod h1:99WjL4v4Ozkona7YowHl20+1B5DPhDbVZnf7WP+TX1U= github.com/dolthub/ishell v0.0.0-20221214210346-d7db0b066488 h1:0HHu0GWJH0N6a6keStrHhUAK5/o9LVfkh44pvsV4514= 
github.com/dolthub/ishell v0.0.0-20221214210346-d7db0b066488/go.mod h1:ehexgi1mPxRTk0Mok/pADALuHbvATulTh6gzr7NzZto= github.com/dolthub/jsonpath v0.0.2-0.20240227200619-19675ab05c71 h1:bMGS25NWAGTEtT5tOBsCuCrlYnLRKpbJVJkDbrTRhwQ= @@ -1155,6 +1155,7 @@ gopkg.in/check.v1 v1.0.0-20200902074654-038fdea0a05b/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/cheggaaa/pb.v1 v1.0.25/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw= +gopkg.in/errgo.v2 v2.1.0 h1:0vLT13EuvQ0hNvakwLuFZ/jYrLp5F3kcWHXdRggjCE8= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/gcfg.v1 v1.2.3/go.mod h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= diff --git a/go/libraries/doltcore/dbfactory/file.go b/go/libraries/doltcore/dbfactory/file.go index d24bfa777d8..12e8ef23fe5 100644 --- a/go/libraries/doltcore/dbfactory/file.go +++ b/go/libraries/doltcore/dbfactory/file.go @@ -47,11 +47,15 @@ const ( // DataDir is the directory internal to the DoltDir which holds the noms files. DataDir = "noms" + // StatsDir is the directory in DoltDir that holds the database statistics + StatsDir = "stats" + ChunkJournalParam = "journal" ) // DoltDataDir is the directory where noms files will be stored var DoltDataDir = filepath.Join(DoltDir, DataDir) +var DoltStatsDir = filepath.Join(DoltDir, StatsDir) // FileFactory is a DBFactory implementation for creating local filesys backed databases type FileFactory struct { diff --git a/go/libraries/doltcore/doltdb/doltdb.go b/go/libraries/doltcore/doltdb/doltdb.go index dd9c652129a..4a9f505a34c 100644 --- a/go/libraries/doltcore/doltdb/doltdb.go +++ b/go/libraries/doltcore/doltdb/doltdb.go @@ -60,6 +60,7 @@ const ( // LocalDirDoltDB stores the db in the current directory var LocalDirDoltDB = "file://./" + dbfactory.DoltDataDir +var LocalDirStatsDB = "file://./" + dbfactory.DoltStatsDir // InMemDoltDB stores the DoltDB db in memory and is primarily used for testing var InMemDoltDB = "mem://" @@ -484,15 +485,6 @@ func (ddb *DoltDB) ResolveCommitRef(ctx context.Context, ref ref.DoltRef) (*Comm return NewCommit(ctx, ddb.vrw, ddb.ns, commitVal) } -// ResolveStatsRef takes a StatsRef and returns an address to a table. -func (ddb *DoltDB) ResolveStatsRef(ctx context.Context) (hash.Hash, bool) { - ds, err := ddb.db.GetDataset(ctx, ref.StatsRefName) - if err != nil { - return hash.Hash{}, false - } - return ds.MaybeHeadAddr() -} - // ResolveCommitRefAtRoot takes a DoltRef and returns a Commit, or an error if the commit cannot be found. The ref given must // point to a Commit. 
func (ddb *DoltDB) ResolveCommitRefAtRoot(ctx context.Context, ref ref.DoltRef, nomsRoot hash.Hash) (*Commit, error) { @@ -1804,8 +1796,8 @@ func (ddb *DoltDB) AddStash(ctx context.Context, head *Commit, stash *RootValue, return err } -func (ddb *DoltDB) SetStatisics(ctx context.Context, addr hash.Hash) error { - statsDs, err := ddb.db.GetDataset(ctx, ref.NewStatsRef().String()) +func (ddb *DoltDB) SetStatisics(ctx context.Context, branch string, addr hash.Hash) error { + statsDs, err := ddb.db.GetDataset(ctx, ref.NewStatsRef(branch).String()) if err != nil { return err } @@ -1813,8 +1805,8 @@ func (ddb *DoltDB) SetStatisics(ctx context.Context, addr hash.Hash) error { return err } -func (ddb *DoltDB) DropStatisics(ctx context.Context) error { - statsDs, err := ddb.db.GetDataset(ctx, ref.NewStatsRef().String()) +func (ddb *DoltDB) DropStatisics(ctx context.Context, branch string) error { + statsDs, err := ddb.db.GetDataset(ctx, ref.NewStatsRef(branch).String()) _, err = ddb.db.Delete(ctx, statsDs, "") if err != nil { @@ -1826,8 +1818,8 @@ func (ddb *DoltDB) DropStatisics(ctx context.Context) error { var ErrNoStatistics = errors.New("no statistics found") // GetStatistics returns the value of the singleton ref.StatsRef for this database -func (ddb *DoltDB) GetStatistics(ctx context.Context) (prolly.Map, error) { - ds, err := ddb.db.GetDataset(ctx, ref.NewStatsRef().String()) +func (ddb *DoltDB) GetStatistics(ctx context.Context, branch string) (prolly.Map, error) { + ds, err := ddb.db.GetDataset(ctx, ref.NewStatsRef(branch).String()) if err != nil { return prolly.Map{}, err } diff --git a/go/libraries/doltcore/migrate/environment.go b/go/libraries/doltcore/migrate/environment.go index 9b1a3cdc30d..a680eea107b 100644 --- a/go/libraries/doltcore/migrate/environment.go +++ b/go/libraries/doltcore/migrate/environment.go @@ -96,6 +96,12 @@ func initMigrationDB(ctx context.Context, existing *env.DoltEnv, src, dest files } ierr := src.Iter(doltDir, true, func(path string, size int64, isDir bool) (stop bool) { + path, err = filepath.Rel(base, path) + if err != nil { + stop = true + return + } + if isDir { err = dest.MkDirs(path) stop = err != nil @@ -105,12 +111,6 @@ func initMigrationDB(ctx context.Context, existing *env.DoltEnv, src, dest files return } - path, err = filepath.Rel(base, path) - if err != nil { - stop = true - return - } - if err = filesys.CopyFile(path, path, src, dest); err != nil { stop = true return diff --git a/go/libraries/doltcore/ref/ref.go b/go/libraries/doltcore/ref/ref.go index 3daf3fa7980..9bbb85ba857 100644 --- a/go/libraries/doltcore/ref/ref.go +++ b/go/libraries/doltcore/ref/ref.go @@ -202,7 +202,7 @@ func Parse(str string) (DoltRef, error) { } if prefix := PrefixForType(StatsRefType); strings.HasPrefix(str, prefix) { - return NewStashRef(), nil + return NewStatsRef(str[len(prefix):]), nil } return nil, ErrUnknownRefType diff --git a/go/libraries/doltcore/ref/stats_ref.go b/go/libraries/doltcore/ref/stats_ref.go index 86ed0c83802..7f957ae05bb 100644 --- a/go/libraries/doltcore/ref/stats_ref.go +++ b/go/libraries/doltcore/ref/stats_ref.go @@ -14,18 +14,15 @@ package ref -// StatsRefName is a dummy name, and there cannot be more than one stats ref. -const StatsRefName = "stats" - type StatsRef struct { stats string } var _ DoltRef = StatsRef{} -// NewStatsRef creates a reference to a statses list. There cannot be more than one statsRef. -func NewStatsRef() StatsRef { - return StatsRef{StatsRefName} +// NewStatsRef creates a reference to a statistic dataset head. 
+func NewStatsRef(branch string) StatsRef { + return StatsRef{branch} } // GetType will return StatsRefType diff --git a/go/libraries/doltcore/schema/statistic.go b/go/libraries/doltcore/schema/statistic.go index 4fdc0f24dac..1879951e10b 100644 --- a/go/libraries/doltcore/schema/statistic.go +++ b/go/libraries/doltcore/schema/statistic.go @@ -74,9 +74,6 @@ func StatsTableSqlSchema(dbName string) sql.PrimaryKeySchema { &sql.Column{Name: StatsDbColName, Type: types.Text, PrimaryKey: true, DatabaseSource: dbName}, &sql.Column{Name: StatsTableColName, Type: types.Text, PrimaryKey: true, DatabaseSource: dbName}, &sql.Column{Name: StatsIndexColName, Type: types.Text, PrimaryKey: true, DatabaseSource: dbName}, - &sql.Column{Name: StatsPositionColName, Type: types.Int64, PrimaryKey: true, DatabaseSource: dbName}, - &sql.Column{Name: StatsVersionColName, Type: types.Int64, DatabaseSource: dbName}, - &sql.Column{Name: StatsCommitHashColName, Type: types.Text, DatabaseSource: dbName}, &sql.Column{Name: StatsRowCountColName, Type: types.Int64, DatabaseSource: dbName}, &sql.Column{Name: StatsDistinctCountColName, Type: types.Int64, DatabaseSource: dbName}, &sql.Column{Name: StatsNullCountColName, Type: types.Int64, DatabaseSource: dbName}, diff --git a/go/libraries/doltcore/sqle/cluster/initdbhook.go b/go/libraries/doltcore/sqle/cluster/initdbhook.go index e995a4d92f0..0472dee22c1 100644 --- a/go/libraries/doltcore/sqle/cluster/initdbhook.go +++ b/go/libraries/doltcore/sqle/cluster/initdbhook.go @@ -32,9 +32,9 @@ func NewInitDatabaseHook(controller *Controller, bt *sql.BackgroundThreads, orig if controller == nil { return orig } - return func(ctx *sql.Context, pro *sqle.DoltDatabaseProvider, name string, denv *env.DoltEnv) error { + return func(ctx *sql.Context, pro *sqle.DoltDatabaseProvider, name string, denv *env.DoltEnv, db dsess.SqlDatabase) error { var err error - err = orig(ctx, pro, name, denv) + err = orig(ctx, pro, name, denv, db) if err != nil { return err } diff --git a/go/libraries/doltcore/sqle/database.go b/go/libraries/doltcore/sqle/database.go index 353fa5e1109..7a972c27b31 100644 --- a/go/libraries/doltcore/sqle/database.go +++ b/go/libraries/doltcore/sqle/database.go @@ -254,7 +254,7 @@ func (db Database) GetTableInsensitive(ctx *sql.Context, tblName string) (sql.Ta return nil, false, err } - return db.getTableInsensitive(ctx, nil, ds, root, tblName) + return db.getTableInsensitive(ctx, nil, ds, root, tblName, "") } // GetTableInsensitiveAsOf implements sql.VersionedDatabase @@ -271,7 +271,7 @@ func (db Database) GetTableInsensitiveAsOf(ctx *sql.Context, tableName string, a sess := dsess.DSessFromSess(ctx.Session) - table, ok, err := db.getTableInsensitive(ctx, head, sess, root, tableName) + table, ok, err := db.getTableInsensitive(ctx, head, sess, root, tableName, asOf) if err != nil { return nil, false, err } @@ -305,7 +305,7 @@ func (db Database) GetTableInsensitiveAsOf(ctx *sql.Context, tableName string, a } -func (db Database) getTableInsensitive(ctx *sql.Context, head *doltdb.Commit, ds *dsess.DoltSession, root *doltdb.RootValue, tblName string) (sql.Table, bool, error) { +func (db Database) getTableInsensitive(ctx *sql.Context, head *doltdb.Commit, ds *dsess.DoltSession, root *doltdb.RootValue, tblName string, asOf interface{}) (sql.Table, bool, error) { lwrName := strings.ToLower(tblName) // TODO: these tables that cache a root value at construction time should not, they need to get it from the session @@ -365,7 +365,7 @@ func (db Database) getTableInsensitive(ctx 
*sql.Context, head *doltdb.Commit, ds case strings.HasPrefix(lwrName, doltdb.DoltConfTablePrefix): suffix := tblName[len(doltdb.DoltConfTablePrefix):] - srcTable, ok, err := db.getTableInsensitive(ctx, head, ds, root, suffix) + srcTable, ok, err := db.getTableInsensitive(ctx, head, ds, root, suffix, asOf) if err != nil { return nil, false, err } else if !ok { @@ -488,7 +488,7 @@ func (db Database) getTableInsensitive(ctx *sql.Context, head *doltdb.Commit, ds dt, found = dtables.NewDocsTable(ctx, versionableTable), true } case doltdb.StatisticsTableName: - dt, found = dtables.NewStatisticsTable(ctx, db.Name(), db.ddb), true + dt, found = dtables.NewStatisticsTable(ctx, db.Name(), db.ddb, asOf), true } if found { diff --git a/go/libraries/doltcore/sqle/database_provider.go b/go/libraries/doltcore/sqle/database_provider.go index 726767643ed..e553882b8ce 100644 --- a/go/libraries/doltcore/sqle/database_provider.go +++ b/go/libraries/doltcore/sqle/database_provider.go @@ -66,6 +66,10 @@ type DoltDatabaseProvider struct { isStandby *bool } +func (p *DoltDatabaseProvider) DefaultBranch() string { + return p.defaultBranch +} + func (p *DoltDatabaseProvider) WithTableFunctions(fns ...sql.TableFunction) (sql.TableFunctionProvider, error) { funcs := make(map[string]sql.TableFunction) for _, fn := range fns { @@ -454,12 +458,12 @@ func (p *DoltDatabaseProvider) CreateCollatedDatabase(ctx *sql.Context, name str return p.registerNewDatabase(ctx, name, newEnv) } -type InitDatabaseHook func(ctx *sql.Context, pro *DoltDatabaseProvider, name string, env *env.DoltEnv) error +type InitDatabaseHook func(ctx *sql.Context, pro *DoltDatabaseProvider, name string, env *env.DoltEnv, db dsess.SqlDatabase) error type DropDatabaseHook func(name string) // ConfigureReplicationDatabaseHook sets up replication for a newly created database as necessary // TODO: consider the replication heads / all heads setting -func ConfigureReplicationDatabaseHook(ctx *sql.Context, p *DoltDatabaseProvider, name string, newEnv *env.DoltEnv) error { +func ConfigureReplicationDatabaseHook(ctx *sql.Context, p *DoltDatabaseProvider, name string, newEnv *env.DoltEnv, _ dsess.SqlDatabase) error { _, replicationRemoteName, _ := sql.SystemVariables.GetGlobal(dsess.ReplicateToRemote) if replicationRemoteName == "" { return nil @@ -616,6 +620,10 @@ func (p *DoltDatabaseProvider) DropDatabase(ctx *sql.Context, name string) error if err != nil { return err } + err = dbfactory.DeleteFromSingletonCache(filepath.ToSlash(dropDbLoc + "/.dolt/stats/.dolt/noms")) + if err != nil { + return err + } err = p.droppedDatabaseManager.DropDatabase(ctx, name, dropDbLoc) if err != nil { @@ -702,7 +710,7 @@ func (p *DoltDatabaseProvider) registerNewDatabase(ctx *sql.Context, name string // If we have an initialization hook, invoke it. By default, this will // be ConfigureReplicationDatabaseHook, which will setup replication // for the new database if a remote url template is set. 
- err = p.InitDatabaseHook(ctx, p, name, newEnv) + err = p.InitDatabaseHook(ctx, p, name, newEnv, db) if err != nil { return err } diff --git a/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go b/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go index 95e849c3cdc..6d72e28c054 100644 --- a/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go +++ b/go/libraries/doltcore/sqle/dprocedures/stats_funcs.go @@ -49,7 +49,7 @@ func statsFunc(fn func(ctx *sql.Context) (interface{}, error)) func(ctx *sql.Con type AutoRefreshStatsProvider interface { sql.StatsProvider CancelRefreshThread(string) - StartRefreshThread(*sql.Context, dsess.DoltDatabaseProvider, string, *env.DoltEnv) error + StartRefreshThread(*sql.Context, dsess.DoltDatabaseProvider, string, *env.DoltEnv, dsess.SqlDatabase) error ThreadStatus(string) string } @@ -68,9 +68,14 @@ func statsRestart(ctx *sql.Context) (interface{}, error) { dEnv := env.Load(ctx, env.GetCurrentUserHomeDir, newFs, pro.DbFactoryUrl(), "TODO") + sqlDb, ok := pro.BaseDatabase(ctx, dbName) + if !ok { + return nil, fmt.Errorf("failed to restart stats collection: database not found: %s", dbName) + } + afp.CancelRefreshThread(dbName) - err = afp.StartRefreshThread(ctx, pro, dbName, dEnv) + err = afp.StartRefreshThread(ctx, pro, dbName, dEnv, sqlDb) if err != nil { return nil, fmt.Errorf("failed to restart collection: %w", err) } diff --git a/go/libraries/doltcore/sqle/dsess/variables.go b/go/libraries/doltcore/sqle/dsess/variables.go index c1d90240b83..74e6db00b1c 100644 --- a/go/libraries/doltcore/sqle/dsess/variables.go +++ b/go/libraries/doltcore/sqle/dsess/variables.go @@ -63,6 +63,7 @@ const ( DoltStatsAutoRefreshThreshold = "dolt_stats_auto_refresh_threshold" DoltStatsAutoRefreshInterval = "dolt_stats_auto_refresh_interval" DoltStatsMemoryOnly = "dolt_stats_memory_only" + DoltStatsBranches = "dolt_stats_branches" ) const URLTemplateDatabasePlaceholder = "{database}" diff --git a/go/libraries/doltcore/sqle/dtables/statistics_table.go b/go/libraries/doltcore/sqle/dtables/statistics_table.go index dbe399baa61..9b8524043d0 100644 --- a/go/libraries/doltcore/sqle/dtables/statistics_table.go +++ b/go/libraries/doltcore/sqle/dtables/statistics_table.go @@ -15,27 +15,21 @@ package dtables import ( - "errors" "fmt" - "strings" - "time" "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/planbuilder" - stats2 "github.com/dolthub/go-mysql-server/sql/stats" + "github.com/dolthub/go-mysql-server/sql/stats" "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" "github.com/dolthub/dolt/go/libraries/doltcore/schema" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/index" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/prolly/tree" - "github.com/dolthub/dolt/go/store/val" ) // StatisticsTable is a sql.Table implementation that implements a system table which shows the dolt commit log type StatisticsTable struct { dbName string + branch string ddb *doltdb.DoltDB } @@ -43,8 +37,12 @@ var _ sql.Table = (*StatisticsTable)(nil) var _ sql.StatisticsTable = (*StatisticsTable)(nil) // NewStatisticsTable creates a StatisticsTable -func NewStatisticsTable(_ *sql.Context, dbName string, ddb *doltdb.DoltDB) sql.Table { - return &StatisticsTable{dbName: dbName, ddb: ddb} +func NewStatisticsTable(_ *sql.Context, dbName string, ddb *doltdb.DoltDB, asOf interface{}) sql.Table { + ret := &StatisticsTable{dbName: dbName, 
ddb: ddb}
+	if branch, ok := asOf.(string); ok {
+		ret.branch = branch
+	}
+	return ret
 }
 
 // DataLength implements sql.StatisticsTable
@@ -57,16 +55,37 @@ func (st *StatisticsTable) DataLength(ctx *sql.Context) (uint64, error) {
 	return numBytesPerRow * numRows, nil
 }
 
+type BranchStatsProvider interface {
+	GetTableDoltStats(ctx *sql.Context, branch, db, table string) ([]sql.Statistic, error)
+}
+
 // RowCount implements sql.StatisticsTable
 func (st *StatisticsTable) RowCount(ctx *sql.Context) (uint64, bool, error) {
-	statsMap, err := st.ddb.GetStatistics(ctx)
+	dSess := dsess.DSessFromSess(ctx.Session)
+	prov := dSess.Provider()
+
+	sqlDb, err := prov.Database(ctx, st.dbName)
 	if err != nil {
 		return 0, false, err
 	}
-	cnt, err := statsMap.Count()
+
+	tables, err := sqlDb.GetTableNames(ctx)
 	if err != nil {
 		return 0, false, err
 	}
+
+	var cnt int
+	for _, table := range tables {
+		// only Dolt-specific provider has branch support
+		dbStats, err := dSess.StatsProvider().(BranchStatsProvider).GetTableDoltStats(ctx, st.branch, st.dbName, table)
+		if err != nil {
+			return 0, false, err
+		}
+		for _, dbStat := range dbStats {
+			cnt += len(dbStat.Histogram())
+		}
+	}
+
 	return uint64(cnt), true, nil
 }
 
@@ -99,157 +118,38 @@ func (st *StatisticsTable) Partitions(*sql.Context) (sql.PartitionIter, error) {
 
 // PartitionRows is a sql.Table interface function that gets a row iterator for a partition
 func (st *StatisticsTable) PartitionRows(ctx *sql.Context, _ sql.Partition) (sql.RowIter, error) {
-	statsMap, err := st.ddb.GetStatistics(ctx)
-	if err != nil {
-		return nil, err
+	dSess := dsess.DSessFromSess(ctx.Session)
+	prov := dSess.Provider()
+
+	var sqlDb sql.Database
+	var err error
+	if st.branch != "" {
+		sqlDb, err = prov.Database(ctx, fmt.Sprintf("%s/%s", st.dbName, st.branch))
+	} else {
+		sqlDb, err = prov.Database(ctx, st.dbName)
 	}
-	return NewStatsIter(ctx, statsMap)
-}
-
-// PreciseMatch implements sql.IndexAddressable
-func (st *StatisticsTable) PreciseMatch() bool {
-	return true
-}
-
-var ErrIncompatibleVersion = errors.New("client stats version mismatch")
-
-func NewStatsIter(ctx *sql.Context, m prolly.Map) (*statsIter, error) {
-	iter, err := m.IterAll(ctx)
 	if err != nil {
 		return nil, err
 	}
-	kd, vd := m.Descriptors()
-	keyBuilder := val.NewTupleBuilder(kd)
-	valueBuilder := val.NewTupleBuilder(vd)
-	ns := m.NodeStore()
-
-	return &statsIter{
-		iter:  iter,
-		kb:    keyBuilder,
-		vb:    valueBuilder,
-		ns:    ns,
-		planb: planbuilder.New(ctx, nil),
-	}, nil
-}
-
-// statsIter reads histogram buckets into string-compatible types.
-// Values that are SQL rows should be converted with statsIter.ParseRow.
-// todo: make a JSON compatible container for sql.Row w/ types so that we
-// can eagerly convert to sql.Row without sacrificing string printing.
-type statsIter struct { - iter prolly.MapIter - kb, vb *val.TupleBuilder - ns tree.NodeStore - planb *planbuilder.Builder - currentQual string - currentTypes []sql.Type -} - -var _ sql.RowIter = (*statsIter)(nil) -func (s *statsIter) Next(ctx *sql.Context) (sql.Row, error) { - k, v, err := s.iter.Next(ctx) + tables, err := sqlDb.GetTableNames(ctx) if err != nil { return nil, err } - // deserialize K, V - version, err := tree.GetField(ctx, s.vb.Desc, 0, v, s.ns) - if err != nil { - return nil, err - } - if version != schema.StatsVersion { - return nil, fmt.Errorf("%w: write version %d does not match read version %d", ErrIncompatibleVersion, version, schema.StatsVersion) - } - - var row sql.Row - for i := 0; i < s.kb.Desc.Count(); i++ { - f, err := tree.GetField(ctx, s.kb.Desc, i, k, s.ns) - if err != nil { - return nil, err - } - row = append(row, f) - } - - for i := 0; i < s.vb.Desc.Count(); i++ { - f, err := tree.GetField(ctx, s.vb.Desc, i, v, s.ns) - if err != nil { - return nil, err - } - row = append(row, f) - } - - dbName := row[schema.StatsDbTag].(string) - tableName := row[schema.StatsTableTag].(string) - indexName := row[schema.StatsIndexTag].(string) - position := row[schema.StatsPositionTag].(int64) - _ = row[schema.StatsVersionTag] - commit := hash.Parse(row[schema.StatsCommitHashTag].(string)) - rowCount := row[schema.StatsRowCountTag].(int64) - distinctCount := row[schema.StatsDistinctCountTag].(int64) - nullCount := row[schema.StatsNullCountTag].(int64) - columnsStr := row[schema.StatsColumnsTag].(string) - typesStr := row[schema.StatsTypesTag].(string) - upperBoundStr := row[schema.StatsUpperBoundTag].(string) - upperBoundCnt := row[schema.StatsUpperBoundCntTag].(int64) - createdAt := row[schema.StatsCreatedAtTag].(time.Time) - - typs := strings.Split(typesStr, ",") - for i, t := range typs { - typs[i] = strings.TrimSpace(t) - } - - qual := sql.NewStatQualifier(dbName, tableName, indexName) - if curQual := qual.String(); !strings.EqualFold(curQual, s.currentQual) { - s.currentQual = curQual - s.currentTypes, err = stats2.ParseTypeStrings(typs) + statsPro := dSess.StatsProvider().(BranchStatsProvider) + var dStats []sql.Statistic + for _, table := range tables { + dbStats, err := statsPro.GetTableDoltStats(ctx, st.branch, st.dbName, table) if err != nil { return nil, err } + dStats = append(dStats, dbStats...) } - - mcvCountsStr := row[schema.StatsMcvCountsTag].(string) - - numMcvs := schema.StatsMcvCountsTag - schema.StatsMcv1Tag - mcvs := make([]string, numMcvs) - for i, v := range row[schema.StatsMcv1Tag:schema.StatsMcvCountsTag] { - if v != nil { - mcvs[i] = v.(string) - } - } - - return sql.Row{ - dbName, - tableName, - indexName, - int(position), - version, - commit.String(), - uint64(rowCount), - uint64(distinctCount), - uint64(nullCount), - columnsStr, - typesStr, - upperBoundStr, - uint64(upperBoundCnt), - createdAt, - mcvs[0], mcvs[1], mcvs[2], mcvs[3], - mcvCountsStr, - }, nil -} - -func (s *statsIter) ParseRow(rowStr string) (sql.Row, error) { - var row sql.Row - for i, v := range strings.Split(rowStr, ",") { - val, _, err := s.currentTypes[i].Convert(v) - if err != nil { - return nil, err - } - row = append(row, val) - } - return row, nil + return stats.NewStatsIter(ctx, dStats...) 
} -func (s *statsIter) Close(context *sql.Context) error { - return nil +// PreciseMatch implements sql.IndexAddressable +func (st *StatisticsTable) PreciseMatch() bool { + return true } diff --git a/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go b/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go index a768423a05a..2d07b4b36f8 100644 --- a/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go +++ b/go/libraries/doltcore/sqle/enginetest/dolt_engine_test.go @@ -41,7 +41,7 @@ import ( "github.com/dolthub/dolt/go/libraries/doltcore/schema" "github.com/dolthub/dolt/go/libraries/doltcore/sqle" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/stats" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" "github.com/dolthub/dolt/go/libraries/utils/config" "github.com/dolthub/dolt/go/store/datas" "github.com/dolthub/dolt/go/store/types" @@ -460,6 +460,7 @@ func TestQueryPlans(t *testing.T) { // Parallelism introduces Exchange nodes into the query plans, so disable. // TODO: exchange nodes should really only be part of the explain plan under certain debug settings harness := newDoltHarness(t).WithSkippedQueries(skipped) + harness.configureStats = true if !types.IsFormat_DOLT(types.Format_Default) { // only new format supports reverse IndexTableAccess reverseIndexSkip := []string{ @@ -485,7 +486,7 @@ func TestQueryPlans(t *testing.T) { func TestIntegrationQueryPlans(t *testing.T) { harness := newDoltHarness(t) - + harness.configureStats = true defer harness.Close() enginetest.TestIntegrationPlans(t, harness) } @@ -837,6 +838,7 @@ func TestJoinPlanning(t *testing.T) { t.Skip("DOLT_LD keyless indexes are not sorted") } h := newDoltHarness(t) + h.configureStats = true defer h.Close() enginetest.TestJoinPlanning(t, h) } @@ -884,6 +886,7 @@ func TestJSONTableScriptsPrepared(t *testing.T) { func TestUserPrivileges(t *testing.T) { h := newDoltHarness(t) h.setupTestProcedures = true + h.configureStats = true defer h.Close() enginetest.TestUserPrivileges(t, h) } @@ -2139,11 +2142,28 @@ func TestColumnDiffSystemTablePrepared(t *testing.T) { } } +func TestStatBranchTests(t *testing.T) { + harness := newDoltHarness(t) + defer harness.Close() + harness.Setup(setup.MydbData) + harness.configureStats = true + for _, test := range StatBranchTests { + t.Run(test.Name, func(t *testing.T) { + // reset engine so provider statistics are clean + harness.engine = nil + e := mustNewEngine(t, harness) + defer e.Close() + enginetest.TestScriptWithEngine(t, e, harness, test) + }) + } +} + func TestStatsFunctions(t *testing.T) { harness := newDoltHarness(t) defer harness.Close() harness.Setup(setup.MydbData) harness.configureStats = true + harness.skipSetupCommit = true for _, test := range StatProcTests { t.Run(test.Name, func(t *testing.T) { // reset engine so provider statistics are clean @@ -2607,6 +2627,7 @@ func TestQueriesPrepared(t *testing.T) { func TestStatsHistograms(t *testing.T) { h := newDoltHarness(t) defer h.Close() + h.configureStats = true for _, script := range DoltHistogramTests { h.engine = nil enginetest.TestScript(t, h, script) @@ -2617,6 +2638,7 @@ func TestStatsHistograms(t *testing.T) { // forces a round trip of the statistics table before inspecting values. func TestStatsIO(t *testing.T) { h := newDoltHarness(t) + h.configureStats = true defer h.Close() for _, script := range append(DoltStatsIOTests, DoltHistogramTests...) 
{ h.engine = nil @@ -2637,6 +2659,7 @@ func TestJoinStats(t *testing.T) { // smallest table first vs smallest join first h := newDoltHarness(t) defer h.Close() + h.configureStats = true enginetest.TestJoinStats(t, h) } @@ -2657,6 +2680,7 @@ func TestSpatialQueriesPrepared(t *testing.T) { func TestPreparedStatistics(t *testing.T) { h := newDoltHarness(t) defer h.Close() + h.configureStats = true for _, script := range DoltHistogramTests { h.engine = nil enginetest.TestScriptPrepared(t, h, script) @@ -3122,7 +3146,7 @@ func TestCreateDatabaseErrorCleansUp(t *testing.T) { require.NoError(t, err) require.NotNil(t, e) - dh.provider.(*sqle.DoltDatabaseProvider).InitDatabaseHook = func(_ *sql.Context, _ *sqle.DoltDatabaseProvider, name string, _ *env.DoltEnv) error { + dh.provider.(*sqle.DoltDatabaseProvider).InitDatabaseHook = func(_ *sql.Context, _ *sqle.DoltDatabaseProvider, name string, _ *env.DoltEnv, _ dsess.SqlDatabase) error { if name == "cannot_create" { return fmt.Errorf("there was an error initializing this database. abort!") } @@ -3164,7 +3188,8 @@ func TestStatsAutoRefreshConcurrency(t *testing.T) { intervalSec := time.Duration(0) thresholdf64 := 0. bThreads := sql.NewBackgroundThreads() - statsProv := engine.EngineAnalyzer().Catalog.StatsProvider.(*stats.Provider) + branches := []string{"main"} + statsProv := engine.EngineAnalyzer().Catalog.StatsProvider.(*statspro.Provider) // it is important to use new sessions for this test, to avoid working root conflicts readCtx := enginetest.NewSession(harness) @@ -3173,7 +3198,7 @@ func TestStatsAutoRefreshConcurrency(t *testing.T) { return enginetest.NewSession(harness), nil } - err := statsProv.InitAutoRefresh(newCtx, sqlDb.Name(), bThreads, intervalSec, thresholdf64) + err := statsProv.InitAutoRefreshWithParams(newCtx, sqlDb.Name(), bThreads, intervalSec, thresholdf64, branches) require.NoError(t, err) execQ := func(ctx *sql.Context, q string, id int, tag string) { diff --git a/go/libraries/doltcore/sqle/enginetest/dolt_harness.go b/go/libraries/doltcore/sqle/enginetest/dolt_harness.go index d2ed8b22501..58976c60fcd 100644 --- a/go/libraries/doltcore/sqle/enginetest/dolt_harness.go +++ b/go/libraries/doltcore/sqle/enginetest/dolt_harness.go @@ -35,7 +35,8 @@ import ( "github.com/dolthub/dolt/go/libraries/doltcore/env" "github.com/dolthub/dolt/go/libraries/doltcore/sqle" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/stats" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statsnoms" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" "github.com/dolthub/dolt/go/libraries/utils/filesys" "github.com/dolthub/dolt/go/store/types" ) @@ -181,8 +182,8 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { require.True(t, ok) d.provider = doltProvider - statsPro := stats.NewProvider() - d.statsPro = statsPro + statsProv := statspro.NewProvider(d.provider.(*sqle.DoltDatabaseProvider), statsnoms.NewNomsStatsFactory(d.multiRepoEnv.RemoteDialProvider())) + d.statsPro = statsProv var err error d.session, err = dsess.NewDoltSession(enginetest.NewBaseSession(), d.provider, d.multiRepoEnv.Config(), d.branchControl, d.statsPro) @@ -220,9 +221,14 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { for i, dbName := range dbs { dsessDbs[i], _ = dbCache.GetCachedRevisionDb(fmt.Sprintf("%s/main", dbName), dbName) } - if err = statsPro.Configure(ctx, func(context.Context) (*sql.Context, error) { return d.NewSession(), nil 
}, bThreads, doltProvider, dsessDbs); err != nil { + + ctxFact := func(context.Context) (*sql.Context, error) { return d.NewContext(), nil } + if err = statsProv.Configure(ctx, ctxFact, bThreads, dsessDbs); err != nil { return nil, err } + + statsOnlyQueries := filterStatsOnlyQueries(d.setupData) + e, err = enginetest.RunSetupScripts(ctx, e, statsOnlyQueries, d.SupportsNativeIndexCreation()) } return e, nil @@ -232,7 +238,7 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { d.engine.Analyzer.Catalog.MySQLDb = mysql_db.CreateEmptyMySQLDb() d.engine.Analyzer.Catalog.MySQLDb.AddRootAccount() - d.engine.Analyzer.Catalog.StatsProvider = stats.NewProvider() + d.engine.Analyzer.Catalog.StatsProvider = statspro.NewProvider(d.provider.(*sqle.DoltDatabaseProvider), statsnoms.NewNomsStatsFactory(d.multiRepoEnv.RemoteDialProvider())) // Get a fresh session if we are reusing the engine if !initializeEngine { @@ -247,6 +253,18 @@ func (d *DoltHarness) NewEngine(t *testing.T) (enginetest.QueryEngine, error) { return e, err } +func filterStatsOnlyQueries(scripts []setup.SetupScript) []setup.SetupScript { + var ret []string + for i := range scripts { + for _, s := range scripts[i] { + if strings.HasPrefix(s, "analyze table") { + ret = append(ret, s) + } + } + } + return []setup.SetupScript{ret} +} + // WithParallelism returns a copy of the harness with parallelism set to the given number of threads. A value of 0 or // less means to use the system parallelism settings. func (d *DoltHarness) WithParallelism(parallelism int) *DoltHarness { @@ -329,7 +347,6 @@ func (d *DoltHarness) NewDatabases(names ...string) []sql.Database { d.closeProvider() d.engine = nil d.provider = nil - d.statsPro = stats.NewProvider() d.branchControl = branch_control.CreateDefaultController(context.Background()) @@ -337,6 +354,7 @@ func (d *DoltHarness) NewDatabases(names ...string) []sql.Database { doltProvider, ok := pro.(*sqle.DoltDatabaseProvider) require.True(d.t, ok) d.provider = doltProvider + d.statsPro = statspro.NewProvider(doltProvider, statsnoms.NewNomsStatsFactory(d.multiRepoEnv.RemoteDialProvider())) var err error d.session, err = dsess.NewDoltSession(enginetest.NewBaseSession(), doltProvider, d.multiRepoEnv.Config(), d.branchControl, d.statsPro) diff --git a/go/libraries/doltcore/sqle/enginetest/privilege_test.go b/go/libraries/doltcore/sqle/enginetest/privilege_test.go index 716c79ecc90..8477288950b 100755 --- a/go/libraries/doltcore/sqle/enginetest/privilege_test.go +++ b/go/libraries/doltcore/sqle/enginetest/privilege_test.go @@ -47,8 +47,6 @@ var revisionDatabasePrivilegeScriptNames = []string{ "Anonymous User", "IPv4 Loopback == localhost", "information_schema.columns table 'privileges' column gets correct values", - "information_schema.column_statistics shows columns with privileges only", - "information_schema.statistics shows tables with privileges only", "basic tests on information_schema.SCHEMA_PRIVILEGES table", "basic tests on information_schema.TABLE_PRIVILEGES table", } @@ -74,6 +72,7 @@ func TestRevisionDatabasePrivileges(t *testing.T) { for _, script := range scripts { harness := newDoltHarness(t) + harness.configureStats = true harness.Setup(setup.MydbData, setup.MytableData) t.Run(script.Name, func(t *testing.T) { engine := mustNewEngine(t, harness) diff --git a/go/libraries/doltcore/sqle/enginetest/stats_queries.go b/go/libraries/doltcore/sqle/enginetest/stats_queries.go index f82ec6ea4c2..c1240d9fa02 100644 --- a/go/libraries/doltcore/sqle/enginetest/stats_queries.go 
+++ b/go/libraries/doltcore/sqle/enginetest/stats_queries.go @@ -26,10 +26,8 @@ import ( "github.com/dolthub/go-mysql-server/sql/types" "github.com/stretchr/testify/require" - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/stats" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" ) // fillerVarchar pushes the tree into level 3 @@ -311,10 +309,10 @@ var DoltStatsIOTests = []queries.ScriptTest{ }, Assertions: []queries.ScriptTestAssertion{ { - Query: "select database_name, table_name, index_name, commit_hash, columns, types from dolt_statistics", + Query: "select database_name, table_name, index_name, columns, types from dolt_statistics", Expected: []sql.Row{ - {"mydb", "xy", "primary", "f6la1u3ku5pucfctgrca2afq9vlr4nrs", "x", "bigint"}, - {"mydb", "xy", "yz", "9ec31007jaqtahij0tmlmd7j9t9hl1he", "y,z", "int,varchar(500)"}, + {"mydb", "xy", "primary", "x", "bigint"}, + {"mydb", "xy", "yz", "y,z", "int,varchar(500)"}, }, }, { @@ -349,10 +347,10 @@ var DoltStatsIOTests = []queries.ScriptTest{ }, Assertions: []queries.ScriptTestAssertion{ { - Query: "select database_name, table_name, index_name, commit_hash, columns, types from dolt_statistics where table_name = 'xy'", + Query: "select database_name, table_name, index_name, columns, types from dolt_statistics where table_name = 'xy'", Expected: []sql.Row{ - {"mydb", "xy", "primary", "f6la1u3ku5pucfctgrca2afq9vlr4nrs", "x", "bigint"}, - {"mydb", "xy", "yz", "9ec31007jaqtahij0tmlmd7j9t9hl1he", "y,z", "int,varchar(500)"}, + {"mydb", "xy", "primary", "x", "bigint"}, + {"mydb", "xy", "yz", "y,z", "int,varchar(500)"}, }, }, { @@ -369,10 +367,10 @@ var DoltStatsIOTests = []queries.ScriptTest{ }, }, { - Query: "select database_name, table_name, index_name, commit_hash, columns, types from dolt_statistics where table_name = 'ab'", + Query: "select database_name, table_name, index_name, columns, types from dolt_statistics where table_name = 'ab'", Expected: []sql.Row{ - {"mydb", "ab", "primary", "t6j206v6b9t8vnmhpcc2i57lom8kejk3", "a", "bigint"}, - {"mydb", "ab", "bc", "sibnr73868rb5dqa76opfn4pkelhhqna", "b,c", "int,int"}, + {"mydb", "ab", "primary", "a", "bigint"}, + {"mydb", "ab", "bc", "b,c", "int,int"}, }, }, { @@ -401,6 +399,158 @@ var DoltStatsIOTests = []queries.ScriptTest{ }, }, }, + { + Name: "incremental stats deletes manual analyze", + SetUpScript: []string{ + "CREATE table xy (x bigint primary key, y int, z varchar(500), key(y,z));", + "insert into xy select x, 1, 1 from (with recursive inputs(x) as (select 4 union select x+1 from inputs where x < 1000) select * from inputs) dt;", + "analyze table xy", + }, + Assertions: []queries.ScriptTestAssertion{ + { + Query: "select count(*) as cnt from dolt_statistics group by table_name, index_name order by cnt", + Expected: []sql.Row{{6}, {7}}, + }, + { + Query: "delete from xy where x > 500", + }, + { + Query: "analyze table xy", + }, + { + Query: "select count(*) from dolt_statistics group by table_name, index_name", + Expected: []sql.Row{{4}, {4}}, + }, + }, + }, + { + Name: "incremental stats deletes auto", + SetUpScript: []string{ + "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", + "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", + "CREATE table xy (x bigint primary key, y int, z varchar(500), key(y,z));", + "insert into xy select x, 1, 1 from (with recursive inputs(x) as (select 4 union select x+1 
from inputs where x < 1000) select * from inputs) dt;", + "analyze table xy", + }, + Assertions: []queries.ScriptTestAssertion{ + { + Query: "select count(*) as cnt from dolt_statistics group by table_name, index_name order by cnt", + Expected: []sql.Row{{6}, {7}}, + }, + { + Query: "delete from xy where x > 500", + }, + { + Query: "call dolt_stats_restart()", + }, + { + Query: "select sleep(.1)", + }, + { + Query: "select count(*) from dolt_statistics group by table_name, index_name", + Expected: []sql.Row{{4}, {4}}, + }, + }, + }, +} + +var StatBranchTests = []queries.ScriptTest{ + { + Name: "multi branch stats", + SetUpScript: []string{ + "set @@PERSIST.dolt_stats_auto_refresh_interval = 0;", + "set @@PERSIST.dolt_stats_auto_refresh_threshold = 0;", + "set @@PERSIST.dolt_stats_branches = 'main,feat';", + "CREATE table xy (x bigint primary key, y int, z varchar(500), key(y,z));", + "insert into xy values (0,0,'a'), (1,0,'a'), (2,0,'a'), (3,0,'a'), (4,1,'a'), (5,2,'a')", + "call dolt_commit('-Am', 'xy')", + "call dolt_checkout('-b','feat')", + "CREATE table ab (a bigint primary key, b int, c int, key(b,c));", + "insert into ab values (0,0,1), (1,0,1), (2,0,1), (3,0,1), (4,1,1), (5,2,1)", + "call dolt_commit('-Am', 'ab')", + "call dolt_checkout('main')", + }, + Assertions: []queries.ScriptTestAssertion{ + { + Query: "call dolt_stats_restart()", + }, + { + Query: "select sleep(.1)", + }, + { + Query: "select table_name, index_name, row_count from dolt_statistics", + Expected: []sql.Row{ + {"xy", "primary", uint64(6)}, + {"xy", "yz", uint64(6)}, + }, + }, + { + Query: "select table_name, index_name, row_count from dolt_statistics as of 'feat'", + Expected: []sql.Row{ + {"ab", "primary", uint64(6)}, + {"ab", "bc", uint64(6)}, + {"xy", "primary", uint64(6)}, + {"xy", "yz", uint64(6)}, + }, + }, + { + Query: "select table_name, index_name, row_count from dolt_statistics as of 'main'", + Expected: []sql.Row{ + {"xy", "primary", uint64(6)}, + {"xy", "yz", uint64(6)}, + }, + }, + { + Query: "call dolt_checkout('feat')", + }, + { + Query: "insert into xy values ('6',3,'a')", + }, + { + Query: "call dolt_commit('-am', 'cm')", + }, + { + Query: "select sleep(.1)", + }, + { + Query: "select table_name, index_name, row_count from dolt_statistics as of 'feat'", + Expected: []sql.Row{ + {"ab", "primary", uint64(6)}, + {"ab", "bc", uint64(6)}, + {"xy", "primary", uint64(7)}, + {"xy", "yz", uint64(7)}, + }, + }, + { + Query: "select table_name, index_name, row_count from dolt_statistics as of 'main'", + Expected: []sql.Row{ + {"xy", "primary", uint64(6)}, + {"xy", "yz", uint64(6)}, + }, + }, + { + Query: "call dolt_checkout('feat')", + }, + { + Query: "call dolt_stats_stop()", + }, + { + Query: "call dolt_stats_drop()", + }, + { + Query: "select table_name, index_name, row_count from dolt_statistics as of 'feat'", + Expected: []sql.Row{}, + }, + { + // we dropped 'feat', not 'main' + Query: "select table_name, index_name, row_count from dolt_statistics as of 'main'", + Expected: []sql.Row{ + {"xy", "primary", uint64(6)}, + {"xy", "yz", uint64(6)}, + }, + }, + }, + }, } var StatProcTests = []queries.ScriptTest{ @@ -475,8 +625,8 @@ var StatProcTests = []queries.ScriptTest{ }, Assertions: []queries.ScriptTestAssertion{ { - Query: "select count(*) from dolt_statistics", - ExpectedErrStr: doltdb.ErrNoStatistics.Error(), + Query: "select count(*) from dolt_statistics", + Expected: []sql.Row{{0}}, }, { Query: "call dolt_stats_status()", @@ -484,7 +634,7 @@ var StatProcTests = []queries.ScriptTest{ }, // set 
refresh interval arbitrarily high to avoid updating when we restart { - Query: "set @@PERSIST.dolt_stats_auto_refresh_interval = 1000;", + Query: "set @@PERSIST.dolt_stats_auto_refresh_interval = 100000;", Expected: []sql.Row{{}}, }, { @@ -511,7 +661,7 @@ var StatProcTests = []queries.ScriptTest{ }, { Query: "call dolt_stats_status()", - Expected: []sql.Row{{"updated to hash: vogi4fq0fe8n8rqa80pbsujlmmaljsoo"}}, + Expected: []sql.Row{{"refreshed mydb"}}, }, { Query: "select count(*) from dolt_statistics", @@ -543,7 +693,7 @@ var StatProcTests = []queries.ScriptTest{ }, { Query: "call dolt_stats_status()", - Expected: []sql.Row{{"updated to hash: fhnmdo8psvs10od36pqfi0g4cvvu732h"}}, + Expected: []sql.Row{{"refreshed mydb"}}, }, { Query: "select count(*) from dolt_statistics", @@ -558,8 +708,8 @@ var StatProcTests = []queries.ScriptTest{ Expected: []sql.Row{{"dropped"}}, }, { - Query: "select count(*) from dolt_statistics", - ExpectedErrStr: doltdb.ErrNoStatistics.Error(), + Query: "select count(*) from dolt_statistics", + Expected: []sql.Row{{0}}, }, }, }, @@ -601,16 +751,11 @@ func TestProviderReloadScriptWithEngine(t *testing.T, e enginetest.QueryEngine, t.Errorf("expected *gms.Engine but found: %T", e) } - dbProv, ok := eng.Analyzer.Catalog.DbProvider.(*sqle.DoltDatabaseProvider) - if !ok { - t.Errorf("expected *sqle.DoltDatabaseProvider but found: %T", eng.Analyzer.Catalog.DbProvider) - } - - newProv := stats.NewProvider() - err := newProv.Load(ctx, dbProv.DoltDatabases()) + err := eng.Analyzer.Catalog.StatsProvider.DropDbStats(ctx, "mydb", false) require.NoError(t, err) - eng.Analyzer.Catalog.StatsProvider = newProv + err = eng.Analyzer.Catalog.StatsProvider.(*statspro.Provider).LoadStats(ctx, "mydb", "main") + require.NoError(t, err) } for _, assertion := range assertions { diff --git a/go/libraries/doltcore/sqle/logictest/dolt/doltharness.go b/go/libraries/doltcore/sqle/logictest/dolt/doltharness.go index a01eaf06f8c..61af0bfc001 100644 --- a/go/libraries/doltcore/sqle/logictest/dolt/doltharness.go +++ b/go/libraries/doltcore/sqle/logictest/dolt/doltharness.go @@ -33,7 +33,8 @@ import ( "github.com/dolthub/dolt/go/libraries/doltcore/env" dsql "github.com/dolthub/dolt/go/libraries/doltcore/sqle" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/stats" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statsnoms" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" "github.com/dolthub/dolt/go/libraries/doltcore/table/editor" "github.com/dolthub/dolt/go/libraries/utils/filesys" "github.com/dolthub/dolt/go/store/types" @@ -142,7 +143,7 @@ func innerInit(h *DoltHarness, dEnv *env.DoltEnv) error { return err } - ctx := dsql.NewTestSQLCtxWithProvider(context.Background(), pro, stats.NewProvider()) + ctx := dsql.NewTestSQLCtxWithProvider(context.Background(), pro, statspro.NewProvider(pro.(*dsql.DoltDatabaseProvider), statsnoms.NewNomsStatsFactory(env.NewGRPCDialProviderFromDoltEnv(dEnv)))) h.sess = ctx.Session.(*dsess.DoltSession) dbs := h.engine.Analyzer.Catalog.AllDatabases(ctx) diff --git a/go/libraries/doltcore/sqle/stats/auto_refresh.go b/go/libraries/doltcore/sqle/stats/auto_refresh.go deleted file mode 100644 index 2dcccc7ce00..00000000000 --- a/go/libraries/doltcore/sqle/stats/auto_refresh.go +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright 2024 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package stats - -import ( - "context" - "fmt" - "strings" - "time" - - "github.com/dolthub/go-mysql-server/sql" - types2 "github.com/dolthub/go-mysql-server/sql/types" - - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/prolly/tree" -) - -const asyncAutoRefreshStats = "async_auto_refresh_stats" - -func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Context) (*sql.Context, error), bThreads *sql.BackgroundThreads, pro *sqle.DoltDatabaseProvider, dbs []dsess.SqlDatabase) error { - p.SetStarter(NewInitDatabaseHook(p, ctxFactory, bThreads, nil)) - - if _, disabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsMemoryOnly); disabled == int8(1) { - return nil - } - - loadCtx, err := ctxFactory(ctx) - if err != nil { - return err - } - if err := p.Load(loadCtx, dbs); err != nil { - return err - } - if _, enabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshEnabled); enabled == int8(1) { - _, threshold, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshThreshold) - _, interval, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshInterval) - interval64, _, _ := types2.Int64.Convert(interval) - intervalSec := time.Second * time.Duration(interval64.(int64)) - thresholdf64 := threshold.(float64) - - for _, db := range dbs { - if err := p.InitAutoRefresh(ctxFactory, db.Name(), bThreads, intervalSec, thresholdf64); err != nil { - return err - } - } - pro.InitDatabaseHook = NewInitDatabaseHook(p, ctxFactory, bThreads, pro.InitDatabaseHook) - pro.DropDatabaseHook = NewDropDatabaseHook(p, ctxFactory, pro.DropDatabaseHook) - } - return nil -} - -func (p *Provider) InitAutoRefresh(ctxFactory func(ctx context.Context) (*sql.Context, error), dbName string, bThreads *sql.BackgroundThreads, checkInterval time.Duration, updateThresh float64) error { - // this is only called after initial statistics are finished loading - // launch a thread that periodically checks freshness - - // retain handle to cancel on drop database - // todo: add Cancel(name) to sql.BackgroundThreads interface - p.mu.Lock() - defer p.mu.Unlock() - - dropDbCtx, dbStatsCancel := context.WithCancel(context.Background()) - p.cancelers[dbName] = dbStatsCancel - - return bThreads.Add(fmt.Sprintf("%s_%s", asyncAutoRefreshStats, dbName), func(ctx context.Context) { - timer := time.NewTimer(checkInterval) - for { - // wake up checker on interval - select { - case <-ctx.Done(): - timer.Stop() - return - case <-dropDbCtx.Done(): - timer.Stop() - return - case <-timer.C: - sqlCtx, err := ctxFactory(ctx) - if err != nil { - return - } - - sqlCtx.GetLogger().Debugf("starting statistics refresh check for '%s': %s", dbName, time.Now().String()) - timer.Reset(checkInterval) - - // Iterate all dbs, tables, 
indexes. Each db will collect - // []indexMeta above refresh threshold. We read and process those - // chunks' statistics. We merge updated chunks with precomputed - // chunks. The full set of statistics for each database lands - // 1) in the provider's most recent set of database statistics, and - // 2) on disk in the database's statistics ref'd prolly.Map. - curStats := p.getStats(dbName) - if curStats == nil { - curStats = newDbStats(dbName) - } - - newStats := make(map[sql.StatQualifier]*DoltStats) - var deletedStats []sql.StatQualifier - qualExists := make(map[sql.StatQualifier]bool) - tableExistsAndSkipped := make(map[string]bool) - - // important: update session references every loop - dSess := dsess.DSessFromSess(sqlCtx.Session) - prov := dSess.Provider() - ddb, ok := dSess.GetDoltDB(sqlCtx, dbName) - if !ok { - sqlCtx.GetLogger().Debugf("statistics refresh error: database not found %s", dbName) - } - - sqlDb, err := prov.Database(sqlCtx, dbName) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - tables, err := sqlDb.GetTableNames(sqlCtx) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - for _, table := range tables { - sqlTable, ok, err := sqlDb.GetTableInsensitive(sqlCtx, table) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - if !ok { - sqlCtx.GetLogger().Debugf("statistics refresh error: table not found %s", table) - continue - } - - var dTab *doltdb.Table - switch t := sqlTable.(type) { - case *sqle.AlterableDoltTable: - dTab, err = t.DoltTable.DoltTable(sqlCtx) - case *sqle.WritableDoltTable: - dTab, err = t.DoltTable.DoltTable(sqlCtx) - case *sqle.DoltTable: - dTab, err = t.DoltTable(sqlCtx) - default: - err = fmt.Errorf("failed to unwrap dolt table from type: %T", sqlTable) - } - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - tableHash, err := dTab.GetRowDataHash(ctx) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - if curStats.getLatestHash(table) == tableHash { - // no data changes since last check - tableExistsAndSkipped[table] = true - sqlCtx.GetLogger().Debugf("statistics refresh: table hash unchanged since last check: %s", tableHash) - continue - } else { - sqlCtx.GetLogger().Debugf("statistics refresh: new table hash: %s", tableHash) - } - - iat, ok := sqlTable.(sql.IndexAddressableTable) - if !ok { - sqlCtx.GetLogger().Debugf("statistics refresh error: table does not support indexes %s", table) - continue - } - - indexes, err := iat.GetIndexes(sqlCtx) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - // collect indexes and ranges to be updated - var idxMetas []indexMeta - for _, index := range indexes { - qual := sql.NewStatQualifier(dbName, table, strings.ToLower(index.ID())) - qualExists[qual] = true - curStat := curStats.getIndexStats(qual) - if curStat == nil { - curStat = NewDoltStats() - curStat.Qual = qual - - cols := make([]string, len(index.Expressions())) - tablePrefix := fmt.Sprintf("%s.", table) - for i, c := range index.Expressions() { - cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix) - } - curStat.Columns = cols - } - sqlCtx.GetLogger().Debugf("statistics refresh index: %s", qual.String()) - - updateMeta, err := newIdxMeta(sqlCtx, curStat, dTab, index, curStat.Columns) - if err != nil 
{ - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - curCnt := float64(len(curStat.active)) - updateCnt := float64(len(updateMeta.updateChunks)) - deleteCnt := float64(len(curStat.active) - len(updateMeta.preexisting)) - sqlCtx.GetLogger().Debugf("statistics current: %d, new: %d, delete: %d", int(curCnt), int(updateCnt), int(deleteCnt)) - - if curCnt == 0 || (deleteCnt+updateCnt)/curCnt > updateThresh { - sqlCtx.GetLogger().Debugf("statistics updating: %s", updateMeta.qual) - // mark index for updating - idxMetas = append(idxMetas, updateMeta) - // update lastest hash if we haven't already - curStats.setLatestHash(table, tableHash) - } - } - // get new buckets for index chunks to update - newTableStats, err := updateStats(sqlCtx, sqlTable, dTab, indexes, idxMetas) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - // merge new chunks with preexisting chunks - for _, updateMeta := range idxMetas { - stat := newTableStats[updateMeta.qual] - if stat != nil { - newStats[updateMeta.qual] = mergeStatUpdates(stat, updateMeta) - } - } - } - - func() { - curStats.mu.Lock() - defer curStats.mu.Unlock() - for _, s := range curStats.stats { - // table or index delete leaves hole in stats - // this is separate from threshold check - if !tableExistsAndSkipped[s.Qual.Table()] && !qualExists[s.Qual] { - // only delete stats we've verified are deleted - deletedStats = append(deletedStats, s.Qual) - } - } - }() - - prevMap := curStats.getCurrentMap() - if prevMap.KeyDesc().Count() == 0 { - kd, vd := schema.StatsTableDoltSchema.GetMapDescriptors() - prevMap, err = prolly.NewMapFromTuples(ctx, ddb.NodeStore(), kd, vd) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - } - - if len(deletedStats) == 0 && len(newStats) == 0 { - continue - } - - if len(deletedStats) > 0 { - sqlCtx.GetLogger().Debugf("statistics refresh: deleting stats %#v", deletedStats) - } - delMap, err := deleteStats(sqlCtx, prevMap, deletedStats...) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - newMap, err := flushStats(sqlCtx, delMap, newStats) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - - curStats.setCurrentMap(newMap) - for q, s := range newStats { - curStats.setIndexStats(q, s) - } - p.setStats(dbName, curStats) - err = ddb.SetStatisics(ctx, newMap.HashOf()) - if err != nil { - sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) - continue - } - } - } - }) -} - -func newIdxMeta(ctx *sql.Context, curStats *DoltStats, doltTable *doltdb.Table, sqlIndex sql.Index, cols []string) (indexMeta, error) { - var idx durable.Index - var err error - if strings.EqualFold(sqlIndex.ID(), "PRIMARY") { - idx, err = doltTable.GetRowData(ctx) - } else { - idx, err = doltTable.GetIndexRowData(ctx, sqlIndex.ID()) - } - if err != nil { - return indexMeta{}, err - } - - prollyMap := durable.ProllyMapFromIndex(idx) - - // get newest histogram target level hashes - levelNodes, err := tree.GetHistogramLevel(ctx, prollyMap.Tuples(), bucketLowCnt) - if err != nil { - return indexMeta{}, err - } - - var addrs []hash.Hash - var preservedStats []DoltBucket - var missingAddrs float64 - var missingChunks []tree.Node - var missingOffsets [][]uint64 - var offset uint64 - for _, n := range levelNodes { - // Compare the previous histogram chunks to the newest tree chunks. 
- // Partition the newest chunks into 1) preserved or 2) missing. - // Missing chunks will need to be scanned on a stats update, so - // track the (start, end) ordinal offsets to simplify the read iter. - treeCnt, err := n.TreeCount() - if err != nil { - return indexMeta{}, err - } - - addrs = append(addrs, n.HashOf()) - if bucketIdx, ok := curStats.active[n.HashOf()]; !ok { - missingChunks = append(missingChunks, n) - missingOffsets = append(missingOffsets, []uint64{offset, offset + uint64(treeCnt)}) - missingAddrs++ - } else { - preservedStats = append(preservedStats, curStats.Histogram[bucketIdx]) - } - offset += uint64(treeCnt) - } - return indexMeta{ - qual: curStats.Qual, - cols: cols, - updateChunks: missingChunks, - updateOrdinals: missingOffsets, - preexisting: preservedStats, - allAddrs: addrs, - }, nil -} diff --git a/go/libraries/doltcore/sqle/stats/stats_provider.go b/go/libraries/doltcore/sqle/stats/stats_provider.go deleted file mode 100644 index 55d408b3ed4..00000000000 --- a/go/libraries/doltcore/sqle/stats/stats_provider.go +++ /dev/null @@ -1,583 +0,0 @@ -// Copyright 2023 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package stats - -import ( - "context" - "errors" - "fmt" - "strings" - "sync" - "time" - - "github.com/dolthub/go-mysql-server/sql" - "github.com/dolthub/go-mysql-server/sql/stats" - - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" - "github.com/dolthub/dolt/go/libraries/doltcore/env" - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dtables" - "github.com/dolthub/dolt/go/store/hash" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/prolly/tree" -) - -var ErrFailedToLoad = errors.New("failed to load statistics") - -type DoltStats struct { - mu *sync.Mutex - // chunks is a list of addresses for the histogram fanout level - chunks []hash.Hash - // active maps a chunk/bucket address to its position in - // the histogram. 
1-indexed to differentiate from an empty - // field on disk - active map[hash.Hash]int - - RowCount uint64 - DistinctCount uint64 - NullCount uint64 - AvgSize uint64 - Qual sql.StatQualifier - CreatedAt time.Time - Histogram DoltHistogram - Columns []string - Types []sql.Type - IdxClass uint8 - LowerBound sql.Row - fds *sql.FuncDepSet - colSet sql.ColSet -} - -func NewDoltStats() *DoltStats { - return &DoltStats{mu: &sync.Mutex{}, active: make(map[hash.Hash]int)} -} - -func DoltStatsFromSql(stat sql.Statistic) (*DoltStats, error) { - hist, err := DoltHistFromSql(stat.Histogram(), stat.Types()) - if err != nil { - return nil, err - } - return &DoltStats{ - mu: &sync.Mutex{}, - Qual: stat.Qualifier(), - RowCount: stat.RowCount(), - DistinctCount: stat.DistinctCount(), - NullCount: stat.NullCount(), - AvgSize: stat.AvgSize(), - CreatedAt: stat.CreatedAt(), - Histogram: hist, - Columns: stat.Columns(), - Types: stat.Types(), - IdxClass: uint8(stat.IndexClass()), - LowerBound: stat.LowerBound(), - fds: stat.FuncDeps(), - colSet: stat.ColSet(), - }, nil -} - -func (s *DoltStats) updateActive() { - s.mu.Lock() - defer s.mu.Unlock() - newActive := make(map[hash.Hash]int) - for i, hash := range s.chunks { - newActive[hash] = i - } - s.active = newActive -} - -func (s *DoltStats) updateCounts() { - s.mu.Lock() - defer s.mu.Unlock() - var newDistinct uint64 - var newRows uint64 - var newNulls uint64 - for _, b := range s.Histogram { - newDistinct += b.DistinctCount - newRows += b.RowCount - newNulls += b.NullCount - } - s.RowCount = newRows - s.DistinctCount = newDistinct - s.NullCount = newNulls -} - -func (s *DoltStats) toSql() sql.Statistic { - s.mu.Lock() - defer s.mu.Unlock() - typStrs := make([]string, len(s.Types)) - for i, typ := range s.Types { - typStrs[i] = typ.String() - } - stat := stats.NewStatistic(s.RowCount, s.DistinctCount, s.NullCount, s.AvgSize, s.CreatedAt, s.Qual, s.Columns, s.Types, s.Histogram.toSql(), sql.IndexClass(s.IdxClass), s.LowerBound) - return stat.WithColSet(s.colSet).WithFuncDeps(s.fds) -} - -type DoltHistogram []DoltBucket - -type DoltBucket struct { - Chunk hash.Hash - RowCount uint64 - DistinctCount uint64 - NullCount uint64 - CreatedAt time.Time - Mcvs []sql.Row - McvCount []uint64 - BoundCount uint64 - UpperBound sql.Row -} - -func DoltHistFromSql(hist sql.Histogram, types []sql.Type) (DoltHistogram, error) { - ret := make([]DoltBucket, len(hist)) - var err error - for i, b := range hist { - upperBound := make(sql.Row, len(b.UpperBound())) - for i, v := range b.UpperBound() { - upperBound[i], _, err = types[i].Convert(v) - if err != nil { - return nil, fmt.Errorf("failed to convert %v to type %s", v, types[i].String()) - } - } - mcvs := make([]sql.Row, len(b.Mcvs())) - for i, mcv := range b.Mcvs() { - for _, v := range mcv { - conv, _, err := types[i].Convert(v) - if err != nil { - return nil, fmt.Errorf("failed to convert %v to type %s", v, types[i].String()) - } - mcvs[i] = append(mcvs[i], conv) - } - } - ret[i] = DoltBucket{ - RowCount: b.RowCount(), - DistinctCount: b.DistinctCount(), - NullCount: b.NullCount(), - Mcvs: mcvs, - McvCount: b.McvCounts(), - BoundCount: b.BoundCount(), - UpperBound: upperBound, - } - } - return ret, nil -} - -func (s DoltHistogram) toSql() []*stats.Bucket { - ret := make([]*stats.Bucket, len(s)) - for i, b := range s { - upperBound := make([]interface{}, len(b.UpperBound)) - copy(upperBound, b.UpperBound) - ret[i] = stats.NewHistogramBucket(b.RowCount, b.DistinctCount, b.NullCount, b.BoundCount, upperBound, b.McvCount, b.Mcvs) 
- } - return ret -} - -type indexMeta struct { - qual sql.StatQualifier - cols []string - updateChunks []tree.Node - // [start, stop] ordinals for each chunk for update - updateOrdinals [][]uint64 - preexisting []DoltBucket - allAddrs []hash.Hash -} - -func NewProvider() *Provider { - return &Provider{ - mu: &sync.Mutex{}, - dbStats: make(map[string]*dbStats), - cancelers: make(map[string]context.CancelFunc), - status: make(map[string]string), - } -} - -// Provider is the engine interface for reading and writing index statistics. -// Each database has its own statistics table that all tables/indexes in a db -// share. -type Provider struct { - mu *sync.Mutex - latestRootAddr hash.Hash - dbStats map[string]*dbStats - cancelers map[string]context.CancelFunc - starter sqle.InitDatabaseHook - status map[string]string -} - -// each database has one statistics table that is a collection of the -// table stats in the database -type dbStats struct { - mu *sync.Mutex - db string - stats map[sql.StatQualifier]*DoltStats - currentMap prolly.Map - latestRoot *doltdb.RootValue - latestTableHashes map[string]hash.Hash -} - -func newDbStats(dbName string) *dbStats { - return &dbStats{ - mu: &sync.Mutex{}, - db: dbName, - stats: make(map[sql.StatQualifier]*DoltStats), - latestTableHashes: make(map[string]hash.Hash), - } -} - -var _ sql.StatsProvider = (*Provider)(nil) - -func (p *Provider) StartRefreshThread(ctx *sql.Context, pro dsess.DoltDatabaseProvider, name string, env *env.DoltEnv) error { - err := p.starter(ctx, pro.(*sqle.DoltDatabaseProvider), name, env) - if err != nil { - p.UpdateStatus(name, fmt.Sprintf("error restarting thread %s: %s", name, err.Error())) - return err - } - p.UpdateStatus(name, fmt.Sprintf("restarted thread: %s", name)) - return nil -} - -func (p *Provider) SetStarter(hook sqle.InitDatabaseHook) { - p.starter = hook -} - -func (p *Provider) CancelRefreshThread(dbName string) { - p.mu.Lock() - defer p.mu.Unlock() - if cancel, ok := p.cancelers[dbName]; ok { - cancel() - p.status[dbName] = fmt.Sprintf("cancelled thread: %s", dbName) - } -} - -func (p *Provider) ThreadStatus(dbName string) string { - if msg, ok := p.status[dbName]; ok { - return msg - } - return "no active stats thread" -} - -func (p *Provider) setStats(dbName string, s *dbStats) { - p.mu.Lock() - defer p.mu.Unlock() - p.dbStats[dbName] = s - if s != nil && len(s.stats) > 0 { - p.status[dbName] = fmt.Sprintf("updated to hash: %s", s.currentMap.HashOf()) - } -} - -func (p *Provider) getStats(dbName string) *dbStats { - p.mu.Lock() - defer p.mu.Unlock() - s, _ := p.dbStats[dbName] - return s -} - -func (s *dbStats) getLatestHash(tableName string) hash.Hash { - s.mu.Lock() - defer s.mu.Unlock() - h, _ := s.latestTableHashes[tableName] - return h -} - -func (s *dbStats) setLatestHash(tableName string, h hash.Hash) { - s.mu.Lock() - defer s.mu.Unlock() - s.latestTableHashes[tableName] = h -} - -func (s *dbStats) getCurrentMap() prolly.Map { - s.mu.Lock() - defer s.mu.Unlock() - return s.currentMap -} - -func (s *dbStats) setCurrentMap(m prolly.Map) { - s.mu.Lock() - defer s.mu.Unlock() - s.currentMap = m -} - -func (s *dbStats) getIndexStats(qual sql.StatQualifier) *DoltStats { - s.mu.Lock() - defer s.mu.Unlock() - stat, _ := s.stats[qual] - return stat -} - -func (s *dbStats) setIndexStats(qual sql.StatQualifier, stat *DoltStats) { - s.mu.Lock() - defer s.mu.Unlock() - s.stats[qual] = stat -} - -func (s *dbStats) dropIndexStats(qual sql.StatQualifier) { - s.mu.Lock() - defer s.mu.Unlock() - delete(s.stats, qual) -} 
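These mutex-guarded accessors are the heart of the old design: one dbStats value per database, with no branch dimension. The statspro and statsnoms code introduced later in this patch keys the same lookups by branch as well, scanning a parallel slice of branch names on each access. A minimal sketch of that per-branch lookup pattern, using simplified stand-ins rather than the patch's actual types:

package main

import (
	"fmt"
	"strings"
)

// qual is a simplified stand-in for sql.StatQualifier.
type qual struct{ db, table, index string }

// branchDB mirrors NomsStatsDatabase's layout: parallel slices indexed by
// branch position, so the same database can carry independent statistics
// per branch.
type branchDB struct {
	branches []string
	stats    []map[qual]string // string stands in for *DoltStats
}

// getStat is the case-insensitive branch scan that the new code repeats in
// GetStat, SetStat, DeleteStats, and Flush.
func (d *branchDB) getStat(branch string, q qual) (string, bool) {
	for i, b := range d.branches {
		if strings.EqualFold(b, branch) {
			s, ok := d.stats[i][q]
			return s, ok
		}
	}
	return "", false
}

func main() {
	d := &branchDB{
		branches: []string{"main", "feature"},
		stats: []map[qual]string{
			{{db: "mydb", table: "t", index: "primary"}: "main histogram"},
			{{db: "mydb", table: "t", index: "primary"}: "feature histogram"},
		},
	}
	fmt.Println(d.getStat("MAIN", qual{"mydb", "t", "primary"}))
}

The deleted Load function below shows the old single-map, single-branch flow that this pattern replaces.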
- -// Init scans the statistics tables, populating the |stats| attribute. -// Statistics are not available for reading until we've finished loading. -func (p *Provider) Load(ctx *sql.Context, dbs []dsess.SqlDatabase) error { - for _, db := range dbs { - // set map keys so concurrent orthogonal writes are OK - p.setStats(strings.ToLower(db.Name()), newDbStats(strings.ToLower(db.Name()))) - } - - eg, ctx := ctx.NewErrgroup() - for _, db := range dbs { - // copy closure variables - dbName := strings.ToLower(db.Name()) - db := db - eg.Go(func() (err error) { - defer func() { - if r := recover(); r != nil { - if str, ok := r.(fmt.Stringer); ok { - err = fmt.Errorf("%w: %s", ErrFailedToLoad, str.String()) - } else { - err = fmt.Errorf("%w: %v", ErrFailedToLoad, r) - } - - return - } - }() - - m, err := db.DbData().Ddb.GetStatistics(ctx) - if errors.Is(err, doltdb.ErrNoStatistics) { - return nil - } else if err != nil { - return err - } - if cnt, err := m.Count(); err != nil { - return err - } else if cnt == 0 { - return nil - } - stats, err := loadStats(ctx, db, m) - if errors.Is(err, dtables.ErrIncompatibleVersion) { - ctx.Warn(0, err.Error()) - return nil - } else if err != nil { - return err - } - p.setStats(dbName, stats) - return nil - }) - } - return eg.Wait() -} - -func (p *Provider) GetTableStats(ctx *sql.Context, db, table string) ([]sql.Statistic, error) { - var ret []sql.Statistic - if dbStat := p.getStats(strings.ToLower(db)); dbStat != nil { - dbStat.mu.Lock() - defer dbStat.mu.Unlock() - for qual, stat := range dbStat.stats { - if strings.EqualFold(db, qual.Database) && strings.EqualFold(table, qual.Tab) { - ret = append(ret, stat.toSql()) - } - } - } - return ret, nil -} - -func (p *Provider) SetStats(ctx *sql.Context, stats sql.Statistic) error { - doltStats, err := DoltStatsFromSql(stats) - if err != nil { - return err - } - dbName := strings.ToLower(stats.Qualifier().Database) - stat := p.getStats(dbName) - if stat == nil { - stat = newDbStats(dbName) - } - stat.setIndexStats(stats.Qualifier(), doltStats) - p.setStats(dbName, stat) - return nil -} - -func (p *Provider) GetStats(ctx *sql.Context, qual sql.StatQualifier, cols []string) (sql.Statistic, bool) { - if stat := p.getStats(strings.ToLower(qual.Database)); stat != nil { - idxStat := stat.getIndexStats(qual) - if idxStat != nil { - return idxStat.toSql(), true - } - } - return nil, false -} - -func (p *Provider) DropDbStats(ctx *sql.Context, db string, flush bool) error { - p.setStats(db, nil) - p.mu.Lock() - defer p.mu.Unlock() - p.status[db] = "dropped" - if flush { - dSess := dsess.DSessFromSess(ctx.Session) - ddb, ok := dSess.GetDoltDB(ctx, db) - if !ok { - return nil - } - return ddb.DropStatisics(ctx) - } - return nil -} - -func (p *Provider) DropStats(ctx *sql.Context, qual sql.StatQualifier, cols []string) error { - if stat := p.getStats(strings.ToLower(qual.Database)); stat != nil { - stat.dropIndexStats(qual) - p.UpdateStatus(qual.Db(), fmt.Sprintf("dropped statisic: %s", qual.String())) - } - return nil -} - -func (p *Provider) UpdateStatus(db string, msg string) { - p.mu.Lock() - defer p.mu.Unlock() - p.status[db] = msg -} - -func (p *Provider) RowCount(ctx *sql.Context, db, table string) (uint64, error) { - if dbStat := p.getStats(strings.ToLower(db)); dbStat != nil { - dbStat.mu.Lock() - defer dbStat.mu.Unlock() - for qual, s := range dbStat.stats { - if strings.EqualFold(db, qual.Database) && strings.EqualFold(table, qual.Table()) && strings.EqualFold(qual.Index(), "primary") { - return s.RowCount, nil - } 
- } - } - return 0, nil -} - -func (p *Provider) DataLength(_ *sql.Context, db, table string) (uint64, error) { - if dbStat := p.getStats(strings.ToLower(db)); dbStat != nil { - dbStat.mu.Lock() - defer dbStat.mu.Unlock() - for qual, s := range dbStat.stats { - if strings.EqualFold(db, qual.Database) && strings.EqualFold(table, qual.Table()) && strings.EqualFold(qual.Index(), "primary") { - return s.AvgSize, nil - } - } - } - return 0, nil -} - -func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db string) error { - tableName := strings.ToLower(table.Name()) - dbName := strings.ToLower(db) - - iat, ok := table.(sql.IndexAddressableTable) - if !ok { - return nil - } - indexes, err := iat.GetIndexes(ctx) - if err != nil { - return err - } - - // it's important to update session references every call - dSess := dsess.DSessFromSess(ctx.Session) - prov := dSess.Provider() - sqlDb, err := prov.Database(ctx, dbName) - if err != nil { - return err - } - sqlTable, ok, err := sqlDb.GetTableInsensitive(ctx, tableName) - if err != nil { - return err - } - if !ok { - return fmt.Errorf("error creating statistics for table: %s; table not found", tableName) - } - - var dTab *doltdb.Table - switch t := sqlTable.(type) { - case *sqle.AlterableDoltTable: - dTab, err = t.DoltTable.DoltTable(ctx) - case *sqle.WritableDoltTable: - dTab, err = t.DoltTable.DoltTable(ctx) - case *sqle.DoltTable: - dTab, err = t.DoltTable(ctx) - default: - return fmt.Errorf("failed to unwrap dolt table from type: %T", sqlTable) - } - if err != nil { - return err - } - - curStats := p.getStats(dbName) - if curStats == nil { - curStats = newDbStats(dbName) - } - - tablePrefix := fmt.Sprintf("%s.", tableName) - var idxMetas []indexMeta - for _, idx := range indexes { - cols := make([]string, len(idx.Expressions())) - for i, c := range idx.Expressions() { - cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix) - } - - qual := sql.NewStatQualifier(db, table.Name(), strings.ToLower(idx.ID())) - curStat := curStats.getIndexStats(qual) - if curStat == nil { - curStat = NewDoltStats() - curStat.Qual = qual - } - idxMeta, err := newIdxMeta(ctx, curStat, dTab, idx, cols) - if err != nil { - return err - } - idxMetas = append(idxMetas, idxMeta) - } - - newTableStats, err := updateStats(ctx, sqlTable, dTab, indexes, idxMetas) - if err != nil { - return err - } - - // merge new chunks with preexisting chunks - newStats := make(map[sql.StatQualifier]*DoltStats) - for _, idxMeta := range idxMetas { - stat := newTableStats[idxMeta.qual] - newStats[idxMeta.qual] = mergeStatUpdates(stat, idxMeta) - } - - ddb, ok := dSess.GetDoltDB(ctx, dbName) - if !ok { - return fmt.Errorf("database not found in session for stats update: %s", db) - } - - prevMap := curStats.currentMap - if prevMap.KeyDesc().Count() == 0 { - kd, vd := schema.StatsTableDoltSchema.GetMapDescriptors() - prevMap, err = prolly.NewMapFromTuples(ctx, ddb.NodeStore(), kd, vd) - if err != nil { - return err - } - } - newMap, err := flushStats(ctx, prevMap, newStats) - if err != nil { - return err - } - - curStats.setCurrentMap(newMap) - for k, v := range newStats { - curStats.setIndexStats(k, v) - } - - p.setStats(dbName, curStats) - - return ddb.SetStatisics(ctx, newMap.HashOf()) -} diff --git a/go/libraries/doltcore/sqle/stats/write.go b/go/libraries/doltcore/sqle/stats/write.go deleted file mode 100644 index c2e27445a31..00000000000 --- a/go/libraries/doltcore/sqle/stats/write.go +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2024 Dolthub, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package stats - -import ( - "errors" - "fmt" - "io" - "strings" - - "github.com/dolthub/go-mysql-server/sql" - - "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" - "github.com/dolthub/dolt/go/libraries/doltcore/schema" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/store/prolly" - "github.com/dolthub/dolt/go/store/prolly/tree" - stypes "github.com/dolthub/dolt/go/store/types" - "github.com/dolthub/dolt/go/store/val" -) - -// About ~200 20 byte address fit in a ~4k chunk. Chunk sizes -// are approximate, but certainly shouldn't reach the square -// of the expected size. -const maxBucketFanout = 200 * 200 - -func newStatsTable(ctx *sql.Context, ns tree.NodeStore, vrw stypes.ValueReadWriter) (*doltdb.Table, error) { - return doltdb.CreateEmptyTable(ctx, ns, vrw, schema.StatsTableDoltSchema) -} - -// flushStats writes a set of table statistics to the given node store, and returns a new prolly.Map -func flushStats(ctx *sql.Context, prev prolly.Map, tableStats map[sql.StatQualifier]*DoltStats) (prolly.Map, error) { - if _, disabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsMemoryOnly); disabled == int8(1) { - // do not write to disk - return prolly.Map{}, nil - } - - sch := schema.StatsTableDoltSchema - kd, vd := sch.GetMapDescriptors() - var m *prolly.MutableMap - m = prev.Mutate() - pool := prev.NodeStore().Pool() - - keyBuilder := val.NewTupleBuilder(kd) - valueBuilder := val.NewTupleBuilder(vd) - - stringifyKey := func(r sql.Row, types []sql.Type) string { - b := strings.Builder{} - sep := "" - for i, v := range r { - if v == nil { - v = types[i].Zero() - } - fmt.Fprintf(&b, "%s%v", sep, v) - sep = "," - } - return b.String() - } - for qual, stats := range tableStats { - var pos int64 - - // delete previous entries for this index -> (db, table, index, pos) - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Table()) - keyBuilder.PutString(2, qual.Index()) - keyBuilder.PutInt64(3, 0) - firstKey := keyBuilder.Build(pool) - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Table()) - keyBuilder.PutString(2, qual.Index()) - keyBuilder.PutInt64(3, maxBucketFanout+1) - maxKey := keyBuilder.Build(pool) - - // there is a limit on the number of buckets for a given index, iter - // will terminate before maxBucketFanout - iter, err := prev.IterKeyRange(ctx, firstKey, maxKey) - if err != nil { - return prolly.Map{}, err - } - - for { - k, _, err := iter.Next(ctx) - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return prolly.Map{}, err - } - err = m.Put(ctx, k, nil) - if err != nil { - return prolly.Map{}, err - } - } - - // now add new buckets - typesB := strings.Builder{} - sep := "" - for _, t := range stats.Types { - typesB.WriteString(sep + t.String()) - sep = "," - } - typesStr := typesB.String() - - if len(stats.Types) != len(stats.Columns) { - ctx.GetLogger().Println(stats.Qual.String()) - ctx.GetLogger().Println(typesStr) - 
ctx.GetLogger().Println(strings.Join(stats.Columns, ",")) - panic("invalid statistic") - } - - for _, h := range stats.Histogram { - var upperBoundElems []string - for _, v := range h.UpperBound { - upperBoundElems = append(upperBoundElems, fmt.Sprintf("%v", v)) - } - - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Tab) - keyBuilder.PutString(2, qual.Idx) - keyBuilder.PutInt64(3, pos) - - valueBuilder.PutInt64(0, schema.StatsVersion) - valueBuilder.PutString(1, h.Chunk.String()) - valueBuilder.PutInt64(2, int64(h.RowCount)) - valueBuilder.PutInt64(3, int64(h.DistinctCount)) - valueBuilder.PutInt64(4, int64(h.NullCount)) - valueBuilder.PutString(5, strings.Join(stats.Columns, ",")) - valueBuilder.PutString(6, typesStr) - valueBuilder.PutString(7, stringifyKey(h.UpperBound, stats.Types)) - valueBuilder.PutInt64(8, int64(h.BoundCount)) - valueBuilder.PutDatetime(9, h.CreatedAt) - for i, r := range h.Mcvs { - valueBuilder.PutString(10+i, stringifyKey(r, stats.Types)) - } - var mcvCntsRow sql.Row - for _, v := range h.McvCount { - mcvCntsRow = append(mcvCntsRow, int(v)) - } - valueBuilder.PutString(14, stringifyKey(mcvCntsRow, stats.Types)) - - key := keyBuilder.Build(pool) - value := valueBuilder.Build(pool) - m.Put(ctx, key, value) - pos++ - } - } - - return m.Map(ctx) -} - -func deleteStats(ctx *sql.Context, prev prolly.Map, quals ...sql.StatQualifier) (prolly.Map, error) { - if cnt, err := prev.Count(); err != nil { - return prolly.Map{}, err - } else if cnt == 0 { - return prev, nil - } - - sch := schema.StatsTableDoltSchema - kd, _ := sch.GetMapDescriptors() - var m *prolly.MutableMap - m = prev.Mutate() - pool := prev.NodeStore().Pool() - - keyBuilder := val.NewTupleBuilder(kd) - - for _, qual := range quals { - // delete previous entries for this index -> (db, table, index, pos) - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Table()) - keyBuilder.PutString(2, qual.Index()) - keyBuilder.PutInt64(3, 0) - firstKey := keyBuilder.Build(pool) - keyBuilder.PutString(0, qual.Database) - keyBuilder.PutString(1, qual.Table()) - keyBuilder.PutString(2, qual.Index()) - keyBuilder.PutInt64(3, maxBucketFanout+1) - maxKey := keyBuilder.Build(pool) - - // there is a limit on the number of buckets for a given index, iter - // will terminate before maxBucketFanout - iter, err := prev.IterKeyRange(ctx, firstKey, maxKey) - if err != nil { - return prolly.Map{}, err - } - - for { - k, _, err := iter.Next(ctx) - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return prolly.Map{}, err - } - err = m.Put(ctx, k, nil) - if err != nil { - return prolly.Map{}, err - } - } - } - return m.Map(ctx) -} diff --git a/go/libraries/doltcore/sqle/statsnoms/database.go b/go/libraries/doltcore/sqle/statsnoms/database.go new file mode 100644 index 00000000000..bb34e703a55 --- /dev/null +++ b/go/libraries/doltcore/sqle/statsnoms/database.go @@ -0,0 +1,317 @@ +// Copyright 2024 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
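+
+// Package statsnoms persists index statistics in a separate noms-backed
+// database, one per source database, rooted in the dolt stats directory.
+// Statistics are tracked per branch: updates accumulate in a mutable
+// prolly map, and Flush writes the finished map's root hash to that
+// branch's statistics ref.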
+ +package statsnoms + +import ( + "context" + "errors" + "fmt" + "path" + "strings" + "sync" + + "github.com/dolthub/go-mysql-server/sql" + + "github.com/dolthub/dolt/go/libraries/doltcore/dbfactory" + "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" + "github.com/dolthub/dolt/go/libraries/doltcore/env" + "github.com/dolthub/dolt/go/libraries/doltcore/schema" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" + "github.com/dolthub/dolt/go/libraries/doltcore/table/editor" + "github.com/dolthub/dolt/go/libraries/utils/earl" + "github.com/dolthub/dolt/go/libraries/utils/filesys" + "github.com/dolthub/dolt/go/store/hash" + "github.com/dolthub/dolt/go/store/prolly" + "github.com/dolthub/dolt/go/store/types" +) + +func NewNomsStatsFactory(dialPro dbfactory.GRPCDialProvider) *NomsStatsFactory { + return &NomsStatsFactory{dialPro: dialPro} +} + +type NomsStatsFactory struct { + dialPro dbfactory.GRPCDialProvider +} + +var _ statspro.StatsFactory = NomsStatsFactory{} + +func (sf NomsStatsFactory) Init(ctx *sql.Context, sourceDb dsess.SqlDatabase, prov *sqle.DoltDatabaseProvider, fs filesys.Filesys, hdp env.HomeDirProvider) (statspro.Database, error) { + params := make(map[string]interface{}) + params[dbfactory.GRPCDialProviderParam] = sf.dialPro + + var urlPath string + u, err := earl.Parse(prov.DbFactoryUrl()) + if u.Scheme == dbfactory.MemScheme { + urlPath = path.Join(prov.DbFactoryUrl(), dbfactory.DoltDataDir) + } else if u.Scheme == dbfactory.FileScheme { + urlPath = doltdb.LocalDirDoltDB + } + + statsFs, err := fs.WithWorkingDir(dbfactory.DoltStatsDir) + if err != nil { + return nil, err + } + + var dEnv *env.DoltEnv + exists, isDir := statsFs.Exists("") + if !exists { + err := statsFs.MkDirs("") + if err != nil { + return nil, fmt.Errorf("unable to make directory '%s', cause: %s", dbfactory.DoltStatsDir, err.Error()) + } + + dEnv = env.Load(context.Background(), hdp, statsFs, urlPath, "test") + sess := dsess.DSessFromSess(ctx.Session) + err = dEnv.InitRepo(ctx, types.Format_Default, sess.Username(), sess.Email(), prov.DefaultBranch()) + if err != nil { + return nil, err + } + } else if !isDir { + return nil, fmt.Errorf("file exists where the dolt stats directory should be") + } else { + dEnv = env.LoadWithoutDB(ctx, hdp, statsFs, "") + } + + ddb, err := doltdb.LoadDoltDBWithParams(ctx, types.Format_Default, urlPath, statsFs, params) + if err != nil { + return nil, err + } + + dEnv.DoltDB = ddb + + deaf := dEnv.DbEaFactory() + + tmpDir, err := dEnv.TempTableFilesDir() + if err != nil { + return nil, err + } + opts := editor.Options{ + Deaf: deaf, + Tempdir: tmpDir, + } + statsDb, err := sqle.NewDatabase(ctx, "stats", dEnv.DbData(), opts) + if err != nil { + return nil, err + } + return NewNomsStats(sourceDb, statsDb), nil +} + +func NewNomsStats(sourceDb, statsDb dsess.SqlDatabase) *NomsStatsDatabase { + return &NomsStatsDatabase{mu: &sync.Mutex{}, destDb: statsDb, sourceDb: sourceDb} +} + +type dbStats map[sql.StatQualifier]*statspro.DoltStats + +type NomsStatsDatabase struct { + mu *sync.Mutex + destDb dsess.SqlDatabase + sourceDb dsess.SqlDatabase + stats []dbStats + branches []string + latestTableRoots []map[string]hash.Hash + dirty []*prolly.MutableMap +} + +var _ statspro.Database = (*NomsStatsDatabase)(nil) + +func (n *NomsStatsDatabase) Close() error { + return n.destDb.DbData().Ddb.Close() +} + +func (n *NomsStatsDatabase) LoadBranchStats(ctx 
*sql.Context, branch string) error { + statsMap, err := n.destDb.DbData().Ddb.GetStatistics(ctx, branch) + if errors.Is(err, doltdb.ErrNoStatistics) { + return nil + } else if err != nil { + return err + } + doltStats, err := loadStats(ctx, n.sourceDb, statsMap) + if err != nil { + return err + } + n.branches = append(n.branches, branch) + n.stats = append(n.stats, doltStats) + n.dirty = append(n.dirty, nil) + n.latestTableRoots = append(n.latestTableRoots, make(map[string]hash.Hash)) + return nil +} + +func (n *NomsStatsDatabase) getBranchStats(branch string) dbStats { + for i, b := range n.branches { + if strings.EqualFold(b, branch) { + return n.stats[i] + } + } + return nil +} + +func (n *NomsStatsDatabase) GetStat(branch string, qual sql.StatQualifier) (*statspro.DoltStats, bool) { + stats := n.getBranchStats(branch) + ret, ok := stats[qual] + return ret, ok +} + +func (n *NomsStatsDatabase) ListStatQuals(branch string) []sql.StatQualifier { + stats := n.getBranchStats(branch) + var ret []sql.StatQualifier + for qual, _ := range stats { + ret = append(ret, qual) + } + return ret +} + +func (n *NomsStatsDatabase) SetStat(ctx context.Context, branch string, qual sql.StatQualifier, stats *statspro.DoltStats) error { + var statsMap *prolly.MutableMap + for i, b := range n.branches { + if strings.EqualFold(branch, b) { + n.stats[i][qual] = stats + if n.dirty[i] == nil { + n.initMutable(ctx, i) + } + statsMap = n.dirty[i] + } + } + if statsMap == nil { + if err := n.trackBranch(ctx, branch); err != nil { + return err + } + statsMap = n.dirty[len(n.branches)-1] + n.stats[len(n.branches)-1][qual] = stats + } + + return n.replaceStats(ctx, statsMap, stats) +} + +func (n *NomsStatsDatabase) trackBranch(ctx context.Context, branch string) error { + n.branches = append(n.branches, branch) + n.stats = append(n.stats, make(dbStats)) + n.latestTableRoots = append(n.latestTableRoots, make(map[string]hash.Hash)) + + kd, vd := schema.StatsTableDoltSchema.GetMapDescriptors() + newMap, err := prolly.NewMapFromTuples(ctx, n.destDb.DbData().Ddb.NodeStore(), kd, vd) + if err != nil { + return err + } + n.dirty = append(n.dirty, newMap.Mutate()) + return n.destDb.DbData().Ddb.SetStatisics(ctx, branch, newMap.HashOf()) +} + +func (n *NomsStatsDatabase) initMutable(ctx context.Context, i int) error { + statsMap, err := n.destDb.DbData().Ddb.GetStatistics(ctx, n.branches[i]) + if err != nil { + return err + } + n.dirty[i] = statsMap.Mutate() + return nil +} + +func (n *NomsStatsDatabase) DeleteStats(branch string, quals ...sql.StatQualifier) { + for i, b := range n.branches { + if strings.EqualFold(b, branch) { + for _, qual := range quals { + delete(n.stats[i], qual) + } + } + } +} + +func (n *NomsStatsDatabase) DeleteBranchStats(ctx context.Context, branch string, flush bool) error { + for i, b := range n.branches { + if strings.EqualFold(b, branch) { + n.branches = append(n.branches[:i], n.branches[i+1:]...) + n.dirty = append(n.dirty[:i], n.dirty[i+1:]...) + n.stats = append(n.stats[:i], n.stats[i+1:]...) + n.latestTableRoots = append(n.latestTableRoots[:i], n.latestTableRoots[i+1:]...) 
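+ // a branch is tracked at most once (callers only add branches they
+ // failed to find), so at most one entry is spliced out of each slice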
+ } + } + if flush { + return n.destDb.DbData().Ddb.DropStatisics(ctx, branch) + } + return nil +} + +func (n *NomsStatsDatabase) ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, _, newChunks []statspro.DoltBucket) error { + var dbStat dbStats + for i, b := range n.branches { + if strings.EqualFold(b, branch) { + // naive merge the new with old + dbStat = n.stats[i] + } + } + + if dbStat == nil { + if err := n.trackBranch(ctx, branch); err != nil { + return err + } + dbStat = n.stats[len(n.branches)-1] + } + + if _, ok := dbStat[qual]; ok { + oldChunks := dbStat[qual].Histogram + targetBuckets, err := statspro.MergeNewChunks(targetHashes, oldChunks, newChunks) + if err != nil { + return err + } + dbStat[qual].Histogram = targetBuckets + } else { + dbStat[qual] = statspro.NewDoltStats() + } + dbStat[qual].Chunks = targetHashes + dbStat[qual].UpdateActive() + + // let |n.SetStats| update memory and disk + return n.SetStat(ctx, branch, qual, dbStat[qual]) +} + +func (n *NomsStatsDatabase) Flush(ctx context.Context, branch string) error { + for i, b := range n.branches { + if strings.EqualFold(b, branch) { + if n.dirty[i] != nil { + flushedMap, err := n.dirty[i].Map(ctx) + if err != nil { + return err + } + n.dirty[i] = nil + n.destDb.DbData().Ddb.SetStatisics(ctx, branch, flushedMap.HashOf()) + return nil + } + } + } + return nil +} + +func (n *NomsStatsDatabase) GetLatestHash(branch, tableName string) hash.Hash { + n.mu.Lock() + defer n.mu.Unlock() + for i, b := range n.branches { + if strings.EqualFold(branch, b) { + return n.latestTableRoots[i][tableName] + } + } + return hash.Hash{} +} + +func (n *NomsStatsDatabase) SetLatestHash(branch, tableName string, h hash.Hash) { + n.mu.Lock() + defer n.mu.Unlock() + for i, b := range n.branches { + if strings.EqualFold(branch, b) { + n.latestTableRoots[i][tableName] = h + break + } + } +} diff --git a/go/libraries/doltcore/sqle/statsnoms/iter.go b/go/libraries/doltcore/sqle/statsnoms/iter.go new file mode 100644 index 00000000000..a0ea725bc51 --- /dev/null +++ b/go/libraries/doltcore/sqle/statsnoms/iter.go @@ -0,0 +1,175 @@ +// Copyright 2024 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
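+
+// This file implements statsIter, which walks the serialized stats map and
+// yields one sql.Row per histogram bucket. Map keys are (db, table, index,
+// position) tuples; values carry the stats version, chunk address, row,
+// distinct, and null counts, the column and type lists, the upper bound,
+// and most-common values, several encoded as comma-separated strings that
+// ParseRow converts back into typed values.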
+ +package statsnoms + +import ( + "fmt" + "strings" + "time" + + "github.com/dolthub/go-mysql-server/sql" + "github.com/dolthub/go-mysql-server/sql/planbuilder" + "github.com/dolthub/go-mysql-server/sql/stats" + "gopkg.in/errgo.v2/errors" + + "github.com/dolthub/dolt/go/libraries/doltcore/schema" + "github.com/dolthub/dolt/go/store/hash" + "github.com/dolthub/dolt/go/store/prolly" + "github.com/dolthub/dolt/go/store/prolly/tree" + "github.com/dolthub/dolt/go/store/val" +) + +var ErrIncompatibleVersion = errors.New("client stats version mismatch") + +func NewStatsIter(ctx *sql.Context, m prolly.Map) (*statsIter, error) { + iter, err := m.IterAll(ctx) + if err != nil { + return nil, err + } + kd, vd := m.Descriptors() + keyBuilder := val.NewTupleBuilder(kd) + valueBuilder := val.NewTupleBuilder(vd) + ns := m.NodeStore() + + return &statsIter{ + iter: iter, + kb: keyBuilder, + vb: valueBuilder, + ns: ns, + planb: planbuilder.New(ctx, nil), + }, nil +} + +// statsIter reads histogram buckets into string-compatible types. +// Values that are SQL rows should be converted with statsIter.ParseRow. +// todo: make a JSON compatible container for sql.Row w/ types so that we +// can eagerly convert to sql.Row without sacrificing string printing. +type statsIter struct { + iter prolly.MapIter + kb, vb *val.TupleBuilder + ns tree.NodeStore + planb *planbuilder.Builder + currentQual string + currentTypes []sql.Type +} + +var _ sql.RowIter = (*statsIter)(nil) + +func (s *statsIter) Next(ctx *sql.Context) (sql.Row, error) { + k, v, err := s.iter.Next(ctx) + if err != nil { + return nil, err + } + + // deserialize K, V + version, err := tree.GetField(ctx, s.vb.Desc, 0, v, s.ns) + if err != nil { + return nil, err + } + if version != schema.StatsVersion { + return nil, fmt.Errorf("%w: write version %d does not match read version %d", ErrIncompatibleVersion, version, schema.StatsVersion) + } + + var row sql.Row + for i := 0; i < s.kb.Desc.Count(); i++ { + f, err := tree.GetField(ctx, s.kb.Desc, i, k, s.ns) + if err != nil { + return nil, err + } + row = append(row, f) + } + + for i := 0; i < s.vb.Desc.Count(); i++ { + f, err := tree.GetField(ctx, s.vb.Desc, i, v, s.ns) + if err != nil { + return nil, err + } + row = append(row, f) + } + + dbName := row[schema.StatsDbTag].(string) + tableName := row[schema.StatsTableTag].(string) + indexName := row[schema.StatsIndexTag].(string) + position := row[schema.StatsPositionTag].(int64) + _ = row[schema.StatsVersionTag] + commit := hash.Parse(row[schema.StatsCommitHashTag].(string)) + rowCount := row[schema.StatsRowCountTag].(int64) + distinctCount := row[schema.StatsDistinctCountTag].(int64) + nullCount := row[schema.StatsNullCountTag].(int64) + columnsStr := row[schema.StatsColumnsTag].(string) + typesStr := row[schema.StatsTypesTag].(string) + upperBoundStr := row[schema.StatsUpperBoundTag].(string) + upperBoundCnt := row[schema.StatsUpperBoundCntTag].(int64) + createdAt := row[schema.StatsCreatedAtTag].(time.Time) + + typs := strings.Split(typesStr, ",") + for i, t := range typs { + typs[i] = strings.TrimSpace(t) + } + + qual := sql.NewStatQualifier(dbName, tableName, indexName) + if curQual := qual.String(); !strings.EqualFold(curQual, s.currentQual) { + s.currentQual = curQual + s.currentTypes, err = stats.ParseTypeStrings(typs) + if err != nil { + return nil, err + } + } + + mcvCountsStr := row[schema.StatsMcvCountsTag].(string) + + numMcvs := schema.StatsMcvCountsTag - schema.StatsMcv1Tag + mcvs := make([]string, numMcvs) + for i, v := range 
row[schema.StatsMcv1Tag:schema.StatsMcvCountsTag] { + if v != nil { + mcvs[i] = v.(string) + } + } + + return sql.Row{ + dbName, + tableName, + indexName, + int(position), + version, + commit.String(), + uint64(rowCount), + uint64(distinctCount), + uint64(nullCount), + columnsStr, + typesStr, + upperBoundStr, + uint64(upperBoundCnt), + createdAt, + mcvs[0], mcvs[1], mcvs[2], mcvs[3], + mcvCountsStr, + }, nil +} + +func (s *statsIter) ParseRow(rowStr string) (sql.Row, error) { + var row sql.Row + for i, v := range strings.Split(rowStr, ",") { + val, _, err := s.currentTypes[i].Convert(v) + if err != nil { + return nil, err + } + row = append(row, val) + } + return row, nil +} + +func (s *statsIter) Close(context *sql.Context) error { + return nil +} diff --git a/go/libraries/doltcore/sqle/stats/read.go b/go/libraries/doltcore/sqle/statsnoms/load.go similarity index 87% rename from go/libraries/doltcore/sqle/stats/read.go rename to go/libraries/doltcore/sqle/statsnoms/load.go index 3e6083f5a30..d73b23e3c8d 100644 --- a/go/libraries/doltcore/sqle/stats/read.go +++ b/go/libraries/doltcore/sqle/statsnoms/load.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package stats +package statsnoms import ( "errors" @@ -28,21 +28,21 @@ import ( "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" "github.com/dolthub/dolt/go/libraries/doltcore/schema" "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" - "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dtables" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" "github.com/dolthub/dolt/go/store/hash" "github.com/dolthub/dolt/go/store/prolly" "github.com/dolthub/dolt/go/store/prolly/tree" "github.com/dolthub/dolt/go/store/val" ) -func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (*dbStats, error) { - dbStat := newDbStats(db.Name()) +func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.StatQualifier]*statspro.DoltStats, error) { + qualToStats := make(map[sql.StatQualifier]*statspro.DoltStats) - iter, err := dtables.NewStatsIter(ctx, m) + iter, err := NewStatsIter(ctx, m) if err != nil { return nil, err } - currentStat := NewDoltStats() + currentStat := statspro.NewDoltStats() var lowerBound sql.Row for { row, err := iter.Next(ctx) @@ -52,8 +52,6 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (*dbStats, return nil, err } - position := row[schema.StatsPositionTag].(int) - // deserialize K, V dbName := row[schema.StatsDbTag].(string) tableName := row[schema.StatsTableTag].(string) @@ -121,13 +119,13 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (*dbStats, if err != nil { return nil, err } - currentStat.fds = fds - currentStat.colSet = colSet - currentStat.updateActive() - dbStat.stats[currentStat.Qual] = currentStat + currentStat.Fds = fds + currentStat.ColSet = colSet + currentStat.UpdateActive() + qualToStats[currentStat.Qual] = currentStat } - currentStat = NewDoltStats() + currentStat = statspro.NewDoltStats() currentStat.Qual = qual currentStat.Columns = columns currentStat.LowerBound = lowerBound @@ -141,7 +139,7 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (*dbStats, currentStat.Qual = qual } - bucket := DoltBucket{ + bucket := statspro.DoltBucket{ Chunk: commit, RowCount: uint64(rowCount), DistinctCount: uint64(distinctCount), @@ -153,7 +151,6 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) 
(*dbStats, UpperBound: boundRow, } - currentStat.active[commit] = position currentStat.Histogram = append(currentStat.Histogram, bucket) currentStat.RowCount += uint64(rowCount) currentStat.DistinctCount += uint64(distinctCount) @@ -170,17 +167,16 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (*dbStats, if err != nil { return nil, err } - currentStat.fds = fds - currentStat.colSet = colSet - currentStat.updateActive() - dbStat.setIndexStats(currentStat.Qual, currentStat) - dbStat.stats[currentStat.Qual] = currentStat - return dbStat, nil + currentStat.Fds = fds + currentStat.ColSet = colSet + currentStat.UpdateActive() + qualToStats[currentStat.Qual] = currentStat + return qualToStats, nil } func loadLowerBound(ctx *sql.Context, qual sql.StatQualifier) (sql.Row, error) { dSess := dsess.DSessFromSess(ctx.Session) - roots, ok := dSess.GetRoots(ctx, qual.Database) + roots, ok := dSess.GetRoots(ctx, qual.Db()) if !ok { return nil, nil } @@ -229,12 +225,12 @@ func loadFuncDeps(ctx *sql.Context, db dsess.SqlDatabase, qual sql.StatQualifier if err != nil { return nil, sql.ColSet{}, err } else if !ok { - return nil, sql.ColSet{}, fmt.Errorf("%w: table not found: '%s'", ErrFailedToLoad, qual.Table()) + return nil, sql.ColSet{}, fmt.Errorf("%w: table not found: '%s'", statspro.ErrFailedToLoad, qual.Table()) } iat, ok := tab.(sql.IndexAddressable) if !ok { - return nil, sql.ColSet{}, fmt.Errorf("%w: table does not have indexes: '%s'", ErrFailedToLoad, qual.Table()) + return nil, sql.ColSet{}, fmt.Errorf("%w: table does not have indexes: '%s'", statspro.ErrFailedToLoad, qual.Table()) } indexes, err := iat.GetIndexes(ctx) @@ -251,7 +247,7 @@ func loadFuncDeps(ctx *sql.Context, db dsess.SqlDatabase, qual sql.StatQualifier } if idx == nil { - return nil, sql.ColSet{}, fmt.Errorf("%w: index not found: '%s'", ErrFailedToLoad, qual.Index()) + return nil, sql.ColSet{}, fmt.Errorf("%w: index not found: '%s'", statspro.ErrFailedToLoad, qual.Index()) } return stats.IndexFds(qual.Table(), tab.Schema(), idx) diff --git a/go/libraries/doltcore/sqle/statsnoms/write.go b/go/libraries/doltcore/sqle/statsnoms/write.go new file mode 100644 index 00000000000..aa8fdc9f31a --- /dev/null +++ b/go/libraries/doltcore/sqle/statsnoms/write.go @@ -0,0 +1,144 @@ +// Copyright 2024 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statsnoms + +import ( + "context" + "errors" + "fmt" + "io" + "strings" + + "github.com/dolthub/go-mysql-server/sql" + "github.com/dolthub/go-mysql-server/sql/stats" + + "github.com/dolthub/dolt/go/libraries/doltcore/schema" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/statspro" + "github.com/dolthub/dolt/go/store/prolly" + "github.com/dolthub/dolt/go/store/val" +) + +// About ~200 20 byte address fit in a ~4k chunk. Chunk sizes +// are approximate, but certainly shouldn't reach the square +// of the expected size. 
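+// The square (200 * 200 = 40,000) is therefore a safe ceiling on the number
+// of buckets per index; the key-range deletes below use maxBucketFanout+1
+// as their upper key.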
+const maxBucketFanout = 200 * 200 + +func (n *NomsStatsDatabase) replaceStats(ctx context.Context, statsMap *prolly.MutableMap, dStats *statspro.DoltStats) error { + if err := deleteIndexRows(ctx, statsMap, dStats); err != nil { + return err + } + return putIndexRows(ctx, statsMap, dStats) +} + +func deleteIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *statspro.DoltStats) error { + sch := schema.StatsTableDoltSchema + kd, _ := sch.GetMapDescriptors() + + keyBuilder := val.NewTupleBuilder(kd) + + qual := dStats.Qual + pool := statsMap.NodeStore().Pool() + + // delete previous entries for this index -> (db, table, index, pos) + keyBuilder.PutString(0, qual.Database) + keyBuilder.PutString(1, qual.Table()) + keyBuilder.PutString(2, qual.Index()) + keyBuilder.PutInt64(3, 0) + firstKey := keyBuilder.Build(pool) + keyBuilder.PutString(0, qual.Database) + keyBuilder.PutString(1, qual.Table()) + keyBuilder.PutString(2, qual.Index()) + keyBuilder.PutInt64(3, maxBucketFanout+1) + maxKey := keyBuilder.Build(pool) + + // there is a limit on the number of buckets for a given index, iter + // will terminate before maxBucketFanout + iter, err := statsMap.IterKeyRange(ctx, firstKey, maxKey) + if err != nil { + return err + } + + for { + k, _, err := iter.Next(ctx) + if errors.Is(err, io.EOF) { + break + } else if err != nil { + return err + } + err = statsMap.Put(ctx, k, nil) + if err != nil { + return err + } + } + return nil +} + +func putIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *statspro.DoltStats) error { + sch := schema.StatsTableDoltSchema + kd, vd := sch.GetMapDescriptors() + + keyBuilder := val.NewTupleBuilder(kd) + valueBuilder := val.NewTupleBuilder(vd) + + qual := dStats.Qual + pool := statsMap.NodeStore().Pool() + + // now add new buckets + typesB := strings.Builder{} + sep := "" + for _, t := range dStats.Types { + typesB.WriteString(sep + t.String()) + sep = "," + } + typesStr := typesB.String() + + var pos int64 + for _, h := range dStats.Histogram { + var upperBoundElems []string + for _, v := range h.UpperBound { + upperBoundElems = append(upperBoundElems, fmt.Sprintf("%v", v)) + } + + keyBuilder.PutString(0, qual.Database) + keyBuilder.PutString(1, qual.Tab) + keyBuilder.PutString(2, qual.Idx) + keyBuilder.PutInt64(3, pos) + + valueBuilder.PutInt64(0, schema.StatsVersion) + valueBuilder.PutString(1, h.Chunk.String()) + valueBuilder.PutInt64(2, int64(h.RowCount)) + valueBuilder.PutInt64(3, int64(h.DistinctCount)) + valueBuilder.PutInt64(4, int64(h.NullCount)) + valueBuilder.PutString(5, strings.Join(dStats.Columns, ",")) + valueBuilder.PutString(6, typesStr) + valueBuilder.PutString(7, stats.StringifyKey(h.UpperBound, dStats.Types)) + valueBuilder.PutInt64(8, int64(h.BoundCount)) + valueBuilder.PutDatetime(9, h.CreatedAt) + for i, r := range h.Mcvs { + valueBuilder.PutString(10+i, stats.StringifyKey(r, dStats.Types)) + } + var mcvCntsRow sql.Row + for _, v := range h.McvCount { + mcvCntsRow = append(mcvCntsRow, int(v)) + } + valueBuilder.PutString(14, stats.StringifyKey(mcvCntsRow, dStats.Types)) + + key := keyBuilder.Build(pool) + value := valueBuilder.Build(pool) + statsMap.Put(ctx, key, value) + pos++ + } + return nil +} diff --git a/go/libraries/doltcore/sqle/statspro/analyze.go b/go/libraries/doltcore/sqle/statspro/analyze.go new file mode 100644 index 00000000000..0b436ea2b53 --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/analyze.go @@ -0,0 +1,241 @@ +// Copyright 2024 Dolthub, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statspro + +import ( + "fmt" + "strings" + + "github.com/dolthub/go-mysql-server/sql" + + "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" + "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" + "github.com/dolthub/dolt/go/libraries/doltcore/env" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/dolt/go/store/hash" + "github.com/dolthub/dolt/go/store/prolly/tree" +) + +func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db string) error { + dSess := dsess.DSessFromSess(ctx.Session) + branch, err := dSess.GetBranch() + if err != nil { + return err + } + + sqlDb, err := dSess.Provider().Database(ctx, fmt.Sprintf("%s/%s", db, branch)) + if err != nil { + return err + } + + // lock only after accessing DatabaseProvider + p.mu.Lock() + defer p.mu.Unlock() + + tableName := strings.ToLower(table.Name()) + dbName := strings.ToLower(db) + + iat, ok := table.(sql.IndexAddressableTable) + if !ok { + return nil + } + indexes, err := iat.GetIndexes(ctx) + if err != nil { + return err + } + + // it's important to update WORKING session references every call + sqlTable, dTab, err := GetLatestTable(ctx, tableName, sqlDb) + if err != nil { + return err + } + + statDb, ok := p.getStatDb(dbName) + if !ok { + // if the stats database does not exist, initialize one + fs, err := p.pro.FileSystemForDatabase(dbName) + if err != nil { + return err + } + sourceDb, ok := p.pro.BaseDatabase(ctx, dbName) + if !ok { + return sql.ErrDatabaseNotFound.New(dbName) + } + statDb, err = p.sf.Init(ctx, sourceDb, p.pro, fs, env.GetCurrentUserHomeDir) + if err != nil { + ctx.Warn(0, err.Error()) + return nil + } + p.setStatDb(dbName, statDb) + } + + tablePrefix := fmt.Sprintf("%s.", tableName) + var idxMetas []indexMeta + for _, idx := range indexes { + cols := make([]string, len(idx.Expressions())) + for i, c := range idx.Expressions() { + cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix) + } + + qual := sql.NewStatQualifier(db, table.Name(), strings.ToLower(idx.ID())) + curStat, ok := statDb.GetStat(branch, qual) + if !ok { + curStat = NewDoltStats() + curStat.Qual = qual + } + idxMeta, err := newIdxMeta(ctx, curStat, dTab, idx, cols) + if err != nil { + return err + } + idxMetas = append(idxMetas, idxMeta) + } + + newTableStats, err := createNewStatsBuckets(ctx, sqlTable, dTab, indexes, idxMetas) + if err != nil { + return err + } + + // merge new chunks with preexisting chunks + for _, idxMeta := range idxMetas { + stat := newTableStats[idxMeta.qual] + targetChunks, err := MergeNewChunks(idxMeta.allAddrs, idxMeta.keepChunks, stat.Histogram) + if err != nil { + return err + } + if targetChunks == nil { + // empty table + continue + } + stat.Chunks = idxMeta.allAddrs + stat.Histogram = targetChunks + stat.UpdateActive() + if err := statDb.SetStat(ctx, branch, idxMeta.qual, stat); err != nil { + return err + } + } + + 
p.UpdateStatus(dbName, fmt.Sprintf("refreshed %s", dbName)) + return statDb.Flush(ctx, branch) +} + +// GetLatestTable will get the WORKING root table for the current database/branch +func GetLatestTable(ctx *sql.Context, tableName string, sqlDb sql.Database) (sql.Table, *doltdb.Table, error) { + sqlTable, ok, err := sqlDb.(sqle.Database).GetTableInsensitive(ctx, tableName) + if err != nil { + return nil, nil, err + } + if !ok { + return nil, nil, fmt.Errorf("statistics refresh error: table not found %s", tableName) + } + + var dTab *doltdb.Table + switch t := sqlTable.(type) { + case *sqle.AlterableDoltTable: + dTab, err = t.DoltTable.DoltTable(ctx) + case *sqle.WritableDoltTable: + dTab, err = t.DoltTable.DoltTable(ctx) + case *sqle.DoltTable: + dTab, err = t.DoltTable(ctx) + default: + err = fmt.Errorf("failed to unwrap dolt table from type: %T", sqlTable) + } + if err != nil { + return nil, nil, err + } + return sqlTable, dTab, nil +} + +func newIdxMeta(ctx *sql.Context, curStats *DoltStats, doltTable *doltdb.Table, sqlIndex sql.Index, cols []string) (indexMeta, error) { + var idx durable.Index + var err error + if strings.EqualFold(sqlIndex.ID(), "PRIMARY") { + idx, err = doltTable.GetRowData(ctx) + } else { + idx, err = doltTable.GetIndexRowData(ctx, sqlIndex.ID()) + } + if err != nil { + return indexMeta{}, err + } + + prollyMap := durable.ProllyMapFromIndex(idx) + + if cnt, err := prollyMap.Count(); err != nil { + return indexMeta{}, err + } else if cnt == 0 { + return indexMeta{ + qual: curStats.Qual, + cols: cols, + }, nil + } + + // get newest histogram target level hashes + levelNodes, err := tree.GetHistogramLevel(ctx, prollyMap.Tuples(), bucketLowCnt) + if err != nil { + return indexMeta{}, err + } + + var addrs []hash.Hash + var keepChunks []DoltBucket + var missingAddrs float64 + var missingChunks []tree.Node + var missingOffsets []updateOrdinal + var offset uint64 + + for _, n := range levelNodes { + // Compare the previous histogram chunks to the newest tree chunks. + // Partition the newest chunks into 1) preserved or 2) missing. + // Missing chunks will need to be scanned on a stats update, so + // track the (start, end) ordinal offsets to simplify the read iter. + treeCnt, err := n.TreeCount() + if err != nil { + return indexMeta{}, err + } + + addrs = append(addrs, n.HashOf()) + if bucketIdx, ok := curStats.Active[n.HashOf()]; !ok { + missingChunks = append(missingChunks, n) + missingOffsets = append(missingOffsets, updateOrdinal{offset, offset + uint64(treeCnt)}) + missingAddrs++ + } else { + keepChunks = append(keepChunks, curStats.Histogram[bucketIdx]) + } + offset += uint64(treeCnt) + } + + var dropChunks []DoltBucket + for _, h := range curStats.Chunks { + var match bool + for _, b := range keepChunks { + if b.Chunk == h { + match = true + break + } + } + if !match { + dropChunks = append(dropChunks, curStats.Histogram[curStats.Active[h]]) + } + } + + return indexMeta{ + qual: curStats.Qual, + cols: cols, + newNodes: missingChunks, + updateOrdinals: missingOffsets, + keepChunks: keepChunks, + dropChunks: dropChunks, + allAddrs: addrs, + }, nil +} diff --git a/go/libraries/doltcore/sqle/statspro/auto_refresh.go b/go/libraries/doltcore/sqle/statspro/auto_refresh.go new file mode 100644 index 00000000000..89f7b6ad6c8 --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/auto_refresh.go @@ -0,0 +1,244 @@ +// Copyright 2024 Dolthub, Inc. 
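The keep/missing partition in newIdxMeta above is the core of the incremental refresh: level nodes whose addresses already appear in curStats.Active are reused as-is, and only the remainder is rescanned by its [start, stop) ordinal range. The same bookkeeping in isolation, as a runnable toy where strings stand in for chunk addresses (all names here are illustrative, not from the patch):

    package main

    import "fmt"

    type ordinal struct{ start, stop uint64 }

    // partition mirrors the newIdxMeta bookkeeping: nodes already present in
    // |active| are kept, the rest are recorded with their [start, stop) row range.
    func partition(counts []uint64, hashes []string, active map[string]bool) []ordinal {
    	var missing []ordinal
    	var offset uint64
    	for i, h := range hashes {
    		if !active[h] {
    			missing = append(missing, ordinal{offset, offset + counts[i]})
    		}
    		offset += counts[i]
    	}
    	return missing
    }

    func main() {
    	counts := []uint64{10, 20, 30}
    	hashes := []string{"a", "b", "c"}
    	active := map[string]bool{"a": true, "c": true}
    	fmt.Println(partition(counts, hashes, active)) // prints [{10 30}]
    }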
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statspro + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/dolthub/go-mysql-server/sql" + types2 "github.com/dolthub/go-mysql-server/sql/types" + + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" +) + +const asyncAutoRefreshStats = "async_auto_refresh_stats" + +func (p *Provider) InitAutoRefresh(ctxFactory func(ctx context.Context) (*sql.Context, error), dbName string, bThreads *sql.BackgroundThreads) error { + _, threshold, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshThreshold) + _, interval, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshInterval) + interval64, _, _ := types2.Int64.Convert(interval) + intervalSec := time.Second * time.Duration(interval64.(int64)) + thresholdf64 := threshold.(float64) + + ctx, err := ctxFactory(context.Background()) + if err != nil { + return err + } + + branches := p.getStatsBranches(ctx) + + return p.InitAutoRefreshWithParams(ctxFactory, dbName, bThreads, intervalSec, thresholdf64, branches) +} + +func (p *Provider) InitAutoRefreshWithParams(ctxFactory func(ctx context.Context) (*sql.Context, error), dbName string, bThreads *sql.BackgroundThreads, checkInterval time.Duration, updateThresh float64, branches []string) error { + // this is only called after initial statistics are finished loading + // launch a thread that periodically checks freshness + + p.mu.Lock() + defer p.mu.Unlock() + + dropDbCtx, dbStatsCancel := context.WithCancel(context.Background()) + p.cancelers[dbName] = dbStatsCancel + + return bThreads.Add(fmt.Sprintf("%s_%s", asyncAutoRefreshStats, dbName), func(ctx context.Context) { + ticker := time.NewTicker(checkInterval + time.Nanosecond) + for { + select { + case <-ctx.Done(): + ticker.Stop() + return + case <-ticker.C: + select { + case <-dropDbCtx.Done(): + ticker.Stop() + return + default: + } + + sqlCtx, err := ctxFactory(ctx) + if err != nil { + return + } + + dSess := dsess.DSessFromSess(sqlCtx.Session) + ddb, ok := dSess.GetDoltDB(sqlCtx, dbName) + if !ok { + sqlCtx.GetLogger().Debugf("statistics refresh error: database not found %s", dbName) + return + } + for _, branch := range branches { + if br, ok, err := ddb.HasBranch(ctx, branch); ok { + sqlCtx.GetLogger().Debugf("starting statistics refresh check for '%s': %s", dbName, time.Now().String()) + // update WORKING session references + sqlDb, err := dSess.Provider().Database(sqlCtx, fmt.Sprintf("%s/%s", dbName, branch)) + if err != nil { + sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) + return + } + + if err := p.checkRefresh(sqlCtx, sqlDb, dbName, br, updateThresh); err != nil { + sqlCtx.GetLogger().Debugf("statistics refresh error: %s", err.Error()) + return + } + } else if err != nil { + sqlCtx.GetLogger().Debugf("statistics refresh error: branch check error %s", err.Error()) + } else { + sqlCtx.GetLogger().Debugf("statistics refresh error: branch not found %s", br) + } + } + } + } + }) +} + +func (p *Provider) 
checkRefresh(ctx *sql.Context, sqlDb sql.Database, dbName, branch string, updateThresh float64) error {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	// Iterate all dbs, tables, indexes. Each db will collect
+	// []indexMeta above refresh threshold. We read and process those
+	// chunks' statistics. We merge updated chunks with precomputed
+	// chunks. The full set of statistics for each database lands
+	// 1) in the provider's most recent set of database statistics, and
+	// 2) on disk in the database's statistics ref'd prolly.Map.
+	statDb, ok := p.getStatDb(dbName)
+	if !ok {
+		return sql.ErrDatabaseNotFound.New(dbName)
+	}
+
+	var deletedStats []sql.StatQualifier
+	qualExists := make(map[sql.StatQualifier]bool)
+	tableExistsAndSkipped := make(map[string]bool)
+
+	tables, err := sqlDb.GetTableNames(ctx)
+	if err != nil {
+		return err
+	}
+
+	for _, table := range tables {
+		sqlTable, dTab, err := GetLatestTable(ctx, table, sqlDb)
+		if err != nil {
+			return err
+		}
+
+		tableHash, err := dTab.GetRowDataHash(ctx)
+		if err != nil {
+			return err
+		}
+
+		if statDb.GetLatestHash(branch, table) == tableHash {
+			// no data changes since last check
+			tableExistsAndSkipped[table] = true
+			ctx.GetLogger().Debugf("statistics refresh: table hash unchanged since last check: %s", tableHash)
+			continue
+		} else {
+			ctx.GetLogger().Debugf("statistics refresh: new table hash: %s", tableHash)
+		}
+
+		iat, ok := sqlTable.(sql.IndexAddressableTable)
+		if !ok {
+			return fmt.Errorf("table does not support indexes: %s", table)
+		}
+
+		indexes, err := iat.GetIndexes(ctx)
+		if err != nil {
+			return err
+		}
+
+		// collect indexes and ranges to be updated
+		var idxMetas []indexMeta
+		for _, index := range indexes {
+			qual := sql.NewStatQualifier(dbName, table, strings.ToLower(index.ID()))
+			qualExists[qual] = true
+			curStat, ok := statDb.GetStat(branch, qual)
+			if !ok {
+				curStat = NewDoltStats()
+				curStat.Qual = qual
+
+				cols := make([]string, len(index.Expressions()))
+				tablePrefix := fmt.Sprintf("%s.", table)
+				for i, c := range index.Expressions() {
+					cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix)
+				}
+				curStat.Columns = cols
+			}
+			ctx.GetLogger().Debugf("statistics refresh index: %s", qual.String())
+
+			updateMeta, err := newIdxMeta(ctx, curStat, dTab, index, curStat.Columns)
+			if err != nil {
+				ctx.GetLogger().Debugf("statistics refresh error: %s", err.Error())
+				continue
+			}
+			curCnt := float64(len(curStat.Active))
+			updateCnt := float64(len(updateMeta.newNodes))
+			deleteCnt := float64(len(curStat.Active) - len(updateMeta.keepChunks))
+			ctx.GetLogger().Debugf("statistics current: %d, new: %d, delete: %d", int(curCnt), int(updateCnt), int(deleteCnt))
+
+			if curCnt == 0 || (deleteCnt+updateCnt)/curCnt > updateThresh {
+				if curCnt == 0 && updateCnt == 0 {
+					continue
+				}
+				ctx.GetLogger().Debugf("statistics updating: %s", updateMeta.qual)
+				// mark index for updating
+				idxMetas = append(idxMetas, updateMeta)
+				// update latest hash if we haven't already
+				statDb.SetLatestHash(branch, table, tableHash)
+			}
+		}
+
+		// get new buckets for index chunks to update
+		newTableStats, err := createNewStatsBuckets(ctx, sqlTable, dTab, indexes, idxMetas)
+		if err != nil {
+			return err
+		}
+
+		// merge new chunks with preexisting chunks
+		for _, updateMeta := range idxMetas {
+			stat := newTableStats[updateMeta.qual]
+			if stat != nil {
+				var err error
+				if _, ok := statDb.GetStat(branch, updateMeta.qual); !ok {
+					err = statDb.SetStat(ctx, branch, updateMeta.qual, stat)
+				} else {
+					err = statDb.ReplaceChunks(ctx, 
branch, updateMeta.qual, updateMeta.allAddrs, updateMeta.dropChunks, stat.Histogram) + } + if err != nil { + return err + } + p.UpdateStatus(dbName, fmt.Sprintf("refreshed %s", dbName)) + } + } + } + + for _, q := range statDb.ListStatQuals(branch) { + // table or index delete leaves hole in stats + // this is separate from threshold check + if !tableExistsAndSkipped[q.Table()] && !qualExists[q] { + // only delete stats we've verified are deleted + deletedStats = append(deletedStats, q) + } + } + + statDb.DeleteStats(branch, deletedStats...) + + if err := statDb.Flush(ctx, branch); err != nil { + return err + } + + return nil +} diff --git a/go/libraries/doltcore/sqle/statspro/configure.go b/go/libraries/doltcore/sqle/statspro/configure.go new file mode 100644 index 00000000000..f8a9cb90f66 --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/configure.go @@ -0,0 +1,148 @@ +// Copyright 2024 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statspro + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/dolthub/go-mysql-server/sql" + types2 "github.com/dolthub/go-mysql-server/sql/types" + + "github.com/dolthub/dolt/go/libraries/doltcore/env" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/dolt/go/libraries/utils/filesys" +) + +func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Context) (*sql.Context, error), bThreads *sql.BackgroundThreads, dbs []dsess.SqlDatabase) error { + p.SetStarter(NewInitDatabaseHook(p, ctxFactory, bThreads, nil)) + + if _, disabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsMemoryOnly); disabled == int8(1) { + return nil + } + + loadCtx, err := ctxFactory(ctx) + if err != nil { + return err + } + + branches := p.getStatsBranches(loadCtx) + + var autoEnabled bool + var intervalSec time.Duration + var thresholdf64 float64 + if _, enabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshEnabled); enabled == int8(1) { + autoEnabled = true + _, threshold, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshThreshold) + _, interval, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshInterval) + interval64, _, _ := types2.Int64.Convert(interval) + intervalSec = time.Second * time.Duration(interval64.(int64)) + thresholdf64 = threshold.(float64) + + p.pro.InitDatabaseHook = NewInitDatabaseHook(p, ctxFactory, bThreads, p.pro.InitDatabaseHook) + p.pro.DropDatabaseHook = NewDropDatabaseHook(p, ctxFactory, p.pro.DropDatabaseHook) + } + + eg, ctx := loadCtx.NewErrgroup() + for _, db := range dbs { + // copy closure variables + db := db + eg.Go(func() (err error) { + defer func() { + if r := recover(); r != nil { + if str, ok := r.(fmt.Stringer); ok { + err = fmt.Errorf("%w: %s", ErrFailedToLoad, str.String()) + } else { + err = fmt.Errorf("%w: %v", ErrFailedToLoad, r) + } + + return + } + }() + + fs, err := p.pro.FileSystemForDatabase(db.Name()) + if err != nil { + return err + } + + if p.Load(loadCtx, fs, db, branches); err != 
nil { + return err + } + if autoEnabled { + return p.InitAutoRefreshWithParams(ctxFactory, db.Name(), bThreads, intervalSec, thresholdf64, branches) + } + return nil + }) + } + return eg.Wait() +} + +// getStatsBranches returns the set of branches whose statistics are tracked. +// The order of precedence is (1) global variable, (2) session current branch, +// (3) engine default branch. +func (p *Provider) getStatsBranches(ctx *sql.Context) []string { + dSess := dsess.DSessFromSess(ctx.Session) + var branches []string + if _, bs, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsBranches); bs == "" { + defaultBranch, _ := dSess.GetBranch() + if defaultBranch != "" { + branches = append(branches, defaultBranch) + } + } else { + for _, branch := range strings.Split(bs.(string), ",") { + branches = append(branches, strings.TrimSpace(branch)) + } + } + + if branches == nil { + branches = []string{p.pro.DefaultBranch()} + } + return branches +} + +func (p *Provider) LoadStats(ctx *sql.Context, db, branch string) error { + if statDb, ok := p.getStatDb(db); ok { + return statDb.LoadBranchStats(ctx, branch) + } + return nil +} + +// Load scans the statistics tables, populating the |stats| attribute. +// Statistics are not available for reading until we've finished loading. +func (p *Provider) Load(ctx *sql.Context, fs filesys.Filesys, db dsess.SqlDatabase, branches []string) { + // |statPath| is either file://./stat or mem://stat + statsDb, err := p.sf.Init(ctx, db, p.pro, fs, env.GetCurrentUserHomeDir) + if err != nil { + ctx.Warn(0, err.Error()) + return + } + + for _, branch := range branches { + err = statsDb.LoadBranchStats(ctx, branch) + if err != nil { + // if branch name is invalid, continue loading rest + // TODO: differentiate bad branch name from other errors + ctx.Warn(0, err.Error()) + continue + } + } + + p.mu.Lock() + defer p.mu.Unlock() + p.setStatDb(strings.ToLower(db.Name()), statsDb) + return +} diff --git a/go/libraries/doltcore/sqle/statspro/dolt_stats.go b/go/libraries/doltcore/sqle/statspro/dolt_stats.go new file mode 100644 index 00000000000..c20689cc33f --- /dev/null +++ b/go/libraries/doltcore/sqle/statspro/dolt_stats.go @@ -0,0 +1,172 @@ +// Copyright 2024 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statspro + +import ( + "fmt" + "sync" + "time" + + "github.com/dolthub/go-mysql-server/sql" + "github.com/dolthub/go-mysql-server/sql/stats" + + "github.com/dolthub/dolt/go/store/hash" +) + +type DoltStats struct { + mu *sync.Mutex + // Chunks is a list of addresses for the histogram fanout level + Chunks []hash.Hash + // Active maps a chunk/bucket address to its position in + // the histogram. 
1-indexed to differentiate from an empty
+	// field on disk
+	Active map[hash.Hash]int
+
+	RowCount      uint64
+	DistinctCount uint64
+	NullCount     uint64
+	AvgSize       uint64
+	Qual          sql.StatQualifier
+	CreatedAt     time.Time
+	Histogram     DoltHistogram
+	Columns       []string
+	Types         []sql.Type
+	IdxClass      uint8
+	LowerBound    sql.Row
+	Fds           *sql.FuncDepSet
+	ColSet        sql.ColSet
+}
+
+func NewDoltStats() *DoltStats {
+	return &DoltStats{mu: &sync.Mutex{}, Active: make(map[hash.Hash]int)}
+}
+
+func DoltStatsFromSql(stat sql.Statistic) (*DoltStats, error) {
+	hist, err := DoltHistFromSql(stat.Histogram(), stat.Types())
+	if err != nil {
+		return nil, err
+	}
+	return &DoltStats{
+		mu:            &sync.Mutex{},
+		Qual:          stat.Qualifier(),
+		RowCount:      stat.RowCount(),
+		DistinctCount: stat.DistinctCount(),
+		NullCount:     stat.NullCount(),
+		AvgSize:       stat.AvgSize(),
+		CreatedAt:     stat.CreatedAt(),
+		Histogram:     hist,
+		Columns:       stat.Columns(),
+		Types:         stat.Types(),
+		IdxClass:      uint8(stat.IndexClass()),
+		LowerBound:    stat.LowerBound(),
+		Fds:           stat.FuncDeps(),
+		ColSet:        stat.ColSet(),
+	}, nil
+}
+
+func (s *DoltStats) UpdateActive() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	newActive := make(map[hash.Hash]int)
+	for i, hash := range s.Chunks {
+		newActive[hash] = i
+	}
+	s.Active = newActive
+}
+
+func (s *DoltStats) updateCounts() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	var newDistinct uint64
+	var newRows uint64
+	var newNulls uint64
+	for _, b := range s.Histogram {
+		newDistinct += b.DistinctCount
+		newRows += b.RowCount
+		newNulls += b.NullCount
+	}
+	s.RowCount = newRows
+	s.DistinctCount = newDistinct
+	s.NullCount = newNulls
+}
+
+func (s *DoltStats) toSql() sql.Statistic {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	typStrs := make([]string, len(s.Types))
+	for i, typ := range s.Types {
+		typStrs[i] = typ.String()
+	}
+	stat := stats.NewStatistic(s.RowCount, s.DistinctCount, s.NullCount, s.AvgSize, s.CreatedAt, s.Qual, s.Columns, s.Types, s.Histogram.toSql(), sql.IndexClass(s.IdxClass), s.LowerBound)
+	return stat.WithColSet(s.ColSet).WithFuncDeps(s.Fds)
+}
+
+type DoltHistogram []DoltBucket
+
+type DoltBucket struct {
+	Chunk         hash.Hash
+	RowCount      uint64
+	DistinctCount uint64
+	NullCount     uint64
+	CreatedAt     time.Time
+	Mcvs          []sql.Row
+	McvCount      []uint64
+	BoundCount    uint64
+	UpperBound    sql.Row
+}
+
+func DoltHistFromSql(hist sql.Histogram, types []sql.Type) (DoltHistogram, error) {
+	ret := make([]DoltBucket, len(hist))
+	var err error
+	for i, b := range hist {
+		upperBound := make(sql.Row, len(b.UpperBound()))
+		for i, v := range b.UpperBound() {
+			upperBound[i], _, err = types[i].Convert(v)
+			if err != nil {
+				return nil, fmt.Errorf("failed to convert %v to type %s", v, types[i].String())
+			}
+		}
+		mcvs := make([]sql.Row, len(b.Mcvs()))
+		for i, mcv := range b.Mcvs() {
+			// each MCV is a full row; convert each value with the type
+			// for its column position
+			for j, v := range mcv {
+				conv, _, err := types[j].Convert(v)
+				if err != nil {
+					return nil, fmt.Errorf("failed to convert %v to type %s", v, types[j].String())
+				}
+				mcvs[i] = append(mcvs[i], conv)
+			}
+		}
+		ret[i] = DoltBucket{
+			RowCount:      b.RowCount(),
+			DistinctCount: b.DistinctCount(),
+			NullCount:     b.NullCount(),
+			Mcvs:          mcvs,
+			McvCount:      b.McvCounts(),
+			BoundCount:    b.BoundCount(),
+			UpperBound:    upperBound,
+		}
+	}
+	return ret, nil
+}
+
+func (s DoltHistogram) toSql() []*stats.Bucket {
+	ret := make([]*stats.Bucket, len(s))
+	for i, b := range s {
+		upperBound := make([]interface{}, len(b.UpperBound))
+		copy(upperBound, b.UpperBound)
+		ret[i] = stats.NewHistogramBucket(b.RowCount, b.DistinctCount, b.NullCount, b.BoundCount, upperBound, b.McvCount, b.Mcvs)
+	}
+	return ret
+}
diff --git a/go/libraries/doltcore/sqle/stats/initdbhook.go b/go/libraries/doltcore/sqle/statspro/initdbhook.go
similarity index 52%
rename from go/libraries/doltcore/sqle/stats/initdbhook.go
rename to go/libraries/doltcore/sqle/statspro/initdbhook.go
index a44f4d05151..5c2705d1108 100644
--- a/go/libraries/doltcore/sqle/stats/initdbhook.go
+++ b/go/libraries/doltcore/sqle/statspro/initdbhook.go
@@ -12,34 +12,52 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package stats
+package statspro
 
 import (
 	"context"
-	"time"
+	"strings"
 
 	"github.com/dolthub/go-mysql-server/sql"
-	types2 "github.com/dolthub/go-mysql-server/sql/types"
 
 	"github.com/dolthub/dolt/go/libraries/doltcore/env"
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
 	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
 )
 
-func NewInitDatabaseHook(statsProv *Provider, ctxFactory func(ctx context.Context) (*sql.Context, error), bThreads *sql.BackgroundThreads, orig sqle.InitDatabaseHook) sqle.InitDatabaseHook {
-	return func(ctx *sql.Context, pro *sqle.DoltDatabaseProvider, name string, denv *env.DoltEnv) error {
+func NewInitDatabaseHook(
+	statsProv *Provider,
+	ctxFactory func(ctx context.Context) (*sql.Context, error),
+	bThreads *sql.BackgroundThreads,
+	orig sqle.InitDatabaseHook,
+) sqle.InitDatabaseHook {
+	return func(
+		ctx *sql.Context,
+		pro *sqle.DoltDatabaseProvider,
+		name string,
+		denv *env.DoltEnv,
+		db dsess.SqlDatabase,
+	) error {
+		// We assume there is nothing on disk to read. Probably safe, and it also
+		// avoids a deadlock with dbProvider from trying to read the root/session.
 		if orig != nil {
-			err := orig(ctx, pro, name, denv)
+			err := orig(ctx, pro, name, denv, db)
 			if err != nil {
 				return err
 			}
 		}
-		_, threshold, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshThreshold)
-		_, interval, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshInterval)
-		interval64, _, _ := types2.Int64.Convert(interval)
-		intervalSec := time.Second * time.Duration(interval64.(int64))
-		thresholdf64 := threshold.(float64)
-		return statsProv.InitAutoRefresh(ctxFactory, name, bThreads, intervalSec, thresholdf64)
+
+		statsDb, err := statsProv.sf.Init(ctx, db, statsProv.pro, denv.FS, env.GetCurrentUserHomeDir)
+		if err != nil {
+			ctx.GetLogger().Debugf("statistics load error: %s", err.Error())
+			return nil
+		}
+		statsProv.mu.Lock()
+		statsProv.setStatDb(strings.ToLower(db.Name()), statsDb)
+		statsProv.mu.Unlock()
+
+		ctx.GetLogger().Debugf("statistics refresh: initialize %s", name)
+		return statsProv.InitAutoRefresh(ctxFactory, name, bThreads)
 	}
 }
 
@@ -54,5 +72,11 @@ func NewDropDatabaseHook(statsProv *Provider, ctxFactory func(ctx context.Contex
 		}
 		statsProv.CancelRefreshThread(name)
 		statsProv.DropDbStats(ctx, name, false)
+
+		if db, ok := statsProv.getStatDb(name); ok {
+			if err := db.Close(); err != nil {
+				ctx.GetLogger().Debugf("failed to close stats database: %s", err)
+			}
+		}
 	}
 }
diff --git a/go/libraries/doltcore/sqle/statspro/interface.go b/go/libraries/doltcore/sqle/statspro/interface.go
new file mode 100644
index 00000000000..c525c625ca5
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/interface.go
@@ -0,0 +1,64 @@
+// Copyright 2024 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
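A small note on the hook plumbing above: NewInitDatabaseHook and NewDropDatabaseHook compose with whatever hook was previously installed by closing over |orig|, so cluster and stats initialization can stack. The shape of that decorator chain reduced to a runnable toy (the hook type here is simplified; the real sqle.InitDatabaseHook takes the provider, env, and database arguments shown above):

    package main

    import "fmt"

    type hook func(name string) error

    // wrap returns a hook that runs |orig| first, then its own work,
    // mirroring how NewInitDatabaseHook delegates to the prior hook.
    func wrap(orig hook, label string) hook {
    	return func(name string) error {
    		if orig != nil {
    			if err := orig(name); err != nil {
    				return err
    			}
    		}
    		fmt.Println(label, name)
    		return nil
    	}
    }

    func main() {
    	var h hook
    	h = wrap(h, "cluster init:")
    	h = wrap(h, "stats init:")
    	_ = h("mydb") // prints "cluster init: mydb", then "stats init: mydb"
    }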
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package statspro
+
+import (
+	"context"
+
+	"github.com/dolthub/go-mysql-server/sql"
+
+	"github.com/dolthub/dolt/go/libraries/doltcore/env"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle"
+	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
+	"github.com/dolthub/dolt/go/libraries/utils/filesys"
+	"github.com/dolthub/dolt/go/store/hash"
+)
+
+// Database is a backing store for a collection of DoltStats.
+// Each stats database tracks a user database, with multiple
+// branches potentially each having their own statistics.
+type Database interface {
+	// ListStatQuals returns the list of statistic qualifiers tracked for a branch.
+	ListStatQuals(branch string) []sql.StatQualifier
+	// LoadBranchStats starts tracking a specific branch's statistics.
+	LoadBranchStats(ctx *sql.Context, branch string) error
+	// DeleteBranchStats removes references to in-memory index statistics.
+	// If |flush| is true, the data is also deleted from storage.
+	DeleteBranchStats(ctx context.Context, branch string, flush bool) error
+	// GetStat returns a branch's index statistics.
+	GetStat(branch string, qual sql.StatQualifier) (*DoltStats, bool)
+	// SetStat bulk replaces the statistic, deleting any previous version.
+	SetStat(ctx context.Context, branch string, qual sql.StatQualifier, stats *DoltStats) error
+	// DeleteStats deletes a list of index statistics.
+	DeleteStats(branch string, quals ...sql.StatQualifier)
+	// ReplaceChunks is an update interface that lets a stats implementation
+	// decide how to edit stats for a stats refresh.
+	ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, dropChunks, newChunks []DoltBucket) error
+	// Flush instructs the database to sync any partial state to disk.
+	Flush(ctx context.Context, branch string) error
+	// Close finalizes any file references.
+	Close() error
+
+	SetLatestHash(branch, tableName string, h hash.Hash)
+	GetLatestHash(branch, tableName string) hash.Hash
+}
+
+// StatsFactory instances construct statistic databases.
+type StatsFactory interface {
+	// Init gets a reference to the stats database for a dolt database
+	// rooted at the given filesystem. It will create the database if
+	// it does not exist.
+	Init(ctx *sql.Context, sourceDb dsess.SqlDatabase, prov *sqle.DoltDatabaseProvider, fs filesys.Filesys, hdp env.HomeDirProvider) (Database, error)
+}
diff --git a/go/libraries/doltcore/sqle/statspro/stats_provider.go b/go/libraries/doltcore/sqle/statspro/stats_provider.go
new file mode 100644
index 00000000000..1008a5f4aa6
--- /dev/null
+++ b/go/libraries/doltcore/sqle/statspro/stats_provider.go
@@ -0,0 +1,326 @@
+// Copyright 2023 Dolthub, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
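Read together, the Database interface implies a lifecycle: a StatsFactory opens (or creates) the per-database store, branch stats are loaded, individual qualifiers are read and written, and Flush persists partial state. A compressed sketch of that call order, assuming |sf|, |sqlDb|, |pro|, and |fs| are wired as in Provider.Configure below, with error handling elided:

    // sketch only: |sf|, |sqlDb|, |pro|, |fs| assumed to be wired
    // as in Provider.Configure; errors elided for brevity
    statsDb, _ := sf.Init(ctx, sqlDb, pro, fs, env.GetCurrentUserHomeDir)
    _ = statsDb.LoadBranchStats(ctx, "main")

    qual := sql.NewStatQualifier("mydb", "xy", "primary")
    if stat, ok := statsDb.GetStat("main", qual); ok {
    	stat.UpdateActive()
    	_ = statsDb.SetStat(ctx, "main", qual, stat)
    }
    _ = statsDb.Flush(ctx, "main")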
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statspro + +import ( + "context" + "errors" + "fmt" + "strings" + "sync" + + "github.com/dolthub/go-mysql-server/sql" + + "github.com/dolthub/dolt/go/libraries/doltcore/env" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle" + "github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess" + "github.com/dolthub/dolt/go/store/hash" + "github.com/dolthub/dolt/go/store/prolly/tree" +) + +var ErrFailedToLoad = errors.New("failed to load statistics") + +type indexMeta struct { + qual sql.StatQualifier + cols []string + newNodes []tree.Node + // updateOrdinals are [start, stop] tuples for each update chunk + updateOrdinals []updateOrdinal + keepChunks []DoltBucket + dropChunks []DoltBucket + allAddrs []hash.Hash +} + +type updateOrdinal struct { + start, stop uint64 +} + +func NewProvider(pro *sqle.DoltDatabaseProvider, sf StatsFactory) *Provider { + return &Provider{ + pro: pro, + sf: sf, + mu: &sync.Mutex{}, + statDbs: make(map[string]Database), + cancelers: make(map[string]context.CancelFunc), + status: make(map[string]string), + } +} + +// Provider is the engine interface for reading and writing index statistics. +// Each database has its own statistics table that all tables/indexes in a db +// share. +type Provider struct { + mu *sync.Mutex + pro *sqle.DoltDatabaseProvider + sf StatsFactory + statDbs map[string]Database + cancelers map[string]context.CancelFunc + starter sqle.InitDatabaseHook + status map[string]string +} + +// each database has one statistics table that is a collection of the +// table stats in the database +type dbToStats struct { + mu *sync.Mutex + dbName string + stats map[sql.StatQualifier]*DoltStats + statsDatabase Database + latestTableHashes map[string]hash.Hash +} + +func newDbStats(dbName string) *dbToStats { + return &dbToStats{ + mu: &sync.Mutex{}, + dbName: dbName, + stats: make(map[sql.StatQualifier]*DoltStats), + latestTableHashes: make(map[string]hash.Hash), + } +} + +var _ sql.StatsProvider = (*Provider)(nil) + +func (p *Provider) StartRefreshThread(ctx *sql.Context, pro dsess.DoltDatabaseProvider, name string, env *env.DoltEnv, db dsess.SqlDatabase) error { + err := p.starter(ctx, pro.(*sqle.DoltDatabaseProvider), name, env, db) + p.mu.Lock() + defer p.mu.Unlock() + + if err != nil { + p.UpdateStatus(name, fmt.Sprintf("error restarting thread %s: %s", name, err.Error())) + return err + } + p.UpdateStatus(name, fmt.Sprintf("restarted thread: %s", name)) + return nil +} + +func (p *Provider) SetStarter(hook sqle.InitDatabaseHook) { + p.starter = hook +} + +func (p *Provider) CancelRefreshThread(dbName string) { + p.mu.Lock() + defer p.mu.Unlock() + if cancel, ok := p.cancelers[dbName]; ok { + cancel() + p.UpdateStatus(dbName, fmt.Sprintf("cancelled thread: %s", dbName)) + } +} + +func (p *Provider) ThreadStatus(dbName string) string { + p.mu.Lock() + defer p.mu.Unlock() + + if msg, ok := p.status[dbName]; ok { + return msg + } + return "no active stats thread" +} + +func (p *Provider) GetTableStats(ctx *sql.Context, db, table string) ([]sql.Statistic, error) { + dSess := dsess.DSessFromSess(ctx.Session) + branch, err := 
dSess.GetBranch()
+	if err != nil {
+		return nil, nil
+	}
+
+	return p.GetTableDoltStats(ctx, branch, db, table)
+}
+
+func (p *Provider) GetTableDoltStats(ctx *sql.Context, branch, db, table string) ([]sql.Statistic, error) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	statDb, ok := p.getStatDb(db)
+	if !ok || statDb == nil {
+		return nil, nil
+	}
+
+	if branch == "" {
+		dSess := dsess.DSessFromSess(ctx.Session)
+		var err error
+		branch, err = dSess.GetBranch()
+		if err != nil {
+			return nil, nil
+		}
+	}
+
+	var ret []sql.Statistic
+	for _, qual := range statDb.ListStatQuals(branch) {
+		if strings.EqualFold(db, qual.Database) && strings.EqualFold(table, qual.Tab) {
+			stat, _ := statDb.GetStat(branch, qual)
+			ret = append(ret, stat.toSql())
+		}
+	}
+
+	return ret, nil
+}
+
+func (p *Provider) setStatDb(name string, db Database) {
+	p.statDbs[name] = db
+}
+
+func (p *Provider) getStatDb(name string) (Database, bool) {
+	statDb, ok := p.statDbs[strings.ToLower(name)]
+	return statDb, ok
+}
+
+func (p *Provider) SetStats(ctx *sql.Context, s sql.Statistic) error {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	statDb, ok := p.getStatDb(s.Qualifier().Db())
+	if !ok {
+		return nil
+	}
+
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return nil
+	}
+
+	doltStat, err := DoltStatsFromSql(s)
+	if err != nil {
+		return err
+	}
+
+	p.UpdateStatus(s.Qualifier().Db(), fmt.Sprintf("refreshed %s", s.Qualifier().Db()))
+
+	return statDb.SetStat(ctx, branch, s.Qualifier(), doltStat)
+}
+
+func (p *Provider) getQualStats(ctx *sql.Context, qual sql.StatQualifier) (*DoltStats, bool) {
+	statDb, ok := p.getStatDb(qual.Db())
+	if !ok {
+		return nil, false
+	}
+
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return nil, false
+	}
+
+	return statDb.GetStat(branch, qual)
+}
+
+func (p *Provider) GetStats(ctx *sql.Context, qual sql.StatQualifier, _ []string) (sql.Statistic, bool) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	stat, ok := p.getQualStats(ctx, qual)
+	if !ok {
+		return nil, false
+	}
+	return stat.toSql(), true
+}
+
+func (p *Provider) DropDbStats(ctx *sql.Context, db string, flush bool) error {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	statDb, ok := p.getStatDb(db)
+	if !ok {
+		return nil
+	}
+
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return err
+	}
+
+	// remove provider access
+	if err := statDb.DeleteBranchStats(ctx, branch, flush); err != nil {
+		return err
+	}
+
+	p.status[db] = "dropped"
+
+	return nil
+}
+
+func (p *Provider) DropStats(ctx *sql.Context, qual sql.StatQualifier, _ []string) error {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	statDb, ok := p.getStatDb(qual.Db())
+	if !ok {
+		return nil
+	}
+
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return nil
+	}
+
+	if _, ok := statDb.GetStat(branch, qual); ok {
+		statDb.DeleteStats(branch, qual)
+		p.UpdateStatus(qual.Db(), fmt.Sprintf("dropped statistic: %s", qual.String()))
+	}
+
+	return nil
+}
+
+func (p *Provider) UpdateStatus(db string, msg string) {
+	p.status[db] = msg
+}
+
+func (p *Provider) RowCount(ctx *sql.Context, db, table string) (uint64, error) {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	statDb, ok := p.getStatDb(db)
+	if !ok {
+		return 0, sql.ErrDatabaseNotFound.New(db)
+	}
+
+	dSess := dsess.DSessFromSess(ctx.Session)
+	branch, err := dSess.GetBranch()
+	if err != nil {
+		return 0, err
+	}
+
+	priStats, ok := statDb.GetStat(branch, 
sql.NewStatQualifier(db, table, "primary")) + if !ok { + return 0, nil + } + + return priStats.RowCount, nil +} + +func (p *Provider) DataLength(ctx *sql.Context, db, table string) (uint64, error) { + p.mu.Lock() + defer p.mu.Unlock() + + statDb, ok := p.getStatDb(db) + if !ok { + return 0, sql.ErrDatabaseNotFound.New(db) + } + + dSess := dsess.DSessFromSess(ctx.Session) + branch, err := dSess.GetBranch() + if err != nil { + return 0, err + } + + priStats, ok := statDb.GetStat(branch, sql.NewStatQualifier(db, table, "primary")) + if !ok { + return 0, nil + } + + return priStats.AvgSize, nil +} diff --git a/go/libraries/doltcore/sqle/stats/update.go b/go/libraries/doltcore/sqle/statspro/update.go similarity index 83% rename from go/libraries/doltcore/sqle/stats/update.go rename to go/libraries/doltcore/sqle/statspro/update.go index 2c97e799903..227f25885ee 100644 --- a/go/libraries/doltcore/sqle/stats/update.go +++ b/go/libraries/doltcore/sqle/statspro/update.go @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -package stats +package statspro import ( "container/heap" "context" "errors" + "fmt" "io" "strings" "time" @@ -27,6 +28,7 @@ import ( "github.com/dolthub/dolt/go/libraries/doltcore/doltdb" "github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable" + "github.com/dolthub/dolt/go/store/hash" "github.com/dolthub/dolt/go/store/prolly" "github.com/dolthub/dolt/go/store/prolly/tree" "github.com/dolthub/dolt/go/store/val" @@ -37,11 +39,11 @@ const ( mcvCnt = 3 ) -// updateStats builds histograms for a list of index statistic metadata. +// createNewStatsBuckets builds histograms for a list of index statistic metadata. // We only read chunk ranges indicated by |indexMeta.updateOrdinals|. If // the returned buckets are a subset of the index the caller is responsible // for reconciling the difference. 
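The comment above is the heart of the incremental scan: only the ordinal ranges named by |indexMeta.updateOrdinals| are read back from the index. Reduced to its skeleton, the read path looks roughly like the following, assuming |prollyMap| is the prolly.Map for the index and |ords| its updateOrdinal slice (a sketch of the loop shape, not the exact function body):

    for _, o := range ords {
    	// read back only the rows in the [start, stop) ordinal range
    	iter, err := prollyMap.IterOrdinalRange(ctx, o.start, o.stop)
    	if err != nil {
    		return err
    	}
    	for {
    		k, _, err := iter.Next(ctx)
    		if errors.Is(err, io.EOF) {
    			break
    		} else if err != nil {
    			return err
    		}
    		_ = k // feed the key into the bucket builder here
    	}
    }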
-func updateStats(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, indexes []sql.Index, idxMetas []indexMeta) (map[sql.StatQualifier]*DoltStats, error) {
+func createNewStatsBuckets(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, indexes []sql.Index, idxMetas []indexMeta) (map[sql.StatQualifier]*DoltStats, error) {
 	nameToIdx := make(map[string]sql.Index)
 	for _, idx := range indexes {
 		nameToIdx[strings.ToLower(idx.ID())] = idx
@@ -85,8 +87,8 @@ func updateStats(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, index
 
 		ret[meta.qual].Types = types
 		ret[meta.qual].Qual = meta.qual
-		ret[meta.qual].fds = fds
-		ret[meta.qual].colSet = colSet
+		ret[meta.qual].Fds = fds
+		ret[meta.qual].ColSet = colSet
 		continue
 	}
 
@@ -95,31 +97,9 @@ func updateStats(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, index
 		return nil, err
 	}
 
-	// find level if not exists
-	if len(meta.updateChunks) == 0 {
-		levelNodes, err := tree.GetHistogramLevel(ctx, prollyMap.Tuples(), bucketLowCnt)
-		if err != nil {
-			return nil, err
-		}
-		var chunks []tree.Node
-		var offsets [][]uint64
-		var offset uint64
-		for _, n := range levelNodes {
-			chunks = append(chunks, n)
-			treeCnt, err := n.TreeCount()
-			if err != nil {
-				return nil, err
-			}
-			offsets = append(offsets, []uint64{offset, offset + uint64(treeCnt)})
-			offset += uint64(treeCnt)
-		}
-		meta.updateChunks = chunks
-		meta.updateOrdinals = offsets
-	}
-
 	updater := newBucketBuilder(meta.qual, len(meta.cols), prollyMap.KeyDesc())
 	ret[meta.qual] = NewDoltStats()
-	ret[meta.qual].chunks = meta.allAddrs
+	ret[meta.qual].Chunks = meta.allAddrs
 	ret[meta.qual].CreatedAt = time.Now()
 	ret[meta.qual].Columns = meta.cols
 	ret[meta.qual].Types = types
@@ -127,12 +107,12 @@ func updateStats(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, index
 	var start, stop uint64
 	// read leaf rows for each bucket
-	for i, chunk := range meta.updateChunks {
+	for i, chunk := range meta.newNodes {
 		// each node is a bucket
 		updater.newBucket()
 
 		// we read exclusive range [node first key, next node first key)
-		start, stop = meta.updateOrdinals[i][0], meta.updateOrdinals[i][1]
+		start, stop = meta.updateOrdinals[i].start, meta.updateOrdinals[i].stop
 		iter, err := prollyMap.IterOrdinalRange(ctx, start, stop)
 		if err != nil {
 			return nil, err
@@ -166,40 +146,40 @@ func updateStats(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Table, index
 	ret[updater.qual].DistinctCount = uint64(updater.globalDistinct)
 	ret[updater.qual].RowCount = uint64(updater.globalCount)
 	ret[updater.qual].LowerBound = firstRow
-	ret[updater.qual].fds = fds
-	ret[updater.qual].colSet = colSet
+	ret[updater.qual].Fds = fds
+	ret[updater.qual].ColSet = colSet
+	ret[updater.qual].UpdateActive()
 	}
 	return ret, nil
 }
 
-func mergeStatUpdates(newStats *DoltStats, idxMeta indexMeta) *DoltStats {
-	if len(newStats.Histogram) == len(idxMeta.allAddrs) {
-		newStats.updateActive()
-		return newStats
+// MergeNewChunks combines a set of old and new chunks to create
+// the desired target histogram. Returns an error if one of |inputHashes|
+// is missing from both |oldChunks| and |newChunks|.
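Before the implementation, a worked example of the contract: MergeNewChunks takes its output order entirely from |inputHashes|, old buckets fill their slots first, new buckets only fill slots still empty, and any unfilled slot is an error. A runnable toy with strings standing in for hash.Hash values:

    package main

    import "fmt"

    type bucket struct{ chunk string }

    // merge mirrors the MergeNewChunks logic below with strings for addresses.
    func merge(input []string, old, fresh []bucket) ([]bucket, error) {
    	pos := make(map[string]int, len(input))
    	for i, h := range input {
    		pos[h] = i
    	}
    	target := make([]bucket, len(input))
    	cnt := 0
    	for _, b := range old {
    		if i, ok := pos[b.chunk]; ok {
    			target[i] = b
    			cnt++
    		}
    	}
    	for _, b := range fresh {
    		if i, ok := pos[b.chunk]; ok && target[i].chunk == "" {
    			target[i] = b
    			cnt++
    		}
    	}
    	if cnt != len(input) {
    		return nil, fmt.Errorf("encountered invalid statistic chunks")
    	}
    	return target, nil
    }

    func main() {
    	out, err := merge(
    		[]string{"a", "b", "c"},
    		[]bucket{{"a"}, {"c"}}, // kept from the previous histogram
    		[]bucket{{"b"}},        // freshly scanned
    	)
    	fmt.Println(out, err) // [{a} {b} {c}] <nil>
    }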
+func MergeNewChunks(inputHashes []hash.Hash, oldChunks, newChunks []DoltBucket) ([]DoltBucket, error) { + hashToPos := make(map[hash.Hash]int, len(inputHashes)) + for i, h := range inputHashes { + hashToPos[h] = i } - oldHist := idxMeta.preexisting - var mergeHist DoltHistogram - newHist := newStats.Histogram - var i, j int - for _, chunkAddr := range idxMeta.allAddrs { - if i < len(oldHist) && oldHist[i].Chunk == chunkAddr { - mergeHist = append(mergeHist, oldHist[i]) - i++ - } else if j < len(newHist) && newHist[j].Chunk == chunkAddr { - mergeHist = append(mergeHist, newHist[j]) - j++ + + var cnt int + targetBuckets := make([]DoltBucket, len(inputHashes)) + for _, c := range oldChunks { + if idx, ok := hashToPos[c.Chunk]; ok { + cnt++ + targetBuckets[idx] = c } } - - if len(mergeHist) == 0 { - return newStats + for _, c := range newChunks { + if idx, ok := hashToPos[c.Chunk]; ok && targetBuckets[idx].Chunk.IsEmpty() { + cnt++ + targetBuckets[idx] = c + } } - - newStats.Histogram = mergeHist - newStats.chunks = idxMeta.allAddrs - newStats.updateActive() - newStats.updateCounts() - return newStats + if cnt != len(inputHashes) { + return nil, fmt.Errorf("encountered invalid statistic chunks") + } + return targetBuckets, nil } func firstRowForIndex(ctx *sql.Context, prollyMap prolly.Map, keyBuilder *val.TupleBuilder, prefixLen int) (sql.Row, error) { diff --git a/go/libraries/doltcore/sqle/stats/update_test.go b/go/libraries/doltcore/sqle/statspro/update_test.go similarity index 99% rename from go/libraries/doltcore/sqle/stats/update_test.go rename to go/libraries/doltcore/sqle/statspro/update_test.go index 0a20aec176f..e4b3935473f 100644 --- a/go/libraries/doltcore/sqle/stats/update_test.go +++ b/go/libraries/doltcore/sqle/statspro/update_test.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package stats +package statspro import ( "container/heap" diff --git a/go/libraries/doltcore/sqle/system_variables.go b/go/libraries/doltcore/sqle/system_variables.go index 4cc46eb86b1..6180fc70c77 100644 --- a/go/libraries/doltcore/sqle/system_variables.go +++ b/go/libraries/doltcore/sqle/system_variables.go @@ -15,6 +15,8 @@ package sqle import ( + "math" + "github.com/dolthub/go-mysql-server/sql" "github.com/dolthub/go-mysql-server/sql/types" _ "github.com/dolthub/go-mysql-server/sql/variables" @@ -227,9 +229,16 @@ func AddDoltSystemVariables() { Name: dsess.DoltStatsAutoRefreshInterval, Dynamic: true, Scope: sql.GetMysqlScope(sql.SystemVariableScope_Global), - Type: types.NewSystemIntType(dsess.DoltStatsAutoRefreshInterval, 0, 1<<10, false), + Type: types.NewSystemIntType(dsess.DoltStatsAutoRefreshInterval, 0, math.MaxInt, false), Default: 120, }, + &sql.MysqlSystemVariable{ + Name: dsess.DoltStatsBranches, + Dynamic: true, + Scope: sql.GetMysqlScope(sql.SystemVariableScope_Global), + Type: types.NewSystemStringType(dsess.DoltStatsBranches), + Default: "", + }, }) } diff --git a/go/store/prolly/tuple_mutable_map.go b/go/store/prolly/tuple_mutable_map.go index d020c96f0f6..a90f9439bfb 100644 --- a/go/store/prolly/tuple_mutable_map.go +++ b/go/store/prolly/tuple_mutable_map.go @@ -190,6 +190,13 @@ func (mut *MutableMap) IterAll(ctx context.Context) (MapIter, error) { return mut.IterRange(ctx, rng) } +// IterKeyRange iterates over a physical key range defined by |start| and +// |stop|. If |start| and/or |stop| is nil, the range will be open +// towards that end. 
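Per the doc comment above, nil bounds leave the range open on that side, so a full scan and a suffix scan differ only in their arguments. A two-line usage sketch, assuming |m| is a *prolly.MutableMap and |start| a previously built val.Tuple:

    // full scan: both ends open
    all, _ := m.IterKeyRange(ctx, nil, nil)

    // suffix scan: every pair at or after |start|
    tail, _ := m.IterKeyRange(ctx, start, nil)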
+func (mut *MutableMap) IterKeyRange(ctx context.Context, start, stop val.Tuple) (MapIter, error) {
+	return mut.tuples.Static.IterKeyRange(ctx, start, stop)
+}
+
 // IterRange returns a MapIter that iterates over a Range.
 func (mut *MutableMap) IterRange(ctx context.Context, rng Range) (MapIter, error) {
 	treeIter, err := treeIterFromRange(ctx, mut.tuples.Static.Root, mut.tuples.Static.NodeStore, rng)
diff --git a/integration-tests/bats/stats.bats b/integration-tests/bats/stats.bats
index e8c56b67b12..95fd18ee787 100644
--- a/integration-tests/bats/stats.bats
+++ b/integration-tests/bats/stats.bats
@@ -47,24 +47,23 @@ teardown() {
     sleep 1
     stop_sql_server
 
-    # no statistics error if ref does not exist
-    run dolt sql -r csv -q "select database_name, table_name, index_name from dolt_statistics"
-    [ "$status" -eq 1 ]
-    [[ "$output" =~ "no statistics found" ]] || false
+    run dolt sql -r csv -q "select count(*) from dolt_statistics"
+    [ "$status" -eq 0 ]
+    [ "${lines[1]}" = "0" ]
 
     # setting variables doesn't hang or error
     dolt sql -q "set @@PERSIST.dolt_stats_auto_refresh_enabled = 1;"
     dolt sql -q "set @@PERSIST.dolt_stats_auto_refresh_threshold = .5"
     dolt sql -q "set @@PERSIST.dolt_stats_auto_refresh_interval = 1;"
 
-    # auto refresh can only initialize at server startup
+    # auto refresh initializes at server startup
    start_sql_server
 
     # need to trigger at least one refresh cycle
     sleep 1
 
     # only statistics for non-empty tables are collected
-    run dolt sql -r csv -q "select database_name, table_name, index_name from dolt_statistics"
+    run dolt sql -r csv -q "select database_name, table_name, index_name from dolt_statistics order by index_name"
     [ "$status" -eq 0 ]
     [ "${lines[0]}" = "database_name,table_name,index_name" ]
     [ "${lines[1]}" = "repo2,xy,primary" ]
@@ -84,7 +83,7 @@ teardown() {
 
     sleep 1
 
     run dolt sql -r csv -q "select count(*) from dolt_statistics"
     [ "$status" -eq 0 ]
     [ "${lines[1]}" = "8" ]
 }
@@ -108,7 +107,7 @@ teardown() {
     [ "${lines[1]}" = "8" ]
 
     # delete >50% of rows
-    dolt sql -q "delete from xy where x > 500"
+    dolt sql -q "delete from xy where x > 600"
 
     sleep 1
 
@@ -251,9 +250,9 @@ teardown() {
     sleep 1
     stop_sql_server
 
-    run dolt sql -r csv -q "select database_name, table_name, index_name from dolt_statistics"
-    [ "$status" -eq 1 ]
-    [[ "$output" =~ "no statistics found" ]] || false
+    run dolt sql -r csv -q "select count(*) from dolt_statistics"
+    [ "$status" -eq 0 ]
+    [ "${lines[1]}" = "0" ]
 
     dolt sql -q "SET @@persist.dolt_stats_auto_refresh_enabled = 1;"
     dolt sql -q "SET @@persist.dolt_stats_auto_refresh_threshold = 0.5"
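One last end-to-end note: the bats tests above drive refresh through persisted system variables, and the new dolt_stats_branches variable accepts a comma-separated list with optional whitespace, mirroring getStatsBranches earlier in the patch. That parsing reduced to a runnable toy:

    package main

    import (
    	"fmt"
    	"strings"
    )

    // parseBranches mirrors the branch-list splitting in getStatsBranches.
    func parseBranches(bs string) []string {
    	var branches []string
    	for _, b := range strings.Split(bs, ",") {
    		branches = append(branches, strings.TrimSpace(b))
    	}
    	return branches
    }

    func main() {
    	fmt.Printf("%q\n", parseBranches("main, feature1,feature2"))
    	// ["main" "feature1" "feature2"]
    }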