From 62a74c4774afc824caf60cfae69f92b6f221256d Mon Sep 17 00:00:00 2001 From: Thorsten Klein Date: Mon, 13 Jan 2025 16:39:56 +0000 Subject: [PATCH] enhance: (knowledge) only preload files on datasets if asked for (#336) --- knowledge/pkg/client/client.go | 2 +- knowledge/pkg/client/common.go | 2 +- knowledge/pkg/client/standalone.go | 4 ++-- knowledge/pkg/cmd/edit_dataset.go | 2 +- knowledge/pkg/cmd/export.go | 2 +- knowledge/pkg/cmd/get_dataset.go | 3 ++- knowledge/pkg/datastore/dataset.go | 6 +++--- knowledge/pkg/datastore/ingest.go | 2 +- knowledge/pkg/datastore/retrieve.go | 2 +- knowledge/pkg/datastore/retrievers/bm25.go | 2 +- knowledge/pkg/datastore/retrievers/retrievers.go | 2 +- knowledge/pkg/datastore/retrievers/routing.go | 2 +- knowledge/pkg/datastore/retrievers/subquery.go | 2 +- knowledge/pkg/datastore/store/store.go | 2 +- knowledge/pkg/index/db.go | 15 ++++++++++++++- knowledge/pkg/index/index.go | 2 +- knowledge/pkg/index/postgres/postgres.go | 4 ++-- knowledge/pkg/index/sqlite/sqlite.go | 4 ++-- knowledge/pkg/index/types/models.go | 4 ++++ knowledge/pkg/index/types/types.go | 16 ++++++++++++++-- 20 files changed, 55 insertions(+), 25 deletions(-) diff --git a/knowledge/pkg/client/client.go b/knowledge/pkg/client/client.go index ecd8be3d..be0523f0 100644 --- a/knowledge/pkg/client/client.go +++ b/knowledge/pkg/client/client.go @@ -35,7 +35,7 @@ type IngestPathsOpts struct { type Client interface { CreateDataset(ctx context.Context, datasetID string, opts *types2.DatasetCreateOpts) (*types2.Dataset, error) DeleteDataset(ctx context.Context, datasetID string) error - GetDataset(ctx context.Context, datasetID string) (*types2.Dataset, error) + GetDataset(ctx context.Context, datasetID string, opts *types2.DatasetGetOpts) (*types2.Dataset, error) FindFile(ctx context.Context, searchFile types2.File) (*types2.File, error) DeleteFile(ctx context.Context, datasetID, fileID string) error ListDatasets(ctx context.Context) ([]types2.Dataset, error) diff --git a/knowledge/pkg/client/common.go b/knowledge/pkg/client/common.go index 830d1715..67747533 100644 --- a/knowledge/pkg/client/common.go +++ b/knowledge/pkg/client/common.go @@ -277,7 +277,7 @@ func AskDir(ctx context.Context, c Client, path string, query string, opts *Inge func getOrCreateDataset(ctx context.Context, c Client, datasetID string, create bool) (*types.Dataset, error) { var ds *types.Dataset var err error - ds, err = c.GetDataset(ctx, datasetID) + ds, err = c.GetDataset(ctx, datasetID, nil) if err != nil { return nil, err } diff --git a/knowledge/pkg/client/standalone.go b/knowledge/pkg/client/standalone.go index adbbc80f..6cf5a58d 100644 --- a/knowledge/pkg/client/standalone.go +++ b/knowledge/pkg/client/standalone.go @@ -53,8 +53,8 @@ func (c *StandaloneClient) DeleteDataset(ctx context.Context, datasetID string) return c.Datastore.DeleteDataset(ctx, datasetID) } -func (c *StandaloneClient) GetDataset(ctx context.Context, datasetID string) (*types2.Dataset, error) { - return c.Datastore.GetDataset(ctx, datasetID) +func (c *StandaloneClient) GetDataset(ctx context.Context, datasetID string, opts *types2.DatasetGetOpts) (*types2.Dataset, error) { + return c.Datastore.GetDataset(ctx, datasetID, opts) } func (c *StandaloneClient) ListDatasets(ctx context.Context) ([]types2.Dataset, error) { diff --git a/knowledge/pkg/cmd/edit_dataset.go b/knowledge/pkg/cmd/edit_dataset.go index 97a7fedd..13dd1f7b 100644 --- a/knowledge/pkg/cmd/edit_dataset.go +++ b/knowledge/pkg/cmd/edit_dataset.go @@ -33,7 +33,7 @@ func (s *ClientEditDataset) Run(cmd *cobra.Command, args []string) error { datasetID := args[0] // Get current dataset - dataset, err := c.GetDataset(cmd.Context(), datasetID) + dataset, err := c.GetDataset(cmd.Context(), datasetID, nil) if err != nil { return fmt.Errorf("failed to get dataset: %w", err) } diff --git a/knowledge/pkg/cmd/export.go b/knowledge/pkg/cmd/export.go index 5ea8ce3b..b0c6191b 100644 --- a/knowledge/pkg/cmd/export.go +++ b/knowledge/pkg/cmd/export.go @@ -42,7 +42,7 @@ func (s *ClientExportDatasets) Run(cmd *cobra.Command, args []string) error { } } else { for _, datasetID := range dsnames { - ds, err := c.GetDataset(cmd.Context(), datasetID) + ds, err := c.GetDataset(cmd.Context(), datasetID, nil) if err != nil { return err } diff --git a/knowledge/pkg/cmd/get_dataset.go b/knowledge/pkg/cmd/get_dataset.go index b770e5dc..bd52b50d 100644 --- a/knowledge/pkg/cmd/get_dataset.go +++ b/knowledge/pkg/cmd/get_dataset.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" + "github.com/gptscript-ai/knowledge/pkg/index/types" "github.com/spf13/cobra" ) @@ -28,7 +29,7 @@ func (s *ClientGetDataset) Run(cmd *cobra.Command, args []string) error { datasetID := args[0] - ds, err := c.GetDataset(cmd.Context(), datasetID) + ds, err := c.GetDataset(cmd.Context(), datasetID, &types.DatasetGetOpts{IncludeFiles: true}) if err != nil { return fmt.Errorf("failed to get dataset: %w", err) } diff --git a/knowledge/pkg/datastore/dataset.go b/knowledge/pkg/datastore/dataset.go index c82ba101..45cfdef7 100644 --- a/knowledge/pkg/datastore/dataset.go +++ b/knowledge/pkg/datastore/dataset.go @@ -41,8 +41,8 @@ func (s *Datastore) DeleteDataset(ctx context.Context, datasetID string) error { return nil } -func (s *Datastore) GetDataset(ctx context.Context, datasetID string) (*types.Dataset, error) { - return s.Index.GetDataset(ctx, datasetID) +func (s *Datastore) GetDataset(ctx context.Context, datasetID string, opts *types.DatasetGetOpts) (*types.Dataset, error) { + return s.Index.GetDataset(ctx, datasetID, opts) } func (s *Datastore) ListDatasets(ctx context.Context) ([]types.Dataset, error) { @@ -61,7 +61,7 @@ func (s *Datastore) UpdateDataset(ctx context.Context, updatedDataset types.Data return origDS, fmt.Errorf("dataset ID is required") } - origDS, err = s.GetDataset(ctx, updatedDataset.ID) + origDS, err = s.GetDataset(ctx, updatedDataset.ID, nil) if err != nil { return origDS, err } diff --git a/knowledge/pkg/datastore/ingest.go b/knowledge/pkg/datastore/ingest.go index b25867bb..277e6c99 100644 --- a/knowledge/pkg/datastore/ingest.go +++ b/knowledge/pkg/datastore/ingest.go @@ -39,7 +39,7 @@ func (s *Datastore) Ingest(ctx context.Context, datasetID string, filename strin statusLog := log.FromCtx(ctx).With("phase", "store") // Get dataset - ds, err := s.GetDataset(ctx, datasetID) + ds, err := s.GetDataset(ctx, datasetID, nil) if err != nil { return nil, err } diff --git a/knowledge/pkg/datastore/retrieve.go b/knowledge/pkg/datastore/retrieve.go index 5a520e59..8f77f1ee 100644 --- a/knowledge/pkg/datastore/retrieve.go +++ b/knowledge/pkg/datastore/retrieve.go @@ -73,7 +73,7 @@ func (s *Datastore) Retrieve(ctx context.Context, datasetIDs []string, query str } func (s *Datastore) SimilaritySearch(ctx context.Context, query string, numDocuments int, datasetID string, where map[string]string, whereDocument []chromem.WhereDocument) ([]types2.Document, error) { - ds, err := s.GetDataset(ctx, datasetID) + ds, err := s.GetDataset(ctx, datasetID, nil) if err != nil { return nil, err } diff --git a/knowledge/pkg/datastore/retrievers/bm25.go b/knowledge/pkg/datastore/retrievers/bm25.go index c8fa1f38..4d83ac51 100644 --- a/knowledge/pkg/datastore/retrievers/bm25.go +++ b/knowledge/pkg/datastore/retrievers/bm25.go @@ -44,7 +44,7 @@ func (r *BM25Retriever) Retrieve(ctx context.Context, store store.Store, query s for _, datasetID := range datasetIDs { // TODO: make configurable via RetrieveOpts // silently ignore non-existent datasets - ds, err := store.GetDataset(ctx, datasetID) + ds, err := store.GetDataset(ctx, datasetID, nil) if err != nil { if strings.HasPrefix(err.Error(), "dataset not found") { slog.Info("Dataset not found", "dataset", datasetID) diff --git a/knowledge/pkg/datastore/retrievers/retrievers.go b/knowledge/pkg/datastore/retrievers/retrievers.go index 693770c4..6ccf4e1b 100644 --- a/knowledge/pkg/datastore/retrievers/retrievers.go +++ b/knowledge/pkg/datastore/retrievers/retrievers.go @@ -86,7 +86,7 @@ func (r *BasicRetriever) Retrieve(ctx context.Context, store store.Store, query for _, dataset := range datasetIDs { // TODO: make configurable via RetrieveOpts // silently ignore non-existent datasets - ds, err := store.GetDataset(ctx, dataset) + ds, err := store.GetDataset(ctx, dataset, nil) if err != nil { if strings.HasPrefix(err.Error(), "dataset not found") { continue diff --git a/knowledge/pkg/datastore/retrievers/routing.go b/knowledge/pkg/datastore/retrievers/routing.go index d58a2f95..cc6d9250 100644 --- a/knowledge/pkg/datastore/retrievers/routing.go +++ b/knowledge/pkg/datastore/retrievers/routing.go @@ -69,7 +69,7 @@ func (r *RoutingRetriever) Retrieve(ctx context.Context, store store.Store, quer datasets := map[string]map[string]any{} for _, dsID := range r.AvailableDatasets { - dataset, err := store.GetDataset(ctx, dsID) + dataset, err := store.GetDataset(ctx, dsID, nil) if err != nil { return nil, err } diff --git a/knowledge/pkg/datastore/retrievers/subquery.go b/knowledge/pkg/datastore/retrievers/subquery.go index 12d1d8c5..5c6bf773 100644 --- a/knowledge/pkg/datastore/retrievers/subquery.go +++ b/knowledge/pkg/datastore/retrievers/subquery.go @@ -91,7 +91,7 @@ func (s *SubqueryRetriever) Retrieve(ctx context.Context, store store.Store, que for _, dataset := range datasetIDs { // TODO: make configurable via RetrieveOpts // silently ignore non-existent datasets - ds, err := store.GetDataset(ctx, dataset) + ds, err := store.GetDataset(ctx, dataset, nil) if err != nil { if strings.HasPrefix(err.Error(), "dataset not found") { continue diff --git a/knowledge/pkg/datastore/store/store.go b/knowledge/pkg/datastore/store/store.go index 3ca9c675..685c684e 100644 --- a/knowledge/pkg/datastore/store/store.go +++ b/knowledge/pkg/datastore/store/store.go @@ -10,7 +10,7 @@ import ( type Store interface { ListDatasets(ctx context.Context) ([]types.Dataset, error) - GetDataset(ctx context.Context, datasetID string) (*types.Dataset, error) + GetDataset(ctx context.Context, datasetID string, opts *types.DatasetGetOpts) (*types.Dataset, error) SimilaritySearch(ctx context.Context, query string, numDocuments int, collection string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error) GetDocuments(ctx context.Context, datasetID string, where map[string]string, whereDocument []chromem.WhereDocument) ([]vs.Document, error) } diff --git a/knowledge/pkg/index/db.go b/knowledge/pkg/index/db.go index c6d020d8..96bf81b9 100644 --- a/knowledge/pkg/index/db.go +++ b/knowledge/pkg/index/db.go @@ -5,6 +5,7 @@ import ( "fmt" "log" "log/slog" + "os" "strings" "time" @@ -15,6 +16,18 @@ import ( ) func New(ctx context.Context, dsn string, autoMigrate bool) (Index, error) { + gormLogLevel := logger.Silent + switch os.Getenv("GORM_LOG_LEVEL") { + case "silent": + gormLogLevel = logger.Silent + case "error": + gormLogLevel = logger.Error + case "warn": + gormLogLevel = logger.Warn + case "info": + gormLogLevel = logger.Info + } + var ( indexDB Index err error @@ -22,7 +35,7 @@ func New(ctx context.Context, dsn string, autoMigrate bool) (Index, error) { Logger: logger.New(log.Default(), logger.Config{ SlowThreshold: 200 * time.Millisecond, Colorful: true, - LogLevel: logger.Silent, + LogLevel: gormLogLevel, }), TranslateError: true, } diff --git a/knowledge/pkg/index/index.go b/knowledge/pkg/index/index.go index f8074a6a..fcb27ad3 100644 --- a/knowledge/pkg/index/index.go +++ b/knowledge/pkg/index/index.go @@ -12,7 +12,7 @@ type Index interface { // Fundamental Dataset Operations CreateDataset(ctx context.Context, dataset types.Dataset, opts *types.DatasetCreateOpts) error - GetDataset(ctx context.Context, datasetID string) (*types.Dataset, error) + GetDataset(ctx context.Context, datasetID string, opts *types.DatasetGetOpts) (*types.Dataset, error) ListDatasets(ctx context.Context) ([]types.Dataset, error) DeleteDataset(ctx context.Context, datasetID string) error diff --git a/knowledge/pkg/index/postgres/postgres.go b/knowledge/pkg/index/postgres/postgres.go index 79782071..ae4346c1 100644 --- a/knowledge/pkg/index/postgres/postgres.go +++ b/knowledge/pkg/index/postgres/postgres.go @@ -59,8 +59,8 @@ func (i *Index) CreateDataset(ctx context.Context, dataset types.Dataset, opts * return i.DB.CreateDataset(ctx, dataset, opts) } -func (i *Index) GetDataset(ctx context.Context, datasetID string) (*types.Dataset, error) { - return i.DB.GetDataset(ctx, datasetID) +func (i *Index) GetDataset(ctx context.Context, datasetID string, opts *types.DatasetGetOpts) (*types.Dataset, error) { + return i.DB.GetDataset(ctx, datasetID, opts) } func (i *Index) ListDatasets(ctx context.Context) ([]types.Dataset, error) { diff --git a/knowledge/pkg/index/sqlite/sqlite.go b/knowledge/pkg/index/sqlite/sqlite.go index f02fd2b8..b6ade402 100644 --- a/knowledge/pkg/index/sqlite/sqlite.go +++ b/knowledge/pkg/index/sqlite/sqlite.go @@ -143,8 +143,8 @@ func (i *Index) CreateDataset(ctx context.Context, dataset types.Dataset, opts * return i.DB.CreateDataset(ctx, dataset, opts) } -func (i *Index) GetDataset(ctx context.Context, datasetID string) (*types.Dataset, error) { - return i.DB.GetDataset(ctx, datasetID) +func (i *Index) GetDataset(ctx context.Context, datasetID string, opts *types.DatasetGetOpts) (*types.Dataset, error) { + return i.DB.GetDataset(ctx, datasetID, opts) } func (i *Index) ListDatasets(ctx context.Context) ([]types.Dataset, error) { diff --git a/knowledge/pkg/index/types/models.go b/knowledge/pkg/index/types/models.go index 12e4e7f8..7d7fad61 100644 --- a/knowledge/pkg/index/types/models.go +++ b/knowledge/pkg/index/types/models.go @@ -10,6 +10,10 @@ type DatasetCreateOpts struct { ErrOnExists bool } +type DatasetGetOpts struct { + IncludeFiles bool +} + // Dataset refers to a VectorDB data space. // @Description Dataset refers to a VectorDB data space. type Dataset struct { diff --git a/knowledge/pkg/index/types/types.go b/knowledge/pkg/index/types/types.go index cee59d1e..5732afd8 100644 --- a/knowledge/pkg/index/types/types.go +++ b/knowledge/pkg/index/types/types.go @@ -81,9 +81,21 @@ func (db *DB) DeleteDataset(ctx context.Context, datasetID string) error { return nil } -func (db *DB) GetDataset(ctx context.Context, datasetID string) (*Dataset, error) { +func (db *DB) GetDataset(ctx context.Context, datasetID string, opts *DatasetGetOpts) (*Dataset, error) { dataset := &Dataset{} - tx := db.WithContext(ctx).Preload("Files.Documents").First(dataset, "id = ?", datasetID) + tx := db.WithContext(ctx) + + if opts == nil { + opts = &DatasetGetOpts{ + IncludeFiles: false, + } + } + + if opts.IncludeFiles { + tx = tx.Preload("Files.Documents") + } + + tx = tx.First(dataset, "id = ?", datasetID) if tx.Error != nil { if errors.Is(tx.Error, gorm.ErrRecordNotFound) { return nil, nil