Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow for paging through more results than the limit set by index.max_result_window #67

Open
alexashley opened this issue Apr 8, 2021 · 1 comment

Comments

@alexashley
Copy link
Contributor

Came out of this discussion.

The documentation that Elasticsearch provides on pagination makes it sound like there is a hard cap on the number of results that can be paged through using from and size:

By default, you cannot use from and size to page through more than 10,000 hits. This limit is a safeguard set by the index.max_result_window index setting. If you need to page through more than 10,000 hits, use the search_after parameter instead.

We need to determine if that's the case or not by loading a number of notes or occurrences greater than index.max_result_window and attempting to page through them.

If it is, we'll need to make some changes to grab the sort value from the last hit in the results, encode that in the page token, and send it along in future requests as the search_after parameter.

@alexashley
Copy link
Contributor Author

Used this script to bulk load 15,000 occurrences and try to page through them:

bulk.go
package main

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"time"

	"github.com/brianvoe/gofakeit/v6"
	"github.com/grafeas/grafeas/proto/v1beta1/build_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/common_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/grafeas_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/provenance_go_proto"
	"github.com/grafeas/grafeas/proto/v1beta1/source_go_proto"
	"google.golang.org/grpc"
)

const (
	// chunkSize is the number of occurrences sent per BatchCreateOccurrences call.
	chunkSize = 1000
	// numberOfOccurrences deliberately exceeds Elasticsearch's default
	// index.max_result_window (10,000) to reproduce the paging limit.
	numberOfOccurrences = 15000
	// project is the Grafeas project name used for all created resources.
	project = "rode"
	// grafeasUrl is the host:port of the local Grafeas server (gRPC and HTTP).
	grafeasUrl = "localhost:8080"
)

var (
	// fake is a seeded generator so runs produce deterministic test data.
	fake = gofakeit.New(0)
)

// main drives the experiment: connect to Grafeas, create the project,
// bulk-load occurrences, then page through them until the server errors
// or the result set is exhausted.
func main() {
	// Dial first so we fail fast if Grafeas is unreachable.
	conn, client := createGrafeasClient()
	defer conn.Close()

	// The project must exist before occurrences can be attached to it.
	createProject()
	log.Println("created project")

	// Load more occurrences than index.max_result_window allows.
	loadOccurrences(client)
	log.Println("loaded occurrences")

	// Walk the pages to see where pagination breaks down.
	pageThroughOccurrences(client)
}

// createProject registers the test project via the Grafeas HTTP API.
// It terminates the process on a transport error or a non-200 response.
func createProject() {
	client := http.Client{
		Timeout: time.Minute,
	}

	projectPayload := map[string]string{
		"name": "projects/" + project,
	}

	response, err := client.Post(fmt.Sprintf("%s/v1beta1/projects", "http://"+grafeasUrl), "application/json", jsonBody(&projectPayload))
	if err != nil {
		log.Fatal("error creating project", err)
	}
	// Fix: the response body was never closed, leaking the underlying
	// connection; Close it so the transport can reuse/release it.
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		log.Fatal("unexpected response creating project", response.StatusCode)
	}
}

// createGrafeasClient dials the Grafeas gRPC endpoint and returns the raw
// connection (for the caller to Close) alongside a v1beta1 client.
// The process exits if the connection cannot be established.
func createGrafeasClient() (*grpc.ClientConn, grafeas_go_proto.GrafeasV1Beta1Client) {
	// Fix: WithBlock makes DialContext wait until the connection is up;
	// with a bare context.Background() that wait is unbounded and the
	// script hangs forever when Grafeas is down. Bound it with a timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	connection, err := grpc.DialContext(ctx, grafeasUrl, grpc.WithInsecure(), grpc.WithBlock())
	if err != nil {
		log.Fatal("error creating grafeas client", err)
	}
	grafeasClient := grafeas_go_proto.NewGrafeasV1Beta1Client(connection)

	return connection, grafeasClient
}

// loadOccurrences generates numberOfOccurrences random build occurrences and
// inserts them into Grafeas in batches of chunkSize, exiting on any RPC error.
func loadOccurrences(client grafeas_go_proto.GrafeasV1Beta1Client) {
	occurrences := make([]*grafeas_go_proto.Occurrence, 0, numberOfOccurrences)
	for i := 0; i < numberOfOccurrences; i++ {
		occurrences = append(occurrences, createRandomBuildOccurrence())
	}

	// Send the data in chunkSize slices; the final batch may be smaller.
	for start := 0; start < len(occurrences); start += chunkSize {
		end := start + chunkSize
		if end > len(occurrences) {
			end = len(occurrences)
		}

		_, err := client.BatchCreateOccurrences(context.Background(), &grafeas_go_proto.BatchCreateOccurrencesRequest{
			Parent:      "projects/" + project,
			Occurrences: occurrences[start:end],
		})
		if err != nil {
			log.Fatal("error batch creating occurrences", err)
		}
	}
}

// pageThroughOccurrences repeatedly lists occurrences 1000 at a time,
// following NextPageToken until an empty page signals the end of the
// result set. Any listing error aborts the process.
func pageThroughOccurrences(client grafeas_go_proto.GrafeasV1Beta1Client) {
	token := ""
	for page := 1; ; page++ {
		log.Println("requesting page", page)

		response, err := client.ListOccurrences(context.Background(), &grafeas_go_proto.ListOccurrencesRequest{
			Parent:    "projects/" + project,
			PageSize:  1000,
			PageToken: token,
		})
		if err != nil {
			log.Fatal("failed to list occurrences", err)
		}

		token = response.NextPageToken
		log.Printf("got %d occurrences\n", len(response.Occurrences))

		// An empty page marks the end of the data.
		if len(response.Occurrences) == 0 {
			log.Println("reached the end of the result set")
			return
		}
	}
}

// createRandomBuildOccurrence fabricates a BUILD-kind occurrence populated
// with deterministic fake data from the seeded generator.
//
// NOTE: fake is seeded, so the order of fake.* calls below intentionally
// mirrors the original field order to keep generated values identical.
func createRandomBuildOccurrence() *grafeas_go_proto.Occurrence {
	occurrenceName := fake.Name()
	resourceUri := fake.URL()
	noteId := fake.UUID()

	provenance := &provenance_go_proto.BuildProvenance{
		Id:        fake.UUID(),
		ProjectId: "projects/rode",
		BuiltArtifacts: []*provenance_go_proto.Artifact{
			{
				Checksum: fake.LetterN(5),
				Id:       fake.UUID(),
				Names: []string{
					fake.URL(),
					fake.URL(),
				},
			},
		},
		SourceProvenance: &provenance_go_proto.Source{
			ArtifactStorageSourceUri: fake.URL(),
			Context: &source_go_proto.SourceContext{
				Context: &source_go_proto.SourceContext_Git{
					Git: &source_go_proto.GitSourceContext{
						Url:        fake.URL(),
						RevisionId: fake.LetterN(7),
					},
				},
			},
		},
	}

	return &grafeas_go_proto.Occurrence{
		Name: occurrenceName,
		Resource: &grafeas_go_proto.Resource{
			Uri: resourceUri,
		},
		NoteName: fmt.Sprintf("projects/%s/notes/%s", project, noteId),
		Kind:     common_go_proto.NoteKind_BUILD,
		Details: &grafeas_go_proto.Occurrence_Build{
			Build: &build_go_proto.Details{
				Provenance: provenance,
			},
		},
	}
}

// jsonBody marshals val to JSON and returns a reader over the encoded
// bytes, suitable for use as an HTTP request body. A marshaling failure
// terminates the process.
func jsonBody(val interface{}) io.Reader {
	encoded, err := json.Marshal(val)
	if err != nil {
		log.Fatal("serialization error", err)
	}

	return bytes.NewReader(encoded)
}
output
$ go run bulk.go 
WARNING: Package "github.com/golang/protobuf/protoc-gen-go/generator" is deprecated.
	A future release of golang/protobuf will delete this package,
	which has long been excluded from the compatibility promise.

2021/04/09 16:52:11 requesting page 1
2021/04/09 16:52:11 got 1000 occurrences
2021/04/09 16:52:11 requesting page 2
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 3
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 4
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 5
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 6
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 7
2021/04/09 16:52:12 got 1000 occurrences
2021/04/09 16:52:12 requesting page 8
2021/04/09 16:52:13 got 1000 occurrences
2021/04/09 16:52:13 requesting page 9
2021/04/09 16:52:13 got 1000 occurrences
2021/04/09 16:52:13 requesting page 10
2021/04/09 16:52:13 got 1000 occurrences
2021/04/09 16:52:13 requesting page 11
2021/04/09 16:52:13 failed to list occurrencesrpc error: code = Internal desc = unexpected response from elasticsearch
exit status 1

On the 11th page, this error is returned from Elasticsearch:

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting."
      }
    ],
    "type": "search_phase_execution_exception",
    "reason": "all shards failed",
    "phase": "query",
    "grouped": true,
    "failed_shards": [
      {
        "shard": 0,
        "index": "grafeas-v1beta2-rode-occurrences",
        "node": "y40fPpNDRm648olC-Ut-tA",
        "reason": {
          "type": "illegal_argument_exception",
          "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting."
        }
      }
    ],
    "caused_by": {
      "type": "illegal_argument_exception",
      "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting.",
      "caused_by": {
        "type": "illegal_argument_exception",
        "reason": "Result window is too large, from + size must be less than or equal to: [10000] but was [11000]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting."
      }
    }
  },
  "status": 400
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant