Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use check statuses 1h/1d summary to get the best window for graph #402

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 107 additions & 2 deletions query/check_details.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,32 @@ import (
"github.com/samber/lo"
)

var (
const (
// Maximum number of past checks in the in-memory cache
DefaultCacheCount = 5

// Default search window
DefaultCheckQueryWindow = "1h"

// The number of data points that should be strived for
// when aggregating check statuses.
desiredNumOfCheckStatuses = 100
)

var (
// allowed list of window durations that are used when aggregating check statuses.
allowedWindows = []time.Duration{
time.Minute, // 1m
time.Minute * 5, // 5m
time.Minute * 15, // 15m
time.Minute * 30, // 30m
time.Hour, // 1h
time.Hour * 3, // 3h
time.Hour * 6, // 6h
time.Hour * 12, // 12h
time.Hour * 24, // 24h
time.Hour * 24 * 7, // 1w
}
)

type Timeseries struct {
Expand Down Expand Up @@ -138,10 +158,84 @@ func (q CheckQueryParams) GetWhereClause() (string, map[string]interface{}, erro
return strings.TrimSpace(clause), args, nil
}

func (q CheckQueryParams) ExecuteDetails(ctx context.Context) ([]Timeseries, types.Uptime, types.Latency, error) {
func getBestPartitioner(totalChecks int, rangeDuration time.Duration) time.Duration {
if totalChecks <= desiredNumOfCheckStatuses {
return 0 // No need to perform window aggregation
}

bestDelta := 100000000 // sufficiently large delta to begin with
bestWindow := allowedWindows[0]

for _, wp := range allowedWindows {
numWindows := int(rangeDuration / wp)
delta := abs(desiredNumOfCheckStatuses - numWindows)

if delta < bestDelta {
bestDelta = delta
bestWindow = wp
} else {
// as soon as we notice that the delta gets worse, we break the loop
break
}
}

numWindows := int(rangeDuration / bestWindow)
if abs(desiredNumOfCheckStatuses-totalChecks) <= abs(desiredNumOfCheckStatuses-numWindows) {
// If this best partition creates windows such that the resulting number of data points deviate more
// from the desired data points than the actual data points, then we do not aggregate.
// Example: if there are 144 checks for the duration of 6 days,
// then the best partition, 1 hour, would generate 144 data points.
// But the original data points (120) are closer to 100, so we do not aggregate.
return 0
}

return bestWindow
}

func optimalWindow(ctx context.Context, from, to time.Time) (time.Duration, error) {
var view string
timeRange := to.Sub(from)
if timeRange > time.Hour*24*21 {
view = "check_statuses_1d"
} else if timeRange > time.Hour*48 {
view = "check_statuses_1h"
} else {
return -1, nil //
}

q := fmt.Sprintf(`
SELECT
SUM(total) AS total,
MAX(created_at) AS latest,
MIN(created_at) AS earliest
FROM
%s
WHERE
created_at >= ? AND created_at <= ?;`, view)
var total *int
var latest, earliest *time.Time
if err := ctx.DB().Raw(q, from, to).Row().Scan(&total, &latest, &earliest); err != nil {
return 0, err
}
if total == nil {
return -1, nil //
}

return getBestPartitioner(*total, earliest.Sub(*latest)), nil
}

func CheckStatuses(ctx context.Context, q CheckQueryParams) ([]Timeseries, types.Uptime, types.Latency, error) {
start := q.GetStartTime().Format(time.RFC3339)
end := q.GetEndTime().Format(time.RFC3339)

// For the given ranges try to find the best window using check statuses summary
window, err := optimalWindow(ctx, *q.GetStartTime(), *q.GetEndTime())
if err != nil {
return nil, types.Uptime{}, types.Latency{}, err
} else if window >= 0 {
q.WindowDuration = window
}

query := `
With grouped_by_window AS (
SELECT
Expand Down Expand Up @@ -309,3 +403,14 @@ func parseDuration(d string, name string) (clause string, arg interface{}, err e
}
return "", nil, fmt.Errorf("start time must be a duration or RFC3339 timestamp")
}

// abs returns the absolute value of i.
// math.Abs only supports float64 and this avoids the needless type conversions
// and ugly expression.
func abs(n int) int {
if n > 0 {
return n
}

return -n
}
14 changes: 7 additions & 7 deletions tests/fixtures/dummy/check_statuses.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"github.com/flanksource/duty/models"
)

func generateStatus(check models.Check, t time.Time, count int, passingMod int) []models.CheckStatus {
func generateStatus(check models.Check, t time.Time, schedule time.Duration, count int, passingMod int) []models.CheckStatus {
var statuses = []models.CheckStatus{}

for i := 0; i < count; i++ {
Expand All @@ -27,9 +27,9 @@ func generateStatus(check models.Check, t time.Time, count int, passingMod int)

func AllDummyCheckStatuses() []models.CheckStatus {
statuses := append(
generateStatus(LogisticsAPIHealthHTTPCheck, CurrentTime, 70, 5),
generateStatus(DeletedCheck, CurrentTime, 1, 1)[0],
generateStatus(DeletedCheckOld, *DeletedCheckOld.CreatedAt, 1, 1)[0],
generateStatus(LogisticsAPIHealthHTTPCheck, CurrentTime, time.Minute, 70, 5),
generateStatus(DeletedCheck, CurrentTime, time.Minute, 1, 1)[0],
generateStatus(DeletedCheckOld, *DeletedCheckOld.CreatedAt, time.Minute, 1, 1)[0],
models.CheckStatus{
CheckID: LogisticsAPIHomeHTTPCheck.ID,
Duration: 100,
Expand All @@ -46,12 +46,12 @@ func AllDummyCheckStatuses() []models.CheckStatus {
},
)

statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-15*time.Minute), 1, 1)[0])
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-2*time.Hour), 10, 2)...)
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-15*time.Minute), time.Minute, 1, 1)[0])
statuses = append(statuses, generateStatus(DeletedCheck1h, CurrentTime.Add(-2*time.Hour), time.Minute, 10, 2)...)

// Check statuses from 2022-01-01
// not dervied from current time for consistency
statuses = append(statuses, generateStatus(CartAPIHeathCheckAgent, time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC), 70, 5)...)
statuses = append(statuses, generateStatus(CartAPIHeathCheckAgent, time.Date(2022, 1, 1, 0, 0, 0, 0, time.UTC), time.Minute*5, 1440, 5)...) // 1440 check statuses spanning 5 days

return statuses
}
4 changes: 2 additions & 2 deletions tests/query_check_details_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
. "github.com/onsi/gomega"
)

var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, func() {
var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, ginkgo.Focus, func() {
type testRecord struct {
since string
statuses int
Expand Down Expand Up @@ -48,7 +48,7 @@ var _ = ginkgo.Describe("CheckDetails", ginkgo.Ordered, func() {
err = q.Init(urlParam)
Expect(err).To(BeNil())

ts, uptime, latency, err := q.ExecuteDetails(DefaultContext)
ts, uptime, latency, err := query.CheckStatuses(DefaultContext, q)
Expect(err).To(BeNil())

Expect(len(ts)).To(Equal(td.statuses), "unexpected number of results")
Expand Down
8 changes: 8 additions & 0 deletions tests/setup/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,14 @@ func BeforeSuiteFn(args ...interface{}) context.Context {
logger.Infof("Created dummy data %v", len(dummyData.Checks))
}

if err, _ := job.AggregateCheckStatus1d(DefaultContext); err != nil {
panic(err.Error())
}

if err, _ := job.AggregateCheckStatus1h(DefaultContext); err != nil {
panic(err.Error())
}

DefaultContext := DefaultContext.WithKubernetes(fake.NewSimpleClientset(&v1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: "test-cm",
Expand Down