Skip to content

Commit

Permalink
fix(gardener): avoid issues caused by stale databases (#1516)
Browse files Browse the repository at this point in the history
We use a database to keep measurements results to allow to interrupt and
resume the work, given that we have thousands or URLs to vet. However,
after a week, the database is most likely stale, so let's just avoid
trusting and start afresh in such a case. It's reasonable to assume that
the database is stale after such an interval because the URL's status
may have changed (e.g., some URLs could have stopped working).

Additionally, every time we run `gardener sync` we most likely have new
URLs, so we need to do one of the following:

1. merge new URLs into the database;

2. just zap the database and start afresh.

Because doing 1. would be a bit time consuming, for now I am opting for
doing 2. and we'll implement 1. if needed.

Closes ooni/probe#2684.
  • Loading branch information
bassosimone authored Feb 14, 2024
1 parent b36dd8f commit 813ba9e
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 18 deletions.
24 changes: 18 additions & 6 deletions internal/cmd/gardener/internal/dnsreport/dnsreport.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"encoding/csv"
"encoding/json"
"fmt"
"io/fs"
"net"
"net/url"
"os"
Expand Down Expand Up @@ -51,20 +52,31 @@ type Subcommand struct {
// loadedFromRepository counts the number of times we loaded from the repository
var loadedFromRepository = &atomic.Int64{}

// databaseIsGood returns whether we should use the existing database content. You should
// pass to this function the results of os.Stat invoked on the database file path.
func databaseIsGood(statbuf fs.FileInfo, err error) bool {
const recreateInterval = 7 * 24 * time.Hour
return err == nil && fsx.IsRegular(statbuf) && time.Since(statbuf.ModTime()) < recreateInterval
}

// Main is the main function of the dnsreport subcommand. This function calls
// [runtimex.PanicOnError] in case of failure.
func (s *Subcommand) Main(ctx context.Context) {
// check whether the database exists
dbExists := fsx.RegularFileExists(s.Database)
// check whether the database exists and check its statistics
isGood := databaseIsGood(os.Stat(s.Database))

// if the database is not good, truncate it and restart
if !isGood {
log.Infof("rm -f %s", s.Database)
_ = os.Remove(s.Database)
}

// create or open the underlying sqlite3 database
db := s.createOrOpenDatabase()
defer db.Close()

// if the database has just been created, then import
// the URLs from the locally cloned git repository, otherwise
// keep using the existing database file
if !dbExists {
// fill again the database if what we had before was not good
if !isGood {
log.Infof("creating new %s database", s.Database)
s.loadFromRepository(db)
loadedFromRepository.Add(1)
Expand Down
11 changes: 11 additions & 0 deletions internal/cmd/gardener/internal/sync/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ const testListsRepo = "https://github.com/citizenlab/test-lists"
// Subcommand is the sync subcommand. The zero value is invalid; please, make
// sure you initialize all the fields marked as MANDATORY.
type Subcommand struct {
// DNSReportDatabase is the MANDATORY file containing the `dnsreport` database
// that we're currently using to avoid repeating measurements.
DNSReportDatabase string

// RepositoryDir is the MANDATORY directory where to clone the test lists repository.
RepositoryDir string

Expand All @@ -35,6 +39,13 @@ func (s *Subcommand) Main() {
// possibly remove a previous working copy
runtimex.Try0(shellx.Run(log.Log, "rm", "-rf", s.RepositoryDir))

// possibly remove an existing dnsreport.sqlite3 database
//
// TODO(bassosimone): an alternative would be to somehow take note of the fact
// that the database needs merging from an updated repository, but doing that
// would require us to write a more complex diff.
runtimex.Try0(shellx.Run(log.Log, "rm", "-f", s.DNSReportDatabase))

// clone a new working copy
runtimex.Try0(shellx.Run(log.Log, "git", "clone", testListsRepo, s.RepositoryDir))

Expand Down
11 changes: 7 additions & 4 deletions internal/cmd/gardener/internal/sync/sync_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@ func TestWorkingAsIntended(t *testing.T) {

// create the subcommand instance
repodir := filepath.Join("testdata", "repo")
dnsreportfile := filepath.Join("testdata", "dnsreport.sqlite3")
sc := &sync.Subcommand{
RepositoryDir: repodir,
OsChdir: cc.Chdir,
OsGetwd: cc.Getwd,
TimeNow: cc.TimeNow,
DNSReportDatabase: dnsreportfile,
RepositoryDir: repodir,
OsChdir: cc.Chdir,
OsGetwd: cc.Getwd,
TimeNow: cc.TimeNow,
}

// run the subcommand with custom shellx dependencies
Expand All @@ -83,6 +85,7 @@ func TestWorkingAsIntended(t *testing.T) {
// expectations for commands
expect := []string{
fmt.Sprintf("rm -rf %s", repodir),
fmt.Sprintf("rm -f %s", dnsreportfile),
fmt.Sprintf("git clone https://github.com/citizenlab/test-lists %s", repodir),
fmt.Sprintf("cd %s", repodir),
"git checkout -b gardener_20230315T114300Z",
Expand Down
14 changes: 9 additions & 5 deletions internal/cmd/gardener/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import (
// repositoryDir is the path of the citizenlab/test-lists working copy.
const repositoryDir = "citizenlab-test-lists"

// dnsReportDatabase is the path of the database maintained by the dnsreport subcommand.
const dnsReportDatabase = "dnsreport.sqlite3"

func main() {
// select a colourful apex/log handler
log.SetHandler(cli.New(os.Stderr))
Expand All @@ -39,10 +42,11 @@ func main() {
Args: cobra.NoArgs,
Run: func(cmd *cobra.Command, args []string) {
sc := &sync.Subcommand{
RepositoryDir: repositoryDir,
OsChdir: os.Chdir,
OsGetwd: os.Getwd,
TimeNow: time.Now,
DNSReportDatabase: dnsReportDatabase,
RepositoryDir: repositoryDir,
OsChdir: os.Chdir,
OsGetwd: os.Getwd,
TimeNow: time.Now,
}
sc.Main()
},
Expand All @@ -58,7 +62,7 @@ func main() {
sc := &dnsreport.Subcommand{
APIURL: "https://api.ooni.io",
DNSOverHTTPSServerURL: "https://dns.google/dns-query",
Database: "dnsreport.sqlite3",
Database: dnsReportDatabase,
ReportFile: "dnsreport.csv",
RepositoryDir: repositoryDir,
}
Expand Down
7 changes: 4 additions & 3 deletions internal/fsx/fsx.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func openWithFS(fs fs.FS, pathname string) (fs.File, error) {
file.Close()
return nil, err
}
if !isRegular(info) {
if !IsRegular(info) {
file.Close()
return nil, fmt.Errorf("%w: %s", ErrNotRegularFile, pathname)
}
Expand All @@ -47,7 +47,8 @@ func (filesystem) Open(pathname string) (fs.File, error) {
return os.Open(pathname)
}

func isRegular(info fs.FileInfo) bool {
// IsRegular returns whether a file is a regular file.
func IsRegular(info fs.FileInfo) bool {
return info.Mode().IsRegular()
}

Expand All @@ -58,7 +59,7 @@ func RegularFileExists(filename string) bool {
if err != nil {
return false
}
return isRegular(finfo)
return IsRegular(finfo)
}

// DirectoryExists returns whether the given filename
Expand Down

0 comments on commit 813ba9e

Please sign in to comment.