From 6a974bb7cdda7283a08c524e546075c83f259e2b Mon Sep 17 00:00:00 2001 From: Andrew Gouin Date: Thu, 30 Nov 2023 10:34:56 -0700 Subject: [PATCH] Fix ClearNonces index out of range (#227) * Fix clearnonces index out of range * Auto remove lockfile after unclean shutdown * fix cmd --- cmd/horcrux/cmd/start.go | 9 +++--- cmd/horcrux/cmd/state.go | 21 ++++++++----- signer/cosigner_nonce_cache.go | 9 ++++-- signer/cosigner_nonce_cache_test.go | 46 +++++++++++++++++++++++++++++ signer/services.go | 17 +++++++---- signer/services_test.go | 45 +++++++++++++--------------- 6 files changed, 102 insertions(+), 45 deletions(-) diff --git a/cmd/horcrux/cmd/start.go b/cmd/horcrux/cmd/start.go index f8820533..ef4fde1d 100644 --- a/cmd/horcrux/cmd/start.go +++ b/cmd/horcrux/cmd/start.go @@ -17,19 +17,18 @@ func startCmd() *cobra.Command { Args: cobra.NoArgs, SilenceUsage: true, RunE: func(cmd *cobra.Command, args []string) error { - err := signer.RequireNotRunning(config.PidFile) + out := cmd.OutOrStdout() + logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out)) + + err := signer.RequireNotRunning(logger, config.PidFile) if err != nil { return err } - out := cmd.OutOrStdout() - if _, err := legacyConfig(); err == nil { return fmt.Errorf("this is a legacy config. run `horcrux config migrate` to migrate to the latest format") } - logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out)) - // create all directories up to the state directory if err = os.MkdirAll(config.StateDir, 0700); err != nil { return err diff --git a/cmd/horcrux/cmd/state.go b/cmd/horcrux/cmd/state.go index 5380da16..146f7e49 100644 --- a/cmd/horcrux/cmd/state.go +++ b/cmd/horcrux/cmd/state.go @@ -14,6 +14,7 @@ import ( "github.com/strangelove-ventures/horcrux/signer" cometjson "github.com/cometbft/cometbft/libs/json" + cometlog "github.com/cometbft/cometbft/libs/log" ) // Snippet Taken from https://raw.githubusercontent.com/cometbft/cometbft/main/privval/file.go @@ -82,6 +83,9 @@ func setStateCmd() *cobra.Command { RunE: func(cmd *cobra.Command, args []string) error { chainID := args[0] + out := cmd.OutOrStdout() + logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out)) + if _, err := os.Stat(config.HomeDir); os.IsNotExist(err) { cmd.SilenceUsage = false return fmt.Errorf("%s does not exist, initialize config with horcrux config init and try again", config.HomeDir) @@ -89,7 +93,7 @@ func setStateCmd() *cobra.Command { // Resetting the priv_validator_state.json should only be allowed if the // signer is not running. - if err := signer.RequireNotRunning(config.PidFile); err != nil { + if err := signer.RequireNotRunning(logger, config.PidFile); err != nil { return err } @@ -109,7 +113,7 @@ func setStateCmd() *cobra.Command { return err } - fmt.Fprintf(cmd.OutOrStdout(), "Setting height %d\n", height) + fmt.Fprintf(out, "Setting height %d\n", height) pv.NoncePublic, cs.NoncePublic = nil, nil signState := signer.SignStateConsensus{ @@ -150,9 +154,12 @@ func importStateCmd() *cobra.Command { return fmt.Errorf("%s does not exist, initialize config with horcrux config init and try again", config.HomeDir) } + out := cmd.OutOrStdout() + logger := cometlog.NewTMLogger(cometlog.NewSyncWriter(out)) + // Resetting the priv_validator_state.json should only be allowed if the // signer is not running. - if err := signer.RequireNotRunning(config.PidFile); err != nil { + if err := signer.RequireNotRunning(logger, config.PidFile); err != nil { return err } @@ -170,11 +177,11 @@ func importStateCmd() *cobra.Command { // Allow user to paste in priv_validator_state.json - fmt.Println("IMPORTANT: Your validator should already be STOPPED. You must copy the latest state..") + fmt.Fprintln(out, "IMPORTANT: Your validator should already be STOPPED. You must copy the latest state..") <-time.After(2 * time.Second) - fmt.Println("") - fmt.Println("Paste your old priv_validator_state.json. Input a blank line after the pasted JSON to continue.") - fmt.Println("") + fmt.Fprintln(out, "") + fmt.Fprintln(out, "Paste your old priv_validator_state.json. Input a blank line after the pasted JSON to continue.") + fmt.Fprintln(out, "") var textBuffer strings.Builder diff --git a/signer/cosigner_nonce_cache.go b/signer/cosigner_nonce_cache.go index 1b153db6..81da856d 100644 --- a/signer/cosigner_nonce_cache.go +++ b/signer/cosigner_nonce_cache.go @@ -353,12 +353,14 @@ func (cnc *CosignerNonceCache) PruneNonces() int { func (cnc *CosignerNonceCache) ClearNonces(cosigner Cosigner) { cnc.cache.mu.Lock() defer cnc.cache.mu.Unlock() - for i, cn := range cnc.cache.cache { + for i := 0; i < len(cnc.cache.cache); i++ { + cn := cnc.cache.cache[i] + deleteID := -1 - for i, n := range cn.Nonces { + for j, n := range cn.Nonces { if n.Cosigner.GetID() == cosigner.GetID() { // remove cosigner from this nonce. - deleteID = i + deleteID = j break } } @@ -366,6 +368,7 @@ func (cnc *CosignerNonceCache) ClearNonces(cosigner Cosigner) { if len(cn.Nonces)-1 < int(cnc.threshold) { // If cosigners on this nonce drops below threshold, delete it as it's no longer usable cnc.cache.Delete(i) + i-- } else { cn.Nonces = append(cn.Nonces[:deleteID], cn.Nonces[deleteID+1:]...) } diff --git a/signer/cosigner_nonce_cache_test.go b/signer/cosigner_nonce_cache_test.go index f7d6cc55..e06f2445 100644 --- a/signer/cosigner_nonce_cache_test.go +++ b/signer/cosigner_nonce_cache_test.go @@ -22,6 +22,52 @@ func TestNonceCache(_ *testing.T) { nc.Delete(0) } +func TestClearNonces(t *testing.T) { + lcs, _ := getTestLocalCosigners(t, 2, 3) + cosigners := make([]Cosigner, len(lcs)) + for i, lc := range lcs { + cosigners[i] = lc + } + + cnc := CosignerNonceCache{ + threshold: 2, + } + + for i := 0; i < 10; i++ { + // When deleting nonce for cosigner 1 ([0]), + // these nonce will drop below threshold and be deleted. + cnc.cache.Add(&CachedNonce{ + UUID: uuid.New(), + Expiration: time.Now().Add(1 * time.Second), + Nonces: []CosignerNoncesRel{ + {Cosigner: cosigners[0]}, + {Cosigner: cosigners[1]}, + }, + }) + // When deleting nonce for cosigner 1 ([0]), these nonces will still be above threshold, + // so they will remain without cosigner 1. + cnc.cache.Add(&CachedNonce{ + UUID: uuid.New(), + Expiration: time.Now().Add(1 * time.Second), + Nonces: []CosignerNoncesRel{ + {Cosigner: cosigners[0]}, + {Cosigner: cosigners[1]}, + {Cosigner: cosigners[2]}, + }, + }) + } + + require.Equal(t, 20, cnc.cache.Size()) + + cnc.ClearNonces(cosigners[0]) + + require.Equal(t, 10, cnc.cache.Size()) + + for _, n := range cnc.cache.cache { + require.Len(t, n.Nonces, 2) + } +} + type mockPruner struct { cnc *CosignerNonceCache count int diff --git a/signer/services.go b/signer/services.go index 62ff93ba..e6263f06 100644 --- a/signer/services.go +++ b/signer/services.go @@ -13,7 +13,7 @@ import ( cometservice "github.com/cometbft/cometbft/libs/service" ) -func RequireNotRunning(pidFilePath string) error { +func RequireNotRunning(log cometlog.Logger, pidFilePath string) error { if _, err := os.Stat(pidFilePath); err != nil { if os.IsNotExist(err) { // lock file does not exist, can continue starting daemon @@ -41,8 +41,7 @@ func RequireNotRunning(pidFilePath string) error { process, err := os.FindProcess(int(pid)) if err != nil { - return fmt.Errorf(`unclean shutdown detected. PID file exists at %s but PID %d can not be found. -manual deletion of PID file required. %w`, pidFilePath, pid, err) + return fmt.Errorf("error checking pid %d: %w", pid, err) } err = process.Signal(syscall.Signal(0)) @@ -50,8 +49,16 @@ manual deletion of PID file required. %w`, pidFilePath, pid, err) return fmt.Errorf("horcrux is already running on PID: %d", pid) } if errors.Is(err, os.ErrProcessDone) { - return fmt.Errorf(`unclean shutdown detected. PID file exists at %s but PID %d is not running. -manual deletion of PID file required`, pidFilePath, pid) + log.Error( + "Unclean shutdown detected. PID file exists at but process with that ID cannot be found. Removing lock file", + "pid", pid, + "pid_file", pidFilePath, + "error", err, + ) + if err := os.Remove(pidFilePath); err != nil { + return fmt.Errorf("failed to delete pid file %s: %w", pidFilePath, err) + } + return nil } errno, ok := err.(syscall.Errno) diff --git a/signer/services_test.go b/signer/services_test.go index fe8acb8b..66d1e732 100644 --- a/signer/services_test.go +++ b/signer/services_test.go @@ -5,7 +5,6 @@ import ( "fmt" "os" "path/filepath" - "runtime" "strconv" "strings" "sync" @@ -72,7 +71,7 @@ func TestIsRunning(t *testing.T) { pidBz, err := os.ReadFile(pidFilePath) require.NoError(t, err) - err = signer.RequireNotRunning(pidFilePath) + err = signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath) expectedErrorMsg := fmt.Sprintf("horcrux is already running on PID: %s", strings.TrimSpace(string(pidBz))) require.EqualError(t, err, expectedErrorMsg) } @@ -81,20 +80,26 @@ func TestIsNotRunning(t *testing.T) { homeDir := t.TempDir() pidFilePath := filepath.Join(homeDir, "horcrux.pid") - err := signer.RequireNotRunning(pidFilePath) + err := signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath) require.NoError(t, err) } -func getNonExistentPid() (int, error) { +func maxPid() int { + const defaultMaxPid = 100000 maxPidBytes, err := os.ReadFile("/proc/sys/kernel/pid_max") if err != nil { - return -1, err + return defaultMaxPid } - maxPid, err := strconv.ParseUint(strings.TrimSpace(string(maxPidBytes)), 10, 64) + maxPid, err := strconv.ParseInt(strings.TrimSpace(string(maxPidBytes)), 10, 32) if err != nil { - return -1, err + return defaultMaxPid } - for pid := 1; pid <= int(maxPid); pid++ { + return int(maxPid) +} + +func getUnusedPid() (int, error) { + max := maxPid() + for pid := 1; pid <= max; pid++ { process, err := os.FindProcess(pid) if err != nil { continue @@ -106,26 +111,15 @@ func getNonExistentPid() (int, error) { if errors.Is(err, os.ErrProcessDone) { return pid, nil } - errno, ok := err.(syscall.Errno) - if !ok { - continue - } - if errno == syscall.ESRCH { - return pid, nil - } } return -1, errors.New("could not find unused PID") } func TestIsRunningNonExistentPid(t *testing.T) { - if runtime.GOOS != "linux" { - t.Skip("test only valid on Linux") - } - homeDir := t.TempDir() pidFilePath := filepath.Join(homeDir, "horcrux.pid") - pid, err := getNonExistentPid() + pid, err := getUnusedPid() require.NoError(t, err) err = os.WriteFile( @@ -135,10 +129,11 @@ func TestIsRunningNonExistentPid(t *testing.T) { ) require.NoError(t, err, "error writing pid file") - err = signer.RequireNotRunning(pidFilePath) - expectedErrorMsg := fmt.Sprintf(`unclean shutdown detected. PID file exists at %s but PID %d is not running. -manual deletion of PID file required`, pidFilePath, pid) - require.EqualError(t, err, expectedErrorMsg) + err = signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath) + require.Nil(t, err) + + _, err = os.Stat(pidFilePath) + require.ErrorIs(t, err, os.ErrNotExist) } func TestConcurrentStart(t *testing.T) { @@ -216,7 +211,7 @@ func TestIsRunningAndWaitForService(t *testing.T) { } panicFunction := func() { defer recoverFromPanic() - err = signer.RequireNotRunning(pidFilePath) + err = signer.RequireNotRunning(cometlog.NewNopLogger(), pidFilePath) } go panicFunction() wg.Wait()