fix pipeline stuck in queue status when the etcd server is abnormal (#6427)
chengjoey authored Aug 23, 2024
1 parent c315ced commit 48579d8
Showing 3 changed files with 12 additions and 1 deletion.
@@ -69,7 +69,7 @@ func (p *provider) handleLogDir() error {
 		return err
 	}
 	// dir is not exist
-	return os.Mkdir(p.Cfg.LogDir, 0755)
+	return os.MkdirAll(p.Cfg.LogDir, 0755)
 }
 
 func (p *provider) Init(ctx servicehub.Context) error {
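For context, the switch from os.Mkdir to os.MkdirAll matters when the configured log directory is nested: os.Mkdir only creates the final path element and fails if a parent is missing, while os.MkdirAll creates every missing parent and returns nil if the directory already exists. A minimal standalone sketch of the difference (the directory path below is hypothetical, not taken from the provider config):

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	// Hypothetical nested log directory; the real path comes from p.Cfg.LogDir.
	dir := "/tmp/erda-example/pipeline/logs"

	// os.Mkdir only creates the last path element, so it fails when a parent is missing.
	if err := os.Mkdir(dir, 0755); err != nil {
		fmt.Println("Mkdir:", err)
	}

	// os.MkdirAll creates all missing parents and succeeds if the directory already exists.
	if err := os.MkdirAll(dir, 0755); err != nil {
		fmt.Println("MkdirAll:", err)
	}
}
```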
5 changes: 5 additions & 0 deletions internal/tools/pipeline/providers/leaderworker/heartbeat.go
@@ -20,6 +20,8 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/pkg/errors"
+
 	clientv3 "go.etcd.io/etcd/client/v3"
 
 	"github.com/erda-project/erda/internal/tools/pipeline/providers/leaderworker/worker"
@@ -53,6 +55,9 @@ func (p *provider) workerOnceReportHeartbeat(ctx context.Context, w worker.Worker
 	// update lastProbeAt
 	nowSec := time.Now().Round(0).Unix()
 	if _, err := p.EtcdClient.Put(hctx, p.makeEtcdWorkerHeartbeatKey(w.GetID()), strutil.String(nowSec)); err != nil {
+		if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
+			panic(fmt.Errorf("failed to update last heartbeat time into etcd, workerID: %s, err: %v", w.GetID(), err))
+		}
 		return fmt.Errorf("failed to update last heartbeat time into etcd, workerID: %s, err: %v", w.GetID(), err)
 	}
 	p.Log.Debugf("worker heartbeat reported, workerID: %s", w.GetID())
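The heartbeat change treats context.DeadlineExceeded or context.Canceled from the etcd Put as a sign that etcd itself is unreachable and panics so the whole process is restarted, instead of returning an error and letting the worker look alive while its heartbeat silently goes stale. A minimal sketch of that pattern, using a hypothetical put callback and key layout rather than the real clientv3 client:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// reportHeartbeat mirrors the pattern from the patch; put is a stand-in for the
// etcd write (hypothetical signature, not the actual clientv3 API).
func reportHeartbeat(ctx context.Context, workerID string,
	put func(ctx context.Context, key, val string) error) error {
	hctx, cancel := context.WithTimeout(ctx, 3*time.Second)
	defer cancel()

	key := "/heartbeat/" + workerID // hypothetical key layout
	if err := put(hctx, key, fmt.Sprint(time.Now().Unix())); err != nil {
		// A context error usually means etcd is unreachable; retrying in-process would
		// leave the heartbeat stale, so panic and let the supervisor restart the process.
		if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
			panic(fmt.Errorf("heartbeat put failed, etcd may be down: %w", err))
		}
		// Other errors remain recoverable and are returned to the caller.
		return fmt.Errorf("heartbeat put failed for worker %s: %w", workerID, err)
	}
	return nil
}

func main() {
	err := reportHeartbeat(context.Background(), "worker-1",
		func(ctx context.Context, key, val string) error { return nil }) // pretend the write succeeded
	fmt.Println("err:", err)
}
```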
6 changes: 6 additions & 0 deletions internal/tools/pipeline/providers/leaderworker/impl_worker.go
@@ -16,8 +16,10 @@ package leaderworker
 
 import (
 	"context"
+	"fmt"
 	"time"
 
+	"github.com/pkg/errors"
 	clientv3 "go.etcd.io/etcd/client/v3"
 
 	"github.com/erda-project/erda/internal/tools/pipeline/providers/leaderworker/lwctx"
@@ -151,6 +153,10 @@ func (p *provider) workerIntervalCleanupOnDelete(ctx context.Context, ev Event)
 		if err == nil {
 			return
 		}
+		// actively panic exit
+		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+			panic(fmt.Errorf("actively panic exit, maybe the etcd server has been shut down, err: %v", err))
+		}
 		p.Log.Errorf("failed to do worker interval cleanup on delete(auto retry), step: delete heartbeat key, workerID: %s, err: %v", ev.WorkerID, err)
 		time.Sleep(p.Cfg.Worker.RetryInterval)
 	}
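The cleanup path applies the same idea to a retry loop: before the patch, an etcd outage kept the loop retrying forever, which is what left pipelines stuck in queue status; now a context error breaks out by panicking. A rough sketch under the same assumptions (hypothetical delete callback and key layout):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// cleanupWithRetry sketches the loop in workerIntervalCleanupOnDelete: keep retrying
// the heartbeat-key delete, but exit the process when the failure is a context error,
// which the patch treats as "etcd is probably down". deleteKey is a hypothetical
// stand-in for the real etcd delete call.
func cleanupWithRetry(ctx context.Context, workerID string, retryInterval time.Duration,
	deleteKey func(ctx context.Context, key string) error) {
	for {
		err := deleteKey(ctx, "/heartbeat/"+workerID) // hypothetical key layout
		if err == nil {
			return
		}
		// Without this check the loop would retry forever against a dead etcd;
		// panicking forces a restart instead.
		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			panic(fmt.Errorf("etcd appears to be unavailable, exiting: %w", err))
		}
		time.Sleep(retryInterval)
	}
}

func main() {
	cleanupWithRetry(context.Background(), "worker-1", time.Second,
		func(ctx context.Context, key string) error { return nil })
	fmt.Println("cleanup done")
}
```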
