feat: Implement Phase 1 of kat-agent with leader election and init
This commit is contained in:
@@ -0,0 +1,85 @@
|
||||
package leader
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"git.dws.rip/dubey/kat/internal/store"
|
||||
)
|
||||
|
||||
const (
	// DefaultLeaseTTLSeconds is the default time-to-live, in seconds, for a
	// leader's lease. It is left untyped so it can be assigned directly to
	// LeadershipManager.LeaseTTLSeconds.
	DefaultLeaseTTLSeconds = 15

	// DefaultRetryPeriod is the time to wait before retrying to campaign for
	// leadership after a failed campaign attempt.
	DefaultRetryPeriod = 5 * time.Second
)
|
||||
|
||||
// LeadershipManager handles the lifecycle of campaigning for and maintaining leadership.
|
||||
type LeadershipManager struct {
|
||||
Store store.StateStore
|
||||
LeaderID string // Identifier for this candidate (e.g., node name)
|
||||
LeaseTTLSeconds int64
|
||||
|
||||
OnElected func(leadershipCtx context.Context) // Called when leadership is acquired
|
||||
OnResigned func() // Called when leadership is lost or resigned
|
||||
}
|
||||
|
||||
// NewLeadershipManager creates a new leadership manager.
|
||||
func NewLeadershipManager(st store.StateStore, leaderID string, onElected func(leadershipCtx context.Context), onResigned func()) *LeadershipManager {
|
||||
return &LeadershipManager{
|
||||
Store: st,
|
||||
LeaderID: leaderID,
|
||||
LeaseTTLSeconds: DefaultLeaseTTLSeconds,
|
||||
OnElected: onElected,
|
||||
OnResigned: onResigned,
|
||||
}
|
||||
}
|
||||
|
||||
// Run starts the leadership campaign loop.
|
||||
// It blocks until the provided context is cancelled.
|
||||
func (lm *LeadershipManager) Run(ctx context.Context) {
|
||||
log.Printf("Starting leadership manager for %s", lm.LeaderID)
|
||||
defer log.Printf("Leadership manager for %s stopped", lm.LeaderID)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
log.Printf("Parent context cancelled, stopping leadership campaign for %s.", lm.LeaderID)
|
||||
// Attempt to resign if currently leading, though store.Close() might handle this too.
|
||||
// This resign is best-effort as the app is shutting down.
|
||||
resignCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
lm.Store.Resign(resignCtx)
|
||||
cancel()
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
log.Printf("%s is campaigning for leadership...", lm.LeaderID)
|
||||
leadershipCtx, err := lm.Store.Campaign(ctx, lm.LeaderID, lm.LeaseTTLSeconds)
|
||||
if err != nil {
|
||||
log.Printf("Error campaigning for leadership for %s: %v. Retrying in %v.", lm.LeaderID, err, DefaultRetryPeriod)
|
||||
select {
|
||||
case <-time.After(DefaultRetryPeriod):
|
||||
continue
|
||||
case <-ctx.Done():
|
||||
return // Exit if parent context cancelled during retry wait
|
||||
}
|
||||
}
|
||||
|
||||
// Successfully became leader
|
||||
log.Printf("%s is now the leader.", lm.LeaderID)
|
||||
if lm.OnElected != nil {
|
||||
lm.OnElected(leadershipCtx) // Pass the context that's cancelled on leadership loss
|
||||
}
|
||||
|
||||
// Block until leadership is lost (leadershipCtx is cancelled)
|
||||
<-leadershipCtx.Done()
|
||||
log.Printf("%s has lost leadership.", lm.LeaderID)
|
||||
if lm.OnResigned != nil {
|
||||
lm.OnResigned()
|
||||
}
|
||||
// Loop will restart campaign unless parent ctx is done.
|
||||
// Store.Resign() is implicitly called by the store when leadershipCtx is done or session expires.
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user