feat: proper push monitor lifecycle — PENDING, LATE, DOWN states
Push monitors no longer lie about status:
- PENDING stays until first heartbeat (no auto-promote to UP)
- LATE state (amber) when overdue but within grace period
- DOWN only after grace period expires
- Grace period = interval/2, minimum 60s

RecordHeartbeat now handles all transitions:
- PENDING → UP (first heartbeat, logged)
- LATE → UP (late arrival, logged)
- DOWN → UP (recovery, alert + state change persisted)

TUI updates:
- LATE rendered in amber/warning color
- Status bar shows LATE count separately
- Tab badge shows ⚠ for late monitors
- Sort order: DOWN > LATE > UP > PENDING > PAUSED
- Detail panel shows error for LATE monitors

Inspired by Healthchecks.io state machine (new/up/grace/down).
This commit is contained in:
+43
-12
@@ -19,8 +19,8 @@ import (
|
|||||||
const (
|
const (
|
||||||
maxLogEntries = 100
|
maxLogEntries = 100
|
||||||
pollInterval = 5 * time.Second
|
pollInterval = 5 * time.Second
|
||||||
pushGracePeriod = 5 * time.Second
|
|
||||||
minCheckInterval = 5
|
minCheckInterval = 5
|
||||||
|
minPushGrace = 60 * time.Second
|
||||||
)
|
)
|
||||||
|
|
||||||
type Engine struct {
|
type Engine struct {
|
||||||
@@ -186,17 +186,34 @@ func (e *Engine) RecordHeartbeat(token string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prevStatus := site.Status
|
||||||
site.LastCheck = time.Now()
|
site.LastCheck = time.Now()
|
||||||
wasDown := site.Status == "DOWN"
|
|
||||||
site.Status = "UP"
|
site.Status = "UP"
|
||||||
site.FailureCount = 0
|
site.FailureCount = 0
|
||||||
site.Latency = 0
|
site.Latency = 0
|
||||||
|
site.LastError = ""
|
||||||
|
site.LastSuccessAt = time.Now()
|
||||||
|
|
||||||
|
if prevStatus != "UP" {
|
||||||
|
site.StatusChangedAt = time.Now()
|
||||||
|
}
|
||||||
|
|
||||||
e.liveState[targetID] = site
|
e.liveState[targetID] = site
|
||||||
|
|
||||||
if wasDown {
|
switch prevStatus {
|
||||||
|
case "PENDING":
|
||||||
|
e.AddLog(fmt.Sprintf("Push Monitor '%s' received first heartbeat", site.Name))
|
||||||
|
case "LATE":
|
||||||
|
e.AddLog(fmt.Sprintf("Push Monitor '%s' heartbeat arrived (was late)", site.Name))
|
||||||
|
case "DOWN":
|
||||||
e.AddLog(fmt.Sprintf("Push Monitor '%s' recovered", site.Name))
|
e.AddLog(fmt.Sprintf("Push Monitor '%s' recovered", site.Name))
|
||||||
e.triggerAlert(site.AlertID, "✅ RECOVERY", fmt.Sprintf("Push Monitor '%s' is receiving heartbeats.", site.Name))
|
go e.triggerAlert(site.AlertID, "✅ RECOVERY", fmt.Sprintf("Push Monitor '%s' is receiving heartbeats.", site.Name))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if prevStatus != "UP" && prevStatus != "PENDING" {
|
||||||
|
go func() { _ = e.db.SaveStateChange(targetID, prevStatus, "UP", "") }()
|
||||||
|
}
|
||||||
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -241,9 +258,6 @@ func (e *Engine) Start(ctx context.Context) {
|
|||||||
if !exists {
|
if !exists {
|
||||||
e.mu.Lock()
|
e.mu.Lock()
|
||||||
s.Status = "PENDING"
|
s.Status = "PENDING"
|
||||||
if s.Type == "push" {
|
|
||||||
s.LastCheck = time.Now()
|
|
||||||
}
|
|
||||||
if h, ok := e.GetHistory(s.ID); ok && len(h.Statuses) > 0 {
|
if h, ok := e.GetHistory(s.ID); ok && len(h.Statuses) > 0 {
|
||||||
if h.Statuses[len(h.Statuses)-1] {
|
if h.Statuses[len(h.Statuses)-1] {
|
||||||
s.Status = "UP"
|
s.Status = "UP"
|
||||||
@@ -401,11 +415,28 @@ func (e *Engine) checkByID(id int) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (e *Engine) checkPush(site models.Site) {
|
func (e *Engine) checkPush(site models.Site) {
|
||||||
deadline := site.LastCheck.Add(time.Duration(site.Interval) * time.Second).Add(pushGracePeriod)
|
if site.Status == "PENDING" {
|
||||||
if time.Now().After(deadline) {
|
return
|
||||||
e.handleStatusChange(site, "DOWN", 0, 0, "heartbeat missed")
|
}
|
||||||
} else if site.Status != "UP" {
|
|
||||||
e.handleStatusChange(site, "UP", 200, 0, "")
|
interval := time.Duration(site.Interval) * time.Second
|
||||||
|
grace := interval / 2
|
||||||
|
if grace < minPushGrace {
|
||||||
|
grace = minPushGrace
|
||||||
|
}
|
||||||
|
|
||||||
|
overdue := site.LastCheck.Add(interval)
|
||||||
|
graceEnd := overdue.Add(grace)
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
if now.After(graceEnd) {
|
||||||
|
if site.Status != "DOWN" {
|
||||||
|
e.handleStatusChange(site, "DOWN", 0, 0, "heartbeat missed")
|
||||||
|
}
|
||||||
|
} else if now.After(overdue) {
|
||||||
|
if site.Status != "LATE" {
|
||||||
|
e.handleStatusChange(site, "LATE", 0, 0, "heartbeat overdue")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -537,7 +537,7 @@ func TestCheckPush_DeadlineMissed(t *testing.T) {
|
|||||||
site := models.Site{
|
site := models.Site{
|
||||||
ID: 1, Name: "push", Type: "push", Status: "UP",
|
ID: 1, Name: "push", Type: "push", Status: "UP",
|
||||||
Interval: 10, MaxRetries: 0,
|
Interval: 10, MaxRetries: 0,
|
||||||
LastCheck: time.Now().Add(-20 * time.Second),
|
LastCheck: time.Now().Add(-120 * time.Second),
|
||||||
}
|
}
|
||||||
injectSite(e, site)
|
injectSite(e, site)
|
||||||
|
|
||||||
@@ -549,6 +549,24 @@ func TestCheckPush_DeadlineMissed(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCheckPush_OverdueBecomesLate(t *testing.T) {
|
||||||
|
ms := newMockStore()
|
||||||
|
e := newTestEngine(ms)
|
||||||
|
site := models.Site{
|
||||||
|
ID: 1, Name: "push", Type: "push", Status: "UP",
|
||||||
|
Interval: 300,
|
||||||
|
LastCheck: time.Now().Add(-310 * time.Second),
|
||||||
|
}
|
||||||
|
injectSite(e, site)
|
||||||
|
|
||||||
|
e.checkPush(site)
|
||||||
|
|
||||||
|
s, _ := getSite(e, 1)
|
||||||
|
if s.Status != "LATE" {
|
||||||
|
t.Errorf("expected LATE when overdue but within grace, got %s", s.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestCheckPush_WithinDeadline(t *testing.T) {
|
func TestCheckPush_WithinDeadline(t *testing.T) {
|
||||||
ms := newMockStore()
|
ms := newMockStore()
|
||||||
e := newTestEngine(ms)
|
e := newTestEngine(ms)
|
||||||
@@ -566,20 +584,20 @@ func TestCheckPush_WithinDeadline(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCheckPush_PendingToUp(t *testing.T) {
|
func TestCheckPush_PendingStaysPending(t *testing.T) {
|
||||||
ms := newMockStore()
|
ms := newMockStore()
|
||||||
e := newTestEngine(ms)
|
e := newTestEngine(ms)
|
||||||
site := models.Site{
|
site := models.Site{
|
||||||
ID: 1, Name: "push", Type: "push", Status: "PENDING",
|
ID: 1, Name: "push", Type: "push", Status: "PENDING",
|
||||||
Interval: 60, LastCheck: time.Now(),
|
Interval: 60,
|
||||||
}
|
}
|
||||||
injectSite(e, site)
|
injectSite(e, site)
|
||||||
|
|
||||||
e.checkPush(site)
|
e.checkPush(site)
|
||||||
|
|
||||||
s, _ := getSite(e, 1)
|
s, _ := getSite(e, 1)
|
||||||
if s.Status != "UP" {
|
if s.Status != "PENDING" {
|
||||||
t.Errorf("expected UP, got %s", s.Status)
|
t.Errorf("expected PENDING to stay until first heartbeat, got %s", s.Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ var statusTpl = template.Must(template.New("status").Parse(`
|
|||||||
.UP { background: #9ece6a; color: #1a1b26; }
|
.UP { background: #9ece6a; color: #1a1b26; }
|
||||||
.DOWN { background: #f7768e; color: #1a1b26; }
|
.DOWN { background: #f7768e; color: #1a1b26; }
|
||||||
.PENDING { background: #e0af68; color: #1a1b26; }
|
.PENDING { background: #e0af68; color: #1a1b26; }
|
||||||
|
.LATE { background: #e0af68; color: #1a1b26; }
|
||||||
.SSL-EXP { background: #e0af68; color: #1a1b26; }
|
.SSL-EXP { background: #e0af68; color: #1a1b26; }
|
||||||
.PAUSED { background: #565f89; color: #c0caf5; }
|
.PAUSED { background: #565f89; color: #c0caf5; }
|
||||||
.MAINT { background: #bb9af7; color: #1a1b26; }
|
.MAINT { background: #bb9af7; color: #1a1b26; }
|
||||||
|
|||||||
@@ -302,6 +302,8 @@ func fmtStatus(status string, paused bool, inMaint bool) string {
|
|||||||
switch status {
|
switch status {
|
||||||
case "DOWN", "SSL EXP":
|
case "DOWN", "SSL EXP":
|
||||||
return dangerStyle.Render(status)
|
return dangerStyle.Render(status)
|
||||||
|
case "LATE":
|
||||||
|
return warnStyle.Render(status)
|
||||||
case "PENDING":
|
case "PENDING":
|
||||||
return subtleStyle.Render(status)
|
return subtleStyle.Render(status)
|
||||||
default:
|
default:
|
||||||
@@ -412,7 +414,7 @@ func (m Model) viewSitesTab() string {
|
|||||||
name = limitStr(name, nameW)
|
name = limitStr(name, nameW)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
|
if (site.Status == "DOWN" || site.Status == "SSL EXP" || site.Status == "LATE") && site.LastError != "" {
|
||||||
nameLen := len([]rune(name))
|
nameLen := len([]rune(name))
|
||||||
errSpace := nameW - nameLen - 1
|
errSpace := nameW - nameLen - 1
|
||||||
if errSpace > 10 {
|
if errSpace > 10 {
|
||||||
@@ -764,7 +766,7 @@ func (m Model) viewDetailPanel() string {
|
|||||||
|
|
||||||
row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID)))
|
row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID)))
|
||||||
|
|
||||||
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
|
if (site.Status == "DOWN" || site.Status == "SSL EXP" || site.Status == "LATE") && site.LastError != "" {
|
||||||
row("Error", dangerStyle.Render(limitStr(site.LastError, 60)))
|
row("Error", dangerStyle.Render(limitStr(site.LastError, 60)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+21
-5
@@ -811,13 +811,20 @@ func (m Model) viewDashboard() string {
|
|||||||
allSites := m.engine.GetAllSites()
|
allSites := m.engine.GetAllSites()
|
||||||
totalMonitors := 0
|
totalMonitors := 0
|
||||||
downCount := 0
|
downCount := 0
|
||||||
|
lateCount := 0
|
||||||
for _, s := range allSites {
|
for _, s := range allSites {
|
||||||
if s.Type == "group" {
|
if s.Type == "group" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
totalMonitors++
|
totalMonitors++
|
||||||
if !s.Paused && !m.isMonitorInMaintenance(s.ID) && (s.Status == "DOWN" || s.Status == "SSL EXP") {
|
if s.Paused || m.isMonitorInMaintenance(s.ID) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch s.Status {
|
||||||
|
case "DOWN", "SSL EXP":
|
||||||
downCount++
|
downCount++
|
||||||
|
case "LATE":
|
||||||
|
lateCount++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
offlineNodes := 0
|
offlineNodes := 0
|
||||||
@@ -830,6 +837,8 @@ func (m Model) viewDashboard() string {
|
|||||||
var sitesLabel string
|
var sitesLabel string
|
||||||
if downCount > 0 {
|
if downCount > 0 {
|
||||||
sitesLabel = fmt.Sprintf("Sites (%d↓)", downCount)
|
sitesLabel = fmt.Sprintf("Sites (%d↓)", downCount)
|
||||||
|
} else if lateCount > 0 {
|
||||||
|
sitesLabel = fmt.Sprintf("Sites (%d⚠)", lateCount)
|
||||||
} else if totalMonitors > 0 {
|
} else if totalMonitors > 0 {
|
||||||
sitesLabel = fmt.Sprintf("Sites (%d)", totalMonitors)
|
sitesLabel = fmt.Sprintf("Sites (%d)", totalMonitors)
|
||||||
} else {
|
} else {
|
||||||
@@ -895,14 +904,19 @@ func (m Model) viewDashboard() string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
upCount := totalMonitors - downCount
|
upCount := totalMonitors - downCount - lateCount
|
||||||
var upStr string
|
var upStr string
|
||||||
if downCount > 0 {
|
if downCount > 0 {
|
||||||
upStr = dangerStyle.Render(fmt.Sprintf("%d/%d UP", upCount, totalMonitors))
|
upStr = dangerStyle.Render(fmt.Sprintf("%d/%d UP", upCount, totalMonitors))
|
||||||
|
} else if lateCount > 0 {
|
||||||
|
upStr = warnStyle.Render(fmt.Sprintf("%d/%d UP", upCount, totalMonitors))
|
||||||
} else {
|
} else {
|
||||||
upStr = specialStyle.Render(fmt.Sprintf("%d/%d UP", upCount, totalMonitors))
|
upStr = specialStyle.Render(fmt.Sprintf("%d/%d UP", upCount, totalMonitors))
|
||||||
}
|
}
|
||||||
statusParts := []string{upStr}
|
statusParts := []string{upStr}
|
||||||
|
if lateCount > 0 {
|
||||||
|
statusParts = append(statusParts, warnStyle.Render(fmt.Sprintf("%d LATE", lateCount)))
|
||||||
|
}
|
||||||
if len(m.nodes) > 0 {
|
if len(m.nodes) > 0 {
|
||||||
online := 0
|
online := 0
|
||||||
for _, n := range m.nodes {
|
for _, n := range m.nodes {
|
||||||
@@ -949,10 +963,12 @@ func siteOrder(s models.Site) int {
|
|||||||
switch s.Status {
|
switch s.Status {
|
||||||
case "DOWN", "SSL EXP":
|
case "DOWN", "SSL EXP":
|
||||||
return 0
|
return 0
|
||||||
case "PENDING":
|
case "LATE":
|
||||||
return 2
|
|
||||||
default:
|
|
||||||
return 1
|
return 1
|
||||||
|
case "PENDING":
|
||||||
|
return 3
|
||||||
|
default:
|
||||||
|
return 2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user