feat: show error reason when monitors go DOWN
CI / test (pull_request) Successful in 2m42s
CI / lint (pull_request) Successful in 1m11s
CI / vulncheck (pull_request) Successful in 51s

Propagate check failure reasons through the entire stack:
- Checker captures specific errors (DNS, timeout, HTTP status, SSL, etc.)
- Engine tracks LastError, StatusChangedAt, LastSuccessAt per monitor
- State transitions persisted to new state_changes table
- Detail panel shows error reason, HTTP code, state duration, last
  success time, and last 5 state change events
- Monitor table shows inline error preview for DOWN services
- Alert messages include error reason
- Probe nodes forward error reasons to leader

15 files changed across models, checker, engine, store, TUI, and probes.
This commit is contained in:
2026-05-27 19:32:30 -04:00
parent d8a2cab90f
commit bc3a44beac
15 changed files with 299 additions and 96 deletions
+2
View File
@@ -67,6 +67,8 @@ func (m *mockStore) DeleteMaintenanceWindow(int) error { retur
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil } func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil } func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil } func (m *mockStore) Close() error { return nil }
// --- Cluster Start Tests --- // --- Cluster Start Tests ---
+2
View File
@@ -130,6 +130,7 @@ type probeResultItem struct {
SiteID int `json:"site_id"` SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"` LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"` IsUp bool `json:"is_up"`
ErrorReason string `json:"error_reason,omitempty"`
} }
func probeExecuteChecks(ctx context.Context, sites []models.Site, strict, insecure *http.Client, allowPrivate bool) []probeResultItem { func probeExecuteChecks(ctx context.Context, sites []models.Site, strict, insecure *http.Client, allowPrivate bool) []probeResultItem {
@@ -157,6 +158,7 @@ loop:
SiteID: s.ID, SiteID: s.ID,
LatencyNs: cr.LatencyNs, LatencyNs: cr.LatencyNs,
IsUp: cr.Status == "UP", IsUp: cr.Status == "UP",
ErrorReason: cr.ErrorReason,
}) })
mu.Unlock() mu.Unlock()
}(site) }(site)
+5 -2
View File
@@ -2,13 +2,14 @@ package metrics
import ( import (
"context" "context"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"strings" "strings"
"testing" "testing"
"time" "time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
) )
type mockStore struct { type mockStore struct {
@@ -64,6 +65,8 @@ func (m *mockStore) DeleteMaintenanceWindow(int) error { retur
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil } func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil } func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil } func (m *mockStore) Close() error { return nil }
func TestMetricsHandler(t *testing.T) { func TestMetricsHandler(t *testing.T) {
+12
View File
@@ -35,6 +35,18 @@ type Site struct {
HasSSL bool HasSSL bool
LastCheck time.Time LastCheck time.Time
SentSSLWarning bool SentSSLWarning bool
LastError string
StatusChangedAt time.Time
LastSuccessAt time.Time
}
type StateChange struct {
ID int
SiteID int
FromStatus string
ToStatus string
ErrorReason string
ChangedAt time.Time
} }
type AlertConfig struct { type AlertConfig struct {
+1
View File
@@ -15,6 +15,7 @@ type NodeResult struct {
IsUp bool IsUp bool
LatencyNs int64 LatencyNs int64
CheckedAt time.Time CheckedAt time.Time
ErrorReason string
} }
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) { func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
+28 -9
View File
@@ -2,6 +2,7 @@ package monitor
import ( import (
"context" "context"
"fmt"
"net" "net"
"net/http" "net/http"
"strconv" "strconv"
@@ -21,6 +22,7 @@ type CheckResult struct {
LatencyNs int64 LatencyNs int64
HasSSL bool HasSSL bool
CertExpiry time.Time CertExpiry time.Time
ErrorReason string
} }
func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool, allowPrivate ...bool) CheckResult { func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool, allowPrivate ...bool) CheckResult {
@@ -35,7 +37,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
if ips, err := net.LookupIP(host); err == nil { if ips, err := net.LookupIP(host); err == nil {
for _, ip := range ips { for _, ip := range ips {
if isPrivateIP(ip) { if isPrivateIP(ip) {
return CheckResult{SiteID: site.ID, Status: "DOWN"} return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "target resolves to private IP"}
} }
} }
} }
@@ -52,7 +54,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
case "dns": case "dns":
return runDNSCheck(site) return runDNSCheck(site)
default: default:
return CheckResult{SiteID: site.ID, Status: "DOWN"} return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "unsupported monitor type: " + site.Type}
} }
} }
@@ -68,7 +70,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
req, err := http.NewRequestWithContext(ctx, method, site.URL, nil) req, err := http.NewRequestWithContext(ctx, method, site.URL, nil)
if err != nil { if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"} return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "invalid request: " + err.Error()}
} }
client := strict client := strict
@@ -88,6 +90,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
if err != nil { if err != nil {
result.Status = "DOWN" result.Status = "DOWN"
result.ErrorReason = truncateError(err.Error(), 256)
return result return result
} }
defer resp.Body.Close() defer resp.Body.Close()
@@ -95,6 +98,11 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
result.StatusCode = resp.StatusCode result.StatusCode = resp.StatusCode
if !isCodeAccepted(resp.StatusCode, site.AcceptedCodes) { if !isCodeAccepted(resp.StatusCode, site.AcceptedCodes) {
result.Status = "DOWN" result.Status = "DOWN"
expected := site.AcceptedCodes
if expected == "" {
expected = "200-299"
}
result.ErrorReason = fmt.Sprintf("HTTP %d (expected %s)", resp.StatusCode, expected)
} }
if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 { if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 {
@@ -103,6 +111,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
result.CertExpiry = cert.NotAfter result.CertExpiry = cert.NotAfter
if time.Now().After(cert.NotAfter) { if time.Now().After(cert.NotAfter) {
result.Status = "SSL EXP" result.Status = "SSL EXP"
result.ErrorReason = "SSL certificate expired"
} }
} }
@@ -117,7 +126,7 @@ func runPingCheck(site models.Site) CheckResult {
pinger, err := probing.NewPinger(host) pinger, err := probing.NewPinger(host)
if err != nil { if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"} return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "ping setup: " + err.Error()}
} }
pinger.Count = 1 pinger.Count = 1
pinger.Timeout = siteTimeout(site) pinger.Timeout = siteTimeout(site)
@@ -127,8 +136,11 @@ func runPingCheck(site models.Site) CheckResult {
err = pinger.Run() err = pinger.Run()
latency := time.Since(start) latency := time.Since(start)
if err != nil || pinger.Statistics().PacketsRecv == 0 { if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()} return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "ping failed: " + err.Error()}
}
if pinger.Statistics().PacketsRecv == 0 {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "no ICMP response"}
} }
stats := pinger.Statistics() stats := pinger.Statistics()
@@ -148,7 +160,7 @@ func runPortCheck(site models.Site) CheckResult {
latency := time.Since(start) latency := time.Since(start)
if err != nil { if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()} return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: truncateError(err.Error(), 256)}
} }
_ = conn.Close() _ = conn.Close()
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()} return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
@@ -199,10 +211,10 @@ func runDNSCheck(site models.Site) CheckResult {
latency := time.Since(start) latency := time.Since(start)
if err != nil { if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()} return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS query failed: " + err.Error()}
} }
if r.Rcode != dns.RcodeSuccess { if r.Rcode != dns.RcodeSuccess {
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds()} return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS RCODE: " + dns.RcodeToString[r.Rcode]}
} }
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()} return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
} }
@@ -235,3 +247,10 @@ func isCodeAccepted(code int, accepted string) bool {
} }
return false return false
} }
func truncateError(s string, max int) string {
if len(s) <= max {
return s
}
return s[:max-3] + "..."
}
+45 -6
View File
@@ -283,6 +283,9 @@ func (e *Engine) UpdateSiteConfig(site models.Site) {
site.LastCheck = existing.LastCheck site.LastCheck = existing.LastCheck
site.SentSSLWarning = existing.SentSSLWarning site.SentSSLWarning = existing.SentSSLWarning
site.FailureCount = existing.FailureCount site.FailureCount = existing.FailureCount
site.LastError = existing.LastError
site.StatusChangedAt = existing.StatusChangedAt
site.LastSuccessAt = existing.LastSuccessAt
e.liveState[site.ID] = site e.liveState[site.ID] = site
e.addToTokenIndex(site) e.addToTokenIndex(site)
} }
@@ -393,33 +396,45 @@ func (e *Engine) checkByID(id int) {
updatedSite.CertExpiry = result.CertExpiry updatedSite.CertExpiry = result.CertExpiry
updatedSite.Latency = time.Duration(result.LatencyNs) updatedSite.Latency = time.Duration(result.LatencyNs)
updatedSite.LastCheck = time.Now() updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs)) e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs), result.ErrorReason)
} }
} }
func (e *Engine) checkPush(site models.Site) { func (e *Engine) checkPush(site models.Site) {
deadline := site.LastCheck.Add(time.Duration(site.Interval) * time.Second).Add(pushGracePeriod) deadline := site.LastCheck.Add(time.Duration(site.Interval) * time.Second).Add(pushGracePeriod)
if time.Now().After(deadline) { if time.Now().After(deadline) {
e.handleStatusChange(site, "DOWN", 0, 0) e.handleStatusChange(site, "DOWN", 0, 0, "heartbeat missed")
} else if site.Status != "UP" { } else if site.Status != "UP" {
e.handleStatusChange(site, "UP", 200, 0) e.handleStatusChange(site, "UP", 200, 0, "")
} }
} }
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration) { func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration, errorReason string) {
if !e.IsActive() { if !e.IsActive() {
return return
} }
newState := site newState := site
newState.StatusCode = code newState.StatusCode = code
newState.LastError = errorReason
if rawStatus == "UP" {
newState.LastSuccessAt = time.Now()
newState.LastError = ""
} else {
newState.LastSuccessAt = site.LastSuccessAt
}
if site.Status == "UP" && rawStatus != "UP" { if site.Status == "UP" && rawStatus != "UP" {
newState.FailureCount++ newState.FailureCount++
if newState.FailureCount > site.MaxRetries { if newState.FailureCount > site.MaxRetries {
newState.Status = rawStatus newState.Status = rawStatus
newState.FailureCount = site.MaxRetries + 1 newState.FailureCount = site.MaxRetries + 1
if errorReason != "" {
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN: %s", site.Name, errorReason))
} else {
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name)) e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name))
}
} else { } else {
e.AddLog(fmt.Sprintf("Monitor '%s' failed check %d/%d", site.Name, newState.FailureCount, site.MaxRetries)) e.AddLog(fmt.Sprintf("Monitor '%s' failed check %d/%d", site.Name, newState.FailureCount, site.MaxRetries))
} }
@@ -431,6 +446,14 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
newState.FailureCount = site.MaxRetries + 1 newState.FailureCount = site.MaxRetries + 1
} }
if newState.Status != site.Status && site.Status != "PENDING" {
newState.StatusChangedAt = time.Now()
} else if site.StatusChangedAt.IsZero() && newState.Status != "PENDING" {
newState.StatusChangedAt = time.Now()
} else {
newState.StatusChangedAt = site.StatusChangedAt
}
inMaint := e.isInMaintenance(site.ID) inMaint := e.isInMaintenance(site.ID)
if site.Type == "http" && site.CheckSSL && site.HasSSL { if site.Type == "http" && site.CheckSSL && site.HasSSL {
@@ -455,12 +478,19 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
e.recordCheck(site.ID, latency, rawStatus == "UP") e.recordCheck(site.ID, latency, rawStatus == "UP")
if newState.Status != site.Status && site.Status != "PENDING" {
go func() { _ = e.db.SaveStateChange(site.ID, site.Status, newState.Status, errorReason) }()
}
isBroken := func(s string) bool { return s == "DOWN" || s == "SSL EXP" } isBroken := func(s string) bool { return s == "DOWN" || s == "SSL EXP" }
if !isBroken(site.Status) && isBroken(newState.Status) && newState.Status != "PENDING" { if !isBroken(site.Status) && isBroken(newState.Status) && newState.Status != "PENDING" {
if inMaint { if inMaint {
e.AddLog(fmt.Sprintf("Monitor '%s' is DOWN (alerts suppressed — maintenance)", site.Name)) e.AddLog(fmt.Sprintf("Monitor '%s' is DOWN (alerts suppressed — maintenance)", site.Name))
} else { } else {
msg := fmt.Sprintf("Monitor '%s' is DOWN (%s)", site.Name, rawStatus) msg := fmt.Sprintf("Monitor '%s' is DOWN (%s)", site.Name, rawStatus)
if errorReason != "" {
msg = fmt.Sprintf("Monitor '%s' is DOWN: %s", site.Name, errorReason)
}
if site.Type == "push" { if site.Type == "push" {
msg = fmt.Sprintf("Push Monitor '%s' missed heartbeat.", site.Name) msg = fmt.Sprintf("Push Monitor '%s' missed heartbeat.", site.Name)
} }
@@ -554,7 +584,7 @@ func (e *Engine) SetAggStrategy(strategy AggregationStrategy) {
e.aggStrategy = strategy e.aggStrategy = strategy
} }
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool) { func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool, errorReason string) {
e.probeResultsMu.Lock() e.probeResultsMu.Lock()
if e.probeResults[siteID] == nil { if e.probeResults[siteID] == nil {
e.probeResults[siteID] = make(map[string]NodeResult) e.probeResults[siteID] = make(map[string]NodeResult)
@@ -564,6 +594,7 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i
IsUp: isUp, IsUp: isUp,
LatencyNs: latencyNs, LatencyNs: latencyNs,
CheckedAt: time.Now(), CheckedAt: time.Now(),
ErrorReason: errorReason,
} }
results := make([]NodeResult, 0, len(e.probeResults[siteID])) results := make([]NodeResult, 0, len(e.probeResults[siteID]))
for _, r := range e.probeResults[siteID] { for _, r := range e.probeResults[siteID] {
@@ -588,7 +619,7 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i
updatedSite := site updatedSite := site
updatedSite.Latency = time.Duration(avgLatency) updatedSite.Latency = time.Duration(avgLatency)
updatedSite.LastCheck = time.Now() updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency)) e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency), errorReason)
} }
func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult { func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
@@ -601,3 +632,11 @@ func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
} }
return cp return cp
} }
func (e *Engine) GetStateChanges(siteID int, limit int) []models.StateChange {
changes, err := e.db.GetStateChanges(siteID, limit)
if err != nil {
return nil
}
return changes
}
+19 -16
View File
@@ -2,10 +2,11 @@ package monitor
import ( import (
"fmt" "fmt"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"sync" "sync"
"testing" "testing"
"time" "time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
) )
// --- Mock Store --- // --- Mock Store ---
@@ -73,6 +74,8 @@ func (m *mockStore) EndMaintenanceWindow(int) error { retur
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil } func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil } func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil } func (m *mockStore) Close() error { return nil }
func (m *mockStore) GetAllAlerts() ([]models.AlertConfig, error) { func (m *mockStore) GetAllAlerts() ([]models.AlertConfig, error) {
@@ -174,7 +177,7 @@ func TestHandleStatusChange_PendingToUp(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "PENDING", MaxRetries: 3, AlertID: 1} site := models.Site{ID: 1, Name: "test", Status: "PENDING", MaxRetries: 3, AlertID: 1}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 10*time.Millisecond) e.handleStatusChange(site, "UP", 200, 10*time.Millisecond, "")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "UP" { if s.Status != "UP" {
@@ -195,7 +198,7 @@ func TestHandleStatusChange_UpIncrementFailure(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 3, FailureCount: 0} site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 3, FailureCount: 0}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "DOWN", 500, 0) e.handleStatusChange(site, "DOWN", 500, 0, "test error")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "UP" { if s.Status != "UP" {
@@ -213,7 +216,7 @@ func TestHandleStatusChange_UpToDown_ExceedsRetries(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 2, FailureCount: 2, AlertID: 1} site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 2, FailureCount: 2, AlertID: 1}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "DOWN", 500, 0) e.handleStatusChange(site, "DOWN", 500, 0, "test error")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "DOWN" { if s.Status != "DOWN" {
@@ -236,7 +239,7 @@ func TestHandleStatusChange_UpToDown_ZeroRetries(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, FailureCount: 0, AlertID: 1} site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, FailureCount: 0, AlertID: 1}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0) e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "DOWN" { if s.Status != "DOWN" {
@@ -255,7 +258,7 @@ func TestHandleStatusChange_DownToUp_Recovery(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", FailureCount: 4, AlertID: 1} site := models.Site{ID: 1, Name: "test", Status: "DOWN", FailureCount: 4, AlertID: 1}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond) e.handleStatusChange(site, "UP", 200, 5*time.Millisecond, "")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "UP" { if s.Status != "UP" {
@@ -276,7 +279,7 @@ func TestHandleStatusChange_DownStaysDown(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", MaxRetries: 2, FailureCount: 3} site := models.Site{ID: 1, Name: "test", Status: "DOWN", MaxRetries: 2, FailureCount: 3}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0) e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "DOWN" { if s.Status != "DOWN" {
@@ -295,7 +298,7 @@ func TestHandleStatusChange_SSLExpired(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1} site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "SSL EXP", 0, 0) e.handleStatusChange(site, "SSL EXP", 0, 0, "SSL certificate expired")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "SSL EXP" { if s.Status != "SSL EXP" {
@@ -315,7 +318,7 @@ func TestHandleStatusChange_AlertSuppressedMaintenance(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1} site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0) e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "DOWN" { if s.Status != "DOWN" {
@@ -346,7 +349,7 @@ func TestHandleStatusChange_RecoverySuppressedMaintenance(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", AlertID: 1} site := models.Site{ID: 1, Name: "test", Status: "DOWN", AlertID: 1}
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0) e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "UP" { if s.Status != "UP" {
@@ -370,7 +373,7 @@ func TestHandleStatusChange_SSLWarning(t *testing.T) {
} }
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0) e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if !s.SentSSLWarning { if !s.SentSSLWarning {
@@ -393,7 +396,7 @@ func TestHandleStatusChange_SSLWarningNotRepeated(t *testing.T) {
} }
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0) e.handleStatusChange(site, "UP", 200, 0, "")
waitAsync() waitAsync()
if len(ms.getAlertCallsSnapshot()) != 0 { if len(ms.getAlertCallsSnapshot()) != 0 {
@@ -412,7 +415,7 @@ func TestHandleStatusChange_SSLWarningReset(t *testing.T) {
} }
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0) e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.SentSSLWarning { if s.SentSSLWarning {
@@ -433,7 +436,7 @@ func TestHandleStatusChange_SSLWarningSuppressedMaint(t *testing.T) {
} }
injectSite(e, site) injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0) e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if !s.SentSSLWarning { if !s.SentSSLWarning {
@@ -452,7 +455,7 @@ func TestHandleStatusChange_InactiveEngine(t *testing.T) {
injectSite(e, site) injectSite(e, site)
e.SetActive(false) e.SetActive(false)
e.handleStatusChange(site, "DOWN", 0, 0) e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1) s, _ := getSite(e, 1)
if s.Status != "UP" { if s.Status != "UP" {
@@ -991,7 +994,7 @@ func TestConcurrent_HandleStatusChangeAndGetState(t *testing.T) {
wg.Add(2) wg.Add(2)
go func() { go func() {
defer wg.Done() defer wg.Done()
e.handleStatusChange(site, "DOWN", 500, 0) e.handleStatusChange(site, "DOWN", 500, 0, "test error")
}() }()
go func() { go func() {
defer wg.Done() defer wg.Done()
+2 -1
View File
@@ -406,6 +406,7 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
SiteID int `json:"site_id"` SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"` LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"` IsUp bool `json:"is_up"`
ErrorReason string `json:"error_reason"`
} `json:"results"` } `json:"results"`
} }
if err := json.NewDecoder(r.Body).Decode(&req); err != nil { if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -420,7 +421,7 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
if err := s.SaveCheckFromNode(result.SiteID, req.NodeID, result.LatencyNs, result.IsUp); err != nil { if err := s.SaveCheckFromNode(result.SiteID, req.NodeID, result.LatencyNs, result.IsUp); err != nil {
log.Printf("Failed to save probe result: %v", err) log.Printf("Failed to save probe result: %v", err)
} }
eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp) eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp, result.ErrorReason)
} }
if err := s.UpdateNodeLastSeen(req.NodeID); err != nil { if err := s.UpdateNodeLastSeen(req.NodeID); err != nil {
log.Printf("Failed to update node last seen: %v", err) log.Printf("Failed to update node last seen: %v", err)
+5 -2
View File
@@ -4,13 +4,14 @@ import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"fmt" "fmt"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
"net" "net"
"net/http" "net/http"
"sync" "sync"
"testing" "testing"
"time" "time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
) )
// --- Mock Store --- // --- Mock Store ---
@@ -75,6 +76,8 @@ func (m *mockStore) DeleteMaintenanceWindow(int) error { retur
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil } func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil } func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil } func (m *mockStore) Close() error { return nil }
func (m *mockStore) ExportData() (models.Backup, error) { func (m *mockStore) ExportData() (models.Backup, error) {
+9
View File
@@ -72,6 +72,15 @@ func (d *PostgresDialect) CreateTablesSQL() []string {
key TEXT PRIMARY KEY, key TEXT PRIMARY KEY,
value TEXT NOT NULL value TEXT NOT NULL
)`, )`,
`CREATE TABLE IF NOT EXISTS state_changes (
id SERIAL PRIMARY KEY,
site_id INTEGER NOT NULL,
from_status TEXT NOT NULL,
to_status TEXT NOT NULL,
error_reason TEXT DEFAULT '',
changed_at TIMESTAMP DEFAULT NOW()
)`,
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
} }
} }
+9
View File
@@ -79,6 +79,15 @@ func (d *SQLiteDialect) CreateTablesSQL() []string {
key TEXT PRIMARY KEY, key TEXT PRIMARY KEY,
value TEXT NOT NULL value TEXT NOT NULL
)`, )`,
`CREATE TABLE IF NOT EXISTS state_changes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
site_id INTEGER NOT NULL,
from_status TEXT NOT NULL,
to_status TEXT NOT NULL,
error_reason TEXT DEFAULT '',
changed_at DATETIME DEFAULT CURRENT_TIMESTAMP
)`,
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
} }
} }
+23
View File
@@ -347,6 +347,29 @@ func (s *SQLStore) DeleteUser(id int) error {
return err return err
} }
func (s *SQLStore) SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error {
_, err := s.db.Exec(s.q("INSERT INTO state_changes (site_id, from_status, to_status, error_reason) VALUES (?, ?, ?, ?)"),
siteID, fromStatus, toStatus, errorReason)
return err
}
func (s *SQLStore) GetStateChanges(siteID int, limit int) ([]models.StateChange, error) {
rows, err := s.db.Query(s.q("SELECT id, site_id, from_status, to_status, error_reason, changed_at FROM state_changes WHERE site_id = ? ORDER BY changed_at DESC LIMIT ?"), siteID, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var changes []models.StateChange
for rows.Next() {
var sc models.StateChange
if err := rows.Scan(&sc.ID, &sc.SiteID, &sc.FromStatus, &sc.ToStatus, &sc.ErrorReason, &sc.ChangedAt); err != nil {
return changes, err
}
changes = append(changes, sc)
}
return changes, rows.Err()
}
func (s *SQLStore) SaveCheck(siteID int, latencyNs int64, isUp bool) error { func (s *SQLStore) SaveCheck(siteID int, latencyNs int64, isUp bool) error {
return s.SaveCheckFromNode(siteID, "", latencyNs, isUp) return s.SaveCheckFromNode(siteID, "", latencyNs, isUp)
} }
+4
View File
@@ -38,6 +38,10 @@ type Store interface {
SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error
LoadAllHistory(limit int) (map[int][]models.CheckRecord, error) LoadAllHistory(limit int) (map[int][]models.CheckRecord, error)
// State Changes
SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error
GetStateChanges(siteID int, limit int) ([]models.StateChange, error)
// Nodes // Nodes
RegisterNode(node models.ProbeNode) error RegisterNode(node models.ProbeNode) error
GetNode(id string) (models.ProbeNode, error) GetNode(id string) (models.ProbeNode, error)
+74 -1
View File
@@ -309,6 +309,29 @@ func fmtStatus(status string, paused bool, inMaint bool) string {
} }
} }
func fmtDuration(d time.Duration) string {
if d < time.Minute {
return fmt.Sprintf("%ds", int(d.Seconds()))
}
if d < time.Hour {
return fmt.Sprintf("%dm", int(d.Minutes()))
}
if d < 24*time.Hour {
h := int(d.Hours())
m := int(d.Minutes()) % 60
if m > 0 {
return fmt.Sprintf("%dh %dm", h, m)
}
return fmt.Sprintf("%dh", h)
}
days := int(d.Hours()) / 24
hours := int(d.Hours()) % 24
if hours > 0 {
return fmt.Sprintf("%dd %dh", days, hours)
}
return fmt.Sprintf("%dd", days)
}
func (m Model) dynamicWidths() (nameW, sparkW int) { func (m Model) dynamicWidths() (nameW, sparkW int) {
fixed := 6 + 10 + 10 + 8 + 8 + 7 + 9 // #, TYPE, STATUS, LATENCY, UPTIME, SSL, RETRY fixed := 6 + 10 + 10 + 8 + 8 + 7 + 9 // #, TYPE, STATUS, LATENCY, UPTIME, SSL, RETRY
overhead := 30 // cell padding + borders overhead := 30 // cell padding + borders
@@ -389,6 +412,14 @@ func (m Model) viewSitesTab() string {
name = limitStr(name, nameW) name = limitStr(name, nameW)
} }
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
nameLen := len([]rune(name))
errSpace := nameW - nameLen - 1
if errSpace > 10 {
name = name + " " + subtleStyle.Render(limitStr(site.LastError, errSpace))
}
}
hist, _ := m.engine.GetHistory(site.ID) hist, _ := m.engine.GetHistory(site.ID)
var spark string var spark string
if site.Type == "push" { if site.Type == "push" {
@@ -732,6 +763,25 @@ func (m Model) viewDetailPanel() string {
} }
row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID))) row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID)))
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
row("Error", dangerStyle.Render(limitStr(site.LastError, 60)))
}
if site.Type == "http" && site.StatusCode > 0 {
row("HTTP Code", strconv.Itoa(site.StatusCode))
}
if !site.StatusChangedAt.IsZero() {
dur := time.Since(site.StatusChangedAt)
row("State Since", site.StatusChangedAt.Format("2006-01-02 15:04:05")+" ("+fmtDuration(dur)+")")
}
if !site.LastSuccessAt.IsZero() {
ago := time.Since(site.LastSuccessAt)
row("Last Success", site.LastSuccessAt.Format("15:04:05")+" ("+fmtDuration(ago)+" ago)")
}
if m.isMonitorInMaintenance(site.ID) { if m.isMonitorInMaintenance(site.ID) {
for _, mw := range m.maintenanceWindows { for _, mw := range m.maintenanceWindows {
if mw.Type == "maintenance" && (mw.MonitorID == 0 || mw.MonitorID == site.ID || mw.MonitorID == site.ParentID) { if mw.Type == "maintenance" && (mw.MonitorID == 0 || mw.MonitorID == site.ID || mw.MonitorID == site.ParentID) {
@@ -787,7 +837,30 @@ func (m Model) viewDetailPanel() string {
} }
latency := time.Duration(result.LatencyNs).Milliseconds() latency := time.Duration(result.LatencyNs).Milliseconds()
ago := time.Since(result.CheckedAt).Truncate(time.Second) ago := time.Since(result.CheckedAt).Truncate(time.Second)
fmt.Fprintf(&b, " %-14s %s %dms %s ago\n", nodeID, status, latency, ago) line := fmt.Sprintf(" %-14s %s %dms %s ago", nodeID, status, latency, ago)
if !result.IsUp && result.ErrorReason != "" {
line += " " + dangerStyle.Render(limitStr(result.ErrorReason, 30))
}
b.WriteString(line + "\n")
}
}
stateChanges := m.engine.GetStateChanges(site.ID, 5)
if len(stateChanges) > 0 {
b.WriteString("\n" + subtleStyle.Render(" STATE CHANGES") + "\n")
for _, sc := range stateChanges {
ago := fmtDuration(time.Since(sc.ChangedAt))
arrow := subtleStyle.Render(sc.FromStatus) + " → "
if sc.ToStatus == "UP" {
arrow += specialStyle.Render(sc.ToStatus)
} else {
arrow += dangerStyle.Render(sc.ToStatus)
}
line := fmt.Sprintf(" %s %s", arrow, subtleStyle.Render(ago+" ago"))
if sc.ErrorReason != "" && sc.ToStatus != "UP" {
line += " " + dangerStyle.Render(limitStr(sc.ErrorReason, 40))
}
b.WriteString(line + "\n")
} }
} }