From bc3a44beac493594b4e508594a0e1efa8a12973e Mon Sep 17 00:00:00 2001 From: Tyler Koenig Date: Wed, 27 May 2026 19:32:30 -0400 Subject: [PATCH] feat: show error reason when monitors go DOWN Propagate check failure reasons through the entire stack: - Checker captures specific errors (DNS, timeout, HTTP status, SSL, etc.) - Engine tracks LastError, StatusChangedAt, LastSuccessAt per monitor - State transitions persisted to new state_changes table - Detail panel shows error reason, HTTP code, state duration, last success time, and last 5 state change events - Monitor table shows inline error preview for DOWN services - Alert messages include error reason - Probe nodes forward error reasons to leader 15 files changed across models, checker, engine, store, TUI, and probes. --- internal/cluster/cluster_test.go | 16 +++--- internal/cluster/probe.go | 14 +++--- internal/metrics/prometheus_test.go | 21 ++++---- internal/models/models.go | 28 ++++++++--- internal/monitor/aggregator.go | 9 ++-- internal/monitor/checker.go | 49 +++++++++++++------ internal/monitor/monitor.go | 61 ++++++++++++++++++----- internal/monitor/monitor_test.go | 47 +++++++++--------- internal/server/server.go | 9 ++-- internal/server/server_test.go | 21 ++++---- internal/store/postgres.go | 9 ++++ internal/store/sqlite.go | 9 ++++ internal/store/sqlstore.go | 23 +++++++++ internal/store/store.go | 4 ++ internal/tui/tab_sites.go | 75 ++++++++++++++++++++++++++++- 15 files changed, 299 insertions(+), 96 deletions(-) diff --git a/internal/cluster/cluster_test.go b/internal/cluster/cluster_test.go index 083b49f..d665bbb 100644 --- a/internal/cluster/cluster_test.go +++ b/internal/cluster/cluster_test.go @@ -61,13 +61,15 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) { return nil, nil } -func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } -func (m *mockStore) EndMaintenanceWindow(int) error { return nil } -func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } -func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } -func (m *mockStore) GetPreference(string) (string, error) { return "", nil } -func (m *mockStore) SetPreference(string, string) error { return nil } -func (m *mockStore) Close() error { return nil } +func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } +func (m *mockStore) EndMaintenanceWindow(int) error { return nil } +func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } +func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } +func (m *mockStore) GetPreference(string) (string, error) { return "", nil } +func (m *mockStore) SetPreference(string, string) error { return nil } +func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil } +func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil } +func (m *mockStore) Close() error { return nil } // --- Cluster Start Tests --- diff --git a/internal/cluster/probe.go b/internal/cluster/probe.go index 5037675..e5c0988 100644 --- a/internal/cluster/probe.go +++ b/internal/cluster/probe.go @@ -127,9 +127,10 @@ func probeFetchAssignments(ctx context.Context, client *http.Client, cfg ProbeCo } type probeResultItem struct { - SiteID int `json:"site_id"` - LatencyNs int64 `json:"latency_ns"` - IsUp bool `json:"is_up"` + SiteID int `json:"site_id"` + LatencyNs int64 `json:"latency_ns"` + IsUp bool `json:"is_up"` + ErrorReason string `json:"error_reason,omitempty"` } func probeExecuteChecks(ctx context.Context, sites []models.Site, strict, insecure *http.Client, allowPrivate bool) []probeResultItem { @@ -154,9 +155,10 @@ loop: cr := monitor.RunCheck(s, strict, insecure, false, allowPrivate) mu.Lock() results = append(results, probeResultItem{ - SiteID: s.ID, - LatencyNs: cr.LatencyNs, - IsUp: cr.Status == "UP", + SiteID: s.ID, + LatencyNs: cr.LatencyNs, + IsUp: cr.Status == "UP", + ErrorReason: cr.ErrorReason, }) mu.Unlock() }(site) diff --git a/internal/metrics/prometheus_test.go b/internal/metrics/prometheus_test.go index 2fbccf7..24f6567 100644 --- a/internal/metrics/prometheus_test.go +++ b/internal/metrics/prometheus_test.go @@ -2,13 +2,14 @@ package metrics import ( "context" - "gitea.lerkolabs.com/lerko/uptop/internal/models" - "gitea.lerkolabs.com/lerko/uptop/internal/monitor" "net/http" "net/http/httptest" "strings" "testing" "time" + + "gitea.lerkolabs.com/lerko/uptop/internal/models" + "gitea.lerkolabs.com/lerko/uptop/internal/monitor" ) type mockStore struct { @@ -58,13 +59,15 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) { return nil, nil } -func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } -func (m *mockStore) EndMaintenanceWindow(int) error { return nil } -func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } -func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } -func (m *mockStore) GetPreference(string) (string, error) { return "", nil } -func (m *mockStore) SetPreference(string, string) error { return nil } -func (m *mockStore) Close() error { return nil } +func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } +func (m *mockStore) EndMaintenanceWindow(int) error { return nil } +func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } +func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } +func (m *mockStore) GetPreference(string) (string, error) { return "", nil } +func (m *mockStore) SetPreference(string, string) error { return nil } +func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil } +func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil } +func (m *mockStore) Close() error { return nil } func TestMetricsHandler(t *testing.T) { ms := &mockStore{ diff --git a/internal/models/models.go b/internal/models/models.go index 19179c6..571d555 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -27,14 +27,26 @@ type Site struct { Paused bool Regions string - FailureCount int - Status string - StatusCode int - Latency time.Duration - CertExpiry time.Time - HasSSL bool - LastCheck time.Time - SentSSLWarning bool + FailureCount int + Status string + StatusCode int + Latency time.Duration + CertExpiry time.Time + HasSSL bool + LastCheck time.Time + SentSSLWarning bool + LastError string + StatusChangedAt time.Time + LastSuccessAt time.Time +} + +type StateChange struct { + ID int + SiteID int + FromStatus string + ToStatus string + ErrorReason string + ChangedAt time.Time } type AlertConfig struct { diff --git a/internal/monitor/aggregator.go b/internal/monitor/aggregator.go index 88054c8..d610331 100644 --- a/internal/monitor/aggregator.go +++ b/internal/monitor/aggregator.go @@ -11,10 +11,11 @@ const ( ) type NodeResult struct { - NodeID string - IsUp bool - LatencyNs int64 - CheckedAt time.Time + NodeID string + IsUp bool + LatencyNs int64 + CheckedAt time.Time + ErrorReason string } func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) { diff --git a/internal/monitor/checker.go b/internal/monitor/checker.go index 30795f4..7b879a2 100644 --- a/internal/monitor/checker.go +++ b/internal/monitor/checker.go @@ -2,6 +2,7 @@ package monitor import ( "context" + "fmt" "net" "net/http" "strconv" @@ -15,12 +16,13 @@ import ( ) type CheckResult struct { - SiteID int - Status string // "UP", "DOWN", "SSL EXP" - StatusCode int - LatencyNs int64 - HasSSL bool - CertExpiry time.Time + SiteID int + Status string // "UP", "DOWN", "SSL EXP" + StatusCode int + LatencyNs int64 + HasSSL bool + CertExpiry time.Time + ErrorReason string } func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool, allowPrivate ...bool) CheckResult { @@ -35,7 +37,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo if ips, err := net.LookupIP(host); err == nil { for _, ip := range ips { if isPrivateIP(ip) { - return CheckResult{SiteID: site.ID, Status: "DOWN"} + return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "target resolves to private IP"} } } } @@ -52,7 +54,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo case "dns": return runDNSCheck(site) default: - return CheckResult{SiteID: site.ID, Status: "DOWN"} + return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "unsupported monitor type: " + site.Type} } } @@ -68,7 +70,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur req, err := http.NewRequestWithContext(ctx, method, site.URL, nil) if err != nil { - return CheckResult{SiteID: site.ID, Status: "DOWN"} + return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "invalid request: " + err.Error()} } client := strict @@ -88,6 +90,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur if err != nil { result.Status = "DOWN" + result.ErrorReason = truncateError(err.Error(), 256) return result } defer resp.Body.Close() @@ -95,6 +98,11 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur result.StatusCode = resp.StatusCode if !isCodeAccepted(resp.StatusCode, site.AcceptedCodes) { result.Status = "DOWN" + expected := site.AcceptedCodes + if expected == "" { + expected = "200-299" + } + result.ErrorReason = fmt.Sprintf("HTTP %d (expected %s)", resp.StatusCode, expected) } if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 { @@ -103,6 +111,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur result.CertExpiry = cert.NotAfter if time.Now().After(cert.NotAfter) { result.Status = "SSL EXP" + result.ErrorReason = "SSL certificate expired" } } @@ -117,7 +126,7 @@ func runPingCheck(site models.Site) CheckResult { pinger, err := probing.NewPinger(host) if err != nil { - return CheckResult{SiteID: site.ID, Status: "DOWN"} + return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "ping setup: " + err.Error()} } pinger.Count = 1 pinger.Timeout = siteTimeout(site) @@ -127,8 +136,11 @@ func runPingCheck(site models.Site) CheckResult { err = pinger.Run() latency := time.Since(start) - if err != nil || pinger.Statistics().PacketsRecv == 0 { - return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()} + if err != nil { + return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "ping failed: " + err.Error()} + } + if pinger.Statistics().PacketsRecv == 0 { + return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "no ICMP response"} } stats := pinger.Statistics() @@ -148,7 +160,7 @@ func runPortCheck(site models.Site) CheckResult { latency := time.Since(start) if err != nil { - return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()} + return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: truncateError(err.Error(), 256)} } _ = conn.Close() return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()} @@ -199,10 +211,10 @@ func runDNSCheck(site models.Site) CheckResult { latency := time.Since(start) if err != nil { - return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()} + return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS query failed: " + err.Error()} } if r.Rcode != dns.RcodeSuccess { - return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds()} + return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS RCODE: " + dns.RcodeToString[r.Rcode]} } return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()} } @@ -235,3 +247,10 @@ func isCodeAccepted(code int, accepted string) bool { } return false } + +func truncateError(s string, max int) string { + if len(s) <= max { + return s + } + return s[:max-3] + "..." +} diff --git a/internal/monitor/monitor.go b/internal/monitor/monitor.go index b31e0c5..997fe77 100644 --- a/internal/monitor/monitor.go +++ b/internal/monitor/monitor.go @@ -283,6 +283,9 @@ func (e *Engine) UpdateSiteConfig(site models.Site) { site.LastCheck = existing.LastCheck site.SentSSLWarning = existing.SentSSLWarning site.FailureCount = existing.FailureCount + site.LastError = existing.LastError + site.StatusChangedAt = existing.StatusChangedAt + site.LastSuccessAt = existing.LastSuccessAt e.liveState[site.ID] = site e.addToTokenIndex(site) } @@ -393,33 +396,45 @@ func (e *Engine) checkByID(id int) { updatedSite.CertExpiry = result.CertExpiry updatedSite.Latency = time.Duration(result.LatencyNs) updatedSite.LastCheck = time.Now() - e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs)) + e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs), result.ErrorReason) } } func (e *Engine) checkPush(site models.Site) { deadline := site.LastCheck.Add(time.Duration(site.Interval) * time.Second).Add(pushGracePeriod) if time.Now().After(deadline) { - e.handleStatusChange(site, "DOWN", 0, 0) + e.handleStatusChange(site, "DOWN", 0, 0, "heartbeat missed") } else if site.Status != "UP" { - e.handleStatusChange(site, "UP", 200, 0) + e.handleStatusChange(site, "UP", 200, 0, "") } } -func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration) { +func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration, errorReason string) { if !e.IsActive() { return } newState := site newState.StatusCode = code + newState.LastError = errorReason + + if rawStatus == "UP" { + newState.LastSuccessAt = time.Now() + newState.LastError = "" + } else { + newState.LastSuccessAt = site.LastSuccessAt + } if site.Status == "UP" && rawStatus != "UP" { newState.FailureCount++ if newState.FailureCount > site.MaxRetries { newState.Status = rawStatus newState.FailureCount = site.MaxRetries + 1 - e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name)) + if errorReason != "" { + e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN: %s", site.Name, errorReason)) + } else { + e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name)) + } } else { e.AddLog(fmt.Sprintf("Monitor '%s' failed check %d/%d", site.Name, newState.FailureCount, site.MaxRetries)) } @@ -431,6 +446,14 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int newState.FailureCount = site.MaxRetries + 1 } + if newState.Status != site.Status && site.Status != "PENDING" { + newState.StatusChangedAt = time.Now() + } else if site.StatusChangedAt.IsZero() && newState.Status != "PENDING" { + newState.StatusChangedAt = time.Now() + } else { + newState.StatusChangedAt = site.StatusChangedAt + } + inMaint := e.isInMaintenance(site.ID) if site.Type == "http" && site.CheckSSL && site.HasSSL { @@ -455,12 +478,19 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int e.recordCheck(site.ID, latency, rawStatus == "UP") + if newState.Status != site.Status && site.Status != "PENDING" { + go func() { _ = e.db.SaveStateChange(site.ID, site.Status, newState.Status, errorReason) }() + } + isBroken := func(s string) bool { return s == "DOWN" || s == "SSL EXP" } if !isBroken(site.Status) && isBroken(newState.Status) && newState.Status != "PENDING" { if inMaint { e.AddLog(fmt.Sprintf("Monitor '%s' is DOWN (alerts suppressed — maintenance)", site.Name)) } else { msg := fmt.Sprintf("Monitor '%s' is DOWN (%s)", site.Name, rawStatus) + if errorReason != "" { + msg = fmt.Sprintf("Monitor '%s' is DOWN: %s", site.Name, errorReason) + } if site.Type == "push" { msg = fmt.Sprintf("Push Monitor '%s' missed heartbeat.", site.Name) } @@ -554,16 +584,17 @@ func (e *Engine) SetAggStrategy(strategy AggregationStrategy) { e.aggStrategy = strategy } -func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool) { +func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool, errorReason string) { e.probeResultsMu.Lock() if e.probeResults[siteID] == nil { e.probeResults[siteID] = make(map[string]NodeResult) } e.probeResults[siteID][nodeID] = NodeResult{ - NodeID: nodeID, - IsUp: isUp, - LatencyNs: latencyNs, - CheckedAt: time.Now(), + NodeID: nodeID, + IsUp: isUp, + LatencyNs: latencyNs, + CheckedAt: time.Now(), + ErrorReason: errorReason, } results := make([]NodeResult, 0, len(e.probeResults[siteID])) for _, r := range e.probeResults[siteID] { @@ -588,7 +619,7 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i updatedSite := site updatedSite.Latency = time.Duration(avgLatency) updatedSite.LastCheck = time.Now() - e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency)) + e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency), errorReason) } func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult { @@ -601,3 +632,11 @@ func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult { } return cp } + +func (e *Engine) GetStateChanges(siteID int, limit int) []models.StateChange { + changes, err := e.db.GetStateChanges(siteID, limit) + if err != nil { + return nil + } + return changes +} diff --git a/internal/monitor/monitor_test.go b/internal/monitor/monitor_test.go index f4c778b..148cf6d 100644 --- a/internal/monitor/monitor_test.go +++ b/internal/monitor/monitor_test.go @@ -2,10 +2,11 @@ package monitor import ( "fmt" - "gitea.lerkolabs.com/lerko/uptop/internal/models" "sync" "testing" "time" + + "gitea.lerkolabs.com/lerko/uptop/internal/models" ) // --- Mock Store --- @@ -68,12 +69,14 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) { return nil, nil } -func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } -func (m *mockStore) EndMaintenanceWindow(int) error { return nil } -func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } -func (m *mockStore) GetPreference(string) (string, error) { return "", nil } -func (m *mockStore) SetPreference(string, string) error { return nil } -func (m *mockStore) Close() error { return nil } +func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } +func (m *mockStore) EndMaintenanceWindow(int) error { return nil } +func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } +func (m *mockStore) GetPreference(string) (string, error) { return "", nil } +func (m *mockStore) SetPreference(string, string) error { return nil } +func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil } +func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil } +func (m *mockStore) Close() error { return nil } func (m *mockStore) GetAllAlerts() ([]models.AlertConfig, error) { m.mu.Lock() @@ -174,7 +177,7 @@ func TestHandleStatusChange_PendingToUp(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "PENDING", MaxRetries: 3, AlertID: 1} injectSite(e, site) - e.handleStatusChange(site, "UP", 200, 10*time.Millisecond) + e.handleStatusChange(site, "UP", 200, 10*time.Millisecond, "") s, _ := getSite(e, 1) if s.Status != "UP" { @@ -195,7 +198,7 @@ func TestHandleStatusChange_UpIncrementFailure(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 3, FailureCount: 0} injectSite(e, site) - e.handleStatusChange(site, "DOWN", 500, 0) + e.handleStatusChange(site, "DOWN", 500, 0, "test error") s, _ := getSite(e, 1) if s.Status != "UP" { @@ -213,7 +216,7 @@ func TestHandleStatusChange_UpToDown_ExceedsRetries(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 2, FailureCount: 2, AlertID: 1} injectSite(e, site) - e.handleStatusChange(site, "DOWN", 500, 0) + e.handleStatusChange(site, "DOWN", 500, 0, "test error") s, _ := getSite(e, 1) if s.Status != "DOWN" { @@ -236,7 +239,7 @@ func TestHandleStatusChange_UpToDown_ZeroRetries(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, FailureCount: 0, AlertID: 1} injectSite(e, site) - e.handleStatusChange(site, "DOWN", 0, 0) + e.handleStatusChange(site, "DOWN", 0, 0, "test error") s, _ := getSite(e, 1) if s.Status != "DOWN" { @@ -255,7 +258,7 @@ func TestHandleStatusChange_DownToUp_Recovery(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "DOWN", FailureCount: 4, AlertID: 1} injectSite(e, site) - e.handleStatusChange(site, "UP", 200, 5*time.Millisecond) + e.handleStatusChange(site, "UP", 200, 5*time.Millisecond, "") s, _ := getSite(e, 1) if s.Status != "UP" { @@ -276,7 +279,7 @@ func TestHandleStatusChange_DownStaysDown(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "DOWN", MaxRetries: 2, FailureCount: 3} injectSite(e, site) - e.handleStatusChange(site, "DOWN", 0, 0) + e.handleStatusChange(site, "DOWN", 0, 0, "test error") s, _ := getSite(e, 1) if s.Status != "DOWN" { @@ -295,7 +298,7 @@ func TestHandleStatusChange_SSLExpired(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1} injectSite(e, site) - e.handleStatusChange(site, "SSL EXP", 0, 0) + e.handleStatusChange(site, "SSL EXP", 0, 0, "SSL certificate expired") s, _ := getSite(e, 1) if s.Status != "SSL EXP" { @@ -315,7 +318,7 @@ func TestHandleStatusChange_AlertSuppressedMaintenance(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1} injectSite(e, site) - e.handleStatusChange(site, "DOWN", 0, 0) + e.handleStatusChange(site, "DOWN", 0, 0, "test error") s, _ := getSite(e, 1) if s.Status != "DOWN" { @@ -346,7 +349,7 @@ func TestHandleStatusChange_RecoverySuppressedMaintenance(t *testing.T) { site := models.Site{ID: 1, Name: "test", Status: "DOWN", AlertID: 1} injectSite(e, site) - e.handleStatusChange(site, "UP", 200, 0) + e.handleStatusChange(site, "UP", 200, 0, "") s, _ := getSite(e, 1) if s.Status != "UP" { @@ -370,7 +373,7 @@ func TestHandleStatusChange_SSLWarning(t *testing.T) { } injectSite(e, site) - e.handleStatusChange(site, "UP", 200, 0) + e.handleStatusChange(site, "UP", 200, 0, "") s, _ := getSite(e, 1) if !s.SentSSLWarning { @@ -393,7 +396,7 @@ func TestHandleStatusChange_SSLWarningNotRepeated(t *testing.T) { } injectSite(e, site) - e.handleStatusChange(site, "UP", 200, 0) + e.handleStatusChange(site, "UP", 200, 0, "") waitAsync() if len(ms.getAlertCallsSnapshot()) != 0 { @@ -412,7 +415,7 @@ func TestHandleStatusChange_SSLWarningReset(t *testing.T) { } injectSite(e, site) - e.handleStatusChange(site, "UP", 200, 0) + e.handleStatusChange(site, "UP", 200, 0, "") s, _ := getSite(e, 1) if s.SentSSLWarning { @@ -433,7 +436,7 @@ func TestHandleStatusChange_SSLWarningSuppressedMaint(t *testing.T) { } injectSite(e, site) - e.handleStatusChange(site, "UP", 200, 0) + e.handleStatusChange(site, "UP", 200, 0, "") s, _ := getSite(e, 1) if !s.SentSSLWarning { @@ -452,7 +455,7 @@ func TestHandleStatusChange_InactiveEngine(t *testing.T) { injectSite(e, site) e.SetActive(false) - e.handleStatusChange(site, "DOWN", 0, 0) + e.handleStatusChange(site, "DOWN", 0, 0, "test error") s, _ := getSite(e, 1) if s.Status != "UP" { @@ -991,7 +994,7 @@ func TestConcurrent_HandleStatusChangeAndGetState(t *testing.T) { wg.Add(2) go func() { defer wg.Done() - e.handleStatusChange(site, "DOWN", 500, 0) + e.handleStatusChange(site, "DOWN", 500, 0, "test error") }() go func() { defer wg.Done() diff --git a/internal/server/server.go b/internal/server/server.go index 85b791d..463aba9 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -403,9 +403,10 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server { var req struct { NodeID string `json:"node_id"` Results []struct { - SiteID int `json:"site_id"` - LatencyNs int64 `json:"latency_ns"` - IsUp bool `json:"is_up"` + SiteID int `json:"site_id"` + LatencyNs int64 `json:"latency_ns"` + IsUp bool `json:"is_up"` + ErrorReason string `json:"error_reason"` } `json:"results"` } if err := json.NewDecoder(r.Body).Decode(&req); err != nil { @@ -420,7 +421,7 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server { if err := s.SaveCheckFromNode(result.SiteID, req.NodeID, result.LatencyNs, result.IsUp); err != nil { log.Printf("Failed to save probe result: %v", err) } - eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp) + eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp, result.ErrorReason) } if err := s.UpdateNodeLastSeen(req.NodeID); err != nil { log.Printf("Failed to update node last seen: %v", err) diff --git a/internal/server/server_test.go b/internal/server/server_test.go index f0a9965..2e9de56 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -4,13 +4,14 @@ import ( "bytes" "encoding/json" "fmt" - "gitea.lerkolabs.com/lerko/uptop/internal/models" - "gitea.lerkolabs.com/lerko/uptop/internal/monitor" "net" "net/http" "sync" "testing" "time" + + "gitea.lerkolabs.com/lerko/uptop/internal/models" + "gitea.lerkolabs.com/lerko/uptop/internal/monitor" ) // --- Mock Store --- @@ -69,13 +70,15 @@ func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) { return nil, nil } -func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } -func (m *mockStore) EndMaintenanceWindow(int) error { return nil } -func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } -func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } -func (m *mockStore) GetPreference(string) (string, error) { return "", nil } -func (m *mockStore) SetPreference(string, string) error { return nil } -func (m *mockStore) Close() error { return nil } +func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil } +func (m *mockStore) EndMaintenanceWindow(int) error { return nil } +func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil } +func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil } +func (m *mockStore) GetPreference(string) (string, error) { return "", nil } +func (m *mockStore) SetPreference(string, string) error { return nil } +func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil } +func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil } +func (m *mockStore) Close() error { return nil } func (m *mockStore) ExportData() (models.Backup, error) { return models.Backup{ diff --git a/internal/store/postgres.go b/internal/store/postgres.go index aa57316..320fb51 100644 --- a/internal/store/postgres.go +++ b/internal/store/postgres.go @@ -72,6 +72,15 @@ func (d *PostgresDialect) CreateTablesSQL() []string { key TEXT PRIMARY KEY, value TEXT NOT NULL )`, + `CREATE TABLE IF NOT EXISTS state_changes ( + id SERIAL PRIMARY KEY, + site_id INTEGER NOT NULL, + from_status TEXT NOT NULL, + to_status TEXT NOT NULL, + error_reason TEXT DEFAULT '', + changed_at TIMESTAMP DEFAULT NOW() + )`, + `CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`, } } diff --git a/internal/store/sqlite.go b/internal/store/sqlite.go index 31b7880..beadc40 100644 --- a/internal/store/sqlite.go +++ b/internal/store/sqlite.go @@ -79,6 +79,15 @@ func (d *SQLiteDialect) CreateTablesSQL() []string { key TEXT PRIMARY KEY, value TEXT NOT NULL )`, + `CREATE TABLE IF NOT EXISTS state_changes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + site_id INTEGER NOT NULL, + from_status TEXT NOT NULL, + to_status TEXT NOT NULL, + error_reason TEXT DEFAULT '', + changed_at DATETIME DEFAULT CURRENT_TIMESTAMP + )`, + `CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`, } } diff --git a/internal/store/sqlstore.go b/internal/store/sqlstore.go index f8a1e20..e24c9f1 100644 --- a/internal/store/sqlstore.go +++ b/internal/store/sqlstore.go @@ -347,6 +347,29 @@ func (s *SQLStore) DeleteUser(id int) error { return err } +func (s *SQLStore) SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error { + _, err := s.db.Exec(s.q("INSERT INTO state_changes (site_id, from_status, to_status, error_reason) VALUES (?, ?, ?, ?)"), + siteID, fromStatus, toStatus, errorReason) + return err +} + +func (s *SQLStore) GetStateChanges(siteID int, limit int) ([]models.StateChange, error) { + rows, err := s.db.Query(s.q("SELECT id, site_id, from_status, to_status, error_reason, changed_at FROM state_changes WHERE site_id = ? ORDER BY changed_at DESC LIMIT ?"), siteID, limit) + if err != nil { + return nil, err + } + defer rows.Close() + var changes []models.StateChange + for rows.Next() { + var sc models.StateChange + if err := rows.Scan(&sc.ID, &sc.SiteID, &sc.FromStatus, &sc.ToStatus, &sc.ErrorReason, &sc.ChangedAt); err != nil { + return changes, err + } + changes = append(changes, sc) + } + return changes, rows.Err() +} + func (s *SQLStore) SaveCheck(siteID int, latencyNs int64, isUp bool) error { return s.SaveCheckFromNode(siteID, "", latencyNs, isUp) } diff --git a/internal/store/store.go b/internal/store/store.go index 09dad80..8321486 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -38,6 +38,10 @@ type Store interface { SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error LoadAllHistory(limit int) (map[int][]models.CheckRecord, error) + // State Changes + SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error + GetStateChanges(siteID int, limit int) ([]models.StateChange, error) + // Nodes RegisterNode(node models.ProbeNode) error GetNode(id string) (models.ProbeNode, error) diff --git a/internal/tui/tab_sites.go b/internal/tui/tab_sites.go index d1b4ff7..e07eff1 100644 --- a/internal/tui/tab_sites.go +++ b/internal/tui/tab_sites.go @@ -309,6 +309,29 @@ func fmtStatus(status string, paused bool, inMaint bool) string { } } +func fmtDuration(d time.Duration) string { + if d < time.Minute { + return fmt.Sprintf("%ds", int(d.Seconds())) + } + if d < time.Hour { + return fmt.Sprintf("%dm", int(d.Minutes())) + } + if d < 24*time.Hour { + h := int(d.Hours()) + m := int(d.Minutes()) % 60 + if m > 0 { + return fmt.Sprintf("%dh %dm", h, m) + } + return fmt.Sprintf("%dh", h) + } + days := int(d.Hours()) / 24 + hours := int(d.Hours()) % 24 + if hours > 0 { + return fmt.Sprintf("%dd %dh", days, hours) + } + return fmt.Sprintf("%dd", days) +} + func (m Model) dynamicWidths() (nameW, sparkW int) { fixed := 6 + 10 + 10 + 8 + 8 + 7 + 9 // #, TYPE, STATUS, LATENCY, UPTIME, SSL, RETRY overhead := 30 // cell padding + borders @@ -389,6 +412,14 @@ func (m Model) viewSitesTab() string { name = limitStr(name, nameW) } + if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" { + nameLen := len([]rune(name)) + errSpace := nameW - nameLen - 1 + if errSpace > 10 { + name = name + " " + subtleStyle.Render(limitStr(site.LastError, errSpace)) + } + } + hist, _ := m.engine.GetHistory(site.ID) var spark string if site.Type == "push" { @@ -732,6 +763,25 @@ func (m Model) viewDetailPanel() string { } row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID))) + + if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" { + row("Error", dangerStyle.Render(limitStr(site.LastError, 60))) + } + + if site.Type == "http" && site.StatusCode > 0 { + row("HTTP Code", strconv.Itoa(site.StatusCode)) + } + + if !site.StatusChangedAt.IsZero() { + dur := time.Since(site.StatusChangedAt) + row("State Since", site.StatusChangedAt.Format("2006-01-02 15:04:05")+" ("+fmtDuration(dur)+")") + } + + if !site.LastSuccessAt.IsZero() { + ago := time.Since(site.LastSuccessAt) + row("Last Success", site.LastSuccessAt.Format("15:04:05")+" ("+fmtDuration(ago)+" ago)") + } + if m.isMonitorInMaintenance(site.ID) { for _, mw := range m.maintenanceWindows { if mw.Type == "maintenance" && (mw.MonitorID == 0 || mw.MonitorID == site.ID || mw.MonitorID == site.ParentID) { @@ -787,7 +837,30 @@ func (m Model) viewDetailPanel() string { } latency := time.Duration(result.LatencyNs).Milliseconds() ago := time.Since(result.CheckedAt).Truncate(time.Second) - fmt.Fprintf(&b, " %-14s %s %dms %s ago\n", nodeID, status, latency, ago) + line := fmt.Sprintf(" %-14s %s %dms %s ago", nodeID, status, latency, ago) + if !result.IsUp && result.ErrorReason != "" { + line += " " + dangerStyle.Render(limitStr(result.ErrorReason, 30)) + } + b.WriteString(line + "\n") + } + } + + stateChanges := m.engine.GetStateChanges(site.ID, 5) + if len(stateChanges) > 0 { + b.WriteString("\n" + subtleStyle.Render(" STATE CHANGES") + "\n") + for _, sc := range stateChanges { + ago := fmtDuration(time.Since(sc.ChangedAt)) + arrow := subtleStyle.Render(sc.FromStatus) + " → " + if sc.ToStatus == "UP" { + arrow += specialStyle.Render(sc.ToStatus) + } else { + arrow += dangerStyle.Render(sc.ToStatus) + } + line := fmt.Sprintf(" %s %s", arrow, subtleStyle.Render(ago+" ago")) + if sc.ErrorReason != "" && sc.ToStatus != "UP" { + line += " " + dangerStyle.Render(limitStr(sc.ErrorReason, 40)) + } + b.WriteString(line + "\n") } }