feat: show error reason when monitors go DOWN
CI / test (pull_request) Successful in 2m42s
CI / lint (pull_request) Successful in 1m11s
CI / vulncheck (pull_request) Successful in 51s

Propagate check failure reasons through the entire stack:
- Checker captures specific errors (DNS, timeout, HTTP status, SSL, etc.)
- Engine tracks LastError, StatusChangedAt, LastSuccessAt per monitor
- State transitions persisted to new state_changes table
- Detail panel shows error reason, HTTP code, state duration, last
  success time, and last 5 state change events
- Monitor table shows inline error preview for DOWN services
- Alert messages include error reason
- Probe nodes forward error reasons to leader

15 files changed across models, checker, engine, store, TUI, and probes.
This commit is contained in:
2026-05-27 19:32:30 -04:00
parent d8a2cab90f
commit bc3a44beac
15 changed files with 299 additions and 96 deletions
+9 -7
View File
@@ -61,13 +61,15 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
return nil, nil
}
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
// --- Cluster Start Tests ---
+8 -6
View File
@@ -127,9 +127,10 @@ func probeFetchAssignments(ctx context.Context, client *http.Client, cfg ProbeCo
}
type probeResultItem struct {
SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"`
SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"`
ErrorReason string `json:"error_reason,omitempty"`
}
func probeExecuteChecks(ctx context.Context, sites []models.Site, strict, insecure *http.Client, allowPrivate bool) []probeResultItem {
@@ -154,9 +155,10 @@ loop:
cr := monitor.RunCheck(s, strict, insecure, false, allowPrivate)
mu.Lock()
results = append(results, probeResultItem{
SiteID: s.ID,
LatencyNs: cr.LatencyNs,
IsUp: cr.Status == "UP",
SiteID: s.ID,
LatencyNs: cr.LatencyNs,
IsUp: cr.Status == "UP",
ErrorReason: cr.ErrorReason,
})
mu.Unlock()
}(site)
+12 -9
View File
@@ -2,13 +2,14 @@ package metrics
import (
"context"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
)
type mockStore struct {
@@ -58,13 +59,15 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
return nil, nil
}
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
func TestMetricsHandler(t *testing.T) {
ms := &mockStore{
+20 -8
View File
@@ -27,14 +27,26 @@ type Site struct {
Paused bool
Regions string
FailureCount int
Status string
StatusCode int
Latency time.Duration
CertExpiry time.Time
HasSSL bool
LastCheck time.Time
SentSSLWarning bool
FailureCount int
Status string
StatusCode int
Latency time.Duration
CertExpiry time.Time
HasSSL bool
LastCheck time.Time
SentSSLWarning bool
LastError string
StatusChangedAt time.Time
LastSuccessAt time.Time
}
type StateChange struct {
ID int
SiteID int
FromStatus string
ToStatus string
ErrorReason string
ChangedAt time.Time
}
type AlertConfig struct {
+5 -4
View File
@@ -11,10 +11,11 @@ const (
)
type NodeResult struct {
NodeID string
IsUp bool
LatencyNs int64
CheckedAt time.Time
NodeID string
IsUp bool
LatencyNs int64
CheckedAt time.Time
ErrorReason string
}
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
+34 -15
View File
@@ -2,6 +2,7 @@ package monitor
import (
"context"
"fmt"
"net"
"net/http"
"strconv"
@@ -15,12 +16,13 @@ import (
)
type CheckResult struct {
SiteID int
Status string // "UP", "DOWN", "SSL EXP"
StatusCode int
LatencyNs int64
HasSSL bool
CertExpiry time.Time
SiteID int
Status string // "UP", "DOWN", "SSL EXP"
StatusCode int
LatencyNs int64
HasSSL bool
CertExpiry time.Time
ErrorReason string
}
func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool, allowPrivate ...bool) CheckResult {
@@ -35,7 +37,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
if ips, err := net.LookupIP(host); err == nil {
for _, ip := range ips {
if isPrivateIP(ip) {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "target resolves to private IP"}
}
}
}
@@ -52,7 +54,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
case "dns":
return runDNSCheck(site)
default:
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "unsupported monitor type: " + site.Type}
}
}
@@ -68,7 +70,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
req, err := http.NewRequestWithContext(ctx, method, site.URL, nil)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "invalid request: " + err.Error()}
}
client := strict
@@ -88,6 +90,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
if err != nil {
result.Status = "DOWN"
result.ErrorReason = truncateError(err.Error(), 256)
return result
}
defer resp.Body.Close()
@@ -95,6 +98,11 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
result.StatusCode = resp.StatusCode
if !isCodeAccepted(resp.StatusCode, site.AcceptedCodes) {
result.Status = "DOWN"
expected := site.AcceptedCodes
if expected == "" {
expected = "200-299"
}
result.ErrorReason = fmt.Sprintf("HTTP %d (expected %s)", resp.StatusCode, expected)
}
if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 {
@@ -103,6 +111,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
result.CertExpiry = cert.NotAfter
if time.Now().After(cert.NotAfter) {
result.Status = "SSL EXP"
result.ErrorReason = "SSL certificate expired"
}
}
@@ -117,7 +126,7 @@ func runPingCheck(site models.Site) CheckResult {
pinger, err := probing.NewPinger(host)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "ping setup: " + err.Error()}
}
pinger.Count = 1
pinger.Timeout = siteTimeout(site)
@@ -127,8 +136,11 @@ func runPingCheck(site models.Site) CheckResult {
err = pinger.Run()
latency := time.Since(start)
if err != nil || pinger.Statistics().PacketsRecv == 0 {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "ping failed: " + err.Error()}
}
if pinger.Statistics().PacketsRecv == 0 {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "no ICMP response"}
}
stats := pinger.Statistics()
@@ -148,7 +160,7 @@ func runPortCheck(site models.Site) CheckResult {
latency := time.Since(start)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: truncateError(err.Error(), 256)}
}
_ = conn.Close()
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
@@ -199,10 +211,10 @@ func runDNSCheck(site models.Site) CheckResult {
latency := time.Since(start)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS query failed: " + err.Error()}
}
if r.Rcode != dns.RcodeSuccess {
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds()}
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS RCODE: " + dns.RcodeToString[r.Rcode]}
}
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
}
@@ -235,3 +247,10 @@ func isCodeAccepted(code int, accepted string) bool {
}
return false
}
func truncateError(s string, max int) string {
if len(s) <= max {
return s
}
return s[:max-3] + "..."
}
+50 -11
View File
@@ -283,6 +283,9 @@ func (e *Engine) UpdateSiteConfig(site models.Site) {
site.LastCheck = existing.LastCheck
site.SentSSLWarning = existing.SentSSLWarning
site.FailureCount = existing.FailureCount
site.LastError = existing.LastError
site.StatusChangedAt = existing.StatusChangedAt
site.LastSuccessAt = existing.LastSuccessAt
e.liveState[site.ID] = site
e.addToTokenIndex(site)
}
@@ -393,33 +396,45 @@ func (e *Engine) checkByID(id int) {
updatedSite.CertExpiry = result.CertExpiry
updatedSite.Latency = time.Duration(result.LatencyNs)
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs))
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs), result.ErrorReason)
}
}
func (e *Engine) checkPush(site models.Site) {
deadline := site.LastCheck.Add(time.Duration(site.Interval) * time.Second).Add(pushGracePeriod)
if time.Now().After(deadline) {
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "heartbeat missed")
} else if site.Status != "UP" {
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
}
}
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration) {
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration, errorReason string) {
if !e.IsActive() {
return
}
newState := site
newState.StatusCode = code
newState.LastError = errorReason
if rawStatus == "UP" {
newState.LastSuccessAt = time.Now()
newState.LastError = ""
} else {
newState.LastSuccessAt = site.LastSuccessAt
}
if site.Status == "UP" && rawStatus != "UP" {
newState.FailureCount++
if newState.FailureCount > site.MaxRetries {
newState.Status = rawStatus
newState.FailureCount = site.MaxRetries + 1
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name))
if errorReason != "" {
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN: %s", site.Name, errorReason))
} else {
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name))
}
} else {
e.AddLog(fmt.Sprintf("Monitor '%s' failed check %d/%d", site.Name, newState.FailureCount, site.MaxRetries))
}
@@ -431,6 +446,14 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
newState.FailureCount = site.MaxRetries + 1
}
if newState.Status != site.Status && site.Status != "PENDING" {
newState.StatusChangedAt = time.Now()
} else if site.StatusChangedAt.IsZero() && newState.Status != "PENDING" {
newState.StatusChangedAt = time.Now()
} else {
newState.StatusChangedAt = site.StatusChangedAt
}
inMaint := e.isInMaintenance(site.ID)
if site.Type == "http" && site.CheckSSL && site.HasSSL {
@@ -455,12 +478,19 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
e.recordCheck(site.ID, latency, rawStatus == "UP")
if newState.Status != site.Status && site.Status != "PENDING" {
go func() { _ = e.db.SaveStateChange(site.ID, site.Status, newState.Status, errorReason) }()
}
isBroken := func(s string) bool { return s == "DOWN" || s == "SSL EXP" }
if !isBroken(site.Status) && isBroken(newState.Status) && newState.Status != "PENDING" {
if inMaint {
e.AddLog(fmt.Sprintf("Monitor '%s' is DOWN (alerts suppressed — maintenance)", site.Name))
} else {
msg := fmt.Sprintf("Monitor '%s' is DOWN (%s)", site.Name, rawStatus)
if errorReason != "" {
msg = fmt.Sprintf("Monitor '%s' is DOWN: %s", site.Name, errorReason)
}
if site.Type == "push" {
msg = fmt.Sprintf("Push Monitor '%s' missed heartbeat.", site.Name)
}
@@ -554,16 +584,17 @@ func (e *Engine) SetAggStrategy(strategy AggregationStrategy) {
e.aggStrategy = strategy
}
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool) {
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool, errorReason string) {
e.probeResultsMu.Lock()
if e.probeResults[siteID] == nil {
e.probeResults[siteID] = make(map[string]NodeResult)
}
e.probeResults[siteID][nodeID] = NodeResult{
NodeID: nodeID,
IsUp: isUp,
LatencyNs: latencyNs,
CheckedAt: time.Now(),
NodeID: nodeID,
IsUp: isUp,
LatencyNs: latencyNs,
CheckedAt: time.Now(),
ErrorReason: errorReason,
}
results := make([]NodeResult, 0, len(e.probeResults[siteID]))
for _, r := range e.probeResults[siteID] {
@@ -588,7 +619,7 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i
updatedSite := site
updatedSite.Latency = time.Duration(avgLatency)
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency))
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency), errorReason)
}
func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
@@ -601,3 +632,11 @@ func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
}
return cp
}
func (e *Engine) GetStateChanges(siteID int, limit int) []models.StateChange {
changes, err := e.db.GetStateChanges(siteID, limit)
if err != nil {
return nil
}
return changes
}
+25 -22
View File
@@ -2,10 +2,11 @@ package monitor
import (
"fmt"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"sync"
"testing"
"time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
)
// --- Mock Store ---
@@ -68,12 +69,14 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
return nil, nil
}
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) GetAllAlerts() ([]models.AlertConfig, error) {
m.mu.Lock()
@@ -174,7 +177,7 @@ func TestHandleStatusChange_PendingToUp(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "PENDING", MaxRetries: 3, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 10*time.Millisecond)
e.handleStatusChange(site, "UP", 200, 10*time.Millisecond, "")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -195,7 +198,7 @@ func TestHandleStatusChange_UpIncrementFailure(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 3, FailureCount: 0}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 500, 0)
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -213,7 +216,7 @@ func TestHandleStatusChange_UpToDown_ExceedsRetries(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 2, FailureCount: 2, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 500, 0)
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -236,7 +239,7 @@ func TestHandleStatusChange_UpToDown_ZeroRetries(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, FailureCount: 0, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -255,7 +258,7 @@ func TestHandleStatusChange_DownToUp_Recovery(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", FailureCount: 4, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond)
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond, "")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -276,7 +279,7 @@ func TestHandleStatusChange_DownStaysDown(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", MaxRetries: 2, FailureCount: 3}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -295,7 +298,7 @@ func TestHandleStatusChange_SSLExpired(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "SSL EXP", 0, 0)
e.handleStatusChange(site, "SSL EXP", 0, 0, "SSL certificate expired")
s, _ := getSite(e, 1)
if s.Status != "SSL EXP" {
@@ -315,7 +318,7 @@ func TestHandleStatusChange_AlertSuppressedMaintenance(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -346,7 +349,7 @@ func TestHandleStatusChange_RecoverySuppressedMaintenance(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -370,7 +373,7 @@ func TestHandleStatusChange_SSLWarning(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if !s.SentSSLWarning {
@@ -393,7 +396,7 @@ func TestHandleStatusChange_SSLWarningNotRepeated(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
waitAsync()
if len(ms.getAlertCallsSnapshot()) != 0 {
@@ -412,7 +415,7 @@ func TestHandleStatusChange_SSLWarningReset(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if s.SentSSLWarning {
@@ -433,7 +436,7 @@ func TestHandleStatusChange_SSLWarningSuppressedMaint(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if !s.SentSSLWarning {
@@ -452,7 +455,7 @@ func TestHandleStatusChange_InactiveEngine(t *testing.T) {
injectSite(e, site)
e.SetActive(false)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -991,7 +994,7 @@ func TestConcurrent_HandleStatusChangeAndGetState(t *testing.T) {
wg.Add(2)
go func() {
defer wg.Done()
e.handleStatusChange(site, "DOWN", 500, 0)
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
}()
go func() {
defer wg.Done()
+5 -4
View File
@@ -403,9 +403,10 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
var req struct {
NodeID string `json:"node_id"`
Results []struct {
SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"`
SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"`
ErrorReason string `json:"error_reason"`
} `json:"results"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -420,7 +421,7 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
if err := s.SaveCheckFromNode(result.SiteID, req.NodeID, result.LatencyNs, result.IsUp); err != nil {
log.Printf("Failed to save probe result: %v", err)
}
eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp)
eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp, result.ErrorReason)
}
if err := s.UpdateNodeLastSeen(req.NodeID); err != nil {
log.Printf("Failed to update node last seen: %v", err)
+12 -9
View File
@@ -4,13 +4,14 @@ import (
"bytes"
"encoding/json"
"fmt"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
"net"
"net/http"
"sync"
"testing"
"time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
)
// --- Mock Store ---
@@ -69,13 +70,15 @@ func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
return nil, nil
}
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) ExportData() (models.Backup, error) {
return models.Backup{
+9
View File
@@ -72,6 +72,15 @@ func (d *PostgresDialect) CreateTablesSQL() []string {
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)`,
`CREATE TABLE IF NOT EXISTS state_changes (
id SERIAL PRIMARY KEY,
site_id INTEGER NOT NULL,
from_status TEXT NOT NULL,
to_status TEXT NOT NULL,
error_reason TEXT DEFAULT '',
changed_at TIMESTAMP DEFAULT NOW()
)`,
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
}
}
+9
View File
@@ -79,6 +79,15 @@ func (d *SQLiteDialect) CreateTablesSQL() []string {
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)`,
`CREATE TABLE IF NOT EXISTS state_changes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
site_id INTEGER NOT NULL,
from_status TEXT NOT NULL,
to_status TEXT NOT NULL,
error_reason TEXT DEFAULT '',
changed_at DATETIME DEFAULT CURRENT_TIMESTAMP
)`,
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
}
}
+23
View File
@@ -347,6 +347,29 @@ func (s *SQLStore) DeleteUser(id int) error {
return err
}
func (s *SQLStore) SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error {
_, err := s.db.Exec(s.q("INSERT INTO state_changes (site_id, from_status, to_status, error_reason) VALUES (?, ?, ?, ?)"),
siteID, fromStatus, toStatus, errorReason)
return err
}
func (s *SQLStore) GetStateChanges(siteID int, limit int) ([]models.StateChange, error) {
rows, err := s.db.Query(s.q("SELECT id, site_id, from_status, to_status, error_reason, changed_at FROM state_changes WHERE site_id = ? ORDER BY changed_at DESC LIMIT ?"), siteID, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var changes []models.StateChange
for rows.Next() {
var sc models.StateChange
if err := rows.Scan(&sc.ID, &sc.SiteID, &sc.FromStatus, &sc.ToStatus, &sc.ErrorReason, &sc.ChangedAt); err != nil {
return changes, err
}
changes = append(changes, sc)
}
return changes, rows.Err()
}
func (s *SQLStore) SaveCheck(siteID int, latencyNs int64, isUp bool) error {
return s.SaveCheckFromNode(siteID, "", latencyNs, isUp)
}
+4
View File
@@ -38,6 +38,10 @@ type Store interface {
SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error
LoadAllHistory(limit int) (map[int][]models.CheckRecord, error)
// State Changes
SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error
GetStateChanges(siteID int, limit int) ([]models.StateChange, error)
// Nodes
RegisterNode(node models.ProbeNode) error
GetNode(id string) (models.ProbeNode, error)
+74 -1
View File
@@ -309,6 +309,29 @@ func fmtStatus(status string, paused bool, inMaint bool) string {
}
}
func fmtDuration(d time.Duration) string {
if d < time.Minute {
return fmt.Sprintf("%ds", int(d.Seconds()))
}
if d < time.Hour {
return fmt.Sprintf("%dm", int(d.Minutes()))
}
if d < 24*time.Hour {
h := int(d.Hours())
m := int(d.Minutes()) % 60
if m > 0 {
return fmt.Sprintf("%dh %dm", h, m)
}
return fmt.Sprintf("%dh", h)
}
days := int(d.Hours()) / 24
hours := int(d.Hours()) % 24
if hours > 0 {
return fmt.Sprintf("%dd %dh", days, hours)
}
return fmt.Sprintf("%dd", days)
}
func (m Model) dynamicWidths() (nameW, sparkW int) {
fixed := 6 + 10 + 10 + 8 + 8 + 7 + 9 // #, TYPE, STATUS, LATENCY, UPTIME, SSL, RETRY
overhead := 30 // cell padding + borders
@@ -389,6 +412,14 @@ func (m Model) viewSitesTab() string {
name = limitStr(name, nameW)
}
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
nameLen := len([]rune(name))
errSpace := nameW - nameLen - 1
if errSpace > 10 {
name = name + " " + subtleStyle.Render(limitStr(site.LastError, errSpace))
}
}
hist, _ := m.engine.GetHistory(site.ID)
var spark string
if site.Type == "push" {
@@ -732,6 +763,25 @@ func (m Model) viewDetailPanel() string {
}
row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID)))
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
row("Error", dangerStyle.Render(limitStr(site.LastError, 60)))
}
if site.Type == "http" && site.StatusCode > 0 {
row("HTTP Code", strconv.Itoa(site.StatusCode))
}
if !site.StatusChangedAt.IsZero() {
dur := time.Since(site.StatusChangedAt)
row("State Since", site.StatusChangedAt.Format("2006-01-02 15:04:05")+" ("+fmtDuration(dur)+")")
}
if !site.LastSuccessAt.IsZero() {
ago := time.Since(site.LastSuccessAt)
row("Last Success", site.LastSuccessAt.Format("15:04:05")+" ("+fmtDuration(ago)+" ago)")
}
if m.isMonitorInMaintenance(site.ID) {
for _, mw := range m.maintenanceWindows {
if mw.Type == "maintenance" && (mw.MonitorID == 0 || mw.MonitorID == site.ID || mw.MonitorID == site.ParentID) {
@@ -787,7 +837,30 @@ func (m Model) viewDetailPanel() string {
}
latency := time.Duration(result.LatencyNs).Milliseconds()
ago := time.Since(result.CheckedAt).Truncate(time.Second)
fmt.Fprintf(&b, " %-14s %s %dms %s ago\n", nodeID, status, latency, ago)
line := fmt.Sprintf(" %-14s %s %dms %s ago", nodeID, status, latency, ago)
if !result.IsUp && result.ErrorReason != "" {
line += " " + dangerStyle.Render(limitStr(result.ErrorReason, 30))
}
b.WriteString(line + "\n")
}
}
stateChanges := m.engine.GetStateChanges(site.ID, 5)
if len(stateChanges) > 0 {
b.WriteString("\n" + subtleStyle.Render(" STATE CHANGES") + "\n")
for _, sc := range stateChanges {
ago := fmtDuration(time.Since(sc.ChangedAt))
arrow := subtleStyle.Render(sc.FromStatus) + " → "
if sc.ToStatus == "UP" {
arrow += specialStyle.Render(sc.ToStatus)
} else {
arrow += dangerStyle.Render(sc.ToStatus)
}
line := fmt.Sprintf(" %s %s", arrow, subtleStyle.Render(ago+" ago"))
if sc.ErrorReason != "" && sc.ToStatus != "UP" {
line += " " + dangerStyle.Render(limitStr(sc.ErrorReason, 40))
}
b.WriteString(line + "\n")
}
}