feat: show error reason when monitors go DOWN #33

Merged
lerko merged 1 commits from feat/error-reason into main 2026-05-27 23:38:26 +00:00
15 changed files with 299 additions and 96 deletions
+2
View File
@@ -67,6 +67,8 @@ func (m *mockStore) DeleteMaintenanceWindow(int) error { retur
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
// --- Cluster Start Tests ---
+2
View File
@@ -130,6 +130,7 @@ type probeResultItem struct {
SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"`
ErrorReason string `json:"error_reason,omitempty"`
}
func probeExecuteChecks(ctx context.Context, sites []models.Site, strict, insecure *http.Client, allowPrivate bool) []probeResultItem {
@@ -157,6 +158,7 @@ loop:
SiteID: s.ID,
LatencyNs: cr.LatencyNs,
IsUp: cr.Status == "UP",
ErrorReason: cr.ErrorReason,
})
mu.Unlock()
}(site)
+5 -2
View File
@@ -2,13 +2,14 @@ package metrics
import (
"context"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
)
type mockStore struct {
@@ -64,6 +65,8 @@ func (m *mockStore) DeleteMaintenanceWindow(int) error { retur
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
func TestMetricsHandler(t *testing.T) {
+12
View File
@@ -35,6 +35,18 @@ type Site struct {
HasSSL bool
LastCheck time.Time
SentSSLWarning bool
LastError string
StatusChangedAt time.Time
LastSuccessAt time.Time
}
type StateChange struct {
ID int
SiteID int
FromStatus string
ToStatus string
ErrorReason string
ChangedAt time.Time
}
type AlertConfig struct {
+1
View File
@@ -15,6 +15,7 @@ type NodeResult struct {
IsUp bool
LatencyNs int64
CheckedAt time.Time
ErrorReason string
}
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
+28 -9
View File
@@ -2,6 +2,7 @@ package monitor
import (
"context"
"fmt"
"net"
"net/http"
"strconv"
@@ -21,6 +22,7 @@ type CheckResult struct {
LatencyNs int64
HasSSL bool
CertExpiry time.Time
ErrorReason string
}
func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool, allowPrivate ...bool) CheckResult {
@@ -35,7 +37,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
if ips, err := net.LookupIP(host); err == nil {
for _, ip := range ips {
if isPrivateIP(ip) {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "target resolves to private IP"}
}
}
}
@@ -52,7 +54,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
case "dns":
return runDNSCheck(site)
default:
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "unsupported monitor type: " + site.Type}
}
}
@@ -68,7 +70,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
req, err := http.NewRequestWithContext(ctx, method, site.URL, nil)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "invalid request: " + err.Error()}
}
client := strict
@@ -88,6 +90,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
if err != nil {
result.Status = "DOWN"
result.ErrorReason = truncateError(err.Error(), 256)
return result
}
defer resp.Body.Close()
@@ -95,6 +98,11 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
result.StatusCode = resp.StatusCode
if !isCodeAccepted(resp.StatusCode, site.AcceptedCodes) {
result.Status = "DOWN"
expected := site.AcceptedCodes
if expected == "" {
expected = "200-299"
}
result.ErrorReason = fmt.Sprintf("HTTP %d (expected %s)", resp.StatusCode, expected)
}
if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 {
@@ -103,6 +111,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
result.CertExpiry = cert.NotAfter
if time.Now().After(cert.NotAfter) {
result.Status = "SSL EXP"
result.ErrorReason = "SSL certificate expired"
}
}
@@ -117,7 +126,7 @@ func runPingCheck(site models.Site) CheckResult {
pinger, err := probing.NewPinger(host)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "ping setup: " + err.Error()}
}
pinger.Count = 1
pinger.Timeout = siteTimeout(site)
@@ -127,8 +136,11 @@ func runPingCheck(site models.Site) CheckResult {
err = pinger.Run()
latency := time.Since(start)
if err != nil || pinger.Statistics().PacketsRecv == 0 {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "ping failed: " + err.Error()}
}
if pinger.Statistics().PacketsRecv == 0 {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "no ICMP response"}
}
stats := pinger.Statistics()
@@ -148,7 +160,7 @@ func runPortCheck(site models.Site) CheckResult {
latency := time.Since(start)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: truncateError(err.Error(), 256)}
}
_ = conn.Close()
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
@@ -199,10 +211,10 @@ func runDNSCheck(site models.Site) CheckResult {
latency := time.Since(start)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS query failed: " + err.Error()}
}
if r.Rcode != dns.RcodeSuccess {
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds()}
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS RCODE: " + dns.RcodeToString[r.Rcode]}
}
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
}
@@ -235,3 +247,10 @@ func isCodeAccepted(code int, accepted string) bool {
}
return false
}
func truncateError(s string, max int) string {
if len(s) <= max {
return s
}
return s[:max-3] + "..."
}
+45 -6
View File
@@ -283,6 +283,9 @@ func (e *Engine) UpdateSiteConfig(site models.Site) {
site.LastCheck = existing.LastCheck
site.SentSSLWarning = existing.SentSSLWarning
site.FailureCount = existing.FailureCount
site.LastError = existing.LastError
site.StatusChangedAt = existing.StatusChangedAt
site.LastSuccessAt = existing.LastSuccessAt
e.liveState[site.ID] = site
e.addToTokenIndex(site)
}
@@ -393,33 +396,45 @@ func (e *Engine) checkByID(id int) {
updatedSite.CertExpiry = result.CertExpiry
updatedSite.Latency = time.Duration(result.LatencyNs)
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs))
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs), result.ErrorReason)
}
}
func (e *Engine) checkPush(site models.Site) {
deadline := site.LastCheck.Add(time.Duration(site.Interval) * time.Second).Add(pushGracePeriod)
if time.Now().After(deadline) {
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "heartbeat missed")
} else if site.Status != "UP" {
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
}
}
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration) {
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration, errorReason string) {
if !e.IsActive() {
return
}
newState := site
newState.StatusCode = code
newState.LastError = errorReason
if rawStatus == "UP" {
newState.LastSuccessAt = time.Now()
newState.LastError = ""
} else {
newState.LastSuccessAt = site.LastSuccessAt
}
if site.Status == "UP" && rawStatus != "UP" {
newState.FailureCount++
if newState.FailureCount > site.MaxRetries {
newState.Status = rawStatus
newState.FailureCount = site.MaxRetries + 1
if errorReason != "" {
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN: %s", site.Name, errorReason))
} else {
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name))
}
} else {
e.AddLog(fmt.Sprintf("Monitor '%s' failed check %d/%d", site.Name, newState.FailureCount, site.MaxRetries))
}
@@ -431,6 +446,14 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
newState.FailureCount = site.MaxRetries + 1
}
if newState.Status != site.Status && site.Status != "PENDING" {
newState.StatusChangedAt = time.Now()
} else if site.StatusChangedAt.IsZero() && newState.Status != "PENDING" {
newState.StatusChangedAt = time.Now()
} else {
newState.StatusChangedAt = site.StatusChangedAt
}
inMaint := e.isInMaintenance(site.ID)
if site.Type == "http" && site.CheckSSL && site.HasSSL {
@@ -455,12 +478,19 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
e.recordCheck(site.ID, latency, rawStatus == "UP")
if newState.Status != site.Status && site.Status != "PENDING" {
go func() { _ = e.db.SaveStateChange(site.ID, site.Status, newState.Status, errorReason) }()
}
isBroken := func(s string) bool { return s == "DOWN" || s == "SSL EXP" }
if !isBroken(site.Status) && isBroken(newState.Status) && newState.Status != "PENDING" {
if inMaint {
e.AddLog(fmt.Sprintf("Monitor '%s' is DOWN (alerts suppressed — maintenance)", site.Name))
} else {
msg := fmt.Sprintf("Monitor '%s' is DOWN (%s)", site.Name, rawStatus)
if errorReason != "" {
msg = fmt.Sprintf("Monitor '%s' is DOWN: %s", site.Name, errorReason)
}
if site.Type == "push" {
msg = fmt.Sprintf("Push Monitor '%s' missed heartbeat.", site.Name)
}
@@ -554,7 +584,7 @@ func (e *Engine) SetAggStrategy(strategy AggregationStrategy) {
e.aggStrategy = strategy
}
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool) {
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool, errorReason string) {
e.probeResultsMu.Lock()
if e.probeResults[siteID] == nil {
e.probeResults[siteID] = make(map[string]NodeResult)
@@ -564,6 +594,7 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i
IsUp: isUp,
LatencyNs: latencyNs,
CheckedAt: time.Now(),
ErrorReason: errorReason,
}
results := make([]NodeResult, 0, len(e.probeResults[siteID]))
for _, r := range e.probeResults[siteID] {
@@ -588,7 +619,7 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i
updatedSite := site
updatedSite.Latency = time.Duration(avgLatency)
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency))
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency), errorReason)
}
func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
@@ -601,3 +632,11 @@ func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
}
return cp
}
func (e *Engine) GetStateChanges(siteID int, limit int) []models.StateChange {
changes, err := e.db.GetStateChanges(siteID, limit)
if err != nil {
return nil
}
return changes
}
+19 -16
View File
@@ -2,10 +2,11 @@ package monitor
import (
"fmt"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"sync"
"testing"
"time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
)
// --- Mock Store ---
@@ -73,6 +74,8 @@ func (m *mockStore) EndMaintenanceWindow(int) error { retur
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) GetAllAlerts() ([]models.AlertConfig, error) {
@@ -174,7 +177,7 @@ func TestHandleStatusChange_PendingToUp(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "PENDING", MaxRetries: 3, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 10*time.Millisecond)
e.handleStatusChange(site, "UP", 200, 10*time.Millisecond, "")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -195,7 +198,7 @@ func TestHandleStatusChange_UpIncrementFailure(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 3, FailureCount: 0}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 500, 0)
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -213,7 +216,7 @@ func TestHandleStatusChange_UpToDown_ExceedsRetries(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 2, FailureCount: 2, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 500, 0)
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -236,7 +239,7 @@ func TestHandleStatusChange_UpToDown_ZeroRetries(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, FailureCount: 0, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -255,7 +258,7 @@ func TestHandleStatusChange_DownToUp_Recovery(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", FailureCount: 4, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond)
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond, "")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -276,7 +279,7 @@ func TestHandleStatusChange_DownStaysDown(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", MaxRetries: 2, FailureCount: 3}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -295,7 +298,7 @@ func TestHandleStatusChange_SSLExpired(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "SSL EXP", 0, 0)
e.handleStatusChange(site, "SSL EXP", 0, 0, "SSL certificate expired")
s, _ := getSite(e, 1)
if s.Status != "SSL EXP" {
@@ -315,7 +318,7 @@ func TestHandleStatusChange_AlertSuppressedMaintenance(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "DOWN" {
@@ -346,7 +349,7 @@ func TestHandleStatusChange_RecoverySuppressedMaintenance(t *testing.T) {
site := models.Site{ID: 1, Name: "test", Status: "DOWN", AlertID: 1}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -370,7 +373,7 @@ func TestHandleStatusChange_SSLWarning(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if !s.SentSSLWarning {
@@ -393,7 +396,7 @@ func TestHandleStatusChange_SSLWarningNotRepeated(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
waitAsync()
if len(ms.getAlertCallsSnapshot()) != 0 {
@@ -412,7 +415,7 @@ func TestHandleStatusChange_SSLWarningReset(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if s.SentSSLWarning {
@@ -433,7 +436,7 @@ func TestHandleStatusChange_SSLWarningSuppressedMaint(t *testing.T) {
}
injectSite(e, site)
e.handleStatusChange(site, "UP", 200, 0)
e.handleStatusChange(site, "UP", 200, 0, "")
s, _ := getSite(e, 1)
if !s.SentSSLWarning {
@@ -452,7 +455,7 @@ func TestHandleStatusChange_InactiveEngine(t *testing.T) {
injectSite(e, site)
e.SetActive(false)
e.handleStatusChange(site, "DOWN", 0, 0)
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
s, _ := getSite(e, 1)
if s.Status != "UP" {
@@ -991,7 +994,7 @@ func TestConcurrent_HandleStatusChangeAndGetState(t *testing.T) {
wg.Add(2)
go func() {
defer wg.Done()
e.handleStatusChange(site, "DOWN", 500, 0)
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
}()
go func() {
defer wg.Done()
+2 -1
View File
@@ -406,6 +406,7 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
SiteID int `json:"site_id"`
LatencyNs int64 `json:"latency_ns"`
IsUp bool `json:"is_up"`
ErrorReason string `json:"error_reason"`
} `json:"results"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
@@ -420,7 +421,7 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
if err := s.SaveCheckFromNode(result.SiteID, req.NodeID, result.LatencyNs, result.IsUp); err != nil {
log.Printf("Failed to save probe result: %v", err)
}
eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp)
eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp, result.ErrorReason)
}
if err := s.UpdateNodeLastSeen(req.NodeID); err != nil {
log.Printf("Failed to update node last seen: %v", err)
+5 -2
View File
@@ -4,13 +4,14 @@ import (
"bytes"
"encoding/json"
"fmt"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
"net"
"net/http"
"sync"
"testing"
"time"
"gitea.lerkolabs.com/lerko/uptop/internal/models"
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
)
// --- Mock Store ---
@@ -75,6 +76,8 @@ func (m *mockStore) DeleteMaintenanceWindow(int) error { retur
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
func (m *mockStore) SetPreference(string, string) error { return nil }
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
func (m *mockStore) Close() error { return nil }
func (m *mockStore) ExportData() (models.Backup, error) {
+9
View File
@@ -72,6 +72,15 @@ func (d *PostgresDialect) CreateTablesSQL() []string {
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)`,
`CREATE TABLE IF NOT EXISTS state_changes (
id SERIAL PRIMARY KEY,
site_id INTEGER NOT NULL,
from_status TEXT NOT NULL,
to_status TEXT NOT NULL,
error_reason TEXT DEFAULT '',
changed_at TIMESTAMP DEFAULT NOW()
)`,
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
}
}
+9
View File
@@ -79,6 +79,15 @@ func (d *SQLiteDialect) CreateTablesSQL() []string {
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)`,
`CREATE TABLE IF NOT EXISTS state_changes (
id INTEGER PRIMARY KEY AUTOINCREMENT,
site_id INTEGER NOT NULL,
from_status TEXT NOT NULL,
to_status TEXT NOT NULL,
error_reason TEXT DEFAULT '',
changed_at DATETIME DEFAULT CURRENT_TIMESTAMP
)`,
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
}
}
+23
View File
@@ -347,6 +347,29 @@ func (s *SQLStore) DeleteUser(id int) error {
return err
}
func (s *SQLStore) SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error {
_, err := s.db.Exec(s.q("INSERT INTO state_changes (site_id, from_status, to_status, error_reason) VALUES (?, ?, ?, ?)"),
siteID, fromStatus, toStatus, errorReason)
return err
}
func (s *SQLStore) GetStateChanges(siteID int, limit int) ([]models.StateChange, error) {
rows, err := s.db.Query(s.q("SELECT id, site_id, from_status, to_status, error_reason, changed_at FROM state_changes WHERE site_id = ? ORDER BY changed_at DESC LIMIT ?"), siteID, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var changes []models.StateChange
for rows.Next() {
var sc models.StateChange
if err := rows.Scan(&sc.ID, &sc.SiteID, &sc.FromStatus, &sc.ToStatus, &sc.ErrorReason, &sc.ChangedAt); err != nil {
return changes, err
}
changes = append(changes, sc)
}
return changes, rows.Err()
}
func (s *SQLStore) SaveCheck(siteID int, latencyNs int64, isUp bool) error {
return s.SaveCheckFromNode(siteID, "", latencyNs, isUp)
}
+4
View File
@@ -38,6 +38,10 @@ type Store interface {
SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error
LoadAllHistory(limit int) (map[int][]models.CheckRecord, error)
// State Changes
SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error
GetStateChanges(siteID int, limit int) ([]models.StateChange, error)
// Nodes
RegisterNode(node models.ProbeNode) error
GetNode(id string) (models.ProbeNode, error)
+74 -1
View File
@@ -309,6 +309,29 @@ func fmtStatus(status string, paused bool, inMaint bool) string {
}
}
func fmtDuration(d time.Duration) string {
if d < time.Minute {
return fmt.Sprintf("%ds", int(d.Seconds()))
}
if d < time.Hour {
return fmt.Sprintf("%dm", int(d.Minutes()))
}
if d < 24*time.Hour {
h := int(d.Hours())
m := int(d.Minutes()) % 60
if m > 0 {
return fmt.Sprintf("%dh %dm", h, m)
}
return fmt.Sprintf("%dh", h)
}
days := int(d.Hours()) / 24
hours := int(d.Hours()) % 24
if hours > 0 {
return fmt.Sprintf("%dd %dh", days, hours)
}
return fmt.Sprintf("%dd", days)
}
func (m Model) dynamicWidths() (nameW, sparkW int) {
fixed := 6 + 10 + 10 + 8 + 8 + 7 + 9 // #, TYPE, STATUS, LATENCY, UPTIME, SSL, RETRY
overhead := 30 // cell padding + borders
@@ -389,6 +412,14 @@ func (m Model) viewSitesTab() string {
name = limitStr(name, nameW)
}
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
nameLen := len([]rune(name))
errSpace := nameW - nameLen - 1
if errSpace > 10 {
name = name + " " + subtleStyle.Render(limitStr(site.LastError, errSpace))
}
}
hist, _ := m.engine.GetHistory(site.ID)
var spark string
if site.Type == "push" {
@@ -732,6 +763,25 @@ func (m Model) viewDetailPanel() string {
}
row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID)))
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
row("Error", dangerStyle.Render(limitStr(site.LastError, 60)))
}
if site.Type == "http" && site.StatusCode > 0 {
row("HTTP Code", strconv.Itoa(site.StatusCode))
}
if !site.StatusChangedAt.IsZero() {
dur := time.Since(site.StatusChangedAt)
row("State Since", site.StatusChangedAt.Format("2006-01-02 15:04:05")+" ("+fmtDuration(dur)+")")
}
if !site.LastSuccessAt.IsZero() {
ago := time.Since(site.LastSuccessAt)
row("Last Success", site.LastSuccessAt.Format("15:04:05")+" ("+fmtDuration(ago)+" ago)")
}
if m.isMonitorInMaintenance(site.ID) {
for _, mw := range m.maintenanceWindows {
if mw.Type == "maintenance" && (mw.MonitorID == 0 || mw.MonitorID == site.ID || mw.MonitorID == site.ParentID) {
@@ -787,7 +837,30 @@ func (m Model) viewDetailPanel() string {
}
latency := time.Duration(result.LatencyNs).Milliseconds()
ago := time.Since(result.CheckedAt).Truncate(time.Second)
fmt.Fprintf(&b, " %-14s %s %dms %s ago\n", nodeID, status, latency, ago)
line := fmt.Sprintf(" %-14s %s %dms %s ago", nodeID, status, latency, ago)
if !result.IsUp && result.ErrorReason != "" {
line += " " + dangerStyle.Render(limitStr(result.ErrorReason, 30))
}
b.WriteString(line + "\n")
}
}
stateChanges := m.engine.GetStateChanges(site.ID, 5)
if len(stateChanges) > 0 {
b.WriteString("\n" + subtleStyle.Render(" STATE CHANGES") + "\n")
for _, sc := range stateChanges {
ago := fmtDuration(time.Since(sc.ChangedAt))
arrow := subtleStyle.Render(sc.FromStatus) + " → "
if sc.ToStatus == "UP" {
arrow += specialStyle.Render(sc.ToStatus)
} else {
arrow += dangerStyle.Render(sc.ToStatus)
}
line := fmt.Sprintf(" %s %s", arrow, subtleStyle.Render(ago+" ago"))
if sc.ErrorReason != "" && sc.ToStatus != "UP" {
line += " " + dangerStyle.Render(limitStr(sc.ErrorReason, 40))
}
b.WriteString(line + "\n")
}
}