feat: show error reason when monitors go DOWN #33
@@ -61,13 +61,15 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e
|
||||
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
|
||||
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
|
||||
// --- Cluster Start Tests ---
|
||||
|
||||
|
||||
@@ -127,9 +127,10 @@ func probeFetchAssignments(ctx context.Context, client *http.Client, cfg ProbeCo
|
||||
}
|
||||
|
||||
type probeResultItem struct {
|
||||
SiteID int `json:"site_id"`
|
||||
LatencyNs int64 `json:"latency_ns"`
|
||||
IsUp bool `json:"is_up"`
|
||||
SiteID int `json:"site_id"`
|
||||
LatencyNs int64 `json:"latency_ns"`
|
||||
IsUp bool `json:"is_up"`
|
||||
ErrorReason string `json:"error_reason,omitempty"`
|
||||
}
|
||||
|
||||
func probeExecuteChecks(ctx context.Context, sites []models.Site, strict, insecure *http.Client, allowPrivate bool) []probeResultItem {
|
||||
@@ -154,9 +155,10 @@ loop:
|
||||
cr := monitor.RunCheck(s, strict, insecure, false, allowPrivate)
|
||||
mu.Lock()
|
||||
results = append(results, probeResultItem{
|
||||
SiteID: s.ID,
|
||||
LatencyNs: cr.LatencyNs,
|
||||
IsUp: cr.Status == "UP",
|
||||
SiteID: s.ID,
|
||||
LatencyNs: cr.LatencyNs,
|
||||
IsUp: cr.Status == "UP",
|
||||
ErrorReason: cr.ErrorReason,
|
||||
})
|
||||
mu.Unlock()
|
||||
}(site)
|
||||
|
||||
@@ -2,13 +2,14 @@ package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/models"
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/models"
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
|
||||
)
|
||||
|
||||
type mockStore struct {
|
||||
@@ -58,13 +59,15 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e
|
||||
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
|
||||
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
|
||||
func TestMetricsHandler(t *testing.T) {
|
||||
ms := &mockStore{
|
||||
|
||||
@@ -27,14 +27,26 @@ type Site struct {
|
||||
Paused bool
|
||||
Regions string
|
||||
|
||||
FailureCount int
|
||||
Status string
|
||||
StatusCode int
|
||||
Latency time.Duration
|
||||
CertExpiry time.Time
|
||||
HasSSL bool
|
||||
LastCheck time.Time
|
||||
SentSSLWarning bool
|
||||
FailureCount int
|
||||
Status string
|
||||
StatusCode int
|
||||
Latency time.Duration
|
||||
CertExpiry time.Time
|
||||
HasSSL bool
|
||||
LastCheck time.Time
|
||||
SentSSLWarning bool
|
||||
LastError string
|
||||
StatusChangedAt time.Time
|
||||
LastSuccessAt time.Time
|
||||
}
|
||||
|
||||
type StateChange struct {
|
||||
ID int
|
||||
SiteID int
|
||||
FromStatus string
|
||||
ToStatus string
|
||||
ErrorReason string
|
||||
ChangedAt time.Time
|
||||
}
|
||||
|
||||
type AlertConfig struct {
|
||||
|
||||
@@ -11,10 +11,11 @@ const (
|
||||
)
|
||||
|
||||
type NodeResult struct {
|
||||
NodeID string
|
||||
IsUp bool
|
||||
LatencyNs int64
|
||||
CheckedAt time.Time
|
||||
NodeID string
|
||||
IsUp bool
|
||||
LatencyNs int64
|
||||
CheckedAt time.Time
|
||||
ErrorReason string
|
||||
}
|
||||
|
||||
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
|
||||
|
||||
+34
-15
@@ -2,6 +2,7 @@ package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"strconv"
|
||||
@@ -15,12 +16,13 @@ import (
|
||||
)
|
||||
|
||||
type CheckResult struct {
|
||||
SiteID int
|
||||
Status string // "UP", "DOWN", "SSL EXP"
|
||||
StatusCode int
|
||||
LatencyNs int64
|
||||
HasSSL bool
|
||||
CertExpiry time.Time
|
||||
SiteID int
|
||||
Status string // "UP", "DOWN", "SSL EXP"
|
||||
StatusCode int
|
||||
LatencyNs int64
|
||||
HasSSL bool
|
||||
CertExpiry time.Time
|
||||
ErrorReason string
|
||||
}
|
||||
|
||||
func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool, allowPrivate ...bool) CheckResult {
|
||||
@@ -35,7 +37,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
|
||||
if ips, err := net.LookupIP(host); err == nil {
|
||||
for _, ip := range ips {
|
||||
if isPrivateIP(ip) {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN"}
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "target resolves to private IP"}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -52,7 +54,7 @@ func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bo
|
||||
case "dns":
|
||||
return runDNSCheck(site)
|
||||
default:
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN"}
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "unsupported monitor type: " + site.Type}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,7 +70,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, method, site.URL, nil)
|
||||
if err != nil {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN"}
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "invalid request: " + err.Error()}
|
||||
}
|
||||
|
||||
client := strict
|
||||
@@ -88,6 +90,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
|
||||
|
||||
if err != nil {
|
||||
result.Status = "DOWN"
|
||||
result.ErrorReason = truncateError(err.Error(), 256)
|
||||
return result
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -95,6 +98,11 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
|
||||
result.StatusCode = resp.StatusCode
|
||||
if !isCodeAccepted(resp.StatusCode, site.AcceptedCodes) {
|
||||
result.Status = "DOWN"
|
||||
expected := site.AcceptedCodes
|
||||
if expected == "" {
|
||||
expected = "200-299"
|
||||
}
|
||||
result.ErrorReason = fmt.Sprintf("HTTP %d (expected %s)", resp.StatusCode, expected)
|
||||
}
|
||||
|
||||
if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 {
|
||||
@@ -103,6 +111,7 @@ func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecur
|
||||
result.CertExpiry = cert.NotAfter
|
||||
if time.Now().After(cert.NotAfter) {
|
||||
result.Status = "SSL EXP"
|
||||
result.ErrorReason = "SSL certificate expired"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -117,7 +126,7 @@ func runPingCheck(site models.Site) CheckResult {
|
||||
|
||||
pinger, err := probing.NewPinger(host)
|
||||
if err != nil {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN"}
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", ErrorReason: "ping setup: " + err.Error()}
|
||||
}
|
||||
pinger.Count = 1
|
||||
pinger.Timeout = siteTimeout(site)
|
||||
@@ -127,8 +136,11 @@ func runPingCheck(site models.Site) CheckResult {
|
||||
err = pinger.Run()
|
||||
latency := time.Since(start)
|
||||
|
||||
if err != nil || pinger.Statistics().PacketsRecv == 0 {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
|
||||
if err != nil {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "ping failed: " + err.Error()}
|
||||
}
|
||||
if pinger.Statistics().PacketsRecv == 0 {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "no ICMP response"}
|
||||
}
|
||||
|
||||
stats := pinger.Statistics()
|
||||
@@ -148,7 +160,7 @@ func runPortCheck(site models.Site) CheckResult {
|
||||
latency := time.Since(start)
|
||||
|
||||
if err != nil {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: truncateError(err.Error(), 256)}
|
||||
}
|
||||
_ = conn.Close()
|
||||
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
|
||||
@@ -199,10 +211,10 @@ func runDNSCheck(site models.Site) CheckResult {
|
||||
latency := time.Since(start)
|
||||
|
||||
if err != nil {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS query failed: " + err.Error()}
|
||||
}
|
||||
if r.Rcode != dns.RcodeSuccess {
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds()}
|
||||
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds(), ErrorReason: "DNS RCODE: " + dns.RcodeToString[r.Rcode]}
|
||||
}
|
||||
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
|
||||
}
|
||||
@@ -235,3 +247,10 @@ func isCodeAccepted(code int, accepted string) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func truncateError(s string, max int) string {
|
||||
if len(s) <= max {
|
||||
return s
|
||||
}
|
||||
return s[:max-3] + "..."
|
||||
}
|
||||
|
||||
+50
-11
@@ -283,6 +283,9 @@ func (e *Engine) UpdateSiteConfig(site models.Site) {
|
||||
site.LastCheck = existing.LastCheck
|
||||
site.SentSSLWarning = existing.SentSSLWarning
|
||||
site.FailureCount = existing.FailureCount
|
||||
site.LastError = existing.LastError
|
||||
site.StatusChangedAt = existing.StatusChangedAt
|
||||
site.LastSuccessAt = existing.LastSuccessAt
|
||||
e.liveState[site.ID] = site
|
||||
e.addToTokenIndex(site)
|
||||
}
|
||||
@@ -393,33 +396,45 @@ func (e *Engine) checkByID(id int) {
|
||||
updatedSite.CertExpiry = result.CertExpiry
|
||||
updatedSite.Latency = time.Duration(result.LatencyNs)
|
||||
updatedSite.LastCheck = time.Now()
|
||||
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs))
|
||||
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs), result.ErrorReason)
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) checkPush(site models.Site) {
|
||||
deadline := site.LastCheck.Add(time.Duration(site.Interval) * time.Second).Add(pushGracePeriod)
|
||||
if time.Now().After(deadline) {
|
||||
e.handleStatusChange(site, "DOWN", 0, 0)
|
||||
e.handleStatusChange(site, "DOWN", 0, 0, "heartbeat missed")
|
||||
} else if site.Status != "UP" {
|
||||
e.handleStatusChange(site, "UP", 200, 0)
|
||||
e.handleStatusChange(site, "UP", 200, 0, "")
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration) {
|
||||
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration, errorReason string) {
|
||||
if !e.IsActive() {
|
||||
return
|
||||
}
|
||||
|
||||
newState := site
|
||||
newState.StatusCode = code
|
||||
newState.LastError = errorReason
|
||||
|
||||
if rawStatus == "UP" {
|
||||
newState.LastSuccessAt = time.Now()
|
||||
newState.LastError = ""
|
||||
} else {
|
||||
newState.LastSuccessAt = site.LastSuccessAt
|
||||
}
|
||||
|
||||
if site.Status == "UP" && rawStatus != "UP" {
|
||||
newState.FailureCount++
|
||||
if newState.FailureCount > site.MaxRetries {
|
||||
newState.Status = rawStatus
|
||||
newState.FailureCount = site.MaxRetries + 1
|
||||
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name))
|
||||
if errorReason != "" {
|
||||
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN: %s", site.Name, errorReason))
|
||||
} else {
|
||||
e.AddLog(fmt.Sprintf("Monitor '%s' confirmed DOWN", site.Name))
|
||||
}
|
||||
} else {
|
||||
e.AddLog(fmt.Sprintf("Monitor '%s' failed check %d/%d", site.Name, newState.FailureCount, site.MaxRetries))
|
||||
}
|
||||
@@ -431,6 +446,14 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
|
||||
newState.FailureCount = site.MaxRetries + 1
|
||||
}
|
||||
|
||||
if newState.Status != site.Status && site.Status != "PENDING" {
|
||||
newState.StatusChangedAt = time.Now()
|
||||
} else if site.StatusChangedAt.IsZero() && newState.Status != "PENDING" {
|
||||
newState.StatusChangedAt = time.Now()
|
||||
} else {
|
||||
newState.StatusChangedAt = site.StatusChangedAt
|
||||
}
|
||||
|
||||
inMaint := e.isInMaintenance(site.ID)
|
||||
|
||||
if site.Type == "http" && site.CheckSSL && site.HasSSL {
|
||||
@@ -455,12 +478,19 @@ func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int
|
||||
|
||||
e.recordCheck(site.ID, latency, rawStatus == "UP")
|
||||
|
||||
if newState.Status != site.Status && site.Status != "PENDING" {
|
||||
go func() { _ = e.db.SaveStateChange(site.ID, site.Status, newState.Status, errorReason) }()
|
||||
}
|
||||
|
||||
isBroken := func(s string) bool { return s == "DOWN" || s == "SSL EXP" }
|
||||
if !isBroken(site.Status) && isBroken(newState.Status) && newState.Status != "PENDING" {
|
||||
if inMaint {
|
||||
e.AddLog(fmt.Sprintf("Monitor '%s' is DOWN (alerts suppressed — maintenance)", site.Name))
|
||||
} else {
|
||||
msg := fmt.Sprintf("Monitor '%s' is DOWN (%s)", site.Name, rawStatus)
|
||||
if errorReason != "" {
|
||||
msg = fmt.Sprintf("Monitor '%s' is DOWN: %s", site.Name, errorReason)
|
||||
}
|
||||
if site.Type == "push" {
|
||||
msg = fmt.Sprintf("Push Monitor '%s' missed heartbeat.", site.Name)
|
||||
}
|
||||
@@ -554,16 +584,17 @@ func (e *Engine) SetAggStrategy(strategy AggregationStrategy) {
|
||||
e.aggStrategy = strategy
|
||||
}
|
||||
|
||||
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool) {
|
||||
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool, errorReason string) {
|
||||
e.probeResultsMu.Lock()
|
||||
if e.probeResults[siteID] == nil {
|
||||
e.probeResults[siteID] = make(map[string]NodeResult)
|
||||
}
|
||||
e.probeResults[siteID][nodeID] = NodeResult{
|
||||
NodeID: nodeID,
|
||||
IsUp: isUp,
|
||||
LatencyNs: latencyNs,
|
||||
CheckedAt: time.Now(),
|
||||
NodeID: nodeID,
|
||||
IsUp: isUp,
|
||||
LatencyNs: latencyNs,
|
||||
CheckedAt: time.Now(),
|
||||
ErrorReason: errorReason,
|
||||
}
|
||||
results := make([]NodeResult, 0, len(e.probeResults[siteID]))
|
||||
for _, r := range e.probeResults[siteID] {
|
||||
@@ -588,7 +619,7 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i
|
||||
updatedSite := site
|
||||
updatedSite.Latency = time.Duration(avgLatency)
|
||||
updatedSite.LastCheck = time.Now()
|
||||
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency))
|
||||
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency), errorReason)
|
||||
}
|
||||
|
||||
func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
|
||||
@@ -601,3 +632,11 @@ func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
|
||||
}
|
||||
return cp
|
||||
}
|
||||
|
||||
func (e *Engine) GetStateChanges(siteID int, limit int) []models.StateChange {
|
||||
changes, err := e.db.GetStateChanges(siteID, limit)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return changes
|
||||
}
|
||||
|
||||
@@ -2,10 +2,11 @@ package monitor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/models"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/models"
|
||||
)
|
||||
|
||||
// --- Mock Store ---
|
||||
@@ -68,12 +69,14 @@ func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, e
|
||||
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
|
||||
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
|
||||
func (m *mockStore) GetAllAlerts() ([]models.AlertConfig, error) {
|
||||
m.mu.Lock()
|
||||
@@ -174,7 +177,7 @@ func TestHandleStatusChange_PendingToUp(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "PENDING", MaxRetries: 3, AlertID: 1}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "UP", 200, 10*time.Millisecond)
|
||||
e.handleStatusChange(site, "UP", 200, 10*time.Millisecond, "")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "UP" {
|
||||
@@ -195,7 +198,7 @@ func TestHandleStatusChange_UpIncrementFailure(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 3, FailureCount: 0}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "DOWN", 500, 0)
|
||||
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "UP" {
|
||||
@@ -213,7 +216,7 @@ func TestHandleStatusChange_UpToDown_ExceedsRetries(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 2, FailureCount: 2, AlertID: 1}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "DOWN", 500, 0)
|
||||
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "DOWN" {
|
||||
@@ -236,7 +239,7 @@ func TestHandleStatusChange_UpToDown_ZeroRetries(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, FailureCount: 0, AlertID: 1}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "DOWN", 0, 0)
|
||||
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "DOWN" {
|
||||
@@ -255,7 +258,7 @@ func TestHandleStatusChange_DownToUp_Recovery(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "DOWN", FailureCount: 4, AlertID: 1}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond)
|
||||
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond, "")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "UP" {
|
||||
@@ -276,7 +279,7 @@ func TestHandleStatusChange_DownStaysDown(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "DOWN", MaxRetries: 2, FailureCount: 3}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "DOWN", 0, 0)
|
||||
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "DOWN" {
|
||||
@@ -295,7 +298,7 @@ func TestHandleStatusChange_SSLExpired(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "SSL EXP", 0, 0)
|
||||
e.handleStatusChange(site, "SSL EXP", 0, 0, "SSL certificate expired")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "SSL EXP" {
|
||||
@@ -315,7 +318,7 @@ func TestHandleStatusChange_AlertSuppressedMaintenance(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0, AlertID: 1}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "DOWN", 0, 0)
|
||||
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "DOWN" {
|
||||
@@ -346,7 +349,7 @@ func TestHandleStatusChange_RecoverySuppressedMaintenance(t *testing.T) {
|
||||
site := models.Site{ID: 1, Name: "test", Status: "DOWN", AlertID: 1}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "UP", 200, 0)
|
||||
e.handleStatusChange(site, "UP", 200, 0, "")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "UP" {
|
||||
@@ -370,7 +373,7 @@ func TestHandleStatusChange_SSLWarning(t *testing.T) {
|
||||
}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "UP", 200, 0)
|
||||
e.handleStatusChange(site, "UP", 200, 0, "")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if !s.SentSSLWarning {
|
||||
@@ -393,7 +396,7 @@ func TestHandleStatusChange_SSLWarningNotRepeated(t *testing.T) {
|
||||
}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "UP", 200, 0)
|
||||
e.handleStatusChange(site, "UP", 200, 0, "")
|
||||
|
||||
waitAsync()
|
||||
if len(ms.getAlertCallsSnapshot()) != 0 {
|
||||
@@ -412,7 +415,7 @@ func TestHandleStatusChange_SSLWarningReset(t *testing.T) {
|
||||
}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "UP", 200, 0)
|
||||
e.handleStatusChange(site, "UP", 200, 0, "")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.SentSSLWarning {
|
||||
@@ -433,7 +436,7 @@ func TestHandleStatusChange_SSLWarningSuppressedMaint(t *testing.T) {
|
||||
}
|
||||
injectSite(e, site)
|
||||
|
||||
e.handleStatusChange(site, "UP", 200, 0)
|
||||
e.handleStatusChange(site, "UP", 200, 0, "")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if !s.SentSSLWarning {
|
||||
@@ -452,7 +455,7 @@ func TestHandleStatusChange_InactiveEngine(t *testing.T) {
|
||||
injectSite(e, site)
|
||||
e.SetActive(false)
|
||||
|
||||
e.handleStatusChange(site, "DOWN", 0, 0)
|
||||
e.handleStatusChange(site, "DOWN", 0, 0, "test error")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "UP" {
|
||||
@@ -991,7 +994,7 @@ func TestConcurrent_HandleStatusChangeAndGetState(t *testing.T) {
|
||||
wg.Add(2)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
e.handleStatusChange(site, "DOWN", 500, 0)
|
||||
e.handleStatusChange(site, "DOWN", 500, 0, "test error")
|
||||
}()
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
|
||||
@@ -403,9 +403,10 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
|
||||
var req struct {
|
||||
NodeID string `json:"node_id"`
|
||||
Results []struct {
|
||||
SiteID int `json:"site_id"`
|
||||
LatencyNs int64 `json:"latency_ns"`
|
||||
IsUp bool `json:"is_up"`
|
||||
SiteID int `json:"site_id"`
|
||||
LatencyNs int64 `json:"latency_ns"`
|
||||
IsUp bool `json:"is_up"`
|
||||
ErrorReason string `json:"error_reason"`
|
||||
} `json:"results"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
@@ -420,7 +421,7 @@ func Start(cfg ServerConfig, s store.Store, eng *monitor.Engine) *http.Server {
|
||||
if err := s.SaveCheckFromNode(result.SiteID, req.NodeID, result.LatencyNs, result.IsUp); err != nil {
|
||||
log.Printf("Failed to save probe result: %v", err)
|
||||
}
|
||||
eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp)
|
||||
eng.IngestProbeResult(req.NodeID, result.SiteID, result.LatencyNs, result.IsUp, result.ErrorReason)
|
||||
}
|
||||
if err := s.UpdateNodeLastSeen(req.NodeID); err != nil {
|
||||
log.Printf("Failed to update node last seen: %v", err)
|
||||
|
||||
@@ -4,13 +4,14 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/models"
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
|
||||
"net"
|
||||
"net/http"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/models"
|
||||
"gitea.lerkolabs.com/lerko/uptop/internal/monitor"
|
||||
)
|
||||
|
||||
// --- Mock Store ---
|
||||
@@ -69,13 +70,15 @@ func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil
|
||||
func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
func (m *mockStore) AddMaintenanceWindow(models.MaintenanceWindow) error { return nil }
|
||||
func (m *mockStore) EndMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) DeleteMaintenanceWindow(int) error { return nil }
|
||||
func (m *mockStore) IsMonitorInMaintenance(int) (bool, error) { return false, nil }
|
||||
func (m *mockStore) GetPreference(string) (string, error) { return "", nil }
|
||||
func (m *mockStore) SetPreference(string, string) error { return nil }
|
||||
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
|
||||
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
|
||||
func (m *mockStore) Close() error { return nil }
|
||||
|
||||
func (m *mockStore) ExportData() (models.Backup, error) {
|
||||
return models.Backup{
|
||||
|
||||
@@ -72,6 +72,15 @@ func (d *PostgresDialect) CreateTablesSQL() []string {
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT NOT NULL
|
||||
)`,
|
||||
`CREATE TABLE IF NOT EXISTS state_changes (
|
||||
id SERIAL PRIMARY KEY,
|
||||
site_id INTEGER NOT NULL,
|
||||
from_status TEXT NOT NULL,
|
||||
to_status TEXT NOT NULL,
|
||||
error_reason TEXT DEFAULT '',
|
||||
changed_at TIMESTAMP DEFAULT NOW()
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -79,6 +79,15 @@ func (d *SQLiteDialect) CreateTablesSQL() []string {
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT NOT NULL
|
||||
)`,
|
||||
`CREATE TABLE IF NOT EXISTS state_changes (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
site_id INTEGER NOT NULL,
|
||||
from_status TEXT NOT NULL,
|
||||
to_status TEXT NOT NULL,
|
||||
error_reason TEXT DEFAULT '',
|
||||
changed_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -347,6 +347,29 @@ func (s *SQLStore) DeleteUser(id int) error {
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *SQLStore) SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error {
|
||||
_, err := s.db.Exec(s.q("INSERT INTO state_changes (site_id, from_status, to_status, error_reason) VALUES (?, ?, ?, ?)"),
|
||||
siteID, fromStatus, toStatus, errorReason)
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *SQLStore) GetStateChanges(siteID int, limit int) ([]models.StateChange, error) {
|
||||
rows, err := s.db.Query(s.q("SELECT id, site_id, from_status, to_status, error_reason, changed_at FROM state_changes WHERE site_id = ? ORDER BY changed_at DESC LIMIT ?"), siteID, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var changes []models.StateChange
|
||||
for rows.Next() {
|
||||
var sc models.StateChange
|
||||
if err := rows.Scan(&sc.ID, &sc.SiteID, &sc.FromStatus, &sc.ToStatus, &sc.ErrorReason, &sc.ChangedAt); err != nil {
|
||||
return changes, err
|
||||
}
|
||||
changes = append(changes, sc)
|
||||
}
|
||||
return changes, rows.Err()
|
||||
}
|
||||
|
||||
func (s *SQLStore) SaveCheck(siteID int, latencyNs int64, isUp bool) error {
|
||||
return s.SaveCheckFromNode(siteID, "", latencyNs, isUp)
|
||||
}
|
||||
|
||||
@@ -38,6 +38,10 @@ type Store interface {
|
||||
SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error
|
||||
LoadAllHistory(limit int) (map[int][]models.CheckRecord, error)
|
||||
|
||||
// State Changes
|
||||
SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error
|
||||
GetStateChanges(siteID int, limit int) ([]models.StateChange, error)
|
||||
|
||||
// Nodes
|
||||
RegisterNode(node models.ProbeNode) error
|
||||
GetNode(id string) (models.ProbeNode, error)
|
||||
|
||||
@@ -309,6 +309,29 @@ func fmtStatus(status string, paused bool, inMaint bool) string {
|
||||
}
|
||||
}
|
||||
|
||||
func fmtDuration(d time.Duration) string {
|
||||
if d < time.Minute {
|
||||
return fmt.Sprintf("%ds", int(d.Seconds()))
|
||||
}
|
||||
if d < time.Hour {
|
||||
return fmt.Sprintf("%dm", int(d.Minutes()))
|
||||
}
|
||||
if d < 24*time.Hour {
|
||||
h := int(d.Hours())
|
||||
m := int(d.Minutes()) % 60
|
||||
if m > 0 {
|
||||
return fmt.Sprintf("%dh %dm", h, m)
|
||||
}
|
||||
return fmt.Sprintf("%dh", h)
|
||||
}
|
||||
days := int(d.Hours()) / 24
|
||||
hours := int(d.Hours()) % 24
|
||||
if hours > 0 {
|
||||
return fmt.Sprintf("%dd %dh", days, hours)
|
||||
}
|
||||
return fmt.Sprintf("%dd", days)
|
||||
}
|
||||
|
||||
func (m Model) dynamicWidths() (nameW, sparkW int) {
|
||||
fixed := 6 + 10 + 10 + 8 + 8 + 7 + 9 // #, TYPE, STATUS, LATENCY, UPTIME, SSL, RETRY
|
||||
overhead := 30 // cell padding + borders
|
||||
@@ -389,6 +412,14 @@ func (m Model) viewSitesTab() string {
|
||||
name = limitStr(name, nameW)
|
||||
}
|
||||
|
||||
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
|
||||
nameLen := len([]rune(name))
|
||||
errSpace := nameW - nameLen - 1
|
||||
if errSpace > 10 {
|
||||
name = name + " " + subtleStyle.Render(limitStr(site.LastError, errSpace))
|
||||
}
|
||||
}
|
||||
|
||||
hist, _ := m.engine.GetHistory(site.ID)
|
||||
var spark string
|
||||
if site.Type == "push" {
|
||||
@@ -732,6 +763,25 @@ func (m Model) viewDetailPanel() string {
|
||||
}
|
||||
|
||||
row("Status", fmtStatus(site.Status, site.Paused, m.isMonitorInMaintenance(site.ID)))
|
||||
|
||||
if (site.Status == "DOWN" || site.Status == "SSL EXP") && site.LastError != "" {
|
||||
row("Error", dangerStyle.Render(limitStr(site.LastError, 60)))
|
||||
}
|
||||
|
||||
if site.Type == "http" && site.StatusCode > 0 {
|
||||
row("HTTP Code", strconv.Itoa(site.StatusCode))
|
||||
}
|
||||
|
||||
if !site.StatusChangedAt.IsZero() {
|
||||
dur := time.Since(site.StatusChangedAt)
|
||||
row("State Since", site.StatusChangedAt.Format("2006-01-02 15:04:05")+" ("+fmtDuration(dur)+")")
|
||||
}
|
||||
|
||||
if !site.LastSuccessAt.IsZero() {
|
||||
ago := time.Since(site.LastSuccessAt)
|
||||
row("Last Success", site.LastSuccessAt.Format("15:04:05")+" ("+fmtDuration(ago)+" ago)")
|
||||
}
|
||||
|
||||
if m.isMonitorInMaintenance(site.ID) {
|
||||
for _, mw := range m.maintenanceWindows {
|
||||
if mw.Type == "maintenance" && (mw.MonitorID == 0 || mw.MonitorID == site.ID || mw.MonitorID == site.ParentID) {
|
||||
@@ -787,7 +837,30 @@ func (m Model) viewDetailPanel() string {
|
||||
}
|
||||
latency := time.Duration(result.LatencyNs).Milliseconds()
|
||||
ago := time.Since(result.CheckedAt).Truncate(time.Second)
|
||||
fmt.Fprintf(&b, " %-14s %s %dms %s ago\n", nodeID, status, latency, ago)
|
||||
line := fmt.Sprintf(" %-14s %s %dms %s ago", nodeID, status, latency, ago)
|
||||
if !result.IsUp && result.ErrorReason != "" {
|
||||
line += " " + dangerStyle.Render(limitStr(result.ErrorReason, 30))
|
||||
}
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
|
||||
stateChanges := m.engine.GetStateChanges(site.ID, 5)
|
||||
if len(stateChanges) > 0 {
|
||||
b.WriteString("\n" + subtleStyle.Render(" STATE CHANGES") + "\n")
|
||||
for _, sc := range stateChanges {
|
||||
ago := fmtDuration(time.Since(sc.ChangedAt))
|
||||
arrow := subtleStyle.Render(sc.FromStatus) + " → "
|
||||
if sc.ToStatus == "UP" {
|
||||
arrow += specialStyle.Render(sc.ToStatus)
|
||||
} else {
|
||||
arrow += dangerStyle.Render(sc.ToStatus)
|
||||
}
|
||||
line := fmt.Sprintf(" %s %s", arrow, subtleStyle.Render(ago+" ago"))
|
||||
if sc.ErrorReason != "" && sc.ToStatus != "UP" {
|
||||
line += " " + dangerStyle.Render(limitStr(sc.ErrorReason, 40))
|
||||
}
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user