diff --git a/cmd/uptop/main.go b/cmd/uptop/main.go index 65ad2f4..d88b2e7 100644 --- a/cmd/uptop/main.go +++ b/cmd/uptop/main.go @@ -385,6 +385,7 @@ func runServe(args []string) { eng.InitHistory() eng.InitLogs() + eng.InitAlertHealth() eng.Start(ctx) tlsCert := os.Getenv("UPTOP_TLS_CERT") diff --git a/internal/cluster/cluster_test.go b/internal/cluster/cluster_test.go index d665bbb..a7fb8a8 100644 --- a/internal/cluster/cluster_test.go +++ b/internal/cluster/cluster_test.go @@ -53,8 +53,12 @@ func (m *mockStore) GetNode(string) (models.ProbeNode, error) { return models.Pr func (m *mockStore) GetAllNodes() ([]models.ProbeNode, error) { return nil, nil } func (m *mockStore) UpdateNodeLastSeen(string) error { return nil } func (m *mockStore) DeleteNode(string) error { return nil } -func (m *mockStore) SaveLog(string) error { return nil } -func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil } +func (m *mockStore) LoadAlertHealth() (map[int]models.AlertHealthRecord, error) { + return nil, nil +} +func (m *mockStore) SaveAlertHealth(models.AlertHealthRecord) error { return nil } +func (m *mockStore) SaveLog(string) error { return nil } +func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil } func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, error) { return nil, nil } diff --git a/internal/metrics/prometheus_test.go b/internal/metrics/prometheus_test.go index 24f6567..7effdc8 100644 --- a/internal/metrics/prometheus_test.go +++ b/internal/metrics/prometheus_test.go @@ -51,8 +51,12 @@ func (m *mockStore) GetNode(string) (models.ProbeNode, error) { return m func (m *mockStore) GetAllNodes() ([]models.ProbeNode, error) { return nil, nil } func (m *mockStore) UpdateNodeLastSeen(string) error { return nil } func (m *mockStore) DeleteNode(string) error { return nil } -func (m *mockStore) SaveLog(string) error { return nil } -func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil } +func (m 
*mockStore) LoadAlertHealth() (map[int]models.AlertHealthRecord, error) { + return nil, nil +} +func (m *mockStore) SaveAlertHealth(models.AlertHealthRecord) error { return nil } +func (m *mockStore) SaveLog(string) error { return nil } +func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil } func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, error) { return nil, nil } diff --git a/internal/models/models.go b/internal/models/models.go index 571d555..98c1be5 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -79,6 +79,17 @@ type ProbeNode struct { Version string } +// AlertHealthRecord is the persisted send health of an alert channel. It lets the +// "last sent" / health indicators survive restarts instead of resetting to "never". +type AlertHealthRecord struct { + AlertID int + LastSendAt time.Time + LastSendOK bool + LastError string + SendCount int + FailCount int +} + type MaintenanceWindow struct { ID int MonitorID int diff --git a/internal/monitor/monitor.go b/internal/monitor/monitor.go index 1a1c8d2..91a8ef6 100644 --- a/internal/monitor/monitor.go +++ b/internal/monitor/monitor.go @@ -146,6 +146,26 @@ func (e *Engine) InitLogs() { e.logStore = logs } +// InitAlertHealth restores persisted alert send health so the dashboard shows real +// "last sent" / health state on startup instead of resetting every channel to "never". 
+func (e *Engine) InitAlertHealth() { + records, err := e.db.LoadAlertHealth() + if err != nil { + return + } + e.alertHealthMu.Lock() + defer e.alertHealthMu.Unlock() + for id, r := range records { + e.alertHealth[id] = AlertHealth{ + LastSendAt: r.LastSendAt, + LastSendOK: r.LastSendOK, + LastError: r.LastError, + SendCount: r.SendCount, + FailCount: r.FailCount, + } + } +} + func (e *Engine) GetLogs() []string { e.logMu.RLock() defer e.logMu.RUnlock() @@ -612,6 +632,18 @@ func (e *Engine) recordAlertResult(alertID int, ok bool, errMsg string) { h.FailCount++ } e.alertHealth[alertID] = h + + // Persist best-effort so health survives restarts; DB IO off the alert path. + go func(rec models.AlertHealthRecord) { + _ = e.db.SaveAlertHealth(rec) + }(models.AlertHealthRecord{ + AlertID: alertID, + LastSendAt: h.LastSendAt, + LastSendOK: h.LastSendOK, + LastError: h.LastError, + SendCount: h.SendCount, + FailCount: h.FailCount, + }) } func (e *Engine) GetAlertHealth(alertID int) AlertHealth { diff --git a/internal/monitor/monitor_test.go b/internal/monitor/monitor_test.go index 4792bf2..9425826 100644 --- a/internal/monitor/monitor_test.go +++ b/internal/monitor/monitor_test.go @@ -63,6 +63,10 @@ func (m *mockStore) GetNode(string) (models.ProbeNode, error) { return m func (m *mockStore) GetAllNodes() ([]models.ProbeNode, error) { return nil, nil } func (m *mockStore) UpdateNodeLastSeen(string) error { return nil } func (m *mockStore) DeleteNode(string) error { return nil } +func (m *mockStore) LoadAlertHealth() (map[int]models.AlertHealthRecord, error) { + return nil, nil +} +func (m *mockStore) SaveAlertHealth(models.AlertHealthRecord) error { return nil } func (m *mockStore) GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, error) { return nil, nil } diff --git a/internal/server/server_test.go b/internal/server/server_test.go index 2e9de56..73b7152 100644 --- a/internal/server/server_test.go +++ b/internal/server/server_test.go @@ -65,8 +65,12 @@ func (m 
*mockStore) AddAlertReturningID(string, string, map[string]string) (int, func (m *mockStore) GetAllNodes() ([]models.ProbeNode, error) { return nil, nil } func (m *mockStore) UpdateNodeLastSeen(string) error { return nil } func (m *mockStore) DeleteNode(string) error { return nil } -func (m *mockStore) SaveLog(string) error { return nil } -func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil } +func (m *mockStore) LoadAlertHealth() (map[int]models.AlertHealthRecord, error) { + return nil, nil +} +func (m *mockStore) SaveAlertHealth(models.AlertHealthRecord) error { return nil } +func (m *mockStore) SaveLog(string) error { return nil } +func (m *mockStore) LoadLogs(int) ([]string, error) { return nil, nil } func (m *mockStore) GetAllMaintenanceWindows(int) ([]models.MaintenanceWindow, error) { return nil, nil } diff --git a/internal/store/dialect.go b/internal/store/dialect.go index 4a4f8e8..2e9ce2c 100644 --- a/internal/store/dialect.go +++ b/internal/store/dialect.go @@ -14,6 +14,7 @@ type Dialect interface { ImportWipe(tx *sql.Tx) ImportResetSequences(tx *sql.Tx) UpsertNodeSQL() string + UpsertAlertHealthSQL() string } func rewritePlaceholders(query string, dollarStyle bool) string { diff --git a/internal/store/postgres.go b/internal/store/postgres.go index 320fb51..c6e896d 100644 --- a/internal/store/postgres.go +++ b/internal/store/postgres.go @@ -81,6 +81,14 @@ func (d *PostgresDialect) CreateTablesSQL() []string { changed_at TIMESTAMP DEFAULT NOW() )`, `CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`, + `CREATE TABLE IF NOT EXISTS alert_health ( + alert_id INTEGER PRIMARY KEY, + last_send_at TIMESTAMP, + last_send_ok BOOLEAN DEFAULT FALSE, + last_error TEXT DEFAULT '', + send_count INTEGER DEFAULT 0, + fail_count INTEGER DEFAULT 0 + )`, } } @@ -106,6 +114,10 @@ func (d *PostgresDialect) UpsertNodeSQL() string { return "INSERT INTO nodes (id, name, region, last_seen, version) VALUES ($1, $2, $3, 
NOW(), $4) ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name, region = EXCLUDED.region, last_seen = NOW(), version = EXCLUDED.version" } +func (d *PostgresDialect) UpsertAlertHealthSQL() string { + return "INSERT INTO alert_health (alert_id, last_send_at, last_send_ok, last_error, send_count, fail_count) VALUES ($1, $2, $3, $4, $5, $6) ON CONFLICT (alert_id) DO UPDATE SET last_send_at = EXCLUDED.last_send_at, last_send_ok = EXCLUDED.last_send_ok, last_error = EXCLUDED.last_error, send_count = EXCLUDED.send_count, fail_count = EXCLUDED.fail_count" +} + func (d *PostgresDialect) ResetSequenceOnEmpty(db *sql.DB, table string) {} func (d *PostgresDialect) ImportWipe(tx *sql.Tx) { diff --git a/internal/store/sqlite.go b/internal/store/sqlite.go index beadc40..ee2d65e 100644 --- a/internal/store/sqlite.go +++ b/internal/store/sqlite.go @@ -88,6 +88,14 @@ func (d *SQLiteDialect) CreateTablesSQL() []string { changed_at DATETIME DEFAULT CURRENT_TIMESTAMP )`, `CREATE INDEX IF NOT EXISTS idx_state_changes_site ON state_changes(site_id, changed_at DESC)`, + `CREATE TABLE IF NOT EXISTS alert_health ( + alert_id INTEGER PRIMARY KEY, + last_send_at DATETIME, + last_send_ok BOOLEAN DEFAULT 0, + last_error TEXT DEFAULT '', + send_count INTEGER DEFAULT 0, + fail_count INTEGER DEFAULT 0 + )`, } } @@ -113,6 +121,10 @@ func (d *SQLiteDialect) UpsertNodeSQL() string { return "INSERT OR REPLACE INTO nodes (id, name, region, last_seen, version) VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?)" } +func (d *SQLiteDialect) UpsertAlertHealthSQL() string { + return "INSERT OR REPLACE INTO alert_health (alert_id, last_send_at, last_send_ok, last_error, send_count, fail_count) VALUES (?, ?, ?, ?, ?, ?)" +} + func (d *SQLiteDialect) ResetSequenceOnEmpty(db *sql.DB, table string) { var count int _ = db.QueryRow("SELECT COUNT(*) FROM " + table).Scan(&count) //nolint:errcheck diff --git a/internal/store/sqlstore.go b/internal/store/sqlstore.go index e24c9f1..bf6a27c 100644 --- a/internal/store/sqlstore.go 
+++ b/internal/store/sqlstore.go @@ -430,6 +430,37 @@ func (s *SQLStore) DeleteNode(id string) error { return err } +func (s *SQLStore) LoadAlertHealth() (map[int]models.AlertHealthRecord, error) { + rows, err := s.db.Query("SELECT alert_id, last_send_at, last_send_ok, last_error, send_count, fail_count FROM alert_health") + if err != nil { + return nil, err + } + defer rows.Close() + out := make(map[int]models.AlertHealthRecord) + for rows.Next() { + var r models.AlertHealthRecord + var lastSend sql.NullTime + if err := rows.Scan(&r.AlertID, &lastSend, &r.LastSendOK, &r.LastError, &r.SendCount, &r.FailCount); err != nil { + return out, err + } + if lastSend.Valid { + r.LastSendAt = lastSend.Time + } + out[r.AlertID] = r + } + return out, rows.Err() +} + +func (s *SQLStore) SaveAlertHealth(h models.AlertHealthRecord) error { + var lastSend interface{} + if !h.LastSendAt.IsZero() { + lastSend = h.LastSendAt + } + _, err := s.db.Exec(s.dialect.UpsertAlertHealthSQL(), + h.AlertID, lastSend, h.LastSendOK, h.LastError, h.SendCount, h.FailCount) + return err +} + func (s *SQLStore) SaveLog(message string) error { _, err := s.db.Exec(s.q("INSERT INTO logs (message) VALUES (?)"), message) if err != nil { diff --git a/internal/store/store.go b/internal/store/store.go index 8321486..2d00880 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -49,6 +49,10 @@ type Store interface { UpdateNodeLastSeen(id string) error DeleteNode(id string) error + // Alert Health + LoadAlertHealth() (map[int]models.AlertHealthRecord, error) + SaveAlertHealth(h models.AlertHealthRecord) error + // Logs SaveLog(message string) error LoadLogs(limit int) ([]string, error) diff --git a/internal/tui/tab_sites.go b/internal/tui/tab_sites.go index 8994757..ad1c19e 100644 --- a/internal/tui/tab_sites.go +++ b/internal/tui/tab_sites.go @@ -959,19 +959,26 @@ func (m Model) viewDetailPanel() string { } } else { b.WriteString(" " + latencySparkline(hist.Latencies, hist.Statuses, 
sparkWidth)) - if len(hist.Latencies) > 0 { - minL, maxL := hist.Latencies[0], hist.Latencies[0] - var total time.Duration - for _, l := range hist.Latencies { - total += l - if l < minL { - minL = l - } - if l > maxL { - maxL = l - } + // Stats over successful checks only — a failed check is stored as 0ns latency + // and would otherwise drag Min to 0ms and skew the average. + var minL, maxL, total time.Duration + count := 0 + for i, l := range hist.Latencies { + if i < len(hist.Statuses) && !hist.Statuses[i] { + continue } - avg := total / time.Duration(len(hist.Latencies)) + if count == 0 { + minL, maxL = l, l + } else if l < minL { + minL = l + } else if l > maxL { + maxL = l + } + total += l + count++ + } + if count > 0 { + avg := total / time.Duration(count) fmt.Fprintf(&b, "\n %s %dms %s %dms %s %dms", subtleStyle.Render("Min"), minL.Milliseconds(), subtleStyle.Render("Avg"), avg.Milliseconds(), diff --git a/internal/tui/tui.go b/internal/tui/tui.go index 0677643..1bddca1 100644 --- a/internal/tui/tui.go +++ b/internal/tui/tui.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "math" + "os" "sort" "strings" "time" @@ -122,6 +123,10 @@ type Model struct { filterMode bool filterText string + + // demoMode renders a stable status dot instead of the animated pulse so + // screenshots/recordings don't capture the spinner mid-frame. Set via UPTOP_DEMO=1. 
+ demoMode bool } func InitialModel(isAdmin bool, s store.Store, eng *monitor.Engine) Model { @@ -155,6 +160,7 @@ func InitialModel(isAdmin bool, s store.Store, eng *monitor.Engine) Model { collapsed: collapsed, theme: theme, themeIndex: themeIdx, + demoMode: os.Getenv("UPTOP_DEMO") == "1", } } @@ -754,11 +760,6 @@ func (m *Model) submitForm() { } func (m Model) pulseIndicator() string { - frame := m.tickCount % len(pulseFrames) - brightness := int(m.pulsePos*155) + 100 - if brightness > 255 { - brightness = 255 - } hasDown := false for _, s := range m.sites { if !s.Paused && !m.isMonitorInMaintenance(s.ID) && (s.Status == "DOWN" || s.Status == "SSL EXP") { @@ -766,6 +767,19 @@ func (m Model) pulseIndicator() string { break } } + // Stills can't show animation: render a stable status dot in demo mode. + if m.demoMode { + c := m.theme.Success + if hasDown { + c = m.theme.Danger + } + return lipgloss.NewStyle().Foreground(c).Render("●") + } + frame := m.tickCount % len(pulseFrames) + brightness := int(m.pulsePos*155) + 100 + if brightness > 255 { + brightness = 255 + } var color string if hasDown { color = fmt.Sprintf("#%02x%02x%02x", brightness, brightness/4, brightness/4) @@ -953,7 +967,11 @@ func (m Model) viewDashboard() string { online++ } } - statusParts = append(statusParts, fmt.Sprintf("%d probes", online)) + probeLabel := "probes" + if online == 1 { + probeLabel = "probe" + } + statusParts = append(statusParts, fmt.Sprintf("%d %s", online, probeLabel)) } statusLine := strings.Join(statusParts, subtleStyle.Render(" · ")) diff --git a/vhs/backfill/main.go b/vhs/backfill/main.go index b698b93..c839ccb 100644 --- a/vhs/backfill/main.go +++ b/vhs/backfill/main.go @@ -51,6 +51,15 @@ func main() { fmt.Fprintf(os.Stderr, "maintenance: %v\n", err) os.Exit(1) } + alertIDs, err := loadAlertIDs(db) + if err != nil { + fmt.Fprintf(os.Stderr, "load alert IDs: %v\n", err) + os.Exit(1) + } + if err := backfillAlertHealth(db, now, alertIDs); err != nil { + 
// backfillAlertHealth writes demo send-health rows for the seeded alert
// channels so each one shows a recent "last sent" time and send/fail counts.
// Channels whose name is not present in alertIDs are skipped. All rows are
// written in a single transaction.
func backfillAlertHealth(db *sql.DB, now time.Time, alertIDs map[string]int) error {
	const (
		upsert   = "INSERT OR REPLACE INTO alert_health (alert_id, last_send_at, last_send_ok, last_error, send_count, fail_count) VALUES (?, ?, ?, ?, ?, ?)"
		tsLayout = "2006-01-02 15:04:05"
	)

	type seed struct {
		channel string
		age     time.Duration // how long before now the last send happened
		healthy bool
		sent    int
		failed  int
	}
	seeds := []seed{
		{channel: "Discord Homelab", age: 4 * time.Minute, healthy: true, sent: 37, failed: 0},
		{channel: "Slack Ops", age: 9 * time.Minute, healthy: true, sent: 21, failed: 1},
		{channel: "Ntfy Alerts", age: 1 * time.Hour, healthy: true, sent: 12, failed: 0},
		{channel: "Email Oncall", age: 3 * time.Hour, healthy: true, sent: 5, failed: 0},
	}

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare(upsert)
	if err != nil {
		return err
	}
	defer stmt.Close()

	for _, s := range seeds {
		alertID, known := alertIDs[s.channel]
		if !known {
			continue
		}
		lastSent := now.Add(-s.age).Format(tsLayout)
		if _, err := stmt.Exec(alertID, lastSent, s.healthy, "", s.sent, s.failed); err != nil {
			return err
		}
	}
	return tx.Commit()
}
downFrom int // first DOWN check index (-1 = always up) + downTo int // exclusive end of the DOWN window; use 60 (total) for a still-down monitor } func backfillHistory(db *sql.DB, rng *rand.Rand, now time.Time, ids map[string]int) error { + // Latency ranges reflect monitoring public services over the internet, so the + // detail histogram brackets the live latency the engine measures at capture time. + // 60 checks * 24m spacing = a 24h window; dip indices place outages within it. profiles := []monitorProfile{ - {"Nextcloud", 40, 80, -1}, - {"Jellyfin", 80, 200, -1}, - {"Home Assistant", 15, 45, -1}, - {"Gitea", 40, 90, -1}, - {"Traefik Dashboard", 5, 25, -1}, - {"Vaultwarden", 50, 130, -1}, - {"Personal Blog", 25, 65, -1}, - {"Immich", 100, 280, -1}, // spikes handled below - {"Auth Portal", 30, 70, 40}, // DOWN after check 40 - {"Edge Router", 5, 15, -1}, // ping - {"Postgres", 1, 5, -1}, // port - {"DNS Primary", 10, 30, -1}, + {"Nextcloud", 200, 600, 47, 48}, // brief blip ~5h ago, recovered + {"Jellyfin", 40, 180, 15, 16}, // brief blip ~18h ago, recovered + {"Home Assistant", 30, 120, -1, 0}, // + {"Gitea", 50, 200, -1, 0}, // + {"Traefik Dashboard", 60, 200, -1, 0}, // + {"Vaultwarden", 80, 250, -1, 0}, // + {"Personal Blog", 40, 160, -1, 0}, // + {"Immich", 60, 300, 30, 31}, // brief blip ~12h ago; periodic spikes below + {"Auth Portal", 30, 90, 40, 60}, // DOWN ~8h ago, still down + {"Edge Router", 5, 20, -1, 0}, // ping + {"Postgres", 1, 6, -1, 0}, // port + {"DNS Primary", 8, 30, -1, 0}, // dns } tx, err := db.Begin() @@ -128,7 +195,7 @@ func backfillHistory(db *sql.DB, rng *rand.Rand, now time.Time, ids map[string]i var latencyNs int64 isUp := true - if p.downFrom >= 0 && i >= p.downFrom { + if p.downFrom >= 0 && i >= p.downFrom && i < p.downTo { latencyNs = 0 isUp = false } else { @@ -155,14 +222,16 @@ func backfillStateChanges(db *sql.DB, now time.Time, ids map[string]int) error { reason string at time.Time } + // Timed to line up with the history 
dips (Nextcloud ~5h, Immich ~12h, Jellyfin ~18h) + // and the still-down Auth Portal (~8h), so detail panels read coherently. changes := []sc{ - {"Nextcloud", "UP", "DOWN", "read timeout", now.Add(-3 * 24 * time.Hour).Add(-5 * time.Minute)}, - {"Nextcloud", "DOWN", "UP", "", now.Add(-3 * 24 * time.Hour)}, - {"Jellyfin", "UP", "DOWN", "connection reset", now.Add(-18 * time.Hour).Add(-3 * time.Minute)}, - {"Jellyfin", "DOWN", "UP", "", now.Add(-18 * time.Hour)}, - {"Auth Portal", "UP", "DOWN", "connection refused", now.Add(-8 * time.Hour)}, + {"Nextcloud", "UP", "DOWN", "read timeout", now.Add(-5 * time.Hour).Add(-8 * time.Minute)}, + {"Nextcloud", "DOWN", "UP", "", now.Add(-5 * time.Hour)}, + {"Auth Portal", "UP", "DOWN", "no such host", now.Add(-8 * time.Hour)}, {"Immich", "UP", "DOWN", "502 Bad Gateway", now.Add(-12 * time.Hour).Add(-8 * time.Minute)}, {"Immich", "DOWN", "UP", "", now.Add(-12 * time.Hour)}, + {"Jellyfin", "UP", "DOWN", "connection reset", now.Add(-18 * time.Hour).Add(-5 * time.Minute)}, + {"Jellyfin", "DOWN", "UP", "", now.Add(-18 * time.Hour)}, } tx, err := db.Begin() @@ -191,25 +260,35 @@ func backfillStateChanges(db *sql.DB, now time.Time, ids map[string]int) error { func backfillLogs(db *sql.DB, now time.Time) error { type logEntry struct { - msg string - at time.Time + text string + at time.Time } + ago := func(h, m, s int) time.Time { + return now.Add(-(time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(s)*time.Second)) + } + // Ordered newest-first. The bracket time is derived from `at` (not hardcoded), so the + // Logs view — which renders the leading [HH:MM] — reads chronologically. Outage times + // line up with the state changes and history dips above. 
logs := []logEntry{ - {"[06:12] Monitor 'Auth Portal' confirmed DOWN: connection refused", now.Add(-8 * time.Hour)}, - {"[06:12] Monitor 'Auth Portal' failed check 2/2", now.Add(-8*time.Hour - 30*time.Second)}, - {"[06:11] Monitor 'Auth Portal' failed check 1/2", now.Add(-8*time.Hour - 60*time.Second)}, - {"[12:33] Monitor 'Immich' recovered (was down 8m)", now.Add(-12 * time.Hour)}, - {"[12:25] Monitor 'Immich' confirmed DOWN: 502 Bad Gateway", now.Add(-12*time.Hour - 8*time.Minute)}, - {"[12:25] Monitor 'Immich' failed check 3/3", now.Add(-12*time.Hour - 8*time.Minute - 30*time.Second)}, - {"[12:25] Monitor 'Immich' failed check 2/3", now.Add(-12*time.Hour - 8*time.Minute - 60*time.Second)}, - {"[12:24] Monitor 'Immich' failed check 1/3", now.Add(-12*time.Hour - 9*time.Minute)}, - {"[06:14] Monitor 'Jellyfin' recovered (was down 3m)", now.Add(-18 * time.Hour)}, - {"[06:11] Monitor 'Jellyfin' confirmed DOWN: connection reset", now.Add(-18*time.Hour - 3*time.Minute)}, - {"[06:11] Monitor 'Jellyfin' failed check 2/2", now.Add(-18*time.Hour - 3*time.Minute - 30*time.Second)}, - {"[06:10] Monitor 'Jellyfin' failed check 1/2", now.Add(-18*time.Hour - 4*time.Minute)}, - {"[23:45] SSL certificate for 'Personal Blog' expires in 42 days", now.Add(-28 * time.Hour)}, - {"[08:00] Loaded check history from database", now.Add(-32*time.Hour - 30*time.Minute)}, - {"[08:00] Engine RESUMED (Active)", now.Add(-32*time.Hour - 30*time.Minute - 5*time.Second)}, + {"Monitor 'Nextcloud' recovered (was down 8m)", ago(5, 0, 0)}, + {"Monitor 'Nextcloud' confirmed DOWN: read timeout", ago(5, 8, 0)}, + {"Monitor 'Nextcloud' failed check 2/2", ago(5, 8, 30)}, + {"Monitor 'Nextcloud' failed check 1/2", ago(5, 9, 0)}, + {"Monitor 'Auth Portal' confirmed DOWN: no such host", ago(8, 0, 0)}, + {"Monitor 'Auth Portal' failed check 2/2", ago(8, 0, 30)}, + {"Monitor 'Auth Portal' failed check 1/2", ago(8, 1, 0)}, + {"Monitor 'Immich' recovered (was down 8m)", ago(12, 0, 0)}, + {"Monitor 'Immich' 
// backfillNodes seeds the demo probe nodes, one per region, all stamped with
// last_seen = now.
//
// The rows are written in one transaction through a prepared statement, the
// same way the other backfill helpers work, so a failure part-way through
// cannot leave a partial set of nodes behind. The returned error names the
// node whose insert failed.
func backfillNodes(db *sql.DB, now time.Time) error {
	nodes := []struct{ id, name, region string }{
		{"node-use1", "leader", "us-east"},
		{"node-euw1", "probe-eu", "eu-west"},
		{"node-apse1", "probe-ap", "ap-southeast"},
	}
	ts := now.Format("2006-01-02 15:04:05")

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// After a successful Commit this only returns sql.ErrTxDone, which is
	// safe to ignore.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare("INSERT OR REPLACE INTO nodes (id, name, region, last_seen, version) VALUES (?, ?, ?, ?, ?)")
	if err != nil {
		return err
	}
	defer stmt.Close()

	for _, n := range nodes {
		if _, err := stmt.Exec(n.id, n.name, n.region, ts, "2026.05.1"); err != nil {
			return fmt.Errorf("node %s: %w", n.id, err)
		}
	}
	return tx.Commit()
}
const (
	pad = 24 // margin, in pixels, kept around the detected content
	tol = 28 // summed per-channel distance from the background above which a pixel counts as content
)

// main crops every PNG in the directory named by the first argument (default
// vhs/screenshots) in place. It exits with status 1 when the glob fails, when
// the directory holds no PNGs, or on the first file that cannot be cropped.
func main() {
	dir := "vhs/screenshots"
	if len(os.Args) > 1 {
		dir = os.Args[1]
	}
	paths, err := filepath.Glob(filepath.Join(dir, "*.png"))
	if err != nil {
		fmt.Fprintf(os.Stderr, "glob: %v\n", err)
		os.Exit(1)
	}
	if len(paths) == 0 {
		fmt.Fprintf(os.Stderr, "no PNGs in %s\n", dir)
		os.Exit(1)
	}
	for _, p := range paths {
		w, h, err := cropFile(p)
		if err != nil {
			fmt.Fprintf(os.Stderr, "crop %s: %v\n", p, err)
			os.Exit(1)
		}
		fmt.Printf("cropped %s -> %dx%d\n", filepath.Base(p), w, h)
	}
}

// cropFile trims the uniform border around the content of the PNG at path,
// keeping a pad-pixel margin, and rewrites the file in place. It returns the
// width and height of the resulting image.
//
// The file is left untouched when the frame is blank or when the padded
// content already fills the whole image, so running the tool again on its own
// output is a no-op.
func cropFile(path string) (int, int, error) {
	f, err := os.Open(path) //nolint:gosec // dev tool: paths come from a trusted local glob
	if err != nil {
		return 0, 0, err
	}
	src, err := png.Decode(f)
	_ = f.Close() // read-only handle: a close error cannot lose data
	if err != nil {
		return 0, 0, err
	}

	b := src.Bounds()
	// Background colour is sampled just inside the top-left corner. The sample
	// point is kept inside the image so tiny frames are not compared against
	// the zero colour that At returns for out-of-bounds coordinates.
	bgR, bgG, bgB := rgb(src.At(min(b.Min.X+2, b.Max.X-1), min(b.Min.Y+2, b.Max.Y-1)))

	// Bounding box of every pixel that differs from the background by more
	// than tol.
	minX, minY := b.Max.X, b.Max.Y
	maxX, maxY := b.Min.X, b.Min.Y
	found := false
	for y := b.Min.Y; y < b.Max.Y; y++ {
		for x := b.Min.X; x < b.Max.X; x++ {
			r, g, bl := rgb(src.At(x, y))
			if abs(r-bgR)+abs(g-bgG)+abs(bl-bgB) > tol {
				found = true
				minX, minY = min(minX, x), min(minY, y)
				maxX, maxY = max(maxX, x), max(maxY, y)
			}
		}
	}
	if !found {
		return b.Dx(), b.Dy(), nil // blank frame: leave untouched
	}

	// Grow the box by the margin (max becomes exclusive) and keep it inside
	// the image.
	minX = clamp(minX-pad, b.Min.X, b.Max.X)
	minY = clamp(minY-pad, b.Min.Y, b.Max.Y)
	maxX = clamp(maxX+pad+1, b.Min.X, b.Max.X)
	maxY = clamp(maxY+pad+1, b.Min.Y, b.Max.Y)
	if minX == b.Min.X && minY == b.Min.Y && maxX == b.Max.X && maxY == b.Max.Y {
		return b.Dx(), b.Dy(), nil // nothing to trim: skip the rewrite
	}

	dst := image.NewRGBA(image.Rect(0, 0, maxX-minX, maxY-minY))
	for y := minY; y < maxY; y++ {
		for x := minX; x < maxX; x++ {
			dst.Set(x-minX, y-minY, src.At(x, y))
		}
	}

	out, err := os.Create(path) //nolint:gosec // dev tool: paths come from a trusted local glob
	if err != nil {
		return 0, 0, err
	}
	if err := png.Encode(out, dst); err != nil {
		_ = out.Close() // the encode error is the one worth reporting
		return 0, 0, err
	}
	// A failed Close on a written file can mean the data never reached disk,
	// so it is reported instead of being dropped.
	if err := out.Close(); err != nil {
		return 0, 0, err
	}
	return dst.Bounds().Dx(), dst.Bounds().Dy(), nil
}

// rgb returns the 8-bit red, green and blue components of c; alpha is ignored.
func rgb(c color.Color) (int, int, int) {
	r, g, b, _ := c.RGBA() // 16-bit channels
	return int(r >> 8), int(g >> 8), int(b >> 8)
}

// abs returns the absolute value of x.
func abs(x int) int {
	if x < 0 {
		return -x
	}
	return x
}

// clamp limits v to the closed range [lo, hi]. It requires lo <= hi.
func clamp(v, lo, hi int) int {
	return min(max(v, lo), hi)
}
Theme — cycle to the next theme, return to Sites for an alternate-palette hero. +Type "T" +Sleep 500ms +Tab +Sleep 200ms +Tab +Sleep 200ms +Tab +Sleep 1s +Screenshot vhs/screenshots/theme.png +Sleep 500ms -# Quit Type "q" Sleep 1s diff --git a/vhs/screenshots/alerts.png b/vhs/screenshots/alerts.png index 40638b5..9a4e0fe 100644 Binary files a/vhs/screenshots/alerts.png and b/vhs/screenshots/alerts.png differ diff --git a/vhs/screenshots/detail.png b/vhs/screenshots/detail.png index 1c29a57..0c8e8ca 100644 Binary files a/vhs/screenshots/detail.png and b/vhs/screenshots/detail.png differ diff --git a/vhs/screenshots/logs.png b/vhs/screenshots/logs.png index a3f655d..d0aab0a 100644 Binary files a/vhs/screenshots/logs.png and b/vhs/screenshots/logs.png differ diff --git a/vhs/screenshots/monitors.png b/vhs/screenshots/monitors.png index 4334125..867a875 100644 Binary files a/vhs/screenshots/monitors.png and b/vhs/screenshots/monitors.png differ diff --git a/vhs/screenshots/nodes.png b/vhs/screenshots/nodes.png new file mode 100644 index 0000000..53efc1b Binary files /dev/null and b/vhs/screenshots/nodes.png differ diff --git a/vhs/screenshots/theme.png b/vhs/screenshots/theme.png new file mode 100644 index 0000000..9bdd920 Binary files /dev/null and b/vhs/screenshots/theme.png differ diff --git a/vhs/seed.yaml b/vhs/seed.yaml index 16d5d81..30b34eb 100644 --- a/vhs/seed.yaml +++ b/vhs/seed.yaml @@ -28,7 +28,7 @@ monitors: # HTTP — homelab services - name: Nextcloud type: http - url: https://example.com + url: https://nextcloud.com interval: 30 alert: Discord Homelab check_ssl: true @@ -37,21 +37,21 @@ monitors: - name: Jellyfin type: http - url: https://example.com + url: https://jellyfin.org interval: 30 alert: Discord Homelab max_retries: 2 - name: Home Assistant type: http - url: https://example.com + url: https://www.home-assistant.io interval: 30 alert: Discord Homelab max_retries: 3 - name: Gitea type: http - url: https://example.com + url: 
https://about.gitea.com interval: 60 alert: Discord Homelab check_ssl: true @@ -60,14 +60,14 @@ monitors: - name: Traefik Dashboard type: http - url: https://example.com + url: https://traefik.io interval: 60 alert: Discord Homelab max_retries: 1 - name: Vaultwarden type: http - url: https://example.com + url: https://bitwarden.com interval: 30 alert: Discord Homelab check_ssl: true @@ -76,7 +76,7 @@ monitors: - name: Personal Blog type: http - url: https://example.com + url: https://jvns.ca interval: 120 alert: Discord Homelab check_ssl: true @@ -85,17 +85,17 @@ monitors: - name: Immich type: http - url: https://example.com + url: https://immich.app interval: 60 alert: Discord Homelab check_ssl: true expiry_threshold: 7 max_retries: 3 - # HTTP — deliberate failure + # HTTP — deliberate failure (non-resolving homelab host → stays DOWN) - name: Auth Portal type: http - url: http://localhost:1 + url: https://auth.home.arpa interval: 30 alert: Discord Homelab max_retries: 2 diff --git a/vhs/setup.sh b/vhs/setup.sh index 9f8494d..6bf1364 100755 --- a/vhs/setup.sh +++ b/vhs/setup.sh @@ -9,13 +9,16 @@ echo "==> Seeding monitors and alerts..." UPTOP_DB_DSN="$DB" ./uptop apply -f vhs/seed.yaml 2>&1 echo "==> Backfilling check history..." -BACKFILL_OUT=$(go run ./vhs/backfill/ "$DB") +# Build first so the backfill's `now` (node last_seen, heartbeat timing) isn't racing +# a cold compile — keeps the capture window deterministic. +go build -o /tmp/uptop-backfill ./vhs/backfill/ +BACKFILL_OUT=$(/tmp/uptop-backfill "$DB") echo "$BACKFILL_OUT" PUSH_TOKEN=$(echo "$BACKFILL_OUT" | grep '^PUSH_TOKEN=' | cut -d= -f2) if [ -n "$PUSH_TOKEN" ]; then - echo "==> Sending push heartbeat in 15s (background)..." - (sleep 15 && curl -s "http://localhost:18099/api/push" -H "Authorization: Bearer $PUSH_TOKEN" > /dev/null 2>&1) & + echo "==> Sending push heartbeat in 10s (background)..." 
+ (sleep 10 && curl -s "http://localhost:18099/api/push" -H "Authorization: Bearer $PUSH_TOKEN" > /dev/null 2>&1) & fi echo "==> Starting uptop server..." @@ -24,4 +27,5 @@ exec env \ UPTOP_PORT=23299 \ UPTOP_HTTP_PORT=18099 \ UPTOP_ALLOW_PRIVATE_TARGETS=true \ + UPTOP_DEMO=1 \ ./uptop serve 2>/dev/null