chore(tui): polish demo + regenerate screenshots
CI / test (pull_request) Successful in 2m45s
CI / lint (pull_request) Successful in 1m4s
CI / vulncheck (pull_request) Successful in 56s

Rework the VHS demo so the README screenshots actually entice a download.

Demo data / tooling:
- seed.yaml: real, reachable service URLs (detail now shows nextcloud.com,
  not example.com); Auth Portal -> non-resolving home.arpa host so it reads
  as a believable, reliably-DOWN monitor
- backfill: transient outages for Nextcloud/Jellyfin/Immich aligned with their
  state changes (uptime % now matches); log timestamps derived from now so the
  Logs view reads chronologically; real SSL warning; three probe nodes across
  regions; seeded alert send health
- demo.tape: shorter warm-up, added Nodes + theme captures, ordered so every
  shot stays inside the 60s node-freshness window (consistent probe count)
- vhs/crop: new tool to trim the empty terminal border around each screenshot
- setup.sh: build backfill up front for deterministic timing; UPTOP_DEMO=1

Supporting code:
- persist alert send health (new alert_health table, load on startup,
  best-effort save on send) so health/last-sent survive restarts
- latency Min/Avg/Max ignore failed checks (no more "Min 0ms")
- correct "probe"/"probes" pluralization
- stable status dot instead of an animated spinner under UPTOP_DEMO
This commit is contained in:
2026-05-28 22:32:45 -04:00
parent 9c7ed284b3
commit 03cbe283df
25 changed files with 483 additions and 99 deletions
+136 -42
View File
@@ -51,6 +51,15 @@ func main() {
fmt.Fprintf(os.Stderr, "maintenance: %v\n", err)
os.Exit(1)
}
alertIDs, err := loadAlertIDs(db)
if err != nil {
fmt.Fprintf(os.Stderr, "load alert IDs: %v\n", err)
os.Exit(1)
}
if err := backfillAlertHealth(db, now, alertIDs); err != nil {
fmt.Fprintf(os.Stderr, "alert health: %v\n", err)
os.Exit(1)
}
var count int
_ = db.QueryRow("SELECT COUNT(*) FROM check_history").Scan(&count)
@@ -67,6 +76,18 @@ func loadSiteIDs(db *sql.DB) (map[string]int, error) {
if err != nil {
return nil, err
}
return scanNameIDs(rows)
}
// loadAlertIDs selects the id and name of every row in the alerts table and
// hands the result set to scanNameIDs, returning whatever map and error that
// helper produces. A failed query is returned as-is with a nil map.
func loadAlertIDs(db *sql.DB) (map[string]int, error) {
	const query = "SELECT id, name FROM alerts"

	result, queryErr := db.Query(query)
	if queryErr == nil {
		// scanNameIDs takes ownership of the result set from here on.
		return scanNameIDs(result)
	}
	return nil, queryErr
}
func scanNameIDs(rows *sql.Rows) (map[string]int, error) {
defer rows.Close()
ids := make(map[string]int)
for rows.Next() {
@@ -80,27 +101,73 @@ func loadSiteIDs(db *sql.DB) (map[string]int, error) {
return ids, rows.Err()
}
// backfillAlertHealth seeds realistic send health so the Alerts tab shows recent,
// healthy "last sent" times and green health dots instead of "never" across the board.
//
// For every entry in the fixed table below whose name is a key of alertIDs, one
// alert_health row is written with INSERT OR REPLACE; last_send_at is now minus
// the entry's age and last_error is always the empty string. Names that are not
// in alertIDs are skipped without error.
//
// All rows are written inside a single transaction. On any failure the
// transaction is rolled back and the error is returned wrapped with the step
// (and, for inserts, the alert name) that failed.
func backfillAlertHealth(db *sql.DB, now time.Time, alertIDs map[string]int) error {
	const (
		timestampLayout = "2006-01-02 15:04:05"
		insertSQL       = "INSERT OR REPLACE INTO alert_health (alert_id, last_send_at, last_send_ok, last_error, send_count, fail_count) VALUES (?, ?, ?, ?, ?, ?)"
	)

	// health is one seeded row: which alert, how long before now its last send
	// happened, whether that send succeeded, and its lifetime send/fail counters.
	type health struct {
		name    string
		sentAgo time.Duration
		ok      bool
		sends   int
		fails   int
	}
	seeds := []health{
		{"Discord Homelab", 4 * time.Minute, true, 37, 0},
		{"Slack Ops", 9 * time.Minute, true, 21, 1},
		{"Ntfy Alerts", 1 * time.Hour, true, 12, 0},
		{"Email Oncall", 3 * time.Hour, true, 5, 0},
	}

	tx, err := db.Begin()
	if err != nil {
		return fmt.Errorf("begin transaction: %w", err)
	}
	// Safety net for every early return; once Commit has succeeded the
	// Rollback error is expected and deliberately discarded.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare(insertSQL)
	if err != nil {
		return fmt.Errorf("prepare alert_health insert: %w", err)
	}
	defer stmt.Close()

	for _, s := range seeds {
		id, found := alertIDs[s.name]
		if !found {
			// This alert is not configured in the database; nothing to seed.
			continue
		}
		sentAt := now.Add(-s.sentAgo).Format(timestampLayout)
		if _, err := stmt.Exec(id, sentAt, s.ok, "", s.sends, s.fails); err != nil {
			return fmt.Errorf("insert health for alert %q: %w", s.name, err)
		}
	}

	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit alert health: %w", err)
	}
	return nil
}
type monitorProfile struct {
name string
minMs int
maxMs int
downFrom int // check index where DOWN starts (-1 = never)
downFrom int // first DOWN check index (-1 = always up)
downTo int // exclusive end of the DOWN window; use 60 (total) for a still-down monitor
}
func backfillHistory(db *sql.DB, rng *rand.Rand, now time.Time, ids map[string]int) error {
// Latency ranges reflect monitoring public services over the internet, so the
// detail histogram brackets the live latency the engine measures at capture time.
// 60 checks * 24m spacing = a 24h window; dip indices place outages within it.
profiles := []monitorProfile{
{"Nextcloud", 40, 80, -1},
{"Jellyfin", 80, 200, -1},
{"Home Assistant", 15, 45, -1},
{"Gitea", 40, 90, -1},
{"Traefik Dashboard", 5, 25, -1},
{"Vaultwarden", 50, 130, -1},
{"Personal Blog", 25, 65, -1},
{"Immich", 100, 280, -1}, // spikes handled below
{"Auth Portal", 30, 70, 40}, // DOWN after check 40
{"Edge Router", 5, 15, -1}, // ping
{"Postgres", 1, 5, -1}, // port
{"DNS Primary", 10, 30, -1},
{"Nextcloud", 200, 600, 47, 48}, // brief blip ~5h ago, recovered
{"Jellyfin", 40, 180, 15, 16}, // brief blip ~18h ago, recovered
{"Home Assistant", 30, 120, -1, 0}, //
{"Gitea", 50, 200, -1, 0}, //
{"Traefik Dashboard", 60, 200, -1, 0}, //
{"Vaultwarden", 80, 250, -1, 0}, //
{"Personal Blog", 40, 160, -1, 0}, //
{"Immich", 60, 300, 30, 31}, // brief blip ~12h ago; periodic spikes below
{"Auth Portal", 30, 90, 40, 60}, // DOWN ~8h ago, still down
{"Edge Router", 5, 20, -1, 0}, // ping
{"Postgres", 1, 6, -1, 0}, // port
{"DNS Primary", 8, 30, -1, 0}, // dns
}
tx, err := db.Begin()
@@ -128,7 +195,7 @@ func backfillHistory(db *sql.DB, rng *rand.Rand, now time.Time, ids map[string]i
var latencyNs int64
isUp := true
if p.downFrom >= 0 && i >= p.downFrom {
if p.downFrom >= 0 && i >= p.downFrom && i < p.downTo {
latencyNs = 0
isUp = false
} else {
@@ -155,14 +222,16 @@ func backfillStateChanges(db *sql.DB, now time.Time, ids map[string]int) error {
reason string
at time.Time
}
// Timed to line up with the history dips (Nextcloud ~5h, Immich ~12h, Jellyfin ~18h)
// and the still-down Auth Portal (~8h), so detail panels read coherently.
changes := []sc{
{"Nextcloud", "UP", "DOWN", "read timeout", now.Add(-3 * 24 * time.Hour).Add(-5 * time.Minute)},
{"Nextcloud", "DOWN", "UP", "", now.Add(-3 * 24 * time.Hour)},
{"Jellyfin", "UP", "DOWN", "connection reset", now.Add(-18 * time.Hour).Add(-3 * time.Minute)},
{"Jellyfin", "DOWN", "UP", "", now.Add(-18 * time.Hour)},
{"Auth Portal", "UP", "DOWN", "connection refused", now.Add(-8 * time.Hour)},
{"Nextcloud", "UP", "DOWN", "read timeout", now.Add(-5 * time.Hour).Add(-8 * time.Minute)},
{"Nextcloud", "DOWN", "UP", "", now.Add(-5 * time.Hour)},
{"Auth Portal", "UP", "DOWN", "no such host", now.Add(-8 * time.Hour)},
{"Immich", "UP", "DOWN", "502 Bad Gateway", now.Add(-12 * time.Hour).Add(-8 * time.Minute)},
{"Immich", "DOWN", "UP", "", now.Add(-12 * time.Hour)},
{"Jellyfin", "UP", "DOWN", "connection reset", now.Add(-18 * time.Hour).Add(-5 * time.Minute)},
{"Jellyfin", "DOWN", "UP", "", now.Add(-18 * time.Hour)},
}
tx, err := db.Begin()
@@ -191,25 +260,35 @@ func backfillStateChanges(db *sql.DB, now time.Time, ids map[string]int) error {
func backfillLogs(db *sql.DB, now time.Time) error {
type logEntry struct {
msg string
at time.Time
text string
at time.Time
}
ago := func(h, m, s int) time.Time {
return now.Add(-(time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(s)*time.Second))
}
// Ordered newest-first. The bracket time is derived from `at` (not hardcoded), so the
// Logs view — which renders the leading [HH:MM] — reads chronologically. Outage times
// line up with the state changes and history dips above.
logs := []logEntry{
{"[06:12] Monitor 'Auth Portal' confirmed DOWN: connection refused", now.Add(-8 * time.Hour)},
{"[06:12] Monitor 'Auth Portal' failed check 2/2", now.Add(-8*time.Hour - 30*time.Second)},
{"[06:11] Monitor 'Auth Portal' failed check 1/2", now.Add(-8*time.Hour - 60*time.Second)},
{"[12:33] Monitor 'Immich' recovered (was down 8m)", now.Add(-12 * time.Hour)},
{"[12:25] Monitor 'Immich' confirmed DOWN: 502 Bad Gateway", now.Add(-12*time.Hour - 8*time.Minute)},
{"[12:25] Monitor 'Immich' failed check 3/3", now.Add(-12*time.Hour - 8*time.Minute - 30*time.Second)},
{"[12:25] Monitor 'Immich' failed check 2/3", now.Add(-12*time.Hour - 8*time.Minute - 60*time.Second)},
{"[12:24] Monitor 'Immich' failed check 1/3", now.Add(-12*time.Hour - 9*time.Minute)},
{"[06:14] Monitor 'Jellyfin' recovered (was down 3m)", now.Add(-18 * time.Hour)},
{"[06:11] Monitor 'Jellyfin' confirmed DOWN: connection reset", now.Add(-18*time.Hour - 3*time.Minute)},
{"[06:11] Monitor 'Jellyfin' failed check 2/2", now.Add(-18*time.Hour - 3*time.Minute - 30*time.Second)},
{"[06:10] Monitor 'Jellyfin' failed check 1/2", now.Add(-18*time.Hour - 4*time.Minute)},
{"[23:45] SSL certificate for 'Personal Blog' expires in 42 days", now.Add(-28 * time.Hour)},
{"[08:00] Loaded check history from database", now.Add(-32*time.Hour - 30*time.Minute)},
{"[08:00] Engine RESUMED (Active)", now.Add(-32*time.Hour - 30*time.Minute - 5*time.Second)},
{"Monitor 'Nextcloud' recovered (was down 8m)", ago(5, 0, 0)},
{"Monitor 'Nextcloud' confirmed DOWN: read timeout", ago(5, 8, 0)},
{"Monitor 'Nextcloud' failed check 2/2", ago(5, 8, 30)},
{"Monitor 'Nextcloud' failed check 1/2", ago(5, 9, 0)},
{"Monitor 'Auth Portal' confirmed DOWN: no such host", ago(8, 0, 0)},
{"Monitor 'Auth Portal' failed check 2/2", ago(8, 0, 30)},
{"Monitor 'Auth Portal' failed check 1/2", ago(8, 1, 0)},
{"Monitor 'Immich' recovered (was down 8m)", ago(12, 0, 0)},
{"Monitor 'Immich' confirmed DOWN: 502 Bad Gateway", ago(12, 8, 0)},
{"Monitor 'Immich' failed check 3/3", ago(12, 8, 30)},
{"Monitor 'Immich' failed check 2/3", ago(12, 9, 0)},
{"Monitor 'Immich' failed check 1/3", ago(12, 9, 30)},
{"Monitor 'Jellyfin' recovered (was down 5m)", ago(18, 0, 0)},
{"Monitor 'Jellyfin' confirmed DOWN: connection reset", ago(18, 5, 0)},
{"Monitor 'Jellyfin' failed check 2/2", ago(18, 5, 30)},
{"Monitor 'Jellyfin' failed check 1/2", ago(18, 6, 0)},
{"SSL warning: certificate for 'Personal Blog' expires in 9 days", ago(20, 0, 0)},
{"Engine RESUMED (Active)", ago(22, 0, 0)},
{"Loaded check history from database", ago(22, 0, 5)},
}
tx, err := db.Begin()
@@ -225,7 +304,10 @@ func backfillLogs(db *sql.DB, now time.Time) error {
defer stmt.Close()
for _, l := range logs {
if _, err := stmt.Exec(l.msg, l.at.Format("2006-01-02 15:04:05")); err != nil {
// Bracket in local time to match the engine's live AddLog timestamps;
// created_at stays UTC to match the store's CURRENT_TIMESTAMP ordering.
msg := "[" + l.at.Local().Format("15:04") + "] " + l.text
if _, err := stmt.Exec(msg, l.at.Format("2006-01-02 15:04:05")); err != nil {
return err
}
}
@@ -233,11 +315,23 @@ func backfillLogs(db *sql.DB, now time.Time) error {
}
func backfillNodes(db *sql.DB, now time.Time) error {
_, err := db.Exec(
"INSERT OR REPLACE INTO nodes (id, name, region, last_seen, version) VALUES (?, ?, ?, ?, ?)",
"node-1", "leader", "us-east", now.Format("2006-01-02 15:04:05"), "2026.05.1",
)
return err
// Multiple regions to show distributed probes. All seen "now" so they read ONLINE
// for the whole capture window (kept under the 60s freshness threshold by the tape).
nodes := []struct{ id, name, region string }{
{"node-use1", "leader", "us-east"},
{"node-euw1", "probe-eu", "eu-west"},
{"node-apse1", "probe-ap", "ap-southeast"},
}
ts := now.Format("2006-01-02 15:04:05")
for _, n := range nodes {
if _, err := db.Exec(
"INSERT OR REPLACE INTO nodes (id, name, region, last_seen, version) VALUES (?, ?, ?, ?, ?)",
n.id, n.name, n.region, ts, "2026.05.1",
); err != nil {
return err
}
}
return nil
}
func backfillMaintenance(db *sql.DB, now time.Time, ids map[string]int) error {