feat(tui): add SLA reporting view
Full-screen SLA report accessible via [s] from detail panel. Computes uptime%, downtime, outage count, longest outage, MTTR, and MTBF from state_changes table. Includes daily breakdown with bar chart, switchable time periods (24h/7d/30d/90d), and scrollable viewport. LATE/STALE treated as UP for SLA purposes.
This commit is contained in:
@@ -823,3 +823,11 @@ func (e *Engine) GetStateChanges(siteID int, limit int) []models.StateChange {
|
||||
}
|
||||
return changes
|
||||
}
|
||||
|
||||
func (e *Engine) GetStateChangesSince(siteID int, since time.Time) []models.StateChange {
|
||||
changes, err := e.db.GetStateChangesSince(siteID, since)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return changes
|
||||
}
|
||||
|
||||
@@ -80,7 +80,10 @@ func (m *mockStore) GetPreference(string) (string, error) { re
|
||||
// SetPreference is a no-op stub: both arguments are ignored and nil is returned.
func (m *mockStore) SetPreference(string, string) error                { return nil }
|
||||
// SaveStateChange is a no-op stub: nothing is recorded and nil is returned.
func (m *mockStore) SaveStateChange(int, string, string, string) error { return nil }
|
||||
// GetStateChanges is a stub that always reports no state changes and no error.
func (m *mockStore) GetStateChanges(int, int) ([]models.StateChange, error) { return nil, nil }
|
||||
// Close is a no-op stub that always returns nil.
func (m *mockStore) Close() error { return nil }
|
||||
func (m *mockStore) GetStateChangesSince(int, time.Time) ([]models.StateChange, error) {
|
||||
return nil, nil
|
||||
}
|
||||
// Close is a no-op stub that always returns nil.
func (m *mockStore) Close() error { return nil }
|
||||
|
||||
func (m *mockStore) GetAllAlerts() ([]models.AlertConfig, error) {
|
||||
m.mu.Lock()
|
||||
|
||||
@@ -0,0 +1,225 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"gitea.lerkolabs.com/lerkolabs/uptop/internal/models"
|
||||
)
|
||||
|
||||
// SLAReport is the availability summary produced by ComputeSLA for one
// reporting window.
type SLAReport struct {
	// Window is the length of the reporting window the figures cover.
	Window time.Duration
	// UptimePct is the share of the window spent in a non-down status,
	// expressed as a percentage (0-100).
	UptimePct float64
	// Downtime is the total time spent in a down status within the window.
	Downtime time.Duration
	// OutageCount is the number of distinct outages that touched the window.
	OutageCount int
	// LongestOut is the duration of the longest single outage.
	LongestOut time.Duration
	// MTTR is the mean outage duration (mean time to recovery).
	MTTR time.Duration
	// MTBF is the in-window uptime divided by OutageCount (mean time
	// between failures); zero when there were no outages.
	MTBF time.Duration
}
|
||||
|
||||
func ComputeSLA(changes []models.StateChange, currentStatus string, window time.Duration) SLAReport {
|
||||
now := time.Now()
|
||||
windowStart := now.Add(-window)
|
||||
|
||||
report := SLAReport{Window: window}
|
||||
|
||||
if len(changes) == 0 {
|
||||
if isDown(currentStatus) {
|
||||
report.UptimePct = 0
|
||||
report.Downtime = window
|
||||
} else {
|
||||
report.UptimePct = 100
|
||||
}
|
||||
return report
|
||||
}
|
||||
|
||||
// Sort changes chronologically (they come in DESC from DB).
|
||||
sorted := make([]models.StateChange, len(changes))
|
||||
copy(sorted, changes)
|
||||
for i, j := 0, len(sorted)-1; i < j; i, j = i+1, j-1 {
|
||||
sorted[i], sorted[j] = sorted[j], sorted[i]
|
||||
}
|
||||
|
||||
// Determine status at window start: last transition before or at windowStart.
|
||||
statusAtStart := "UP"
|
||||
for i := len(sorted) - 1; i >= 0; i-- {
|
||||
if !sorted[i].ChangedAt.After(windowStart) {
|
||||
statusAtStart = sorted[i].ToStatus
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
var upTime, downTime time.Duration
|
||||
var outages []time.Duration
|
||||
cursor := windowStart
|
||||
wasDown := isDown(statusAtStart)
|
||||
|
||||
if wasDown {
|
||||
report.OutageCount = 1
|
||||
}
|
||||
|
||||
var outageStart time.Time
|
||||
if wasDown {
|
||||
outageStart = windowStart
|
||||
}
|
||||
|
||||
for _, sc := range sorted {
|
||||
if sc.ChangedAt.Before(windowStart) {
|
||||
continue
|
||||
}
|
||||
if sc.ChangedAt.After(now) {
|
||||
break
|
||||
}
|
||||
|
||||
seg := sc.ChangedAt.Sub(cursor)
|
||||
if wasDown {
|
||||
downTime += seg
|
||||
} else {
|
||||
upTime += seg
|
||||
}
|
||||
|
||||
newDown := isDown(sc.ToStatus)
|
||||
if !wasDown && newDown {
|
||||
report.OutageCount++
|
||||
outageStart = sc.ChangedAt
|
||||
}
|
||||
if wasDown && !newDown {
|
||||
dur := sc.ChangedAt.Sub(outageStart)
|
||||
outages = append(outages, dur)
|
||||
}
|
||||
|
||||
wasDown = newDown
|
||||
cursor = sc.ChangedAt
|
||||
}
|
||||
|
||||
// Account for time from last change to now.
|
||||
remaining := now.Sub(cursor)
|
||||
if wasDown {
|
||||
downTime += remaining
|
||||
dur := now.Sub(outageStart)
|
||||
outages = append(outages, dur)
|
||||
} else {
|
||||
upTime += remaining
|
||||
}
|
||||
|
||||
total := upTime + downTime
|
||||
if total > 0 {
|
||||
report.UptimePct = float64(upTime) / float64(total) * 100
|
||||
} else {
|
||||
report.UptimePct = 100
|
||||
}
|
||||
report.Downtime = downTime
|
||||
|
||||
if len(outages) > 0 {
|
||||
var totalOutage time.Duration
|
||||
for _, d := range outages {
|
||||
totalOutage += d
|
||||
if d > report.LongestOut {
|
||||
report.LongestOut = d
|
||||
}
|
||||
}
|
||||
report.MTTR = totalOutage / time.Duration(len(outages))
|
||||
}
|
||||
|
||||
if report.OutageCount > 0 && upTime > 0 {
|
||||
report.MTBF = upTime / time.Duration(report.OutageCount)
|
||||
}
|
||||
|
||||
return report
|
||||
}
|
||||
|
||||
func ComputeDailyBreakdown(changes []models.StateChange, currentStatus string, days int) []DayReport {
|
||||
now := time.Now()
|
||||
reports := make([]DayReport, days)
|
||||
|
||||
for i := 0; i < days; i++ {
|
||||
dayEnd := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location()).Add(-time.Duration(i) * 24 * time.Hour)
|
||||
if i == 0 {
|
||||
dayEnd = now
|
||||
}
|
||||
dayStart := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location()).Add(-time.Duration(i) * 24 * time.Hour)
|
||||
if i > 0 {
|
||||
dayEnd = dayStart.Add(24 * time.Hour)
|
||||
}
|
||||
|
||||
windowChanges := filterChangesForWindow(changes, dayStart, dayEnd)
|
||||
|
||||
statusAtStart := inferStatusAt(changes, dayStart)
|
||||
sla := computeSLAForWindow(windowChanges, statusAtStart, dayStart, dayEnd)
|
||||
|
||||
reports[i] = DayReport{
|
||||
Date: dayStart,
|
||||
UptimePct: sla,
|
||||
}
|
||||
}
|
||||
|
||||
return reports
|
||||
}
|
||||
|
||||
// DayReport is one entry of the per-day breakdown returned by
// ComputeDailyBreakdown.
type DayReport struct {
	// Date is the local midnight that starts the day this entry covers.
	Date time.Time
	// UptimePct is the uptime percentage (0-100) for that day.
	UptimePct float64
}
|
||||
|
||||
// isDown reports whether status counts as an outage for SLA purposes.
// Only "DOWN" and "SSL EXP" do; every other status is treated as up.
func isDown(status string) bool {
	switch status {
	case "DOWN", "SSL EXP":
		return true
	default:
		return false
	}
}
|
||||
|
||||
func filterChangesForWindow(changes []models.StateChange, start, end time.Time) []models.StateChange {
|
||||
var filtered []models.StateChange
|
||||
for _, sc := range changes {
|
||||
if !sc.ChangedAt.Before(start) && sc.ChangedAt.Before(end) {
|
||||
filtered = append(filtered, sc)
|
||||
}
|
||||
}
|
||||
return filtered
|
||||
}
|
||||
|
||||
func inferStatusAt(changes []models.StateChange, at time.Time) string {
|
||||
// Changes come DESC from DB. Walk backwards to find last change before `at`.
|
||||
for _, sc := range changes {
|
||||
if !sc.ChangedAt.After(at) {
|
||||
return sc.ToStatus
|
||||
}
|
||||
}
|
||||
return "UP"
|
||||
}
|
||||
|
||||
func computeSLAForWindow(changes []models.StateChange, statusAtStart string, start, end time.Time) float64 {
|
||||
// Sort chronologically.
|
||||
sorted := make([]models.StateChange, len(changes))
|
||||
copy(sorted, changes)
|
||||
for i, j := 0, len(sorted)-1; i < j; i, j = i+1, j-1 {
|
||||
sorted[i], sorted[j] = sorted[j], sorted[i]
|
||||
}
|
||||
|
||||
var upTime, downTime time.Duration
|
||||
cursor := start
|
||||
wasDown := isDown(statusAtStart)
|
||||
|
||||
for _, sc := range sorted {
|
||||
if sc.ChangedAt.Before(start) || !sc.ChangedAt.Before(end) {
|
||||
continue
|
||||
}
|
||||
seg := sc.ChangedAt.Sub(cursor)
|
||||
if wasDown {
|
||||
downTime += seg
|
||||
} else {
|
||||
upTime += seg
|
||||
}
|
||||
wasDown = isDown(sc.ToStatus)
|
||||
cursor = sc.ChangedAt
|
||||
}
|
||||
|
||||
remaining := end.Sub(cursor)
|
||||
if wasDown {
|
||||
downTime += remaining
|
||||
} else {
|
||||
upTime += remaining
|
||||
}
|
||||
|
||||
total := upTime + downTime
|
||||
if total <= 0 {
|
||||
return 100
|
||||
}
|
||||
return float64(upTime) / float64(total) * 100
|
||||
}
|
||||
@@ -0,0 +1,165 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.lerkolabs.com/lerkolabs/uptop/internal/models"
|
||||
)
|
||||
|
||||
func TestComputeSLA_NoChanges_CurrentlyUp(t *testing.T) {
|
||||
r := ComputeSLA(nil, "UP", 24*time.Hour)
|
||||
if r.UptimePct != 100 {
|
||||
t.Errorf("expected 100%% uptime, got %.2f%%", r.UptimePct)
|
||||
}
|
||||
if r.Downtime != 0 {
|
||||
t.Errorf("expected 0 downtime, got %v", r.Downtime)
|
||||
}
|
||||
}
|
||||
|
||||
func TestComputeSLA_NoChanges_CurrentlyDown(t *testing.T) {
|
||||
r := ComputeSLA(nil, "DOWN", 24*time.Hour)
|
||||
if r.UptimePct != 0 {
|
||||
t.Errorf("expected 0%% uptime, got %.2f%%", r.UptimePct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestComputeSLA_SingleOutage(t *testing.T) {
|
||||
now := time.Now()
|
||||
// DOWN 2 hours ago, recovered 1 hour ago → 1 hour downtime in 24h window
|
||||
changes := []models.StateChange{
|
||||
{ToStatus: "UP", ChangedAt: now.Add(-1 * time.Hour)},
|
||||
{ToStatus: "DOWN", FromStatus: "UP", ChangedAt: now.Add(-2 * time.Hour)},
|
||||
}
|
||||
|
||||
r := ComputeSLA(changes, "UP", 24*time.Hour)
|
||||
|
||||
if r.OutageCount != 1 {
|
||||
t.Errorf("expected 1 outage, got %d", r.OutageCount)
|
||||
}
|
||||
|
||||
expectedDowntime := 1 * time.Hour
|
||||
if absDuration(r.Downtime-expectedDowntime) > time.Minute {
|
||||
t.Errorf("expected ~1h downtime, got %v", r.Downtime)
|
||||
}
|
||||
|
||||
expectedPct := float64(23) / float64(24) * 100
|
||||
if math.Abs(r.UptimePct-expectedPct) > 0.5 {
|
||||
t.Errorf("expected ~%.1f%% uptime, got %.2f%%", expectedPct, r.UptimePct)
|
||||
}
|
||||
|
||||
if r.LongestOut < 55*time.Minute || r.LongestOut > 65*time.Minute {
|
||||
t.Errorf("expected longest outage ~1h, got %v", r.LongestOut)
|
||||
}
|
||||
}
|
||||
|
||||
func TestComputeSLA_CurrentlyDown(t *testing.T) {
|
||||
now := time.Now()
|
||||
// Went down 3 hours ago, still down
|
||||
changes := []models.StateChange{
|
||||
{ToStatus: "DOWN", FromStatus: "UP", ChangedAt: now.Add(-3 * time.Hour)},
|
||||
}
|
||||
|
||||
r := ComputeSLA(changes, "DOWN", 24*time.Hour)
|
||||
|
||||
if r.OutageCount != 1 {
|
||||
t.Errorf("expected 1 outage, got %d", r.OutageCount)
|
||||
}
|
||||
|
||||
expectedDowntime := 3 * time.Hour
|
||||
if absDuration(r.Downtime-expectedDowntime) > time.Minute {
|
||||
t.Errorf("expected ~3h downtime, got %v", r.Downtime)
|
||||
}
|
||||
}
|
||||
|
||||
func TestComputeSLA_MultipleOutages(t *testing.T) {
|
||||
now := time.Now()
|
||||
// Two outages: 6h-5h ago and 2h-1h ago
|
||||
changes := []models.StateChange{
|
||||
{ToStatus: "UP", ChangedAt: now.Add(-1 * time.Hour)},
|
||||
{ToStatus: "DOWN", FromStatus: "UP", ChangedAt: now.Add(-2 * time.Hour)},
|
||||
{ToStatus: "UP", ChangedAt: now.Add(-5 * time.Hour)},
|
||||
{ToStatus: "DOWN", FromStatus: "UP", ChangedAt: now.Add(-6 * time.Hour)},
|
||||
}
|
||||
|
||||
r := ComputeSLA(changes, "UP", 24*time.Hour)
|
||||
|
||||
if r.OutageCount != 2 {
|
||||
t.Errorf("expected 2 outages, got %d", r.OutageCount)
|
||||
}
|
||||
|
||||
expectedDowntime := 2 * time.Hour
|
||||
if absDuration(r.Downtime-expectedDowntime) > time.Minute {
|
||||
t.Errorf("expected ~2h downtime, got %v", r.Downtime)
|
||||
}
|
||||
|
||||
if r.MTTR < 55*time.Minute || r.MTTR > 65*time.Minute {
|
||||
t.Errorf("expected MTTR ~1h, got %v", r.MTTR)
|
||||
}
|
||||
}
|
||||
|
||||
func TestComputeSLA_LateNotDown(t *testing.T) {
|
||||
now := time.Now()
|
||||
// LATE for 2 hours is not downtime
|
||||
changes := []models.StateChange{
|
||||
{ToStatus: "UP", ChangedAt: now.Add(-1 * time.Hour)},
|
||||
{ToStatus: "LATE", FromStatus: "UP", ChangedAt: now.Add(-3 * time.Hour)},
|
||||
}
|
||||
|
||||
r := ComputeSLA(changes, "UP", 24*time.Hour)
|
||||
|
||||
if r.OutageCount != 0 {
|
||||
t.Errorf("expected 0 outages for LATE, got %d", r.OutageCount)
|
||||
}
|
||||
if r.UptimePct != 100 {
|
||||
t.Errorf("expected 100%% uptime (LATE is not down), got %.2f%%", r.UptimePct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestComputeDailyBreakdown(t *testing.T) {
|
||||
now := time.Now()
|
||||
changes := []models.StateChange{
|
||||
{ToStatus: "UP", ChangedAt: now.Add(-1 * time.Hour)},
|
||||
{ToStatus: "DOWN", FromStatus: "UP", ChangedAt: now.Add(-2 * time.Hour)},
|
||||
}
|
||||
|
||||
days := ComputeDailyBreakdown(changes, "UP", 7)
|
||||
|
||||
if len(days) != 7 {
|
||||
t.Fatalf("expected 7 days, got %d", len(days))
|
||||
}
|
||||
|
||||
// Today should have less than 100% uptime
|
||||
if days[0].UptimePct >= 100 {
|
||||
t.Errorf("expected today < 100%%, got %.2f%%", days[0].UptimePct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsDown(t *testing.T) {
|
||||
if !isDown("DOWN") {
|
||||
t.Error("DOWN should be down")
|
||||
}
|
||||
if !isDown("SSL EXP") {
|
||||
t.Error("SSL EXP should be down")
|
||||
}
|
||||
if isDown("UP") {
|
||||
t.Error("UP should not be down")
|
||||
}
|
||||
if isDown("LATE") {
|
||||
t.Error("LATE should not be down")
|
||||
}
|
||||
if isDown("STALE") {
|
||||
t.Error("STALE should not be down")
|
||||
}
|
||||
if isDown("PENDING") {
|
||||
t.Error("PENDING should not be down")
|
||||
}
|
||||
}
|
||||
|
||||
// absDuration returns the magnitude of d: d itself when it is non-negative,
// otherwise its negation.
func absDuration(d time.Duration) time.Duration {
	if d >= 0 {
		return d
	}
	return -d
}
|
||||
Reference in New Issue
Block a user