ca5a42314f
Phase 2 of distributed probing: - Extract check logic into standalone RunCheck() for use by probes - Add probe cluster mode: stateless nodes that fetch assignments, execute checks, and report results to the leader - Add multi-node result aggregation with configurable strategy (any-down, majority-down, all-down) - Leader ingests probe results into engine live state and triggers alerts - New env vars: UPKEEP_NODE_ID, UPKEEP_NODE_NAME, UPKEEP_NODE_REGION, UPKEEP_AGG_STRATEGY - Example docker-compose.probe.yml with leader + 2 regional probes
45 lines
837 B
Go
45 lines
837 B
Go
package monitor
|
|
|
|
import "time"
|
|
|
|
// AggregationStrategy names the rule AggregateStatus uses to collapse the
// results reported by several nodes into a single up/down status.
type AggregationStrategy string

// Supported aggregation strategies. The string values are the externally
// visible spellings of each strategy.
const (
	// AggAnyDown reports the target as down as soon as any node reports it
	// down. AggregateStatus also applies this rule to unrecognized values.
	AggAnyDown AggregationStrategy = "any-down"
	// AggMajorityDown selects majority-based aggregation; see AggregateStatus
	// for the exact rule.
	AggMajorityDown AggregationStrategy = "majority-down"
	// AggAllDown reports the target as down only when every node reports it
	// down.
	AggAllDown AggregationStrategy = "all-down"
)
|
|
|
|
// NodeResult is the outcome of one check as reported by a single node. A
// slice of these is what AggregateStatus combines into one status.
type NodeResult struct {
	// NodeID identifies the node that produced this result.
	NodeID string
	// IsUp is true when the node considered the target up.
	IsUp bool
	// LatencyNs is the latency the node measured; the name indicates
	// nanoseconds (NOTE(review): confirm the unit against the producer).
	LatencyNs int64
	// CheckedAt is the timestamp attached to the result, presumably when the
	// node ran the check.
	CheckedAt time.Time
}
|
|
|
|
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
|
|
if len(results) == 0 {
|
|
return true, 0
|
|
}
|
|
|
|
upCount := 0
|
|
var totalLatency int64
|
|
for _, r := range results {
|
|
if r.IsUp {
|
|
upCount++
|
|
}
|
|
totalLatency += r.LatencyNs
|
|
}
|
|
avgLatencyNs = totalLatency / int64(len(results))
|
|
|
|
switch strategy {
|
|
case AggMajorityDown:
|
|
isUp = upCount > len(results)/2
|
|
case AggAllDown:
|
|
isUp = upCount > 0
|
|
default:
|
|
isUp = upCount == len(results)
|
|
}
|
|
return
|
|
}
|