feat(cluster): add probe execution mode, check extraction, and result aggregation
Phase 2 of distributed probing:
- Extract check logic into a standalone RunCheck() for use by probes
- Add probe cluster mode: stateless nodes that fetch assignments, execute checks, and report results to the leader
- Add multi-node result aggregation with a configurable strategy (any-down, majority-down, all-down)
- Leader ingests probe results into the engine's live state and triggers alerts
- New env vars: UPKEEP_NODE_ID, UPKEEP_NODE_NAME, UPKEEP_NODE_REGION, UPKEEP_AGG_STRATEGY
- Example docker-compose.probe.yml with a leader and 2 regional probes
This commit is contained in:
@@ -0,0 +1,44 @@
|
||||
package monitor
|
||||
|
||||
import "time"
|
||||
|
||||
// AggregationStrategy selects how AggregateStatus combines the per-node
// results of a check into a single up/down verdict.
type AggregationStrategy string

// Supported aggregation strategies. Each name states the condition under
// which the aggregated status is reported as down.
const (
	// AggAnyDown reports down as soon as at least one node reports down.
	AggAnyDown AggregationStrategy = "any-down"
	// AggMajorityDown reports down only when more than half of the nodes
	// report down; an even split is not a majority and stays up.
	AggMajorityDown AggregationStrategy = "majority-down"
	// AggAllDown reports down only when every node reports down.
	AggAllDown AggregationStrategy = "all-down"
)

// NodeResult is the outcome of a single check as reported by one node.
type NodeResult struct {
	// NodeID presumably identifies the node that produced the result; it is
	// not consulted by AggregateStatus.
	NodeID string
	// IsUp is the node's own up/down verdict.
	IsUp bool
	// LatencyNs is the latency measured by the node (nanoseconds, judging by
	// the field name).
	LatencyNs int64
	// CheckedAt presumably records when the node ran the check; it is not
	// consulted by AggregateStatus.
	CheckedAt time.Time
}

// AggregateStatus folds the results reported by several nodes into one
// verdict according to strategy.
//
// It returns:
//   - isUp: the combined status. An empty results slice yields true, so the
//     absence of data never reports a target as down.
//   - avgLatencyNs: the integer mean of LatencyNs over all results, including
//     those of nodes that reported down (0 for an empty slice).
//
// Any strategy other than AggMajorityDown or AggAllDown, including the empty
// string, is treated as AggAnyDown.
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
	n := len(results)
	if n == 0 {
		return true, 0
	}

	upCount := 0
	var totalLatency int64
	for _, r := range results {
		if r.IsUp {
			upCount++
		}
		totalLatency += r.LatencyNs
	}
	avgLatencyNs = totalLatency / int64(n)
	downCount := n - upCount

	switch strategy {
	case AggMajorityDown:
		// Down requires a strict majority of down reports. Comparing
		// 2*downCount against n avoids the integer-division rounding that
		// would turn an even split into a down verdict.
		isUp = downCount*2 <= n
	case AggAllDown:
		isUp = upCount > 0
	default:
		// AggAnyDown and unrecognized strategies: a single down report is
		// enough to mark the target as down.
		isUp = downCount == 0
	}
	return isUp, avgLatencyNs
}
|
||||
Reference in New Issue
Block a user