438 lines
13 KiB
Go
438 lines
13 KiB
Go
package mesh
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Values reported in PeerRecoveryPlan.Mode.
const (
	// PeerRecoveryModeSteady is reported when the ready-peer deficit is zero.
	PeerRecoveryModeSteady = "steady"
	// PeerRecoveryModeRecovery is reported when fewer peers are ready than
	// the effective target.
	PeerRecoveryModeRecovery = "recovery"
)
|
|
|
|
// Fallbacks applied by PlanPeerRecovery when the corresponding config field
// is zero or negative.
const (
	// DefaultStablePeerTarget replaces a non-positive TargetReadyPeers.
	DefaultStablePeerTarget = 3
	// DefaultRecoveryProbeLimit replaces a non-positive MaxProbeCandidates.
	DefaultRecoveryProbeLimit = 6
)
|
|
|
|
type PeerRecoveryPlanConfig struct {
|
|
PeerCache PeerCacheSnapshot
|
|
Connections PeerConnectionSnapshot
|
|
TargetReadyPeers int
|
|
MaxProbeCandidates int
|
|
PreferredRegion string
|
|
Now time.Time
|
|
}
|
|
|
|
type PeerRecoveryPlan struct {
|
|
Mode string `json:"mode"`
|
|
Healthy bool `json:"healthy"`
|
|
TargetReadyPeers int `json:"target_ready_peers"`
|
|
ReadyPeerCount int `json:"ready_peer_count"`
|
|
DegradedPeerCount int `json:"degraded_peer_count"`
|
|
BackoffPeerCount int `json:"backoff_peer_count"`
|
|
ConnectablePeerCount int `json:"connectable_peer_count"`
|
|
Deficit int `json:"deficit"`
|
|
ProbeCandidateCount int `json:"probe_candidate_count"`
|
|
RecoverySeedCandidateCount int `json:"recovery_seed_candidate_count"`
|
|
GeneratedAt time.Time `json:"generated_at"`
|
|
Candidates []PeerRecoveryCandidate `json:"candidates,omitempty"`
|
|
}
|
|
|
|
// PeerRecoveryCandidate describes one peer selected by PlanPeerRecovery,
// combining its peer-cache entry with its recorded connection state.
type PeerRecoveryCandidate struct {
	// NodeID identifies the peer.
	NodeID string `json:"node_id"`
	// Endpoint is the cache entry's endpoint with surrounding whitespace
	// removed.
	Endpoint string `json:"endpoint,omitempty"`
	// Region is the cache entry's best region with surrounding whitespace
	// removed.
	Region string `json:"region,omitempty"`
	// Warm is copied from the cache entry.
	Warm bool `json:"warm"`
	// WarmReason is copied from the cache entry.
	WarmReason string `json:"warm_reason,omitempty"`
	// RecoverySeed is copied from the cache entry.
	RecoverySeed bool `json:"recovery_seed"`
	// BestCandidateID is copied from the cache entry.
	BestCandidateID string `json:"best_candidate_id,omitempty"`
	// BestTransport is copied from the cache entry.
	BestTransport string `json:"best_transport,omitempty"`
	// ConnectionState is the recorded connection state; PlanPeerRecovery
	// substitutes PeerConnectionDisconnected when none is recorded.
	ConnectionState string `json:"connection_state"`
	// ConsecutiveFailures is copied from the connection state.
	ConsecutiveFailures int `json:"consecutive_failures,omitempty"`
	// LastLatencyMs is copied from the connection state.
	LastLatencyMs int `json:"last_latency_ms,omitempty"`
	// BackoffUntil is copied from the connection state.
	//
	// NOTE: encoding/json's omitempty option does not treat a zero struct
	// value as empty, so a zero time is still serialized for this field.
	BackoffUntil time.Time `json:"backoff_until,omitempty"`
	// Reason explains why the peer was selected: "maintain_ready",
	// "recover_external_area", "recover_degraded", "recover_warm",
	// "recover_seed" or "recover_peer".
	Reason string `json:"reason"`
	// Priority is the ranking score; higher values sort first.
	Priority int `json:"priority"`
}
|
|
|
|
type peerRecoveryCandidateBuild struct {
|
|
PeerRecoveryCandidate
|
|
PublicIngressCount int
|
|
}
|
|
|
|
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
|
|
now := normalizedNow(cfg.Now)
|
|
target := cfg.TargetReadyPeers
|
|
if target <= 0 {
|
|
target = DefaultStablePeerTarget
|
|
}
|
|
limit := cfg.MaxProbeCandidates
|
|
if limit <= 0 {
|
|
limit = DefaultRecoveryProbeLimit
|
|
}
|
|
connectable := connectablePeerCount(cfg.PeerCache)
|
|
if target > connectable {
|
|
target = connectable
|
|
}
|
|
if limit < target {
|
|
limit = target
|
|
}
|
|
|
|
connectionByNode := map[string]PeerConnectionState{}
|
|
for _, connection := range cfg.Connections.Entries {
|
|
if strings.TrimSpace(connection.NodeID) == "" {
|
|
continue
|
|
}
|
|
connectionByNode[connection.NodeID] = connection
|
|
}
|
|
|
|
entryByNode := map[string]PeerCacheEntry{}
|
|
for _, entry := range cfg.PeerCache.Entries {
|
|
if strings.TrimSpace(entry.NodeID) == "" {
|
|
continue
|
|
}
|
|
entryByNode[entry.NodeID] = entry
|
|
}
|
|
|
|
ready := 0
|
|
degraded := 0
|
|
backoff := 0
|
|
readyExternalRegions := map[string]struct{}{}
|
|
for nodeID, connection := range connectionByNode {
|
|
entry, ok := entryByNode[nodeID]
|
|
if !ok || strings.TrimSpace(entry.Endpoint) == "" {
|
|
continue
|
|
}
|
|
switch connection.State {
|
|
case PeerConnectionReady:
|
|
ready++
|
|
region := strings.TrimSpace(entry.BestRegion)
|
|
if region != "" && (strings.TrimSpace(cfg.PreferredRegion) == "" || !strings.EqualFold(region, cfg.PreferredRegion)) {
|
|
readyExternalRegions[strings.ToLower(region)] = struct{}{}
|
|
}
|
|
case PeerConnectionRelayReady:
|
|
// Relay-ready peers remain valuable for control-plane reachability,
|
|
// but they do not satisfy the target for direct-ready transport paths.
|
|
case PeerConnectionDegraded:
|
|
degraded++
|
|
case PeerConnectionBackoff:
|
|
backoff++
|
|
}
|
|
}
|
|
|
|
deficit := target - ready
|
|
if deficit < 0 {
|
|
deficit = 0
|
|
}
|
|
mode := PeerRecoveryModeSteady
|
|
if deficit > 0 {
|
|
mode = PeerRecoveryModeRecovery
|
|
}
|
|
if mode == PeerRecoveryModeSteady {
|
|
limit = target
|
|
}
|
|
missingExternalRegions := missingPeerRecoveryExternalRegions(cfg.PeerCache, cfg.PreferredRegion, readyExternalRegions, target)
|
|
|
|
candidates := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
|
|
for _, entry := range cfg.PeerCache.Entries {
|
|
if strings.TrimSpace(entry.NodeID) == "" || strings.TrimSpace(entry.Endpoint) == "" {
|
|
continue
|
|
}
|
|
connection := connectionByNode[entry.NodeID]
|
|
if connection.State == "" {
|
|
connection.State = PeerConnectionDisconnected
|
|
}
|
|
if connection.State == PeerConnectionBackoff && connection.BackoffUntil.After(now) {
|
|
continue
|
|
}
|
|
reason, ok := peerRecoveryCandidateReason(mode, entry, connection, missingExternalRegions, cfg.PreferredRegion)
|
|
if !ok {
|
|
continue
|
|
}
|
|
candidate := PeerRecoveryCandidate{
|
|
NodeID: entry.NodeID,
|
|
Endpoint: strings.TrimSpace(entry.Endpoint),
|
|
Region: strings.TrimSpace(entry.BestRegion),
|
|
Warm: entry.Warm,
|
|
WarmReason: entry.WarmReason,
|
|
RecoverySeed: entry.RecoverySeed,
|
|
BestCandidateID: entry.BestCandidateID,
|
|
BestTransport: entry.BestTransport,
|
|
ConnectionState: connection.State,
|
|
ConsecutiveFailures: connection.ConsecutiveFailures,
|
|
LastLatencyMs: connection.LastLatencyMs,
|
|
BackoffUntil: connection.BackoffUntil,
|
|
Reason: reason,
|
|
Priority: peerRecoveryCandidatePriority(entry, connection, reason, cfg.PreferredRegion),
|
|
}
|
|
candidates = append(candidates, peerRecoveryCandidateBuild{
|
|
PeerRecoveryCandidate: candidate,
|
|
PublicIngressCount: entry.PublicIngressCount,
|
|
})
|
|
}
|
|
sort.SliceStable(candidates, func(i, j int) bool {
|
|
if candidates[i].Priority != candidates[j].Priority {
|
|
return candidates[i].Priority > candidates[j].Priority
|
|
}
|
|
return candidates[i].NodeID < candidates[j].NodeID
|
|
})
|
|
if len(candidates) > limit {
|
|
candidates = trimPeerRecoveryCandidates(candidates, limit, cfg.PreferredRegion)
|
|
}
|
|
|
|
outCandidates := make([]PeerRecoveryCandidate, 0, len(candidates))
|
|
recoverySeedCandidates := 0
|
|
for _, candidate := range candidates {
|
|
outCandidates = append(outCandidates, candidate.PeerRecoveryCandidate)
|
|
if candidate.RecoverySeed {
|
|
recoverySeedCandidates++
|
|
}
|
|
}
|
|
|
|
return PeerRecoveryPlan{
|
|
Mode: mode,
|
|
Healthy: deficit == 0,
|
|
TargetReadyPeers: target,
|
|
ReadyPeerCount: ready,
|
|
DegradedPeerCount: degraded,
|
|
BackoffPeerCount: backoff,
|
|
ConnectablePeerCount: connectable,
|
|
Deficit: deficit,
|
|
ProbeCandidateCount: len(outCandidates),
|
|
RecoverySeedCandidateCount: recoverySeedCandidates,
|
|
GeneratedAt: now,
|
|
Candidates: outCandidates,
|
|
}
|
|
}
|
|
|
|
func missingPeerRecoveryExternalRegions(snapshot PeerCacheSnapshot, preferredRegion string, readyExternalRegions map[string]struct{}, target int) map[string]struct{} {
|
|
preferredRegion = strings.TrimSpace(preferredRegion)
|
|
availableExternalRegions := map[string]struct{}{}
|
|
for _, entry := range snapshot.Entries {
|
|
region := strings.TrimSpace(entry.BestRegion)
|
|
if region == "" {
|
|
continue
|
|
}
|
|
if preferredRegion != "" && strings.EqualFold(region, preferredRegion) {
|
|
continue
|
|
}
|
|
availableExternalRegions[strings.ToLower(region)] = struct{}{}
|
|
}
|
|
if len(availableExternalRegions) == 0 {
|
|
return nil
|
|
}
|
|
desiredExternal := len(availableExternalRegions)
|
|
if desiredExternal > 2 {
|
|
desiredExternal = 2
|
|
}
|
|
if target > 0 && desiredExternal > target {
|
|
desiredExternal = target
|
|
}
|
|
if len(readyExternalRegions) >= desiredExternal {
|
|
return nil
|
|
}
|
|
missing := map[string]struct{}{}
|
|
for region := range availableExternalRegions {
|
|
if _, ok := readyExternalRegions[region]; ok {
|
|
continue
|
|
}
|
|
missing[region] = struct{}{}
|
|
}
|
|
if len(missing) == 0 {
|
|
return nil
|
|
}
|
|
return missing
|
|
}
|
|
|
|
func trimPeerRecoveryCandidates(candidates []peerRecoveryCandidateBuild, limit int, preferredRegion string) []peerRecoveryCandidateBuild {
|
|
if len(candidates) <= limit || limit <= 0 {
|
|
return candidates
|
|
}
|
|
preferredRegion = strings.TrimSpace(preferredRegion)
|
|
externalRegions := map[string]struct{}{}
|
|
for _, candidate := range candidates {
|
|
region := strings.TrimSpace(candidate.Region)
|
|
if region == "" || (preferredRegion != "" && strings.EqualFold(region, preferredRegion)) {
|
|
continue
|
|
}
|
|
externalRegions[strings.ToLower(region)] = struct{}{}
|
|
}
|
|
if len(externalRegions) < 2 {
|
|
return candidates[:limit]
|
|
}
|
|
selected := make([]peerRecoveryCandidateBuild, 0, limit)
|
|
selectedNodeIDs := map[string]struct{}{}
|
|
selectedRegions := map[string]struct{}{}
|
|
for _, candidate := range candidates {
|
|
if len(selected) >= limit {
|
|
break
|
|
}
|
|
region := strings.TrimSpace(candidate.Region)
|
|
if region == "" || (preferredRegion != "" && strings.EqualFold(region, preferredRegion)) {
|
|
continue
|
|
}
|
|
regionKey := strings.ToLower(region)
|
|
if _, exists := selectedRegions[regionKey]; exists {
|
|
continue
|
|
}
|
|
selected = append(selected, candidate)
|
|
selectedNodeIDs[candidate.NodeID] = struct{}{}
|
|
selectedRegions[regionKey] = struct{}{}
|
|
}
|
|
if len(selected) < limit && !selectedHasPublicIngress(selected) {
|
|
for _, candidate := range candidates {
|
|
if len(selected) >= limit {
|
|
break
|
|
}
|
|
if _, exists := selectedNodeIDs[candidate.NodeID]; exists {
|
|
continue
|
|
}
|
|
if candidatePublicIngressCount(candidate) <= 0 {
|
|
continue
|
|
}
|
|
selected = append(selected, candidate)
|
|
selectedNodeIDs[candidate.NodeID] = struct{}{}
|
|
break
|
|
}
|
|
}
|
|
for _, candidate := range candidates {
|
|
if len(selected) >= limit {
|
|
break
|
|
}
|
|
if _, exists := selectedNodeIDs[candidate.NodeID]; exists {
|
|
continue
|
|
}
|
|
selected = append(selected, candidate)
|
|
selectedNodeIDs[candidate.NodeID] = struct{}{}
|
|
}
|
|
if len(selected) > limit {
|
|
selected = selected[:limit]
|
|
}
|
|
return selected
|
|
}
|
|
|
|
func selectedHasPublicIngress(candidates []peerRecoveryCandidateBuild) bool {
|
|
for _, candidate := range candidates {
|
|
if candidatePublicIngressCount(candidate) > 0 {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func candidatePublicIngressCount(candidate peerRecoveryCandidateBuild) int {
|
|
return candidate.PublicIngressCount
|
|
}
|
|
|
|
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState, missingExternalRegions map[string]struct{}, preferredRegion string) (string, bool) {
|
|
if mode == PeerRecoveryModeSteady {
|
|
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
|
|
return "maintain_ready", true
|
|
}
|
|
region := strings.ToLower(strings.TrimSpace(entry.BestRegion))
|
|
if region != "" && len(missingExternalRegions) > 0 {
|
|
if _, ok := missingExternalRegions[region]; ok {
|
|
if preferredRegion == "" || !strings.EqualFold(strings.TrimSpace(entry.BestRegion), preferredRegion) {
|
|
if connection.State == PeerConnectionDegraded {
|
|
return "recover_external_area", true
|
|
}
|
|
if entry.Warm || entry.RecoverySeed || connection.State == PeerConnectionDisconnected || connection.State == PeerConnectionConnecting {
|
|
return "recover_external_area", true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return "", false
|
|
}
|
|
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
|
|
return "maintain_ready", true
|
|
}
|
|
if connection.State == PeerConnectionDegraded {
|
|
return "recover_degraded", true
|
|
}
|
|
if entry.Warm {
|
|
return "recover_warm", true
|
|
}
|
|
if entry.RecoverySeed {
|
|
return "recover_seed", true
|
|
}
|
|
return "recover_peer", true
|
|
}
|
|
|
|
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string, preferredRegion string) int {
|
|
score := 0
|
|
if entry.Warm {
|
|
score += 1000
|
|
}
|
|
switch entry.WarmReason {
|
|
case "route_adjacent":
|
|
score += 500
|
|
case "recovery_seed":
|
|
score += 350
|
|
case "endpoint_candidate":
|
|
score += 200
|
|
case "peer_endpoint":
|
|
score += 100
|
|
}
|
|
if entry.RecoverySeed {
|
|
score += 250
|
|
}
|
|
if entry.BestCandidateID != "" {
|
|
score += 150
|
|
}
|
|
if entry.PublicIngressCount > 0 {
|
|
score += entry.PublicIngressCount * 90
|
|
}
|
|
preferredRegion = strings.TrimSpace(preferredRegion)
|
|
entryRegion := strings.TrimSpace(entry.BestRegion)
|
|
switch {
|
|
case preferredRegion != "" && entryRegion != "" && !strings.EqualFold(entryRegion, preferredRegion):
|
|
score += 275
|
|
case preferredRegion != "" && entryRegion != "" && strings.EqualFold(entryRegion, preferredRegion):
|
|
score += 25
|
|
}
|
|
score += entry.BestCandidateScore / 10
|
|
switch connection.State {
|
|
case PeerConnectionReady, PeerConnectionRelayReady:
|
|
score += 600
|
|
case PeerConnectionDegraded:
|
|
score += 350
|
|
case PeerConnectionConnecting:
|
|
score += 200
|
|
case PeerConnectionDisconnected:
|
|
score += 100
|
|
}
|
|
switch reason {
|
|
case "maintain_ready":
|
|
score += 500
|
|
case "recover_external_area":
|
|
score += 450
|
|
case "recover_degraded":
|
|
score += 300
|
|
case "recover_seed":
|
|
score += 250
|
|
case "recover_warm":
|
|
score += 150
|
|
}
|
|
if connection.LastLatencyMs > 0 {
|
|
score -= connection.LastLatencyMs / 10
|
|
}
|
|
if score < 0 {
|
|
return 0
|
|
}
|
|
return score
|
|
}
|
|
|
|
func connectablePeerCount(snapshot PeerCacheSnapshot) int {
|
|
count := 0
|
|
for _, entry := range snapshot.Entries {
|
|
if strings.TrimSpace(entry.NodeID) == "" || strings.TrimSpace(entry.Endpoint) == "" {
|
|
continue
|
|
}
|
|
count++
|
|
}
|
|
return count
|
|
}
|