Apply endpoint health in peer cache

This commit is contained in:
2026-05-16 11:26:06 +03:00
parent 9c99899322
commit 831701003c
4 changed files with 91 additions and 30 deletions
@@ -924,16 +924,17 @@ func startSyntheticMeshEndpoint(ctx context.Context, _ context.CancelFunc, cfg c
productionForwardingEnabled := cfg.MeshProductionForwardingEnabled || loadedConfig.ProductionForwarding productionForwardingEnabled := cfg.MeshProductionForwardingEnabled || loadedConfig.ProductionForwarding
routeHealthRoutes := routeHealthRoutesFromPathDecisions(routes, loadedConfig.RoutePathDecisions) routeHealthRoutes := routeHealthRoutesFromPathDecisions(routes, loadedConfig.RoutePathDecisions)
peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{ peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{
Local: local, Local: local,
PeerEndpoints: loadedConfig.PeerEndpoints, PeerEndpoints: loadedConfig.PeerEndpoints,
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates, PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
PeerDirectory: loadedConfig.PeerDirectory, PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
RecoverySeeds: loadedConfig.RecoverySeeds, PeerDirectory: loadedConfig.PeerDirectory,
RendezvousLeases: loadedConfig.RendezvousLeases, RecoverySeeds: loadedConfig.RecoverySeeds,
Routes: loadedConfig.Routes, RendezvousLeases: loadedConfig.RendezvousLeases,
WarmPeerLimit: mesh.DefaultWarmPeerLimit, Routes: loadedConfig.Routes,
PreferredRegion: cfg.MeshRegion, WarmPeerLimit: mesh.DefaultWarmPeerLimit,
Now: time.Now().UTC(), PreferredRegion: cfg.MeshRegion,
Now: time.Now().UTC(),
}) })
peerCacheSnapshot := peerCache.Snapshot() peerCacheSnapshot := peerCache.Snapshot()
peerConnections := mesh.NewPeerConnectionTracker(peerCacheSnapshot, time.Now().UTC()) peerConnections := mesh.NewPeerConnectionTracker(peerCacheSnapshot, time.Now().UTC())
@@ -1931,16 +1932,17 @@ func refreshSyntheticMeshConfigForRouteHealthFeedback(ctx context.Context, cfg c
func applyRefreshedSyntheticMeshConfig(ctx context.Context, cfg config.Config, identity state.Identity, meshState *syntheticMeshState, loadedConfig loadedSyntheticMeshConfig, local mesh.PeerIdentity, preferredRegion string, observedAt time.Time) { func applyRefreshedSyntheticMeshConfig(ctx context.Context, cfg config.Config, identity state.Identity, meshState *syntheticMeshState, loadedConfig loadedSyntheticMeshConfig, local mesh.PeerIdentity, preferredRegion string, observedAt time.Time) {
routeHealthRoutes := routeHealthRoutesFromPathDecisions(loadedConfig.Routes, loadedConfig.RoutePathDecisions) routeHealthRoutes := routeHealthRoutesFromPathDecisions(loadedConfig.Routes, loadedConfig.RoutePathDecisions)
peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{ peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{
Local: local, Local: local,
PeerEndpoints: loadedConfig.PeerEndpoints, PeerEndpoints: loadedConfig.PeerEndpoints,
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates, PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
PeerDirectory: loadedConfig.PeerDirectory, PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
RecoverySeeds: loadedConfig.RecoverySeeds, PeerDirectory: loadedConfig.PeerDirectory,
RendezvousLeases: loadedConfig.RendezvousLeases, RecoverySeeds: loadedConfig.RecoverySeeds,
Routes: loadedConfig.Routes, RendezvousLeases: loadedConfig.RendezvousLeases,
WarmPeerLimit: mesh.DefaultWarmPeerLimit, Routes: loadedConfig.Routes,
PreferredRegion: preferredRegion, WarmPeerLimit: mesh.DefaultWarmPeerLimit,
Now: observedAt, PreferredRegion: preferredRegion,
Now: observedAt,
}) })
if meshState.PeerConnections == nil { if meshState.PeerConnections == nil {
meshState.PeerConnections = mesh.NewPeerConnectionTracker(peerCache.Snapshot(), observedAt) meshState.PeerConnections = mesh.NewPeerConnectionTracker(peerCache.Snapshot(), observedAt)
@@ -9,16 +9,17 @@ import (
const DefaultWarmPeerLimit = 8 const DefaultWarmPeerLimit = 8
type PeerCacheConfig struct { type PeerCacheConfig struct {
Local PeerIdentity Local PeerIdentity
PeerEndpoints map[string]string PeerEndpoints map[string]string
PeerEndpointCandidates map[string][]PeerEndpointCandidate PeerEndpointCandidates map[string][]PeerEndpointCandidate
PeerDirectory []PeerDirectoryEntry PeerEndpointObservations map[string]EndpointCandidateHealthObservation
RecoverySeeds []PeerRecoverySeed PeerDirectory []PeerDirectoryEntry
RendezvousLeases []PeerRendezvousLease RecoverySeeds []PeerRecoverySeed
Routes []SyntheticRoute RendezvousLeases []PeerRendezvousLease
WarmPeerLimit int Routes []SyntheticRoute
PreferredRegion string WarmPeerLimit int
Now time.Time PreferredRegion string
Now time.Time
} }
type PeerCache struct { type PeerCache struct {
@@ -116,6 +117,8 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
PreferredRegion: cfg.PreferredRegion, PreferredRegion: cfg.PreferredRegion,
Now: now, Now: now,
MaxVerificationAge: time.Hour, MaxVerificationAge: time.Hour,
Observations: cfg.PeerEndpointObservations,
MaxObservationAge: time.Hour,
}) })
if len(scored) > 0 { if len(scored) > 0 {
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored)) entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
@@ -100,6 +100,59 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
} }
} }
func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-quic",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://node-b.example.test:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 1,
LastVerifiedAt: &now,
},
{
EndpointID: "node-b-wss",
NodeID: "node-b",
Transport: "wss",
Address: "https://node-b.example.test:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 1,
LastVerifiedAt: &now,
},
},
},
PeerEndpointObservations: map[string]EndpointCandidateHealthObservation{
"node-b-quic": {
EndpointID: "node-b-quic",
FailureCount: 2,
LastFailureReason: "session_open_failed",
ReliabilityScore: 35,
ObservedAt: now,
},
},
WarmPeerLimit: 1,
Now: now,
})
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
if !ok {
t.Fatal("node-b missing from cache")
}
if entry.BestCandidateID != "node-b-wss" || entry.Endpoint != "https://node-b.example.test:443" {
t.Fatalf("peer cache did not apply endpoint observations: %+v", entry)
}
}
func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) { func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{ cache := NewPeerCache(PeerCacheConfig{
@@ -347,6 +347,9 @@ plane can distinguish local dial feedback from aggregated or policy-generated
health hints. health hints.
The endpoint health heartbeat report also includes the reporter node id at the The endpoint health heartbeat report also includes the reporter node id at the
report level for simpler multi-node ingestion and diagnostics. report level for simpler multi-node ingestion and diagnostics.
Peer cache construction now applies endpoint health observations when ranking
peer endpoint candidates, so recovery and warm-peer decisions see the same
degraded-path feedback as VPN fabric-session dialing.
Deliverables: Deliverables: