Apply endpoint health in peer cache

This commit is contained in:
2026-05-16 11:26:06 +03:00
parent 9c99899322
commit 831701003c
4 changed files with 91 additions and 30 deletions
@@ -927,6 +927,7 @@ func startSyntheticMeshEndpoint(ctx context.Context, _ context.CancelFunc, cfg c
Local: local, Local: local,
PeerEndpoints: loadedConfig.PeerEndpoints, PeerEndpoints: loadedConfig.PeerEndpoints,
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates, PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
PeerDirectory: loadedConfig.PeerDirectory, PeerDirectory: loadedConfig.PeerDirectory,
RecoverySeeds: loadedConfig.RecoverySeeds, RecoverySeeds: loadedConfig.RecoverySeeds,
RendezvousLeases: loadedConfig.RendezvousLeases, RendezvousLeases: loadedConfig.RendezvousLeases,
@@ -1934,6 +1935,7 @@ func applyRefreshedSyntheticMeshConfig(ctx context.Context, cfg config.Config, i
Local: local, Local: local,
PeerEndpoints: loadedConfig.PeerEndpoints, PeerEndpoints: loadedConfig.PeerEndpoints,
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates, PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
PeerDirectory: loadedConfig.PeerDirectory, PeerDirectory: loadedConfig.PeerDirectory,
RecoverySeeds: loadedConfig.RecoverySeeds, RecoverySeeds: loadedConfig.RecoverySeeds,
RendezvousLeases: loadedConfig.RendezvousLeases, RendezvousLeases: loadedConfig.RendezvousLeases,
@@ -12,6 +12,7 @@ type PeerCacheConfig struct {
Local PeerIdentity Local PeerIdentity
PeerEndpoints map[string]string PeerEndpoints map[string]string
PeerEndpointCandidates map[string][]PeerEndpointCandidate PeerEndpointCandidates map[string][]PeerEndpointCandidate
PeerEndpointObservations map[string]EndpointCandidateHealthObservation
PeerDirectory []PeerDirectoryEntry PeerDirectory []PeerDirectoryEntry
RecoverySeeds []PeerRecoverySeed RecoverySeeds []PeerRecoverySeed
RendezvousLeases []PeerRendezvousLease RendezvousLeases []PeerRendezvousLease
@@ -116,6 +117,8 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
PreferredRegion: cfg.PreferredRegion, PreferredRegion: cfg.PreferredRegion,
Now: now, Now: now,
MaxVerificationAge: time.Hour, MaxVerificationAge: time.Hour,
Observations: cfg.PeerEndpointObservations,
MaxObservationAge: time.Hour,
}) })
if len(scored) > 0 { if len(scored) > 0 {
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored)) entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
@@ -100,6 +100,59 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
} }
} }
func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-quic",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://node-b.example.test:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 1,
LastVerifiedAt: &now,
},
{
EndpointID: "node-b-wss",
NodeID: "node-b",
Transport: "wss",
Address: "https://node-b.example.test:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 1,
LastVerifiedAt: &now,
},
},
},
PeerEndpointObservations: map[string]EndpointCandidateHealthObservation{
"node-b-quic": {
EndpointID: "node-b-quic",
FailureCount: 2,
LastFailureReason: "session_open_failed",
ReliabilityScore: 35,
ObservedAt: now,
},
},
WarmPeerLimit: 1,
Now: now,
})
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
if !ok {
t.Fatal("node-b missing from cache")
}
if entry.BestCandidateID != "node-b-wss" || entry.Endpoint != "https://node-b.example.test:443" {
t.Fatalf("peer cache did not apply endpoint observations: %+v", entry)
}
}
func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) { func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{ cache := NewPeerCache(PeerCacheConfig{
@@ -347,6 +347,9 @@ plane can distinguish local dial feedback from aggregated or policy-generated
health hints. health hints.
The endpoint health heartbeat report also includes the reporter node id at the The endpoint health heartbeat report also includes the reporter node id at the
report level for simpler multi-node ingestion and diagnostics. report level for simpler multi-node ingestion and diagnostics.
Peer cache construction now applies endpoint health observations when ranking
peer endpoint candidates, so recovery and warm-peer decisions see the same
degraded-path feedback as VPN fabric-session dialing.
Deliverables: Deliverables: