Apply endpoint health in peer cache
This commit is contained in:
@@ -927,6 +927,7 @@ func startSyntheticMeshEndpoint(ctx context.Context, _ context.CancelFunc, cfg c
|
||||
Local: local,
|
||||
PeerEndpoints: loadedConfig.PeerEndpoints,
|
||||
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
|
||||
PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
|
||||
PeerDirectory: loadedConfig.PeerDirectory,
|
||||
RecoverySeeds: loadedConfig.RecoverySeeds,
|
||||
RendezvousLeases: loadedConfig.RendezvousLeases,
|
||||
@@ -1934,6 +1935,7 @@ func applyRefreshedSyntheticMeshConfig(ctx context.Context, cfg config.Config, i
|
||||
Local: local,
|
||||
PeerEndpoints: loadedConfig.PeerEndpoints,
|
||||
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
|
||||
PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
|
||||
PeerDirectory: loadedConfig.PeerDirectory,
|
||||
RecoverySeeds: loadedConfig.RecoverySeeds,
|
||||
RendezvousLeases: loadedConfig.RendezvousLeases,
|
||||
|
||||
@@ -12,6 +12,7 @@ type PeerCacheConfig struct {
|
||||
Local PeerIdentity
|
||||
PeerEndpoints map[string]string
|
||||
PeerEndpointCandidates map[string][]PeerEndpointCandidate
|
||||
PeerEndpointObservations map[string]EndpointCandidateHealthObservation
|
||||
PeerDirectory []PeerDirectoryEntry
|
||||
RecoverySeeds []PeerRecoverySeed
|
||||
RendezvousLeases []PeerRendezvousLease
|
||||
@@ -116,6 +117,8 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
PreferredRegion: cfg.PreferredRegion,
|
||||
Now: now,
|
||||
MaxVerificationAge: time.Hour,
|
||||
Observations: cfg.PeerEndpointObservations,
|
||||
MaxObservationAge: time.Hour,
|
||||
})
|
||||
if len(scored) > 0 {
|
||||
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
|
||||
|
||||
@@ -100,6 +100,59 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-quic",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://node-b.example.test:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-wss",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "https://node-b.example.test:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
},
|
||||
},
|
||||
PeerEndpointObservations: map[string]EndpointCandidateHealthObservation{
|
||||
"node-b-quic": {
|
||||
EndpointID: "node-b-quic",
|
||||
FailureCount: 2,
|
||||
LastFailureReason: "session_open_failed",
|
||||
ReliabilityScore: 35,
|
||||
ObservedAt: now,
|
||||
},
|
||||
},
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
|
||||
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
|
||||
if !ok {
|
||||
t.Fatal("node-b missing from cache")
|
||||
}
|
||||
if entry.BestCandidateID != "node-b-wss" || entry.Endpoint != "https://node-b.example.test:443" {
|
||||
t.Fatalf("peer cache did not apply endpoint observations: %+v", entry)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
|
||||
@@ -347,6 +347,9 @@ plane can distinguish local dial feedback from aggregated or policy-generated
|
||||
health hints.
|
||||
The endpoint health heartbeat report also includes the reporter node id at the
|
||||
report level for simpler multi-node ingestion and diagnostics.
|
||||
Peer cache construction now applies endpoint health observations when ranking
|
||||
peer endpoint candidates, so recovery and warm-peer decisions see the same
|
||||
degraded-path feedback as VPN fabric-session dialing.
|
||||
|
||||
Deliverables:
|
||||
|
||||
|
||||
Reference in New Issue
Block a user