From a02f4fa8aa6646c036d3cf1867e7b5a4f376b349 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Sat, 16 May 2026 11:55:38 +0300 Subject: [PATCH] Score capacity pressure softly --- .../mesh/endpoint_candidate_scoring.go | 9 +++-- .../mesh/endpoint_candidate_scoring_test.go | 34 +++++++++++++++++++ .../DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md | 3 ++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring.go b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring.go index 38426d2..707ed6a 100644 --- a/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring.go +++ b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring.go @@ -237,8 +237,13 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv reasons = append(reasons, "history:failure") } if strings.TrimSpace(observation.LastFailureReason) != "" { - score -= 8 - reasons = append(reasons, "failure:recent") + if strings.TrimSpace(observation.LastFailureReason) == "capacity_limited" { + score -= 4 + reasons = append(reasons, "capacity:limited") + } else { + score -= 8 + reasons = append(reasons, "failure:recent") + } } return score, reasons } diff --git a/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring_test.go b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring_test.go index c1127c1..53ce712 100644 --- a/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring_test.go +++ b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring_test.go @@ -353,6 +353,40 @@ func TestRankPeerEndpointCandidatesDoesNotRewardZeroLatencyFailure(t *testing.T) } } +func TestRankPeerEndpointCandidatesTreatsCapacityAsSoftPressure(t *testing.T) { + now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC) + ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{ + { + EndpointID: "node-b-quic", + NodeID: "node-b", + Transport: "direct_quic", + Address: "quic://node-b.example.test:19443", + Reachability: "public", + ConnectivityMode: "direct", + Priority: 10, + LastVerifiedAt: &now, + }, + }, EndpointCandidateScoreOptions{ + Now: now, + MaxVerificationAge: time.Minute, + Observations: map[string]EndpointCandidateHealthObservation{ + "node-b-quic": { + EndpointID: "node-b-quic", + LastFailureReason: "capacity_limited", + ReliabilityScore: 95, + ObservedAt: now, + }, + }, + MaxObservationAge: time.Minute, + }) + if len(ranked) != 1 || !containsReason(ranked[0].Reasons, "capacity:limited") { + t.Fatalf("capacity pressure reason missing: %+v", ranked) + } + if containsReason(ranked[0].Reasons, "failure:recent") { + t.Fatalf("capacity pressure treated as recent failure: %+v", ranked[0].Reasons) + } +} + func containsReason(reasons []string, reason string) bool { for _, item := range reasons { if item == reason { diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 3d9cc6f..43ffc49 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -374,6 +374,9 @@ but saturated carrier. VPN fabric dial telemetry records the last capacity-limited endpoint and transport, making stream saturation visible without poisoning endpoint health observations. +Endpoint ranking treats `capacity_limited` observations as a soft pressure +penalty instead of a hard recent failure, enabling load spreading without +marking the carrier unhealthy. Cached QUIC carrier idle TTL is configurable through `RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS` / `-vpn-fabric-quic-idle-ttl` and propagated by host-agent install profiles.