From 9e964e28cbf6d42b722e6970d20af0817d7bc522 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Sat, 16 May 2026 11:59:27 +0300 Subject: [PATCH] Emit capacity endpoint observations --- .../rap-node-agent/cmd/rap-node-agent/main.go | 20 +++++++++++++++++++ .../cmd/rap-node-agent/main_test.go | 14 +++++++++++++ .../DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md | 3 +++ 3 files changed, 37 insertions(+) diff --git a/agents/rap-node-agent/cmd/rap-node-agent/main.go b/agents/rap-node-agent/cmd/rap-node-agent/main.go index aa3b05c..1b2e662 100644 --- a/agents/rap-node-agent/cmd/rap-node-agent/main.go +++ b/agents/rap-node-agent/cmd/rap-node-agent/main.go @@ -576,6 +576,25 @@ func (s *vpnFabricEndpointObservationStore) ObserveFailure(endpointID string, re s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries) } +func (s *vpnFabricEndpointObservationStore) ObserveCapacity(endpointID string) { + if s == nil || strings.TrimSpace(endpointID) == "" { + return + } + s.mu.Lock() + defer s.mu.Unlock() + observation := s.observations[endpointID] + observation.EndpointID = endpointID + observation.Source = "local_vpn_fabric_session" + observation.ReporterNodeID = s.reporterNodeID + observation.LastFailureReason = "capacity_limited" + if observation.ReliabilityScore <= 0 { + observation.ReliabilityScore = 90 + } + observation.ObservedAt = time.Now().UTC() + s.observations[endpointID] = observation + s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries) +} + func fabricTransportLabelIsQUIC(label string) bool { switch strings.ToLower(strings.TrimSpace(label)) { case "quic", "direct_quic", "udp_quic", "quic_udp": @@ -5063,6 +5082,7 @@ func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity st reason := fabricSessionOpenFailureReason(err) if reason == "capacity_limited" { meshState.VPNFabricSessionDialStats.ObserveCapacityLimited(selectedTarget) + 
meshState.VPNFabricEndpointObservations.ObserveCapacity(selectedTarget.EndpointID) } else { meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason) meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason) diff --git a/agents/rap-node-agent/cmd/rap-node-agent/main_test.go b/agents/rap-node-agent/cmd/rap-node-agent/main_test.go index 10418cb..9c92c73 100644 --- a/agents/rap-node-agent/cmd/rap-node-agent/main_test.go +++ b/agents/rap-node-agent/cmd/rap-node-agent/main_test.go @@ -851,6 +851,20 @@ func TestVPNFabricEndpointObservationStoreTagsLocalSource(t *testing.T) { } } +func TestVPNFabricEndpointObservationStoreRecordsCapacityWithoutFailure(t *testing.T) { + store := newVPNFabricEndpointObservationStore("node-a") + store.ObserveCapacity("endpoint-a") + snapshot := store.Snapshot() + observation := snapshot["endpoint-a"] + if observation.LastFailureReason != "capacity_limited" || + observation.FailureCount != 0 || + observation.ReliabilityScore != 90 || + observation.Source != "local_vpn_fabric_session" || + observation.ReporterNodeID != "node-a" { + t.Fatalf("unexpected capacity observation: %+v", observation) + } +} + func TestVPNFabricEndpointObservationStorePrunesOldAndExcessEntries(t *testing.T) { store := newVPNFabricEndpointObservationStore() now := time.Now().UTC() diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 43ffc49..7ff927c 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -377,6 +377,9 @@ observations. Endpoint ranking treats `capacity_limited` observations as a soft pressure penalty instead of a hard recent failure, enabling load spreading without marking the carrier unhealthy. 
+Local QUIC stream-limit pressure is now emitted as a capacity observation with +no failure-count increment, allowing the control plane to spread load without +treating saturation as packet-path breakage. Cached QUIC carrier idle TTL is configurable through `RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS` / `-vpn-fabric-quic-idle-ttl` and propagated by host-agent install profiles.