Emit capacity endpoint observations

This commit is contained in:
2026-05-16 11:59:27 +03:00
parent a02f4fa8aa
commit 9e964e28cb
3 changed files with 37 additions and 0 deletions
@@ -576,6 +576,25 @@ func (s *vpnFabricEndpointObservationStore) ObserveFailure(endpointID string, re
s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
}
// ObserveCapacity records a "capacity_limited" observation for endpointID.
//
// The call is a no-op when the receiver is nil or endpointID is blank after
// trimming whitespace. Otherwise the existing entry for endpointID (or a zero
// value when none exists) is updated under the store mutex: the identity and
// source fields are stamped, LastFailureReason is set to "capacity_limited",
// and ObservedAt is refreshed to the current UTC time. This method does not
// modify the entry's failure count. A non-positive ReliabilityScore is
// replaced with 90; a positive score already on the entry is kept as is.
// After the write, the store is pruned relative to the new observation time.
func (s *vpnFabricEndpointObservationStore) ObserveCapacity(endpointID string) {
	if s == nil {
		return
	}
	if strings.TrimSpace(endpointID) == "" {
		return
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// Start from whatever is already stored so fields this method does not
	// assign are carried over unchanged.
	entry := s.observations[endpointID]

	// Identity / provenance of this observation.
	entry.EndpointID = endpointID
	entry.ReporterNodeID = s.reporterNodeID
	entry.Source = "local_vpn_fabric_session"

	// Capacity signal: only the reason is recorded here.
	entry.LastFailureReason = "capacity_limited"
	if entry.ReliabilityScore <= 0 {
		// Entry has no usable score yet; seed it with the default of 90.
		entry.ReliabilityScore = 90
	}

	entry.ObservedAt = time.Now().UTC()
	s.observations[endpointID] = entry

	s.pruneLocked(
		entry.ObservedAt,
		vpnFabricEndpointObservationMaxAge,
		maxVPNFabricEndpointObservationEntries,
	)
}
func fabricTransportLabelIsQUIC(label string) bool {
switch strings.ToLower(strings.TrimSpace(label)) {
case "quic", "direct_quic", "udp_quic", "quic_udp":
@@ -5063,6 +5082,7 @@ func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity st
reason := fabricSessionOpenFailureReason(err)
if reason == "capacity_limited" {
meshState.VPNFabricSessionDialStats.ObserveCapacityLimited(selectedTarget)
meshState.VPNFabricEndpointObservations.ObserveCapacity(selectedTarget.EndpointID)
} else {
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason)
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason)
@@ -851,6 +851,20 @@ func TestVPNFabricEndpointObservationStoreTagsLocalSource(t *testing.T) {
}
}
// TestVPNFabricEndpointObservationStoreRecordsCapacityWithoutFailure checks
// that a single ObserveCapacity call on a fresh store yields an observation
// tagged "capacity_limited" with a zero failure count, the default
// reliability score of 90, the local session source, and the reporter node ID
// the store was constructed with.
func TestVPNFabricEndpointObservationStoreRecordsCapacityWithoutFailure(t *testing.T) {
	store := newVPNFabricEndpointObservationStore("node-a")
	store.ObserveCapacity("endpoint-a")

	got := store.Snapshot()["endpoint-a"]

	// Any single mismatch fails the test; the full observation is printed so
	// the offending field can be read from the output.
	switch {
	case got.LastFailureReason != "capacity_limited",
		got.FailureCount != 0,
		got.ReliabilityScore != 90,
		got.Source != "local_vpn_fabric_session",
		got.ReporterNodeID != "node-a":
		t.Fatalf("unexpected capacity observation: %+v", got)
	}
}
func TestVPNFabricEndpointObservationStorePrunesOldAndExcessEntries(t *testing.T) {
store := newVPNFabricEndpointObservationStore()
now := time.Now().UTC()
@@ -377,6 +377,9 @@ observations.
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.
Local QUIC stream-limit pressure is now emitted as a capacity observation with
no failure-count increment, allowing the control plane to spread load without
treating saturation as packet-path breakage.
Cached QUIC carrier idle TTL is configurable through
`RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS` / `-vpn-fabric-quic-idle-ttl` and
propagated by host-agent install profiles.