Emit capacity endpoint observations
This commit is contained in:
@@ -576,6 +576,25 @@ func (s *vpnFabricEndpointObservationStore) ObserveFailure(endpointID string, re
|
|||||||
s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
|
s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *vpnFabricEndpointObservationStore) ObserveCapacity(endpointID string) {
|
||||||
|
if s == nil || strings.TrimSpace(endpointID) == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
observation := s.observations[endpointID]
|
||||||
|
observation.EndpointID = endpointID
|
||||||
|
observation.Source = "local_vpn_fabric_session"
|
||||||
|
observation.ReporterNodeID = s.reporterNodeID
|
||||||
|
observation.LastFailureReason = "capacity_limited"
|
||||||
|
if observation.ReliabilityScore <= 0 {
|
||||||
|
observation.ReliabilityScore = 90
|
||||||
|
}
|
||||||
|
observation.ObservedAt = time.Now().UTC()
|
||||||
|
s.observations[endpointID] = observation
|
||||||
|
s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
|
||||||
|
}
|
||||||
|
|
||||||
func fabricTransportLabelIsQUIC(label string) bool {
|
func fabricTransportLabelIsQUIC(label string) bool {
|
||||||
switch strings.ToLower(strings.TrimSpace(label)) {
|
switch strings.ToLower(strings.TrimSpace(label)) {
|
||||||
case "quic", "direct_quic", "udp_quic", "quic_udp":
|
case "quic", "direct_quic", "udp_quic", "quic_udp":
|
||||||
@@ -5063,6 +5082,7 @@ func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity st
|
|||||||
reason := fabricSessionOpenFailureReason(err)
|
reason := fabricSessionOpenFailureReason(err)
|
||||||
if reason == "capacity_limited" {
|
if reason == "capacity_limited" {
|
||||||
meshState.VPNFabricSessionDialStats.ObserveCapacityLimited(selectedTarget)
|
meshState.VPNFabricSessionDialStats.ObserveCapacityLimited(selectedTarget)
|
||||||
|
meshState.VPNFabricEndpointObservations.ObserveCapacity(selectedTarget.EndpointID)
|
||||||
} else {
|
} else {
|
||||||
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason)
|
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason)
|
||||||
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason)
|
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason)
|
||||||
|
|||||||
@@ -851,6 +851,20 @@ func TestVPNFabricEndpointObservationStoreTagsLocalSource(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestVPNFabricEndpointObservationStoreRecordsCapacityWithoutFailure(t *testing.T) {
|
||||||
|
store := newVPNFabricEndpointObservationStore("node-a")
|
||||||
|
store.ObserveCapacity("endpoint-a")
|
||||||
|
snapshot := store.Snapshot()
|
||||||
|
observation := snapshot["endpoint-a"]
|
||||||
|
if observation.LastFailureReason != "capacity_limited" ||
|
||||||
|
observation.FailureCount != 0 ||
|
||||||
|
observation.ReliabilityScore != 90 ||
|
||||||
|
observation.Source != "local_vpn_fabric_session" ||
|
||||||
|
observation.ReporterNodeID != "node-a" {
|
||||||
|
t.Fatalf("unexpected capacity observation: %+v", observation)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestVPNFabricEndpointObservationStorePrunesOldAndExcessEntries(t *testing.T) {
|
func TestVPNFabricEndpointObservationStorePrunesOldAndExcessEntries(t *testing.T) {
|
||||||
store := newVPNFabricEndpointObservationStore()
|
store := newVPNFabricEndpointObservationStore()
|
||||||
now := time.Now().UTC()
|
now := time.Now().UTC()
|
||||||
|
|||||||
@@ -377,6 +377,9 @@ observations.
|
|||||||
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
||||||
penalty instead of a hard recent failure, enabling load spreading without
|
penalty instead of a hard recent failure, enabling load spreading without
|
||||||
marking the carrier unhealthy.
|
marking the carrier unhealthy.
|
||||||
|
Local QUIC stream-limit pressure is now emitted as a capacity observation with
|
||||||
|
no failure-count increment, allowing the control plane to spread load without
|
||||||
|
treating saturation as packet-path breakage.
|
||||||
Cached QUIC carrier idle TTL is configurable through
|
Cached QUIC carrier idle TTL is configurable through
|
||||||
`RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS` / `-vpn-fabric-quic-idle-ttl` and
|
`RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS` / `-vpn-fabric-quic-idle-ttl` and
|
||||||
propagated by host-agent install profiles.
|
propagated by host-agent install profiles.
|
||||||
|
|||||||
Reference in New Issue
Block a user