Report QUIC fabric capacity pressure
This commit is contained in:
@@ -423,8 +423,11 @@ type vpnFabricSessionDialStats struct {
|
|||||||
PinnedCertSelected atomic.Int64
|
PinnedCertSelected atomic.Int64
|
||||||
LastTransport atomic.Value
|
LastTransport atomic.Value
|
||||||
LastEndpoint atomic.Value
|
LastEndpoint atomic.Value
|
||||||
|
LastCapacityEndpoint atomic.Value
|
||||||
|
LastCapacityTransport atomic.Value
|
||||||
LastFailureReason atomic.Value
|
LastFailureReason atomic.Value
|
||||||
LastSelectedUnixSec atomic.Int64
|
LastSelectedUnixSec atomic.Int64
|
||||||
|
LastCapacityUnixSec atomic.Int64
|
||||||
LastFailureUnixSec atomic.Int64
|
LastFailureUnixSec atomic.Int64
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -610,6 +613,20 @@ func (s *vpnFabricSessionDialStats) ObserveCandidateFailure(reason string) {
|
|||||||
s.LastFailureUnixSec.Store(time.Now().UTC().Unix())
|
s.LastFailureUnixSec.Store(time.Now().UTC().Unix())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *vpnFabricSessionDialStats) ObserveCapacityLimited(target mesh.FabricTransportTarget) {
|
||||||
|
if s == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.ObserveCandidateFailure("capacity_limited")
|
||||||
|
s.LastCapacityEndpoint.Store(strings.TrimSpace(target.Endpoint))
|
||||||
|
transport := strings.TrimSpace(target.Transport)
|
||||||
|
if transport == "" {
|
||||||
|
transport = "legacy_peer_endpoint"
|
||||||
|
}
|
||||||
|
s.LastCapacityTransport.Store(transport)
|
||||||
|
s.LastCapacityUnixSec.Store(time.Now().UTC().Unix())
|
||||||
|
}
|
||||||
|
|
||||||
func (s *vpnFabricSessionDialStats) ObserveAllCandidatesFailed() {
|
func (s *vpnFabricSessionDialStats) ObserveAllCandidatesFailed() {
|
||||||
if s == nil {
|
if s == nil {
|
||||||
return
|
return
|
||||||
@@ -664,6 +681,7 @@ func (s *vpnFabricSessionDialStats) Report(observedAt time.Time) map[string]any
|
|||||||
"legacy_selected": s.LegacySelected.Load(),
|
"legacy_selected": s.LegacySelected.Load(),
|
||||||
"pinned_cert_selected": s.PinnedCertSelected.Load(),
|
"pinned_cert_selected": s.PinnedCertSelected.Load(),
|
||||||
"last_selected_unix_sec": s.LastSelectedUnixSec.Load(),
|
"last_selected_unix_sec": s.LastSelectedUnixSec.Load(),
|
||||||
|
"last_capacity_unix_sec": s.LastCapacityUnixSec.Load(),
|
||||||
"last_failure_unix_sec": s.LastFailureUnixSec.Load(),
|
"last_failure_unix_sec": s.LastFailureUnixSec.Load(),
|
||||||
}
|
}
|
||||||
if value, ok := s.LastTransport.Load().(string); ok && value != "" {
|
if value, ok := s.LastTransport.Load().(string); ok && value != "" {
|
||||||
@@ -672,6 +690,12 @@ func (s *vpnFabricSessionDialStats) Report(observedAt time.Time) map[string]any
|
|||||||
if value, ok := s.LastEndpoint.Load().(string); ok && value != "" {
|
if value, ok := s.LastEndpoint.Load().(string); ok && value != "" {
|
||||||
report["last_endpoint"] = value
|
report["last_endpoint"] = value
|
||||||
}
|
}
|
||||||
|
if value, ok := s.LastCapacityEndpoint.Load().(string); ok && value != "" {
|
||||||
|
report["last_capacity_endpoint"] = value
|
||||||
|
}
|
||||||
|
if value, ok := s.LastCapacityTransport.Load().(string); ok && value != "" {
|
||||||
|
report["last_capacity_transport"] = value
|
||||||
|
}
|
||||||
if value, ok := s.LastFailureReason.Load().(string); ok && value != "" {
|
if value, ok := s.LastFailureReason.Load().(string); ok && value != "" {
|
||||||
report["last_failure_reason"] = value
|
report["last_failure_reason"] = value
|
||||||
}
|
}
|
||||||
@@ -5037,8 +5061,10 @@ func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity st
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
cancel()
|
cancel()
|
||||||
reason := fabricSessionOpenFailureReason(err)
|
reason := fabricSessionOpenFailureReason(err)
|
||||||
|
if reason == "capacity_limited" {
|
||||||
|
meshState.VPNFabricSessionDialStats.ObserveCapacityLimited(selectedTarget)
|
||||||
|
} else {
|
||||||
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason)
|
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason)
|
||||||
if reason != "capacity_limited" {
|
|
||||||
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason)
|
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason)
|
||||||
}
|
}
|
||||||
log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=%s error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, reason, err)
|
log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=%s error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, reason, err)
|
||||||
|
|||||||
@@ -776,7 +776,10 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
|
|||||||
func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
||||||
stats := newVPNFabricSessionDialStats()
|
stats := newVPNFabricSessionDialStats()
|
||||||
stats.Attempts.Add(1)
|
stats.Attempts.Add(1)
|
||||||
stats.ObserveCandidateFailure("capacity_limited")
|
stats.ObserveCapacityLimited(mesh.FabricTransportTarget{
|
||||||
|
Endpoint: "quic://node-b.example.test:19443",
|
||||||
|
Transport: "direct_quic",
|
||||||
|
})
|
||||||
stats.ObserveCandidateFailure("session_open_failed")
|
stats.ObserveCandidateFailure("session_open_failed")
|
||||||
stats.ObserveSelected(mesh.FabricTransportTarget{
|
stats.ObserveSelected(mesh.FabricTransportTarget{
|
||||||
Endpoint: "quic://node-b.example.test:19443",
|
Endpoint: "quic://node-b.example.test:19443",
|
||||||
@@ -794,6 +797,8 @@ func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
|||||||
report["pinned_cert_selected"] != int64(1) ||
|
report["pinned_cert_selected"] != int64(1) ||
|
||||||
report["last_transport"] != "direct_quic" ||
|
report["last_transport"] != "direct_quic" ||
|
||||||
report["last_endpoint"] != "quic://node-b.example.test:19443" ||
|
report["last_endpoint"] != "quic://node-b.example.test:19443" ||
|
||||||
|
report["last_capacity_endpoint"] != "quic://node-b.example.test:19443" ||
|
||||||
|
report["last_capacity_transport"] != "direct_quic" ||
|
||||||
report["last_failure_reason"] != "session_open_failed" {
|
report["last_failure_reason"] != "session_open_failed" {
|
||||||
t.Fatalf("unexpected dial stats report: %+v", report)
|
t.Fatalf("unexpected dial stats report: %+v", report)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -371,6 +371,9 @@ profiles.
|
|||||||
QUIC stream-limit rejects are classified as capacity pressure instead of peer
|
QUIC stream-limit rejects are classified as capacity pressure instead of peer
|
||||||
endpoint failure, so local health feedback does not incorrectly demote a healthy
|
endpoint failure, so local health feedback does not incorrectly demote a healthy
|
||||||
but saturated carrier.
|
but saturated carrier.
|
||||||
|
VPN fabric dial telemetry records the last capacity-limited endpoint and
|
||||||
|
transport, making stream saturation visible without poisoning endpoint health
|
||||||
|
observations.
|
||||||
Cached QUIC carrier idle TTL is configurable through
|
Cached QUIC carrier idle TTL is configurable through
|
||||||
`RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS` / `-vpn-fabric-quic-idle-ttl` and
|
`RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS` / `-vpn-fabric-quic-idle-ttl` and
|
||||||
propagated by host-agent install profiles.
|
propagated by host-agent install profiles.
|
||||||
|
|||||||
Reference in New Issue
Block a user