Expose VPN fabric bulk pressure telemetry

This commit is contained in:
2026-05-16 13:02:31 +03:00
parent ebdae833fc
commit f1cd43e6f2
3 changed files with 19 additions and 0 deletions
@@ -251,6 +251,9 @@ type FabricFlowSchedulerSnapshot struct {
AdaptiveBackpressureReason string `json:"adaptive_backpressure_reason,omitempty"` AdaptiveBackpressureReason string `json:"adaptive_backpressure_reason,omitempty"`
RecommendedParallelWindows map[string]int `json:"recommended_parallel_windows,omitempty"` RecommendedParallelWindows map[string]int `json:"recommended_parallel_windows,omitempty"`
AdaptivePolicyFingerprint string `json:"adaptive_policy_fingerprint,omitempty"` AdaptivePolicyFingerprint string `json:"adaptive_policy_fingerprint,omitempty"`
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
SlowChannelCount int `json:"slow_channel_count"` SlowChannelCount int `json:"slow_channel_count"`
FailingChannelCount int `json:"failing_channel_count"` FailingChannelCount int `json:"failing_channel_count"`
QualityWindowSampleCount int `json:"quality_window_sample_count"` QualityWindowSampleCount int `json:"quality_window_sample_count"`
@@ -778,6 +781,16 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
if snapshot.QualityWindowDropCount > 0 { if snapshot.QualityWindowDropCount > 0 {
snapshot.BackpressureActive = true snapshot.BackpressureActive = true
} }
snapshot.BulkPressureChannelCount = snapshot.TrafficClassCounts[FabricTrafficClassBulk]
snapshot.InteractiveOrControlCount = snapshot.TrafficClassCounts[FabricTrafficClassControl] + snapshot.TrafficClassCounts[FabricTrafficClassInteractive]
bulkPressureThreshold := s.adaptivePolicy.BulkPressureChannelThreshold
if bulkPressureThreshold <= 0 {
bulkPressureThreshold = defaultFabricServiceChannelAdaptivePolicy().BulkPressureChannelThreshold
}
if snapshot.BulkPressureChannelCount >= bulkPressureThreshold && snapshot.InteractiveOrControlCount > 0 {
snapshot.BulkPressureActive = true
snapshot.BackpressureActive = true
}
for _, trafficClass := range []string{FabricTrafficClassControl, FabricTrafficClassInteractive, FabricTrafficClassReliable, FabricTrafficClassBulk, FabricTrafficClassDroppable} { for _, trafficClass := range []string{FabricTrafficClassControl, FabricTrafficClassInteractive, FabricTrafficClassReliable, FabricTrafficClassBulk, FabricTrafficClassDroppable} {
snapshot.RecommendedParallelWindows[trafficClass] = s.recommendedParallelSendWindowForTrafficClassLocked(trafficClass, s.adaptivePolicy.MaxParallelWindow) snapshot.RecommendedParallelWindows[trafficClass] = s.recommendedParallelSendWindowForTrafficClassLocked(trafficClass, s.adaptivePolicy.MaxParallelWindow)
} }
@@ -1879,6 +1879,9 @@ func TestFabricFlowSchedulerProtectsInteractiveWindowDuringBulkPressure(t *testi
if snapshot.TrafficClassCounts[FabricTrafficClassBulk] != 16 || snapshot.TrafficClassCounts[FabricTrafficClassInteractive] != 1 { if snapshot.TrafficClassCounts[FabricTrafficClassBulk] != 16 || snapshot.TrafficClassCounts[FabricTrafficClassInteractive] != 1 {
t.Fatalf("traffic class counts = %+v", snapshot.TrafficClassCounts) t.Fatalf("traffic class counts = %+v", snapshot.TrafficClassCounts)
} }
if !snapshot.BulkPressureActive || snapshot.BulkPressureChannelCount != 16 || snapshot.InteractiveOrControlCount != 1 || !snapshot.BackpressureActive {
t.Fatalf("bulk pressure telemetry = %+v", snapshot)
}
} }
func TestFabricFlowSchedulerRollingQualityWindowForgetsOldPressure(t *testing.T) { func TestFabricFlowSchedulerRollingQualityWindowForgetsOldPressure(t *testing.T) {
@@ -435,6 +435,9 @@ fresh sample to hide a saturated endpoint.
Heartbeat VPN fabric reports now include a bounded `quic_capacity_pressure`
summary sorted by busiest cached QUIC connection, making overload diagnosis
visible without digging through the full carrier snapshot.
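VPN fabric flow-scheduler snapshots now report whether bulk pressure is active
(`bulk_pressure_active`) together with the bulk and interactive/control channel
counts (`bulk_pressure_channel_count`, `interactive_or_control_channel_count`),
so mixed browser/RDP load can be diagnosed directly when bulk windows are
reduced to protect interactive traffic.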
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.