Score VPN scheduler pressure

This commit is contained in:
2026-05-16 13:26:07 +03:00
parent db75e1baed
commit f9ff0a4631
3 changed files with 33 additions and 8 deletions
@@ -253,6 +253,7 @@ type FabricFlowSchedulerSnapshot struct {
HighWatermark int `json:"high_watermark"` HighWatermark int `json:"high_watermark"`
BackpressureActive bool `json:"backpressure_active"` BackpressureActive bool `json:"backpressure_active"`
PressureLevel string `json:"pressure_level,omitempty"` PressureLevel string `json:"pressure_level,omitempty"`
PressureScore int `json:"pressure_score,omitempty"`
PressureReasons []string `json:"pressure_reasons,omitempty"` PressureReasons []string `json:"pressure_reasons,omitempty"`
InFlight int `json:"in_flight"` InFlight int `json:"in_flight"`
MaxInFlight int `json:"max_in_flight"` MaxInFlight int `json:"max_in_flight"`
@@ -862,12 +863,13 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
snapshot.AdaptiveBackpressureReason = "bulk_window_reduced_to_protect_interactive" snapshot.AdaptiveBackpressureReason = "bulk_window_reduced_to_protect_interactive"
} }
} }
snapshot.PressureLevel, snapshot.PressureReasons = fabricFlowSchedulerPressure(snapshot) snapshot.PressureLevel, snapshot.PressureScore, snapshot.PressureReasons = fabricFlowSchedulerPressure(snapshot)
return snapshot return snapshot
} }
func fabricFlowSchedulerPressure(snapshot FabricFlowSchedulerSnapshot) (string, []string) { func fabricFlowSchedulerPressure(snapshot FabricFlowSchedulerSnapshot) (string, int, []string) {
level := "nominal" level := "nominal"
score := 0
reasons := []string{} reasons := []string{}
addReason := func(reason string) { addReason := func(reason string) {
reason = strings.TrimSpace(reason) reason = strings.TrimSpace(reason)
@@ -888,33 +890,43 @@ func fabricFlowSchedulerPressure(snapshot FabricFlowSchedulerSnapshot) (string,
} }
if snapshot.Dropped > 0 || snapshot.QualityWindowDropCount > 0 { if snapshot.Dropped > 0 || snapshot.QualityWindowDropCount > 0 {
escalate("critical") escalate("critical")
score += boundedFabricPressureScore(int(snapshot.Dropped)+snapshot.QualityWindowDropCount*10, 20, 40)
addReason("drops") addReason("drops")
} }
if snapshot.FailingChannelCount > 0 || snapshot.QualityWindowFailureCount > 0 { if snapshot.FailingChannelCount > 0 || snapshot.QualityWindowFailureCount > 0 {
escalate("critical") escalate("critical")
score += boundedFabricPressureScore((snapshot.FailingChannelCount+snapshot.QualityWindowFailureCount)*10, 20, 40)
addReason("route_failures") addReason("route_failures")
} }
if snapshot.RouteRecoveredChannelCount > 0 || snapshot.RouteSwitchCount > 0 { if snapshot.RouteRecoveredChannelCount > 0 || snapshot.RouteSwitchCount > 0 {
escalate("warning") escalate("warning")
score += boundedFabricPressureScore(snapshot.RouteRecoveredChannelCount*8+int(snapshot.RouteSwitchCount)*4, 8, 24)
addReason("route_recovery") addReason("route_recovery")
} }
if snapshot.SlowChannelCount > 0 || snapshot.QualityWindowSlowCount > 0 { if snapshot.SlowChannelCount > 0 || snapshot.QualityWindowSlowCount > 0 {
escalate("warning") escalate("warning")
score += boundedFabricPressureScore((snapshot.SlowChannelCount+snapshot.QualityWindowSlowCount)*6, 6, 24)
addReason("slow_channels") addReason("slow_channels")
} }
if snapshot.BulkPressureActive { if snapshot.BulkPressureActive {
escalate("warning") escalate("warning")
score += 15
addReason("bulk_pressure") addReason("bulk_pressure")
} }
if snapshot.AdaptiveBackpressureActive { if snapshot.AdaptiveBackpressureActive {
escalate("warning") escalate("warning")
score += 10
addReason(snapshot.AdaptiveBackpressureReason) addReason(snapshot.AdaptiveBackpressureReason)
} }
if snapshot.BackpressureActive { if snapshot.BackpressureActive {
escalate("warning") escalate("warning")
score += 10
addReason("backpressure") addReason("backpressure")
} }
return level, reasons if score > 100 {
score = 100
}
return level, score, reasons
} }
func flowPressureRank(level string) int { func flowPressureRank(level string) int {
@@ -928,6 +940,16 @@ func flowPressureRank(level string) int {
} }
} }
// boundedFabricPressureScore clamps value to the inclusive range
// [minValue, maxValue] and returns the result.
//
// The lower bound is tested before the upper bound, so a value below
// minValue always yields minValue, even if the caller passes a minValue
// that is greater than maxValue.
func boundedFabricPressureScore(value, minValue, maxValue int) int {
	switch {
	case value < minValue:
		// Floor: a contribution never counts for less than minValue.
		return minValue
	case value > maxValue:
		// Ceiling: a contribution never counts for more than maxValue.
		return maxValue
	default:
		return value
	}
}
func (s *FabricFlowScheduler) recommendedParallelSendWindowForTrafficClassLocked(trafficClass string, maxWindow int) int { func (s *FabricFlowScheduler) recommendedParallelSendWindowForTrafficClassLocked(trafficClass string, maxWindow int) int {
if maxWindow <= 1 { if maxWindow <= 1 {
return 1 return 1
@@ -789,8 +789,8 @@ func TestFabricFlowSchedulerDropsWhenChannelQueueIsFull(t *testing.T) {
if snapshot.Dropped != 1 || !snapshot.BackpressureActive { if snapshot.Dropped != 1 || !snapshot.BackpressureActive {
t.Fatalf("snapshot = %+v, want one dropped packet and active backpressure", snapshot) t.Fatalf("snapshot = %+v, want one dropped packet and active backpressure", snapshot)
} }
if snapshot.PressureLevel != "critical" || !containsString(snapshot.PressureReasons, "drops") { if snapshot.PressureLevel != "critical" || snapshot.PressureScore <= 0 || !containsString(snapshot.PressureReasons, "drops") {
t.Fatalf("pressure = %s/%v, want critical drops", snapshot.PressureLevel, snapshot.PressureReasons) t.Fatalf("pressure = %s score=%d reasons=%v, want critical drops", snapshot.PressureLevel, snapshot.PressureScore, snapshot.PressureReasons)
} }
} }
@@ -1548,9 +1548,10 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler) t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
} }
if snapshot.FlowScheduler.PressureLevel != "critical" || if snapshot.FlowScheduler.PressureLevel != "critical" ||
snapshot.FlowScheduler.PressureScore <= 0 ||
!containsString(snapshot.FlowScheduler.PressureReasons, "route_recovery") || !containsString(snapshot.FlowScheduler.PressureReasons, "route_recovery") ||
!containsString(snapshot.FlowScheduler.PressureReasons, "route_failures") { !containsString(snapshot.FlowScheduler.PressureReasons, "route_failures") {
t.Fatalf("route recovery pressure = %s/%v", snapshot.FlowScheduler.PressureLevel, snapshot.FlowScheduler.PressureReasons) t.Fatalf("route recovery pressure = %s score=%d reasons=%v", snapshot.FlowScheduler.PressureLevel, snapshot.FlowScheduler.PressureScore, snapshot.FlowScheduler.PressureReasons)
} }
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 { if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB) t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB)
@@ -1919,8 +1920,8 @@ func TestFabricFlowSchedulerProtectsInteractiveWindowDuringBulkPressure(t *testi
if !snapshot.BulkPressureActive || snapshot.BulkPressureChannelCount != 16 || snapshot.InteractiveOrControlCount != 1 || !snapshot.BackpressureActive { if !snapshot.BulkPressureActive || snapshot.BulkPressureChannelCount != 16 || snapshot.InteractiveOrControlCount != 1 || !snapshot.BackpressureActive {
t.Fatalf("bulk pressure telemetry = %+v", snapshot) t.Fatalf("bulk pressure telemetry = %+v", snapshot)
} }
if snapshot.PressureLevel != "warning" || !containsString(snapshot.PressureReasons, "bulk_pressure") { if snapshot.PressureLevel != "warning" || snapshot.PressureScore <= 0 || !containsString(snapshot.PressureReasons, "bulk_pressure") {
t.Fatalf("pressure = %s/%v, want warning bulk pressure", snapshot.PressureLevel, snapshot.PressureReasons) t.Fatalf("pressure = %s score=%d reasons=%v, want warning bulk pressure", snapshot.PressureLevel, snapshot.PressureScore, snapshot.PressureReasons)
} }
} }
@@ -465,6 +465,8 @@ Flow-scheduler snapshots now include a machine-readable pressure level
(`nominal`, `warning`, `critical`) and bounded reason list derived from drops, (`nominal`, `warning`, `critical`) and bounded reason list derived from drops,
route failures, route recovery, slow channels, bulk pressure, and adaptive route failures, route recovery, slow channels, bulk pressure, and adaptive
backpressure. backpressure.
Each snapshot also reports a `pressure_score` bounded to 0-100 (omitted from
JSON when zero) for automated route, endpoint, and node comparisons.
`mesh-live-smoke` reports the mixed-load scheduler pressure level and reasons. `mesh-live-smoke` reports the mixed-load scheduler pressure level and reasons.
Endpoint ranking treats `capacity_limited` observations as a soft pressure Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without penalty instead of a hard recent failure, enabling load spreading without