Track VPN pressure history

This commit is contained in:
2026-05-16 13:47:42 +03:00
parent 6a46063565
commit 8e9402580f
5 changed files with 171 additions and 51 deletions
@@ -910,6 +910,7 @@ func vpnFabricFlowPressureReport(snapshot vpnruntime.FabricFlowSchedulerSnapshot
"pressure_score": snapshot.PressureScore,
"pressure_reasons": append([]string{}, snapshot.PressureReasons...),
"recommended_action": snapshot.RecommendedAction,
"pressure_history": copyFabricFlowPressureHistory(snapshot.PressureHistory),
"backpressure_active": snapshot.BackpressureActive,
"bulk_pressure_active": snapshot.BulkPressureActive,
"bulk_pressure_channel_count": snapshot.BulkPressureChannelCount,
@@ -936,6 +937,18 @@ func vpnFabricFlowPressureReport(snapshot vpnruntime.FabricFlowSchedulerSnapshot
return report
}
// copyFabricFlowPressureHistory returns a deep copy of in so that the report
// does not share backing storage with the snapshot it was built from.
//
// Every sample is copied by value and receives its own freshly allocated
// PressureReasons slice (non-nil even when the source slice was nil). A nil
// or empty input produces an empty, non-nil slice rather than nil.
func copyFabricFlowPressureHistory(in []vpnruntime.FabricFlowPressureHistorySample) []vpnruntime.FabricFlowPressureHistorySample {
	// make with a zero length already yields an empty non-nil slice, so the
	// empty-input case needs no separate branch.
	history := make([]vpnruntime.FabricFlowPressureHistorySample, len(in))
	for i := range in {
		history[i] = in[i]
		// Detach the reasons slice so later edits to the source cannot leak in.
		history[i].PressureReasons = append([]string{}, in[i].PressureReasons...)
	}
	return history
}
func copyStringIntMap(in map[string]int) map[string]int {
if len(in) == 0 {
return map[string]int{}
@@ -1260,12 +1260,25 @@ func TestVPNFabricFlowPressureReportIncludesRecommendedAction(t *testing.T) {
RouteRecoveredChannelCount: 0,
RouteRecoveryMaxMillis: 0,
RouteRecoveryAvgMillis: 0,
PressureHistory: []vpnruntime.FabricFlowPressureHistorySample{
{
ObservedAt: "2026-05-16T12:00:00Z",
PressureLevel: "warning",
PressureScore: 35,
PressureReasons: []string{"bulk_pressure"},
RecommendedAction: "throttle_bulk",
},
},
})
if report["recommended_action"] != "throttle_bulk" ||
report["pressure_score"] != 35 ||
report["bulk_pressure_channel_count"] != 16 {
t.Fatalf("unexpected flow pressure report: %+v", report)
}
history, ok := report["pressure_history"].([]vpnruntime.FabricFlowPressureHistorySample)
if !ok || len(history) != 1 || history[0].RecommendedAction != "throttle_bulk" {
t.Fatalf("unexpected flow pressure history: %+v", report["pressure_history"])
}
}
func TestMergedEndpointCandidateObservationsKeepsNewest(t *testing.T) {
@@ -7,6 +7,7 @@ import (
"fmt"
"hash/fnv"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
@@ -26,6 +27,7 @@ const (
defaultFabricFlowFailureThreshold = 2
defaultFabricFlowSlowSendThreshold = 2 * time.Second
defaultFabricRouteQualitySwitchThreshold = 30
defaultFabricFlowPressureHistoryCapacity = 8
)
type FabricPacketTransport struct {
@@ -130,17 +132,19 @@ type FabricServiceChannelRouteQualityPreference struct {
}
// FabricFlowScheduler holds the flow scheduler's queues and counters together
// with a short history of pressure assessments. Its Snapshot method reports
// this state as a FabricFlowSchedulerSnapshot.
type FabricFlowScheduler struct {
mu sync.Mutex
shardCount int
queueCapacity int
adaptivePolicy FabricServiceChannelAdaptivePolicy
queues map[string]*fabricFlowQueue
enqueued uint64
dequeued uint64
dropped uint64
highWatermark int
inFlight int
maxInFlight int
mu sync.Mutex
shardCount int
queueCapacity int
adaptivePolicy FabricServiceChannelAdaptivePolicy
queues map[string]*fabricFlowQueue
enqueued uint64
dequeued uint64
dropped uint64
highWatermark int
inFlight int
maxInFlight int
// pressureHistory holds the most recent pressure samples, oldest first.
// recordPressureHistoryLocked appends to it and trims it to
// defaultFabricFlowPressureHistoryCapacity entries.
pressureHistory []FabricFlowPressureHistorySample
// lastPressureFingerprint is the fingerprint of the pressure state that
// produced the newest history sample; a sample is only appended when the
// current fingerprint differs from it.
lastPressureFingerprint string
}
type FabricServiceChannelAdaptivePolicy struct {
@@ -239,46 +243,55 @@ type FabricScheduledPacketBatch struct {
}
// FabricFlowSchedulerSnapshot is the JSON-serializable view of scheduler state
// returned by FabricFlowScheduler.Snapshot. Fields tagged omitempty are left
// out of the encoded output when they hold their zero value.
type FabricFlowSchedulerSnapshot struct {
SchemaVersion string `json:"schema_version"`
Enabled bool `json:"enabled"`
ServiceNeutral bool `json:"service_neutral"`
Classifier string `json:"classifier"`
ServiceMode string `json:"service_mode"`
ShardCount int `json:"shard_count"`
QueueCapacity int `json:"queue_capacity"`
ChannelCount int `json:"channel_count"`
Enqueued uint64 `json:"enqueued"`
Dequeued uint64 `json:"dequeued"`
Dropped uint64 `json:"dropped"`
HighWatermark int `json:"high_watermark"`
BackpressureActive bool `json:"backpressure_active"`
PressureLevel string `json:"pressure_level,omitempty"`
PressureScore int `json:"pressure_score,omitempty"`
PressureReasons []string `json:"pressure_reasons,omitempty"`
RecommendedAction string `json:"recommended_action,omitempty"`
InFlight int `json:"in_flight"`
MaxInFlight int `json:"max_in_flight"`
AdaptiveBackpressureActive bool `json:"adaptive_backpressure_active,omitempty"`
AdaptiveBackpressureReason string `json:"adaptive_backpressure_reason,omitempty"`
RecommendedParallelWindows map[string]int `json:"recommended_parallel_windows,omitempty"`
AdaptivePolicyFingerprint string `json:"adaptive_policy_fingerprint,omitempty"`
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
RouteRecoveredChannelCount int `json:"route_recovered_channel_count,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
RouteRecoveryMaxMillis int64 `json:"route_recovery_max_ms,omitempty"`
RouteRecoveryAvgMillis int64 `json:"route_recovery_avg_ms,omitempty"`
RouteSwitchReasonCounts map[string]int `json:"route_switch_reason_counts,omitempty"`
SlowChannelCount int `json:"slow_channel_count"`
FailingChannelCount int `json:"failing_channel_count"`
QualityWindowSampleCount int `json:"quality_window_sample_count"`
QualityWindowFailureCount int `json:"quality_window_failure_count"`
QualityWindowSlowCount int `json:"quality_window_slow_count"`
QualityWindowDropCount int `json:"quality_window_drop_count"`
QueueDepths map[string]int `json:"queue_depths"`
TrafficClassCounts map[string]int `json:"traffic_class_counts,omitempty"`
ChannelStats map[string]FabricFlowStat `json:"channel_stats"`
SchemaVersion string `json:"schema_version"`
Enabled bool `json:"enabled"`
ServiceNeutral bool `json:"service_neutral"`
Classifier string `json:"classifier"`
ServiceMode string `json:"service_mode"`
ShardCount int `json:"shard_count"`
QueueCapacity int `json:"queue_capacity"`
ChannelCount int `json:"channel_count"`
Enqueued uint64 `json:"enqueued"`
Dequeued uint64 `json:"dequeued"`
Dropped uint64 `json:"dropped"`
HighWatermark int `json:"high_watermark"`
BackpressureActive bool `json:"backpressure_active"`
PressureLevel string `json:"pressure_level,omitempty"`
PressureScore int `json:"pressure_score,omitempty"`
PressureReasons []string `json:"pressure_reasons,omitempty"`
RecommendedAction string `json:"recommended_action,omitempty"`
// PressureHistory is a copy of the scheduler's recent pressure samples,
// oldest first, filled in by recordPressureHistoryLocked when the snapshot
// is taken.
PressureHistory []FabricFlowPressureHistorySample `json:"pressure_history,omitempty"`
InFlight int `json:"in_flight"`
MaxInFlight int `json:"max_in_flight"`
AdaptiveBackpressureActive bool `json:"adaptive_backpressure_active,omitempty"`
AdaptiveBackpressureReason string `json:"adaptive_backpressure_reason,omitempty"`
RecommendedParallelWindows map[string]int `json:"recommended_parallel_windows,omitempty"`
AdaptivePolicyFingerprint string `json:"adaptive_policy_fingerprint,omitempty"`
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
RouteRecoveredChannelCount int `json:"route_recovered_channel_count,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
RouteRecoveryMaxMillis int64 `json:"route_recovery_max_ms,omitempty"`
RouteRecoveryAvgMillis int64 `json:"route_recovery_avg_ms,omitempty"`
RouteSwitchReasonCounts map[string]int `json:"route_switch_reason_counts,omitempty"`
SlowChannelCount int `json:"slow_channel_count"`
FailingChannelCount int `json:"failing_channel_count"`
QualityWindowSampleCount int `json:"quality_window_sample_count"`
QualityWindowFailureCount int `json:"quality_window_failure_count"`
QualityWindowSlowCount int `json:"quality_window_slow_count"`
QualityWindowDropCount int `json:"quality_window_drop_count"`
QueueDepths map[string]int `json:"queue_depths"`
TrafficClassCounts map[string]int `json:"traffic_class_counts,omitempty"`
ChannelStats map[string]FabricFlowStat `json:"channel_stats"`
}
// FabricFlowPressureHistorySample is one entry in a scheduler's pressure
// history: the pressure level, score, reasons and recommended action reported
// by a snapshot, together with the time it was observed.
type FabricFlowPressureHistorySample struct {
// ObservedAt is the observation time. The scheduler writes it as a UTC
// timestamp in time.RFC3339Nano format.
ObservedAt string `json:"observed_at"`
// PressureLevel is the snapshot's pressure level, e.g. "nominal" or "critical".
PressureLevel string `json:"pressure_level"`
// PressureScore is the snapshot's numeric pressure score.
PressureScore int `json:"pressure_score"`
// PressureReasons lists the reasons behind the level; omitted from JSON when empty.
PressureReasons []string `json:"pressure_reasons,omitempty"`
// RecommendedAction is the snapshot's recommended action, e.g. "observe".
RecommendedAction string `json:"recommended_action"`
}
type FabricFlowStat struct {
@@ -866,9 +879,53 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
}
snapshot.PressureLevel, snapshot.PressureScore, snapshot.PressureReasons = fabricFlowSchedulerPressure(snapshot)
snapshot.RecommendedAction = fabricFlowSchedulerRecommendedAction(snapshot)
s.recordPressureHistoryLocked(&snapshot, time.Now())
return snapshot
}
// recordPressureHistoryLocked appends a history sample when the pressure state
// in snapshot differs from the last recorded one, then stores a copy of the
// history in snapshot.PressureHistory.
//
// A change is detected by comparing fabricFlowPressureFingerprint(*snapshot)
// with s.lastPressureFingerprint, so repeated snapshots of an unchanged state
// do not add entries. The history is capped at
// defaultFabricFlowPressureHistoryCapacity samples; the oldest are discarded
// first. observedAt is stored in UTC using time.RFC3339Nano.
//
// A nil receiver or nil snapshot is a no-op.
//
// NOTE(review): the "Locked" suffix suggests the caller must already hold
// s.mu — confirm against the call site in Snapshot.
func (s *FabricFlowScheduler) recordPressureHistoryLocked(snapshot *FabricFlowSchedulerSnapshot, observedAt time.Time) {
	if s == nil || snapshot == nil {
		return
	}

	if current := fabricFlowPressureFingerprint(*snapshot); current != s.lastPressureFingerprint {
		sample := FabricFlowPressureHistorySample{
			ObservedAt:        observedAt.UTC().Format(time.RFC3339Nano),
			PressureLevel:     snapshot.PressureLevel,
			PressureScore:     snapshot.PressureScore,
			PressureReasons:   append([]string{}, snapshot.PressureReasons...),
			RecommendedAction: snapshot.RecommendedAction,
		}
		s.pressureHistory = append(s.pressureHistory, sample)

		// Keep only the newest entries. Copying into a fresh slice lets the
		// dropped samples be released instead of lingering in the old array.
		if overflow := len(s.pressureHistory) - defaultFabricFlowPressureHistoryCapacity; overflow > 0 {
			s.pressureHistory = append([]FabricFlowPressureHistorySample{}, s.pressureHistory[overflow:]...)
		}
		s.lastPressureFingerprint = current
	}

	// Hand out a copy so callers cannot mutate the scheduler's own history.
	snapshot.PressureHistory = copyFabricFlowPressureHistory(s.pressureHistory)
}
// fabricFlowPressureFingerprint builds a string key from the snapshot's
// pressure state, used to detect when that state changes.
//
// The key has the form "<level>|<score>|<action>|<reason1>,<reason2>,...",
// with the reasons kept in the order they appear in snapshot.PressureReasons.
func fabricFlowPressureFingerprint(snapshot FabricFlowSchedulerSnapshot) string {
	score := strconv.Itoa(snapshot.PressureScore)
	reasons := strings.Join(snapshot.PressureReasons, ",")
	return snapshot.PressureLevel + "|" + score + "|" + snapshot.RecommendedAction + "|" + reasons
}
// copyFabricFlowPressureHistory returns a deep copy of in.
//
// Every sample is copied by value and receives its own freshly allocated
// PressureReasons slice (non-nil even when the source slice was nil). A nil
// or empty input yields nil, so an omitempty JSON field holding the result
// stays out of the encoded output.
func copyFabricFlowPressureHistory(in []FabricFlowPressureHistorySample) []FabricFlowPressureHistorySample {
	if len(in) == 0 {
		return nil
	}
	cloned := make([]FabricFlowPressureHistorySample, len(in))
	copy(cloned, in)
	for i := range cloned {
		// After copy the reasons still alias the source; give each sample its own.
		cloned[i].PressureReasons = append([]string{}, cloned[i].PressureReasons...)
	}
	return cloned
}
func fabricFlowSchedulerPressure(snapshot FabricFlowSchedulerSnapshot) (string, int, []string) {
level := "nominal"
score := 0
@@ -805,6 +805,39 @@ func TestFabricFlowSchedulerSnapshotReportsNominalAction(t *testing.T) {
snapshot.RecommendedAction != "observe" {
t.Fatalf("nominal pressure snapshot = %+v", snapshot)
}
if len(snapshot.PressureHistory) != 1 ||
snapshot.PressureHistory[0].PressureLevel != "nominal" ||
snapshot.PressureHistory[0].RecommendedAction != "observe" {
t.Fatalf("nominal pressure history = %+v", snapshot.PressureHistory)
}
}
// TestFabricFlowSchedulerRecordsPressureHistoryTransitions checks that the
// pressure history gains one sample per change of pressure state and none when
// the state is unchanged between snapshots.
func TestFabricFlowSchedulerRecordsPressureHistoryTransitions(t *testing.T) {
	scheduler := NewFabricFlowScheduler(1, 1)

	// The first snapshot of an idle scheduler records the initial sample.
	before := scheduler.Snapshot()
	if got := before.PressureHistory; len(got) != 1 || got[0].RecommendedAction != "observe" {
		t.Fatalf("nominal pressure history = %+v", got)
	}

	// Schedule two identical packets in one call. The assertions below expect
	// this to surface as a "drops" reason at critical level — presumably the
	// second packet overflows the scheduler's queue; verify against
	// scheduleClientPackets.
	src, dst := [4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}
	packets := [][]byte{
		testIPv4TCPPacket(src, dst, 51000, 3389),
		testIPv4TCPPacket(src, dst, 51000, 3389),
	}
	scheduler.scheduleClientPackets("", "", packets)

	after := scheduler.Snapshot()
	if len(after.PressureHistory) != 2 {
		t.Fatalf("pressure history = %+v, want nominal plus critical transition", after.PressureHistory)
	}
	latest := after.PressureHistory[len(after.PressureHistory)-1]
	expected := latest.PressureLevel == "critical" &&
		latest.RecommendedAction == "shed_or_reroute" &&
		containsString(latest.PressureReasons, "drops")
	if !expected {
		t.Fatalf("last pressure history sample = %+v", latest)
	}

	// A further snapshot with no state change must not add another sample.
	repeat := scheduler.Snapshot()
	if len(repeat.PressureHistory) != 2 {
		t.Fatalf("unchanged pressure history duplicated: %+v", repeat.PressureHistory)
	}
}
func TestFabricFlowSchedulerRoundsSubMillisecondSendDuration(t *testing.T) {