Track VPN pressure history

This commit is contained in:
2026-05-16 13:47:42 +03:00
parent 6a46063565
commit 8e9402580f
5 changed files with 171 additions and 51 deletions
@@ -910,6 +910,7 @@ func vpnFabricFlowPressureReport(snapshot vpnruntime.FabricFlowSchedulerSnapshot
"pressure_score": snapshot.PressureScore,
"pressure_reasons": append([]string{}, snapshot.PressureReasons...),
"recommended_action": snapshot.RecommendedAction,
"pressure_history": copyFabricFlowPressureHistory(snapshot.PressureHistory),
"backpressure_active": snapshot.BackpressureActive,
"bulk_pressure_active": snapshot.BulkPressureActive,
"bulk_pressure_channel_count": snapshot.BulkPressureChannelCount,
@@ -936,6 +937,18 @@ func vpnFabricFlowPressureReport(snapshot vpnruntime.FabricFlowSchedulerSnapshot
return report
}
// copyFabricFlowPressureHistory returns a deep copy of in for use in the flow
// pressure report: every sample is copied and given its own PressureReasons
// slice, so the result shares no slice storage with the input.
//
// The result is never nil; an empty or nil input yields an empty slice.
// NOTE(review): presumably this keeps the report's "pressure_history" entry
// serializing as [] rather than null — confirm against report consumers.
func copyFabricFlowPressureHistory(in []vpnruntime.FabricFlowPressureHistorySample) []vpnruntime.FabricFlowPressureHistorySample {
	// make with a zero length still produces a non-nil slice, which covers
	// the empty-input case without a separate branch.
	cloned := make([]vpnruntime.FabricFlowPressureHistorySample, len(in))
	for i := range in {
		cloned[i] = in[i]
		// Detach the reasons slice from the source sample's backing array.
		cloned[i].PressureReasons = append([]string{}, in[i].PressureReasons...)
	}
	return cloned
}
func copyStringIntMap(in map[string]int) map[string]int {
if len(in) == 0 {
return map[string]int{}
@@ -1260,12 +1260,25 @@ func TestVPNFabricFlowPressureReportIncludesRecommendedAction(t *testing.T) {
RouteRecoveredChannelCount: 0,
RouteRecoveryMaxMillis: 0,
RouteRecoveryAvgMillis: 0,
PressureHistory: []vpnruntime.FabricFlowPressureHistorySample{
{
ObservedAt: "2026-05-16T12:00:00Z",
PressureLevel: "warning",
PressureScore: 35,
PressureReasons: []string{"bulk_pressure"},
RecommendedAction: "throttle_bulk",
},
},
})
if report["recommended_action"] != "throttle_bulk" ||
report["pressure_score"] != 35 ||
report["bulk_pressure_channel_count"] != 16 {
t.Fatalf("unexpected flow pressure report: %+v", report)
}
history, ok := report["pressure_history"].([]vpnruntime.FabricFlowPressureHistorySample)
if !ok || len(history) != 1 || history[0].RecommendedAction != "throttle_bulk" {
t.Fatalf("unexpected flow pressure history: %+v", report["pressure_history"])
}
}
func TestMergedEndpointCandidateObservationsKeepsNewest(t *testing.T) {
@@ -7,6 +7,7 @@ import (
"fmt"
"hash/fnv"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
@@ -26,6 +27,7 @@ const (
defaultFabricFlowFailureThreshold = 2
defaultFabricFlowSlowSendThreshold = 2 * time.Second
defaultFabricRouteQualitySwitchThreshold = 30
defaultFabricFlowPressureHistoryCapacity = 8
)
type FabricPacketTransport struct {
@@ -141,6 +143,8 @@ type FabricFlowScheduler struct {
highWatermark int
inFlight int
maxInFlight int
pressureHistory []FabricFlowPressureHistorySample
lastPressureFingerprint string
}
type FabricServiceChannelAdaptivePolicy struct {
@@ -256,6 +260,7 @@ type FabricFlowSchedulerSnapshot struct {
PressureScore int `json:"pressure_score,omitempty"`
PressureReasons []string `json:"pressure_reasons,omitempty"`
RecommendedAction string `json:"recommended_action,omitempty"`
PressureHistory []FabricFlowPressureHistorySample `json:"pressure_history,omitempty"`
InFlight int `json:"in_flight"`
MaxInFlight int `json:"max_in_flight"`
AdaptiveBackpressureActive bool `json:"adaptive_backpressure_active,omitempty"`
@@ -281,6 +286,14 @@ type FabricFlowSchedulerSnapshot struct {
ChannelStats map[string]FabricFlowStat `json:"channel_stats"`
}
// FabricFlowPressureHistorySample is one entry in a flow scheduler's pressure
// history. It captures the pressure assessment (level, score, reasons and
// recommended action) together with the time it was observed.
type FabricFlowPressureHistorySample struct {
	// ObservedAt is the observation time as a string. The scheduler's
	// recordPressureHistoryLocked writes it in UTC using time.RFC3339Nano.
	ObservedAt string `json:"observed_at"`
	// PressureLevel is the pressure level name reported by the snapshot
	// (for example "nominal").
	PressureLevel string `json:"pressure_level"`
	// PressureScore is the numeric pressure score reported by the snapshot.
	PressureScore int `json:"pressure_score"`
	// PressureReasons lists the reasons behind the level; it is omitted from
	// JSON when empty.
	PressureReasons []string `json:"pressure_reasons,omitempty"`
	// RecommendedAction is the action recommended for this pressure state
	// (for example "observe").
	RecommendedAction string `json:"recommended_action"`
}
type FabricFlowStat struct {
TrafficClass string `json:"traffic_class,omitempty"`
Depth int `json:"depth"`
@@ -866,9 +879,53 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
}
snapshot.PressureLevel, snapshot.PressureScore, snapshot.PressureReasons = fabricFlowSchedulerPressure(snapshot)
snapshot.RecommendedAction = fabricFlowSchedulerRecommendedAction(snapshot)
s.recordPressureHistoryLocked(&snapshot, time.Now())
return snapshot
}
// recordPressureHistoryLocked updates the scheduler's pressure history from
// snapshot and then attaches a deep copy of that history to snapshot.
//
// A new sample, stamped with observedAt in UTC (time.RFC3339Nano), is appended
// only when the snapshot's pressure fingerprint differs from the last recorded
// one, so repeated snapshots of an unchanged state add nothing. The history is
// capped at defaultFabricFlowPressureHistoryCapacity entries, keeping the most
// recent ones. The call is a no-op when s or snapshot is nil.
//
// NOTE(review): the "Locked" suffix suggests the caller must already hold the
// scheduler's lock; because this method mutates scheduler state, confirm that
// the lock held by callers is exclusive rather than a read lock.
func (s *FabricFlowScheduler) recordPressureHistoryLocked(snapshot *FabricFlowSchedulerSnapshot, observedAt time.Time) {
	if s == nil || snapshot == nil {
		return
	}

	current := fabricFlowPressureFingerprint(*snapshot)
	if current != s.lastPressureFingerprint {
		entry := FabricFlowPressureHistorySample{
			ObservedAt:        observedAt.UTC().Format(time.RFC3339Nano),
			PressureLevel:     snapshot.PressureLevel,
			PressureScore:     snapshot.PressureScore,
			PressureReasons:   append([]string{}, snapshot.PressureReasons...),
			RecommendedAction: snapshot.RecommendedAction,
		}
		s.pressureHistory = append(s.pressureHistory, entry)

		// Drop the oldest samples once over capacity. The tail is copied into
		// a fresh slice so the old backing array is not kept alive.
		if excess := len(s.pressureHistory) - defaultFabricFlowPressureHistoryCapacity; excess > 0 {
			s.pressureHistory = append([]FabricFlowPressureHistorySample{}, s.pressureHistory[excess:]...)
		}
		s.lastPressureFingerprint = current
	}

	// The snapshot gets its own copy so callers cannot alias scheduler state.
	snapshot.PressureHistory = copyFabricFlowPressureHistory(s.pressureHistory)
}
// fabricFlowPressureFingerprint builds a string key identifying the pressure
// state carried by snapshot. The key is the pressure level, score, recommended
// action and reasons, in that order, separated by "|"; the reasons themselves
// are joined with ",". Two snapshots with the same four values produce the
// same key.
func fabricFlowPressureFingerprint(snapshot FabricFlowSchedulerSnapshot) string {
	score := strconv.Itoa(snapshot.PressureScore)
	reasons := strings.Join(snapshot.PressureReasons, ",")
	return snapshot.PressureLevel + "|" + score + "|" + snapshot.RecommendedAction + "|" + reasons
}
// copyFabricFlowPressureHistory returns a deep copy of in: each sample is
// copied and given its own PressureReasons slice, so the result shares no
// slice storage with the input.
//
// An empty or nil input yields nil, which lets a `json:",omitempty"` field
// holding the result be left out of the encoded output.
func copyFabricFlowPressureHistory(in []FabricFlowPressureHistorySample) []FabricFlowPressureHistorySample {
	if len(in) == 0 {
		return nil
	}
	cloned := make([]FabricFlowPressureHistorySample, len(in))
	for i := range in {
		cloned[i] = in[i]
		// Detach the reasons slice from the source sample's backing array.
		cloned[i].PressureReasons = append([]string{}, in[i].PressureReasons...)
	}
	return cloned
}
func fabricFlowSchedulerPressure(snapshot FabricFlowSchedulerSnapshot) (string, int, []string) {
level := "nominal"
score := 0
@@ -805,6 +805,39 @@ func TestFabricFlowSchedulerSnapshotReportsNominalAction(t *testing.T) {
snapshot.RecommendedAction != "observe" {
t.Fatalf("nominal pressure snapshot = %+v", snapshot)
}
if len(snapshot.PressureHistory) != 1 ||
snapshot.PressureHistory[0].PressureLevel != "nominal" ||
snapshot.PressureHistory[0].RecommendedAction != "observe" {
t.Fatalf("nominal pressure history = %+v", snapshot.PressureHistory)
}
}
// TestFabricFlowSchedulerRecordsPressureHistoryTransitions checks that the
// scheduler's pressure history gains an entry when the pressure state changes
// and stays the same length when a later snapshot reports an unchanged state.
func TestFabricFlowSchedulerRecordsPressureHistoryTransitions(t *testing.T) {
	sched := NewFabricFlowScheduler(1, 1)

	// The first snapshot of a fresh scheduler must record one "observe" entry.
	baseline := sched.Snapshot().PressureHistory
	if len(baseline) != 1 || baseline[0].RecommendedAction != "observe" {
		t.Fatalf("nominal pressure history = %+v", baseline)
	}

	// Schedule two identical packets; the test expects this to push the
	// scheduler into a critical state that reports drops.
	packets := [][]byte{
		testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389),
		testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389),
	}
	sched.scheduleClientPackets("", "", packets)

	afterLoad := sched.Snapshot().PressureHistory
	if len(afterLoad) != 2 {
		t.Fatalf("pressure history = %+v, want nominal plus critical transition", afterLoad)
	}
	newest := afterLoad[len(afterLoad)-1]
	escalated := newest.PressureLevel == "critical" &&
		newest.RecommendedAction == "shed_or_reroute" &&
		containsString(newest.PressureReasons, "drops")
	if !escalated {
		t.Fatalf("last pressure history sample = %+v", newest)
	}

	// A further snapshot with no state change must not add a duplicate entry.
	repeated := sched.Snapshot().PressureHistory
	if len(repeated) != 2 {
		t.Fatalf("unchanged pressure history duplicated: %+v", repeated)
	}
}
func TestFabricFlowSchedulerRoundsSubMillisecondSendDuration(t *testing.T) {
@@ -479,6 +479,10 @@ The `flow_pressure` summary includes a `recommended_action` such as
contract, so heartbeat reports and smoke diagnostics consume the same runtime
decision.
The scheduler's nominal snapshot explicitly reports the `observe` action.
Flow-scheduler snapshots keep a bounded history of the most recent pressure
transitions, each recording the observation time, level, score, reasons, and
recommended action. Repeated snapshots do not duplicate unchanged pressure
states, so controllers can distinguish the current state from recent worsening
or recovery without unbounded heartbeat growth.
`mesh-live-smoke` reports the recommended action for its mixed bulk/interactive
load scenario.
Nodes advertise the `vpn_fabric_flow_pressure` capability when that heartbeat