Track VPN pressure history
This commit is contained in:
@@ -910,6 +910,7 @@ func vpnFabricFlowPressureReport(snapshot vpnruntime.FabricFlowSchedulerSnapshot
|
|||||||
"pressure_score": snapshot.PressureScore,
|
"pressure_score": snapshot.PressureScore,
|
||||||
"pressure_reasons": append([]string{}, snapshot.PressureReasons...),
|
"pressure_reasons": append([]string{}, snapshot.PressureReasons...),
|
||||||
"recommended_action": snapshot.RecommendedAction,
|
"recommended_action": snapshot.RecommendedAction,
|
||||||
|
"pressure_history": copyFabricFlowPressureHistory(snapshot.PressureHistory),
|
||||||
"backpressure_active": snapshot.BackpressureActive,
|
"backpressure_active": snapshot.BackpressureActive,
|
||||||
"bulk_pressure_active": snapshot.BulkPressureActive,
|
"bulk_pressure_active": snapshot.BulkPressureActive,
|
||||||
"bulk_pressure_channel_count": snapshot.BulkPressureChannelCount,
|
"bulk_pressure_channel_count": snapshot.BulkPressureChannelCount,
|
||||||
@@ -936,6 +937,18 @@ func vpnFabricFlowPressureReport(snapshot vpnruntime.FabricFlowSchedulerSnapshot
|
|||||||
return report
|
return report
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func copyFabricFlowPressureHistory(in []vpnruntime.FabricFlowPressureHistorySample) []vpnruntime.FabricFlowPressureHistorySample {
|
||||||
|
if len(in) == 0 {
|
||||||
|
return []vpnruntime.FabricFlowPressureHistorySample{}
|
||||||
|
}
|
||||||
|
out := make([]vpnruntime.FabricFlowPressureHistorySample, 0, len(in))
|
||||||
|
for _, sample := range in {
|
||||||
|
sample.PressureReasons = append([]string{}, sample.PressureReasons...)
|
||||||
|
out = append(out, sample)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func copyStringIntMap(in map[string]int) map[string]int {
|
func copyStringIntMap(in map[string]int) map[string]int {
|
||||||
if len(in) == 0 {
|
if len(in) == 0 {
|
||||||
return map[string]int{}
|
return map[string]int{}
|
||||||
|
|||||||
@@ -1260,12 +1260,25 @@ func TestVPNFabricFlowPressureReportIncludesRecommendedAction(t *testing.T) {
|
|||||||
RouteRecoveredChannelCount: 0,
|
RouteRecoveredChannelCount: 0,
|
||||||
RouteRecoveryMaxMillis: 0,
|
RouteRecoveryMaxMillis: 0,
|
||||||
RouteRecoveryAvgMillis: 0,
|
RouteRecoveryAvgMillis: 0,
|
||||||
|
PressureHistory: []vpnruntime.FabricFlowPressureHistorySample{
|
||||||
|
{
|
||||||
|
ObservedAt: "2026-05-16T12:00:00Z",
|
||||||
|
PressureLevel: "warning",
|
||||||
|
PressureScore: 35,
|
||||||
|
PressureReasons: []string{"bulk_pressure"},
|
||||||
|
RecommendedAction: "throttle_bulk",
|
||||||
|
},
|
||||||
|
},
|
||||||
})
|
})
|
||||||
if report["recommended_action"] != "throttle_bulk" ||
|
if report["recommended_action"] != "throttle_bulk" ||
|
||||||
report["pressure_score"] != 35 ||
|
report["pressure_score"] != 35 ||
|
||||||
report["bulk_pressure_channel_count"] != 16 {
|
report["bulk_pressure_channel_count"] != 16 {
|
||||||
t.Fatalf("unexpected flow pressure report: %+v", report)
|
t.Fatalf("unexpected flow pressure report: %+v", report)
|
||||||
}
|
}
|
||||||
|
history, ok := report["pressure_history"].([]vpnruntime.FabricFlowPressureHistorySample)
|
||||||
|
if !ok || len(history) != 1 || history[0].RecommendedAction != "throttle_bulk" {
|
||||||
|
t.Fatalf("unexpected flow pressure history: %+v", report["pressure_history"])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMergedEndpointCandidateObservationsKeepsNewest(t *testing.T) {
|
func TestMergedEndpointCandidateObservationsKeepsNewest(t *testing.T) {
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"hash/fnv"
|
"hash/fnv"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
@@ -26,6 +27,7 @@ const (
|
|||||||
defaultFabricFlowFailureThreshold = 2
|
defaultFabricFlowFailureThreshold = 2
|
||||||
defaultFabricFlowSlowSendThreshold = 2 * time.Second
|
defaultFabricFlowSlowSendThreshold = 2 * time.Second
|
||||||
defaultFabricRouteQualitySwitchThreshold = 30
|
defaultFabricRouteQualitySwitchThreshold = 30
|
||||||
|
defaultFabricFlowPressureHistoryCapacity = 8
|
||||||
)
|
)
|
||||||
|
|
||||||
type FabricPacketTransport struct {
|
type FabricPacketTransport struct {
|
||||||
@@ -141,6 +143,8 @@ type FabricFlowScheduler struct {
|
|||||||
highWatermark int
|
highWatermark int
|
||||||
inFlight int
|
inFlight int
|
||||||
maxInFlight int
|
maxInFlight int
|
||||||
|
pressureHistory []FabricFlowPressureHistorySample
|
||||||
|
lastPressureFingerprint string
|
||||||
}
|
}
|
||||||
|
|
||||||
type FabricServiceChannelAdaptivePolicy struct {
|
type FabricServiceChannelAdaptivePolicy struct {
|
||||||
@@ -256,6 +260,7 @@ type FabricFlowSchedulerSnapshot struct {
|
|||||||
PressureScore int `json:"pressure_score,omitempty"`
|
PressureScore int `json:"pressure_score,omitempty"`
|
||||||
PressureReasons []string `json:"pressure_reasons,omitempty"`
|
PressureReasons []string `json:"pressure_reasons,omitempty"`
|
||||||
RecommendedAction string `json:"recommended_action,omitempty"`
|
RecommendedAction string `json:"recommended_action,omitempty"`
|
||||||
|
PressureHistory []FabricFlowPressureHistorySample `json:"pressure_history,omitempty"`
|
||||||
InFlight int `json:"in_flight"`
|
InFlight int `json:"in_flight"`
|
||||||
MaxInFlight int `json:"max_in_flight"`
|
MaxInFlight int `json:"max_in_flight"`
|
||||||
AdaptiveBackpressureActive bool `json:"adaptive_backpressure_active,omitempty"`
|
AdaptiveBackpressureActive bool `json:"adaptive_backpressure_active,omitempty"`
|
||||||
@@ -281,6 +286,14 @@ type FabricFlowSchedulerSnapshot struct {
|
|||||||
ChannelStats map[string]FabricFlowStat `json:"channel_stats"`
|
ChannelStats map[string]FabricFlowStat `json:"channel_stats"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type FabricFlowPressureHistorySample struct {
|
||||||
|
ObservedAt string `json:"observed_at"`
|
||||||
|
PressureLevel string `json:"pressure_level"`
|
||||||
|
PressureScore int `json:"pressure_score"`
|
||||||
|
PressureReasons []string `json:"pressure_reasons,omitempty"`
|
||||||
|
RecommendedAction string `json:"recommended_action"`
|
||||||
|
}
|
||||||
|
|
||||||
type FabricFlowStat struct {
|
type FabricFlowStat struct {
|
||||||
TrafficClass string `json:"traffic_class,omitempty"`
|
TrafficClass string `json:"traffic_class,omitempty"`
|
||||||
Depth int `json:"depth"`
|
Depth int `json:"depth"`
|
||||||
@@ -866,9 +879,53 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
}
|
}
|
||||||
snapshot.PressureLevel, snapshot.PressureScore, snapshot.PressureReasons = fabricFlowSchedulerPressure(snapshot)
|
snapshot.PressureLevel, snapshot.PressureScore, snapshot.PressureReasons = fabricFlowSchedulerPressure(snapshot)
|
||||||
snapshot.RecommendedAction = fabricFlowSchedulerRecommendedAction(snapshot)
|
snapshot.RecommendedAction = fabricFlowSchedulerRecommendedAction(snapshot)
|
||||||
|
s.recordPressureHistoryLocked(&snapshot, time.Now())
|
||||||
return snapshot
|
return snapshot
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *FabricFlowScheduler) recordPressureHistoryLocked(snapshot *FabricFlowSchedulerSnapshot, observedAt time.Time) {
|
||||||
|
if s == nil || snapshot == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fingerprint := fabricFlowPressureFingerprint(*snapshot)
|
||||||
|
if fingerprint != s.lastPressureFingerprint {
|
||||||
|
s.pressureHistory = append(s.pressureHistory, FabricFlowPressureHistorySample{
|
||||||
|
ObservedAt: observedAt.UTC().Format(time.RFC3339Nano),
|
||||||
|
PressureLevel: snapshot.PressureLevel,
|
||||||
|
PressureScore: snapshot.PressureScore,
|
||||||
|
PressureReasons: append([]string{}, snapshot.PressureReasons...),
|
||||||
|
RecommendedAction: snapshot.RecommendedAction,
|
||||||
|
})
|
||||||
|
if len(s.pressureHistory) > defaultFabricFlowPressureHistoryCapacity {
|
||||||
|
start := len(s.pressureHistory) - defaultFabricFlowPressureHistoryCapacity
|
||||||
|
s.pressureHistory = append([]FabricFlowPressureHistorySample{}, s.pressureHistory[start:]...)
|
||||||
|
}
|
||||||
|
s.lastPressureFingerprint = fingerprint
|
||||||
|
}
|
||||||
|
snapshot.PressureHistory = copyFabricFlowPressureHistory(s.pressureHistory)
|
||||||
|
}
|
||||||
|
|
||||||
|
func fabricFlowPressureFingerprint(snapshot FabricFlowSchedulerSnapshot) string {
|
||||||
|
return strings.Join([]string{
|
||||||
|
snapshot.PressureLevel,
|
||||||
|
strconv.Itoa(snapshot.PressureScore),
|
||||||
|
snapshot.RecommendedAction,
|
||||||
|
strings.Join(snapshot.PressureReasons, ","),
|
||||||
|
}, "|")
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyFabricFlowPressureHistory(in []FabricFlowPressureHistorySample) []FabricFlowPressureHistorySample {
|
||||||
|
if len(in) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
out := make([]FabricFlowPressureHistorySample, 0, len(in))
|
||||||
|
for _, sample := range in {
|
||||||
|
sample.PressureReasons = append([]string{}, sample.PressureReasons...)
|
||||||
|
out = append(out, sample)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func fabricFlowSchedulerPressure(snapshot FabricFlowSchedulerSnapshot) (string, int, []string) {
|
func fabricFlowSchedulerPressure(snapshot FabricFlowSchedulerSnapshot) (string, int, []string) {
|
||||||
level := "nominal"
|
level := "nominal"
|
||||||
score := 0
|
score := 0
|
||||||
|
|||||||
@@ -805,6 +805,39 @@ func TestFabricFlowSchedulerSnapshotReportsNominalAction(t *testing.T) {
|
|||||||
snapshot.RecommendedAction != "observe" {
|
snapshot.RecommendedAction != "observe" {
|
||||||
t.Fatalf("nominal pressure snapshot = %+v", snapshot)
|
t.Fatalf("nominal pressure snapshot = %+v", snapshot)
|
||||||
}
|
}
|
||||||
|
if len(snapshot.PressureHistory) != 1 ||
|
||||||
|
snapshot.PressureHistory[0].PressureLevel != "nominal" ||
|
||||||
|
snapshot.PressureHistory[0].RecommendedAction != "observe" {
|
||||||
|
t.Fatalf("nominal pressure history = %+v", snapshot.PressureHistory)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFabricFlowSchedulerRecordsPressureHistoryTransitions(t *testing.T) {
|
||||||
|
scheduler := NewFabricFlowScheduler(1, 1)
|
||||||
|
nominal := scheduler.Snapshot()
|
||||||
|
if len(nominal.PressureHistory) != 1 || nominal.PressureHistory[0].RecommendedAction != "observe" {
|
||||||
|
t.Fatalf("nominal pressure history = %+v", nominal.PressureHistory)
|
||||||
|
}
|
||||||
|
|
||||||
|
packetA := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
|
||||||
|
packetB := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
|
||||||
|
scheduler.scheduleClientPackets("", "", [][]byte{packetA, packetB})
|
||||||
|
|
||||||
|
pressure := scheduler.Snapshot()
|
||||||
|
if len(pressure.PressureHistory) != 2 {
|
||||||
|
t.Fatalf("pressure history = %+v, want nominal plus critical transition", pressure.PressureHistory)
|
||||||
|
}
|
||||||
|
last := pressure.PressureHistory[len(pressure.PressureHistory)-1]
|
||||||
|
if last.PressureLevel != "critical" ||
|
||||||
|
last.RecommendedAction != "shed_or_reroute" ||
|
||||||
|
!containsString(last.PressureReasons, "drops") {
|
||||||
|
t.Fatalf("last pressure history sample = %+v", last)
|
||||||
|
}
|
||||||
|
|
||||||
|
unchanged := scheduler.Snapshot()
|
||||||
|
if len(unchanged.PressureHistory) != 2 {
|
||||||
|
t.Fatalf("unchanged pressure history duplicated: %+v", unchanged.PressureHistory)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFabricFlowSchedulerRoundsSubMillisecondSendDuration(t *testing.T) {
|
func TestFabricFlowSchedulerRoundsSubMillisecondSendDuration(t *testing.T) {
|
||||||
|
|||||||
@@ -479,6 +479,10 @@ The `flow_pressure` summary includes a `recommended_action` such as
|
|||||||
contract, so heartbeat reports and smoke diagnostics consume the same runtime
|
contract, so heartbeat reports and smoke diagnostics consume the same runtime
|
||||||
decision.
|
decision.
|
||||||
The scheduler's nominal snapshot explicitly reports the `observe` action.
|
The scheduler's nominal snapshot explicitly reports the `observe` action.
|
||||||
|
Flow-scheduler snapshots keep a bounded pressure transition history with the
|
||||||
|
observed level, score, reasons, and recommended action. Repeated snapshots do
|
||||||
|
not duplicate unchanged pressure states, so controllers can distinguish current
|
||||||
|
state from recent worsening or recovery without unbounded heartbeat growth.
|
||||||
`mesh-live-smoke` reports the recommended action for its mixed bulk/interactive
|
`mesh-live-smoke` reports the recommended action for its mixed bulk/interactive
|
||||||
load scenario.
|
load scenario.
|
||||||
Nodes advertise the `vpn_fabric_flow_pressure` capability when that heartbeat
|
Nodes advertise the `vpn_fabric_flow_pressure` capability when that heartbeat
|
||||||
|
|||||||
Reference in New Issue
Block a user