From d5c089d120b9b23a7948e04b2c138bfc455ab3b5 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Sat, 16 May 2026 13:07:42 +0300 Subject: [PATCH] Expose VPN route recovery telemetry --- .../internal/vpnruntime/fabric_transport.go | 32 +++++++++++++++++++ .../vpnruntime/fabric_transport_test.go | 8 +++++ .../DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md | 3 ++ 3 files changed, 43 insertions(+) diff --git a/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go b/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go index e1be09b..f8b5678 100644 --- a/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go +++ b/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go @@ -183,6 +183,10 @@ type fabricFlowQueue struct { LastFailedRouteID string LastFailedRoutePolicyVersion string LastFailedRouteGeneration string + LastRecoveredFromRouteID string + LastRecoveredNextHop string + LastRouteSwitchAt time.Time + RouteSwitchCount uint64 LastError string ConsecutiveFailures uint64 StallCount uint64 @@ -254,6 +258,8 @@ type FabricFlowSchedulerSnapshot struct { BulkPressureActive bool `json:"bulk_pressure_active,omitempty"` BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"` InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"` + RouteRecoveredChannelCount int `json:"route_recovered_channel_count,omitempty"` + RouteSwitchCount uint64 `json:"route_switch_count,omitempty"` SlowChannelCount int `json:"slow_channel_count"` FailingChannelCount int `json:"failing_channel_count"` QualityWindowSampleCount int `json:"quality_window_sample_count"` @@ -287,6 +293,10 @@ type FabricFlowStat struct { LastFailedRouteID string `json:"last_failed_route_id,omitempty"` LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"` LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"` + LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"` + LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"` + LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"` + RouteSwitchCount uint64 `json:"route_switch_count,omitempty"` LastError string `json:"last_error,omitempty"` ConsecutiveFailures uint64 `json:"consecutive_failures"` StallCount uint64 `json:"stall_count"` @@ -690,6 +700,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot { LastFailedRouteID: queue.LastFailedRouteID, LastFailedRoutePolicyVersion: queue.LastFailedRoutePolicyVersion, LastFailedRouteGeneration: queue.LastFailedRouteGeneration, + LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID, + LastRecoveredNextHop: queue.LastRecoveredNextHop, + RouteSwitchCount: queue.RouteSwitchCount, LastError: queue.LastError, ConsecutiveFailures: queue.ConsecutiveFailures, StallCount: queue.StallCount, @@ -717,6 +730,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot { if !qualityStats.LastUpdatedAt.IsZero() { stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano) } + if !queue.LastRouteSwitchAt.IsZero() { + stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano) + } snapshot.ChannelStats[channelID] = FabricFlowStat{ Depth: stat.Depth, TrafficClass: stat.TrafficClass, @@ -739,6 +755,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot { LastFailedRouteID: stat.LastFailedRouteID, LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion, LastFailedRouteGeneration: stat.LastFailedRouteGeneration, + LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID, + LastRecoveredNextHop: stat.LastRecoveredNextHop, + LastRouteSwitchAt: stat.LastRouteSwitchAt, + RouteSwitchCount: stat.RouteSwitchCount, LastError: stat.LastError, ConsecutiveFailures: stat.ConsecutiveFailures, StallCount: stat.StallCount, @@ -765,6 +785,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot { snapshot.QualityWindowFailureCount += qualityStats.FailureCount snapshot.QualityWindowSlowCount += qualityStats.SlowCount snapshot.QualityWindowDropCount += qualityStats.DropCount + snapshot.RouteSwitchCount += queue.RouteSwitchCount + if queue.LastRecoveredFromRouteID != "" { + snapshot.RouteRecoveredChannelCount++ + } if queue.Depth >= s.queueCapacity || qualityStats.DropCount > 0 { snapshot.BackpressureActive = true } @@ -1065,6 +1089,14 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string, s.mu.Lock() defer s.mu.Unlock() queue := s.ensureQueueLocked(channelID) + failedRouteID := strings.TrimSpace(queue.LastFailedRouteID) + failedNextHop := strings.TrimSpace(queue.LastNextHop) + if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) { + queue.LastRecoveredFromRouteID = failedRouteID + queue.LastRecoveredNextHop = failedNextHop + queue.LastRouteSwitchAt = time.Now().UTC() + queue.RouteSwitchCount++ + } queue.LastRouteID = routeID queue.RoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion) queue.RouteGeneration = strings.TrimSpace(provenance.Generation) diff --git a/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go b/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go index f4fec25..5424cc9 100644 --- a/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go +++ b/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go @@ -1530,6 +1530,14 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test if statA.LastRouteID != "route-alternate" || statA.LastFailedRouteID != "" || statA.ConsecutiveFailures != 0 { t.Fatalf("channel A stat = %+v, want recovered on alternate route", statA) } + if statA.LastRecoveredFromRouteID != "route-primary" || + statA.LastRecoveredNextHop != "relay-primary" || + statA.RouteSwitchCount != 1 || + statA.LastRouteSwitchAt == "" || + snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 || + snapshot.FlowScheduler.RouteSwitchCount != 1 { + t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler) + } if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 { t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB) } diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 7c7ae4a..0117ceb 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -440,6 +440,9 @@ bulk and interactive/control channel counts, making mixed browser/RDP load diagnosis explicit when bulk windows are reduced to protect interactive traffic. `mesh-live-smoke` now exercises that mixed-load scheduler path and reports bulk pressure activation plus bulk/interactive window recommendations. +Flow-scheduler route recovery telemetry now records per-channel route switches, +the failed route a channel recovered from, and aggregate recovered-channel / +switch counts, making alternate-route recovery measurable during load tests. Endpoint ranking treats `capacity_limited` observations as a soft pressure penalty instead of a hard recent failure, enabling load spreading without marking the carrier unhealthy.