diff --git a/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go b/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go index f8b5678..fe4bc6e 100644 --- a/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go +++ b/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go @@ -183,9 +183,11 @@ type fabricFlowQueue struct { LastFailedRouteID string LastFailedRoutePolicyVersion string LastFailedRouteGeneration string + LastRouteFailureAt time.Time LastRecoveredFromRouteID string LastRecoveredNextHop string LastRouteSwitchAt time.Time + LastRouteRecoveryMillis int64 RouteSwitchCount uint64 LastError string ConsecutiveFailures uint64 @@ -293,9 +295,11 @@ type FabricFlowStat struct { LastFailedRouteID string `json:"last_failed_route_id,omitempty"` LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"` LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"` + LastRouteFailureAt string `json:"last_route_failure_at,omitempty"` LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"` LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"` LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"` + LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"` RouteSwitchCount uint64 `json:"route_switch_count,omitempty"` LastError string `json:"last_error,omitempty"` ConsecutiveFailures uint64 `json:"consecutive_failures"` @@ -702,6 +706,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot { LastFailedRouteGeneration: queue.LastFailedRouteGeneration, LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID, LastRecoveredNextHop: queue.LastRecoveredNextHop, + LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis, RouteSwitchCount: queue.RouteSwitchCount, LastError: queue.LastError, ConsecutiveFailures: queue.ConsecutiveFailures, @@ -730,6 +735,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot { if !qualityStats.LastUpdatedAt.IsZero() { stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano) } + if !queue.LastRouteFailureAt.IsZero() { + stat.LastRouteFailureAt = queue.LastRouteFailureAt.UTC().Format(time.RFC3339Nano) + } if !queue.LastRouteSwitchAt.IsZero() { stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano) } @@ -755,9 +763,11 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot { LastFailedRouteID: stat.LastFailedRouteID, LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion, LastFailedRouteGeneration: stat.LastFailedRouteGeneration, + LastRouteFailureAt: stat.LastRouteFailureAt, LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID, LastRecoveredNextHop: stat.LastRecoveredNextHop, LastRouteSwitchAt: stat.LastRouteSwitchAt, + LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis, RouteSwitchCount: stat.RouteSwitchCount, LastError: stat.LastError, ConsecutiveFailures: stat.ConsecutiveFailures, @@ -1092,9 +1102,17 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string, failedRouteID := strings.TrimSpace(queue.LastFailedRouteID) failedNextHop := strings.TrimSpace(queue.LastNextHop) if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) { + switchedAt := time.Now().UTC() queue.LastRecoveredFromRouteID = failedRouteID queue.LastRecoveredNextHop = failedNextHop - queue.LastRouteSwitchAt = time.Now().UTC() + queue.LastRouteSwitchAt = switchedAt + queue.LastRouteRecoveryMillis = 0 + if !queue.LastRouteFailureAt.IsZero() { + queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds() + if queue.LastRouteRecoveryMillis < 0 { + queue.LastRouteRecoveryMillis = 0 + } + } queue.RouteSwitchCount++ } queue.LastRouteID = routeID @@ -1140,6 +1158,7 @@ func (s *FabricFlowScheduler) RecordRouteFailureWithProvenance(channelID string, queue.LastFailedRouteID = routeID queue.LastFailedRoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion) queue.LastFailedRouteGeneration = strings.TrimSpace(provenance.Generation) + queue.LastRouteFailureAt = time.Now().UTC() if fp := strings.TrimSpace(provenance.RecoveryPolicyFingerprint); fp != "" { queue.RecoveryPolicyFingerprint = fp } diff --git a/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go b/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go index 5424cc9..8a86dcc 100644 --- a/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go +++ b/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go @@ -1533,7 +1533,9 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test if statA.LastRecoveredFromRouteID != "route-primary" || statA.LastRecoveredNextHop != "relay-primary" || statA.RouteSwitchCount != 1 || + statA.LastRouteFailureAt == "" || statA.LastRouteSwitchAt == "" || + statA.LastRouteRecoveryMillis < 0 || snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 || snapshot.FlowScheduler.RouteSwitchCount != 1 { t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler) diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index aecbe18..96274cd 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -445,6 +445,8 @@ the failed route a channel recovered from, and aggregate recovered-channel / switch counts, making alternate-route recovery measurable during load tests. `mesh-live-smoke` now also exercises a primary-route failure followed by an alternate-route success and reports the resulting route switch count. +Route recovery telemetry includes failure/switch timestamps and recovery +duration in milliseconds for each recovered flow channel. Endpoint ranking treats `capacity_limited` observations as a soft pressure penalty instead of a hard recent failure, enabling load spreading without marking the carrier unhealthy.