diff --git a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go index bd91e57..bc6171f 100644 --- a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go +++ b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go @@ -51,6 +51,8 @@ type smokeReport struct { FabricVPNRouteRecovered bool `json:"fabric_vpn_route_recovered"` FabricVPNRouteSwitches uint64 `json:"fabric_vpn_route_switch_count"` FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"` + FabricVPNRecoveryMaxMS int64 `json:"fabric_vpn_route_recovery_max_ms"` + FabricVPNRecoveryAvgMS int64 `json:"fabric_vpn_route_recovery_avg_ms"` FabricQUICAccepted bool `json:"fabric_quic_accepted"` FabricQUICEndpoint string `json:"fabric_quic_endpoint"` FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"` @@ -155,7 +157,7 @@ func run(ctx context.Context) (smokeReport, error) { return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) } fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure() - fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS := smokeVPNFlowSchedulerRouteRecovery() + fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS := smokeVPNFlowSchedulerRouteRecovery() fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) if err != nil { return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err) @@ -184,6 +186,8 @@ func run(ctx context.Context) (smokeReport, error) { FabricVPNRouteRecovered: fabricVPNRouteRecovered, FabricVPNRouteSwitches: fabricVPNRouteSwitches, FabricVPNRecoveryMS: fabricVPNRecoveryMS, + FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS, + FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS, FabricQUICAccepted: fabricQUICAccepted, FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICPressure: fabricQUICPressure, @@ -221,7 +225,7 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) { snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive] } -func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64) { +func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64) { scheduler := vpnruntime.NewFabricFlowScheduler(8, 16) channelID := "vpn-smoke-flow-0" scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond) @@ -233,7 +237,9 @@ func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64) { stat.LastRouteID == "route-alternate" && snapshot.RouteRecoveredChannelCount == 1, snapshot.RouteSwitchCount, - stat.LastRouteRecoveryMillis + stat.LastRouteRecoveryMillis, + snapshot.RouteRecoveryMaxMillis, + snapshot.RouteRecoveryAvgMillis } func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 54abc80..0b1b1e1 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -447,6 +447,8 @@ switch counts, making alternate-route recovery measurable during load tests. alternate-route success and reports the resulting route switch count. The same smoke output reports measured route recovery milliseconds for the synthetic failover path. +Smoke now includes max/average route recovery timing from the scheduler +aggregate snapshot as well. Route recovery telemetry includes failure/switch timestamps and recovery duration in milliseconds for each recovered flow channel. Scheduler snapshots also aggregate route recovery max/average milliseconds