diff --git a/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go b/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go index 1a8b5eb..6d62f59 100644 --- a/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go +++ b/agents/rap-node-agent/internal/vpnruntime/fabric_transport.go @@ -929,6 +929,24 @@ func normalizeFabricRouteSwitchReason(reason string) string { if reason == "" { return "route_failure" } + for _, mapping := range []struct { + needle string + label string + }{ + {needle: "context deadline exceeded", label: "timeout"}, + {needle: "i/o timeout", label: "timeout"}, + {needle: "connection refused", label: "connection_refused"}, + {needle: "connection reset", label: "connection_reset"}, + {needle: "no route to host", label: "no_route_to_host"}, + {needle: "peer unavailable", label: "peer_unavailable"}, + {needle: "peer is unavailable", label: "peer_unavailable"}, + {needle: "next peer is unavailable", label: "peer_unavailable"}, + {needle: "capacity", label: "capacity_limited"}, + } { + if strings.Contains(reason, mapping.needle) { + return mapping.label + } + } replacer := strings.NewReplacer(" ", "_", "\t", "_", "\n", "_", "\r", "_", ":", "_", ";", "_", ",", "_") reason = replacer.Replace(reason) for strings.Contains(reason, "__") { diff --git a/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go b/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go index c69dd6c..a04ff5f 100644 --- a/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go +++ b/agents/rap-node-agent/internal/vpnruntime/fabric_transport_test.go @@ -1532,7 +1532,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test } if statA.LastRecoveredFromRouteID != "route-primary" || statA.LastRecoveredNextHop != "relay-primary" || - statA.LastRouteSwitchReason != "production_mesh_next_peer_is_unavailable" || + statA.LastRouteSwitchReason != "peer_unavailable" || statA.RouteSwitchCount != 1 || 
statA.LastRouteFailureAt == "" || statA.LastRouteSwitchAt == "" || @@ -1541,7 +1541,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test snapshot.FlowScheduler.RouteSwitchCount != 1 || snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis || snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis || - snapshot.FlowScheduler.RouteSwitchReasonCounts["production_mesh_next_peer_is_unavailable"] != 1 { + snapshot.FlowScheduler.RouteSwitchReasonCounts["peer_unavailable"] != 1 { t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler) } if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 { @@ -1552,6 +1552,21 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test } } +func TestNormalizeFabricRouteSwitchReasonBucketsCommonFailures(t *testing.T) { + cases := map[string]string{ + "context deadline exceeded while dialing 10.0.0.1:19124": "timeout", + "dial tcp 10.0.0.1:19124: connection refused": "connection_refused", + "production mesh next peer is unavailable": "peer_unavailable", + "quic fabric stream capacity limited": "capacity_limited", + "": "route_failure", + } + for input, want := range cases { + if got := normalizeFabricRouteSwitchReason(input); got != want { + t.Fatalf("normalizeFabricRouteSwitchReason(%q) = %q, want %q", input, got, want) + } + } +} + func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing.T) { transport := &captureManyProductionTransport{} scheduler := NewFabricFlowScheduler(8, 16) diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index db7c926..8d18a0e 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -458,6 +458,9 @@ reason counts, so load tests can 
distinguish peer failures, timeouts, and other route-break causes. `mesh-live-smoke` reports the synthetic route-recovery reason beside recovery timing and switch count. +Common route-switch reasons are bucketed into stable labels such as `timeout`, +`peer_unavailable`, `connection_refused`, `connection_reset`, `no_route_to_host`, and +`capacity_limited` to keep heartbeat cardinality bounded. Endpoint ranking treats `capacity_limited` observations as a soft pressure penalty instead of a hard recent failure, enabling load spreading without marking the carrier unhealthy.