Bucket VPN route switch reasons

This commit is contained in:
2026-05-16 13:19:33 +03:00
parent c97044cd34
commit 5c3b19cff7
3 changed files with 38 additions and 2 deletions
@@ -929,6 +929,24 @@ func normalizeFabricRouteSwitchReason(reason string) string {
if reason == "" {
return "route_failure"
}
for _, mapping := range []struct {
needle string
label string
}{
{needle: "context deadline exceeded", label: "timeout"},
{needle: "i/o timeout", label: "timeout"},
{needle: "connection refused", label: "connection_refused"},
{needle: "connection reset", label: "connection_reset"},
{needle: "no route to host", label: "no_route_to_host"},
{needle: "peer unavailable", label: "peer_unavailable"},
{needle: "peer is unavailable", label: "peer_unavailable"},
{needle: "next peer is unavailable", label: "peer_unavailable"},
{needle: "capacity", label: "capacity_limited"},
} {
if strings.Contains(reason, mapping.needle) {
return mapping.label
}
}
replacer := strings.NewReplacer(" ", "_", "\t", "_", "\n", "_", "\r", "_", ":", "_", ";", "_", ",", "_")
reason = replacer.Replace(reason)
for strings.Contains(reason, "__") {
@@ -1532,7 +1532,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
}
if statA.LastRecoveredFromRouteID != "route-primary" ||
statA.LastRecoveredNextHop != "relay-primary" ||
statA.LastRouteSwitchReason != "production_mesh_next_peer_is_unavailable" ||
statA.LastRouteSwitchReason != "peer_unavailable" ||
statA.RouteSwitchCount != 1 ||
statA.LastRouteFailureAt == "" ||
statA.LastRouteSwitchAt == "" ||
@@ -1541,7 +1541,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
snapshot.FlowScheduler.RouteSwitchCount != 1 ||
snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis ||
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis ||
snapshot.FlowScheduler.RouteSwitchReasonCounts["production_mesh_next_peer_is_unavailable"] != 1 {
snapshot.FlowScheduler.RouteSwitchReasonCounts["peer_unavailable"] != 1 {
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
}
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
@@ -1552,6 +1552,21 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
}
}
// TestNormalizeFabricRouteSwitchReasonBucketsCommonFailures checks that
// free-form route failure messages are collapsed into the fixed bucket labels
// and that an empty reason falls back to "route_failure".
func TestNormalizeFabricRouteSwitchReasonBucketsCommonFailures(t *testing.T) {
	// A slice rather than a map keeps the case order stable between runs, so
	// failures are reported in the same order every time.
	cases := []struct {
		input string
		want  string
	}{
		{input: "context deadline exceeded while dialing 10.0.0.1:19124", want: "timeout"},
		{input: "read tcp 10.0.0.1:19124: i/o timeout", want: "timeout"},
		{input: "dial tcp 10.0.0.1:19124: connection refused", want: "connection_refused"},
		{input: "read tcp 10.0.0.1:19124: connection reset by peer", want: "connection_reset"},
		{input: "dial tcp 10.0.0.1:19124: connect: no route to host", want: "no_route_to_host"},
		{input: "relay peer unavailable", want: "peer_unavailable"},
		{input: "production mesh next peer is unavailable", want: "peer_unavailable"},
		{input: "quic fabric stream capacity limited", want: "capacity_limited"},
		{input: "", want: "route_failure"},
	}
	for _, tc := range cases {
		got := normalizeFabricRouteSwitchReason(tc.input)
		if got != tc.want {
			// Errorf instead of Fatalf: one run lists every mismatched bucket
			// instead of stopping at the first.
			t.Errorf("normalizeFabricRouteSwitchReason(%q) = %q, want %q", tc.input, got, tc.want)
		}
	}
}
func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing.T) {
transport := &captureManyProductionTransport{}
scheduler := NewFabricFlowScheduler(8, 16)
@@ -458,6 +458,9 @@ reason counts, so load tests can distinguish peer failures, timeouts, and other
route-break causes.
`mesh-live-smoke` reports the synthetic route-recovery reason beside recovery
timing and switch count.
Common route switch reasons are bucketed into stable labels such as `timeout`,
`peer_unavailable`, `connection_refused`, `connection_reset`,
`no_route_to_host`, and `capacity_limited` to keep heartbeat cardinality bounded.
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.