Bucket VPN route switch reasons
This commit is contained in:
@@ -929,6 +929,24 @@ func normalizeFabricRouteSwitchReason(reason string) string {
|
|||||||
if reason == "" {
|
if reason == "" {
|
||||||
return "route_failure"
|
return "route_failure"
|
||||||
}
|
}
|
||||||
|
for _, mapping := range []struct {
|
||||||
|
needle string
|
||||||
|
label string
|
||||||
|
}{
|
||||||
|
{needle: "context deadline exceeded", label: "timeout"},
|
||||||
|
{needle: "i/o timeout", label: "timeout"},
|
||||||
|
{needle: "connection refused", label: "connection_refused"},
|
||||||
|
{needle: "connection reset", label: "connection_reset"},
|
||||||
|
{needle: "no route to host", label: "no_route_to_host"},
|
||||||
|
{needle: "peer unavailable", label: "peer_unavailable"},
|
||||||
|
{needle: "peer is unavailable", label: "peer_unavailable"},
|
||||||
|
{needle: "next peer is unavailable", label: "peer_unavailable"},
|
||||||
|
{needle: "capacity", label: "capacity_limited"},
|
||||||
|
} {
|
||||||
|
if strings.Contains(reason, mapping.needle) {
|
||||||
|
return mapping.label
|
||||||
|
}
|
||||||
|
}
|
||||||
replacer := strings.NewReplacer(" ", "_", "\t", "_", "\n", "_", "\r", "_", ":", "_", ";", "_", ",", "_")
|
replacer := strings.NewReplacer(" ", "_", "\t", "_", "\n", "_", "\r", "_", ":", "_", ";", "_", ",", "_")
|
||||||
reason = replacer.Replace(reason)
|
reason = replacer.Replace(reason)
|
||||||
for strings.Contains(reason, "__") {
|
for strings.Contains(reason, "__") {
|
||||||
|
|||||||
@@ -1532,7 +1532,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
|||||||
}
|
}
|
||||||
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
||||||
statA.LastRecoveredNextHop != "relay-primary" ||
|
statA.LastRecoveredNextHop != "relay-primary" ||
|
||||||
statA.LastRouteSwitchReason != "production_mesh_next_peer_is_unavailable" ||
|
statA.LastRouteSwitchReason != "peer_unavailable" ||
|
||||||
statA.RouteSwitchCount != 1 ||
|
statA.RouteSwitchCount != 1 ||
|
||||||
statA.LastRouteFailureAt == "" ||
|
statA.LastRouteFailureAt == "" ||
|
||||||
statA.LastRouteSwitchAt == "" ||
|
statA.LastRouteSwitchAt == "" ||
|
||||||
@@ -1541,7 +1541,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
|||||||
snapshot.FlowScheduler.RouteSwitchCount != 1 ||
|
snapshot.FlowScheduler.RouteSwitchCount != 1 ||
|
||||||
snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis ||
|
snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis ||
|
||||||
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis ||
|
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis ||
|
||||||
snapshot.FlowScheduler.RouteSwitchReasonCounts["production_mesh_next_peer_is_unavailable"] != 1 {
|
snapshot.FlowScheduler.RouteSwitchReasonCounts["peer_unavailable"] != 1 {
|
||||||
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
||||||
}
|
}
|
||||||
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
||||||
@@ -1552,6 +1552,21 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNormalizeFabricRouteSwitchReasonBucketsCommonFailures(t *testing.T) {
|
||||||
|
cases := map[string]string{
|
||||||
|
"context deadline exceeded while dialing 10.0.0.1:19124": "timeout",
|
||||||
|
"dial tcp 10.0.0.1:19124: connection refused": "connection_refused",
|
||||||
|
"production mesh next peer is unavailable": "peer_unavailable",
|
||||||
|
"quic fabric stream capacity limited": "capacity_limited",
|
||||||
|
"": "route_failure",
|
||||||
|
}
|
||||||
|
for input, want := range cases {
|
||||||
|
if got := normalizeFabricRouteSwitchReason(input); got != want {
|
||||||
|
t.Fatalf("normalizeFabricRouteSwitchReason(%q) = %q, want %q", input, got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing.T) {
|
func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing.T) {
|
||||||
transport := &captureManyProductionTransport{}
|
transport := &captureManyProductionTransport{}
|
||||||
scheduler := NewFabricFlowScheduler(8, 16)
|
scheduler := NewFabricFlowScheduler(8, 16)
|
||||||
|
|||||||
@@ -458,6 +458,9 @@ reason counts, so load tests can distinguish peer failures, timeouts, and other
|
|||||||
route-break causes.
|
route-break causes.
|
||||||
`mesh-live-smoke` reports the synthetic route-recovery reason beside recovery
|
`mesh-live-smoke` reports the synthetic route-recovery reason beside recovery
|
||||||
timing and switch count.
|
timing and switch count.
|
||||||
|
Common route switch reasons are bucketed into stable labels such as timeout,
|
||||||
|
peer_unavailable, connection_refused, connection_reset, no_route_to_host, and
|
||||||
|
capacity_limited to keep heartbeat cardinality bounded.
|
||||||
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
||||||
penalty instead of a hard recent failure, enabling load spreading without
|
penalty instead of a hard recent failure, enabling load spreading without
|
||||||
marking the carrier unhealthy.
|
marking the carrier unhealthy.
|
||||||
|
|||||||
Reference in New Issue
Block a user