Classify QUIC stream saturation
This commit is contained in:
@@ -10,6 +10,7 @@ import (
|
|||||||
"crypto/x509/pkix"
|
"crypto/x509/pkix"
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"math/big"
|
"math/big"
|
||||||
@@ -414,6 +415,7 @@ type vpnFabricSessionDialStats struct {
|
|||||||
TransportFailures atomic.Int64
|
TransportFailures atomic.Int64
|
||||||
SessionOpenFailures atomic.Int64
|
SessionOpenFailures atomic.Int64
|
||||||
StreamOpenFailures atomic.Int64
|
StreamOpenFailures atomic.Int64
|
||||||
|
CapacityLimited atomic.Int64
|
||||||
AllCandidatesFailed atomic.Int64
|
AllCandidatesFailed atomic.Int64
|
||||||
QUICSelected atomic.Int64
|
QUICSelected atomic.Int64
|
||||||
WebSocketSelected atomic.Int64
|
WebSocketSelected atomic.Int64
|
||||||
@@ -601,6 +603,8 @@ func (s *vpnFabricSessionDialStats) ObserveCandidateFailure(reason string) {
|
|||||||
s.SessionOpenFailures.Add(1)
|
s.SessionOpenFailures.Add(1)
|
||||||
case "stream_open_failed":
|
case "stream_open_failed":
|
||||||
s.StreamOpenFailures.Add(1)
|
s.StreamOpenFailures.Add(1)
|
||||||
|
case "capacity_limited":
|
||||||
|
s.CapacityLimited.Add(1)
|
||||||
}
|
}
|
||||||
s.LastFailureReason.Store(strings.TrimSpace(reason))
|
s.LastFailureReason.Store(strings.TrimSpace(reason))
|
||||||
s.LastFailureUnixSec.Store(time.Now().UTC().Unix())
|
s.LastFailureUnixSec.Store(time.Now().UTC().Unix())
|
||||||
@@ -653,6 +657,7 @@ func (s *vpnFabricSessionDialStats) Report(observedAt time.Time) map[string]any
|
|||||||
"transport_failures": s.TransportFailures.Load(),
|
"transport_failures": s.TransportFailures.Load(),
|
||||||
"session_open_failures": s.SessionOpenFailures.Load(),
|
"session_open_failures": s.SessionOpenFailures.Load(),
|
||||||
"stream_open_failures": s.StreamOpenFailures.Load(),
|
"stream_open_failures": s.StreamOpenFailures.Load(),
|
||||||
|
"capacity_limited": s.CapacityLimited.Load(),
|
||||||
"all_candidates_failed": s.AllCandidatesFailed.Load(),
|
"all_candidates_failed": s.AllCandidatesFailed.Load(),
|
||||||
"quic_selected": s.QUICSelected.Load(),
|
"quic_selected": s.QUICSelected.Load(),
|
||||||
"websocket_selected": s.WebSocketSelected.Load(),
|
"websocket_selected": s.WebSocketSelected.Load(),
|
||||||
@@ -5024,9 +5029,12 @@ func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity st
|
|||||||
session, err := carrier.Connect(dialCtx, selectedTarget)
|
session, err := carrier.Connect(dialCtx, selectedTarget)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cancel()
|
cancel()
|
||||||
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure("session_open_failed")
|
reason := fabricSessionOpenFailureReason(err)
|
||||||
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, "session_open_failed")
|
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason)
|
||||||
log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=session_open_failed error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, err)
|
if reason != "capacity_limited" {
|
||||||
|
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason)
|
||||||
|
}
|
||||||
|
log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=%s error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, reason, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
streamID := uint64(time.Now().UnixNano())
|
streamID := uint64(time.Now().UnixNano())
|
||||||
@@ -5065,6 +5073,16 @@ func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity st
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fabricSessionOpenFailureReason(err error) string {
|
||||||
|
if err == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if errors.Is(err, mesh.ErrQUICFabricStreamLimitReached) {
|
||||||
|
return "capacity_limited"
|
||||||
|
}
|
||||||
|
return "session_open_failed"
|
||||||
|
}
|
||||||
|
|
||||||
func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh.FabricTransportTarget, bool) {
|
func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh.FabricTransportTarget, bool) {
|
||||||
targets := vpnFabricSessionTargets(meshState, nextHop)
|
targets := vpnFabricSessionTargets(meshState, nextHop)
|
||||||
if len(targets) == 0 {
|
if len(targets) == 0 {
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"crypto/ed25519"
|
"crypto/ed25519"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log"
|
"log"
|
||||||
@@ -775,6 +776,7 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
|
|||||||
func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
||||||
stats := newVPNFabricSessionDialStats()
|
stats := newVPNFabricSessionDialStats()
|
||||||
stats.Attempts.Add(1)
|
stats.Attempts.Add(1)
|
||||||
|
stats.ObserveCandidateFailure("capacity_limited")
|
||||||
stats.ObserveCandidateFailure("session_open_failed")
|
stats.ObserveCandidateFailure("session_open_failed")
|
||||||
stats.ObserveSelected(mesh.FabricTransportTarget{
|
stats.ObserveSelected(mesh.FabricTransportTarget{
|
||||||
Endpoint: "quic://node-b.example.test:19443",
|
Endpoint: "quic://node-b.example.test:19443",
|
||||||
@@ -785,7 +787,8 @@ func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
|||||||
report := stats.Report(time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC))
|
report := stats.Report(time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC))
|
||||||
if report["attempts"] != int64(1) ||
|
if report["attempts"] != int64(1) ||
|
||||||
report["selected"] != int64(1) ||
|
report["selected"] != int64(1) ||
|
||||||
report["candidate_failures"] != int64(1) ||
|
report["candidate_failures"] != int64(2) ||
|
||||||
|
report["capacity_limited"] != int64(1) ||
|
||||||
report["session_open_failures"] != int64(1) ||
|
report["session_open_failures"] != int64(1) ||
|
||||||
report["quic_selected"] != int64(1) ||
|
report["quic_selected"] != int64(1) ||
|
||||||
report["pinned_cert_selected"] != int64(1) ||
|
report["pinned_cert_selected"] != int64(1) ||
|
||||||
@@ -796,6 +799,15 @@ func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestFabricSessionOpenFailureReasonClassifiesCapacity(t *testing.T) {
|
||||||
|
if got := fabricSessionOpenFailureReason(mesh.ErrQUICFabricStreamLimitReached); got != "capacity_limited" {
|
||||||
|
t.Fatalf("failure reason = %q, want capacity_limited", got)
|
||||||
|
}
|
||||||
|
if got := fabricSessionOpenFailureReason(errors.New("dial failed")); got != "session_open_failed" {
|
||||||
|
t.Fatalf("failure reason = %q, want session_open_failed", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestVPNFabricEndpointObservationReportIsBoundedAndNewestFirst(t *testing.T) {
|
func TestVPNFabricEndpointObservationReportIsBoundedAndNewestFirst(t *testing.T) {
|
||||||
store := newVPNFabricEndpointObservationStore("node-a")
|
store := newVPNFabricEndpointObservationStore("node-a")
|
||||||
base := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
|
base := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
|
||||||
|
|||||||
@@ -18,6 +18,13 @@ import (
|
|||||||
const fabricQUICNextProto = "rap-fabric-data-session-v1"
|
const fabricQUICNextProto = "rap-fabric-data-session-v1"
|
||||||
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
|
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
|
||||||
const defaultQUICFabricMaxStreamsPerConn = 64
|
const defaultQUICFabricMaxStreamsPerConn = 64
|
||||||
|
const ErrQUICFabricStreamLimitReached = quicFabricError("quic fabric stream limit reached")
|
||||||
|
|
||||||
|
type quicFabricError string
|
||||||
|
|
||||||
|
func (e quicFabricError) Error() string {
|
||||||
|
return string(e)
|
||||||
|
}
|
||||||
|
|
||||||
type QUICFabricTransport struct {
|
type QUICFabricTransport struct {
|
||||||
Config *quic.Config
|
Config *quic.Config
|
||||||
@@ -231,7 +238,7 @@ func (t *QUICFabricTransport) reserveStream(key string, conn *quic.Conn) error {
|
|||||||
}
|
}
|
||||||
if entry.activeStreams >= limit {
|
if entry.activeStreams >= limit {
|
||||||
t.stats.StreamLimitRejects++
|
t.stats.StreamLimitRejects++
|
||||||
return fmt.Errorf("quic fabric stream limit reached")
|
return ErrQUICFabricStreamLimitReached
|
||||||
}
|
}
|
||||||
entry.activeStreams++
|
entry.activeStreams++
|
||||||
entry.lastUsed = time.Now()
|
entry.lastUsed = time.Now()
|
||||||
|
|||||||
@@ -368,6 +368,9 @@ The per-connection QUIC stream limit is configurable through
|
|||||||
`RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN` /
|
`RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN` /
|
||||||
`-vpn-fabric-quic-max-streams-per-conn` and propagated by host-agent install
|
`-vpn-fabric-quic-max-streams-per-conn` and propagated by host-agent install
|
||||||
profiles.
|
profiles.
|
||||||
|
QUIC stream-limit rejects are classified as capacity pressure instead of peer
|
||||||
|
endpoint failure, so local health feedback does not incorrectly demote a healthy
|
||||||
|
but saturated carrier.
|
||||||
|
|
||||||
Deliverables:
|
Deliverables:
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user