diff --git a/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go b/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go index 963c040..e8c9619 100644 --- a/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go +++ b/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go @@ -354,6 +354,7 @@ func (t *QUICFabricTransport) Snapshot() QUICFabricTransportSnapshot { select { case <-entry.conn.Context().Done(): delete(t.conns, key) + t.stats.ClosedEvicted++ snapshot.Stats.ClosedEvicted++ default: snapshot.ActiveCount++ diff --git a/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go b/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go index 7237a69..080540b 100644 --- a/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go +++ b/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go @@ -235,6 +235,51 @@ func TestQUICFabricTransportPrunesIdleConnections(t *testing.T) { } } +func TestQUICFabricTransportSnapshotPersistsClosedEvictions(t *testing.T) { + server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{ + ListenAddr: "127.0.0.1:0", + TLSConfig: testQUICTLSConfig(t), + }) + if err != nil { + t.Fatalf("start quic fabric server: %v", err) + } + defer server.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + transport := NewQUICFabricTransport(nil) + defer transport.Close() + target := FabricTransportTarget{ + PeerID: "node-b", + Endpoint: server.Addr().String(), + TLSConfig: &tls.Config{ + InsecureSkipVerify: true, + NextProtos: []string{fabricQUICNextProto}, + }, + Timeout: time.Second, + } + session, err := transport.Connect(ctx, target) + if err != nil { + t.Fatalf("connect: %v", err) + } + defer session.Close() + key := quicFabricConnKey(target) + transport.mu.Lock() + entry := transport.conns[key] + transport.mu.Unlock() + if entry == nil || entry.conn == nil { + t.Fatalf("cached connection missing") + } + _ = entry.conn.CloseWithError(0, "test closed") + <-entry.conn.Context().Done() + + first := transport.Snapshot() + second := transport.Snapshot() + if first.Stats.ClosedEvicted != 1 || second.Stats.ClosedEvicted != 1 { + t.Fatalf("closed eviction stats were not persisted: first=%+v second=%+v", first, second) + } +} + func TestQUICFabricTransportLimitsStreamsPerConnection(t *testing.T) { server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{ ListenAddr: "127.0.0.1:0", diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 31ad11c..defa487 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -418,6 +418,9 @@ stream id as well, so gateway status can compare TX and RX distribution under browser/RDP load. QUIC fabric transport snapshots expose the configured stream limit, saturated connection count, and capacity pressure percentage next to stream limit rejects. +Closed cached QUIC connections discovered during snapshot generation now update +the transport's cumulative eviction counters, keeping successive heartbeats +consistent. Endpoint ranking treats `capacity_limited` observations as a soft pressure penalty instead of a hard recent failure, enabling load spreading without marking the carrier unhealthy.