Refactor RDP proxy handling and update related tests

This commit is contained in:
2026-05-17 20:38:35 +03:00
parent 8e9402580f
commit d551e57fd5
172 changed files with 22117 additions and 2509 deletions
@@ -11,8 +11,9 @@ import (
func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -37,8 +38,9 @@ func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -80,8 +82,9 @@ func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -135,8 +138,9 @@ func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -202,8 +206,9 @@ func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
func TestClientFabricSessionReportsRejectedStatus(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -72,6 +72,10 @@ const (
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
ProductionForwardQUICStreamID = 1
WebIngressForwardQUICStreamID = 2
FabricControlForwardQUICStreamID = 3
SyntheticForwardQUICStreamID = 1001
)
type PeerIdentity struct {
@@ -47,6 +47,9 @@ func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts Endpoin
}
out := make([]ScoredPeerEndpointCandidate, 0, len(candidates))
for _, candidate := range candidates {
if endpointHasUnspecifiedHost(candidate.Address) {
continue
}
out = append(out, scorePeerEndpointCandidate(candidate, opts))
}
sort.SliceStable(out, func(i, j int) bool {
@@ -68,25 +71,25 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
score := 100
reasons := []string{"base"}
switch candidate.Transport {
switch strings.ToLower(strings.TrimSpace(candidate.Transport)) {
case "quic", "direct_quic", "udp_quic", "quic_udp":
score += 45
reasons = append(reasons, "transport:quic")
case "direct_tcp_tls", "direct_http", "direct_https":
score += 35
reasons = append(reasons, "transport:direct")
case "wss":
score += 25
reasons = append(reasons, "transport:wss")
case "outbound_reverse":
score += 10
reasons = append(reasons, "transport:outbound_reverse")
case "relay":
case "lan_quic":
score += 42
reasons = append(reasons, "transport:lan_quic")
case "ice_quic":
score += 38
reasons = append(reasons, "transport:ice_quic")
case "reverse_quic":
score += 15
reasons = append(reasons, "transport:reverse_quic")
case "relay_quic":
score += 5
reasons = append(reasons, "transport:relay")
reasons = append(reasons, "transport:relay_quic")
default:
score -= 100
reasons = append(reasons, "transport:unknown")
reasons = append(reasons, "transport:non_quic_rejected")
}
switch candidate.Reachability {
@@ -173,7 +176,8 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
score += 8
reasons = append(reasons, "channel:control-direct")
}
if candidate.Transport == "relay" {
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
if transport == "relay" || transport == "relay_quic" {
score -= 8
reasons = append(reasons, "channel:control-relay-penalty")
}
@@ -234,14 +238,20 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
}
switch {
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50:
score += 18
score += 24
reasons = append(reasons, "latency:low")
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 150:
score += 8
reasons = append(reasons, "latency:moderate")
case observation.LastLatencyMs > 0:
score -= 10
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 300:
score -= 12
reasons = append(reasons, "latency:high")
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 750:
score -= 32
reasons = append(reasons, "latency:very_high")
case observation.LastLatencyMs > 0:
score -= 60
reasons = append(reasons, "latency:extreme")
}
if observation.ReliabilityScore > 0 {
switch {
@@ -13,7 +13,7 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay.example.test/node-b",
Reachability: "relay",
NATType: "symmetric",
@@ -25,8 +25,8 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -38,8 +38,8 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
{
EndpointID: "node-b-private-stale",
NodeID: "node-b",
Transport: "wss",
Address: "10.0.0.5:443",
Transport: "lan_quic",
Address: "quic://10.0.0.5:19443",
Reachability: "private",
NATType: "restricted",
ConnectivityMode: "direct",
@@ -74,8 +74,8 @@ func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
{
EndpointID: "endpoint-b",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.21:443",
Transport: "direct_quic",
Address: "quic://203.0.113.21:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -84,8 +84,8 @@ func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
{
EndpointID: "endpoint-a",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -103,10 +103,10 @@ func TestRankPeerEndpointCandidatesPrefersQUICFastPath(t *testing.T) {
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-wss",
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "wss",
Address: "wss://node-b.example.test",
Transport: "relay_quic",
Address: "quic://relay.example.test:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -138,14 +138,44 @@ func TestRankPeerEndpointCandidatesPrefersQUICFastPath(t *testing.T) {
}
}
// TestRankPeerEndpointCandidatesDropsUnspecifiedQUICEndpoint checks that a
// candidate whose address host is the unspecified address ("[::]") is removed
// from the ranking, leaving only the routable candidate.
func TestRankPeerEndpointCandidatesDropsUnspecifiedQUICEndpoint(t *testing.T) {
	unspecified := PeerEndpointCandidate{
		EndpointID:       "node-b-unspecified",
		NodeID:           "node-b",
		Transport:        "direct_quic",
		Address:          "quic://[::]:19131",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         1,
	}
	routable := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "direct_quic",
		Address:          "quic://203.0.113.20:19131",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         10,
	}

	ranked := RankPeerEndpointCandidates(
		[]PeerEndpointCandidate{unspecified, routable},
		EndpointCandidateScoreOptions{},
	)

	// Exactly one survivor is expected, and it must be the routable endpoint.
	if len(ranked) == 1 && ranked[0].Candidate.EndpointID == "node-b-public" {
		return
	}
	t.Fatalf("unspecified endpoint was not dropped: %+v", ranked)
}
func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -155,8 +185,8 @@ func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T)
{
EndpointID: "node-b-corp-lan",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "10.24.10.20:19001",
Transport: "lan_quic",
Address: "quic://10.24.10.20:19443",
Reachability: "private",
NATType: "none",
ConnectivityMode: "direct",
@@ -184,7 +214,7 @@ func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T
{
EndpointID: "node-b-outbound",
NodeID: "node-b",
Transport: "outbound_reverse",
Transport: "reverse_quic",
Address: "node-b.reverse.local",
Reachability: "outbound_only",
NATType: "symmetric",
@@ -194,7 +224,7 @@ func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay.example.test/node-b",
Reachability: "relay",
NATType: "blocked",
@@ -222,18 +252,18 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 10,
},
{
EndpointID: "node-b-wss",
EndpointID: "node-b-ice",
NodeID: "node-b",
Transport: "wss",
Address: "node-b.example.test",
Transport: "ice_quic",
Address: "quic://node-b.example.test:19443",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
@@ -253,8 +283,8 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
ReliabilityScore: 50,
ObservedAt: now.Add(-time.Minute),
},
"node-b-wss": {
EndpointID: "node-b-wss",
"node-b-ice": {
EndpointID: "node-b-ice",
LastLatencyMs: 35,
SuccessCount: 8,
ReliabilityScore: 95,
@@ -262,8 +292,8 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
},
},
})
if ranked[0].Candidate.EndpointID != "node-b-wss" {
t.Fatalf("top endpoint = %q, want node-b-wss: %+v", ranked[0].Candidate.EndpointID, ranked)
if ranked[0].Candidate.EndpointID != "node-b-ice" {
t.Fatalf("top endpoint = %q, want node-b-ice: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[0].Reasons, "latency:low") || !containsReason(ranked[0].Reasons, "reliability:high") {
t.Fatalf("top reasons missing health hints: %+v", ranked[0].Reasons)
@@ -279,8 +309,8 @@ func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T)
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -321,10 +351,10 @@ func TestRankPeerEndpointCandidatesDoesNotRewardZeroLatencyFailure(t *testing.T)
LastVerifiedAt: &now,
},
{
EndpointID: "node-b-wss",
EndpointID: "node-b-ice",
NodeID: "node-b",
Transport: "wss",
Address: "https://node-b.example.test:443",
Transport: "ice_quic",
Address: "quic://node-b.example.test:19444",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 10,
@@ -345,14 +375,81 @@ func TestRankPeerEndpointCandidatesDoesNotRewardZeroLatencyFailure(t *testing.T)
},
MaxObservationAge: time.Minute,
})
if ranked[0].Candidate.EndpointID != "node-b-wss" {
t.Fatalf("top endpoint = %q, want wss after repeated quic failures: %+v", ranked[0].Candidate.EndpointID, ranked)
if ranked[0].Candidate.EndpointID != "node-b-ice" {
t.Fatalf("top endpoint = %q, want ice_quic after repeated direct QUIC failures: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if containsReason(ranked[1].Reasons, "latency:moderate") {
t.Fatalf("zero latency failure was rewarded as moderate latency: %+v", ranked[1].Reasons)
}
}
// TestRankPeerEndpointCandidatesPenalizesSevereLatencyGradient feeds three
// freshly verified candidates with observed latencies of 4 ms, 420 ms and
// 900 ms and expects them to be ranked in that order, with the two slow ones
// tagged "latency:very_high" and "latency:extreme" respectively.
func TestRankPeerEndpointCandidatesPenalizesSevereLatencyGradient(t *testing.T) {
	now := time.Date(2026, 5, 17, 6, 0, 0, 0, time.UTC)

	lan := PeerEndpointCandidate{
		EndpointID:       "node-b-lan",
		NodeID:           "node-b",
		Transport:        "direct_quic",
		Address:          "quic://10.0.0.2:19443",
		Reachability:     "private",
		ConnectivityMode: "direct",
		LastVerifiedAt:   &now,
	}
	wan := PeerEndpointCandidate{
		EndpointID:       "node-b-wan",
		NodeID:           "node-b",
		Transport:        "direct_quic",
		Address:          "quic://203.0.113.20:19443",
		Reachability:     "public",
		ConnectivityMode: "direct",
		LastVerifiedAt:   &now,
	}
	badRelay := PeerEndpointCandidate{
		EndpointID:       "node-b-bad-relay",
		NodeID:           "node-b",
		Transport:        "relay_quic",
		Address:          "quic://relay.example.test:19443",
		Reachability:     "relay",
		ConnectivityMode: "relay_required",
		LastVerifiedAt:   &now,
	}

	// All observations share the same reliability and timestamp so that only
	// the latency differs between candidates.
	opts := EndpointCandidateScoreOptions{
		Now:                now,
		MaxVerificationAge: time.Minute,
		MaxObservationAge:  time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"node-b-lan": {
				EndpointID:       "node-b-lan",
				LastLatencyMs:    4,
				ReliabilityScore: 95,
				ObservedAt:       now,
			},
			"node-b-wan": {
				EndpointID:       "node-b-wan",
				LastLatencyMs:    420,
				ReliabilityScore: 95,
				ObservedAt:       now,
			},
			"node-b-bad-relay": {
				EndpointID:       "node-b-bad-relay",
				LastLatencyMs:    900,
				ReliabilityScore: 95,
				ObservedAt:       now,
			},
		},
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{lan, wan, badRelay}, opts)

	switch {
	case ranked[0].Candidate.EndpointID != "node-b-lan",
		ranked[1].Candidate.EndpointID != "node-b-wan",
		ranked[2].Candidate.EndpointID != "node-b-bad-relay":
		t.Fatalf("ranked endpoints = %+v, want lan, wan, bad relay", ranked)
	}

	wanReasons := ranked[1].Reasons
	if !containsReason(wanReasons, "latency:very_high") {
		t.Fatalf("wan reasons = %+v, want latency:very_high", wanReasons)
	}
	relayReasons := ranked[2].Reasons
	if !containsReason(relayReasons, "latency:extreme") {
		t.Fatalf("relay reasons = %+v, want latency:extreme", relayReasons)
	}
}
func TestRankPeerEndpointCandidatesTreatsCapacityAsSoftPressure(t *testing.T) {
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
@@ -0,0 +1,217 @@
package mesh
import (
"errors"
"strings"
"time"
)
// FabricChannelRouteEventType labels what a FabricChannelRouteEvent reports
// about a channel's route.
type FabricChannelRouteEventType string
const (
// FabricChannelRouteEventNone is returned by ObserveChannel when the
// observation did not cause a route change.
FabricChannelRouteEventNone FabricChannelRouteEventType = ""
// FabricChannelRouteEventOpened is emitted by OpenChannel for the initial
// route selection.
FabricChannelRouteEventOpened FabricChannelRouteEventType = "opened"
// FabricChannelRouteEventReroute is emitted by ObserveChannel when the
// channel is moved onto a different route.
FabricChannelRouteEventReroute FabricChannelRouteEventType = "reroute"
)
// ErrFabricRouteRerouteSuppressed is a sentinel error describing a suppressed
// reroute.
// NOTE(review): none of the router code visible in this file returns it; a
// suppressed reroute currently surfaces as a FabricChannelRouteEventNone event
// with a nil error. Confirm whether other files use this value.
var ErrFabricRouteRerouteSuppressed = errors.New("fabric route reroute suppressed")
// FabricChannelRouterConfig holds the thresholds FabricChannelRouter uses to
// decide when a channel should leave its current route.
type FabricChannelRouterConfig struct {
// SchedulerConfig is passed to NewFabricRouteScheduler. Its
// ProjectedChannelCost inherits ProjectedChannelCost when left non-positive.
SchedulerConfig FabricRouteSchedulerConfig
// MaxAckLatencyMs triggers a reroute when an observation's AckLatencyMs
// exceeds it. Zero or negative disables the latency trigger.
MaxAckLatencyMs int64
// MaxRoutePressure is the route pressure percentage above which a reroute is
// triggered. Non-positive values are normalized to 90.
MaxRoutePressure int
// MinRerouteInterval is the minimum time between two reroutes of the same
// channel; while it has not elapsed every reroute trigger is ignored.
// Zero or negative disables this hysteresis.
MinRerouteInterval time.Duration
// ProjectedChannelCost is the cost passed to fabricRoutePressurePercent.
// Non-positive values are normalized to 1.
ProjectedChannelCost int
}
// FabricChannelRouter picks the initial route for a channel and decides, from
// per-channel observations, when to move the channel to another route.
type FabricChannelRouter struct {
// Config holds the reroute thresholds; shouldReroute normalizes it on every
// call, so a zero value is usable.
Config FabricChannelRouterConfig
// Scheduler performs the actual route selection via ChooseRoute.
Scheduler FabricRouteScheduler
}
// FabricChannelObservation is one measurement reported for a channel and fed
// to FabricChannelRouter.ObserveChannel.
type FabricChannelObservation struct {
// ChannelID and RouteID identify what was observed. ObserveChannel itself
// keys its decisions off the channel argument's RouteID, not these fields.
ChannelID string
RouteID string
// AckLatencyMs is compared against FabricChannelRouterConfig.MaxAckLatencyMs.
AckLatencyMs int64
// Failed requests a reroute regardless of latency or pressure (still
// subject to MinRerouteInterval).
Failed bool
// The byte and frame counters are deltas: ObserveChannel adds them to the
// channel's running totals.
BytesSent uint64
BytesRecv uint64
FramesSent uint64
FramesRecv uint64
// Reason, when not blank, is used verbatim as the reroute event's reason.
Reason string
// ObservedAt defaults to ObserveChannel's now argument when zero.
ObservedAt time.Time
}
// FabricChannelRouteEvent describes the outcome of OpenChannel or
// ObserveChannel for a single channel.
type FabricChannelRouteEvent struct {
// Type says whether the channel was opened, rerouted, or left unchanged.
Type FabricChannelRouteEventType
// Reason is the scheduler's reason for opened events and the observation's
// or derived reason for reroute events.
Reason string
// PreviousRoute is set on reroute events only; it is the zero value when the
// channel's prior route ID is not present in the route set.
PreviousRoute FabricRoute
// NextRoute is the route selected by the scheduler.
NextRoute FabricRoute
// Choice is the full scheduler decision that produced NextRoute.
Choice FabricRouteChoice
// Observation is the observation that was processed (unset on opened events).
Observation FabricChannelObservation
// Channel is a snapshot of the channel after the event was applied.
Channel FabricChannel
// OccurredAt is the timestamp the router used for the decision.
OccurredAt time.Time
}
// NewFabricChannelRouter builds a router from cfg after filling in defaults,
// and creates its scheduler from the normalized scheduler configuration.
func NewFabricChannelRouter(cfg FabricChannelRouterConfig) FabricChannelRouter {
	normalized := normalizeFabricChannelRouterConfig(cfg)

	var router FabricChannelRouter
	router.Config = normalized
	router.Scheduler = NewFabricRouteScheduler(normalized.SchedulerConfig)
	return router
}
// OpenChannel asks the scheduler for the best route in routeSet and returns a
// channel bound to it together with an "opened" event.
//
// A zero now is replaced by the current UTC time. If the scheduler fails, the
// zero channel, the zero event and the scheduler's error are returned.
func (r FabricChannelRouter) OpenChannel(spec FabricChannelSpec, routeSet FabricRouteSet, now time.Time) (FabricChannel, FabricChannelRouteEvent, error) {
	openedAt := now
	if openedAt.IsZero() {
		openedAt = time.Now().UTC()
	}

	choice, err := r.Scheduler.ChooseRoute(spec, routeSet, openedAt)
	if err != nil {
		return FabricChannel{}, FabricChannelRouteEvent{}, err
	}
	selected := choice.Route

	var channel FabricChannel
	channel.Spec = spec
	channel.State = FabricChannelOpen
	channel.RouteID = selected.RouteID
	channel.TargetNode = selected.DestinationNodeID
	channel.OpenedAt = openedAt

	return channel, FabricChannelRouteEvent{
		Type:       FabricChannelRouteEventOpened,
		Reason:     choice.Reason,
		NextRoute:  selected,
		Choice:     choice,
		Channel:    channel,
		OccurredAt: openedAt,
	}, nil
}
// ObserveChannel folds observation into channel's counters and, when the
// router's thresholds say so, moves the channel to an alternative route.
//
// Returns the updated channel plus an event: type "none" when the route is
// kept, type "reroute" when it changes. If a reroute is wanted but no
// alternative can be chosen, the channel (with updated counters, unchanged
// route), a zero event and the selection error are returned.
func (r FabricChannelRouter) ObserveChannel(channel FabricChannel, routeSet FabricRouteSet, observation FabricChannelObservation, now time.Time) (FabricChannel, FabricChannelRouteEvent, error) {
	if now.IsZero() {
		now = time.Now().UTC()
	}
	if observation.ObservedAt.IsZero() {
		observation.ObservedAt = now
	}

	// Observation counters are deltas accumulated onto the channel totals.
	channel.FramesRecv += observation.FramesRecv
	channel.FramesSent += observation.FramesSent
	channel.BytesRecv += observation.BytesRecv
	channel.BytesSent += observation.BytesSent
	if channel.State == "" {
		channel.State = FabricChannelOpen
	}

	if !r.shouldReroute(channel, observation, routeSet, now) {
		unchanged := FabricChannelRouteEvent{
			Type:        FabricChannelRouteEventNone,
			Observation: observation,
			Channel:     channel,
			OccurredAt:  now,
		}
		return channel, unchanged, nil
	}

	// The previous route may be missing from routeSet; the zero route is then
	// reported as PreviousRoute.
	previous, _ := findFabricRoute(routeSet, channel.RouteID)
	choice, err := r.chooseAlternativeRoute(channel.Spec, routeSet, channel.RouteID, now)
	if err != nil {
		return channel, FabricChannelRouteEvent{}, err
	}

	next := choice.Route
	channel.RouteID = next.RouteID
	channel.TargetNode = next.DestinationNodeID
	channel.LastReroute = now
	channel.RerouteCount++

	// A caller-supplied reason wins; otherwise derive one from the thresholds.
	reason := observation.Reason
	if trimmed := strings.TrimSpace(reason); trimmed == "" {
		reason = rerouteReason(r.Config, observation, previous)
	}

	rerouted := FabricChannelRouteEvent{
		Type:          FabricChannelRouteEventReroute,
		Reason:        reason,
		PreviousRoute: previous,
		NextRoute:     next,
		Choice:        choice,
		Observation:   observation,
		Channel:       channel,
		OccurredAt:    now,
	}
	return channel, rerouted, nil
}
// shouldReroute reports whether observation warrants moving channel off its
// current route. The hysteresis window (MinRerouteInterval since the last
// reroute) takes precedence over every trigger; after that a failure, an ack
// latency above MaxAckLatencyMs, or route pressure above MaxRoutePressure each
// cause a reroute.
func (r FabricChannelRouter) shouldReroute(channel FabricChannel, observation FabricChannelObservation, routeSet FabricRouteSet, now time.Time) bool {
	cfg := normalizeFabricChannelRouterConfig(r.Config)

	insideHysteresis := cfg.MinRerouteInterval > 0 &&
		!channel.LastReroute.IsZero() &&
		now.Sub(channel.LastReroute) < cfg.MinRerouteInterval

	switch {
	case insideHysteresis:
		return false
	case observation.Failed:
		return true
	case cfg.MaxAckLatencyMs > 0 && observation.AckLatencyMs > cfg.MaxAckLatencyMs:
		return true
	case cfg.MaxRoutePressure <= 0:
		return false
	}

	// Pressure can only be evaluated when the current route is still listed.
	current, found := findFabricRoute(routeSet, channel.RouteID)
	return found && fabricRoutePressurePercent(current, cfg.ProjectedChannelCost) > cfg.MaxRoutePressure
}
// chooseAlternativeRoute asks the scheduler to pick among every route in
// routeSet except the one identified by currentRouteID.
//
// currentRouteID is trimmed before comparison so that exclusion uses the same
// matching rule as findFabricRoute; otherwise a padded ID would be resolved as
// the current route there but still be offered here as an "alternative".
// Returns ErrFabricRouteNotFound when no other route exists, or whatever error
// the scheduler reports for the reduced route set.
func (r FabricChannelRouter) chooseAlternativeRoute(spec FabricChannelSpec, routeSet FabricRouteSet, currentRouteID string, now time.Time) (FabricRouteChoice, error) {
	currentRouteID = strings.TrimSpace(currentRouteID)
	routes := flattenFabricRouteSet(routeSet)
	alternatives := make([]FabricRoute, 0, len(routes))
	for _, route := range routes {
		if route.RouteID == currentRouteID {
			continue
		}
		alternatives = append(alternatives, route)
	}
	if len(alternatives) == 0 {
		return FabricRouteChoice{}, ErrFabricRouteNotFound
	}
	return r.Scheduler.ChooseRoute(spec, routeSetFromRoutes(routeSet, alternatives), now)
}
// normalizeFabricChannelRouterConfig returns a copy of cfg with defaults
// applied: a projected channel cost of 1, a scheduler projected cost equal to
// the router's, and a maximum route pressure of 90. Only non-positive fields
// are overwritten.
func normalizeFabricChannelRouterConfig(cfg FabricChannelRouterConfig) FabricChannelRouterConfig {
	const (
		fallbackChannelCost   = 1
		fallbackRoutePressure = 90
	)

	normalized := cfg
	if normalized.MaxRoutePressure < 1 {
		normalized.MaxRoutePressure = fallbackRoutePressure
	}
	if normalized.ProjectedChannelCost < 1 {
		normalized.ProjectedChannelCost = fallbackChannelCost
	}
	// Must run after the router-level cost has been defaulted above.
	if normalized.SchedulerConfig.ProjectedChannelCost < 1 {
		normalized.SchedulerConfig.ProjectedChannelCost = normalized.ProjectedChannelCost
	}
	return normalized
}
// rerouteReason derives a reason string for a reroute from the first trigger
// that matches, checked in the order failure, ack latency, route pressure.
// "route_degraded" is returned when none of them matches.
func rerouteReason(cfg FabricChannelRouterConfig, observation FabricChannelObservation, route FabricRoute) string {
	effective := normalizeFabricChannelRouterConfig(cfg)

	if observation.Failed {
		return "route_failure"
	}
	if effective.MaxAckLatencyMs > 0 && observation.AckLatencyMs > effective.MaxAckLatencyMs {
		return "ack_latency_threshold"
	}
	if effective.MaxRoutePressure > 0 && fabricRoutePressurePercent(route, effective.ProjectedChannelCost) > effective.MaxRoutePressure {
		return "route_capacity_pressure"
	}
	return "route_degraded"
}
// findFabricRoute looks up the route whose RouteID equals routeID (after
// trimming surrounding whitespace from routeID) among all routes of routeSet.
// The boolean is false when routeID is blank or no route matches.
func findFabricRoute(routeSet FabricRouteSet, routeID string) (FabricRoute, bool) {
	wanted := strings.TrimSpace(routeID)
	if wanted != "" {
		candidates := flattenFabricRouteSet(routeSet)
		for i := range candidates {
			if candidates[i].RouteID == wanted {
				return candidates[i], true
			}
		}
	}
	return FabricRoute{}, false
}
// routeSetFromRoutes builds a route set that keeps template's target kind and
// target ID but uses routes as its members: the first becomes Primary and the
// rest become WarmStandby. No other template field is carried over.
func routeSetFromRoutes(template FabricRouteSet, routes []FabricRoute) FabricRouteSet {
	var rebuilt FabricRouteSet
	rebuilt.TargetKind = template.TargetKind
	rebuilt.TargetID = template.TargetID

	switch len(routes) {
	case 0:
		// Nothing to place; only the target identity is returned.
	case 1:
		rebuilt.Primary = routes[0]
	default:
		rebuilt.Primary = routes[0]
		rebuilt.WarmStandby = append(rebuilt.WarmStandby, routes[1:]...)
	}
	return rebuilt
}
@@ -0,0 +1,151 @@
package mesh
import (
"testing"
"time"
)
func TestFabricChannelRouterOpensOnBestRoute(t *testing.T) {
router := NewFabricChannelRouter(FabricChannelRouterConfig{})
now := time.Now()
channel, event, err := router.OpenChannel(testFabricChannelSpec(FabricChannelTargetNode, "node-b"), FabricRouteSet{
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
Primary: testFabricRoute("route-slow", "node-b", 80, 100, 0, true),
WarmStandby: []FabricRoute{
testFabricRoute("route-fast", "node-b", 15, 100, 0, true),
},
}, now)
if err != nil {
t.Fatalf("open channel: %v", err)
}
if channel.RouteID != "route-fast" || channel.State != FabricChannelOpen {
t.Fatalf("channel = %+v, want route-fast open", channel)
}
if event.Type != FabricChannelRouteEventOpened || event.NextRoute.RouteID != "route-fast" {
t.Fatalf("event = %+v", event)
}
}
// TestFabricChannelRouterReroutesOnSlowAck reports a 120 ms ack against a
// 30 ms threshold and expects the channel to move to the standby route, with
// the observation's byte and frame counters added to the channel.
func TestFabricChannelRouterReroutesOnSlowAck(t *testing.T) {
	now := time.Now()
	router := NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 30})

	routes := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testFabricRoute("route-primary", "node-b", 10, 100, 0, true),
		WarmStandby: []FabricRoute{testFabricRoute("route-standby", "node-b", 20, 100, 0, true)},
	}
	existing := FabricChannel{
		Spec:     testFabricChannelSpec(FabricChannelTargetNode, "node-b"),
		State:    FabricChannelOpen,
		RouteID:  "route-primary",
		OpenedAt: now.Add(-time.Minute),
	}
	slowAck := FabricChannelObservation{
		ChannelID:    existing.Spec.ChannelID,
		RouteID:      existing.RouteID,
		AckLatencyMs: 120,
		BytesSent:    4096,
		FramesSent:   4,
	}

	updated, event, err := router.ObserveChannel(existing, routes, slowAck, now)
	if err != nil {
		t.Fatalf("observe channel: %v", err)
	}

	if event.Type != FabricChannelRouteEventReroute || event.Reason != "ack_latency_threshold" {
		t.Fatalf("event = %+v", event)
	}
	switch {
	case updated.RouteID != "route-standby",
		updated.RerouteCount != 1,
		updated.BytesSent != 4096,
		updated.FramesSent != 4:
		t.Fatalf("updated = %+v", updated)
	}
}
// TestFabricChannelRouterReroutesPoolTargetOnFailure marks the current pool
// route as failed and expects the channel to switch to the other pool member,
// updating both the route ID and the target node.
func TestFabricChannelRouterReroutesPoolTargetOnFailure(t *testing.T) {
	now := time.Now()
	router := NewFabricChannelRouter(FabricChannelRouterConfig{})

	pool := FabricRouteSet{
		TargetKind:  FabricChannelTargetPool,
		TargetID:    "pool-egress",
		Primary:     testFabricPoolRoute("route-node-b", "node-b", 10, true),
		WarmStandby: []FabricRoute{testFabricPoolRoute("route-node-c", "node-c", 20, true)},
	}
	existing := FabricChannel{
		Spec:       testFabricChannelSpec(FabricChannelTargetPool, "pool-egress"),
		State:      FabricChannelOpen,
		RouteID:    "route-node-b",
		TargetNode: "node-b",
		OpenedAt:   now.Add(-time.Minute),
	}
	failure := FabricChannelObservation{
		ChannelID: existing.Spec.ChannelID,
		RouteID:   existing.RouteID,
		Failed:    true,
		Reason:    "target_failed",
	}

	updated, event, err := router.ObserveChannel(existing, pool, failure, now)
	if err != nil {
		t.Fatalf("observe channel: %v", err)
	}

	switch {
	case event.Type != FabricChannelRouteEventReroute,
		event.PreviousRoute.RouteID != "route-node-b",
		event.NextRoute.RouteID != "route-node-c":
		t.Fatalf("event = %+v", event)
	}
	if updated.TargetNode != "node-c" || updated.RouteID != "route-node-c" {
		t.Fatalf("updated = %+v", updated)
	}
}
// TestFabricChannelRouterSuppressesRerouteInsideHysteresis reports a slow ack
// ten seconds after the previous reroute while MinRerouteInterval is one
// minute, and expects the channel to stay on its route with a "none" event.
func TestFabricChannelRouterSuppressesRerouteInsideHysteresis(t *testing.T) {
	now := time.Now()
	router := NewFabricChannelRouter(FabricChannelRouterConfig{
		MaxAckLatencyMs:    30,
		MinRerouteInterval: time.Minute,
	})

	recentlyRerouted := FabricChannel{
		Spec:        testFabricChannelSpec(FabricChannelTargetNode, "node-b"),
		State:       FabricChannelOpen,
		RouteID:     "route-primary",
		LastReroute: now.Add(-10 * time.Second),
	}
	routes := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testFabricRoute("route-primary", "node-b", 10, 100, 0, true),
		WarmStandby: []FabricRoute{testFabricRoute("route-standby", "node-b", 20, 100, 0, true)},
	}
	slowAck := FabricChannelObservation{AckLatencyMs: 120}

	updated, event, err := router.ObserveChannel(recentlyRerouted, routes, slowAck, now)
	if err != nil {
		t.Fatalf("observe channel: %v", err)
	}
	stayedPut := event.Type == FabricChannelRouteEventNone && updated.RouteID == "route-primary"
	if !stayedPut {
		t.Fatalf("event=%+v updated=%+v", event, updated)
	}
}
// testFabricChannelSpec returns a channel spec for the fixed test channel
// "channel-1" in "cluster-1", originating at "node-a" and aimed at the given
// target kind and ID.
func testFabricChannelSpec(kind FabricChannelTargetKind, targetID string) FabricChannelSpec {
	var spec FabricChannelSpec
	spec.ChannelID = "channel-1"
	spec.ClusterID = "cluster-1"
	spec.SourceNodeID = "node-a"
	spec.TargetKind = kind
	spec.TargetID = targetID
	return spec
}
// testFabricRoute returns a two-hop route from "node-a" to destination in
// "cluster-1" with the given latency, capacity, active channel count and
// health flag.
func testFabricRoute(routeID string, destination string, latency int, capacity int, active int, healthy bool) FabricRoute {
	hops := []FabricRouteHop{{NodeID: "node-a"}, {NodeID: destination}}

	var route FabricRoute
	route.RouteID = routeID
	route.ClusterID = "cluster-1"
	route.SourceNodeID = "node-a"
	route.DestinationNodeID = destination
	route.Hops = hops
	route.BaseLatencyMs = latency
	route.Capacity = capacity
	route.ActiveChannels = active
	route.Healthy = healthy
	return route
}
// testFabricPoolRoute returns an idle route with capacity 100 that belongs to
// the "pool-egress" pool.
func testFabricPoolRoute(routeID string, destination string, latency int, healthy bool) FabricRoute {
	const (
		poolCapacity = 100
		poolActive   = 0
	)
	pooled := testFabricRoute(routeID, destination, latency, poolCapacity, poolActive, healthy)
	pooled.PoolID = "pool-egress"
	return pooled
}
@@ -0,0 +1,487 @@
package mesh
import (
"context"
"fmt"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// FabricChannelRuntimeConfig configures a FabricChannelRuntime. Zero or
// non-positive StreamID, TrafficClass, Timeout and MaxPayload are replaced
// with defaults by NewFabricChannelRuntime.
type FabricChannelRuntimeConfig struct {
// RouterConfig is used to build the runtime's FabricChannelRouter.
RouterConfig FabricChannelRouterConfig
// StreamID is the stream opened on each session; defaults to 2.
StreamID uint64
// TrafficClass is sent with the open-stream frame; defaults to
// fabricproto.TrafficClassBulk.
TrafficClass fabricproto.TrafficClass
// Timeout defaults to 30 seconds.
// NOTE(review): its consumer is not visible in this part of the file.
Timeout time.Duration
// MaxPayload is the largest accepted payload in bytes; defaults to
// fabricproto.DefaultMaxPayload.
MaxPayload int
// RouteHealthTTL is passed to NewFabricRouteHealthTracker unchanged.
RouteHealthTTL time.Duration
}
// FabricChannelRuntime sends channel traffic over a FabricTransport, using the
// router for route selection and the trackers for pressure and health input.
type FabricChannelRuntime struct {
// Transport opens sessions to route targets; the send methods return
// ErrForwardRuntimeUnavailable when it is nil.
Transport FabricTransport
// Router opens channels and decides on reroutes.
Router FabricChannelRouter
// Pressure tracks in-use routes; may be nil, in which case route sets are
// used without pressure adjustment.
Pressure *FabricRoutePressureTracker
// Health records route successes and failures; may be nil, in which case
// health marking and health-based adjustment are skipped.
Health *FabricRouteHealthTracker
// Config holds the effective runtime settings.
Config FabricChannelRuntimeConfig
}
// FabricChannelRuntimeResult summarizes one send operation of the runtime.
type FabricChannelRuntimeResult struct {
// Channel is the channel state after the operation (or at the point of
// failure).
Channel FabricChannel
// Byte, frame and ack totals accumulated over the operation.
BytesSent uint64
BytesRecv uint64
FramesSent uint64
FramesRecv uint64
AcksReceived uint64
// RouteEvents starts with the "opened" event and gains one entry per reroute.
RouteEvents []FabricChannelRouteEvent
// RouteAttempts lists the route ID of every connection attempt, in order.
RouteAttempts []string
// MigrationEvents counts the reroutes recorded in RouteEvents.
MigrationEvents int
// RoutePressure and RouteHealth are tracker snapshots taken when the
// operation completes successfully.
RoutePressure FabricRoutePressureSnapshot
RouteHealth FabricRouteHealthSnapshot
}
// FabricChannelRequestResponseResult is the outcome of SendRequestResponse:
// the common runtime result plus the response body.
type FabricChannelRequestResponseResult struct {
FabricChannelRuntimeResult
// ResponsePayload is the response received for the request; it is only set
// when the exchange succeeded.
ResponsePayload []byte
}
// NewFabricChannelRuntime creates a runtime on top of transport with fresh
// pressure and health trackers. Unset configuration values are defaulted:
// stream ID 2, bulk traffic class, a 30 second timeout and
// fabricproto.DefaultMaxPayload as the payload limit.
func NewFabricChannelRuntime(transport FabricTransport, cfg FabricChannelRuntimeConfig) *FabricChannelRuntime {
	effective := cfg
	if effective.MaxPayload <= 0 {
		effective.MaxPayload = fabricproto.DefaultMaxPayload
	}
	if effective.Timeout <= 0 {
		effective.Timeout = 30 * time.Second
	}
	if effective.TrafficClass == 0 {
		effective.TrafficClass = fabricproto.TrafficClassBulk
	}
	if effective.StreamID == 0 {
		effective.StreamID = 2
	}

	rt := new(FabricChannelRuntime)
	rt.Transport = transport
	rt.Router = NewFabricChannelRouter(effective.RouterConfig)
	rt.Pressure = NewFabricRoutePressureTracker()
	rt.Health = NewFabricRouteHealthTracker(effective.RouteHealthTTL)
	rt.Config = effective
	return rt
}
// SendReliable opens a channel on the route chosen by the router and hands the
// payloads to sendOnSession, reconnecting on the channel's new route whenever
// a reroute happens.
//
// Each loop iteration re-applies tracker state to routeSet, connects to the
// channel's current route and sends from the current payload index. A connect
// failure is recorded against the route and reported to the router; if the
// router reroutes, the loop retries on the new route. If it cannot reroute, the
// router's error is returned when there is one, otherwise the connect error.
//
// The context is checked before every attempt so that a cancelled or expired
// context stops the retry loop immediately, instead of producing connect
// failures that would be recorded against routes.
//
// Returns ErrForwardRuntimeUnavailable when the runtime or its transport is
// nil, and ErrFabricRouteNotFound when the channel's route is no longer in the
// route set. On error the partially filled result is returned alongside it;
// the pressure and health snapshots are only populated on success.
func (r *FabricChannelRuntime) SendReliable(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payloads [][]byte) (FabricChannelRuntimeResult, error) {
	if r == nil || r.Transport == nil {
		return FabricChannelRuntimeResult{}, ErrForwardRuntimeUnavailable
	}
	now := time.Now().UTC()
	routeSet = r.routeSetForScheduling(routeSet)
	channel, event, err := r.Router.OpenChannel(spec, routeSet, now)
	if err != nil {
		return FabricChannelRuntimeResult{}, err
	}
	result := FabricChannelRuntimeResult{Channel: channel, RouteEvents: []FabricChannelRouteEvent{event}}
	sequence := uint64(0)
	index := 0
	for index < len(payloads) {
		// Stop before touching any route once the caller has given up.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return result, ctxErr
		}
		// Refresh health and pressure so each attempt sees current state.
		routeSet = r.routeSetForScheduling(routeSet)
		route, ok := findFabricRoute(routeSet, channel.RouteID)
		if !ok {
			return result, ErrFabricRouteNotFound
		}
		result.RouteAttempts = append(result.RouteAttempts, route.RouteID)
		target, err := FabricTransportTargetForRoute(route)
		if err != nil {
			return result, err
		}
		releaseRoute := r.acquireRoute(route.RouteID)
		session, err := r.Transport.Connect(ctx, target)
		if err != nil {
			releaseRoute()
			r.markRouteFailure(route.RouteID, err)
			updated, event, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID:  spec.ChannelID,
				RouteID:    route.RouteID,
				Failed:     true,
				Reason:     "connect_failed",
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			channel = updated
			result.Channel = channel
			if event.Type == FabricChannelRouteEventReroute {
				result.RouteEvents = append(result.RouteEvents, event)
				result.MigrationEvents++
				continue
			}
			if rerouteErr != nil {
				return result, rerouteErr
			}
			return result, err
		}
		migrated, sendErr := r.sendOnSession(ctx, session, &channel, routeSet, route, payloads, &index, &sequence, &result)
		// Best-effort close: the session is finished either way and the send
		// error, if any, is the one worth reporting.
		_ = session.Close()
		releaseRoute()
		result.Channel = channel
		if sendErr != nil {
			return result, sendErr
		}
		if !migrated {
			break
		}
	}
	result.Channel = channel
	result.RoutePressure = r.snapshotRoutePressure()
	result.RouteHealth = r.snapshotRouteHealth()
	return result, nil
}
// SendRequestResponse sends a single payload over a channel and returns the
// response, retrying on another route when connecting or the exchange fails
// and the router is able to reroute.
//
// The payload is rejected up front with fabricproto.ErrInvalidPayloadLen when
// it exceeds Config.MaxPayload. Each loop iteration re-applies tracker state to
// routeSet, connects to the channel's current route and performs the exchange.
// Failures are recorded against the route and reported to the router; if no
// reroute happens, the router's error is returned when there is one, otherwise
// the original failure.
//
// After a successful exchange the ack latency is still reported to the router,
// which may move the channel to a different route. An error from that step
// only means no better route could be selected, so it does not fail the call:
// the response has already been received and is returned.
//
// The context is checked before every attempt so that a cancelled or expired
// context ends the retry loop instead of being recorded as route failures.
//
// Returns ErrForwardRuntimeUnavailable when the runtime or its transport is
// nil, and ErrFabricRouteNotFound when the channel's route is no longer in the
// route set. On error the partially filled result is returned alongside it.
func (r *FabricChannelRuntime) SendRequestResponse(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (FabricChannelRequestResponseResult, error) {
	if r == nil || r.Transport == nil {
		return FabricChannelRequestResponseResult{}, ErrForwardRuntimeUnavailable
	}
	if len(payload) > r.Config.MaxPayload {
		return FabricChannelRequestResponseResult{}, fmt.Errorf("%w: %d > %d", fabricproto.ErrInvalidPayloadLen, len(payload), r.Config.MaxPayload)
	}
	now := time.Now().UTC()
	routeSet = r.routeSetForScheduling(routeSet)
	channel, event, err := r.Router.OpenChannel(spec, routeSet, now)
	if err != nil {
		return FabricChannelRequestResponseResult{}, err
	}
	result := FabricChannelRequestResponseResult{
		FabricChannelRuntimeResult: FabricChannelRuntimeResult{Channel: channel, RouteEvents: []FabricChannelRouteEvent{event}},
	}
	sequence := uint64(1)
	for {
		// Stop before touching any route once the caller has given up.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return result, ctxErr
		}
		// Refresh health and pressure so each attempt sees current state.
		routeSet = r.routeSetForScheduling(routeSet)
		route, ok := findFabricRoute(routeSet, channel.RouteID)
		if !ok {
			return result, ErrFabricRouteNotFound
		}
		result.RouteAttempts = append(result.RouteAttempts, route.RouteID)
		target, err := FabricTransportTargetForRoute(route)
		if err != nil {
			return result, err
		}
		releaseRoute := r.acquireRoute(route.RouteID)
		session, err := r.Transport.Connect(ctx, target)
		if err != nil {
			releaseRoute()
			r.markRouteFailure(route.RouteID, err)
			updated, routeEvent, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID:  spec.ChannelID,
				RouteID:    route.RouteID,
				Failed:     true,
				Reason:     "connect_failed",
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			channel = updated
			result.Channel = channel
			if routeEvent.Type == FabricChannelRouteEventReroute {
				result.RouteEvents = append(result.RouteEvents, routeEvent)
				result.MigrationEvents++
				continue
			}
			if rerouteErr != nil {
				return result, rerouteErr
			}
			return result, err
		}
		response, ackMs, sendErr := r.sendRequestResponseOnSession(ctx, session, route.RouteID, spec.ChannelID, payload, sequence)
		// Best-effort close: the exchange outcome is already known.
		_ = session.Close()
		releaseRoute()
		result.Channel = channel
		if sendErr == nil {
			r.markRouteSuccess(route.RouteID)
			result.BytesSent += uint64(len(payload))
			result.FramesSent++
			result.BytesRecv += uint64(len(response))
			result.FramesRecv++
			result.AcksReceived++
			// The observation error is deliberately dropped: it can only mean
			// that a wanted reroute found no alternative, which must not turn
			// an already completed exchange into a failure. On error the
			// returned event is the zero value, so no reroute is recorded.
			updated, routeEvent, _ := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID:    spec.ChannelID,
				RouteID:      route.RouteID,
				AckLatencyMs: ackMs,
				BytesSent:    uint64(len(payload)),
				FramesSent:   1,
				BytesRecv:    uint64(len(response)),
				FramesRecv:   1,
				ObservedAt:   time.Now().UTC(),
			}, time.Now().UTC())
			channel = updated
			result.Channel = channel
			if routeEvent.Type == FabricChannelRouteEventReroute {
				result.RouteEvents = append(result.RouteEvents, routeEvent)
				result.MigrationEvents++
			}
			result.ResponsePayload = response
			result.RoutePressure = r.snapshotRoutePressure()
			result.RouteHealth = r.snapshotRouteHealth()
			return result, nil
		}
		r.markRouteFailure(route.RouteID, sendErr)
		updated, routeEvent, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
			ChannelID:  spec.ChannelID,
			RouteID:    route.RouteID,
			Failed:     true,
			Reason:     "response_failed",
			ObservedAt: time.Now().UTC(),
		}, time.Now().UTC())
		channel = updated
		result.Channel = channel
		if routeEvent.Type == FabricChannelRouteEventReroute {
			result.RouteEvents = append(result.RouteEvents, routeEvent)
			result.MigrationEvents++
			continue
		}
		if rerouteErr != nil {
			return result, rerouteErr
		}
		return result, sendErr
	}
}
// routeSetForScheduling prepares routeSet for a scheduling decision. When a
// health tracker is configured, its Apply result (evaluated at the current UTC
// time) replaces the input; the outcome is then handed to
// routeSetWithActiveChannels. It tolerates a nil receiver.
func (r *FabricChannelRuntime) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
	if r == nil || r.Health == nil {
		return r.routeSetWithActiveChannels(routeSet)
	}
	adjusted := r.Health.Apply(routeSet, time.Now().UTC())
	return r.routeSetWithActiveChannels(adjusted)
}
// routeSetWithActiveChannels returns the pressure tracker's Apply result for
// routeSet, or routeSet unchanged when the receiver or its Pressure tracker is
// nil.
func (r *FabricChannelRuntime) routeSetWithActiveChannels(routeSet FabricRouteSet) FabricRouteSet {
	if r != nil && r.Pressure != nil {
		return r.Pressure.Apply(routeSet)
	}
	return routeSet
}
// acquireRoute registers use of routeID with the pressure tracker and returns
// the release function that tracker hands back. Without a tracker (or with a
// nil receiver) the returned function is a no-op, so callers can always invoke
// it.
func (r *FabricChannelRuntime) acquireRoute(routeID string) func() {
	if r != nil && r.Pressure != nil {
		return r.Pressure.Acquire(routeID)
	}
	return func() {}
}
// snapshotRoutePressure returns the pressure tracker's current snapshot, or a
// zero-value snapshot when no tracker is available.
func (r *FabricChannelRuntime) snapshotRoutePressure() FabricRoutePressureSnapshot {
	if r != nil && r.Pressure != nil {
		return r.Pressure.SnapshotPressure()
	}
	return FabricRoutePressureSnapshot{}
}
// snapshotRouteHealth returns the health tracker's snapshot taken at the
// current UTC time, or a zero-value snapshot when no tracker is available.
func (r *FabricChannelRuntime) snapshotRouteHealth() FabricRouteHealthSnapshot {
	if r != nil && r.Health != nil {
		return r.Health.Snapshot(time.Now().UTC())
	}
	return FabricRouteHealthSnapshot{}
}
// markRouteFailure reports a failure on routeID to the health tracker, using
// err's message as the reason and the current UTC time as the timestamp. It is
// a no-op when err is nil or no tracker is configured.
func (r *FabricChannelRuntime) markRouteFailure(routeID string, err error) {
	if r != nil && r.Health != nil && err != nil {
		r.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
	}
}
// markRouteSuccess reports a success on routeID to the health tracker; it is a
// no-op when the receiver or its Health tracker is nil.
func (r *FabricChannelRuntime) markRouteSuccess(routeID string) {
	if r != nil && r.Health != nil {
		r.Health.MarkSuccess(routeID)
	}
}
// sendOnSession streams payloads[*index:] over an already-connected session,
// waiting for an ack after every data frame.
//
// It opens the stream, then for each payload: rejects payloads larger than
// Config.MaxPayload, increments *sequence, sends the data frame, waits for the
// matching ack (bounded by Config.Timeout), and reports the observation to the
// router. *index only advances after a payload is acknowledged, so an
// unacknowledged payload is re-sent by whoever resumes from *index. *channel is
// overwritten with the router's updated channel after every observation, and
// result accumulates byte/frame/ack counters and reroute events.
//
// Return values:
//   - (true, nil): the router emitted a reroute event; the transfer is not
//     finished on this session.
//   - (false, nil): every payload was acknowledged and the close-stream frame
//     was attempted.
//   - (false, err): sending failed, a payload was too large, the router
//     returned an error, or an ack failed with no reroute available.
func (r *FabricChannelRuntime) sendOnSession(ctx context.Context, session FabricTransportSession, channel *FabricChannel, routeSet FabricRouteSet, route FabricRoute, payloads [][]byte, index *int, sequence *uint64, result *FabricChannelRuntimeResult) (bool, error) {
	cfg := r.Config
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameOpenStream,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
	}); err != nil {
		r.markRouteFailure(route.RouteID, err)
		return false, err
	}
	for *index < len(payloads) {
		payload := payloads[*index]
		if len(payload) > cfg.MaxPayload {
			return false, fmt.Errorf("%w: %d > %d", fabricproto.ErrInvalidPayloadLen, len(payload), cfg.MaxPayload)
		}
		// The sequence is consumed even if the send below fails, so a retry on
		// another session never reuses a sequence number.
		(*sequence)++
		if err := session.Send(ctx, fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: cfg.TrafficClass,
			StreamID:     cfg.StreamID,
			Sequence:     *sequence,
			Payload:      payload,
		}); err != nil {
			r.markRouteFailure(route.RouteID, err)
			return false, err
		}
		ackOK, ackMs := waitForFabricRuntimeAck(ctx, session, cfg.StreamID, *sequence, cfg.Timeout)
		if !ackOK {
			ackErr := fmt.Errorf("ack_failed")
			r.markRouteFailure(route.RouteID, ackErr)
			updated, event, err := r.Router.ObserveChannel(*channel, routeSet, FabricChannelObservation{
				ChannelID:  channel.Spec.ChannelID,
				RouteID:    route.RouteID,
				Failed:     true,
				Reason:     "ack_failed",
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			*channel = updated
			if event.Type == FabricChannelRouteEventReroute {
				result.RouteEvents = append(result.RouteEvents, event)
				result.MigrationEvents++
				return true, nil
			}
			if err != nil {
				return false, err
			}
			// No reroute and no router error: the payload is still
			// unacknowledged, so report the ack failure instead of returning
			// (false, nil), which is indistinguishable from a completed send.
			return false, ackErr
		}
		r.markRouteSuccess(route.RouteID)
		*index++
		result.BytesSent += uint64(len(payload))
		result.FramesSent++
		result.AcksReceived++
		updated, event, err := r.Router.ObserveChannel(*channel, routeSet, FabricChannelObservation{
			ChannelID:    channel.Spec.ChannelID,
			RouteID:      route.RouteID,
			AckLatencyMs: ackMs,
			BytesSent:    uint64(len(payload)),
			FramesSent:   1,
			ObservedAt:   time.Now().UTC(),
		}, time.Now().UTC())
		*channel = updated
		if err != nil {
			return false, err
		}
		if event.Type == FabricChannelRouteEventReroute {
			result.RouteEvents = append(result.RouteEvents, event)
			result.MigrationEvents++
			return true, nil
		}
	}
	// Best-effort stream close: every payload is already acknowledged, so a
	// failure here is deliberately ignored. A background context is used so the
	// close is still attempted if ctx has expired.
	_ = session.Send(context.Background(), fabricproto.Frame{
		Type:         fabricproto.FrameCloseStream,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
	})
	return false, nil
}
// sendRequestResponseOnSession performs one request/response exchange on an
// already-connected session: it opens the stream, sends payload as a data
// frame with the given sequence, and waits for a data frame on the same stream
// with the same sequence.
//
// It returns a copy of the response payload and the elapsed milliseconds
// between sending the request and receiving the response. The wait is bounded
// by Config.Timeout when that is positive, and otherwise only by ctx. A closed
// Frames or Errors channel yields ErrForwardPeerUnavailable; frames that do not
// match are skipped.
//
// Route health is intentionally not updated here. The caller visible in this
// file records a route failure for any error this function returns, so marking
// send failures here as well counted each of them twice. routeID and channelID
// are kept for signature compatibility.
// NOTE(review): confirm any other caller also records route health on error.
func (r *FabricChannelRuntime) sendRequestResponseOnSession(ctx context.Context, session FabricTransportSession, routeID string, channelID string, payload []byte, sequence uint64) ([]byte, int64, error) {
	cfg := r.Config
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameOpenStream,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
	}); err != nil {
		return nil, 0, err
	}
	started := time.Now()
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
		Sequence:     sequence,
		Payload:      payload,
	}); err != nil {
		return nil, 0, err
	}
	waitCtx := ctx
	if cfg.Timeout > 0 {
		var cancel context.CancelFunc
		waitCtx, cancel = context.WithTimeout(ctx, cfg.Timeout)
		defer cancel()
	}
	for {
		select {
		case <-waitCtx.Done():
			return nil, 0, waitCtx.Err()
		case err, ok := <-session.Errors():
			if !ok {
				return nil, 0, ErrForwardPeerUnavailable
			}
			if err != nil {
				return nil, 0, err
			}
		case frame, ok := <-session.Frames():
			if !ok {
				return nil, 0, ErrForwardPeerUnavailable
			}
			if frame.Type != fabricproto.FrameData || frame.StreamID != cfg.StreamID || frame.Sequence != sequence {
				continue
			}
			// Best-effort stream close: the response is already in hand, so a
			// failure to send the close frame is deliberately ignored.
			_ = session.Send(context.Background(), fabricproto.Frame{
				Type:         fabricproto.FrameCloseStream,
				TrafficClass: cfg.TrafficClass,
				StreamID:     cfg.StreamID,
			})
			return append([]byte(nil), frame.Payload...), time.Since(started).Milliseconds(), nil
		}
	}
}
// FabricTransportTargetForRoute picks the hop of route that should be dialed.
//
// For routes with RelayCount > 0 the first relay-mode hop that has an address
// wins. Otherwise, or when no relay hop is addressable, the hops are scanned
// from last to first and the first addressable one is used. A blank RouteID
// yields ErrFabricRouteNotFound; a route with no addressable hop yields an
// error wrapping ErrFabricRouteNotFound.
func FabricTransportTargetForRoute(route FabricRoute) (FabricTransportTarget, error) {
	if strings.TrimSpace(route.RouteID) == "" {
		return FabricTransportTarget{}, ErrFabricRouteNotFound
	}
	if route.RelayCount > 0 {
		for i := range route.Hops {
			candidate := route.Hops[i]
			if candidate.Mode == FabricRouteRelay {
				if target, ok := fabricTransportTargetForHop(candidate); ok {
					return target, nil
				}
			}
		}
	}
	for remaining := len(route.Hops); remaining > 0; remaining-- {
		target, ok := fabricTransportTargetForHop(route.Hops[remaining-1])
		if ok {
			return target, nil
		}
	}
	return FabricTransportTarget{}, fmt.Errorf("%w: route %s has no transport endpoint", ErrFabricRouteNotFound, route.RouteID)
}
// fabricTransportTargetForHop converts hop into a transport target. The second
// result is false when the hop has no (non-blank) address. The transport name
// is the hop's mode string, falling back to "quic" when the mode is empty.
// NodeID, Address and PeerCertSHA256 are whitespace-trimmed; EndpointID is
// copied as-is.
func fabricTransportTargetForHop(hop FabricRouteHop) (FabricTransportTarget, bool) {
	address := strings.TrimSpace(hop.Address)
	if address == "" {
		return FabricTransportTarget{}, false
	}
	target := FabricTransportTarget{
		EndpointID:     hop.EndpointID,
		PeerID:         strings.TrimSpace(hop.NodeID),
		Endpoint:       address,
		Transport:      "quic",
		PeerCertSHA256: strings.TrimSpace(hop.PeerCertSHA256),
	}
	if mode := string(hop.Mode); mode != "" {
		target.Transport = mode
	}
	return target, true
}
// waitForFabricRuntimeAck blocks until an ack frame for (streamID, sequence)
// arrives on session. It returns true plus the elapsed milliseconds on
// success. It returns (false, 0) when the context ends, the optional timeout
// elapses, the session reports a non-nil error, or either session channel is
// closed. Non-matching frames and nil errors are skipped. A non-positive
// timeout leaves the wait bounded only by ctx.
func waitForFabricRuntimeAck(ctx context.Context, session FabricTransportSession, streamID uint64, sequence uint64, timeout time.Duration) (bool, int64) {
	begin := time.Now()
	waitCtx := ctx
	if timeout > 0 {
		bounded, cancel := context.WithTimeout(ctx, timeout)
		defer cancel()
		waitCtx = bounded
	}
	for {
		select {
		case <-waitCtx.Done():
			return false, 0
		case sessionErr, open := <-session.Errors():
			if sessionErr != nil || !open {
				return false, 0
			}
		case frame, open := <-session.Frames():
			if !open {
				return false, 0
			}
			matches := frame.Type == fabricproto.FrameAck &&
				frame.StreamID == streamID &&
				frame.Sequence == sequence
			if matches {
				return true, time.Since(begin).Milliseconds()
			}
		}
	}
}
@@ -0,0 +1,495 @@
package mesh
import (
"context"
"strings"
"sync"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestFabricChannelRuntimeMigratesSlowAckToStandbyRoute sends three payloads
// over a primary route whose fake transport delays acks by 60ms while the
// router is configured with MaxAckLatencyMs of 30. It expects exactly one
// migration to the standby route, all payloads acknowledged, one connect per
// endpoint, and balanced pressure acquire/release counts.
func TestFabricChannelRuntimeMigratesSlowAckToStandbyRoute(t *testing.T) {
	const (
		slowEndpoint = "quic://slow.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
		slowEndpoint: 60 * time.Millisecond,
		fastEndpoint: 0,
	})
	rt := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-slow", "node-b", slowEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}
	payloads := [][]byte{[]byte("one"), []byte("two"), []byte("three")}

	result, err := rt.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, payloads)
	if err != nil {
		t.Fatalf("send reliable: %v", err)
	}
	if result.MigrationEvents != 1 {
		t.Fatalf("migration events = %d, want 1: %+v", result.MigrationEvents, result.RouteEvents)
	}
	onFastRoute := result.Channel.RouteID == "route-fast"
	if !onFastRoute || result.Channel.RerouteCount != 1 {
		t.Fatalf("channel = %+v", result.Channel)
	}
	wantBytes := uint64(len("one") + len("two") + len("three"))
	if result.BytesSent != wantBytes || result.AcksReceived != 3 {
		t.Fatalf("result = %+v", result)
	}
	if got := transport.connectCount(slowEndpoint); got != 1 {
		t.Fatalf("slow connect count = %d, want 1", got)
	}
	if got := transport.connectCount(fastEndpoint); got != 1 {
		t.Fatalf("fast connect count = %d, want 1", got)
	}
	pressure := result.RoutePressure
	balanced := pressure.AcquiredTotal == 2 && pressure.ReleasedTotal == 2
	if !balanced || pressure.MaxActiveTotal == 0 {
		t.Fatalf("route pressure = %+v", result.RoutePressure)
	}
}
// TestFabricChannelRuntimeReroutesOnConnectFailure makes the primary route's
// endpoint fail on connect and expects SendReliable to migrate once to the
// standby route and deliver the whole payload there.
func TestFabricChannelRuntimeReroutesOnConnectFailure(t *testing.T) {
	const (
		deadEndpoint = "quic://dead.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{fastEndpoint: 0})
	transport.failConnect[deadEndpoint] = true
	rt := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-dead", "node-b", deadEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}

	result, err := rt.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("payload")})
	if err != nil {
		t.Fatalf("send reliable: %v", err)
	}
	migrated := result.MigrationEvents == 1 && result.Channel.RouteID == "route-fast"
	delivered := result.BytesSent == uint64(len("payload"))
	if !migrated || !delivered {
		t.Fatalf("result = %+v", result)
	}
}
// TestFabricChannelRuntimeQuarantinesFailedRouteAcrossChannels runs two sends
// through one runtime configured with a one-minute RouteHealthTTL. The first
// send hits a connect failure on the primary and records one failure for it;
// the second send must go straight to the standby without dialing the failed
// endpoint again.
func TestFabricChannelRuntimeQuarantinesFailedRouteAcrossChannels(t *testing.T) {
	const (
		deadEndpoint = "quic://dead.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{fastEndpoint: 0})
	transport.failConnect[deadEndpoint] = true
	rt := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig:   FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:       9,
		RouteHealthTTL: time.Minute,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-dead", "node-b", deadEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}

	first, err := rt.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("first")})
	if err != nil {
		t.Fatalf("first send reliable: %v", err)
	}
	firstOnFast := first.Channel.RouteID == "route-fast"
	if !firstOnFast || first.RouteHealth.Quarantined["route-dead"].Failures != 1 {
		t.Fatalf("first result = %+v", first)
	}

	second, err := rt.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("second")})
	if err != nil {
		t.Fatalf("second send reliable: %v", err)
	}
	if second.Channel.RouteID != "route-fast" {
		t.Fatalf("second route = %s, want route-fast", second.Channel.RouteID)
	}

	if got := transport.connectCount(deadEndpoint); got != 1 {
		t.Fatalf("dead connect count = %d, want one attempt before quarantine", got)
	}
	if got := transport.connectCount(fastEndpoint); got != 2 {
		t.Fatalf("fast connect count = %d, want both channels on healthy route", got)
	}
}
// TestFabricChannelRuntimeReroutesOnAckTimeout uses a primary route whose acks
// arrive after 100ms while the runtime waits only 10ms. It expects one
// migration to the standby route and the payload delivered there.
func TestFabricChannelRuntimeReroutesOnAckTimeout(t *testing.T) {
	const (
		slowEndpoint = "quic://slow.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
		slowEndpoint: 100 * time.Millisecond,
		fastEndpoint: 0,
	})
	rt := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
		Timeout:      10 * time.Millisecond,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-slow", "node-b", slowEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}

	result, err := rt.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("payload")})
	if err != nil {
		t.Fatalf("send reliable: %v", err)
	}
	migrated := result.MigrationEvents == 1 && result.Channel.RouteID == "route-fast"
	delivered := result.BytesSent == uint64(len("payload"))
	if !migrated || !delivered {
		t.Fatalf("result = %+v", result)
	}
}
// TestFabricChannelRuntimeSpreadsConcurrentChannelsBySharedPressure starts one
// send in the background on route-a (whose ack is delayed by 80ms), waits until
// that send has connected, and then starts a second send on the same runtime.
// The second send is expected to land on route-b.
func TestFabricChannelRuntimeSpreadsConcurrentChannelsBySharedPressure(t *testing.T) {
	const (
		endpointA = "quic://route-a.example.test:19443"
		endpointB = "quic://route-b.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
		endpointA: 80 * time.Millisecond,
		endpointB: 0,
	})
	rt := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{StreamID: 9})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-a", "node-b", endpointA, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-b", "node-b", endpointB, 11)},
	}

	// Buffered so the background sender never blocks on reporting its result.
	firstDone := make(chan error, 1)
	go func() {
		_, sendErr := rt.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("one")})
		firstDone <- sendErr
	}()
	// The first channel must hold route-a before the second send is scheduled.
	transport.waitForConnect(t, endpointA, 1)

	result, err := rt.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("two")})
	if err != nil {
		t.Fatalf("second send reliable: %v", err)
	}
	if result.Channel.RouteID != "route-b" {
		t.Fatalf("second route = %s, want route-b", result.Channel.RouteID)
	}
	if got := transport.connectCount(endpointB); got != 1 {
		t.Fatalf("route-b connect count = %d, want 1", got)
	}
	if firstErr := <-firstDone; firstErr != nil {
		t.Fatalf("first send reliable: %v", firstErr)
	}
}
// TestFabricChannelRuntimeRequestResponseReturnsPayload performs one
// request/response exchange against a pool-targeted route and checks the
// returned payload, the chosen route, and the byte/frame/ack counters.
func TestFabricChannelRuntimeRequestResponseReturnsPayload(t *testing.T) {
	const (
		endpoint     = "quic://runtime.example.test:19443"
		requestBody  = `{"request":true}`
		responseBody = `{"status":"ok"}`
	)
	transport := newFakeFabricRequestResponseTransport(map[string][]byte{
		endpoint: []byte(responseBody),
	})
	rt := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
	})
	routeSet := FabricRouteSet{
		TargetKind: FabricChannelTargetPool,
		TargetID:   "pool-admin-runtime",
		Primary:    testRuntimePoolRoute("route-runtime", "pool-admin-runtime", "node-runtime", endpoint, 10),
	}
	spec := FabricChannelSpec{
		ChannelID:    "channel-web-1",
		ClusterID:    "cluster-1",
		SourceNodeID: "node-a",
		TargetKind:   FabricChannelTargetPool,
		TargetID:     "pool-admin-runtime",
		TrafficClass: "control",
		CreatedAt:    time.Now().UTC(),
	}

	result, err := rt.SendRequestResponse(context.Background(), spec, routeSet, []byte(requestBody))
	if err != nil {
		t.Fatalf("request response: %v", err)
	}
	if string(result.ResponsePayload) != responseBody {
		t.Fatalf("response payload = %s", string(result.ResponsePayload))
	}
	routeOK := result.Channel.RouteID == "route-runtime"
	sentOK := result.BytesSent == uint64(len(requestBody)) && result.FramesSent == 1
	recvOK := result.BytesRecv == uint64(len(responseBody)) && result.FramesRecv == 1
	if !routeOK || !sentOK || !recvOK || result.AcksReceived != 1 {
		t.Fatalf("result = %+v", result)
	}
}
// TestFabricChannelRuntimeRequestResponseReroutesOnResponseFailure configures
// the primary endpoint to never answer, with a 10ms runtime timeout. It expects
// the exchange to migrate once to the standby route and return that route's
// response.
func TestFabricChannelRuntimeRequestResponseReroutesOnResponseFailure(t *testing.T) {
	const (
		slowEndpoint = "quic://slow.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
		responseBody = `{"status":"ok"}`
	)
	transport := newFakeFabricRequestResponseTransport(map[string][]byte{
		fastEndpoint: []byte(responseBody),
	})
	transport.failResponse[slowEndpoint] = true
	rt := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
		Timeout:      10 * time.Millisecond,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-runtime",
		Primary:     testRuntimeRoute("route-slow", "node-runtime", slowEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-runtime", fastEndpoint, 20)},
	}

	result, err := rt.SendRequestResponse(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-runtime"), routeSet, []byte(`{"request":true}`))
	if err != nil {
		t.Fatalf("request response: %v", err)
	}
	migrated := result.MigrationEvents == 1 && result.Channel.RouteID == "route-fast"
	if !migrated || string(result.ResponsePayload) != responseBody {
		t.Fatalf("result = %+v", result)
	}
}
func TestFabricTransportTargetForRouteUsesLastAddressedHop(t *testing.T) {
target, err := FabricTransportTargetForRoute(FabricRoute{
RouteID: "route-1",
Hops: []FabricRouteHop{
{NodeID: "node-a"},
{NodeID: "node-r", Mode: FabricRouteRelay, EndpointID: "relay-1", Address: "quic://relay.example.test:19443"},
{NodeID: "node-b", Mode: FabricRouteDirect, EndpointID: "node-b-quic", Address: "quic://node-b.example.test:19443"},
},
})
if err != nil {
t.Fatalf("target for route: %v", err)
}
if target.PeerID != "node-b" || target.EndpointID != "node-b-quic" || target.Endpoint != "quic://node-b.example.test:19443" || target.Transport != string(FabricRouteDirect) {
t.Fatalf("target = %+v", target)
}
}
// fakeFabricRequestResponseTransport is a test transport whose sessions answer
// each data frame with a canned per-endpoint response.
type fakeFabricRequestResponseTransport struct {
// mu guards the maps below while Connect reads and updates them.
mu sync.Mutex
// responses maps an endpoint to the payload its sessions reply with.
responses map[string][]byte
// failResponse marks endpoints whose sessions never reply.
failResponse map[string]bool
// connects counts Connect calls per endpoint.
connects map[string]int
}
// newFakeFabricRequestResponseTransport builds a fake transport that serves the
// given endpoint-to-response table. The table is stored as passed (not
// copied); the failure and connect-count maps start empty.
func newFakeFabricRequestResponseTransport(responses map[string][]byte) *fakeFabricRequestResponseTransport {
	transport := new(fakeFabricRequestResponseTransport)
	transport.responses = responses
	transport.failResponse = map[string]bool{}
	transport.connects = map[string]int{}
	return transport
}
// Connect records a connect attempt for the target endpoint and returns a new
// fake session carrying a private copy of that endpoint's canned response and
// its fail-response flag. It never returns an error.
func (t *fakeFabricRequestResponseTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	session := &fakeFabricRequestResponseSession{
		frames: make(chan fabricproto.Frame, 16),
		errors: make(chan error, 1),
		done:   make(chan struct{}),
	}
	key := target.Endpoint

	t.mu.Lock()
	defer t.mu.Unlock()
	t.connects[key]++
	session.response = append([]byte(nil), t.responses[key]...)
	session.failResponse = t.failResponse[key]
	return session, nil
}
// Close is a no-op for the fake transport and always returns nil.
func (t *fakeFabricRequestResponseTransport) Close() error {
return nil
}
// fakeFabricRequestResponseSession is the session handed out by
// fakeFabricRequestResponseTransport.
type fakeFabricRequestResponseSession struct {
// response is the payload echoed back for every data frame.
response []byte
// failResponse suppresses the reply entirely when true.
failResponse bool
// frames delivers reply frames to the code under test.
frames chan fabricproto.Frame
// errors is exposed via Errors; nothing in this fake writes to it.
errors chan error
// done is closed by Close so pending reply goroutines can exit.
done chan struct{}
// once makes Close safe to call more than once.
once sync.Once
}
// Send accepts every frame. For data frames, unless the session is configured
// to withhold responses, it asynchronously queues a data frame that mirrors
// the request's traffic class, stream ID and sequence and carries a copy of
// the canned response. The background goroutine gives up once the session is
// closed.
func (s *fakeFabricRequestResponseSession) Send(_ context.Context, frame fabricproto.Frame) error {
	if s.failResponse || frame.Type != fabricproto.FrameData {
		return nil
	}
	reply := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: frame.TrafficClass,
		StreamID:     frame.StreamID,
		Sequence:     frame.Sequence,
		Payload:      append([]byte(nil), s.response...),
	}
	go func() {
		select {
		case s.frames <- reply:
		case <-s.done:
		}
	}()
	return nil
}
// Frames returns the receive-only view of the channel on which reply frames
// are delivered.
func (s *fakeFabricRequestResponseSession) Frames() <-chan fabricproto.Frame {
return s.frames
}
// Errors returns the session's error channel; this fake never sends on it.
func (s *fakeFabricRequestResponseSession) Errors() <-chan error {
return s.errors
}
// Close marks the session closed by closing its done channel exactly once;
// repeated calls are harmless. It always returns nil.
func (s *fakeFabricRequestResponseSession) Close() error {
	signalDone := func() { close(s.done) }
	s.once.Do(signalDone)
	return nil
}
// Closed reports, without blocking, whether Close has been called.
func (s *fakeFabricRequestResponseSession) Closed() bool {
	select {
	case <-s.done:
		return true
	default:
	}
	return false
}
func TestFabricTransportTargetForRouteUsesRelayHopForRelayRoute(t *testing.T) {
target, err := FabricTransportTargetForRoute(FabricRoute{
RouteID: "route-relay",
RelayCount: 1,
Hops: []FabricRouteHop{
{NodeID: "node-a"},
{NodeID: "node-r", Mode: FabricRouteRelay, EndpointID: "relay-1", Address: "quic://relay.example.test:19443", PeerCertSHA256: "relay-cert"},
{NodeID: "node-b", Mode: FabricRouteRelay, EndpointID: "node-b-private", Address: "quic://10.0.0.2:19443", PeerCertSHA256: "node-b-cert"},
},
})
if err != nil {
t.Fatalf("target for relay route: %v", err)
}
if target.PeerID != "node-r" || target.EndpointID != "relay-1" || target.Endpoint != "quic://relay.example.test:19443" || target.PeerCertSHA256 != "relay-cert" {
t.Fatalf("target = %+v", target)
}
}
// fakeFabricRuntimeTransport is a test transport whose sessions acknowledge
// data frames after a configurable per-endpoint delay.
type fakeFabricRuntimeTransport struct {
// mu guards the maps below inside Connect, connectCount and waitForConnect.
mu sync.Mutex
// delays maps an endpoint to the delay applied before each ack.
delays map[string]time.Duration
// failConnect marks endpoints for which Connect returns an error.
failConnect map[string]bool
// connects counts Connect calls per endpoint, including failed ones.
connects map[string]int
}
// newFakeFabricRuntimeTransport builds a fake transport using the given
// endpoint-to-ack-delay table. The table is stored as passed (not copied); the
// connect-failure and connect-count maps start empty.
func newFakeFabricRuntimeTransport(delays map[string]time.Duration) *fakeFabricRuntimeTransport {
	transport := new(fakeFabricRuntimeTransport)
	transport.delays = delays
	transport.failConnect = map[string]bool{}
	transport.connects = map[string]int{}
	return transport
}
// Connect records a connect attempt for the target endpoint. Endpoints flagged
// in failConnect yield ErrForwardPeerUnavailable; otherwise a new fake session
// configured with the endpoint's ack delay is returned.
func (t *fakeFabricRuntimeTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	key := target.Endpoint

	t.mu.Lock()
	t.connects[key]++
	shouldFail, ackDelay := t.failConnect[key], t.delays[key]
	t.mu.Unlock()

	if shouldFail {
		return nil, ErrForwardPeerUnavailable
	}
	session := &fakeFabricRuntimeSession{
		endpoint: key,
		delay:    ackDelay,
		frames:   make(chan fabricproto.Frame, 64),
		errors:   make(chan error, 1),
		done:     make(chan struct{}),
	}
	return session, nil
}
// Close is a no-op for the fake transport and always returns nil.
func (t *fakeFabricRuntimeTransport) Close() error {
return nil
}
// connectCount returns how many times Connect has been called for endpoint.
func (t *fakeFabricRuntimeTransport) connectCount(endpoint string) int {
	t.mu.Lock()
	count := t.connects[endpoint]
	t.mu.Unlock()
	return count
}
// waitForConnect polls once per millisecond until endpoint has seen at least
// count Connect calls, failing the test via tb.Fatalf if that does not happen
// within one second.
func (t *fakeFabricRuntimeTransport) waitForConnect(tb testing.TB, endpoint string, count int) {
	tb.Helper()
	giveUpAt := time.Now().Add(time.Second)
	for {
		seen := t.connectCount(endpoint)
		if seen >= count {
			return
		}
		if time.Now().After(giveUpAt) {
			tb.Fatalf("timed out waiting for %s connect count %d, got %d", endpoint, count, seen)
		}
		time.Sleep(time.Millisecond)
	}
}
// fakeFabricRuntimeSession is the session handed out by
// fakeFabricRuntimeTransport.
type fakeFabricRuntimeSession struct {
// endpoint is the address this session was created for.
endpoint string
// delay is how long Send waits before queueing each ack.
delay time.Duration
// frames delivers ack frames to the code under test.
frames chan fabricproto.Frame
// errors is exposed via Errors; nothing in this fake writes to it.
errors chan error
// done is closed by Close so pending ack goroutines can exit.
done chan struct{}
// once makes Close safe to call more than once.
once sync.Once
}
// Send accepts every frame. For data frames it asynchronously queues an ack
// that mirrors the frame's traffic class, stream ID and sequence, after
// sleeping for the session's configured delay. The background goroutine drops
// the ack if the session has been closed by the time it tries to deliver.
func (s *fakeFabricRuntimeSession) Send(_ context.Context, frame fabricproto.Frame) error {
	if frame.Type != fabricproto.FrameData {
		return nil
	}
	ack := fabricproto.Frame{
		Type:         fabricproto.FrameAck,
		TrafficClass: frame.TrafficClass,
		StreamID:     frame.StreamID,
		Sequence:     frame.Sequence,
	}
	wait := s.delay
	go func() {
		if wait > 0 {
			time.Sleep(wait)
		}
		select {
		case s.frames <- ack:
		case <-s.done:
		}
	}()
	return nil
}
// Frames returns the receive-only view of the channel on which ack frames are
// delivered.
func (s *fakeFabricRuntimeSession) Frames() <-chan fabricproto.Frame {
return s.frames
}
// Errors returns the session's error channel; this fake never sends on it.
func (s *fakeFabricRuntimeSession) Errors() <-chan error {
return s.errors
}
// Close marks the session closed by closing its done channel exactly once;
// repeated calls are harmless. It always returns nil.
func (s *fakeFabricRuntimeSession) Close() error {
	signalDone := func() { close(s.done) }
	s.once.Do(signalDone)
	return nil
}
// Closed reports, without blocking, whether Close has been called.
func (s *fakeFabricRuntimeSession) Closed() bool {
	select {
	case <-s.done:
		return true
	default:
	}
	return false
}
// testRuntimeRoute builds a route via testFabricRoute and then rewrites its
// final hop so it is dialable: the hop gets the given endpoint as address, an
// endpoint ID derived from routeID with the "route-" prefix removed, and the
// direct route mode.
func testRuntimeRoute(routeID string, destination string, endpoint string, latency int) FabricRoute {
	route := testFabricRoute(routeID, destination, latency, 100, 0, true)
	lastHop := &route.Hops[len(route.Hops)-1]
	lastHop.Address = endpoint
	lastHop.EndpointID = strings.TrimPrefix(routeID, "route-")
	lastHop.Mode = FabricRouteDirect
	return route
}
// testRuntimePoolRoute is testRuntimeRoute with the route's PoolID set to
// poolID.
func testRuntimePoolRoute(routeID string, poolID string, destination string, endpoint string, latency int) FabricRoute {
	pooled := testRuntimeRoute(routeID, destination, endpoint, latency)
	pooled.PoolID = poolID
	return pooled
}
@@ -0,0 +1,390 @@
package mesh
import (
"errors"
"sort"
"strings"
"time"
)
// FabricChannelTargetKind names the kind of destination a fabric channel
// addresses.
type FabricChannelTargetKind string
const (
// FabricChannelTargetNode addresses a single node; route filtering compares
// the spec's TargetID with a route's DestinationNodeID.
FabricChannelTargetNode FabricChannelTargetKind = "node"
// FabricChannelTargetPool addresses a pool; route filtering compares the
// spec's TargetID with a route's PoolID.
FabricChannelTargetPool FabricChannelTargetKind = "pool"
)
// FabricChannelLifecycleState is the lifecycle phase stored in
// FabricChannel.State.
type FabricChannelLifecycleState string
// Lifecycle phases a channel can be in.
// NOTE(review): the transitions between them are not defined in this file.
const (
FabricChannelOpening FabricChannelLifecycleState = "opening"
FabricChannelOpen FabricChannelLifecycleState = "open"
FabricChannelDraining FabricChannelLifecycleState = "draining"
FabricChannelClosed FabricChannelLifecycleState = "closed"
)
// FabricRouteMode identifies how a route hop or adjacency is reached.
type FabricRouteMode string
// Known route modes. FabricRouteRelay is the only one given special treatment
// in this file's visible code; the others are carried as plain labels.
const (
FabricRouteDirect FabricRouteMode = "direct_quic"
FabricRouteLAN FabricRouteMode = "lan_quic"
FabricRouteReverse FabricRouteMode = "reverse_quic"
FabricRouteRelay FabricRouteMode = "relay_quic"
FabricRouteICE FabricRouteMode = "ice_quic"
)
// Sentinel errors for channel validation and route selection; compare with
// errors.Is.
var (
// ErrFabricChannelInvalid is returned when a channel spec is missing a
// required field or has an unknown target kind.
ErrFabricChannelInvalid = errors.New("fabric channel request is invalid")
// ErrFabricRouteNotFound is returned when no usable route is available.
ErrFabricRouteNotFound = errors.New("fabric route not found")
)
// FabricChannelSpec describes a channel to be opened. ValidateFabricChannelSpec
// requires ChannelID, ClusterID, SourceNodeID and TargetID to be non-blank and
// TargetKind to be node or pool.
type FabricChannelSpec struct {
ChannelID string
ClusterID string
SourceNodeID string
TargetKind FabricChannelTargetKind
TargetID string
TrafficClass string
// MinBandwidth is not consulted by the scheduler code in this file.
// NOTE(review): unit and consumer to be confirmed.
MinBandwidth int64
// StickyKey is set to the request's ResourceID when the spec is derived from
// a service request.
StickyKey string
CreatedAt time.Time
// ForbiddenHops lists node IDs that a usable route must not traverse.
ForbiddenHops []string
}
// FabricServiceChannelTarget is the destination part of a service channel
// request. When converted to a FabricChannelSpec, an empty Kind defaults to
// pool, and the target ID is taken from PoolIDs, SelectedNodeID and NodeIDs
// (node targets prefer SelectedNodeID and NodeIDs over PoolIDs).
type FabricServiceChannelTarget struct {
Kind FabricChannelTargetKind
PoolIDs []string
NodeIDs []string
SelectedNodeID string
// ServiceRole, SelectionPolicy and SingleMemberPool are not read by the
// conversion code in this file.
ServiceRole string
SelectionPolicy string
SingleMemberPool bool
}
// FabricServiceChannelRequest is a service-level request for a fabric channel.
// FabricChannelSpecFromServiceRequest turns it into a FabricChannelSpec.
type FabricServiceChannelRequest struct {
SchemaVersion string
// ChannelID falls back to ResourceID when blank during conversion.
ChannelID string
ClusterID string
OrganizationID string
UserID string
// ResourceID also becomes the spec's StickyKey.
ResourceID string
// SourceNodeID falls back to the local node ID when blank during conversion.
SourceNodeID string
SourceRole string
// ServiceClass selects the default traffic class when TrafficClass is blank.
ServiceClass string
Target FabricServiceChannelTarget
TrafficClass string
CreatedAt time.Time
}
// FabricChannel is the runtime record of a channel: the spec it was opened
// from, its lifecycle state, the route it currently uses, and traffic and
// reroute counters.
type FabricChannel struct {
Spec FabricChannelSpec
State FabricChannelLifecycleState
// RouteID is the route the channel is currently assigned to.
RouteID string
TargetNode string
OpenedAt time.Time
LastReroute time.Time
BytesSent uint64
BytesRecv uint64
FramesSent uint64
FramesRecv uint64
// RerouteCount is the number of times the channel has changed route.
// NOTE(review): maintained by the router, which is outside this file.
RerouteCount uint64
}
// FabricRouteHop is one hop of a route.
type FabricRouteHop struct {
// NodeID is matched against a spec's ForbiddenHops during route filtering.
NodeID string
Mode FabricRouteMode
EndpointID string
// Address is the hop's transport endpoint.
Address string
PeerCertSHA256 string
}
// FabricRoute is a candidate path to a destination node or pool together with
// the metrics the scheduler scores it on.
type FabricRoute struct {
RouteID string
// ClusterID, SourceNodeID, DestinationNodeID and PoolID act as filters: when
// non-empty they must match the channel spec for the route to be usable.
ClusterID string
SourceNodeID string
DestinationNodeID string
PoolID string
Hops []FabricRouteHop
BaseLatencyMs int
JitterMs int
LossPermille int
// Capacity and ActiveChannels feed the pressure percentage; a non-positive
// Capacity is treated as 100% pressure.
Capacity int
ActiveChannels int
// RelayCount > 0 adds a relay penalty to the score.
RelayCount int
LastUpdatedAt time.Time
// Healthy must be true for the route to be considered at all.
Healthy bool
// Degraded adds the configured degraded penalty to the score.
Degraded bool
}
// FabricRouteSet groups the candidate routes for one target. When flattened
// for scheduling the order is Primary (if it has a non-blank RouteID), then
// WarmStandby, then ColdFallbacks.
type FabricRouteSet struct {
TargetKind FabricChannelTargetKind
TargetID string
Primary FabricRoute
WarmStandby []FabricRoute
ColdFallbacks []FabricRoute
}
// FabricAdjacency records link measurements between two nodes.
// NOTE(review): no code in this part of the file reads this type; field
// meanings below are taken from their names and should be confirmed against
// the producer.
type FabricAdjacency struct {
FromNodeID string
ToNodeID string
Mode FabricRouteMode
RTTMs int
JitterMs int
LossPermille int
Capacity int
ActiveChannels int
ThroughputBps int64
PressurePercent int
Healthy bool
PassiveOutbound bool
LocalSegmentID string
NATGroupID string
LastObservedAt time.Time
LastFailureReason string
}
// FabricRouteChoice is the scheduler's verdict for one route.
type FabricRouteChoice struct {
Route FabricRoute
// Score is the weighted cost of the route; lower scores are preferred.
Score int
// Reason is a short label explaining the scoring outcome.
Reason string
// PressureBefore and PressureAfter are the route's pressure percentages
// (0-100) without and with the projected cost of the new channel.
PressureBefore int
PressureAfter int
}
// FabricRouteSchedulerConfig holds the weights and penalties used to score
// routes. Non-positive weights, penalties and ProjectedChannelCost are replaced
// by defaults during normalization (10, 4, 8, 12, 5, 25, 500 and 1, in field
// order).
type FabricRouteSchedulerConfig struct {
LatencyWeight int
JitterWeight int
LossWeight int
PressureWeight int
HopPenalty int
RelayPenalty int
DegradedPenalty int
// ProjectedChannelCost is the number of channels added to a route's active
// count when projecting its pressure after admission.
ProjectedChannelCost int
// HardMaxRoutePressure, when positive, excludes routes whose projected
// pressure percentage exceeds it; zero disables the limit.
HardMaxRoutePressure int
}
// FabricRouteScheduler selects the best route for a channel according to its
// Config.
type FabricRouteScheduler struct {
Config FabricRouteSchedulerConfig
}
// NewFabricRouteScheduler returns a scheduler whose configuration has been
// normalized, so unset (non-positive) weights receive their defaults.
func NewFabricRouteScheduler(cfg FabricRouteSchedulerConfig) FabricRouteScheduler {
	normalized := normalizeFabricRouteSchedulerConfig(cfg)
	return FabricRouteScheduler{Config: normalized}
}
// ChooseRoute validates spec, filters the routes of routeSet down to the
// usable ones, scores them, and returns the best choice.
//
// Routes whose projected pressure exceeds a positive HardMaxRoutePressure are
// dropped. Candidates are ordered by ascending score, then projected pressure,
// then base latency, then RouteID. It returns the validation error for an
// invalid spec, and ErrFabricRouteNotFound when the set is empty or no route
// survives filtering.
func (s FabricRouteScheduler) ChooseRoute(spec FabricChannelSpec, routeSet FabricRouteSet, now time.Time) (FabricRouteChoice, error) {
	if err := ValidateFabricChannelSpec(spec); err != nil {
		return FabricRouteChoice{}, err
	}
	candidates := flattenFabricRouteSet(routeSet)
	if len(candidates) == 0 {
		return FabricRouteChoice{}, ErrFabricRouteNotFound
	}

	blockedHops := stringSet(spec.ForbiddenHops)
	pressureLimit := s.Config.HardMaxRoutePressure
	ranked := make([]FabricRouteChoice, 0, len(candidates))
	for _, candidate := range candidates {
		if !fabricRouteUsable(spec, candidate, blockedHops, now) {
			continue
		}
		scored := s.scoreRoute(candidate)
		if pressureLimit > 0 && scored.PressureAfter > pressureLimit {
			continue
		}
		scored.Route = candidate
		ranked = append(ranked, scored)
	}
	if len(ranked) == 0 {
		return FabricRouteChoice{}, ErrFabricRouteNotFound
	}

	sort.SliceStable(ranked, func(i, j int) bool {
		left, right := ranked[i], ranked[j]
		switch {
		case left.Score != right.Score:
			return left.Score < right.Score
		case left.PressureAfter != right.PressureAfter:
			return left.PressureAfter < right.PressureAfter
		case left.Route.BaseLatencyMs != right.Route.BaseLatencyMs:
			return left.Route.BaseLatencyMs < right.Route.BaseLatencyMs
		default:
			return left.Route.RouteID < right.Route.RouteID
		}
	})
	return ranked[0], nil
}
// ValidateFabricChannelSpec returns ErrFabricChannelInvalid when ChannelID,
// ClusterID, SourceNodeID or TargetID is blank (after trimming whitespace), or
// when TargetKind is neither node nor pool; otherwise it returns nil.
func ValidateFabricChannelSpec(spec FabricChannelSpec) error {
	required := [...]string{spec.ChannelID, spec.ClusterID, spec.SourceNodeID, spec.TargetID}
	for _, field := range required {
		if strings.TrimSpace(field) == "" {
			return ErrFabricChannelInvalid
		}
	}
	if spec.TargetKind == FabricChannelTargetNode || spec.TargetKind == FabricChannelTargetPool {
		return nil
	}
	return ErrFabricChannelInvalid
}
// FabricChannelSpecFromServiceRequest derives a validated FabricChannelSpec
// from a service channel request.
//
// Defaults applied: a zero now becomes the current UTC time; a blank source
// node falls back to localNodeID; an empty target kind becomes pool; a blank
// channel ID falls back to the resource ID; a blank traffic class is derived
// from the service class. The target ID is chosen from the first pool ID, the
// selected node and the first node ID, with node targets preferring the node
// values. The resulting spec is validated and the zero spec is returned
// alongside any validation error.
func FabricChannelSpecFromServiceRequest(req FabricServiceChannelRequest, localNodeID string, now time.Time) (FabricChannelSpec, error) {
	if now.IsZero() {
		now = time.Now().UTC()
	}
	kind := req.Target.Kind
	if kind == "" {
		kind = FabricChannelTargetPool
	}

	source := firstNonEmpty(strings.TrimSpace(req.SourceNodeID), strings.TrimSpace(localNodeID))
	poolTargetID := firstNonEmpty(firstString(req.Target.PoolIDs), strings.TrimSpace(req.Target.SelectedNodeID), firstString(req.Target.NodeIDs))
	resolvedTargetID := poolTargetID
	if kind == FabricChannelTargetNode {
		resolvedTargetID = firstNonEmpty(strings.TrimSpace(req.Target.SelectedNodeID), firstString(req.Target.NodeIDs), poolTargetID)
	}
	resourceID := strings.TrimSpace(req.ResourceID)

	spec := FabricChannelSpec{
		ChannelID:    firstNonEmpty(strings.TrimSpace(req.ChannelID), resourceID),
		ClusterID:    strings.TrimSpace(req.ClusterID),
		SourceNodeID: source,
		TargetKind:   kind,
		TargetID:     resolvedTargetID,
		TrafficClass: firstNonEmpty(strings.TrimSpace(req.TrafficClass), serviceClassDefaultTrafficClass(req.ServiceClass)),
		StickyKey:    resourceID,
		CreatedAt:    now,
	}
	if err := ValidateFabricChannelSpec(spec); err != nil {
		return FabricChannelSpec{}, err
	}
	return spec, nil
}
// serviceClassDefaultTrafficClass maps a service class (compared lowercased
// and trimmed) to its default traffic class: VPN packets use the bulk channel,
// remote workspaces the interactive channel, and everything else the reliable
// channel.
func serviceClassDefaultTrafficClass(serviceClass string) string {
	normalized := strings.TrimSpace(strings.ToLower(serviceClass))
	if normalized == FabricServiceClassVPNPackets {
		return FabricServiceChannelBulk
	}
	if normalized == FabricServiceClassRemoteWorkspace {
		return FabricServiceChannelInteractive
	}
	return FabricServiceChannelReliable
}
// firstString returns the first element of values that is non-empty after
// trimming whitespace, in its trimmed form, or "" when there is none.
func firstString(values []string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// scoreRoute computes the cost of route under the scheduler's normalized
// configuration. The score is the weighted sum of base latency, jitter, loss,
// projected pressure, hop count and relay count, plus the degraded penalty
// when the route is degraded. The returned choice carries the score, a reason
// label and the pressure before/after admitting the channel; its Route field
// is left for the caller to fill in.
func (s FabricRouteScheduler) scoreRoute(route FabricRoute) FabricRouteChoice {
	cfg := normalizeFabricRouteSchedulerConfig(s.Config)
	before := fabricRoutePressurePercent(route, 0)
	after := fabricRoutePressurePercent(route, cfg.ProjectedChannelCost)

	total := route.BaseLatencyMs * cfg.LatencyWeight
	total += route.JitterMs * cfg.JitterWeight
	total += route.LossPermille * cfg.LossWeight
	total += after * cfg.PressureWeight
	total += len(route.Hops) * cfg.HopPenalty
	total += route.RelayCount * cfg.RelayPenalty
	if route.Degraded {
		total += cfg.DegradedPenalty
	}

	// The relay label takes precedence over the pressure label.
	label := "latency_load_score"
	switch {
	case route.RelayCount > 0:
		label = "relay_fallback_available"
	case after >= 90:
		label = "capacity_pressure_avoidance"
	}
	return FabricRouteChoice{
		Score:          total,
		Reason:         label,
		PressureBefore: before,
		PressureAfter:  after,
	}
}
// normalizeFabricRouteSchedulerConfig returns cfg with every non-positive
// weight, penalty and projected channel cost replaced by its default, and a
// negative HardMaxRoutePressure clamped to zero (meaning no hard limit).
func normalizeFabricRouteSchedulerConfig(cfg FabricRouteSchedulerConfig) FabricRouteSchedulerConfig {
	defaults := []struct {
		field    *int
		fallback int
	}{
		{&cfg.LatencyWeight, 10},
		{&cfg.JitterWeight, 4},
		{&cfg.LossWeight, 8},
		{&cfg.PressureWeight, 12},
		{&cfg.HopPenalty, 5},
		{&cfg.RelayPenalty, 25},
		{&cfg.DegradedPenalty, 500},
		{&cfg.ProjectedChannelCost, 1},
	}
	for _, entry := range defaults {
		if *entry.field <= 0 {
			*entry.field = entry.fallback
		}
	}
	if cfg.HardMaxRoutePressure < 0 {
		cfg.HardMaxRoutePressure = 0
	}
	return cfg
}
// flattenFabricRouteSet lists the routes of routeSet in priority order: the
// primary (only when its RouteID is non-blank), then warm standbys, then cold
// fallbacks.
func flattenFabricRouteSet(routeSet FabricRouteSet) []FabricRoute {
	capacity := 1 + len(routeSet.WarmStandby) + len(routeSet.ColdFallbacks)
	flattened := make([]FabricRoute, 0, capacity)
	if strings.TrimSpace(routeSet.Primary.RouteID) != "" {
		flattened = append(flattened, routeSet.Primary)
	}
	for _, tier := range [][]FabricRoute{routeSet.WarmStandby, routeSet.ColdFallbacks} {
		flattened = append(flattened, tier...)
	}
	return flattened
}
// fabricRouteUsable reports whether route may carry the channel described by
// spec.
//
// A route is rejected when it has a blank RouteID or is not Healthy, when
// both sides name a cluster and the clusters differ, when the route names a
// source node other than spec.SourceNodeID, when the route names a
// destination node (node targets) or pool (pool targets) other than
// spec.TargetID, or when any hop's NodeID is present in forbidden.
//
// The now parameter is accepted but not consulted by this function.
func fabricRouteUsable(spec FabricChannelSpec, route FabricRoute, forbidden map[string]struct{}, now time.Time) bool {
	if !route.Healthy || strings.TrimSpace(route.RouteID) == "" {
		return false
	}

	clusterMismatch := route.ClusterID != "" && spec.ClusterID != "" && route.ClusterID != spec.ClusterID
	sourceMismatch := route.SourceNodeID != "" && route.SourceNodeID != spec.SourceNodeID
	if clusterMismatch || sourceMismatch {
		return false
	}

	// An empty destination/pool on the route acts as a wildcard.
	if spec.TargetKind == FabricChannelTargetNode {
		if route.DestinationNodeID != "" && route.DestinationNodeID != spec.TargetID {
			return false
		}
	} else if spec.TargetKind == FabricChannelTargetPool {
		if route.PoolID != "" && route.PoolID != spec.TargetID {
			return false
		}
	}

	for i := range route.Hops {
		if _, blocked := forbidden[route.Hops[i].NodeID]; blocked {
			return false
		}
	}
	return true
}
// fabricRoutePressurePercent returns how full route would be, as an integer
// percentage in [0, 100], once projected additional channels are counted on
// top of route.ActiveChannels.
//
// A route with non-positive Capacity is treated as fully loaded (100). A
// non-positive combined channel count yields 0. The result uses integer
// division and is capped at 100.
func fabricRoutePressurePercent(route FabricRoute, projected int) int {
	if route.Capacity <= 0 {
		return 100
	}
	load := route.ActiveChannels + projected
	if load <= 0 {
		return 0
	}
	if percent := load * 100 / route.Capacity; percent <= 100 {
		return percent
	}
	return 100
}
// stringSet builds a set from values, keyed by each entry with surrounding
// whitespace removed. Entries that are blank after trimming are dropped.
// The returned map is never nil.
func stringSet(values []string) map[string]struct{} {
	set := make(map[string]struct{}, len(values))
	for i := range values {
		trimmed := strings.TrimSpace(values[i])
		if trimmed == "" {
			continue
		}
		set[trimmed] = struct{}{}
	}
	return set
}
@@ -0,0 +1,244 @@
package mesh
import (
"errors"
"testing"
"time"
)
func TestFabricRouteSchedulerAvoidsSaturatedShortestRoute(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
spec := FabricChannelSpec{
ChannelID: "channel-1",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
}
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
Primary: FabricRoute{
RouteID: "short-saturated",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-b"}},
BaseLatencyMs: 10,
Capacity: 10,
ActiveChannels: 10,
Healthy: true,
},
WarmStandby: []FabricRoute{{
RouteID: "slightly-longer-free",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-r"}, {NodeID: "node-b"}},
BaseLatencyMs: 18,
Capacity: 100,
ActiveChannels: 5,
RelayCount: 1,
Healthy: true,
}},
}, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.RouteID != "slightly-longer-free" {
t.Fatalf("route = %q, want slightly-longer-free score=%d pressure=%d", choice.Route.RouteID, choice.Score, choice.PressureAfter)
}
}
func TestFabricChannelSpecFromServiceRequestTargetsPool(t *testing.T) {
spec, err := FabricChannelSpecFromServiceRequest(FabricServiceChannelRequest{
ChannelID: "vpn-1",
ClusterID: "cluster-1",
ResourceID: "vpn-1",
ServiceClass: FabricServiceClassVPNPackets,
Target: FabricServiceChannelTarget{
Kind: FabricChannelTargetPool,
PoolIDs: []string{"home-ipv4"},
ServiceRole: "ipv4-egress",
},
}, "android-node", time.Now())
if err != nil {
t.Fatalf("service request spec: %v", err)
}
if spec.SourceNodeID != "android-node" || spec.TargetKind != FabricChannelTargetPool || spec.TargetID != "home-ipv4" || spec.TrafficClass != FabricServiceChannelBulk {
t.Fatalf("unexpected spec: %+v", spec)
}
}
func TestFabricChannelSpecFromServiceRequestKeepsServiceOutOfEndpointSelection(t *testing.T) {
_, err := FabricChannelSpecFromServiceRequest(FabricServiceChannelRequest{
ChannelID: "rdp-1",
ClusterID: "cluster-1",
ServiceClass: FabricServiceClassRemoteWorkspace,
Target: FabricServiceChannelTarget{
Kind: FabricChannelTargetPool,
ServiceRole: "rdp-gateway",
},
}, "client-node", time.Now())
if !errors.Is(err, ErrFabricChannelInvalid) {
t.Fatalf("err = %v, want invalid without pool/node target id", err)
}
}
func TestFabricRouteSchedulerPoolSkipsFailedEndpoint(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
spec := FabricChannelSpec{
ChannelID: "channel-pool",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
}
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
Primary: FabricRoute{
RouteID: "pool-node-dead",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
PoolID: "pool-egress",
Capacity: 100,
Healthy: false,
},
WarmStandby: []FabricRoute{{
RouteID: "pool-node-live",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-c",
PoolID: "pool-egress",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-c"}},
BaseLatencyMs: 25,
Capacity: 100,
Healthy: true,
}},
}, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.DestinationNodeID != "node-c" {
t.Fatalf("destination = %q, want node-c", choice.Route.DestinationNodeID)
}
}
func TestFabricRouteSchedulerHonorsForbiddenHops(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
spec := FabricChannelSpec{
ChannelID: "channel-1",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
ForbiddenHops: []string{"node-r"},
}
_, err := scheduler.ChooseRoute(spec, FabricRouteSet{
Primary: FabricRoute{
RouteID: "blocked",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-r"}, {NodeID: "node-b"}},
Capacity: 100,
Healthy: true,
},
}, time.Now())
if !errors.Is(err, ErrFabricRouteNotFound) {
t.Fatalf("err = %v, want ErrFabricRouteNotFound", err)
}
}
func TestFabricRouteSchedulerRejectsRoutesAboveHardPressureLimit(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{HardMaxRoutePressure: 80})
spec := FabricChannelSpec{
ChannelID: "channel-pressure",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
}
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
Primary: FabricRoute{
RouteID: "too-busy",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Capacity: 10,
ActiveChannels: 9,
Healthy: true,
},
WarmStandby: []FabricRoute{{
RouteID: "admissible",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Capacity: 10,
ActiveChannels: 5,
Healthy: true,
}},
}, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.RouteID != "admissible" {
t.Fatalf("route = %q, want admissible", choice.Route.RouteID)
}
}
func TestFabricRouteSchedulerKeepsHighLatencyRouteAsFallbackUntilFastRouteSaturates(t *testing.T) {
spec := FabricChannelSpec{
ChannelID: "channel-latency-aware",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
}
routeSet := FabricRouteSet{
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
Primary: FabricRoute{
RouteID: "lan-fast",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-lan",
PoolID: "pool-egress",
BaseLatencyMs: 4,
Capacity: 100,
ActiveChannels: 85,
Healthy: true,
},
WarmStandby: []FabricRoute{{
RouteID: "wan-slow",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-wan",
PoolID: "pool-egress",
BaseLatencyMs: 420,
Capacity: 100,
ActiveChannels: 0,
Healthy: true,
}},
}
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{HardMaxRoutePressure: 90})
choice, err := scheduler.ChooseRoute(spec, routeSet, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.RouteID != "lan-fast" {
t.Fatalf("route = %q, want fast LAN before hard pressure limit", choice.Route.RouteID)
}
routeSet.Primary.ActiveChannels = 90
choice, err = scheduler.ChooseRoute(spec, routeSet, time.Now())
if err != nil {
t.Fatalf("choose fallback route: %v", err)
}
if choice.Route.RouteID != "wan-slow" {
t.Fatalf("route = %q, want WAN only after LAN reaches hard pressure limit", choice.Route.RouteID)
}
}
@@ -0,0 +1,130 @@
package mesh
import (
"context"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// FabricOverlayTransportConfig holds the settings for a FabricOverlayTransport.
type FabricOverlayTransportConfig struct {
// ClusterID is copied (whitespace-trimmed) into every channel spec built by Send.
ClusterID string
// LocalNodeID is used (whitespace-trimmed) as the source node of every channel spec built by Send.
LocalNodeID string
// RouterConfig is forwarded to the channel runtime created by NewFabricOverlayTransport.
RouterConfig FabricChannelRouterConfig
// Timeout is forwarded to the channel runtime; NewFabricOverlayTransport
// replaces a non-positive value with 30 seconds.
Timeout time.Duration
}
// FabricOverlayTransport sends payloads to a target by looking up the
// target's route set and delegating to a FabricChannelRuntime.
type FabricOverlayTransport struct {
// Runtime performs the actual send; Send fails with
// ErrForwardRuntimeUnavailable when it is nil.
Runtime *FabricChannelRuntime
// RouteSets maps a target ID to its candidate routes. The constructor
// stores keys with surrounding whitespace removed.
RouteSets map[string]FabricRouteSet
// Config is the configuration the transport was built with.
Config FabricOverlayTransportConfig
// sequence numbers the channel IDs that Send generates.
sequence atomic.Uint64
}
// FabricOverlayTransportSnapshot is the JSON-serializable state returned by
// FabricOverlayTransport.Snapshot.
type FabricOverlayTransportSnapshot struct {
// RoutePressure is the runtime's route pressure snapshot.
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
// RouteHealth is the runtime's route health snapshot.
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
}
// FabricOverlaySendRequest describes one FabricOverlayTransport.Send call.
type FabricOverlaySendRequest struct {
// ChannelID optionally names the channel; Send supplies a generated
// "fabric-overlay-<n>" ID as the fallback.
ChannelID string
// TargetKind optionally overrides the route set's target kind; when both
// are empty Send uses FabricChannelTargetNode.
TargetKind FabricChannelTargetKind
// TargetID selects the route set and is required (blank is rejected).
TargetID string
// TrafficClass defaults to fabricproto.TrafficClassReliable when zero.
TrafficClass fabricproto.TrafficClass
// Payloads are handed to the runtime's SendReliable unchanged.
Payloads [][]byte
// StickyKey is copied (whitespace-trimmed) into the channel spec.
StickyKey string
}
// NewFabricOverlayTransport builds an overlay transport that sends through
// transport using the supplied per-target route sets.
//
// A non-positive cfg.Timeout is replaced by 30 seconds before the channel
// runtime is created. Route-set keys are stored with surrounding whitespace
// removed, and entries whose key is blank after trimming are dropped; the
// caller's map is not retained.
func NewFabricOverlayTransport(transport FabricTransport, routeSets map[string]FabricRouteSet, cfg FabricOverlayTransportConfig) *FabricOverlayTransport {
	if cfg.Timeout <= 0 {
		cfg.Timeout = 30 * time.Second
	}

	byTarget := make(map[string]FabricRouteSet, len(routeSets))
	for rawID, routeSet := range routeSets {
		id := strings.TrimSpace(rawID)
		if id == "" {
			continue
		}
		byTarget[id] = routeSet
	}

	runtimeCfg := FabricChannelRuntimeConfig{
		RouterConfig: cfg.RouterConfig,
		Timeout:      cfg.Timeout,
	}
	return &FabricOverlayTransport{
		Runtime:   NewFabricChannelRuntime(transport, runtimeCfg),
		RouteSets: byTarget,
		Config:    cfg,
	}
}
// Send delivers req.Payloads to the target named by req.TargetID through the
// channel runtime's SendReliable.
//
// It returns ErrForwardRuntimeUnavailable when the transport or its runtime
// is nil, ErrFabricChannelInvalid when the target ID is blank, and
// ErrFabricRouteNotFound when no route set is registered for the target.
// The target kind falls back from the request to the route set to
// FabricChannelTargetNode; a zero traffic class becomes
// fabricproto.TrafficClassReliable.
//
// The internal sequence counter advances on every call that reaches spec
// construction, whether or not req.ChannelID is provided.
//
// NOTE(review): the chosen traffic class is written into the shared
// t.Runtime.Config on each call with no locking visible here — confirm this
// is safe if Send can be called concurrently.
func (t *FabricOverlayTransport) Send(ctx context.Context, req FabricOverlaySendRequest) (FabricChannelRuntimeResult, error) {
	var none FabricChannelRuntimeResult
	if t == nil || t.Runtime == nil {
		return none, ErrForwardRuntimeUnavailable
	}

	targetID := strings.TrimSpace(req.TargetID)
	if targetID == "" {
		return none, ErrFabricChannelInvalid
	}
	routeSet, found := t.RouteSets[targetID]
	if !found {
		return none, ErrFabricRouteNotFound
	}

	targetKind := req.TargetKind
	switch {
	case targetKind != "":
		// The request's explicit kind wins.
	case routeSet.TargetKind != "":
		targetKind = routeSet.TargetKind
	default:
		targetKind = FabricChannelTargetNode
	}

	trafficClass := req.TrafficClass
	if trafficClass == 0 {
		trafficClass = fabricproto.TrafficClassReliable
	}
	t.Runtime.Config.TrafficClass = trafficClass

	generatedID := fmt.Sprintf("fabric-overlay-%d", t.sequence.Add(1))
	spec := FabricChannelSpec{
		ChannelID:    firstNonEmpty(strings.TrimSpace(req.ChannelID), generatedID),
		ClusterID:    strings.TrimSpace(t.Config.ClusterID),
		SourceNodeID: strings.TrimSpace(t.Config.LocalNodeID),
		TargetKind:   targetKind,
		TargetID:     targetID,
		TrafficClass: loadFabricTrafficClassName(trafficClass),
		StickyKey:    strings.TrimSpace(req.StickyKey),
		CreatedAt:    time.Now().UTC(),
	}
	return t.Runtime.SendReliable(ctx, spec, routeSet, req.Payloads)
}
// SnapshotPressure returns the pressure snapshot reported by the runtime's
// Pressure tracker, or the zero snapshot when the transport, its runtime, or
// the tracker is nil.
func (t *FabricOverlayTransport) SnapshotPressure() FabricRoutePressureSnapshot {
	var snapshot FabricRoutePressureSnapshot
	if t != nil && t.Runtime != nil && t.Runtime.Pressure != nil {
		snapshot = t.Runtime.Pressure.SnapshotPressure()
	}
	return snapshot
}
// Snapshot collects the runtime's route pressure and route health into one
// value. It returns the zero snapshot when the transport or its runtime is
// nil.
func (t *FabricOverlayTransport) Snapshot() FabricOverlayTransportSnapshot {
	var snapshot FabricOverlayTransportSnapshot
	if t == nil || t.Runtime == nil {
		return snapshot
	}
	snapshot.RoutePressure = t.Runtime.snapshotRoutePressure()
	snapshot.RouteHealth = t.Runtime.snapshotRouteHealth()
	return snapshot
}
// loadFabricTrafficClassName maps a fabricproto traffic class to its
// lowercase name ("control", "interactive", "bulk" or "reliable"). Any other
// value is rendered as "traffic_class_<n>".
func loadFabricTrafficClassName(trafficClass fabricproto.TrafficClass) string {
	var name string
	switch trafficClass {
	case fabricproto.TrafficClassControl:
		name = "control"
	case fabricproto.TrafficClassInteractive:
		name = "interactive"
	case fabricproto.TrafficClassBulk:
		name = "bulk"
	case fabricproto.TrafficClassReliable:
		name = "reliable"
	default:
		name = fmt.Sprintf("traffic_class_%d", trafficClass)
	}
	return name
}
@@ -0,0 +1,49 @@
package mesh
import (
"context"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
func TestFabricOverlayTransportSendsThroughRouteSet(t *testing.T) {
transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
"quic://node-b:19443": 0,
})
overlay := NewFabricOverlayTransport(transport, map[string]FabricRouteSet{
"node-b": {
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
Primary: FabricRoute{
RouteID: "node-b-direct",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Hops: []FabricRouteHop{{NodeID: "node-b", Mode: FabricRouteDirect, EndpointID: "node-b-direct", Address: "quic://node-b:19443"}},
Capacity: 100,
Healthy: true,
},
},
}, FabricOverlayTransportConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
result, err := overlay.Send(ctx, FabricOverlaySendRequest{
TargetID: "node-b",
TrafficClass: fabricproto.TrafficClassReliable,
Payloads: [][]byte{[]byte("payload")},
})
if err != nil {
t.Fatalf("send: %v", err)
}
if result.BytesSent != uint64(len("payload")) || result.AcksReceived != 1 {
t.Fatalf("result = %+v", result)
}
if pressure := overlay.SnapshotPressure(); pressure.ActiveTotal != 0 || pressure.AcquiredTotal != pressure.ReleasedTotal {
t.Fatalf("pressure leak: %+v", pressure)
}
if snapshot := overlay.Snapshot(); snapshot.RoutePressure.AcquiredTotal != 1 || len(snapshot.RouteHealth.Quarantined) != 0 {
t.Fatalf("snapshot = %+v", snapshot)
}
}
@@ -3,28 +3,50 @@ package mesh
import (
"context"
"crypto/tls"
"encoding/json"
"fmt"
"net"
"strings"
"sync"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/quic-go/quic-go"
)
type QUICFabricServer struct {
listener *quic.Listener
logger FabricSessionEventLogger
done chan struct{}
closeOnce sync.Once
listener *quic.Listener
logger FabricSessionEventLogger
reverseMu sync.RWMutex
reverseTransport *QUICFabricTransport
fabricFrameHandler FabricFrameHandler
productionForwardHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
webIngressForwardHandler func(context.Context, []byte) ([]byte, error)
fabricControlHandler func(context.Context, []byte) ([]byte, error)
syntheticForwardHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
done chan struct{}
closeOnce sync.Once
}
type QUICFabricServerConfig struct {
ListenAddr string
TLSConfig *tls.Config
QUICConfig *quic.Config
Logger FabricSessionEventLogger
ListenAddr string
TLSConfig *tls.Config
QUICConfig *quic.Config
Logger FabricSessionEventLogger
ReverseTransport *QUICFabricTransport
FabricFrameHandler FabricFrameHandler
ProductionForwardHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
WebIngressForwardHandler func(context.Context, []byte) ([]byte, error)
FabricControlHandler func(context.Context, []byte) ([]byte, error)
SyntheticForwardHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
}
// FabricFrameSender is the write side handed to a FabricFrameHandler so it
// can send frames back to the peer.
type FabricFrameSender interface {
// SendFrame writes one frame, returning any write error.
SendFrame(context.Context, fabricproto.Frame) error
}
// FabricFrameHandler is an optional hook the QUIC server's stream loop calls
// for frames not consumed by the built-in forward handlers. Returning true
// marks the frame as handled, so the loop skips the default session handling
// for it; returning an error makes the loop close the connection with that
// error's text.
type FabricFrameHandler func(context.Context, FabricFrameSender, fabricproto.Frame) (bool, error)
func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QUICFabricServer, error) {
if cfg.ListenAddr == "" {
return nil, fmt.Errorf("quic fabric listen addr is required")
@@ -42,9 +64,15 @@ func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QU
return nil, err
}
server := &QUICFabricServer{
listener: listener,
logger: cfg.Logger,
done: make(chan struct{}),
listener: listener,
logger: cfg.Logger,
reverseTransport: cfg.ReverseTransport,
fabricFrameHandler: cfg.FabricFrameHandler,
productionForwardHandler: cfg.ProductionForwardHandler,
webIngressForwardHandler: cfg.WebIngressForwardHandler,
fabricControlHandler: cfg.FabricControlHandler,
syntheticForwardHandler: cfg.SyntheticForwardHandler,
done: make(chan struct{}),
}
go server.acceptLoop(ctx)
return server, nil
@@ -57,6 +85,15 @@ func (s *QUICFabricServer) Addr() net.Addr {
return s.listener.Addr()
}
// SetReverseTransport replaces the transport on which the server registers
// reverse connections. The swap happens under reverseMu's write lock; passing
// nil clears the transport. Calling it on a nil server is a no-op.
func (s *QUICFabricServer) SetReverseTransport(transport *QUICFabricTransport) {
	if s != nil {
		s.reverseMu.Lock()
		defer s.reverseMu.Unlock()
		s.reverseTransport = transport
	}
}
func (s *QUICFabricServer) Close() error {
if s == nil {
return nil
@@ -95,6 +132,8 @@ func (s *QUICFabricServer) handleConn(ctx context.Context, conn *quic.Conn) {
func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
session := fabricproto.NewSession(fabricproto.SessionConfig{})
sender := quicStreamFrameSender{stream: stream}
defer func() { _ = stream.Close() }()
s.logFabricSession(FabricSessionEventLogEntry{
Event: "fabric_session_quic_stream_opened",
AcceptedBy: "quic",
@@ -116,6 +155,29 @@ func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, st
if err != nil {
return
}
s.registerReverseHelloFrame(conn, frame)
if s.handleProductionForwardFrame(ctx, stream, frame) {
continue
}
if s.handleWebIngressForwardFrame(ctx, stream, frame) {
continue
}
if s.handleFabricControlForwardFrame(ctx, stream, frame) {
continue
}
if s.handleSyntheticForwardFrame(ctx, conn, stream, frame) {
continue
}
if s.fabricFrameHandler != nil {
handled, err := s.fabricFrameHandler(ctx, sender, frame)
if err != nil {
_ = conn.CloseWithError(2, err.Error())
return
}
if handled {
continue
}
}
event, responses, err := session.HandleFrame(frame)
if err != nil {
_ = conn.CloseWithError(2, err.Error())
@@ -140,6 +202,196 @@ func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, st
}
}
// quicStreamFrameSender implements FabricFrameSender by writing frames to a
// single QUIC stream.
type quicStreamFrameSender struct {
// stream is the destination stream; SendFrame reports an error when it is nil.
stream *quic.Stream
// mu is locked around each SendFrame write.
// NOTE(review): SendFrame has a value receiver, so each call locks its own
// copy of this mutex — confirm whether cross-call exclusion was intended.
mu sync.Mutex
}
// SendFrame writes frame to the underlying QUIC stream.
//
// The write deadline is ctx's deadline when it has one, otherwise 30 seconds
// from now; a failure to set the deadline is ignored. It returns an error
// when the sender has no stream, or whatever fabricproto.WriteFrame returns.
//
// NOTE(review): the receiver is a value, so s.mu here is a per-call copy of
// the struct's mutex and does not serialize concurrent SendFrame calls.
// Switching to a pointer receiver also requires the construction site to
// pass a pointer.
func (s quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
	if s.stream == nil {
		return fmt.Errorf("quic fabric stream is closed")
	}
	s.mu.Lock()
	defer s.mu.Unlock()

	deadline, hasDeadline := ctx.Deadline()
	if !hasDeadline {
		deadline = time.Now().Add(30 * time.Second)
	}
	_ = s.stream.SetWriteDeadline(deadline)
	return fabricproto.WriteFrame(s.stream, frame)
}
// registerReverseHelloFrame inspects an incoming frame and, when it is a
// ping whose payload starts with fabricQUICReverseHelloPrefix, registers conn
// on the server's reverse transport under the peer ID that follows the
// prefix, then logs the registration.
//
// It does nothing when the server, its reverse transport, or conn is nil, or
// when the frame is not a reverse hello.
func (s *QUICFabricServer) registerReverseHelloFrame(conn *quic.Conn, frame fabricproto.Frame) {
	if conn == nil || frame.Type != fabricproto.FramePing {
		return
	}
	// getReverseTransport returns nil for a nil server, so this also covers s == nil.
	reverse := s.getReverseTransport()
	if reverse == nil {
		return
	}

	hello := string(frame.Payload)
	if !strings.HasPrefix(hello, fabricQUICReverseHelloPrefix) {
		return
	}
	peerID := hello[len(fabricQUICReverseHelloPrefix):]

	reverse.RegisterReverseConn(peerID, conn)
	s.logFabricSession(FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_registered",
		AcceptedBy: "quic_reverse_hello",
		RemoteAddr: conn.RemoteAddr().String(),
		PeerID:     peerID,
	})
}
// quicProductionForwardResponse is the JSON reply body for a production
// forward frame: the handlers set Result on success and Error on failure.
type quicProductionForwardResponse struct {
// Result is the handler's result on success.
Result ProductionForwardResult `json:"result,omitempty"`
// Error is the failure text; empty on success.
Error string `json:"error,omitempty"`
}
// quicSyntheticForwardResponse is the JSON reply body for a synthetic
// forward frame: the handlers set Envelope on success and Error on failure.
type quicSyntheticForwardResponse struct {
// Envelope is the acknowledgement envelope returned by the handler.
Envelope SyntheticEnvelope `json:"envelope,omitempty"`
// Error is the failure text; empty on success.
Error string `json:"error,omitempty"`
}
// quicWebIngressForwardResponse is the JSON reply body for a web ingress
// forward frame: the handlers set Payload on success and Error on failure.
type quicWebIngressForwardResponse struct {
// Payload carries the handler's bytes as raw JSON; because it is a
// json.RawMessage, it must itself be valid JSON for marshalling to succeed.
Payload json.RawMessage `json:"payload,omitempty"`
// Error is the failure text; empty on success.
Error string `json:"error,omitempty"`
}
// quicFabricControlForwardResponse is the JSON reply body for a fabric
// control forward frame: the handlers set Payload on success and Error on
// failure.
type quicFabricControlForwardResponse struct {
// Payload carries the handler's bytes as raw JSON; because it is a
// json.RawMessage, it must itself be valid JSON for marshalling to succeed.
Payload json.RawMessage `json:"payload,omitempty"`
// Error is the failure text; empty on success.
Error string `json:"error,omitempty"`
}
// handleProductionForwardFrame services a data frame addressed to
// ProductionForwardQUICStreamID. It reports false, without touching the
// stream, for any other frame.
//
// For a matching frame it decodes the payload as a ProductionEnvelope, runs
// the configured production handler and writes a JSON
// quicProductionForwardResponse back on the same stream ID and sequence.
// A missing handler, an undecodable envelope, or a handler error is reported
// in the response's Error field. Write errors are ignored, and no reply is
// written if the response cannot be marshalled; both cases still return true.
func (s *QUICFabricServer) handleProductionForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	isProductionForward := frame.Type == fabricproto.FrameData && frame.StreamID == ProductionForwardQUICStreamID
	if !isProductionForward {
		return false
	}

	var reply quicProductionForwardResponse
	switch {
	case s == nil || s.productionForwardHandler == nil:
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	default:
		var envelope ProductionEnvelope
		if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
			reply.Error = "invalid production mesh envelope"
			break
		}
		result, err := s.productionForwardHandler(ctx, envelope)
		if err != nil {
			reply.Error = err.Error()
			break
		}
		reply.Result = result
	}

	encoded, err := json.Marshal(reply)
	if err != nil {
		return true
	}
	out := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     ProductionForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	}
	_ = fabricproto.WriteFrame(stream, out)
	return true
}
// handleWebIngressForwardFrame services a data frame addressed to
// WebIngressForwardQUICStreamID. It reports false, without touching the
// stream, for any other frame.
//
// For a matching frame it passes a copy of the payload to the configured web
// ingress handler and writes a JSON quicWebIngressForwardResponse back on the
// same stream ID and sequence. A missing handler or a handler error is
// reported in the response's Error field.
//
// The handler's bytes are embedded as json.RawMessage, so they must be valid
// JSON. If marshalling fails for that reason, an error response is sent
// instead of staying silent, so the requester is not left waiting for a
// reply that never comes. Write errors are ignored (best effort).
func (s *QUICFabricServer) handleWebIngressForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID {
		return false
	}

	response := quicWebIngressForwardResponse{}
	if s == nil || s.webIngressForwardHandler == nil {
		response.Error = ErrForwardRuntimeUnavailable.Error()
	} else if payload, err := s.webIngressForwardHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
		response.Error = err.Error()
	} else {
		response.Payload = append(json.RawMessage(nil), payload...)
	}

	payload, err := json.Marshal(response)
	if err != nil {
		// Most likely the handler returned bytes that are not valid JSON.
		// Fall back to an error-only response, which always marshals.
		fallback := quicWebIngressForwardResponse{Error: "invalid web ingress forward response: " + err.Error()}
		if payload, err = json.Marshal(fallback); err != nil {
			return true
		}
	}

	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     WebIngressForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      payload,
	})
	return true
}
// handleFabricControlForwardFrame services a data frame addressed to
// FabricControlForwardQUICStreamID. It reports false, without touching the
// stream, for any other frame.
//
// For a matching frame it passes a copy of the payload to the configured
// fabric control handler and writes a JSON quicFabricControlForwardResponse
// back on the same stream ID and sequence. A missing handler or a handler
// error is reported in the response's Error field.
//
// The handler's bytes are embedded as json.RawMessage, so they must be valid
// JSON. If marshalling fails for that reason, an error response is sent
// instead of staying silent, so the requester is not left waiting for a
// reply that never comes. Write errors are ignored (best effort).
func (s *QUICFabricServer) handleFabricControlForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID {
		return false
	}

	response := quicFabricControlForwardResponse{}
	if s == nil || s.fabricControlHandler == nil {
		response.Error = ErrForwardRuntimeUnavailable.Error()
	} else if payload, err := s.fabricControlHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
		response.Error = err.Error()
	} else {
		response.Payload = append(json.RawMessage(nil), payload...)
	}

	payload, err := json.Marshal(response)
	if err != nil {
		// Most likely the handler returned bytes that are not valid JSON.
		// Fall back to an error-only response, which always marshals.
		fallback := quicFabricControlForwardResponse{Error: "invalid fabric control forward response: " + err.Error()}
		if payload, err = json.Marshal(fallback); err != nil {
			return true
		}
	}

	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     FabricControlForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      payload,
	})
	return true
}
// handleSyntheticForwardFrame services a data frame addressed to
// SyntheticForwardQUICStreamID. It reports false, without touching the
// stream, for any other frame.
//
// For a matching frame it decodes the payload as a SyntheticEnvelope, runs
// the configured synthetic handler and writes a JSON
// quicSyntheticForwardResponse back on the same stream ID and sequence.
// After a successful handler call, conn is also registered as a reverse
// connection for the envelope's sending node. A missing handler is reported
// as ErrMeshRuntimeDisabled; decode and handler failures go into the Error
// field. Write errors are ignored, and no reply is written if the response
// cannot be marshalled; both cases still return true.
func (s *QUICFabricServer) handleSyntheticForwardFrame(ctx context.Context, conn *quic.Conn, stream *quic.Stream, frame fabricproto.Frame) bool {
	isSyntheticForward := frame.Type == fabricproto.FrameData && frame.StreamID == SyntheticForwardQUICStreamID
	if !isSyntheticForward {
		return false
	}

	var reply quicSyntheticForwardResponse
	switch {
	case s == nil || s.syntheticForwardHandler == nil:
		reply.Error = ErrMeshRuntimeDisabled.Error()
	default:
		var envelope SyntheticEnvelope
		if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
			reply.Error = "invalid synthetic mesh envelope"
			break
		}
		ack, err := s.syntheticForwardHandler(ctx, envelope)
		if err != nil {
			reply.Error = err.Error()
			break
		}
		s.registerReversePeerConn(envelope.From.NodeID, conn)
		reply.Envelope = ack
	}

	encoded, err := json.Marshal(reply)
	if err != nil {
		return true
	}
	out := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     SyntheticForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	}
	_ = fabricproto.WriteFrame(stream, out)
	return true
}
// registerReversePeerConn registers conn on the server's reverse transport
// under peerID. It is a no-op when the server, its reverse transport, or
// conn is nil.
func (s *QUICFabricServer) registerReversePeerConn(peerID string, conn *quic.Conn) {
	// getReverseTransport returns nil for a nil server.
	reverse := s.getReverseTransport()
	if reverse != nil && conn != nil {
		reverse.RegisterReverseConn(peerID, conn)
	}
}
// getReverseTransport returns the currently configured reverse transport,
// read under reverseMu's read lock, or nil when the server itself is nil.
func (s *QUICFabricServer) getReverseTransport() *QUICFabricTransport {
	if s == nil {
		return nil
	}
	s.reverseMu.RLock()
	current := s.reverseTransport
	s.reverseMu.RUnlock()
	return current
}
func (s *QUICFabricServer) logFabricSession(entry FabricSessionEventLogEntry) {
if s != nil && s.logger != nil {
s.logger(entry)
@@ -6,7 +6,9 @@ import (
"crypto/tls"
"crypto/x509"
"encoding/hex"
"encoding/json"
"fmt"
"net"
"sort"
"strings"
"sync"
@@ -17,6 +19,7 @@ import (
)
// fabricQUICNextProto is the protocol identifier for fabric QUIC sessions
// (presumably the TLS ALPN value — confirm against the TLS config builder).
const fabricQUICNextProto = "rap-fabric-data-session-v1"
// fabricQUICReverseHelloPrefix prefixes the payload of the ping frame a
// dialer sends to announce its peer ID; the peer ID follows the prefix.
const fabricQUICReverseHelloPrefix = "rap-fabric-reverse-hello-v1:"
// defaultQUICFabricConnIdleTTL is the IdleTTL that NewQUICFabricTransport assigns.
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
// defaultQUICFabricMaxStreamsPerConn is the MaxStreamsPerConn that
// NewQUICFabricTransport assigns.
const defaultQUICFabricMaxStreamsPerConn = 64
// ErrQUICFabricStreamLimitReached is the constant error value for the stream
// limit condition.
const ErrQUICFabricStreamLimitReached = quicFabricError("quic fabric stream limit reached")
@@ -28,17 +31,29 @@ func (e quicFabricError) Error() string {
}
type QUICFabricTransport struct {
Config *quic.Config
IdleTTL time.Duration
MaxStreamsPerConn int
mu sync.Mutex
conns map[string]*quicFabricConnEntry
stats QUICFabricTransportStats
Config *quic.Config
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
mu sync.Mutex
conns map[string]*quicFabricConnEntry
reverseConns map[string]*quicFabricConnEntry
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
inboundFabricControlHandler func(context.Context, []byte) ([]byte, error)
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
}
type QUICFabricTransportStats struct {
Opens uint64 `json:"opens"`
Reuses uint64 `json:"reuses"`
ReverseHelloSent uint64 `json:"reverse_hello_sent"`
ReverseHelloFailed uint64 `json:"reverse_hello_failed"`
ReverseRegisters uint64 `json:"reverse_registers"`
ReverseReuses uint64 `json:"reverse_reuses"`
OpenFailures uint64 `json:"open_failures"`
ClosedEvicted uint64 `json:"closed_evicted"`
CloseAllCalls uint64 `json:"close_all_calls"`
@@ -50,6 +65,7 @@ type QUICFabricTransportStats struct {
type QUICFabricTransportSnapshot struct {
SchemaVersion string `json:"schema_version"`
LocalPeerID string `json:"local_peer_id,omitempty"`
ActiveCount int `json:"active_count"`
ActiveStreams int `json:"active_streams"`
MaxStreamsPerConn int `json:"max_streams_per_conn"`
@@ -63,6 +79,7 @@ type QUICFabricConnSnapshot struct {
PeerID string `json:"peer_id,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
CertSHA256 string `json:"cert_sha256,omitempty"`
Direction string `json:"direction,omitempty"`
ActiveStreams int `json:"active_streams"`
MaxStreams int `json:"max_streams"`
CapacityPressurePercent int `json:"capacity_pressure_percent"`
@@ -92,7 +109,41 @@ type quicFabricConnEntry struct {
}
func NewQUICFabricTransport(config *quic.Config) *QUICFabricTransport {
return &QUICFabricTransport{Config: config, IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}}
return &QUICFabricTransport{Config: config, IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
}
// SetInboundHandlers installs the production and synthetic inbound handlers
// and the session logger. It delegates to SetInboundHandlersWithWebIngress
// with a nil web ingress handler, so any previously installed web ingress
// handler is cleared.
func (t *QUICFabricTransport) SetInboundHandlers(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
t.SetInboundHandlersWithWebIngress(production, nil, synthetic, logger)
}
// SetInboundHandlersWithWebIngress replaces, under the transport mutex, the
// production, web ingress and synthetic inbound handlers together with the
// session logger. Each argument overwrites the stored value, nil included.
// Calling it on a nil transport is a no-op.
func (t *QUICFabricTransport) SetInboundHandlersWithWebIngress(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), webIngress func(context.Context, []byte) ([]byte, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
	if t == nil {
		return
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	t.logger = logger
	t.inboundProductionHandler = production
	t.inboundWebIngressHandler = webIngress
	t.inboundSyntheticHandler = synthetic
}
// SetInboundFabricControlHandler replaces, under the transport mutex, the
// handler used for inbound fabric control frames. A nil handler clears it.
// Calling it on a nil transport is a no-op.
func (t *QUICFabricTransport) SetInboundFabricControlHandler(handler func(context.Context, []byte) ([]byte, error)) {
	if t != nil {
		t.mu.Lock()
		defer t.mu.Unlock()
		t.inboundFabricControlHandler = handler
	}
}
// SetLocalPeerID stores peerID, with surrounding whitespace removed, as the
// transport's LocalPeerID while holding the transport mutex. Calling it on a
// nil transport is a no-op.
func (t *QUICFabricTransport) SetLocalPeerID(peerID string) {
	if t == nil {
		return
	}
	trimmed := strings.TrimSpace(peerID)
	t.mu.Lock()
	defer t.mu.Unlock()
	t.LocalPeerID = trimmed
}
func quicTLSConfigForTarget(target FabricTransportTarget) *tls.Config {
@@ -186,9 +237,12 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, nil)
return conn, "", true, err
}
if conn, key, ok := t.reverseConnForTarget(target); ok {
return conn, key, false, nil
}
key := quicFabricConnKey(target)
if key == "" {
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, t.Config)
conn, err := t.dialAddr(ctx, target.Endpoint, tlsConfig)
return conn, "", true, err
}
t.mu.Lock()
@@ -207,7 +261,7 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
}
t.mu.Unlock()
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, t.Config)
conn, err := t.dialAddr(ctx, target.Endpoint, tlsConfig)
if err != nil {
t.mu.Lock()
t.stats.OpenFailures++
@@ -235,16 +289,339 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
t.conns[key] = &quicFabricConnEntry{conn: conn, lastUsed: time.Now()}
t.stats.Opens++
t.mu.Unlock()
go t.acceptInboundStreams(context.Background(), conn)
go t.sendReverseHello(context.Background(), conn)
return conn, key, false, nil
}
// dialAddr opens a QUIC connection to endpoint. It uses the transport's
// DialAddr override when one is set and quic.DialAddr otherwise, passing the
// transport's QUIC config in both cases.
//
// A nil transport dials with quic.DialAddr and a nil QUIC config; previously
// the nil guard covered only the override check and the fallback path then
// dereferenced t.Config.
func (t *QUICFabricTransport) dialAddr(ctx context.Context, endpoint string, tlsConfig *tls.Config) (*quic.Conn, error) {
	if t == nil {
		return quic.DialAddr(ctx, endpoint, tlsConfig, nil)
	}
	if t.DialAddr != nil {
		return t.DialAddr(ctx, endpoint, tlsConfig, t.Config)
	}
	return quic.DialAddr(ctx, endpoint, tlsConfig, t.Config)
}
// DialQUICAddrWithPacketConn dials endpoint over the caller-supplied packet
// connection by wrapping it in a dedicated quic.Transport.
//
// endpoint is trimmed and an optional "quic://" prefix is removed before it
// is resolved as a UDP address. A nil packetConn is rejected. On a resolve
// failure packetConn is closed; on a dial failure the quic.Transport is
// closed. After a successful dial, a goroutine closes the quic.Transport
// once the connection's context is done.
//
// NOTE(review): packetConn itself is explicitly closed only on the resolve
// failure path — confirm whether quic.Transport.Close also closes a
// caller-provided Conn, or whether the caller is expected to close it.
func DialQUICAddrWithPacketConn(ctx context.Context, endpoint string, packetConn net.PacketConn, tlsConfig *tls.Config, config *quic.Config) (*quic.Conn, error) {
	if packetConn == nil {
		return nil, fmt.Errorf("quic packet connection is required")
	}

	hostPort := strings.TrimPrefix(strings.TrimSpace(endpoint), "quic://")
	udpAddr, err := net.ResolveUDPAddr("udp", hostPort)
	if err != nil {
		_ = packetConn.Close()
		return nil, err
	}

	quicTransport := &quic.Transport{Conn: packetConn}
	conn, err := quicTransport.Dial(ctx, udpAddr, tlsConfig, config)
	if err != nil {
		_ = quicTransport.Close()
		return nil, err
	}

	// Release the per-dial transport when the connection ends.
	go func(done <-chan struct{}) {
		<-done
		_ = quicTransport.Close()
	}(conn.Context().Done())
	return conn, nil
}
// sendReverseHello announces the local peer ID to the remote side of conn.
// It opens a stream, writes a ping frame whose payload is
// fabricQUICReverseHelloPrefix followed by the local peer ID, then waits for
// one reply frame, whose content is discarded.
//
// The whole exchange is bounded by a 3-second deadline: it limits opening
// the stream and is also applied to the stream's reads and writes, so a peer
// that never answers cannot hold this goroutine and the stream open for the
// lifetime of the connection.
//
// stats.ReverseHelloFailed is incremented when the local peer ID is empty,
// the stream cannot be opened, or the write fails; stats.ReverseHelloSent is
// incremented once the frame has been written. The outcome of the final read
// does not affect the counters.
func (t *QUICFabricTransport) sendReverseHello(ctx context.Context, conn *quic.Conn) {
	if t == nil || conn == nil {
		return
	}
	// recordFailure bumps the failure counter under the transport mutex.
	recordFailure := func() {
		t.mu.Lock()
		t.stats.ReverseHelloFailed++
		t.mu.Unlock()
	}

	localPeerID := t.localPeerID()
	if localPeerID == "" {
		recordFailure()
		return
	}

	helloCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
	defer cancel()
	stream, err := conn.OpenStreamSync(helloCtx)
	if err != nil {
		recordFailure()
		return
	}
	defer func() { _ = stream.Close() }()

	// Context cancellation does not interrupt stream I/O, so mirror the
	// deadline onto the stream itself. Failures to set it are ignored,
	// matching the best-effort nature of the hello.
	if deadline, ok := helloCtx.Deadline(); ok {
		_ = stream.SetWriteDeadline(deadline)
		_ = stream.SetReadDeadline(deadline)
	}

	hello := fabricproto.Frame{
		Type:     fabricproto.FramePing,
		Sequence: 1,
		Payload:  []byte(fabricQUICReverseHelloPrefix + localPeerID),
	}
	if err := fabricproto.WriteFrame(stream, hello); err != nil {
		recordFailure()
		return
	}
	t.mu.Lock()
	t.stats.ReverseHelloSent++
	t.mu.Unlock()

	// Drain the peer's reply; its content and any error are irrelevant.
	_, _ = fabricproto.ReadFrame(stream, fabricproto.DefaultMaxPayload)
}
// acceptInboundStreams accepts streams opened by the remote side of conn and
// serves each one on its own goroutine via handleInboundStream. It returns
// as soon as AcceptStream reports an error. A nil transport or connection is
// a no-op.
func (t *QUICFabricTransport) acceptInboundStreams(ctx context.Context, conn *quic.Conn) {
	if t == nil || conn == nil {
		return
	}
	for stream, err := conn.AcceptStream(ctx); err == nil; stream, err = conn.AcceptStream(ctx) {
		go t.handleInboundStream(ctx, conn, stream)
	}
}
// handleInboundStream serves one stream accepted by acceptInboundStreams
// until a read fails, the session rejects a frame, a response cannot be
// written, or ctx is done. The stream is always closed on exit, and
// open/close events are logged.
//
// Each frame is first offered to the reverse-hello registration, then to the
// production, web ingress, fabric control and synthetic forward handlers in
// that order; the first handler that claims the frame ends processing for
// it. Unclaimed frames go to a per-stream fabricproto session, whose events
// are logged and whose response frames are written back on the stream.
func (t *QUICFabricTransport) handleInboundStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
	session := fabricproto.NewSession(fabricproto.SessionConfig{})
	defer func() { _ = stream.Close() }()

	t.logFabricSession(FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_stream_opened",
		AcceptedBy: "quic_reverse",
		RemoteAddr: conn.RemoteAddr().String(),
	})
	// The entry (including the remote address) is built now and logged on exit.
	defer t.logFabricSession(FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_stream_closed",
		AcceptedBy: "quic_reverse",
		RemoteAddr: conn.RemoteAddr().String(),
	})

	for {
		if ctx.Err() != nil {
			_ = stream.Close()
			return
		}

		frame, err := fabricproto.ReadFrame(stream, fabricproto.DefaultMaxPayload)
		if err != nil {
			return
		}

		t.registerReverseHelloFrame(conn, frame)
		// Short-circuit evaluation keeps the original dispatch order.
		if t.handleInboundProductionForwardFrame(ctx, stream, frame) ||
			t.handleInboundWebIngressForwardFrame(ctx, stream, frame) ||
			t.handleInboundFabricControlForwardFrame(ctx, stream, frame) ||
			t.handleInboundSyntheticForwardFrame(ctx, stream, frame) {
			continue
		}

		event, responses, err := session.HandleFrame(frame)
		if err != nil {
			_ = stream.Close()
			return
		}
		if event.Type != fabricproto.SessionEventNone {
			t.logFabricSession(FabricSessionEventLogEntry{
				Event:        "fabric_session_reverse_event",
				SessionEvent: event.Type,
				StreamID:     event.StreamID,
				Sequence:     event.Sequence,
				TrafficClass: event.TrafficClass,
				AcceptedBy:   "quic_reverse",
				RemoteAddr:   conn.RemoteAddr().String(),
			})
		}
		for i := range responses {
			if err := fabricproto.WriteFrame(stream, responses[i]); err != nil {
				return
			}
		}
	}
}
// registerReverseHelloFrame inspects frame and, when it is a ping whose
// payload starts with fabricQUICReverseHelloPrefix, registers conn as the
// reverse connection for the peer ID that follows the prefix and logs the
// registration. Any other frame, or a nil receiver/conn, is ignored.
func (t *QUICFabricTransport) registerReverseHelloFrame(conn *quic.Conn, frame fabricproto.Frame) {
	if t == nil || conn == nil {
		return
	}
	if frame.Type != fabricproto.FramePing {
		return
	}
	hello := string(frame.Payload)
	if !strings.HasPrefix(hello, fabricQUICReverseHelloPrefix) {
		return
	}
	// The prefix is known to be present, so slicing it off equals TrimPrefix.
	announcedPeer := hello[len(fabricQUICReverseHelloPrefix):]
	t.RegisterReverseConn(announcedPeer, conn)
	registered := FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_registered",
		AcceptedBy: "quic_reverse_hello",
		RemoteAddr: conn.RemoteAddr().String(),
		PeerID:     announcedPeer,
	}
	t.logFabricSession(registered)
}
// handleInboundProductionForwardFrame claims data frames addressed to
// ProductionForwardQUICStreamID. It decodes the payload as a
// ProductionEnvelope, runs the registered production handler and writes the
// JSON-encoded outcome back on stream with the request's sequence number.
//
// It returns false when the frame is not a production forward frame, and true
// otherwise, including when the reply could not be encoded or written.
func (t *QUICFabricTransport) handleInboundProductionForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	isProductionForward := frame.Type == fabricproto.FrameData && frame.StreamID == ProductionForwardQUICStreamID
	if !isProductionForward {
		return false
	}
	var reply quicProductionForwardResponse
	handle, _, _, _, _ := t.inboundHandlers()
	var envelope ProductionEnvelope
	// Cases are evaluated top to bottom, so the payload is only decoded when a
	// handler is installed.
	switch {
	case handle == nil:
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	case json.Unmarshal(frame.Payload, &envelope) != nil:
		reply.Error = "invalid production mesh envelope"
	default:
		result, handleErr := handle(ctx, envelope)
		if handleErr != nil {
			reply.Error = handleErr.Error()
		} else {
			reply.Result = result
		}
	}
	encoded, marshalErr := json.Marshal(reply)
	if marshalErr != nil {
		return true
	}
	// Best-effort write: a failure is not reported to the caller.
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     ProductionForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	})
	return true
}
// handleInboundWebIngressForwardFrame claims data frames addressed to
// WebIngressForwardQUICStreamID. It passes a copy of the payload to the
// registered web-ingress handler and writes the JSON-encoded outcome back on
// stream with the request's sequence number.
//
// It returns false when the frame is not a web-ingress forward frame, and
// true otherwise.
func (t *QUICFabricTransport) handleInboundWebIngressForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID {
		return false
	}
	response := quicWebIngressForwardResponse{}
	_, webIngressHandler, _, _, _ := t.inboundHandlers()
	if webIngressHandler == nil {
		response.Error = ErrForwardRuntimeUnavailable.Error()
	} else if payload, err := webIngressHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
		response.Error = err.Error()
	} else {
		response.Payload = append(json.RawMessage(nil), payload...)
	}
	payload, err := json.Marshal(response)
	if err != nil {
		// json.Marshal validates json.RawMessage, so a handler that returned
		// bytes which are not valid JSON ends up here. Answer with an error
		// response instead of dropping the frame, so the sender receives a
		// reply for this sequence number.
		payload, err = json.Marshal(quicWebIngressForwardResponse{Error: "encode web ingress forward response: " + err.Error()})
	}
	if err != nil {
		return true
	}
	// Best-effort write: a failure is not reported to the caller.
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: WebIngressForwardQUICStreamID, Sequence: frame.Sequence, Payload: payload})
	return true
}
// handleInboundFabricControlForwardFrame claims data frames addressed to
// FabricControlForwardQUICStreamID. It passes a copy of the payload to the
// registered fabric-control handler and writes the JSON-encoded outcome back
// on stream with the request's sequence number.
//
// It returns false when the frame is not a fabric-control forward frame, and
// true otherwise.
func (t *QUICFabricTransport) handleInboundFabricControlForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID {
		return false
	}
	response := quicFabricControlForwardResponse{}
	_, _, fabricControlHandler, _, _ := t.inboundHandlers()
	if fabricControlHandler == nil {
		response.Error = ErrForwardRuntimeUnavailable.Error()
	} else if payload, err := fabricControlHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
		response.Error = err.Error()
	} else {
		response.Payload = append(json.RawMessage(nil), payload...)
	}
	payload, err := json.Marshal(response)
	if err != nil {
		// json.Marshal validates json.RawMessage, so a handler that returned
		// bytes which are not valid JSON ends up here. Answer with an error
		// response instead of dropping the frame, so the sender receives a
		// reply for this sequence number.
		payload, err = json.Marshal(quicFabricControlForwardResponse{Error: "encode fabric control forward response: " + err.Error()})
	}
	if err != nil {
		return true
	}
	// Best-effort write: a failure is not reported to the caller.
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: FabricControlForwardQUICStreamID, Sequence: frame.Sequence, Payload: payload})
	return true
}
// handleInboundSyntheticForwardFrame claims data frames addressed to
// SyntheticForwardQUICStreamID. It decodes the payload as a
// SyntheticEnvelope, runs the registered synthetic handler and writes the
// JSON-encoded outcome back on stream with the request's sequence number.
//
// It returns false when the frame is not a synthetic forward frame, and true
// otherwise, including when the reply could not be encoded or written.
func (t *QUICFabricTransport) handleInboundSyntheticForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	isSyntheticForward := frame.Type == fabricproto.FrameData && frame.StreamID == SyntheticForwardQUICStreamID
	if !isSyntheticForward {
		return false
	}
	var reply quicSyntheticForwardResponse
	_, _, _, handle, _ := t.inboundHandlers()
	var envelope SyntheticEnvelope
	// Cases are evaluated top to bottom, so the payload is only decoded when a
	// handler is installed.
	switch {
	case handle == nil:
		reply.Error = ErrMeshRuntimeDisabled.Error()
	case json.Unmarshal(frame.Payload, &envelope) != nil:
		reply.Error = "invalid synthetic mesh envelope"
	default:
		ack, handleErr := handle(ctx, envelope)
		if handleErr != nil {
			reply.Error = handleErr.Error()
		} else {
			reply.Envelope = ack
		}
	}
	encoded, marshalErr := json.Marshal(reply)
	if marshalErr != nil {
		return true
	}
	// Best-effort write: a failure is not reported to the caller.
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     SyntheticForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	})
	return true
}
// inboundHandlers returns, in order, the production, web-ingress,
// fabric-control and synthetic inbound handlers plus the session event
// logger, all read under t.mu. A nil receiver yields all-nil results.
func (t *QUICFabricTransport) inboundHandlers() (func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), func(context.Context, []byte) ([]byte, error), func(context.Context, []byte) ([]byte, error), func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), FabricSessionEventLogger) {
	if t == nil {
		return nil, nil, nil, nil, nil
	}
	t.mu.Lock()
	production := t.inboundProductionHandler
	webIngress := t.inboundWebIngressHandler
	fabricControl := t.inboundFabricControlHandler
	synthetic := t.inboundSyntheticHandler
	logger := t.logger
	t.mu.Unlock()
	return production, webIngress, fabricControl, synthetic, logger
}
// localPeerID returns the transport's LocalPeerID with surrounding whitespace
// removed, read under t.mu. A nil receiver yields "".
func (t *QUICFabricTransport) localPeerID() string {
	if t == nil {
		return ""
	}
	t.mu.Lock()
	raw := t.LocalPeerID
	t.mu.Unlock()
	return strings.TrimSpace(raw)
}
// logFabricSession forwards entry to the configured session event logger, if
// one is set. The logger is fetched through inboundHandlers, so it is read
// under the lock but invoked outside it.
func (t *QUICFabricTransport) logFabricSession(entry FabricSessionEventLogEntry) {
	if _, _, _, _, sink := t.inboundHandlers(); sink != nil {
		sink(entry)
	}
}
// RegisterReverseConn records conn as the reverse connection for peerID so
// that later outbound sessions to that peer can reuse it.
//
// peerID is trimmed; a nil receiver, nil conn or empty peerID is ignored.
// When a different connection is already registered for the peer and is not
// yet closed, it is closed with "reverse connection replaced". When the same
// connection is registered again, the existing entry is kept so that its
// active stream count survives; only lastUsed is refreshed. Every accepted
// call increments stats.ReverseRegisters.
func (t *QUICFabricTransport) RegisterReverseConn(peerID string, conn *quic.Conn) {
	if t == nil || conn == nil {
		return
	}
	peerID = strings.TrimSpace(peerID)
	if peerID == "" {
		return
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	if t.reverseConns == nil {
		t.reverseConns = map[string]*quicFabricConnEntry{}
	}
	if existing := t.reverseConns[peerID]; existing != nil && existing.conn != nil {
		if existing.conn == conn {
			// A repeated hello on the same connection must not replace the
			// entry: that would reset activeStreams to zero while streams are
			// still open, defeating the stream accounting used by
			// reserveStream/releaseStream and the idle-prune guard.
			existing.lastUsed = time.Now()
			t.stats.ReverseRegisters++
			return
		}
		select {
		case <-existing.conn.Context().Done():
			// Already closed; nothing to tear down.
		default:
			_ = existing.conn.CloseWithError(0, "reverse connection replaced")
		}
	}
	t.reverseConns[peerID] = &quicFabricConnEntry{conn: conn, lastUsed: time.Now()}
	t.stats.ReverseRegisters++
}
// reverseConnForTarget looks up a registered reverse connection for
// target.PeerID. It only applies when the target's transport prefers reverse
// connections (see fabricTransportPrefersReverseConn).
//
// On success it returns the connection, its reverse cache key and true, after
// refreshing lastUsed and counting a reuse. A connection whose context is
// already done is evicted and reported as not found.
func (t *QUICFabricTransport) reverseConnForTarget(target FabricTransportTarget) (*quic.Conn, string, bool) {
	if t == nil {
		return nil, "", false
	}
	peer := strings.TrimSpace(target.PeerID)
	if peer == "" || !fabricTransportPrefersReverseConn(target.Transport) {
		return nil, "", false
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	t.pruneIdleLocked(time.Now())
	cached := t.reverseConns[peer]
	if cached == nil || cached.conn == nil {
		return nil, "", false
	}
	closed := false
	select {
	case <-cached.conn.Context().Done():
		closed = true
	default:
	}
	if closed {
		delete(t.reverseConns, peer)
		t.stats.ClosedEvicted++
		return nil, "", false
	}
	cached.lastUsed = time.Now()
	t.stats.ReverseReuses++
	return cached.conn, quicFabricReverseConnKey(peer), true
}
func (t *QUICFabricTransport) reserveStream(key string, conn *quic.Conn) error {
if t == nil || key == "" {
return nil
}
t.mu.Lock()
defer t.mu.Unlock()
entry := t.conns[key]
entry := t.connEntryLocked(key)
if entry == nil || entry.conn != conn {
return fmt.Errorf("quic fabric connection is not cached")
}
@@ -267,16 +644,26 @@ func (t *QUICFabricTransport) releaseStream(key string) {
return
}
t.mu.Lock()
if entry := t.conns[key]; entry != nil {
if entry := t.connEntryLocked(key); entry != nil {
if entry.activeStreams > 0 {
entry.activeStreams--
}
entry.lastUsed = time.Now()
t.stats.StreamCloses++
}
t.stats.StreamCloses++
t.mu.Unlock()
}
// connEntryLocked resolves a connection cache key to its entry. Keys that
// start with "reverse\x00" are looked up in reverseConns by the remainder of
// the key; all other keys are looked up in conns. The "Locked" suffix
// indicates the caller is expected to hold t.mu.
//
// NOTE(review): an outbound key that itself begins with "reverse\x00" would
// be routed to reverseConns — confirm quicFabricConnKey cannot produce one.
func (t *QUICFabricTransport) connEntryLocked(key string) *quicFabricConnEntry {
	const reversePrefix = "reverse\x00"
	switch {
	case t == nil, key == "":
		return nil
	case strings.HasPrefix(key, reversePrefix):
		return t.reverseConns[key[len(reversePrefix):]]
	default:
		return t.conns[key]
	}
}
func (t *QUICFabricTransport) evictConn(target FabricTransportTarget, conn *quic.Conn) {
if t == nil || conn == nil {
return
@@ -315,6 +702,20 @@ func (t *QUICFabricTransport) pruneIdleLocked(now time.Time) {
t.stats.IdleEvicted++
}
}
for peerID, entry := range t.reverseConns {
if entry == nil || entry.conn == nil {
delete(t.reverseConns, peerID)
continue
}
if !entry.lastUsed.IsZero() && now.Sub(entry.lastUsed) > ttl {
if entry.activeStreams > 0 {
continue
}
_ = entry.conn.CloseWithError(0, "idle reverse")
delete(t.reverseConns, peerID)
t.stats.IdleEvicted++
}
}
}
func quicFabricConnKey(target FabricTransportTarget) string {
@@ -340,6 +741,23 @@ func parseQUICFabricConnKey(key string) (peerID string, endpoint string, certSHA
return peerID, endpoint, certSHA256
}
// quicFabricReverseConnKey builds the cache key for a peer's reverse
// connection: "reverse\x00" followed by the trimmed peer ID. A peer ID that is
// empty after trimming yields "".
func quicFabricReverseConnKey(peerID string) string {
	if trimmed := strings.TrimSpace(peerID); trimmed != "" {
		return "reverse\x00" + trimmed
	}
	return ""
}
// fabricTransportPrefersReverseConn reports whether transport, compared
// case-insensitively and ignoring surrounding whitespace, is "reverse_quic"
// or "relay_quic".
func fabricTransportPrefersReverseConn(transport string) bool {
	normalized := strings.ToLower(strings.TrimSpace(transport))
	return normalized == "reverse_quic" || normalized == "relay_quic"
}
func (t *QUICFabricTransport) Close() error {
if t == nil {
return nil
@@ -348,12 +766,19 @@ func (t *QUICFabricTransport) Close() error {
t.stats.CloseAllCalls++
conns := t.conns
t.conns = map[string]*quicFabricConnEntry{}
reverseConns := t.reverseConns
t.reverseConns = map[string]*quicFabricConnEntry{}
t.mu.Unlock()
for _, entry := range conns {
if entry != nil && entry.conn != nil {
_ = entry.conn.CloseWithError(0, "closed")
}
}
for _, entry := range reverseConns {
if entry != nil && entry.conn != nil {
_ = entry.conn.CloseWithError(0, "closed")
}
}
return nil
}
@@ -370,6 +795,7 @@ func (t *QUICFabricTransport) Snapshot() QUICFabricTransportSnapshot {
}
snapshot := QUICFabricTransportSnapshot{
SchemaVersion: "rap.quic_fabric_transport.v1",
LocalPeerID: strings.TrimSpace(t.LocalPeerID),
MaxStreamsPerConn: limit,
Stats: t.stats,
}
@@ -391,6 +817,40 @@ func (t *QUICFabricTransport) Snapshot() QUICFabricTransportSnapshot {
PeerID: peerID,
Endpoint: endpoint,
CertSHA256: certSHA256,
Direction: "outbound",
ActiveStreams: entry.activeStreams,
MaxStreams: limit,
Saturated: entry.activeStreams >= limit,
}
if !entry.lastUsed.IsZero() {
connSnapshot.LastUsedUnixSec = entry.lastUsed.UTC().Unix()
}
if limit > 0 {
connSnapshot.CapacityPressurePercent = (entry.activeStreams * 100) / limit
}
snapshot.Connections = append(snapshot.Connections, connSnapshot)
if entry.activeStreams >= limit {
snapshot.SaturatedConnections++
}
}
}
for peerID, entry := range t.reverseConns {
if entry == nil || entry.conn == nil {
delete(t.reverseConns, peerID)
continue
}
select {
case <-entry.conn.Context().Done():
delete(t.reverseConns, peerID)
t.stats.ClosedEvicted++
snapshot.Stats.ClosedEvicted++
default:
snapshot.ActiveCount++
snapshot.ActiveStreams += entry.activeStreams
connSnapshot := QUICFabricConnSnapshot{
PeerID: peerID,
Endpoint: entry.conn.RemoteAddr().String(),
Direction: "reverse",
ActiveStreams: entry.activeStreams,
MaxStreams: limit,
Saturated: entry.activeStreams >= limit,
@@ -462,6 +922,7 @@ func (s *quicFabricSession) Close() error {
s.closeOnce.Do(func() {
close(s.done)
if s.stream != nil {
s.stream.CancelRead(0)
err = s.stream.Close()
}
if s.transport != nil {
@@ -9,6 +9,7 @@ import (
"crypto/x509"
"crypto/x509/pkix"
"encoding/hex"
"encoding/json"
"encoding/pem"
"math/big"
"strings"
@@ -341,6 +342,119 @@ func TestQUICFabricTransportLimitsStreamsPerConnection(t *testing.T) {
defer second.Close()
}
// TestQUICFabricTransportReusesInboundConnectionForReverseStream checks that a
// server-side transport can open a session back to a client over the
// connection the client dialed in on: the client ("node-a") connects, the
// server's reverse transport registers that connection, and a later Connect to
// "node-a" with transport "relay_quic" carries a production forward frame that
// is answered by the client's inbound production handler.
func TestQUICFabricTransportReusesInboundConnectionForReverseStream(t *testing.T) {
// Transport that the server uses for reverse connections.
reverseTransport := NewQUICFabricTransport(nil)
defer reverseTransport.Close()
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: testQUICTLSConfig(t),
ReverseTransport: reverseTransport,
SyntheticForwardHandler: func(_ context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
envelope.To, envelope.From = envelope.From, PeerIdentity{ClusterID: envelope.ClusterID, NodeID: "node-r"}
return envelope, nil
},
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
// Client transport identifying itself as node-a, with a production handler
// that accepts every envelope and reports itself as the handling node.
clientTransport := NewQUICFabricTransport(nil)
defer clientTransport.Close()
clientTransport.SetLocalPeerID("node-a")
clientTransport.SetInboundHandlers(func(_ context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
return ProductionForwardResult{
Accepted: true,
Delivered: true,
Forwarded: true,
By: PeerIdentity{ClusterID: envelope.ClusterID, NodeID: "node-a"},
MessageID: envelope.MessageID,
RouteID: envelope.RouteID,
}, nil
}, nil, nil)
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
// Forward direction: node-a dials the server.
session, err := clientTransport.Connect(ctx, FabricTransportTarget{
PeerID: "node-r",
Endpoint: server.Addr().String(),
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
NextProtos: []string{fabricQUICNextProto},
},
Timeout: time.Second,
InboundBuffer: 4,
ErrorBuffer: 4,
})
if err != nil {
t.Fatalf("client connect: %v", err)
}
defer session.Close()
// Poll for up to one second until the server side has registered the
// inbound connection as a reverse connection.
deadline := time.Now().Add(time.Second)
for {
if reverseTransport.Snapshot().Stats.ReverseRegisters > 0 {
break
}
if time.Now().After(deadline) {
t.Fatalf("reverse hello did not register connection: %+v", reverseTransport.Snapshot())
}
time.Sleep(10 * time.Millisecond)
}
// Reverse direction: no TLS config is supplied and the endpoint differs from
// the client's real address, so this Connect presumably can only succeed by
// reusing the registered connection; the ReverseReuses assertion at the end
// confirms that reuse happened.
reverseSession, err := reverseTransport.Connect(ctx, FabricTransportTarget{
PeerID: "node-a",
Endpoint: "10.0.0.2:19443",
Transport: "relay_quic",
Timeout: time.Second,
InboundBuffer: 4,
ErrorBuffer: 4,
})
if err != nil {
t.Fatalf("reverse connect: %v", err)
}
defer reverseSession.Close()
productionPayload, err := json.Marshal(ProductionEnvelope{
FabricProtocolVersion: ProtocolVersion,
MessageID: "msg-1",
RouteID: "route-r-a",
ClusterID: "cluster-1",
SourceNodeID: "node-r",
DestinationNodeID: "node-a",
CurrentHopNodeID: "node-a",
NextHopNodeID: "node-a",
ChannelClass: ProductionChannelFabricControl,
MessageType: ProductionMessageFabricControl,
TTL: 4,
CreatedAt: time.Now().UTC(),
ExpiresAt: time.Now().UTC().Add(time.Minute),
PayloadHash: "unused-by-test-handler",
})
if err != nil {
t.Fatalf("marshal production: %v", err)
}
if err := reverseSession.Send(ctx, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: ProductionForwardQUICStreamID, Sequence: 2, Payload: productionPayload}); err != nil {
t.Fatalf("send reverse production: %v", err)
}
// The first frame back must be the response produced by node-a's handler.
select {
case frame := <-reverseSession.Frames():
var response quicProductionForwardResponse
if err := json.Unmarshal(frame.Payload, &response); err != nil {
t.Fatalf("decode response: %v", err)
}
if !response.Result.Accepted || !response.Result.Delivered || response.Result.By.NodeID != "node-a" {
t.Fatalf("response = %+v", response)
}
case err := <-reverseSession.Errors():
t.Fatalf("reverse session error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
// Both the registration and the reuse must be visible in the stats.
snapshot := reverseTransport.Snapshot()
if snapshot.Stats.ReverseRegisters == 0 || snapshot.Stats.ReverseReuses == 0 {
t.Fatalf("reverse connection was not registered/reused: %+v", snapshot)
}
}
func TestQUICFabricServerHandlesFabricFrames(t *testing.T) {
var events []FabricSessionEventLogEntry
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
@@ -389,6 +503,68 @@ func TestQUICFabricServerHandlesFabricFrames(t *testing.T) {
}
}
// TestQUICFabricServerHandlesWebIngressForwardFrames checks that a data frame
// sent on WebIngressForwardQUICStreamID reaches the server's
// WebIngressForwardHandler unchanged, and that the handler's JSON result comes
// back wrapped in a quicWebIngressForwardResponse on the same stream ID and
// sequence number.
func TestQUICFabricServerHandlesWebIngressForwardFrames(t *testing.T) {
// Payload captured by the handler so it can be compared after the round trip.
var received []byte
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: testQUICTLSConfig(t),
WebIngressForwardHandler: func(_ context.Context, payload []byte) ([]byte, error) {
received = append([]byte(nil), payload...)
return []byte(`{"schema_version":"rap.web_ingress.fabric_runtime_response.v1","status_code":200,"body_b64":"b2s="}`), nil
},
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
// NOTE(review): this transport is never closed, unlike the transports in the
// reverse-stream test — consider keeping a reference and deferring Close.
session, err := NewQUICFabricTransport(nil).Connect(ctx, FabricTransportTarget{
Endpoint: server.Addr().String(),
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
NextProtos: []string{fabricQUICNextProto},
},
Timeout: time.Second,
InboundBuffer: 4,
ErrorBuffer: 4,
})
if err != nil {
t.Fatalf("connect quic fabric: %v", err)
}
defer session.Close()
if err := session.Send(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
TrafficClass: fabricproto.TrafficClassReliable,
StreamID: WebIngressForwardQUICStreamID,
Sequence: 44,
Payload: []byte(`{"envelope":true}`),
}); err != nil {
t.Fatalf("send web ingress frame: %v", err)
}
// The reply must echo the stream ID and sequence and carry the handler's
// payload with no error set.
select {
case frame := <-session.Frames():
if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID || frame.Sequence != 44 {
t.Fatalf("frame = %+v", frame)
}
var response quicWebIngressForwardResponse
if err := json.Unmarshal(frame.Payload, &response); err != nil {
t.Fatalf("decode response: %v", err)
}
if string(response.Payload) != `{"schema_version":"rap.web_ingress.fabric_runtime_response.v1","status_code":200,"body_b64":"b2s="}` || response.Error != "" {
t.Fatalf("response = %+v", response)
}
case err := <-session.Errors():
t.Fatalf("session error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
// The handler must have seen exactly the bytes that were sent.
if string(received) != `{"envelope":true}` {
t.Fatalf("received = %s", string(received))
}
}
func startQUICFabricEchoServer(t *testing.T) *quic.Listener {
t.Helper()
return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t))
@@ -0,0 +1,128 @@
package mesh
import (
"strings"
"sync"
"time"
)
// FabricRouteHealthTracker remembers recently failed routes by route ID and
// marks them unhealthy in route sets until their quarantine expires or a
// success is reported.
type FabricRouteHealthTracker struct {
// mu guards routes.
mu sync.Mutex
// QuarantineTTL is how long a route stays quarantined after a failure;
// MarkFailure substitutes 30 seconds when it is not positive.
QuarantineTTL time.Duration
// routes maps a trimmed route ID to its latest failure record.
routes map[string]FabricRouteHealthEntry
}
// FabricRouteHealthEntry is the failure record kept for one quarantined route.
type FabricRouteHealthEntry struct {
// Reason is the trimmed reason given for the most recent failure.
Reason string `json:"reason,omitempty"`
// Failures counts failures recorded since the entry was created; the entry
// is deleted on success or when its quarantine is found expired.
Failures uint64 `json:"failures"`
// LastFailure is the time of the most recent failure.
LastFailure time.Time `json:"last_failure,omitempty"`
// RetryAfter is when the quarantine ends (LastFailure plus the TTL).
RetryAfter time.Time `json:"retry_after,omitempty"`
}
// FabricRouteHealthSnapshot is a point-in-time copy of the tracker's
// still-quarantined routes, keyed by route ID.
type FabricRouteHealthSnapshot struct {
// Quarantined is left nil when no route is currently quarantined.
Quarantined map[string]FabricRouteHealthEntry `json:"quarantined,omitempty"`
}
// NewFabricRouteHealthTracker returns an empty tracker whose quarantine lasts
// ttl, or 30 seconds when ttl is not positive.
func NewFabricRouteHealthTracker(ttl time.Duration) *FabricRouteHealthTracker {
	tracker := &FabricRouteHealthTracker{
		QuarantineTTL: 30 * time.Second,
		routes:        make(map[string]FabricRouteHealthEntry),
	}
	if ttl > 0 {
		tracker.QuarantineTTL = ttl
	}
	return tracker
}
// MarkFailure records a failure for routeID and (re)starts its quarantine.
//
// routeID and reason are trimmed; a nil tracker or empty routeID is ignored.
// A zero now is replaced with the current UTC time. The quarantine ends at
// now plus QuarantineTTL, falling back to 30 seconds when the TTL is not
// positive. The failure counter accumulates across calls.
func (t *FabricRouteHealthTracker) MarkFailure(routeID string, reason string, now time.Time) {
	if t == nil {
		return
	}
	key := strings.TrimSpace(routeID)
	if key == "" {
		return
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	quarantine := t.QuarantineTTL
	if quarantine <= 0 {
		quarantine = 30 * time.Second
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	// A zero-value tracker has no map yet; create it before storing.
	if t.routes == nil {
		t.routes = map[string]FabricRouteHealthEntry{}
	}
	record := t.routes[key]
	record.Failures++
	record.Reason = strings.TrimSpace(reason)
	record.LastFailure = now
	record.RetryAfter = now.Add(quarantine)
	t.routes[key] = record
}
// MarkSuccess clears any failure record for routeID, ending its quarantine.
// routeID is trimmed; a nil tracker or empty routeID is ignored.
func (t *FabricRouteHealthTracker) MarkSuccess(routeID string) {
	if t == nil {
		return
	}
	if key := strings.TrimSpace(routeID); key != "" {
		t.mu.Lock()
		defer t.mu.Unlock()
		delete(t.routes, key)
	}
}
// Apply returns routeSet with every currently quarantined route marked
// Healthy=false and Degraded=true.
//
// A zero now is replaced with the current UTC time. Routes are matched by
// their trimmed RouteID, the same normalization MarkFailure and MarkSuccess
// apply when storing keys. An entry whose RetryAfter has been reached is
// deleted and its route is returned unchanged. A nil tracker or a tracker
// with no entries returns routeSet as is.
func (t *FabricRouteHealthTracker) Apply(routeSet FabricRouteSet, now time.Time) FabricRouteSet {
	if t == nil {
		return routeSet
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	if len(t.routes) == 0 {
		return routeSet
	}
	// The callback runs synchronously while t.mu is held, so it may read and
	// delete entries directly.
	return mapFabricRouteSet(routeSet, func(route FabricRoute) FabricRoute {
		// Keys are stored trimmed; look up with the same normalization so a
		// route ID carrying stray whitespace still matches its entry.
		routeID := strings.TrimSpace(route.RouteID)
		entry, ok := t.routes[routeID]
		if !ok {
			return route
		}
		if !entry.RetryAfter.IsZero() && !now.Before(entry.RetryAfter) {
			// Quarantine expired: forget the failure and leave the route alone.
			delete(t.routes, routeID)
			return route
		}
		route.Healthy = false
		route.Degraded = true
		return route
	})
}
// Snapshot returns a copy of the routes still quarantined at now. A zero now
// is replaced with the current UTC time. Expired entries are left out of the
// result but not removed from the tracker. With nothing quarantined, or a nil
// tracker, the zero snapshot is returned.
func (t *FabricRouteHealthTracker) Snapshot(now time.Time) FabricRouteHealthSnapshot {
	var snapshot FabricRouteHealthSnapshot
	if t == nil {
		return snapshot
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	active := map[string]FabricRouteHealthEntry{}
	for id, record := range t.routes {
		expired := !record.RetryAfter.IsZero() && !now.Before(record.RetryAfter)
		if !expired {
			active[id] = record
		}
	}
	if len(active) > 0 {
		snapshot.Quarantined = active
	}
	return snapshot
}
// mapFabricRouteSet returns routeSet with fn applied to the primary route
// (only when it has a non-blank RouteID) and to every warm-standby and
// cold-fallback route, in that order.
//
// The standby and fallback slices are rebuilt rather than rewritten in place:
// routeSet is passed by value, but its slices still share backing arrays with
// the caller's copy, so an in-place write would also modify the caller's
// routes while leaving the caller's Primary untouched. Nil or empty slices
// are passed through unchanged.
func mapFabricRouteSet(routeSet FabricRouteSet, fn func(FabricRoute) FabricRoute) FabricRouteSet {
	if strings.TrimSpace(routeSet.Primary.RouteID) != "" {
		routeSet.Primary = fn(routeSet.Primary)
	}
	if len(routeSet.WarmStandby) > 0 {
		standby := make([]FabricRoute, len(routeSet.WarmStandby))
		for i, route := range routeSet.WarmStandby {
			standby[i] = fn(route)
		}
		routeSet.WarmStandby = standby
	}
	if len(routeSet.ColdFallbacks) > 0 {
		fallbacks := make([]FabricRoute, len(routeSet.ColdFallbacks))
		for i, route := range routeSet.ColdFallbacks {
			fallbacks[i] = fn(route)
		}
		routeSet.ColdFallbacks = fallbacks
	}
	return routeSet
}
@@ -0,0 +1,322 @@
package mesh
import (
"encoding/json"
"fmt"
"strings"
"time"
)
// Values used for an endpoint candidate's Reachability and ConnectivityMode
// fields. The route planner compares them after trimming and lower-casing the
// candidate's value.
const (
// Reachability values.
FabricCandidateReachabilityPublic = "public"
FabricCandidateReachabilityPrivate = "private"
FabricCandidateReachabilityRelay = "relay"
FabricCandidateReachabilityOutboundOnly = "outbound_only"
// ConnectivityMode values.
FabricConnectivityDirect = "direct"
FabricConnectivityOutboundOnly = "outbound_only"
FabricConnectivityRelayRequired = "relay_required"
)
// FabricRoutePlannerConfig carries the local node's context and the health
// and load inputs used when turning endpoint candidates into fabric routes.
type FabricRoutePlannerConfig struct {
// ClusterID is copied (trimmed) into every planned route.
ClusterID string
// LocalNodeID becomes the route's source node and, when non-empty, its
// first hop.
LocalNodeID string
// LocalSegmentID and LocalNATGroupID are compared case-insensitively with a
// candidate's metadata; a match makes the route a LAN route.
LocalSegmentID string
LocalNATGroupID string
// DefaultCapacity is the route capacity for non-relay, non-reverse modes and
// the fallback for the two below; 100 is used when nothing is positive.
DefaultCapacity int
// RelayCapacity and ReverseCapacity override DefaultCapacity for relay and
// reverse routes when positive.
RelayCapacity int
ReverseCapacity int
// Observations holds health observations keyed by candidate endpoint ID.
Observations map[string]EndpointCandidateHealthObservation
// CapacityPressure holds load reports keyed by candidate endpoint ID; its
// Count becomes the route's ActiveChannels.
CapacityPressure map[string]EndpointCandidateCapacityPressure
// Now is the planning time; the current UTC time is used when zero.
Now time.Time
// MaxObservationAge is passed to candidate ranking; 30 seconds when not
// positive.
MaxObservationAge time.Duration
// MaxCapacityPressureAge is passed to candidate ranking; 10 seconds when not
// positive.
MaxCapacityPressureAge time.Duration
}
// FabricCandidateMetadata is the subset of a candidate's JSON metadata that
// the route planner reads to pick a route mode and build hops.
type FabricCandidateMetadata struct {
// LocalSegmentID and NATGroupID are matched against the planner's local
// values to detect LAN-reachable candidates.
LocalSegmentID string `json:"local_segment_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
// RelayNodeID names the relay hop; ViaNodeID is used when it is empty.
RelayNodeID string `json:"relay_node_id,omitempty"`
// RelayEndpoint is the relay hop's address; when set it also forces relay
// mode. The candidate's own address is used for the relay hop when empty.
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
// A non-empty STUNServer or ICEFoundation selects ICE mode when no earlier
// rule applies.
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
}
// FabricRouteSetForPeerEndpointCandidates plans a route set towards
// targetNodeID from its endpoint candidates.
//
// When targetNodeID is blank, the first candidate's node ID is used. With no
// candidates, a route set carrying only the target is returned. Otherwise the
// candidates are ranked with the configured observations and capacity
// pressure (default maximum ages 30s and 10s), each usable candidate becomes
// a route in ranking order, and the routes are arranged by routeSetFromRoutes.
func FabricRouteSetForPeerEndpointCandidates(targetNodeID string, candidates []PeerEndpointCandidate, cfg FabricRoutePlannerConfig) FabricRouteSet {
	target := strings.TrimSpace(targetNodeID)
	if target == "" && len(candidates) > 0 {
		target = strings.TrimSpace(candidates[0].NodeID)
	}
	result := FabricRouteSet{TargetKind: FabricChannelTargetNode, TargetID: target}
	if len(candidates) == 0 {
		return result
	}

	planTime := cfg.Now
	if planTime.IsZero() {
		planTime = time.Now().UTC()
	}
	scoring := EndpointCandidateScoreOptions{
		Now:                    planTime,
		Observations:           cfg.Observations,
		MaxObservationAge:      firstNonZeroDuration(cfg.MaxObservationAge, 30*time.Second),
		CapacityPressure:       cfg.CapacityPressure,
		MaxCapacityPressureAge: firstNonZeroDuration(cfg.MaxCapacityPressureAge, 10*time.Second),
	}
	ranked := RankPeerEndpointCandidates(candidates, scoring)

	planned := make([]FabricRoute, 0, len(ranked))
	for position := range ranked {
		// The ranking position feeds the latency estimate of each route.
		route, usable := fabricRouteForPeerEndpointCandidate(ranked[position].Candidate, cfg, ranked[position].Score, position, planTime)
		if !usable {
			continue
		}
		planned = append(planned, route)
	}
	return routeSetFromRoutes(result, planned)
}
// FabricRouteSetsForPeerEndpointCandidates plans one route set per node from
// candidatesByNode. Node IDs are trimmed; blank IDs are skipped, and so are
// nodes whose plan has no primary, warm-standby or cold-fallback route.
func FabricRouteSetsForPeerEndpointCandidates(candidatesByNode map[string][]PeerEndpointCandidate, cfg FabricRoutePlannerConfig) map[string]FabricRouteSet {
	planned := make(map[string]FabricRouteSet, len(candidatesByNode))
	for rawID, nodeCandidates := range candidatesByNode {
		nodeID := strings.TrimSpace(rawID)
		if nodeID == "" {
			continue
		}
		routeSet := FabricRouteSetForPeerEndpointCandidates(nodeID, nodeCandidates, cfg)
		hasPrimary := strings.TrimSpace(routeSet.Primary.RouteID) != ""
		if !hasPrimary && len(routeSet.WarmStandby) == 0 && len(routeSet.ColdFallbacks) == 0 {
			continue
		}
		planned[nodeID] = routeSet
	}
	return planned
}
// fabricRouteForPeerEndpointCandidate converts one ranked candidate into a
// FabricRoute whose RouteID is the candidate's endpoint ID.
//
// It reports false when the endpoint ID, node ID or address is blank, when
// the transport is not a QUIC transport, or when no hops can be built.
// Health defaults to healthy and not degraded; an observation for the
// endpoint marks the route unhealthy for a non-zero reliability score below
// 50 and degraded for a last latency of 250ms or more.
func fabricRouteForPeerEndpointCandidate(candidate PeerEndpointCandidate, cfg FabricRoutePlannerConfig, score int, index int, now time.Time) (FabricRoute, bool) {
	// Normalize in place: the helpers below receive the cleaned candidate.
	candidate.EndpointID = strings.TrimSpace(candidate.EndpointID)
	candidate.NodeID = strings.TrimSpace(candidate.NodeID)
	candidate.Address = strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
	switch {
	case candidate.EndpointID == "",
		candidate.NodeID == "",
		candidate.Address == "",
		!isQUICOnlyCandidateTransport(candidate.Transport):
		return FabricRoute{}, false
	}

	metadata := decodeFabricCandidateMetadata(candidate.Metadata)
	mode := fabricRouteModeForPeerEndpointCandidate(candidate, metadata, cfg)
	path := fabricRouteHopsForCandidate(candidate, metadata, mode, cfg)
	if len(path) == 0 {
		return FabricRoute{}, false
	}

	relayHops := 0
	for i := range path {
		if path[i].Mode == FabricRouteRelay {
			relayHops++
		}
	}

	latencyMs := fabricRouteLatencyFromCandidate(candidate, cfg, score, index)
	capacity := fabricRouteCapacityForMode(mode, cfg)
	if capacity <= 0 {
		capacity = 100
	}

	healthy, degraded := true, false
	if seen, found := cfg.Observations[candidate.EndpointID]; found {
		// A reliability score of zero is treated as healthy.
		healthy = seen.ReliabilityScore == 0 || seen.ReliabilityScore >= 50
		degraded = seen.LastLatencyMs >= 250
	}

	route := FabricRoute{
		RouteID:           candidate.EndpointID,
		ClusterID:         strings.TrimSpace(cfg.ClusterID),
		SourceNodeID:      strings.TrimSpace(cfg.LocalNodeID),
		DestinationNodeID: candidate.NodeID,
		Hops:              path,
		BaseLatencyMs:     latencyMs,
		Capacity:          capacity,
		ActiveChannels:    int(candidatePressureCount(candidate.EndpointID, cfg)),
		RelayCount:        relayHops,
		Healthy:           healthy,
		Degraded:          degraded,
		LastUpdatedAt:     now,
	}
	return route, true
}
// fabricRouteModeForPeerEndpointCandidate picks the route mode for a
// candidate. An explicit relay, reverse, ICE or LAN mode derived from the
// candidate's transport wins. Otherwise the first matching rule applies:
// same local segment or NAT group -> LAN; relay reachability, relay-required
// connectivity or a relay endpoint in the metadata -> relay; outbound-only
// connectivity or reachability -> reverse; a STUN server, ICE foundation or
// NAT type -> ICE; anything else -> direct.
func fabricRouteModeForPeerEndpointCandidate(candidate PeerEndpointCandidate, metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) FabricRouteMode {
	switch viaTransport := fabricRouteModeForTransportTarget(FabricTransportTarget{Transport: candidate.Transport}); viaTransport {
	case FabricRouteRelay, FabricRouteReverse, FabricRouteICE, FabricRouteLAN:
		return viaTransport
	}

	reach := strings.ToLower(strings.TrimSpace(candidate.Reachability))
	connect := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
	switch {
	case sameLocalSegment(metadata, cfg) || sameNATGroup(metadata, cfg):
		return FabricRouteLAN
	case reach == FabricCandidateReachabilityRelay,
		connect == FabricConnectivityRelayRequired,
		strings.TrimSpace(metadata.RelayEndpoint) != "":
		return FabricRouteRelay
	case connect == FabricConnectivityOutboundOnly,
		reach == FabricCandidateReachabilityOutboundOnly:
		return FabricRouteReverse
	case strings.TrimSpace(metadata.STUNServer) != "",
		strings.TrimSpace(metadata.ICEFoundation) != "",
		candidate.NATType != "":
		return FabricRouteICE
	default:
		return FabricRouteDirect
	}
}
// fabricRouteHopsForCandidate builds the hop list for a candidate in the
// given mode.
//
// Every known mode starts with the local node (when configured) and ends
// with the target node's endpoint, including any pinned certificate hash. In
// relay mode the local hop is marked direct, and a relay hop is inserted
// before the target when the metadata names a relay node (RelayNodeID, else
// ViaNodeID); the relay hop's address is the metadata's relay endpoint, or
// the candidate's address when that is empty. Unknown modes yield nil.
func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata FabricCandidateMetadata, mode FabricRouteMode, cfg FabricRoutePlannerConfig) []FabricRouteHop {
	local := strings.TrimSpace(cfg.LocalNodeID)
	target := strings.TrimSpace(candidate.NodeID)
	address := strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
	// destinationHop is the final hop shared by every supported mode.
	destinationHop := func() FabricRouteHop {
		return FabricRouteHop{
			NodeID:         target,
			Mode:           mode,
			EndpointID:     candidate.EndpointID,
			Address:        address,
			PeerCertSHA256: candidatePeerCertSHA256(candidate),
		}
	}

	path := []FabricRouteHop{}
	switch mode {
	case FabricRouteRelay:
		via := firstNonEmpty(strings.TrimSpace(metadata.RelayNodeID), strings.TrimSpace(metadata.ViaNodeID))
		viaAddress := firstNonEmpty(strings.TrimRight(strings.TrimSpace(metadata.RelayEndpoint), "/"), address)
		if local != "" {
			path = append(path, FabricRouteHop{NodeID: local, Mode: FabricRouteDirect})
		}
		if via != "" {
			path = append(path, FabricRouteHop{
				NodeID:     via,
				Mode:       FabricRouteRelay,
				EndpointID: candidate.EndpointID + ":relay",
				Address:    viaAddress,
			})
		}
		return append(path, destinationHop())
	case FabricRouteLAN, FabricRouteICE, FabricRouteReverse, FabricRouteDirect:
		if local != "" {
			path = append(path, FabricRouteHop{NodeID: local, Mode: mode})
		}
		return append(path, destinationHop())
	default:
		return nil
	}
}
// isQUICOnlyCandidateTransport reports whether transport, compared
// case-insensitively and ignoring surrounding whitespace, is one of the QUIC
// transport names or the string form of the LAN, reverse, relay or ICE route
// mode.
func isQUICOnlyCandidateTransport(transport string) bool {
	normalized := strings.ToLower(strings.TrimSpace(transport))
	accepted := [...]string{
		"quic", "direct_quic", "udp_quic", "quic_udp",
		string(FabricRouteLAN), string(FabricRouteReverse), string(FabricRouteRelay), string(FabricRouteICE),
	}
	for _, name := range accepted {
		if normalized == name {
			return true
		}
	}
	return false
}
// fabricRouteLatencyFromCandidate returns the base latency in milliseconds
// for a candidate's route.
//
// A positive observed latency for the endpoint is used as is, clamped to the
// largest int. Otherwise the value is estimated from reachability (private 3,
// outbound-only 25, relay 40, anything else 10), plus the ranking index, plus
// one tenth of the score's shortfall below 100.
func fabricRouteLatencyFromCandidate(candidate PeerEndpointCandidate, cfg FabricRoutePlannerConfig, score int, index int) int {
	const maxInt = int(^uint(0) >> 1)
	if seen, found := cfg.Observations[candidate.EndpointID]; found && seen.LastLatencyMs > 0 {
		if seen.LastLatencyMs > int64(maxInt) {
			return maxInt
		}
		return int(seen.LastLatencyMs)
	}

	estimate := 10
	switch strings.ToLower(strings.TrimSpace(candidate.Reachability)) {
	case FabricCandidateReachabilityPrivate:
		estimate = 3
	case FabricCandidateReachabilityOutboundOnly:
		estimate = 25
	case FabricCandidateReachabilityRelay:
		estimate = 40
	}
	estimate += index
	if score < 100 {
		estimate += (100 - score) / 10
	}
	return estimate
}
// fabricRouteCapacityForMode returns the channel capacity for a route mode:
// the relay or reverse capacity when that mode applies and the value is
// positive, otherwise the default capacity, otherwise 100.
func fabricRouteCapacityForMode(mode FabricRouteMode, cfg FabricRoutePlannerConfig) int {
	// Zero means "no mode-specific override" and is skipped by
	// firstPositiveInt.
	modeCapacity := 0
	switch mode {
	case FabricRouteRelay:
		modeCapacity = cfg.RelayCapacity
	case FabricRouteReverse:
		modeCapacity = cfg.ReverseCapacity
	}
	return firstPositiveInt(modeCapacity, cfg.DefaultCapacity, 100)
}
// candidatePressureCount returns the reported capacity-pressure count for
// endpointID, or 0 when the config has no entry for it.
func candidatePressureCount(endpointID string, cfg FabricRoutePlannerConfig) int64 {
	pressure, tracked := cfg.CapacityPressure[endpointID]
	if !tracked {
		return 0
	}
	return pressure.Count
}
// sameLocalSegment reports whether the candidate's local segment ID matches
// the configured one, ignoring case and surrounding whitespace. It is always
// false when no local segment is configured.
func sameLocalSegment(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
	want := strings.TrimSpace(cfg.LocalSegmentID)
	return want != "" && strings.EqualFold(strings.TrimSpace(metadata.LocalSegmentID), want)
}
// sameNATGroup reports whether the candidate's NAT group ID matches the
// configured one, ignoring case and surrounding whitespace. It is always
// false when no local NAT group is configured.
func sameNATGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
	want := strings.TrimSpace(cfg.LocalNATGroupID)
	return want != "" && strings.EqualFold(strings.TrimSpace(metadata.NATGroupID), want)
}
// decodeFabricCandidateMetadata parses a candidate's raw JSON metadata.
// Empty or undecodable input yields the zero FabricCandidateMetadata.
func decodeFabricCandidateMetadata(raw json.RawMessage) FabricCandidateMetadata {
	var decoded FabricCandidateMetadata
	if len(raw) == 0 {
		return decoded
	}
	if json.Unmarshal(raw, &decoded) != nil {
		// Discard anything a failed decode may have partially filled in.
		return FabricCandidateMetadata{}
	}
	return decoded
}
// candidatePeerCertSHA256 extracts a pinned certificate hash from a
// candidate's JSON metadata, taking "peer_cert_sha256" first and
// "tls_cert_sha256" second (both trimmed, chosen via firstNonEmpty). Missing
// or undecodable metadata yields "".
func candidatePeerCertSHA256(candidate PeerEndpointCandidate) string {
	if len(candidate.Metadata) == 0 {
		return ""
	}
	pins := struct {
		PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
		TLSCertSHA256  string `json:"tls_cert_sha256,omitempty"`
	}{}
	if json.Unmarshal(candidate.Metadata, &pins) != nil {
		return ""
	}
	peerPin := strings.TrimSpace(pins.PeerCertSHA256)
	tlsPin := strings.TrimSpace(pins.TLSCertSHA256)
	return firstNonEmpty(peerPin, tlsPin)
}
// firstPositiveInt returns the first value greater than zero, or 0 when
// there is none.
func firstPositiveInt(values ...int) int {
	for i := range values {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
// firstNonZeroDuration returns the first duration greater than zero, or 0
// when there is none. Negative durations are skipped as well.
func firstNonZeroDuration(values ...time.Duration) time.Duration {
	for i := range values {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
// FabricRouteSetForRelayFallback plans a route set that reaches targetNodeID
// through a single relay. It synthesizes one relay candidate named
// "<target>-via-<relay>-relay" at targetEndpoint, with the relay node and
// relay endpoint stored in its metadata, and plans it with
// FabricRouteSetForPeerEndpointCandidates. Both endpoints are trimmed and
// stripped of trailing slashes.
func FabricRouteSetForRelayFallback(clusterID string, sourceNodeID string, targetNodeID string, relayNodeID string, relayEndpoint string, targetEndpoint string) FabricRouteSet {
	relayAddress := strings.TrimRight(strings.TrimSpace(relayEndpoint), "/")
	targetAddress := strings.TrimRight(strings.TrimSpace(targetEndpoint), "/")
	target := strings.TrimSpace(targetNodeID)

	relayInfo := FabricCandidateMetadata{RelayNodeID: relayNodeID, RelayEndpoint: relayAddress}
	fallback := PeerEndpointCandidate{
		EndpointID:       target + "-via-" + strings.TrimSpace(relayNodeID) + "-relay",
		NodeID:           target,
		Transport:        string(FabricRouteRelay),
		Address:          targetAddress,
		Reachability:     FabricCandidateReachabilityRelay,
		ConnectivityMode: FabricConnectivityRelayRequired,
		Metadata:         mustMarshalFabricCandidateMetadata(relayInfo),
	}
	planner := FabricRoutePlannerConfig{ClusterID: clusterID, LocalNodeID: sourceNodeID}
	return FabricRouteSetForPeerEndpointCandidates(targetNodeID, []PeerEndpointCandidate{fallback}, planner)
}
// mustMarshalFabricCandidateMetadata JSON-encodes metadata. Despite the
// "must" prefix it does not panic: an encoding error is discarded and the
// (then nil) result is returned.
func mustMarshalFabricCandidateMetadata(metadata FabricCandidateMetadata) json.RawMessage {
	encoded, _ := json.Marshal(metadata)
	return json.RawMessage(encoded)
}
@@ -0,0 +1,187 @@
package mesh
import (
"encoding/json"
"testing"
"time"
)
// TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN offers a public
// candidate and a private candidate that shares the planner's local segment,
// and expects the private one to become the primary route in LAN mode.
func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
	metadata, err := json.Marshal(FabricCandidateMetadata{LocalSegmentID: "site-a", NATGroupID: "nat-a"})
	if err != nil {
		t.Fatalf("marshal candidate metadata: %v", err)
	}
	routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{
		{
			EndpointID:       "node-b-public",
			NodeID:           "node-b",
			Transport:        "quic",
			Address:          "quic://203.0.113.10:19443",
			Reachability:     "public",
			ConnectivityMode: "direct",
			Priority:         10,
		},
		{
			EndpointID:       "node-b-lan",
			NodeID:           "node-b",
			Transport:        "quic",
			Address:          "quic://10.10.0.12:19443",
			Reachability:     "private",
			ConnectivityMode: "direct",
			PolicyTags:       []string{"private-lan"},
			Metadata:         metadata,
		},
	}, FabricRoutePlannerConfig{
		ClusterID:       "cluster-1",
		LocalNodeID:     "node-a",
		LocalSegmentID:  "site-a",
		DefaultCapacity: 200,
		Now:             time.Unix(100, 0).UTC(),
	})
	if routeSet.Primary.RouteID != "node-b-lan" {
		t.Fatalf("primary route = %q, want node-b-lan", routeSet.Primary.RouteID)
	}
	// Guard the index below so a hop-less route fails with a message instead
	// of an index-out-of-range panic.
	hops := routeSet.Primary.Hops
	if len(hops) == 0 {
		t.Fatalf("primary route %q has no hops", routeSet.Primary.RouteID)
	}
	if mode := hops[len(hops)-1].Mode; mode != FabricRouteLAN {
		t.Fatalf("primary mode = %q, want lan", mode)
	}
}
// TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback plans a
// relay-required candidate whose metadata names relay node-r, and checks the
// resulting primary route's ID, relay count, relay hop and capacity.
func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T) {
	metadata, err := json.Marshal(FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"})
	if err != nil {
		t.Fatalf("marshal candidate metadata: %v", err)
	}
	routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "quic",
		Address:          "quic://node-b-passive:19443",
		Reachability:     "outbound_only",
		ConnectivityMode: "relay_required",
		NATType:          "symmetric",
		Metadata:         metadata,
	}}, FabricRoutePlannerConfig{
		ClusterID:     "cluster-1",
		LocalNodeID:   "node-a",
		RelayCapacity: 50,
		Now:           time.Unix(100, 0).UTC(),
	})
	primary := routeSet.Primary
	if primary.RouteID != "node-b-relay" {
		t.Fatalf("primary route = %q", primary.RouteID)
	}
	if primary.RelayCount != 2 {
		t.Fatalf("relay count = %d, want 2", primary.RelayCount)
	}
	// Hops[1] is read next; fail with a message rather than panic when the
	// planner produced fewer hops than expected.
	if len(primary.Hops) < 2 {
		t.Fatalf("hops = %+v, want a relay hop at index 1", primary.Hops)
	}
	if got := primary.Hops[1].NodeID; got != "node-r" {
		t.Fatalf("relay hop = %q, want node-r", got)
	}
	if primary.Capacity != 50 {
		t.Fatalf("capacity = %d, want 50", primary.Capacity)
	}
}
func TestFabricRouteSetForPeerEndpointCandidatesUsesTargetWhenRelayMetadataIsAbsent(t *testing.T) {
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay_quic",
Address: "quic://node-b:19443",
Reachability: "relay",
ConnectivityMode: "relay_required",
Metadata: json.RawMessage(`{"tls_cert_sha256":"abc123"}`),
}}, FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
if routeSet.Primary.RouteID != "node-b-relay" {
t.Fatalf("primary route = %q", routeSet.Primary.RouteID)
}
if len(routeSet.Primary.Hops) != 2 {
t.Fatalf("hops = %+v, want local + target only", routeSet.Primary.Hops)
}
targetHop := routeSet.Primary.Hops[1]
if targetHop.NodeID != "node-b" || targetHop.Mode != FabricRouteRelay || targetHop.PeerCertSHA256 != "abc123" {
t.Fatalf("target hop = %+v, want relay-mode target with cert", targetHop)
}
}
// TestFabricRouteSetForPeerEndpointCandidatesAcceptsExplicitQUICModes feeds
// each mode-specific QUIC transport label to the planner and checks that the
// final hop of the primary route reports the matching mode and the
// certificate fingerprint taken from the candidate metadata.
func TestFabricRouteSetForPeerEndpointCandidatesAcceptsExplicitQUICModes(t *testing.T) {
	for _, tc := range []struct {
		name      string
		transport string
		wantMode  FabricRouteMode
	}{
		{name: "lan", transport: "lan_quic", wantMode: FabricRouteLAN},
		{name: "reverse", transport: "reverse_quic", wantMode: FabricRouteReverse},
		{name: "relay", transport: "relay_quic", wantMode: FabricRouteRelay},
		{name: "ice", transport: "ice_quic", wantMode: FabricRouteICE},
	} {
		t.Run(tc.name, func(t *testing.T) {
			routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
				EndpointID:       "node-b-" + tc.name,
				NodeID:           "node-b",
				Transport:        tc.transport,
				Address:          "quic://node-b:19443",
				Reachability:     "private",
				ConnectivityMode: "direct",
				Metadata:         json.RawMessage(`{"tls_cert_sha256":"abc123"}`),
			}}, FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
			if routeSet.Primary.RouteID == "" {
				t.Fatalf("%s candidate produced empty route set", tc.transport)
			}
			// A route ID alone does not prove hops exist; guard the index so
			// the subtest fails with a message instead of panicking.
			hops := routeSet.Primary.Hops
			if len(hops) == 0 {
				t.Fatalf("%s candidate produced route %q without hops", tc.transport, routeSet.Primary.RouteID)
			}
			hop := hops[len(hops)-1]
			if hop.Mode != tc.wantMode {
				t.Fatalf("mode = %q, want %q", hop.Mode, tc.wantMode)
			}
			if hop.PeerCertSHA256 != "abc123" {
				t.Fatalf("peer cert = %q, want abc123", hop.PeerCertSHA256)
			}
		})
	}
}
// TestFabricRouteSetForPeerEndpointCandidatesTreatsSameNATGroupAsLAN plans a
// private candidate whose metadata NAT group equals the planner's local NAT
// group and expects the final hop of the primary route to be in LAN mode.
func TestFabricRouteSetForPeerEndpointCandidatesTreatsSameNATGroupAsLAN(t *testing.T) {
	metadata, err := json.Marshal(FabricCandidateMetadata{NATGroupID: "nat-a"})
	if err != nil {
		t.Fatalf("marshal candidate metadata: %v", err)
	}
	routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
		EndpointID:       "node-b-nat-lan",
		NodeID:           "node-b",
		Transport:        "quic",
		Address:          "quic://10.44.0.12:19443",
		Reachability:     "private",
		ConnectivityMode: "direct",
		NATType:          "symmetric",
		Metadata:         metadata,
	}}, FabricRoutePlannerConfig{
		ClusterID:       "cluster-1",
		LocalNodeID:     "node-a",
		LocalNATGroupID: "nat-a",
	})
	// Without this guard a rejected candidate (empty route set) would panic
	// on the index below instead of producing a readable failure.
	hops := routeSet.Primary.Hops
	if len(hops) == 0 {
		t.Fatalf("route = %+v, want LAN mode for same NAT group", routeSet.Primary)
	}
	if hops[len(hops)-1].Mode != FabricRouteLAN {
		t.Fatalf("route = %+v, want LAN mode for same NAT group", routeSet.Primary)
	}
}
func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
for _, candidate := range []PeerEndpointCandidate{
{
EndpointID: "node-b-http",
NodeID: "node-b",
Transport: "direct_http",
Address: "http://node-b:8080",
Reachability: "public",
ConnectivityMode: "direct",
},
{
EndpointID: "node-b-legacy-relay",
NodeID: "node-b",
Transport: "relay",
Address: "quic://node-r:19443",
Reachability: "relay",
ConnectivityMode: "relay_required",
},
{
EndpointID: "node-b-legacy-reverse",
NodeID: "node-b",
Transport: "outbound_reverse",
Address: "quic://node-b:19443",
Reachability: "outbound_only",
ConnectivityMode: "outbound_only",
},
} {
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{candidate}, FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
if routeSet.Primary.RouteID != "" || len(routeSet.WarmStandby) != 0 {
t.Fatalf("non-quic candidate produced route set: %+v", routeSet)
}
}
}
@@ -0,0 +1,137 @@
package mesh
import (
"strings"
"sync"
"sync/atomic"
)
// FabricRoutePressureTracker counts how many holders currently occupy each
// route, together with high-water marks and lifetime totals. Acquire and
// SnapshotPressure take mu around every access to the other fields.
type FabricRoutePressureTracker struct {
	mu sync.Mutex
	// active is the live holder count per route ID; maxActive is the highest
	// value each route's count has reached.
	active, maxActive map[string]int
	// Lifetime number of Acquire calls and of first-time releases.
	acquiredTotal, releasedTotal uint64
	// maxActiveTotal is the highest sum of all active counts seen at acquire time.
	maxActiveTotal int
	// Route IDs of the most recent acquire and release.
	lastAcquiredRoute, lastReleasedRoute string
}
// FabricRoutePressureSnapshot is the JSON-serializable view of a
// FabricRoutePressureTracker returned by SnapshotPressure. The maps are copies
// made under the tracker's lock.
type FabricRoutePressureSnapshot struct {
// Active is the holder count per route ID at snapshot time.
Active map[string]int `json:"active"`
// MaxActive is the highest holder count each route has reached.
MaxActive map[string]int `json:"max_active"`
// ActiveTotal is the sum of all counts in Active.
ActiveTotal int `json:"active_total"`
// MaxActiveTotal is the highest total holder count recorded at acquire time.
MaxActiveTotal int `json:"max_active_total"`
// AcquiredTotal counts acquisitions over the tracker's lifetime.
AcquiredTotal uint64 `json:"acquired_total"`
// ReleasedTotal counts releases over the tracker's lifetime.
ReleasedTotal uint64 `json:"released_total"`
// LastAcquiredRoute is the route ID of the most recent acquisition.
LastAcquiredRoute string `json:"last_acquired_route,omitempty"`
// LastReleasedRoute is the route ID of the most recent release.
LastReleasedRoute string `json:"last_released_route,omitempty"`
}
// NewFabricRoutePressureTracker returns an empty tracker whose per-route maps
// are already allocated.
func NewFabricRoutePressureTracker() *FabricRoutePressureTracker {
	tracker := new(FabricRoutePressureTracker)
	tracker.active = make(map[string]int)
	tracker.maxActive = make(map[string]int)
	return tracker
}
// Apply returns routeSet with each route's ActiveChannels increased by the
// number of holders this tracker currently records for that route ID.
//
// The primary route, warm standbys and cold fallbacks are all adjusted. The
// standby and fallback slices are copied before adjustment, so the caller's
// route set is left untouched and applying pressure to the same route set
// more than once does not accumulate counts. A nil tracker, or one with no
// active holders, returns routeSet unchanged.
func (t *FabricRoutePressureTracker) Apply(routeSet FabricRouteSet) FabricRouteSet {
	if t == nil {
		return routeSet
	}
	active := t.Snapshot()
	if len(active) == 0 {
		return routeSet
	}
	withPressure := func(route FabricRoute) FabricRoute {
		if count := active[route.RouteID]; count > 0 {
			route.ActiveChannels += count
		}
		return route
	}
	// overlay adjusts a fresh copy: routeSet arrived by value, but its slices
	// still share backing arrays with the caller's route set.
	overlay := func(routes []FabricRoute) []FabricRoute {
		if len(routes) == 0 {
			return routes
		}
		adjusted := make([]FabricRoute, len(routes))
		for i, route := range routes {
			adjusted[i] = withPressure(route)
		}
		return adjusted
	}
	routeSet.Primary = withPressure(routeSet.Primary)
	routeSet.WarmStandby = overlay(routeSet.WarmStandby)
	routeSet.ColdFallbacks = overlay(routeSet.ColdFallbacks)
	return routeSet
}
// Acquire records one more holder on routeID and returns the function that
// releases it. The route ID is whitespace-trimmed first; a nil tracker or an
// empty ID records nothing and returns a no-op.
//
// The returned release function is safe to call more than once: only the
// first call changes the counters.
func (t *FabricRoutePressureTracker) Acquire(routeID string) func() {
	key := strings.TrimSpace(routeID)
	if key == "" || t == nil {
		return func() {}
	}

	func() {
		t.mu.Lock()
		defer t.mu.Unlock()
		// Tolerate a zero-value tracker that skipped the constructor.
		if t.active == nil {
			t.active = map[string]int{}
		}
		if t.maxActive == nil {
			t.maxActive = map[string]int{}
		}
		current := t.active[key] + 1
		t.active[key] = current
		if current > t.maxActive[key] {
			t.maxActive[key] = current
		}
		t.acquiredTotal++
		t.lastAcquiredRoute = key
		if total := activeTotalLocked(t.active); total > t.maxActiveTotal {
			t.maxActiveTotal = total
		}
	}()

	var done atomic.Bool
	return func() {
		// Only the first caller flips the flag and proceeds.
		if !done.CompareAndSwap(false, true) {
			return
		}
		t.mu.Lock()
		defer t.mu.Unlock()
		// Drop the entry once the last holder leaves so the active map only
		// lists routes that are in use.
		if remaining := t.active[key] - 1; remaining > 0 {
			t.active[key] = remaining
		} else {
			delete(t.active, key)
		}
		t.releasedTotal++
		t.lastReleasedRoute = key
	}
}
// Snapshot returns a copy of the per-route active holder counts, taken from
// SnapshotPressure. A nil tracker yields a nil map.
func (t *FabricRoutePressureTracker) Snapshot() map[string]int {
	pressure := t.SnapshotPressure()
	return pressure.Active
}
// SnapshotPressure returns a point-in-time copy of the tracker's counters.
// Both maps in the result are fresh copies built while holding the lock, so
// the caller may keep or modify them freely. A nil tracker yields the zero
// snapshot.
func (t *FabricRoutePressureTracker) SnapshotPressure() FabricRoutePressureSnapshot {
	var snapshot FabricRoutePressureSnapshot
	if t == nil {
		return snapshot
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	snapshot.Active = make(map[string]int, len(t.active))
	for id, holders := range t.active {
		snapshot.Active[id] = holders
		snapshot.ActiveTotal += holders
	}
	snapshot.MaxActive = make(map[string]int, len(t.maxActive))
	for id, peak := range t.maxActive {
		snapshot.MaxActive[id] = peak
	}
	snapshot.MaxActiveTotal = t.maxActiveTotal
	snapshot.AcquiredTotal = t.acquiredTotal
	snapshot.ReleasedTotal = t.releasedTotal
	snapshot.LastAcquiredRoute = t.lastAcquiredRoute
	snapshot.LastReleasedRoute = t.lastReleasedRoute
	return snapshot
}
// activeTotalLocked sums every count in active. It takes no lock itself; the
// tracker's callers invoke it while already holding the mutex.
func activeTotalLocked(active map[string]int) int {
	var sum int
	for routeID := range active {
		sum += active[routeID]
	}
	return sum
}
@@ -0,0 +1,44 @@
package mesh
import "testing"
// TestFabricRoutePressureTrackerAppliesAndReleasesActiveChannels holds route-a
// twice and route-b once, checks that Apply adds those counts to the routes'
// existing ActiveChannels, then releases everything (one release is called
// twice) and verifies the snapshot's live counts, totals, peaks and
// last-route fields.
func TestFabricRoutePressureTrackerAppliesAndReleasesActiveChannels(t *testing.T) {
	tracker := NewFabricRoutePressureTracker()
	// Acquisition order matters for LastAcquiredRoute: a, a, b.
	releases := []func(){
		tracker.Acquire("route-a"),
		tracker.Acquire("route-a"),
		tracker.Acquire("route-b"),
	}

	var routeSet FabricRouteSet
	routeSet.TargetKind = FabricChannelTargetNode
	routeSet.TargetID = "node-b"
	routeSet.Primary = testFabricRoute("route-a", "node-b", 10, 100, 3, true)
	routeSet.WarmStandby = []FabricRoute{
		testFabricRoute("route-b", "node-b", 10, 100, 0, true),
	}

	withPressure := tracker.Apply(routeSet)
	if got := withPressure.Primary.ActiveChannels; got != 5 {
		t.Fatalf("primary active = %d, want 5", got)
	}
	if got := withPressure.WarmStandby[0].ActiveChannels; got != 1 {
		t.Fatalf("standby active = %d, want 1", got)
	}

	// The first release is invoked twice on purpose; the repeat must not count.
	releases[0]()
	releases[0]()
	releases[1]()
	releases[2]()

	snapshot := tracker.SnapshotPressure()
	if !(len(snapshot.Active) == 0 && snapshot.ActiveTotal == 0) {
		t.Fatalf("snapshot after release = %+v, want inactive", snapshot)
	}
	if !(snapshot.AcquiredTotal == 3 && snapshot.ReleasedTotal == 3) {
		t.Fatalf("snapshot totals = %+v, want acquired/released 3", snapshot)
	}
	if !(snapshot.MaxActive["route-a"] == 2 && snapshot.MaxActive["route-b"] == 1 && snapshot.MaxActiveTotal == 3) {
		t.Fatalf("snapshot max = %+v", snapshot)
	}
	if !(snapshot.LastAcquiredRoute == "route-b" && snapshot.LastReleasedRoute == "route-b") {
		t.Fatalf("snapshot last routes = %+v", snapshot)
	}
}
@@ -12,8 +12,9 @@ import (
func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
@@ -83,8 +84,9 @@ func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
@@ -131,8 +133,9 @@ func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
func TestFabricSessionPeerManagerReopensClosedPump(t *testing.T) {
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
@@ -40,73 +40,22 @@ type FabricTransportTarget struct {
ErrorBuffer int
}
func FabricTransportForTarget(target FabricTransportTarget, websocket *WebSocketFabricTransport, quicTransport *QUICFabricTransport) (FabricTransport, FabricTransportTarget, error) {
func FabricTransportForTarget(target FabricTransportTarget, quicTransport *QUICFabricTransport) (FabricTransport, FabricTransportTarget, error) {
transportLabel := strings.ToLower(strings.TrimSpace(target.Transport))
endpoint := strings.TrimSpace(target.Endpoint)
if strings.HasPrefix(strings.ToLower(endpoint), "quic://") {
transportLabel = "quic"
if transportLabel == "" {
transportLabel = "quic"
}
target.Endpoint = strings.TrimPrefix(endpoint, "quic://")
}
switch transportLabel {
case "quic", "direct_quic", "udp_quic", "quic_udp":
case "quic", "direct_quic", "udp_quic", "quic_udp", "lan_quic", "reverse_quic", "relay_quic", "ice_quic":
if quicTransport == nil {
quicTransport = NewQUICFabricTransport(nil)
}
return quicTransport, target, nil
case "", "websocket", "ws", "wss", "direct_http", "direct_https", "direct_tcp_tls":
if websocket == nil {
websocket = NewWebSocketFabricTransport(nil)
}
return websocket, target, nil
default:
return nil, target, fmt.Errorf("unsupported fabric transport %q", target.Transport)
return nil, target, fmt.Errorf("unsupported fabric transport %q: quic is required", target.Transport)
}
}
// WebSocketFabricTransport opens fabric transport sessions through a
// FabricSessionPeerManager.
type WebSocketFabricTransport struct {
// Manager supplies the sessions returned by Connect. Connect creates one on
// demand when it is nil.
Manager *FabricSessionPeerManager
}
// NewWebSocketFabricTransport wraps manager in a transport. A nil manager is
// replaced with a freshly constructed FabricSessionPeerManager.
func NewWebSocketFabricTransport(manager *FabricSessionPeerManager) *WebSocketFabricTransport {
	transport := &WebSocketFabricTransport{Manager: manager}
	if transport.Manager == nil {
		transport.Manager = NewFabricSessionPeerManager()
	}
	return transport
}
// Connect asks the peer manager for a session to target, translating the
// transport target into the manager's peer target: the endpoint becomes the
// base URL, and the token, header, timeout and payload limit become dial
// options while the buffer sizes become pump options. A missing manager is
// created and stored on the transport first.
func (t *WebSocketFabricTransport) Connect(ctx context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	peers := t.Manager
	if peers == nil {
		peers = NewFabricSessionPeerManager()
		t.Manager = peers
	}

	dialOptions := FabricSessionDialOptions{
		Token:      target.Token,
		Header:     target.Header,
		Timeout:    target.Timeout,
		MaxPayload: target.MaxPayload,
	}
	pumpOptions := FabricSessionPumpOptions{
		OutboundBuffer: target.OutboundBuffer,
		InboundBuffer:  target.InboundBuffer,
		ErrorBuffer:    target.ErrorBuffer,
	}
	peerTarget := FabricSessionPeerTarget{
		PeerID:  target.PeerID,
		BaseURL: target.Endpoint,
		Options: dialOptions,
		Pump:    pumpOptions,
	}
	return peers.Get(ctx, peerTarget)
}
// Close closes the underlying peer manager and returns its error. It is a
// no-op returning nil on a nil transport or one without a manager.
func (t *WebSocketFabricTransport) Close() error {
	if t != nil && t.Manager != nil {
		return t.Manager.Close()
	}
	return nil
}
// Snapshot returns the peer manager's snapshot. A nil transport, or one
// without a manager, yields a snapshot carrying only the schema version.
func (t *WebSocketFabricTransport) Snapshot() FabricSessionPeerManagerSnapshot {
	if t != nil && t.Manager != nil {
		return t.Manager.Snapshot()
	}
	return FabricSessionPeerManagerSnapshot{SchemaVersion: "rap.fabric_session_peer_manager.v1"}
}
@@ -1,117 +1,27 @@
package mesh
import (
"context"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
func TestWebSocketFabricTransportConnectsAndReusesSession(t *testing.T) {
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
transport := NewWebSocketFabricTransport(nil)
defer transport.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricTransportTarget{
PeerID: "node-a",
Endpoint: server.URL,
Token: "rap_fsn_transport",
Timeout: time.Second,
OutboundBuffer: 4,
InboundBuffer: 4,
ErrorBuffer: 4,
}
first, err := transport.Connect(ctx, target)
if err != nil {
t.Fatalf("first connect: %v", err)
}
second, err := transport.Connect(ctx, target)
if err != nil {
t.Fatalf("second connect: %v", err)
}
if first != second {
t.Fatal("transport did not reuse session")
}
if opened != 1 {
t.Fatalf("opened = %d, want 1", opened)
}
if err := first.Send(ctx, fabricproto.Frame{Type: fabricproto.FramePing, Sequence: 1, Payload: []byte("transport")}); err != nil {
t.Fatalf("send ping: %v", err)
}
select {
case frame := <-first.Frames():
if frame.Type != fabricproto.FramePong || frame.Sequence != 1 || string(frame.Payload) != "transport" {
t.Fatalf("frame = %+v", frame)
func TestFabricTransportRejectsWebSocketTransport(t *testing.T) {
for _, target := range []FabricTransportTarget{
{Transport: "wss", Endpoint: "wss://node-a.example/fabric/session"},
{Transport: "relay", Endpoint: "quic://node-r.example:19443"},
{Transport: "outbound_reverse", Endpoint: "quic://node-b.example:19443"},
} {
_, _, err := FabricTransportForTarget(target, nil)
if err == nil || !strings.Contains(err.Error(), "quic is required") {
t.Fatalf("target = %+v err = %v, want quic-only rejection", target, err)
}
case err := <-first.Errors():
t.Fatalf("session error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
}
// TestWebSocketFabricTransportReopensClosedSession connects, closes the
// returned session, connects again to the same target, and expects a distinct
// session plus two "websocket opened" events on the server.
func TestWebSocketFabricTransportReopensClosedSession(t *testing.T) {
	// The logger runs on the server's handler goroutine while the assertion
	// below runs on the test goroutine. Opens are therefore counted through a
	// buffered channel rather than a shared int, which would be a data race.
	opened := make(chan struct{}, 16)
	server := httptest.NewServer(Server{
		Local:                PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		FabricSessionEnabled: true,
		FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
			if entry.Event != "fabric_session_websocket_opened" {
				return
			}
			select {
			case opened <- struct{}{}:
			default:
				// Never block the handler; the test expects far fewer opens
				// than the buffer holds.
			}
		},
	}.Handler())
	defer server.Close()
	transport := NewWebSocketFabricTransport(nil)
	defer transport.Close()
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	target := FabricTransportTarget{
		PeerID:   "node-a",
		Endpoint: server.URL,
		Token:    "rap_fsn_transport_reopen",
		Timeout:  time.Second,
	}
	first, err := transport.Connect(ctx, target)
	if err != nil {
		t.Fatalf("first connect: %v", err)
	}
	if err := first.Close(); err != nil {
		t.Fatalf("close first session: %v", err)
	}
	second, err := transport.Connect(ctx, target)
	if err != nil {
		t.Fatalf("second connect: %v", err)
	}
	if first == second {
		t.Fatal("transport reused closed session")
	}
	if got := len(opened); got != 2 {
		t.Fatalf("opened = %d, want 2", got)
	}
}
func TestFabricTransportForTargetSelectsQUICByScheme(t *testing.T) {
transport, target, err := FabricTransportForTarget(FabricTransportTarget{
Endpoint: "quic://127.0.0.1:4433",
}, nil, nil)
}, nil)
if err != nil {
t.Fatalf("select transport: %v", err)
}
@@ -123,15 +33,12 @@ func TestFabricTransportForTargetSelectsQUICByScheme(t *testing.T) {
}
}
func TestFabricTransportForTargetSelectsWebSocketByDefault(t *testing.T) {
transport, target, err := FabricTransportForTarget(FabricTransportTarget{
func TestFabricTransportForTargetRejectsNonQUICByDefault(t *testing.T) {
_, target, err := FabricTransportForTarget(FabricTransportTarget{
Endpoint: "https://node.example",
}, nil, nil)
if err != nil {
t.Fatalf("select transport: %v", err)
}
if _, ok := transport.(*WebSocketFabricTransport); !ok {
t.Fatalf("transport = %T, want websocket", transport)
}, nil)
if err == nil {
t.Fatal("non-QUIC target unexpectedly selected a transport")
}
if target.Endpoint != "https://node.example" {
t.Fatalf("endpoint = %q", target.Endpoint)
@@ -1,42 +0,0 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// HTTPPeerTransport sends synthetic mesh envelopes to explicitly configured
// peer endpoints. It is intentionally narrow: production forwarding remains
// disabled and only SyntheticRuntime messages use this transport.
type HTTPPeerTransport struct {
// PeerURLs maps a peer node ID to that peer's base URL. SendSynthetic
// looks the next hop up here by its exact node ID.
PeerURLs map[string]string
// HTTPClient, when non-nil, replaces the HTTP client of the per-call mesh
// client created by SendSynthetic.
HTTPClient *http.Client
}
// NewHTTPPeerTransport builds a transport from a node-ID to base-URL map.
// Node IDs are whitespace-trimmed; URLs are trimmed and stripped of trailing
// slashes. Entries whose ID or URL ends up empty are dropped. The input map is
// not retained.
func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport {
	transport := &HTTPPeerTransport{PeerURLs: make(map[string]string, len(peerURLs))}
	for rawID, rawURL := range peerURLs {
		id := strings.TrimSpace(rawID)
		endpoint := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if id == "" || endpoint == "" {
			continue
		}
		transport.PeerURLs[id] = endpoint
	}
	return transport
}
// SendSynthetic delivers envelope to the peer registered under nextNodeID and
// returns that peer's reply. It returns ErrSyntheticPeerUnavailable when the
// transport is nil or no non-empty URL is configured for the node. A client is
// created per call; the transport's HTTPClient, when set, overrides the
// client's own.
func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var peerURL string
	if t != nil {
		peerURL = strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
	}
	if peerURL == "" {
		return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
	}
	// A non-empty URL implies a non-nil transport, so t is safe to read below.
	peer := NewClient(peerURL)
	if override := t.HTTPClient; override != nil {
		peer.HTTPClient = override
	}
	return peer.SendSynthetic(ctx, envelope)
}
@@ -1,130 +0,0 @@
package mesh
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestHTTPPeerTransportDirectSyntheticProbe sends a probe from node-a straight
// to node-b over live HTTP test servers and checks that a probe ack comes back
// reporting the two-node path.
func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) {
	source := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
	defer source.Close()
	target := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
	defer target.Close()

	directRoute := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"})
	allRoutes := []SyntheticRoute{directRoute}
	// Only the source needs to know where its next hop lives.
	source.Runtime = newLiveRuntime(source.Local, allRoutes, map[string]string{"node-b": target.URL})
	target.Runtime = newLiveRuntime(target.Local, allRoutes, map[string]string{})

	ack, err := source.Runtime.SendProbe(context.Background(), directRoute.RouteID, SyntheticChannelFabricControl, "probe-live-direct")
	if err != nil {
		t.Fatalf("send live direct probe: %v", err)
	}
	if ack.MessageType != SyntheticMessageProbeAck {
		t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
	}
	wantPath := []string{"node-a", "node-b"}
	if gotPath := decodeAckPayload(t, ack).Path; !sameStrings(gotPath, wantPath) {
		t.Fatalf("path = %v, want %v", gotPath, wantPath)
	}
}
// TestHTTPPeerTransportSingleRelaySyntheticProbe sends a probe from node-a to
// node-b through relay node-r over live HTTP test servers and checks that the
// ack reports the three-node path.
func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) {
	source := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
	defer source.Close()
	relay := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
	defer relay.Close()
	target := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
	defer target.Close()

	relayRoute := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"})
	allRoutes := []SyntheticRoute{relayRoute}
	// Each node is told only about the hop that follows it.
	source.Runtime = newLiveRuntime(source.Local, allRoutes, map[string]string{"node-r": relay.URL})
	relay.Runtime = newLiveRuntime(relay.Local, allRoutes, map[string]string{"node-b": target.URL})
	target.Runtime = newLiveRuntime(target.Local, allRoutes, map[string]string{})

	ack, err := source.Runtime.SendProbe(context.Background(), relayRoute.RouteID, SyntheticChannelFabricControl, "probe-live-relay")
	if err != nil {
		t.Fatalf("send live relay probe: %v", err)
	}
	if ack.MessageType != SyntheticMessageProbeAck {
		t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
	}
	wantPath := []string{"node-a", "node-r", "node-b"}
	if gotPath := decodeAckPayload(t, ack).Path; !sameStrings(gotPath, wantPath) {
		t.Fatalf("path = %v, want %v", gotPath, wantPath)
	}
}
// TestHTTPPeerTransportMissingPeer checks that sending to a node with no
// configured URL reports ErrSyntheticPeerUnavailable.
func TestHTTPPeerTransportMissingPeer(t *testing.T) {
	emptyTransport := NewHTTPPeerTransport(map[string]string{})
	if _, err := emptyTransport.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{}); !errors.Is(err, ErrSyntheticPeerUnavailable) {
		t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
	}
}
// liveSyntheticNode is a test fixture pairing a mesh identity with an HTTP
// test server that serves that node's mesh handler.
type liveSyntheticNode struct {
// Local is the identity the node's server answers as.
Local PeerIdentity
// Runtime is read by the server on each request; tests assign it after the
// node has been created.
Runtime *SyntheticRuntime
// URL is the base URL of the node's test server.
URL string
// server is the underlying HTTP test server, shut down by Close.
server *httptest.Server
}
// newLiveSyntheticNode starts an HTTP test server for local and returns the
// fixture describing it. The caller is responsible for calling Close.
func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode {
	t.Helper()
	live := &liveSyntheticNode{Local: local}
	serve := func(w http.ResponseWriter, r *http.Request) {
		// The mesh server is assembled per request so that a Runtime assigned
		// after the test server started is still picked up.
		mesh := Server{Local: live.Local, SyntheticRuntime: live.Runtime}
		mesh.Handler().ServeHTTP(w, r)
	}
	live.server = httptest.NewServer(http.HandlerFunc(serve))
	live.URL = live.server.URL
	return live
}
// Close shuts down the node's test server, if one was started.
func (n *liveSyntheticNode) Close() {
	if n.server == nil {
		return
	}
	n.server.Close()
}
// newLiveRuntime builds an enabled synthetic runtime for local that knows
// routes and reaches the given peers (node ID to base URL) over an
// HTTPPeerTransport.
func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime {
	cfg := SyntheticRuntimeConfig{
		Enabled: true,
		Local:   local,
		Routes:  routes,
	}
	cfg.Transport = NewHTTPPeerTransport(peers)
	return NewSyntheticRuntime(cfg)
}
// liveSyntheticRoute builds a cluster-1 route over hops that allows only the
// fabric-control channel and expires one hour from now. The first hop is the
// source and the last hop the destination, so hops must not be empty.
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]
	expiry := time.Now().UTC().Add(time.Hour)

	route := SyntheticRoute{
		RouteID:           routeID,
		ClusterID:         "cluster-1",
		SourceNodeID:      source,
		DestinationNodeID: destination,
		Hops:              hops,
		AllowedChannels:   []string{SyntheticChannelFabricControl},
		MaxTTL:            8,
		MaxHops:           8,
		ExpiresAt:         expiry,
	}
	route.RouteVersion = "route-v1"
	route.PolicyVersion = "policy-v1"
	route.PeerDirectoryVersion = "peers-v1"
	return route
}
// sameStrings reports whether left and right have the same length and hold
// equal strings at every index. A nil slice and an empty slice compare equal.
func sameStrings(left, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for idx, want := range right {
		if left[idx] != want {
			return false
		}
	}
	return true
}
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"sort"
"strings"
"time"
@@ -53,9 +54,11 @@ type PeerCacheEntry struct {
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestRegion string `json:"best_region,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
BestScoreReasons []string `json:"best_score_reasons,omitempty"`
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
@@ -132,9 +135,11 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.BestReachability = scored[0].Candidate.Reachability
entry.BestConnectivity = scored[0].Candidate.ConnectivityMode
entry.BestNATType = scored[0].Candidate.NATType
entry.BestRegion = scored[0].Candidate.Region
entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...)
entry.BestCandidateScore = scored[0].Score
entry.BestScoreReasons = append([]string{}, scored[0].Reasons...)
entry.BestPeerCertSHA256 = candidatePeerCertSHA256(scored[0].Candidate)
entry.bestScore = scored[0].Score
if strings.TrimSpace(scored[0].Candidate.Address) != "" {
entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address)
@@ -188,6 +193,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
if lease.PeerNodeID != cfg.Local.NodeID {
entry := peerCacheEntry(entries, lease.PeerNodeID)
useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry)
localRelay := lease.RelayNodeID == cfg.Local.NodeID
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
@@ -195,12 +201,21 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
if useLeaseEndpoint {
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control")
if localRelay {
entry.BestTransport = "reverse_quic"
} else {
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_quic")
}
entry.BestReachability = "relay"
entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required")
entry.Endpoint = entry.RelayEndpoint
entry.BestCandidateID = lease.LeaseID
entry.BestCandidateAddr = entry.RelayEndpoint
if !localRelay {
entry.Endpoint = entry.RelayEndpoint
entry.BestCandidateID = lease.LeaseID
entry.BestCandidateAddr = entry.RelayEndpoint
entry.BestPeerCertSHA256 = rendezvousLeasePeerCertSHA256(lease)
} else if strings.TrimSpace(entry.Endpoint) == "" {
entry.Endpoint = firstNonEmpty(entry.BestCandidateAddr, entry.RelayEndpoint)
}
entry.bestScore = maxInt(entry.bestScore, 500)
}
}
@@ -262,6 +277,20 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
}}
}
// rendezvousLeasePeerCertSHA256 reads a certificate fingerprint out of a
// rendezvous lease's JSON metadata. The "peer_cert_sha256" and
// "tls_cert_sha256" values are whitespace-trimmed and handed to firstNonEmpty
// in that order. Absent or undecodable metadata yields the empty string.
func rendezvousLeasePeerCertSHA256(lease PeerRendezvousLease) string {
	raw := lease.Metadata
	if len(raw) == 0 {
		return ""
	}
	var fingerprints struct {
		PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
		TLSCertSHA256  string `json:"tls_cert_sha256,omitempty"`
	}
	if json.Unmarshal(raw, &fingerprints) != nil {
		return ""
	}
	peerCert := strings.TrimSpace(fingerprints.PeerCertSHA256)
	tlsCert := strings.TrimSpace(fingerprints.TLSCertSHA256)
	return firstNonEmpty(peerCert, tlsCert)
}
func (c *PeerCache) Snapshot() PeerCacheSnapshot {
if c == nil {
return PeerCacheSnapshot{}
@@ -10,15 +10,15 @@ func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) {
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-a": "http://node-a:19000",
"node-r": "http://node-r:19000",
"node-c": "http://node-c:19000",
"node-a": "quic://node-a:19443",
"node-r": "quic://node-r:19443",
"node-c": "quic://node-c:19443",
},
Routes: []SyntheticRoute{
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"}),
},
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10},
{NodeID: "node-seed", Endpoint: "quic://seed.example.test:19443", Transport: "direct_quic", Priority: 10},
},
WarmPeerLimit: 2,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
@@ -42,7 +42,7 @@ func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) {
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r"}),
},
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1},
{NodeID: "node-seed", Endpoint: "quic://seed.example.test:19443", Transport: "direct_quic", ConnectivityMode: "direct", Priority: 1},
},
WarmPeerLimit: 3,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
@@ -68,7 +68,7 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay.example.test",
Reachability: "relay",
ConnectivityMode: "relay_required",
@@ -77,8 +77,8 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -119,10 +119,10 @@ func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
LastVerifiedAt: &now,
},
{
EndpointID: "node-b-wss",
EndpointID: "node-b-ice",
NodeID: "node-b",
Transport: "wss",
Address: "https://node-b.example.test:443",
Transport: "ice_quic",
Address: "quic://node-b.example.test:19444",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -148,10 +148,10 @@ func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
if !ok {
t.Fatal("node-b missing from cache")
}
if entry.BestCandidateID != "node-b-wss" || entry.Endpoint != "https://node-b.example.test:443" {
if entry.BestCandidateID != "node-b-ice" || entry.Endpoint != "quic://node-b.example.test:19444" {
t.Fatalf("peer cache did not apply endpoint observations: %+v", entry)
}
if !containsString(entry.BestScoreReasons, "transport:wss") {
if !containsString(entry.BestScoreReasons, "transport:ice_quic") {
t.Fatalf("peer cache did not expose score reasons: %+v", entry.BestScoreReasons)
}
}
@@ -161,15 +161,15 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "https://node-b.public.example.test:443",
"node-b": "quic://node-b.public.example.test:19443",
},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "https://node-b.public.example.test:443",
Transport: "direct_quic",
Address: "quic://node-b.public.example.test:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -179,8 +179,8 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
{
EndpointID: "node-b-corp-lan",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "http://10.24.10.20:19001",
Transport: "lan_quic",
Address: "quic://10.24.10.20:19443",
Reachability: "private",
NATType: "none",
ConnectivityMode: "direct",
@@ -199,7 +199,7 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
if !ok {
t.Fatal("node-b missing from peer cache")
}
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" {
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "quic://10.24.10.20:19443" {
t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry)
}
}
@@ -29,6 +29,7 @@ type PeerConnectionIntentPlanConfig struct {
PeerCache PeerCacheSnapshot
RecoveryPlan PeerRecoveryPlan
RendezvousLeases []PeerRendezvousLease
PreferredRegion string
Now time.Time
}
@@ -62,12 +63,14 @@ type PeerConnectionIntent struct {
Reachability string `json:"reachability,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
NATType string `json:"nat_type,omitempty"`
Region string `json:"region,omitempty"`
PolicyTags []string `json:"policy_tags,omitempty"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
@@ -94,33 +97,35 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
}
entry := entryByNode[candidate.NodeID]
intent := PeerConnectionIntent{
NodeID: candidate.NodeID,
Action: connectionIntentAction(candidate),
Reason: candidate.Reason,
Endpoint: candidate.Endpoint,
ConnectionState: candidate.ConnectionState,
Transport: firstNonEmpty(candidate.BestTransport, entry.BestTransport),
Reachability: entry.BestReachability,
ConnectivityMode: entry.BestConnectivity,
NATType: entry.BestNATType,
PolicyTags: append([]string{}, entry.BestPolicyTags...),
BestCandidateID: firstNonEmpty(candidate.BestCandidateID, entry.BestCandidateID),
RendezvousLeaseID: entry.RendezvousLeaseID,
RelayNodeID: entry.RelayNodeID,
RelayEndpoint: entry.RelayEndpoint,
RelayCandidate: entry.RelayControl,
ControlPlaneOnly: entry.RelayControl,
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
Priority: candidate.Priority,
GeneratedAt: now,
NodeID: candidate.NodeID,
Action: connectionIntentAction(candidate),
Reason: candidate.Reason,
Endpoint: candidate.Endpoint,
ConnectionState: candidate.ConnectionState,
Transport: firstNonEmpty(candidate.BestTransport, entry.BestTransport),
Reachability: entry.BestReachability,
ConnectivityMode: entry.BestConnectivity,
NATType: entry.BestNATType,
Region: entry.BestRegion,
PolicyTags: append([]string{}, entry.BestPolicyTags...),
BestCandidateID: firstNonEmpty(candidate.BestCandidateID, entry.BestCandidateID),
BestPeerCertSHA256: entry.BestPeerCertSHA256,
RendezvousLeaseID: entry.RendezvousLeaseID,
RelayNodeID: entry.RelayNodeID,
RelayEndpoint: entry.RelayEndpoint,
RelayCandidate: entry.RelayControl,
ControlPlaneOnly: entry.RelayControl,
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
Priority: candidate.Priority,
GeneratedAt: now,
}
mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent)
mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent, cfg.PreferredRegion)
intent.TransportMode = mode
intent.RequiresRendezvous = requiresRendezvous
intent.DirectCandidate = directCandidate
if intent.RequiresRendezvous {
if lease, ok := rendezvousLeaseForPeer(cfg.RendezvousLeases, intent.NodeID, now); ok {
applyRendezvousLease(&intent, lease)
applyRendezvousLease(&intent, lease, cfg.PeerCache.LocalNodeID)
}
}
intents = append(intents, intent)
@@ -185,10 +190,12 @@ func connectionIntentAction(candidate PeerRecoveryCandidate) string {
}
}
func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
func classifyPeerTransport(intent PeerConnectionIntent, preferredRegion string) (string, bool, bool) {
transport := strings.ToLower(strings.TrimSpace(intent.Transport))
connectivity := strings.ToLower(strings.TrimSpace(intent.ConnectivityMode))
reachability := strings.ToLower(strings.TrimSpace(intent.Reachability))
region := strings.TrimSpace(intent.Region)
preferredRegion = strings.TrimSpace(preferredRegion)
tags := lowerStringSet(intent.PolicyTags)
if strings.Contains(transport, "relay") || connectivity == "relay_required" || reachability == "relay" {
@@ -201,6 +208,9 @@ func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
return PeerTransportModeCorporateLAN, false, true
}
if tags["private-lan"] || reachability == "private" || endpointHasPrivateHost(intent.Endpoint) {
if preferredRegion != "" && region != "" && !strings.EqualFold(region, preferredRegion) {
return PeerTransportModeRelayRequired, true, false
}
return PeerTransportModePrivateLAN, false, true
}
if strings.Contains(transport, "direct") || reachability == "public" || connectivity == "direct" {
@@ -246,9 +256,16 @@ func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now
return candidates[0], true
}
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) {
intent.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
intent.Transport = firstNonEmpty(lease.Transport, "relay_control")
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease, localNodeID string) {
localRelay := strings.TrimSpace(lease.RelayNodeID) == strings.TrimSpace(localNodeID)
if !localRelay {
intent.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
}
if localRelay {
intent.Transport = "reverse_quic"
} else {
intent.Transport = firstNonEmpty(lease.Transport, "relay_quic")
}
intent.TransportMode = PeerTransportModeRelayControl
intent.RequiresRendezvous = false
intent.RendezvousResolved = true
@@ -256,17 +273,33 @@ func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLeas
intent.RelayCandidate = true
intent.RendezvousLeaseID = lease.LeaseID
intent.RelayNodeID = lease.RelayNodeID
intent.RelayEndpoint = intent.Endpoint
intent.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
intent.ControlPlaneOnly = true
if certSHA256 := rendezvousLeasePeerCertSHA256(lease); certSHA256 != "" && !localRelay {
intent.BestPeerCertSHA256 = certSHA256
}
if lease.ConnectivityMode != "" {
intent.ConnectivityMode = lease.ConnectivityMode
}
}
// endpointHasPrivateHost reports whether rawEndpoint resolves, via
// endpointHostAddr, to an IP address that is private, loopback, or
// link-local unicast. It returns false when no address can be extracted.
func endpointHasPrivateHost(rawEndpoint string) bool {
	if hostAddr, parsed := endpointHostAddr(rawEndpoint); parsed {
		switch {
		case hostAddr.IsPrivate(), hostAddr.IsLoopback(), hostAddr.IsLinkLocalUnicast():
			return true
		}
	}
	return false
}
// endpointHasUnspecifiedHost reports whether rawEndpoint resolves, via
// endpointHostAddr, to the unspecified address (for example 0.0.0.0 or ::).
// It returns false when no address can be extracted.
func endpointHasUnspecifiedHost(rawEndpoint string) bool {
	hostAddr, parsed := endpointHostAddr(rawEndpoint)
	if !parsed {
		return false
	}
	return hostAddr.IsUnspecified()
}
func endpointHostAddr(rawEndpoint string) (netip.Addr, bool) {
rawEndpoint = strings.TrimSpace(rawEndpoint)
if rawEndpoint == "" {
return false
return netip.Addr{}, false
}
host := rawEndpoint
if parsed, err := url.Parse(rawEndpoint); err == nil && parsed.Host != "" {
@@ -277,9 +310,9 @@ func endpointHasPrivateHost(rawEndpoint string) bool {
}
addr, err := netip.ParseAddr(strings.Trim(host, "[]"))
if err != nil {
return false
return netip.Addr{}, false
}
return addr.IsPrivate() || addr.IsLoopback() || addr.IsLinkLocalUnicast()
return addr, true
}
func lowerStringSet(values []string) map[string]bool {
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"testing"
"time"
)
@@ -11,8 +12,8 @@ func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
BestTransport: "direct_tcp_tls",
Endpoint: "quic://10.24.10.20:19443",
BestTransport: "lan_quic",
BestReachability: "private",
BestConnectivity: "direct",
BestPolicyTags: []string{"corp-lan", "same-site"},
@@ -23,7 +24,7 @@ func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
Endpoint: "quic://10.24.10.20:19443",
ConnectionState: PeerConnectionReady,
Reason: "maintain_ready",
Priority: 100,
@@ -48,15 +49,15 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
BestTransport: "direct_tcp_tls",
Endpoint: "quic://node-b.example.test:19443",
BestTransport: "reverse_quic",
BestReachability: "outbound_only",
BestConnectivity: "outbound_only",
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
BestTransport: "relay",
BestTransport: "relay_quic",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
@@ -66,7 +67,7 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
Endpoint: "quic://node-b.example.test:19443",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 90,
@@ -91,6 +92,42 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
}
}
func TestPeerConnectionIntentsRequireRendezvousForRemotePrivateRegion(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PreferredRegion: "ifcm",
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "quic://192.168.200.61:19132",
BestTransport: "direct_quic",
BestReachability: "private",
BestConnectivity: "private_lan",
BestRegion: "docker-test",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{{
NodeID: "node-b",
Endpoint: "quic://192.168.200.61:19132",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 100,
}},
},
Now: now,
})
if plan.IntentCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 1 {
t.Fatalf("unexpected remote private plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.DirectCandidate || !intent.RequiresRendezvous || intent.TransportMode != PeerTransportModeRelayRequired {
t.Fatalf("unexpected remote private intent: %+v", intent)
}
}
func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
@@ -120,13 +157,14 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
Metadata: peerConnectionIntentLeaseMetadata(t, "abc123"),
},
},
Now: now,
@@ -137,9 +175,10 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
intent.Endpoint != "http://node-r:19000" ||
intent.Endpoint != "quic://node-r:19443" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
intent.BestPeerCertSHA256 != "abc123" ||
!intent.RelayCandidate ||
!intent.RendezvousResolved ||
intent.RequiresRendezvous {
@@ -176,8 +215,8 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
LeaseID: "lease-expired-preferred",
PeerNodeID: "node-b",
RelayNodeID: "node-r-old",
RelayEndpoint: "http://node-r-old:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r-old:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 1,
ControlPlaneOnly: true,
@@ -188,8 +227,8 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
LeaseID: "lease-active-reselected",
PeerNodeID: "node-b",
RelayNodeID: "node-r-new",
RelayEndpoint: "http://node-r-new:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r-new:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 20,
ControlPlaneOnly: true,
@@ -206,20 +245,29 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
intent := plan.Intents[0]
if intent.RendezvousLeaseID != "lease-active-reselected" ||
intent.RelayNodeID != "node-r-new" ||
intent.Endpoint != "http://node-r-new:19000" {
intent.Endpoint != "quic://node-r-new:19443" {
t.Fatalf("expired lease was not skipped: %+v", intent)
}
}
// peerConnectionIntentLeaseMetadata builds the JSON metadata blob for a
// rendezvous lease fixture: an object with a single "peer_cert_sha256" key
// holding certSHA256. A marshal failure aborts the calling test.
func peerConnectionIntentLeaseMetadata(t *testing.T, certSHA256 string) json.RawMessage {
	t.Helper()
	metadata := map[string]string{"peer_cert_sha256": certSHA256}
	encoded, marshalErr := json.Marshal(metadata)
	if marshalErr != nil {
		t.Fatalf("marshal metadata: %v", marshalErr)
	}
	return json.RawMessage(encoded)
}
func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) {
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"},
{NodeID: "node-b", Endpoint: "quic://192.168.10.20:19443"},
}},
RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://192.168.10.20:19001",
Endpoint: "quic://192.168.10.20:19443",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_peer",
Priority: 10,
@@ -2,6 +2,7 @@ package mesh
import (
"context"
"fmt"
"net/http"
"strings"
"sync"
@@ -25,6 +26,8 @@ type PeerConnectionManagerConfig struct {
Tracker *PeerConnectionTracker
RendezvousLeases []PeerRendezvousLease
HTTPClient *http.Client
QUICTransport *QUICFabricTransport
PreferredRegion string
ProbeTimeout time.Duration
Now func() time.Time
}
@@ -35,6 +38,8 @@ type PeerConnectionManager struct {
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
quicTransport *QUICFabricTransport
preferredRegion string
probeTimeout time.Duration
now func() time.Time
@@ -101,9 +106,10 @@ type PeerConnectionCandidateProbeResult struct {
}
type peerConnectionProbeTarget struct {
CandidateID string
Endpoint string
Transport string
CandidateID string
Endpoint string
Transport string
PeerCertSHA256 string
}
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
@@ -132,6 +138,8 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
tracker: cfg.Tracker,
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
httpClient: httpClient,
quicTransport: cfg.QUICTransport,
preferredRegion: strings.TrimSpace(cfg.PreferredRegion),
probeTimeout: probeTimeout,
now: now,
}
@@ -155,6 +163,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
PeerCache: peerSnapshot,
RecoveryPlan: recoveryPlan,
RendezvousLeases: rendezvousLeases,
PreferredRegion: m.preferredRegion,
Now: startedAt,
})
entriesByNode := map[string]PeerCacheEntry{}
@@ -215,6 +224,15 @@ func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvou
m.rendezvousLeases = append([]PeerRendezvousLease{}, rendezvousLeases...)
}
// UpdateQUICTransport swaps the QUIC transport held by the manager while
// holding m.mu. Calling it on a nil manager is a no-op.
func (m *PeerConnectionManager) UpdateQUICTransport(transport *QUICFabricTransport) {
	if m != nil {
		m.mu.Lock()
		m.quicTransport = transport
		m.mu.Unlock()
	}
}
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
if m == nil {
return nil, nil
@@ -242,17 +260,18 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
StartedAt: startedAt,
}
peer := PeerCacheEntry{
NodeID: intent.NodeID,
Endpoint: intent.Endpoint,
Warm: true,
WarmReason: intent.Reason,
RecoverySeed: intent.RecoverySeed,
BestCandidateID: intent.BestCandidateID,
BestTransport: intent.Transport,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
NodeID: intent.NodeID,
Endpoint: intent.Endpoint,
Warm: true,
WarmReason: intent.Reason,
RecoverySeed: intent.RecoverySeed,
BestCandidateID: intent.BestCandidateID,
BestTransport: intent.Transport,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
BestPeerCertSHA256: firstNonEmpty(intent.BestPeerCertSHA256, cacheEntry.BestPeerCertSHA256),
}
if intent.RequiresRendezvous {
result.LinkStatus = PeerConnectionProbeDeferred
@@ -282,13 +301,12 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
}
if intent.RelayCandidate && intent.RelayNodeID != "" {
target.NodeID = intent.RelayNodeID
}
target.NodeID = peerConnectionProbeTargetNodeID(intent, m.local.NodeID)
targets := []peerConnectionProbeTarget{{
CandidateID: intent.BestCandidateID,
Endpoint: intent.Endpoint,
Transport: intent.Transport,
CandidateID: intent.BestCandidateID,
Endpoint: intent.Endpoint,
Transport: intent.Transport,
PeerCertSHA256: intent.BestPeerCertSHA256,
}}
if intent.DirectCandidate {
targets = peerConnectionProbeTargets(intent, cacheEntry)
@@ -300,13 +318,14 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
probePeer.BestCandidateAddr = probePeer.Endpoint
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
probePeer.BestPeerCertSHA256 = firstNonEmpty(probeTarget.PeerCertSHA256, probePeer.BestPeerCertSHA256)
if probePeer.Endpoint == "" {
continue
}
candidateStartedAt := normalizedNow(m.now())
m.tracker.BeginProbe(probePeer, candidateStartedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
err := m.probePeerTarget(probeCtx, probePeer, target)
cancel()
completedAt := normalizedNow(m.now())
candidateResult := PeerConnectionCandidateProbeResult{
@@ -354,47 +373,97 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
return result
}
// peerConnectionProbeTargetNodeID picks the node ID a probe should address.
// It returns intent.RelayNodeID only when the intent is a relay candidate
// whose relay node ID is non-blank and differs from localNodeID (both
// compared after trimming whitespace); otherwise it returns intent.NodeID.
func peerConnectionProbeTargetNodeID(intent PeerConnectionIntent, localNodeID string) string {
	if !intent.RelayCandidate {
		return intent.NodeID
	}
	relayID := strings.TrimSpace(intent.RelayNodeID)
	if relayID == "" || relayID == strings.TrimSpace(localNodeID) {
		// The relay is missing or is this node itself, so address the peer.
		return intent.NodeID
	}
	return intent.RelayNodeID
}
// probePeerTarget checks reachability of probePeer by opening and then
// closing a fabric session over QUIC. Endpoints with a legacy scheme, and
// targets that are not recognised as QUIC by transport or endpoint, are
// rejected with "non_quic_probe_rejected". Errors from transport selection,
// Connect, and session Close are returned unchanged.
//
// NOTE(review): m.quicTransport is read here without taking m.mu, while
// UpdateQUICTransport writes it under the lock — confirm callers serialise
// these accesses.
func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer PeerCacheEntry, target PeerIdentity) error {
	endpoint := strings.TrimRight(strings.TrimSpace(probePeer.Endpoint), "/")
	transport := strings.TrimSpace(probePeer.BestTransport)

	// Both rejection paths yield the same error, so they share one guard.
	if hasLegacyEndpointScheme(endpoint) || !peerConnectionTargetIsQUIC(transport, endpoint) {
		return fmt.Errorf("non_quic_probe_rejected")
	}

	dialTarget := FabricTransportTarget{
		EndpointID:     probePeer.BestCandidateID,
		PeerID:         target.NodeID,
		Endpoint:       endpoint,
		Transport:      transport,
		Timeout:        m.probeTimeout,
		PeerCertSHA256: strings.TrimSpace(probePeer.BestPeerCertSHA256),
	}
	carrier, selectedTarget, selectErr := FabricTransportForTarget(dialTarget, m.quicTransport)
	if selectErr != nil {
		return selectErr
	}

	session, connectErr := carrier.Connect(ctx, selectedTarget)
	if connectErr != nil {
		return connectErr
	}
	return session.Close()
}
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
seen := map[string]struct{}{}
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
add := func(candidateID, endpoint, transport string) {
add := func(candidateID, endpoint, transport, peerCertSHA256 string) {
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
if endpoint == "" {
return
}
if endpointHasUnspecifiedHost(endpoint) {
return
}
key := candidateID + "|" + endpoint
if _, ok := seen[key]; ok {
return
}
seen[key] = struct{}{}
out = append(out, peerConnectionProbeTarget{
CandidateID: strings.TrimSpace(candidateID),
Endpoint: endpoint,
Transport: strings.TrimSpace(transport),
CandidateID: strings.TrimSpace(candidateID),
Endpoint: endpoint,
Transport: strings.TrimSpace(transport),
PeerCertSHA256: strings.TrimSpace(peerCertSHA256),
})
}
for _, candidate := range cacheEntry.EndpointCandidates {
if !candidateUsableForDirectProbe(candidate) {
continue
}
add(candidate.EndpointID, candidate.Address, candidate.Transport)
add(candidate.EndpointID, candidate.Address, candidate.Transport, candidatePeerCertSHA256(candidate))
}
add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, cacheEntry.BestPeerCertSHA256)
return out
}
// peerConnectionTargetIsQUIC reports whether a probe target should be dialled
// over QUIC: either isQUICOnlyCandidateTransport accepts the transport, or
// the trimmed, lower-cased endpoint starts with "quic://".
func peerConnectionTargetIsQUIC(transport string, endpoint string) bool {
	if isQUICOnlyCandidateTransport(transport) {
		return true
	}
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	return strings.HasPrefix(normalized, "quic://")
}
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
endpoint := strings.TrimSpace(candidate.Address)
if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
return false
}
if endpointHasUnspecifiedHost(endpoint) {
return false
}
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
if connectivity == "outbound_only" || connectivity == "relay_required" || reachability == "outbound_only" || reachability == "relay" {
return false
}
return transport == "" || strings.Contains(transport, "direct") || transport == "wss" || strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://")
return transport == "" ||
strings.Contains(transport, "direct_quic") ||
transport == "quic" ||
transport == "lan_quic" ||
transport == "ice_quic" ||
strings.HasPrefix(endpoint, "quic://")
}
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
@@ -2,8 +2,8 @@ package mesh
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
)
@@ -11,12 +11,18 @@ import (
func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
@@ -24,19 +30,20 @@ func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: server.URL,
Transport: "direct_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "private",
ConnectivityMode: "direct",
PolicyTags: []string{"corp-lan", "same-site"},
Priority: 1,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
"node-c": {
{
EndpointID: "node-c-relay",
NodeID: "node-c",
Transport: "relay",
Transport: "relay_quic",
Address: "relay://fabric/node-c",
Reachability: "relay",
ConnectivityMode: "relay_required",
@@ -49,10 +56,11 @@ func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
ProbeTimeout: time.Second,
Local: local,
PeerCache: cache,
Tracker: tracker,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
@@ -116,24 +124,31 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
}.Handler())
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
leases := []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: server.URL,
Transport: "relay_control",
RelayEndpoint: "quic://" + server.Addr().String(),
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
}
cache := NewPeerCache(PeerCacheConfig{
@@ -143,7 +158,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay://fabric/node-b",
Reachability: "relay",
ConnectivityMode: "relay_required",
@@ -161,6 +176,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
PeerCache: cache,
Tracker: tracker,
RendezvousLeases: leases,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
@@ -189,15 +205,37 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
}
}
// TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC verifies
// that the probe addresses the peer itself when the local node is the relay,
// and addresses the relay node when the relay is a different node.
func TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC(t *testing.T) {
	const localNodeID = "node-a"
	base := PeerConnectionIntent{
		NodeID:         "node-b",
		RelayCandidate: true,
		Transport:      "reverse_quic",
	}

	// The relay is the local node: the probe must keep targeting the peer.
	viaLocalRelay := base
	viaLocalRelay.RelayNodeID = "node-a"
	if got := peerConnectionProbeTargetNodeID(viaLocalRelay, localNodeID); got != "node-b" {
		t.Fatalf("local relay reverse probe target = %q, want peer node-b", got)
	}

	// The relay is another node: the probe must target that relay.
	viaRemoteRelay := base
	viaRemoteRelay.RelayNodeID = "node-r"
	if got := peerConnectionProbeTargetNodeID(viaRemoteRelay, localNodeID); got != "node-r" {
		t.Fatalf("remote relay probe target = %q, want relay node-r", got)
	}
}
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
@@ -205,8 +243,8 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
{
EndpointID: "node-b-dead",
NodeID: "node-b",
Transport: "direct_http",
Address: "http://127.0.0.1:1",
Transport: "lan_quic",
Address: "quic://127.0.0.1:1",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
@@ -214,11 +252,12 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
{
EndpointID: "node-b-live",
NodeID: "node-b",
Transport: "direct_http",
Address: server.URL,
Transport: "lan_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 2,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
},
@@ -227,11 +266,11 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
ProbeTimeout: 100 * time.Millisecond,
Local: local,
PeerCache: cache,
Tracker: tracker,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: 100 * time.Millisecond,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
@@ -243,7 +282,7 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
t.Fatalf("fallback did not select live candidate: %+v", result)
}
if len(result.CandidateResults) != 2 ||
@@ -252,7 +291,85 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
}
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != "quic://"+server.Addr().String() {
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
}
}
// TestPeerConnectionManagerSkipsUnspecifiedQUICCandidates gives node-b two
// direct QUIC candidates: one whose address is the unspecified IPv6 host
// ("quic://[::]:19131") and one pointing at a QUIC fabric server started by
// this test. It expects a single probe cycle to succeed using only the live
// candidate, with no probe result recorded for the unspecified address.
func TestPeerConnectionManagerSkipsUnspecifiedQUICCandidates(t *testing.T) {
now := time.Date(2026, 5, 17, 6, 0, 0, 0, time.UTC)
// current backs the manager's fake clock below; it starts at now.
current := now
tlsConfig := testQUICTLSConfig(t)
// Port 0 lets the OS pick a free port; the chosen address is read back via server.Addr().
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
// presumably the SHA-256 fingerprint of the server certificate in tlsConfig — confirm against testQUICCertSHA256.
certSHA256 := testQUICCertSHA256(t, tlsConfig)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
// Candidate with an unspecified host and no metadata; the assertions below expect it never to be probed.
EndpointID: "node-b-unspecified-v6",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://[::]:19131",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 1,
},
{
// Candidate pointing at the test server, carrying the certificate hash in its metadata.
EndpointID: "node-b-live",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "public",
ConnectivityMode: "direct",
Priority: 2,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
},
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
// Fake clock: every call advances the reported time by 10ms.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 || cycle.Succeeded != 1 || len(cycle.Results) != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
t.Fatalf("manager did not skip unspecified endpoint: %+v", result)
}
// Exactly one candidate result means the unspecified endpoint was filtered out before any probe attempt.
if len(result.CandidateResults) != 1 || result.CandidateResults[0].CandidateID != "node-b-live" {
t.Fatalf("unspecified endpoint should not be probed: %+v", result.CandidateResults)
}
}
// peerConnectionProbeMetadata builds the JSON metadata blob for a peer endpoint
// candidate: a single-key object mapping "peer_cert_sha256" to certSHA256.
// A marshalling failure aborts the calling test.
func peerConnectionProbeMetadata(t *testing.T, certSHA256 string) json.RawMessage {
	t.Helper()
	metadata := map[string]string{"peer_cert_sha256": certSHA256}
	encoded, marshalErr := json.Marshal(metadata)
	if marshalErr != nil {
		t.Fatalf("marshal probe metadata: %v", marshalErr)
	}
	return json.RawMessage(encoded)
}
@@ -9,7 +9,7 @@ func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"},
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "quic://node-b:19443"},
},
}, now)
@@ -76,12 +76,12 @@ func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
Entries: []PeerCacheEntry{
{
NodeID: "node-c",
Endpoint: "http://relay:19001",
Endpoint: "quic://relay:19443",
Warm: true,
WarmReason: "rendezvous_lease",
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "http://relay:19001",
RelayEndpoint: "quic://relay:19443",
RelayControl: true,
},
},
@@ -121,7 +121,7 @@ func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
return PeerCacheEntry{
NodeID: nodeID,
Endpoint: "http://" + nodeID + ":19001",
Endpoint: "quic://" + nodeID + ":19443",
Warm: warm,
WarmReason: warmReason,
RecoverySeed: recoverySeed,
@@ -2,42 +2,369 @@ package mesh
import (
"context"
"net/http"
"encoding/json"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// ProductionForwardTransport sends a production envelope to the next hop node
// and reports the forwarding result.
type ProductionForwardTransport interface {
// SendProduction forwards envelope to the node identified by nextNodeID.
SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error)
}
type HTTPProductionForwardTransport struct {
PeerURLs map[string]string
HTTPClient *http.Client
type QUICProductionForwardTransport struct {
Targets map[string]FabricTransportTarget
RouteSets map[string]FabricRouteSet
Transport FabricTransport
Router FabricChannelRouter
Timeout time.Duration
Pressure *FabricRoutePressureTracker
Health *FabricRouteHealthTracker
sequence atomic.Uint64
}
func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport {
normalized := make(map[string]string, len(peerURLs))
for nodeID, baseURL := range peerURLs {
// QUICProductionForwardTransportSnapshot is the JSON-serialisable view returned
// by QUICProductionForwardTransport.Snapshot.
type QUICProductionForwardTransportSnapshot struct {
// RoutePressure is the pressure tracker's snapshot (zero value when no tracker is set).
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
// RouteHealth is the health tracker's snapshot (zero value when no tracker is set).
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
}
func NewQUICProductionForwardTransport(targets map[string]FabricTransportTarget, transport *QUICFabricTransport) *QUICProductionForwardTransport {
routeSets := make(map[string]FabricRouteSet, len(targets))
for nodeID, target := range targets {
nodeID = strings.TrimSpace(nodeID)
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
if nodeID != "" && baseURL != "" {
normalized[nodeID] = baseURL
target.Endpoint = strings.TrimRight(strings.TrimSpace(target.Endpoint), "/")
target.Transport = strings.TrimSpace(target.Transport)
if nodeID != "" && target.Endpoint != "" {
target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), nodeID)
routeSets[nodeID] = FabricRouteSetForTransportTargets("", "", nodeID, []FabricTransportTarget{target})
}
}
return &HTTPProductionForwardTransport{PeerURLs: normalized}
if transport == nil {
transport = NewQUICFabricTransport(nil)
}
return NewQUICProductionForwardTransportFromRouteSets(routeSets, transport)
}
func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
if t == nil {
return ProductionForwardResult{}, ErrForwardPeerUnavailable
func NewQUICProductionForwardTransportFromRouteSets(routeSets map[string]FabricRouteSet, transport FabricTransport) *QUICProductionForwardTransport {
normalizedRouteSets := make(map[string]FabricRouteSet, len(routeSets))
targets := make(map[string]FabricTransportTarget, len(routeSets))
for nodeID, routeSet := range routeSets {
nodeID = strings.TrimSpace(nodeID)
if nodeID == "" {
continue
}
normalizedRouteSets[nodeID] = routeSet
if target, err := FabricTransportTargetForRoute(routeSet.Primary); err == nil {
targets[nodeID] = target
}
}
baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
if baseURL == "" {
return ProductionForwardResult{}, ErrForwardPeerUnavailable
if transport == nil {
transport = NewQUICFabricTransport(nil)
}
client := NewClient(baseURL)
if t.HTTPClient != nil {
client.HTTPClient = t.HTTPClient
return &QUICProductionForwardTransport{
Targets: targets,
RouteSets: normalizedRouteSets,
Transport: transport,
Router: NewFabricChannelRouter(FabricChannelRouterConfig{
MaxAckLatencyMs: 2000,
MinRerouteInterval: 50 * time.Millisecond,
}),
Timeout: 30 * time.Second,
Pressure: NewFabricRoutePressureTracker(),
Health: NewFabricRouteHealthTracker(30 * time.Second),
}
return client.SendProduction(ctx, envelope)
}
// SendProduction forwards envelope to nextNodeID over the fabric transport.
//
// The route set registered for the (trimmed) node ID is used when present;
// otherwise a single-route set is synthesised from the node's entry in Targets.
// ErrForwardPeerUnavailable is returned when the receiver or its Transport is
// nil, or when neither a route set nor a usable target exists for the node.
// The envelope is JSON-encoded and handed to sendProductionWithRouteSet.
func (t *QUICProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	var none ProductionForwardResult
	if t == nil || t.Transport == nil {
		return none, ErrForwardPeerUnavailable
	}

	nextNodeID = strings.TrimSpace(nextNodeID)
	routeSet, known := t.RouteSets[nextNodeID]
	if !known {
		fallback, hasTarget := t.Targets[nextNodeID]
		if !hasTarget || strings.TrimSpace(fallback.Endpoint) == "" {
			return none, ErrForwardPeerUnavailable
		}
		routeSet = FabricRouteSetForTransportTargets(envelope.ClusterID, envelope.CurrentHopNodeID, nextNodeID, []FabricTransportTarget{fallback})
	}

	// The sequence counter advances on every call, even when the envelope
	// already carries a message ID and the generated ID goes unused.
	messageID := strings.TrimSpace(envelope.MessageID)
	generatedID := fmt.Sprintf("production-%d", t.sequence.Add(1))
	// Prefer the source node recorded on the route set over the envelope's hop.
	sourceNodeID := firstNonEmpty(productionRouteSetSourceNodeID(routeSet), envelope.CurrentHopNodeID)
	spec := FabricChannelSpec{
		ChannelID:    firstNonEmpty(messageID, generatedID),
		ClusterID:    envelope.ClusterID,
		SourceNodeID: sourceNodeID,
		TargetKind:   FabricChannelTargetNode,
		TargetID:     nextNodeID,
		TrafficClass: FabricServiceChannelReliable,
		CreatedAt:    time.Now().UTC(),
	}

	encoded, marshalErr := json.Marshal(envelope)
	if marshalErr != nil {
		return none, marshalErr
	}

	forwarded, sendErr := t.sendProductionWithRouteSet(ctx, spec, routeSet, encoded)
	if sendErr != nil {
		return none, sendErr
	}
	return forwarded, nil
}
// productionRouteSetSourceNodeID returns the first non-blank (trimmed)
// SourceNodeID among the routes yielded by flattenFabricRouteSet, or "" when
// no route carries one.
func productionRouteSetSourceNodeID(routeSet FabricRouteSet) string {
	routes := flattenFabricRouteSet(routeSet)
	for _, candidate := range routes {
		trimmed := strings.TrimSpace(candidate.SourceNodeID)
		if trimmed == "" {
			continue
		}
		return trimmed
	}
	return ""
}
// sendProductionWithRouteSet delivers payload over one route of routeSet and
// returns the decoded peer response.
//
// A channel is opened through the router and the route the channel currently
// points at is tried. A connect or response failure is recorded in the health
// tracker and reported to the router; when the router answers with a reroute
// event the loop retries with the updated channel, otherwise the failure is
// returned to the caller.
//
// Cancellation or expiry of ctx ends the loop. A failure observed while ctx is
// already done is attributed to the caller rather than to the route, so it is
// neither recorded in the health tracker nor used to trigger a reroute. The
// per-attempt response timeout (t.Timeout, 30s when unset) is independent of
// ctx and still counts as a route failure.
func (t *QUICProductionForwardTransport) sendProductionWithRouteSet(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (ProductionForwardResult, error) {
	router := t.Router
	// NOTE(review): a zero MaxRoutePressure is treated as "router not
	// configured"; presumably NewFabricChannelRouter fills in a non-zero
	// default — confirm.
	if router.Config.MaxRoutePressure == 0 {
		router = NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 2000, MinRerouteInterval: 50 * time.Millisecond})
	}
	routeSet = t.routeSetForScheduling(routeSet)
	channel, _, err := router.OpenChannel(spec, routeSet, time.Now().UTC())
	if err != nil {
		return ProductionForwardResult{}, err
	}
	timeout := t.Timeout
	if timeout <= 0 {
		timeout = 30 * time.Second
	}
	for {
		// Do not start (or retry) an attempt for a caller that has given up.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ProductionForwardResult{}, ctxErr
		}
		// Re-apply health and pressure so each attempt sees current route state.
		routeSet = t.routeSetForScheduling(routeSet)
		route, ok := findFabricRoute(routeSet, channel.RouteID)
		if !ok {
			return ProductionForwardResult{}, ErrFabricRouteNotFound
		}
		target, err := FabricTransportTargetForRoute(route)
		if err != nil {
			return ProductionForwardResult{}, err
		}
		target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), spec.TargetID)
		target.MaxPayload = fabricproto.DefaultMaxPayload
		releaseRoute := t.acquireProductionRoute(route.RouteID)
		session, err := t.Transport.Connect(ctx, target)
		if err != nil {
			releaseRoute()
			if ctx.Err() != nil {
				// Caller cancellation, not a route fault: leave route health untouched.
				return ProductionForwardResult{}, err
			}
			t.markProductionRouteFailure(route.RouteID, err)
			updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID:  spec.ChannelID,
				RouteID:    route.RouteID,
				Failed:     true,
				Reason:     "connect_failed",
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			channel = updated
			if event.Type == FabricChannelRouteEventReroute {
				continue
			}
			if rerouteErr != nil {
				return ProductionForwardResult{}, rerouteErr
			}
			return ProductionForwardResult{}, err
		}
		response, ackMs, err := t.sendProductionOnSession(ctx, session, payload, timeout)
		// Best-effort close: the exchange outcome is already captured in err.
		_ = session.Close()
		releaseRoute()
		if err == nil {
			t.markProductionRouteSuccess(route.RouteID)
			_, _, _ = router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID:    spec.ChannelID,
				RouteID:      route.RouteID,
				AckLatencyMs: ackMs,
				BytesSent:    uint64(len(payload)),
				FramesSent:   1,
				BytesRecv:    uint64(len(response.Payload)),
				FramesRecv:   1,
				ObservedAt:   time.Now().UTC(),
			}, time.Now().UTC())
			return decodeQUICProductionForwardResponse(response.Payload)
		}
		if ctx.Err() != nil {
			// Caller cancellation, not a route fault: leave route health untouched.
			return ProductionForwardResult{}, err
		}
		t.markProductionRouteFailure(route.RouteID, err)
		updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
			ChannelID:  spec.ChannelID,
			RouteID:    route.RouteID,
			Failed:     true,
			Reason:     "response_failed",
			ObservedAt: time.Now().UTC(),
		}, time.Now().UTC())
		channel = updated
		if event.Type == FabricChannelRouteEventReroute {
			continue
		}
		if rerouteErr != nil {
			return ProductionForwardResult{}, rerouteErr
		}
		return ProductionForwardResult{}, err
	}
}
// routeSetWithActiveChannels returns routeSet as transformed by the pressure
// tracker's Apply, or routeSet unchanged when the receiver or its tracker is nil.
func (t *QUICProductionForwardTransport) routeSetWithActiveChannels(routeSet FabricRouteSet) FabricRouteSet {
	if t != nil && t.Pressure != nil {
		return t.Pressure.Apply(routeSet)
	}
	return routeSet
}
// routeSetForScheduling prepares routeSet for route selection: the health
// tracker (when set) is applied first using the current UTC time, then the
// pressure tracker via routeSetWithActiveChannels.
func (t *QUICProductionForwardTransport) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
	scheduled := routeSet
	if t != nil {
		if health := t.Health; health != nil {
			scheduled = health.Apply(scheduled, time.Now().UTC())
		}
	}
	return t.routeSetWithActiveChannels(scheduled)
}
// acquireProductionRoute registers an in-flight use of routeID with the
// pressure tracker and returns the matching release function. Without a
// tracker (or receiver) the returned function does nothing.
func (t *QUICProductionForwardTransport) acquireProductionRoute(routeID string) func() {
	if t != nil && t.Pressure != nil {
		return t.Pressure.Acquire(routeID)
	}
	return func() {}
}
// markProductionRouteFailure records err against routeID in the health tracker,
// stamped with the current UTC time. It does nothing when the receiver, the
// tracker, or err is nil.
func (t *QUICProductionForwardTransport) markProductionRouteFailure(routeID string, err error) {
	if t != nil && t.Health != nil && err != nil {
		t.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
	}
}
// markProductionRouteSuccess reports a successful exchange on routeID to the
// health tracker; it does nothing when the receiver or the tracker is nil.
func (t *QUICProductionForwardTransport) markProductionRouteSuccess(routeID string) {
	if t != nil && t.Health != nil {
		t.Health.MarkSuccess(routeID)
	}
}
// Snapshot returns the current pressure and health tracker snapshots. Fields
// whose tracker is unset keep their zero value; a nil receiver yields the zero
// snapshot.
func (t *QUICProductionForwardTransport) Snapshot() QUICProductionForwardTransportSnapshot {
	var snapshot QUICProductionForwardTransportSnapshot
	if t == nil {
		return snapshot
	}
	if t.Pressure != nil {
		snapshot.RoutePressure = t.Pressure.SnapshotPressure()
	}
	if t.Health != nil {
		snapshot.RouteHealth = t.Health.Snapshot(time.Now().UTC())
	}
	return snapshot
}
// sendProductionOnSession sends payload as one reliable data frame on the
// production-forward stream and waits for the matching response frame.
//
// It returns the response frame and the time, in milliseconds, between the
// completed send and the arrival of the response. The wait is bounded by
// timeout (when positive) on top of ctx. A closed frame or error channel is
// reported as ErrForwardPeerUnavailable.
func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Context, session FabricTransportSession, payload []byte, timeout time.Duration) (fabricproto.Frame, int64, error) {
// The sequence number is what ties the response frame back to this request.
sequence := t.sequence.Add(1)
if err := session.Send(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
TrafficClass: fabricproto.TrafficClassReliable,
StreamID: ProductionForwardQUICStreamID,
Sequence: sequence,
Payload: payload,
}); err != nil {
return fabricproto.Frame{}, 0, err
}
waitCtx := ctx
if timeout > 0 {
var cancel context.CancelFunc
waitCtx, cancel = context.WithTimeout(ctx, timeout)
defer cancel()
}
// Latency is measured from after the send returned.
started := time.Now()
for {
select {
case <-waitCtx.Done():
return fabricproto.Frame{}, 0, waitCtx.Err()
case err, ok := <-session.Errors():
if !ok {
return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
}
// A nil error value is ignored and the wait continues.
if err != nil {
return fabricproto.Frame{}, 0, err
}
case frame, ok := <-session.Frames():
if !ok {
return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
}
// Skip frames that are not the response to this request.
if frame.Type != fabricproto.FrameData || frame.StreamID != ProductionForwardQUICStreamID || frame.Sequence != sequence {
continue
}
return frame, time.Since(started).Milliseconds(), nil
}
}
}
// decodeQUICProductionForwardResponse parses a JSON response payload from the
// peer. A malformed payload returns the unmarshal error; a non-blank Error
// field is returned wrapped in ErrForwardPeerUnavailable; otherwise the
// embedded Result is returned.
func decodeQUICProductionForwardResponse(payload []byte) (ProductionForwardResult, error) {
	var decoded quicProductionForwardResponse
	unmarshalErr := json.Unmarshal(payload, &decoded)
	switch {
	case unmarshalErr != nil:
		return ProductionForwardResult{}, unmarshalErr
	case strings.TrimSpace(decoded.Error) != "":
		return ProductionForwardResult{}, fmt.Errorf("%w: %s", ErrForwardPeerUnavailable, decoded.Error)
	default:
		return decoded.Result, nil
	}
}
// FabricRouteSetForTransportTargets builds a node-targeted route set with one
// single-hop route per target that has a non-empty endpoint.
//
// Endpoints are trimmed of surrounding whitespace and trailing slashes. A
// route's ID is the target's trimmed EndpointID, or "<peerID>-quic-<index>"
// when that is empty, where index is the target's position in targets. The
// first usable target becomes Primary and the rest become WarmStandby, in
// input order. With no usable target only TargetKind and TargetID are set.
func FabricRouteSetForTransportTargets(clusterID string, sourceNodeID string, targetNodeID string, targets []FabricTransportTarget) FabricRouteSet {
	result := FabricRouteSet{TargetKind: FabricChannelTargetNode, TargetID: strings.TrimSpace(targetNodeID)}
	candidates := make([]FabricRoute, 0, len(targets))
	for position, target := range targets {
		target.Endpoint = strings.TrimRight(strings.TrimSpace(target.Endpoint), "/")
		if strings.TrimSpace(target.Endpoint) == "" {
			continue
		}
		// Fall back to the requested node when the target names no peer.
		peerID := firstNonEmpty(strings.TrimSpace(target.PeerID), strings.TrimSpace(targetNodeID))
		endpointID := strings.TrimSpace(target.EndpointID)
		routeID := endpointID
		if routeID == "" {
			routeID = fmt.Sprintf("%s-quic-%d", peerID, position)
		}
		hop := FabricRouteHop{
			NodeID:         peerID,
			Mode:           fabricRouteModeForTransportTarget(target),
			EndpointID:     endpointID,
			Address:        target.Endpoint,
			PeerCertSHA256: strings.TrimSpace(target.PeerCertSHA256),
		}
		candidates = append(candidates, FabricRoute{
			RouteID:           routeID,
			ClusterID:         strings.TrimSpace(clusterID),
			SourceNodeID:      strings.TrimSpace(sourceNodeID),
			DestinationNodeID: peerID,
			Hops:              []FabricRouteHop{hop},
			BaseLatencyMs:     routeLatencyForIndex(position),
			Capacity:          100,
			ActiveChannels:    0,
			Healthy:           true,
			LastUpdatedAt:     time.Now().UTC(),
		})
	}
	if len(candidates) == 0 {
		return result
	}
	result.Primary = candidates[0]
	if standby := candidates[1:]; len(standby) > 0 {
		result.WarmStandby = append(result.WarmStandby, standby...)
	}
	return result
}
// fabricRouteModeForTransportTarget maps the target's Transport string
// (trimmed, case-insensitive) to the LAN, reverse, relay, or ICE route mode
// whose string value it equals. Any other value yields FabricRouteDirect.
func fabricRouteModeForTransportTarget(target FabricTransportTarget) FabricRouteMode {
	normalized := strings.ToLower(strings.TrimSpace(target.Transport))
	for _, mode := range []FabricRouteMode{FabricRouteLAN, FabricRouteReverse, FabricRouteRelay, FabricRouteICE} {
		if normalized == string(mode) {
			return mode
		}
	}
	return FabricRouteDirect
}
// routeLatencyForIndex returns the base latency value assigned to the route at
// the given position: 10 for the first route (and for any non-positive index),
// plus one for each position after it.
func routeLatencyForIndex(index int) int {
	const baseLatencyMs = 10
	if index > 0 {
		return baseLatencyMs + index
	}
	return baseLatencyMs
}
@@ -0,0 +1,339 @@
package mesh
import (
"context"
"encoding/json"
"sync"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestQUICProductionForwardTransportReroutesOnConnectFailure checks that when
// connecting to the first route fails, the send is retried on the second route
// and succeeds, with each endpoint dialled exactly once.
func TestQUICProductionForwardTransportReroutesOnConnectFailure(t *testing.T) {
transport := newFakeProductionForwardFabricTransport()
// The first route refuses connections; only the second one can answer.
transport.failConnect["quic://dead.example.test:19443"] = true
transport.results["quic://fast.example.test:19443"] = ProductionForwardResult{
Delivered: true,
MessageID: "message-1",
RouteID: "route-1",
}
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
}),
}, transport)
forward.Timeout = time.Second
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
if err != nil {
t.Fatalf("send production: %v", err)
}
if !result.Delivered || result.MessageID != "message-1" {
t.Fatalf("result = %+v", result)
}
if got := transport.connectCount("quic://dead.example.test:19443"); got != 1 {
t.Fatalf("dead connect count = %d, want 1", got)
}
if got := transport.connectCount("quic://fast.example.test:19443"); got != 1 {
t.Fatalf("fast connect count = %d, want 1", got)
}
// Both attempts must have acquired and released their route.
snapshot := forward.Snapshot()
if snapshot.RoutePressure.AcquiredTotal != 2 || snapshot.RoutePressure.ReleasedTotal != 2 || snapshot.RoutePressure.MaxActiveTotal == 0 {
t.Fatalf("route pressure snapshot = %+v", snapshot)
}
}
// TestQUICProductionForwardTransportQuarantinesFailedRoute checks that a route
// whose connect failed is not dialled again on the next send: across two sends
// the failing endpoint is tried once and the healthy endpoint twice.
func TestQUICProductionForwardTransportQuarantinesFailedRoute(t *testing.T) {
transport := newFakeProductionForwardFabricTransport()
transport.failConnect["quic://dead.example.test:19443"] = true
transport.results["quic://fast.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
}),
}, transport)
forward.Timeout = time.Second
// Two sends in a row: the second must go straight to the healthy route.
for i := 0; i < 2; i++ {
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
if err != nil {
t.Fatalf("send production #%d: %v", i+1, err)
}
if !result.Delivered {
t.Fatalf("result #%d = %+v", i+1, result)
}
}
if got := transport.connectCount("quic://dead.example.test:19443"); got != 1 {
t.Fatalf("dead connect count = %d, want quarantine after first failure", got)
}
if got := transport.connectCount("quic://fast.example.test:19443"); got != 2 {
t.Fatalf("fast connect count = %d, want both sends on healthy route", got)
}
// The health snapshot must list the failed route with exactly one failure.
snapshot := forward.Snapshot()
if snapshot.RouteHealth.Quarantined["dead"].Failures != 1 {
t.Fatalf("route health snapshot = %+v, want dead route quarantined", snapshot.RouteHealth)
}
}
// TestFabricRouteHealthTrackerExpiresQuarantine checks that a route marked as
// failed is reported unhealthy and degraded while inside the tracker's
// one-second window, and is restored once that window has passed.
func TestFabricRouteHealthTrackerExpiresQuarantine(t *testing.T) {
routeSet := FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
})
tracker := NewFabricRouteHealthTracker(time.Second)
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
tracker.MarkFailure("dead", "connect failed", now)
// 500ms after the failure: still inside the one-second window.
applied := tracker.Apply(routeSet, now.Add(500*time.Millisecond))
if applied.Primary.Healthy || !applied.Primary.Degraded {
t.Fatalf("primary after quarantine = %+v, want unhealthy degraded route", applied.Primary)
}
if len(tracker.Snapshot(now.Add(500*time.Millisecond)).Quarantined) != 1 {
t.Fatalf("route health snapshot = %+v, want one quarantined route", tracker.Snapshot(now.Add(500*time.Millisecond)))
}
// 2s after the failure: the window has elapsed.
applied = tracker.Apply(routeSet, now.Add(2*time.Second))
if !applied.Primary.Healthy || applied.Primary.Degraded {
t.Fatalf("primary after ttl = %+v, want route restored", applied.Primary)
}
if snapshot := tracker.Snapshot(now.Add(2 * time.Second)); len(snapshot.Quarantined) != 0 {
t.Fatalf("route health snapshot after ttl = %+v, want empty quarantine", snapshot)
}
}
// TestQUICProductionForwardTransportReroutesOnResponseTimeout checks that when
// the first route answers later than the transport's response timeout, the
// send is retried on the second route and succeeds, with each endpoint dialled
// exactly once.
func TestQUICProductionForwardTransportReroutesOnResponseTimeout(t *testing.T) {
transport := newFakeProductionForwardFabricTransport()
// The slow route replies after 100ms, well past the 10ms timeout set below.
transport.delays["quic://slow.example.test:19443"] = 100 * time.Millisecond
transport.results["quic://slow.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
transport.results["quic://fast.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
{EndpointID: "slow", PeerID: "node-b", Endpoint: "quic://slow.example.test:19443", Transport: "quic"},
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
}),
}, transport)
forward.Timeout = 10 * time.Millisecond
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
if err != nil {
t.Fatalf("send production: %v", err)
}
if !result.Delivered || result.MessageID != "message-1" {
t.Fatalf("result = %+v", result)
}
if got := transport.connectCount("quic://slow.example.test:19443"); got != 1 {
t.Fatalf("slow connect count = %d, want 1", got)
}
if got := transport.connectCount("quic://fast.example.test:19443"); got != 1 {
t.Fatalf("fast connect count = %d, want 1", got)
}
}
// TestQUICProductionForwardTransportSchedulesWithRouteSetSourceForForwardedEnvelope
// checks that a send succeeds when the envelope's CurrentHopNodeID ("node-c")
// differs from the source node recorded on the route set ("node-b").
func TestQUICProductionForwardTransportSchedulesWithRouteSetSourceForForwardedEnvelope(t *testing.T) {
transport := newFakeProductionForwardFabricTransport()
transport.results["quic://node-c.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-forwarded"}
// The route set is built with node-b as its source node.
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
"node-c": FabricRouteSetForTransportTargets("cluster-a", "node-b", "node-c", []FabricTransportTarget{
{EndpointID: "node-c-direct", PeerID: "node-c", Endpoint: "quic://node-c.example.test:19443", Transport: "quic"},
}),
}, transport)
forward.Timeout = time.Second
// The envelope names node-c as the current hop, not the route set's source.
envelope := testProductionForwardEnvelope("message-forwarded")
envelope.ClusterID = "cluster-a"
envelope.SourceNodeID = "node-a"
envelope.DestinationNodeID = "node-c"
envelope.CurrentHopNodeID = "node-c"
envelope.NextHopNodeID = "node-c"
result, err := forward.SendProduction(context.Background(), "node-c", envelope)
if err != nil {
t.Fatalf("send production: %v", err)
}
if !result.Delivered || result.MessageID != "message-forwarded" {
t.Fatalf("result = %+v", result)
}
if got := transport.connectCount("quic://node-c.example.test:19443"); got != 1 {
t.Fatalf("connect count = %d, want 1", got)
}
}
// TestQUICProductionForwardTransportSpreadsConcurrentChannelsByActivePressure
// checks that while one send is still in flight on route-a, a second
// concurrent send is placed on route-b instead of sharing route-a.
func TestQUICProductionForwardTransportSpreadsConcurrentChannelsByActivePressure(t *testing.T) {
transport := newFakeProductionForwardFabricTransport()
// route-a answers after 80ms, keeping the first send in flight long enough
// for the second send to be scheduled.
transport.delays["quic://route-a.example.test:19443"] = 80 * time.Millisecond
transport.results["quic://route-a.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
transport.results["quic://route-b.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-2"}
routeSet := FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
{EndpointID: "route-a", PeerID: "node-b", Endpoint: "quic://route-a.example.test:19443", Transport: "quic"},
{EndpointID: "route-b", PeerID: "node-b", Endpoint: "quic://route-b.example.test:19443", Transport: "quic"},
})
routeSet.Primary.Capacity = 100
routeSet.WarmStandby[0].Capacity = 100
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{"node-b": routeSet}, transport)
forward.Timeout = time.Second
firstDone := make(chan error, 1)
go func() {
_, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
firstDone <- err
}()
// Wait until the first send has connected to route-a before starting the second.
transport.waitForConnect(t, "quic://route-a.example.test:19443", 1)
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-2"))
if err != nil {
t.Fatalf("second send production: %v", err)
}
if !result.Delivered || result.MessageID != "message-2" {
t.Fatalf("second result = %+v", result)
}
if got := transport.connectCount("quic://route-b.example.test:19443"); got != 1 {
t.Fatalf("route-b connect count = %d, want 1", got)
}
if err := <-firstDone; err != nil {
t.Fatalf("first send production: %v", err)
}
// Each route must have peaked at one active channel.
snapshot := forward.Snapshot()
if snapshot.RoutePressure.MaxActive["route-a"] != 1 || snapshot.RoutePressure.MaxActive["route-b"] != 1 || snapshot.RoutePressure.AcquiredTotal != 2 {
t.Fatalf("route pressure snapshot = %+v", snapshot)
}
}
// fakeProductionForwardFabricTransport is an in-memory transport for tests.
// Behaviour is scripted per endpoint string: whether Connect fails, how long
// the session waits before replying, and which result it replies with. Every
// Connect call is counted per endpoint. mu guards all four maps.
type fakeProductionForwardFabricTransport struct {
	mu          sync.Mutex
	failConnect map[string]bool
	delays      map[string]time.Duration
	results     map[string]ProductionForwardResult
	connects    map[string]int
}

// newFakeProductionForwardFabricTransport returns a fake transport with all of
// its scripting maps allocated and empty.
func newFakeProductionForwardFabricTransport() *fakeProductionForwardFabricTransport {
	fake := new(fakeProductionForwardFabricTransport)
	fake.failConnect = make(map[string]bool)
	fake.delays = make(map[string]time.Duration)
	fake.results = make(map[string]ProductionForwardResult)
	fake.connects = make(map[string]int)
	return fake
}

// Connect counts the attempt for target.Endpoint and then either fails with
// ErrForwardPeerUnavailable (when scripted to) or returns a fake session that
// carries the endpoint's scripted delay and result.
func (t *fakeProductionForwardFabricTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	key := target.Endpoint

	t.mu.Lock()
	t.connects[key]++
	shouldFail, delay, result := t.failConnect[key], t.delays[key], t.results[key]
	t.mu.Unlock()

	if shouldFail {
		return nil, ErrForwardPeerUnavailable
	}
	session := &fakeProductionForwardFabricSession{
		delay:  delay,
		result: result,
		frames: make(chan fabricproto.Frame, 16),
		errors: make(chan error, 1),
		done:   make(chan struct{}),
	}
	return session, nil
}

// Close is a no-op; the fake holds no resources.
func (t *fakeProductionForwardFabricTransport) Close() error {
	return nil
}

// connectCount reports how many times Connect was called for endpoint.
func (t *fakeProductionForwardFabricTransport) connectCount(endpoint string) int {
	t.mu.Lock()
	attempts := t.connects[endpoint]
	t.mu.Unlock()
	return attempts
}

// waitForConnect polls once per millisecond until endpoint has been connected
// to at least count times, failing the test after one second.
func (t *fakeProductionForwardFabricTransport) waitForConnect(tb testing.TB, endpoint string, count int) {
	tb.Helper()
	for deadline := time.Now().Add(time.Second); ; time.Sleep(time.Millisecond) {
		got := t.connectCount(endpoint)
		if got >= count {
			return
		}
		if time.Now().After(deadline) {
			tb.Fatalf("timed out waiting for %s connect count %d, got %d", endpoint, count, got)
		}
	}
}
// fakeProductionForwardFabricSession is the session handed out by the fake
// transport. Each data frame sent on it is answered, after the scripted delay,
// with a data frame that echoes the request's stream ID, sequence, and traffic
// class and carries the scripted result as JSON.
type fakeProductionForwardFabricSession struct {
	delay  time.Duration
	result ProductionForwardResult
	frames chan fabricproto.Frame
	errors chan error
	done   chan struct{} // closed by Close; stops pending replies
	once   sync.Once     // makes Close safe to call more than once
}

// Send schedules the scripted reply for a data frame; other frame types are
// accepted and ignored. The reply is produced on a separate goroutine so Send
// returns immediately. An error is returned only if the scripted result cannot
// be encoded.
func (s *fakeProductionForwardFabricSession) Send(_ context.Context, frame fabricproto.Frame) error {
	if frame.Type != fabricproto.FrameData {
		return nil
	}
	responsePayload, err := json.Marshal(quicProductionForwardResponse{Result: s.result})
	if err != nil {
		// Surface the failure instead of replying with an empty payload.
		return err
	}
	go func() {
		if s.delay > 0 {
			time.Sleep(s.delay)
		}
		// Drop the reply if the session was closed in the meantime, so the
		// goroutine cannot block on a channel nobody reads.
		select {
		case <-s.done:
		case s.frames <- fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: frame.TrafficClass,
			StreamID:     frame.StreamID,
			Sequence:     frame.Sequence,
			Payload:      responsePayload,
		}:
		}
	}()
	return nil
}

// Frames returns the channel on which reply frames are delivered.
func (s *fakeProductionForwardFabricSession) Frames() <-chan fabricproto.Frame {
	return s.frames
}

// Errors returns the session's error channel; the fake never writes to it.
func (s *fakeProductionForwardFabricSession) Errors() <-chan error {
	return s.errors
}

// Close marks the session closed. It is idempotent and always returns nil.
func (s *fakeProductionForwardFabricSession) Close() error {
	s.once.Do(func() {
		close(s.done)
	})
	return nil
}

// Closed reports whether Close has been called.
func (s *fakeProductionForwardFabricSession) Closed() bool {
	select {
	case <-s.done:
		return true
	default:
		return false
	}
}
// testProductionForwardEnvelope returns a fabric-control envelope for tests:
// cluster-a, route-1, node-a to node-b with node-a as the current hop, TTL 8,
// created now (UTC) and expiring one minute later.
func testProductionForwardEnvelope(messageID string) ProductionEnvelope {
	createdAt := time.Now().UTC()
	envelope := ProductionEnvelope{
		FabricProtocolVersion: ProtocolVersion,
		MessageID:             messageID,
		RouteID:               "route-1",
		ClusterID:             "cluster-a",
		ChannelClass:          ProductionChannelFabricControl,
		MessageType:           ProductionMessageFabricControl,
		TTL:                   8,
		CreatedAt:             createdAt,
	}
	envelope.SourceNodeID, envelope.CurrentHopNodeID = "node-a", "node-a"
	envelope.DestinationNodeID, envelope.NextHopNodeID = "node-b", "node-b"
	envelope.ExpiresAt = createdAt.Add(time.Minute)
	return envelope
}
@@ -106,6 +106,9 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
}
if hasLegacyEndpointScheme(endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint")
}
}
for nodeID, candidates := range cfg.PeerEndpointCandidates {
if strings.TrimSpace(nodeID) == "" {
@@ -121,6 +124,9 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
strings.TrimSpace(candidate.ConnectivityMode) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
}
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasLegacyEndpointScheme(candidate.Address) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint candidate")
}
}
}
for endpointID, observation := range cfg.PeerEndpointObservations {
@@ -179,6 +185,14 @@ func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) err
return nil
}
// hasLegacyEndpointScheme reports whether endpoint, ignoring surrounding
// whitespace and letter case, starts with one of the http://, https://, ws://,
// or wss:// schemes.
func hasLegacyEndpointScheme(endpoint string) bool {
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	for _, scheme := range [...]string{"http://", "https://", "ws://", "wss://"} {
		if strings.HasPrefix(normalized, scheme) {
			return true
		}
	}
	return false
}
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
if len(seeds) > 20 {
return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds")
@@ -191,6 +205,9 @@ func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
strings.TrimSpace(seed.Transport) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
}
if !isQUICOnlyCandidateTransport(seed.Transport) || hasLegacyEndpointScheme(seed.Endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC recovery seed")
}
if _, duplicate := seen[key]; duplicate {
return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed")
}
@@ -224,6 +241,9 @@ func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRo
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
}
if !isQUICOnlyCandidateTransport(lease.Transport) || hasLegacyEndpointScheme(lease.RelayEndpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC rendezvous lease")
}
if _, duplicate := seen[lease.LeaseID]; duplicate {
return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease")
}
@@ -18,14 +18,14 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
ConfigVersion: "config-v1",
PeerDirectoryVersion: "peers-v1",
PolicyVersion: "policy-v1",
PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"},
PeerEndpoints: map[string]string{"node-b": "quic://127.0.0.1:19443"},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
@@ -55,8 +55,8 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
RecoverySeeds: []PeerRecoverySeed{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
Transport: "direct_tcp_tls",
Endpoint: "quic://node-b.example.test:19443",
Transport: "direct_quic",
ConnectivityMode: "direct",
Priority: 10,
},
@@ -66,8 +66,8 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
RouteIDs: []string{"route-a-b"},
AllowedChannels: []string{"fabric_control", "route_control"},
@@ -158,8 +158,8 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
{
EndpointID: "node-b-public",
NodeID: "node-c",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
ConnectivityMode: "direct",
},
@@ -174,6 +174,73 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
}
}
// TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint checks that loading a
// config whose peer endpoint uses an https:// address returns an error.
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpoints: map[string]string{"node-b": "https://node-b.example.test:443"},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC peer endpoint error")
}
}
// TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport
// checks that loading a config returns an error when a candidate declares the
// "websocket" transport, even though its address uses the quic:// scheme.
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-websocket",
NodeID: "node-b",
Transport: "websocket",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
ConnectivityMode: "direct",
},
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC peer endpoint candidate error")
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateScheme(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-https",
NodeID: "node-b",
Transport: "direct_quic",
Address: "https://node-b.example.test:443",
Reachability: "public",
ConnectivityMode: "direct",
},
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC peer endpoint candidate error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointObservation(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
@@ -217,7 +284,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"},
{NodeID: "node-b", Endpoint: "", Transport: "direct_quic"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
@@ -228,6 +295,23 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-b", Endpoint: "https://node-b.example.test:443", Transport: "direct_quic"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC recovery seed error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
@@ -238,8 +322,8 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r:19443",
Transport: "relay_quic",
RouteIDs: []string{"route-a-b"},
ExpiresAt: time.Now().UTC().Add(time.Hour),
},
@@ -253,6 +337,36 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "https://node-r.example.test:443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
RouteIDs: []string{"route-a-b"},
AllowedChannels: []string{"fabric_control", "route_control"},
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: time.Now().UTC().Add(-time.Minute),
ExpiresAt: time.Now().UTC().Add(time.Hour),
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC rendezvous lease error")
}
}
func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
t.Helper()
payload, err := json.Marshal(cfg)
@@ -265,3 +379,32 @@ func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
}
return path
}
// liveSyntheticRoute builds a test route in "cluster-1" that follows hops and
// expires one hour from now. The first hop becomes the source node and the
// last hop the destination, so hops must not be empty.
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]

	var route SyntheticRoute
	route.RouteID = routeID
	route.ClusterID = "cluster-1"
	route.SourceNodeID = source
	route.DestinationNodeID = destination
	route.Hops = hops
	route.AllowedChannels = []string{SyntheticChannelFabricControl}
	route.MaxTTL = 8
	route.MaxHops = 8
	route.ExpiresAt = time.Now().UTC().Add(time.Hour)
	route.RouteVersion = "route-v1"
	route.PolicyVersion = "policy-v1"
	route.PeerDirectoryVersion = "peers-v1"
	return route
}
// sameStrings reports whether left and right hold the same strings in the same
// order. A nil slice and an empty slice compare as equal.
func sameStrings(left, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for idx, want := range left {
		if got := right[idx]; got != want {
			return false
		}
	}
	return true
}
+63 -56
View File
@@ -69,22 +69,24 @@ type VPNPacketIngressRoutePreference interface {
}
type Server struct {
Local PeerIdentity
SyntheticRuntime *SyntheticRuntime
ProductionForwardingEnabled bool
ProductionEnvelopeObserver ProductionEnvelopeObserver
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
ProductionForwardTransport ProductionForwardTransport
ProductionForwardLogger ProductionForwardLogger
FabricServiceChannelLogger FabricServiceChannelAccessLogger
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
ProductionRoutes []SyntheticRoute
VPNPacketIngress VPNPacketIngress
BackendProxyBaseURL string
ClusterAuthorityPublicKey string
ServiceChannelIntrospection bool
FabricSessionEnabled bool
FabricSessionLogger FabricSessionEventLogger
Local PeerIdentity
SyntheticRuntime *SyntheticRuntime
ProductionForwardingEnabled bool
ProductionEnvelopeObserver ProductionEnvelopeObserver
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
ProductionForwardTransport ProductionForwardTransport
ProductionForwardLogger ProductionForwardLogger
DisableHTTPDataPlane bool
FabricServiceChannelLogger FabricServiceChannelAccessLogger
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
ProductionRoutes []SyntheticRoute
VPNPacketIngress VPNPacketIngress
BackendProxyBaseURL string
ClusterAuthorityPublicKey string
ServiceChannelIntrospection bool
FabricSessionEnabled bool
FabricSessionWebSocketEnabled bool
FabricSessionLogger FabricSessionEventLogger
}
func (s Server) Handler() http.Handler {
@@ -92,7 +94,7 @@ func (s Server) Handler() http.Handler {
mux.HandleFunc("/mesh/v1/health", s.handleHealth)
mux.HandleFunc("/mesh/v1/forward", s.handleForward)
mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe)
if s.FabricSessionEnabled {
if s.FabricSessionEnabled && s.FabricSessionWebSocketEnabled {
mux.HandleFunc("/mesh/v1/fabric/session/ws", s.handleFabricSessionWebSocket)
}
if s.RemoteWorkspaceFrameSink != nil {
@@ -198,6 +200,7 @@ type FabricSessionEventLogEntry struct {
Event string `json:"event"`
ClusterID string `json:"cluster_id,omitempty"`
NodeID string `json:"node_id,omitempty"`
PeerID string `json:"peer_id,omitempty"`
AcceptedBy string `json:"accepted_by,omitempty"`
SessionID string `json:"session_id,omitempty"`
SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
@@ -2079,16 +2082,12 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusMethodNotAllowed)
return
}
if s.DisableHTTPDataPlane {
http.Error(w, "mesh data-plane forwarding requires QUIC fabric transport", http.StatusGone)
return
}
if !s.ProductionForwardingEnabled {
s.logProductionForward(ProductionForwardLogEntry{
Event: "production_forward_rejected",
ClusterID: s.Local.ClusterID,
LocalNodeID: s.Local.NodeID,
Reason: ErrForwardDisabled.Error(),
StatusCode: http.StatusNotImplemented,
OccurredAt: time.Now().UTC(),
})
http.Error(w, ErrForwardDisabled.Error(), http.StatusNotImplemented)
s.rejectProductionForward(w, ProductionEnvelope{}, ErrForwardDisabled, forwardStatusCode(ErrForwardDisabled))
return
}
var envelope ProductionEnvelope
@@ -2104,54 +2103,57 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
http.Error(w, "invalid production mesh envelope", http.StatusBadRequest)
return
}
if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
result, err := s.ForwardProduction(r.Context(), envelope)
if err != nil {
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
return
}
writeProductionForwardResult(w, result)
}
func (s Server) ForwardProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
if !s.ProductionForwardingEnabled {
return ProductionForwardResult{}, ErrForwardDisabled
}
if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
return ProductionForwardResult{}, err
}
if err := ValidateProductionEnvelopeRouteConfig(s.Local, envelope, s.ProductionRoutes, time.Now().UTC()); err != nil {
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
return
return ProductionForwardResult{}, err
}
s.logProductionForward(productionForwardLogEntry("production_forward_accepted", s.Local, envelope, "", 0))
if s.ProductionEnvelopeObserver != nil {
observation := NewProductionEnvelopeObservation(envelope, time.Now().UTC())
if err := observeProductionEnvelope(r.Context(), s.ProductionEnvelopeObserver, observation); err != nil {
if err := observeProductionEnvelope(ctx, s.ProductionEnvelopeObserver, observation); err != nil {
s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardObservationFailed.Error(), http.StatusInternalServerError))
http.Error(w, ErrForwardObservationFailed.Error(), http.StatusInternalServerError)
return
return ProductionForwardResult{}, ErrForwardObservationFailed
}
}
if envelope.DestinationNodeID == s.Local.NodeID {
if err := deliverProductionEnvelope(r.Context(), s.ProductionEnvelopeDelivery, envelope); err != nil {
if err := deliverProductionEnvelope(ctx, s.ProductionEnvelopeDelivery, envelope); err != nil {
s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardDeliveryFailed.Error(), http.StatusInternalServerError))
http.Error(w, ErrForwardDeliveryFailed.Error(), http.StatusInternalServerError)
return
return ProductionForwardResult{}, ErrForwardDeliveryFailed
}
s.logProductionForward(productionForwardLogEntry("production_forward_delivered", s.Local, envelope, "", http.StatusOK))
writeProductionForwardResult(w, ProductionForwardResult{
return ProductionForwardResult{
Accepted: true,
Delivered: true,
By: s.Local,
MessageID: envelope.MessageID,
RouteID: envelope.RouteID,
})
return
}, nil
}
if envelope.NextHopNodeID == s.Local.NodeID {
s.rejectProductionForward(w, envelope, ErrLoopDetected, forwardStatusCode(ErrLoopDetected))
return
return ProductionForwardResult{}, ErrLoopDetected
}
if len(envelope.RoutePath) == 0 && envelope.NextHopNodeID != envelope.DestinationNodeID {
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
return
return ProductionForwardResult{}, ErrForwardRuntimeUnavailable
}
if s.ProductionForwardTransport == nil {
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
return
return ProductionForwardResult{}, ErrForwardRuntimeUnavailable
}
if envelope.TTL <= 1 {
s.rejectProductionForward(w, envelope, ErrTTLExhausted, forwardStatusCode(ErrTTLExhausted))
return
return ProductionForwardResult{}, ErrTTLExhausted
}
forwarded := envelope
forwarded.CurrentHopNodeID = envelope.NextHopNodeID
@@ -2159,10 +2161,9 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
forwarded.TTL = envelope.TTL - 1
forwarded.HopCount = envelope.HopCount + 1
forwarded.VisitedNodeIDs = append(append([]string{}, envelope.VisitedNodeIDs...), s.Local.NodeID)
result, err := s.ProductionForwardTransport.SendProduction(r.Context(), envelope.NextHopNodeID, forwarded)
result, err := s.ProductionForwardTransport.SendProduction(ctx, envelope.NextHopNodeID, forwarded)
if err != nil {
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
return
return ProductionForwardResult{}, err
}
s.logProductionForward(productionForwardLogEntry("production_forward_forwarded", s.Local, envelope, "", http.StatusOK))
result.Accepted = true
@@ -2171,7 +2172,7 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
result.MessageID = envelope.MessageID
result.RouteID = envelope.RouteID
result.NextNodeID = envelope.NextHopNodeID
writeProductionForwardResult(w, result)
return result, nil
}
func (s Server) rejectProductionForward(w http.ResponseWriter, envelope ProductionEnvelope, err error, statusCode int) {
@@ -2262,6 +2263,10 @@ func (s Server) handleSyntheticProbe(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusMethodNotAllowed)
return
}
if s.DisableHTTPDataPlane {
http.Error(w, "mesh synthetic probes require QUIC fabric transport", http.StatusGone)
return
}
if s.SyntheticRuntime == nil {
http.Error(w, ErrMeshRuntimeDisabled.Error(), http.StatusServiceUnavailable)
return
@@ -2307,17 +2312,19 @@ func syntheticStatusCode(err error) int {
}
func forwardStatusCode(err error) int {
switch err {
case ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected:
switch {
case errors.Is(err, ErrClusterMismatch), errors.Is(err, ErrNodeMismatch), errors.Is(err, ErrUnauthorizedChannel), errors.Is(err, ErrLoopDetected):
return http.StatusForbidden
case ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrRouteIDRequired:
case errors.Is(err, ErrRouteExpired), errors.Is(err, ErrTTLExhausted), errors.Is(err, ErrInvalidRoutePath), errors.Is(err, ErrRouteIDRequired), errors.Is(err, ErrForwardEnvelopeInvalid):
return http.StatusBadRequest
case ErrForwardRuntimeUnavailable:
case errors.Is(err, ErrForwardRuntimeUnavailable), errors.Is(err, ErrForwardDisabled):
return http.StatusNotImplemented
case ErrRouteNotFound:
case errors.Is(err, ErrRouteNotFound):
return http.StatusNotFound
case ErrForwardPeerUnavailable:
case errors.Is(err, ErrForwardPeerUnavailable):
return http.StatusBadGateway
case errors.Is(err, ErrForwardObservationFailed), errors.Is(err, ErrForwardDeliveryFailed):
return http.StatusInternalServerError
default:
return http.StatusBadRequest
}
@@ -23,6 +23,18 @@ import (
"github.com/gorilla/websocket"
)
// testProductionForwardTransport is an in-process forward transport for tests:
// it hands envelopes directly to another Server value rather than sending
// them over the network.
type testProductionForwardTransport struct {
// targets maps a next-hop node ID to the Server that receives its envelopes.
targets map[string]Server
}
// SendProduction forwards envelope to the Server registered for nextNodeID
// (surrounding whitespace ignored) by calling its ForwardProduction method.
// It returns ErrForwardPeerUnavailable when no such target is registered.
func (t testProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	if peer, found := t.targets[strings.TrimSpace(nextNodeID)]; found {
		return peer.ForwardProduction(ctx, envelope)
	}
	return ProductionForwardResult{}, ErrForwardPeerUnavailable
}
func TestMeshHealthAcceptsSameCluster(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
server := httptest.NewServer(Server{Local: local}.Handler())
@@ -92,8 +104,9 @@ func TestFabricSessionWebSocketDisabledByDefault(t *testing.T) {
func TestFabricSessionWebSocketPingPongAndEvents(t *testing.T) {
var events []FabricSessionEventLogEntry
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
events = append(events, entry)
},
@@ -119,8 +132,9 @@ func TestFabricSessionWebSocketPingPongAndEvents(t *testing.T) {
func TestFabricSessionWebSocketOpenStreamDataAck(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -151,8 +165,9 @@ func TestFabricSessionWebSocketOpenStreamDataAck(t *testing.T) {
func TestFabricSessionWebSocketRequiresToken(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -172,9 +187,10 @@ func TestFabricSessionWebSocketRequiresSignedAuthorityWhenConfigured(t *testing.
t.Fatalf("generate key: %v", err)
}
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
}.Handler())
defer server.Close()
@@ -196,9 +212,10 @@ func TestFabricSessionWebSocketAcceptsSignedAuthority(t *testing.T) {
token := "rap_fsn_signedtest"
var events []FabricSessionEventLogEntry
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
events = append(events, entry)
},
@@ -360,23 +377,20 @@ func TestMeshForwardingGateDeliversFabricControlAtDestination(t *testing.T) {
func TestMeshForwardingGateForwardsDirectFabricControlToNextHop(t *testing.T) {
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
var deliveredObservation ProductionEnvelopeObservation
serverC := httptest.NewServer(Server{
serverC := Server{
Local: nodeC,
ProductionForwardingEnabled: true,
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
deliveredObservation = observation
return nil
},
}.Handler())
defer serverC.Close()
}
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
serverB := httptest.NewServer(Server{
Local: nodeB,
ProductionForwardingEnabled: true,
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeC.NodeID: serverC.URL,
}),
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeC.NodeID: serverC}},
}.Handler())
defer serverB.Close()
@@ -414,36 +428,30 @@ func TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath(t *testing.T
var deliveredObservation ProductionEnvelopeObservation
var nodeREvents []ProductionForwardLogEntry
var nodeBEvents []ProductionForwardLogEntry
serverC := httptest.NewServer(Server{
serverC := Server{
Local: nodeC,
ProductionForwardingEnabled: true,
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
deliveredObservation = observation
return nil
},
}.Handler())
defer serverC.Close()
}
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
serverR := httptest.NewServer(Server{
serverR := Server{
Local: nodeR,
ProductionForwardingEnabled: true,
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeC.NodeID: serverC.URL,
}),
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeC.NodeID: serverC}},
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
nodeREvents = append(nodeREvents, entry)
},
}.Handler())
defer serverR.Close()
}
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
serverB := httptest.NewServer(Server{
Local: nodeB,
ProductionForwardingEnabled: true,
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeR.NodeID: serverR.URL,
}),
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeR.NodeID: serverR}},
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
nodeBEvents = append(nodeBEvents, entry)
},
@@ -490,7 +498,7 @@ func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
route := configuredProductionRoute("route-1", []string{"node-a", "node-b", "node-r", nodeC.NodeID})
var deliveredObservation ProductionEnvelopeObservation
serverC := httptest.NewServer(Server{
serverC := Server{
Local: nodeC,
ProductionForwardingEnabled: true,
ProductionRoutes: []SyntheticRoute{route},
@@ -498,28 +506,22 @@ func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
deliveredObservation = observation
return nil
},
}.Handler())
defer serverC.Close()
}
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
serverR := httptest.NewServer(Server{
serverR := Server{
Local: nodeR,
ProductionForwardingEnabled: true,
ProductionRoutes: []SyntheticRoute{route},
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeC.NodeID: serverC.URL,
}),
}.Handler())
defer serverR.Close()
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeC.NodeID: serverC}},
}
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
serverB := httptest.NewServer(Server{
Local: nodeB,
ProductionForwardingEnabled: true,
ProductionRoutes: []SyntheticRoute{route},
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeR.NodeID: serverR.URL,
}),
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeR.NodeID: serverR}},
}.Handler())
defer serverB.Close()
@@ -5016,3 +5018,30 @@ func TestSyntheticEndpointDisabledByDefault(t *testing.T) {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusServiceUnavailable)
}
}
// TestHTTPDataPlaneDisabledRequiresQUIC verifies that, with
// DisableHTTPDataPlane set, both the synthetic probe endpoint and the
// production forward endpoint answer POST requests with 410 Gone.
func TestHTTPDataPlaneDisabledRequiresQUIC(t *testing.T) {
	handler := Server{
		Local:                PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
		SyntheticRuntime:     NewSyntheticRuntime(SyntheticRuntimeConfig{Enabled: true, Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}),
		DisableHTTPDataPlane: true,
	}.Handler()
	server := httptest.NewServer(handler)
	defer server.Close()

	// Synthetic probe endpoint.
	probeResp, probeErr := http.Post(server.URL+"/mesh/v1/synthetic/probe", "application/json", bytes.NewReader([]byte(`{}`)))
	if probeErr != nil {
		t.Fatalf("post synthetic probe: %v", probeErr)
	}
	defer probeResp.Body.Close()
	if got := probeResp.StatusCode; got != http.StatusGone {
		t.Fatalf("synthetic status = %d, want %d", got, http.StatusGone)
	}

	// Production forward endpoint.
	forwardResp, forwardErr := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader([]byte(`{}`)))
	if forwardErr != nil {
		t.Fatalf("post production forward: %v", forwardErr)
	}
	defer forwardResp.Body.Close()
	if got := forwardResp.StatusCode; got != http.StatusGone {
		t.Fatalf("forward status = %d, want %d", got, http.StatusGone)
	}
}
@@ -0,0 +1,268 @@
package mesh
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// QUICSyntheticTransport sends synthetic envelopes to peer nodes over a
// FabricTransport. For each send it opens a short-lived session on a route
// chosen by Router, moving to another route when the router reports a reroute
// after a failure.
type QUICSyntheticTransport struct {
// Targets maps a peer node ID to a single transport target; SendSynthetic
// consults it only when RouteSets has no entry for the peer.
Targets map[string]FabricTransportTarget
// RouteSets maps a peer node ID to the candidate routes for reaching it.
RouteSets map[string]FabricRouteSet
// Transport opens sessions to targets; SendSynthetic fails when it is nil.
Transport FabricTransport
// Router opens the per-send channel and is told about each success or failure.
Router FabricChannelRouter
// Timeout bounds the wait for a response frame; values <= 0 fall back to 10s.
Timeout time.Duration
// Pressure, when non-nil, is applied to route sets before scheduling and is
// acquired around each connection attempt.
Pressure *FabricRoutePressureTracker
// Health, when non-nil, is applied to route sets before scheduling and
// records per-route success and failure.
Health *FabricRouteHealthTracker
// sequence supplies both channel ID suffixes and frame sequence numbers.
sequence atomic.Uint64
}
// QUICSyntheticTransportSnapshot is the JSON-serializable view of a
// QUICSyntheticTransport's route pressure and route health trackers, as
// returned by Snapshot.
type QUICSyntheticTransportSnapshot struct {
// RoutePressure is the pressure tracker's snapshot (zero value when no tracker is set).
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
// RouteHealth is the health tracker's snapshot (zero value when no tracker is set).
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
}
// NewQUICSyntheticTransportFromRouteSets builds a QUICSyntheticTransport from
// per-node route sets.
//
// Node IDs are trimmed and blank IDs are dropped. Each remaining route set is
// stored under its trimmed ID, and a transport target derived from its primary
// route is stored alongside it when that derivation succeeds. A nil transport
// is replaced with NewQUICFabricTransport(nil). The result uses a 10s timeout,
// a router with a 2000ms max ack latency and 50ms minimum reroute interval,
// a fresh pressure tracker, and a health tracker created with 30s.
func NewQUICSyntheticTransportFromRouteSets(routeSets map[string]FabricRouteSet, transport FabricTransport) *QUICSyntheticTransport {
	setsByNode := make(map[string]FabricRouteSet, len(routeSets))
	targetsByNode := make(map[string]FabricTransportTarget, len(routeSets))
	for rawID, set := range routeSets {
		id := strings.TrimSpace(rawID)
		if id == "" {
			continue
		}
		setsByNode[id] = set
		target, err := FabricTransportTargetForRoute(set.Primary)
		if err != nil {
			// The route set is kept even without a usable primary target.
			continue
		}
		targetsByNode[id] = target
	}

	if transport == nil {
		transport = NewQUICFabricTransport(nil)
	}

	routerConfig := FabricChannelRouterConfig{
		MaxAckLatencyMs:    2000,
		MinRerouteInterval: 50 * time.Millisecond,
	}
	return &QUICSyntheticTransport{
		Targets:   targetsByNode,
		RouteSets: setsByNode,
		Transport: transport,
		Router:    NewFabricChannelRouter(routerConfig),
		Timeout:   10 * time.Second,
		Pressure:  NewFabricRoutePressureTracker(),
		Health:    NewFabricRouteHealthTracker(30 * time.Second),
	}
}
// SendSynthetic delivers envelope to nextNodeID and returns the envelope
// decoded from the peer's response.
//
// The peer's configured route set is used when present; otherwise a route set
// is synthesized from the single configured target for that peer. It returns
// ErrSyntheticPeerUnavailable when the receiver or its Transport is nil, or
// when the peer has neither a route set nor a target with a non-blank endpoint.
func (t *QUICSyntheticTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	if t == nil || t.Transport == nil {
		return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
	}

	peerID := strings.TrimSpace(nextNodeID)
	routes, known := t.RouteSets[peerID]
	if !known {
		fallback, hasTarget := t.Targets[peerID]
		if !hasTarget || strings.TrimSpace(fallback.Endpoint) == "" {
			return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
		}
		routes = FabricRouteSetForTransportTargets(envelope.ClusterID, envelope.From.NodeID, peerID, []FabricTransportTarget{fallback})
	}

	// The channel ID is allocated before marshalling, so the sequence counter
	// advances even when the envelope cannot be encoded.
	channelID := fmt.Sprintf("synthetic-%d", t.sequence.Add(1))
	channelSpec := FabricChannelSpec{
		ChannelID:    channelID,
		ClusterID:    envelope.ClusterID,
		SourceNodeID: envelope.From.NodeID,
		TargetKind:   FabricChannelTargetNode,
		TargetID:     peerID,
		TrafficClass: FabricServiceChannelReliable,
		CreatedAt:    time.Now().UTC(),
	}

	encoded, marshalErr := json.Marshal(envelope)
	if marshalErr != nil {
		return SyntheticEnvelope{}, marshalErr
	}
	return t.sendSyntheticWithRouteSet(ctx, channelSpec, routes, encoded)
}
// sendSyntheticWithRouteSet sends payload over the route the router picks for
// spec, retrying on another route for as long as the router answers a failure
// with a reroute event.
//
// Each attempt opens a new session, sends one frame, waits for the matching
// response and closes the session. On success the response payload is decoded
// into a SyntheticEnvelope. When an attempt fails and no reroute happens, the
// router's error is returned if it gave one; otherwise the failure is wrapped
// in ErrSyntheticPeerUnavailable.
func (t *QUICSyntheticTransport) sendSyntheticWithRouteSet(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (SyntheticEnvelope, error) {
router := t.Router
// NOTE(review): a zero MaxRoutePressure is presumably the mark of an
// unconfigured router, hence the rebuild with default settings — confirm.
if router.Config.MaxRoutePressure == 0 {
router = NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 2000, MinRerouteInterval: 50 * time.Millisecond})
}
routeSet = t.routeSetForScheduling(routeSet)
channel, _, err := router.OpenChannel(spec, routeSet, time.Now().UTC())
if err != nil {
return SyntheticEnvelope{}, err
}
timeout := t.Timeout
if timeout <= 0 {
timeout = 10 * time.Second
}
// One iteration per attempt; the loop repeats only after a reroute event.
for {
// Re-apply health and pressure so each attempt sees current tracker state.
routeSet = t.routeSetForScheduling(routeSet)
route, ok := findFabricRoute(routeSet, channel.RouteID)
if !ok {
return SyntheticEnvelope{}, ErrFabricRouteNotFound
}
target, err := FabricTransportTargetForRoute(route)
if err != nil {
return SyntheticEnvelope{}, err
}
// Fall back to the channel's target ID when the route target has no peer ID.
target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), spec.TargetID)
target.MaxPayload = fabricproto.DefaultMaxPayload
releaseRoute := t.acquireSyntheticRoute(route.RouteID)
session, err := t.Transport.Connect(ctx, target)
if err != nil {
// Connect failed: release the route, record the failure and report it to the router.
releaseRoute()
t.markSyntheticRouteFailure(route.RouteID, err)
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: spec.ChannelID,
RouteID: route.RouteID,
Failed: true,
Reason: "connect_failed",
ObservedAt: time.Now().UTC(),
}, time.Now().UTC())
channel = updated
// The reroute event is checked before rerouteErr, so a reroute always retries.
if event.Type == FabricChannelRouteEventReroute {
continue
}
if rerouteErr != nil {
return SyntheticEnvelope{}, rerouteErr
}
return SyntheticEnvelope{}, fmt.Errorf("%w: %v", ErrSyntheticPeerUnavailable, err)
}
response, ackMs, err := t.sendSyntheticOnSession(ctx, session, payload, timeout)
// The session is single-use; its close error is deliberately discarded.
_ = session.Close()
releaseRoute()
if err == nil {
t.markSyntheticRouteSuccess(route.RouteID)
// Report the success to the router; its result is not needed here.
_, _, _ = router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: spec.ChannelID,
RouteID: route.RouteID,
AckLatencyMs: ackMs,
BytesSent: uint64(len(payload)),
FramesSent: 1,
BytesRecv: uint64(len(response.Payload)),
FramesRecv: 1,
ObservedAt: time.Now().UTC(),
}, time.Now().UTC())
return decodeQUICSyntheticForwardResponse(response.Payload)
}
// Send or response wait failed: same handling as a connect failure, with a different reason.
t.markSyntheticRouteFailure(route.RouteID, err)
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: spec.ChannelID,
RouteID: route.RouteID,
Failed: true,
Reason: "response_failed",
ObservedAt: time.Now().UTC(),
}, time.Now().UTC())
channel = updated
if event.Type == FabricChannelRouteEventReroute {
continue
}
if rerouteErr != nil {
return SyntheticEnvelope{}, rerouteErr
}
return SyntheticEnvelope{}, fmt.Errorf("%w: %v", ErrSyntheticPeerUnavailable, err)
}
}
// routeSetForScheduling returns routeSet after applying the health tracker and
// then the pressure tracker, skipping whichever is not configured. A nil
// receiver returns routeSet unchanged.
func (t *QUICSyntheticTransport) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
	if t == nil {
		return routeSet
	}
	adjusted := routeSet
	if t.Health != nil {
		adjusted = t.Health.Apply(adjusted, time.Now().UTC())
	}
	if t.Pressure != nil {
		adjusted = t.Pressure.Apply(adjusted)
	}
	return adjusted
}
// acquireSyntheticRoute acquires routeID on the pressure tracker and returns
// the tracker's release function. Without a tracker (or with a nil receiver)
// it returns a no-op so callers can always call the result.
func (t *QUICSyntheticTransport) acquireSyntheticRoute(routeID string) func() {
	if t != nil && t.Pressure != nil {
		return t.Pressure.Acquire(routeID)
	}
	return func() {}
}
// markSyntheticRouteFailure records err's message as a failure for routeID on
// the health tracker, timestamped with the current UTC time. It does nothing
// when the receiver, the tracker, or err is nil.
func (t *QUICSyntheticTransport) markSyntheticRouteFailure(routeID string, err error) {
	if t != nil && t.Health != nil && err != nil {
		t.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
	}
}
// markSyntheticRouteSuccess records a success for routeID on the health
// tracker. It does nothing when the receiver or the tracker is nil.
func (t *QUICSyntheticTransport) markSyntheticRouteSuccess(routeID string) {
	if t != nil && t.Health != nil {
		t.Health.MarkSuccess(routeID)
	}
}
// Snapshot returns the current route pressure and route health views. A part
// whose tracker is not configured is left at its zero value; a nil receiver
// yields an entirely zero snapshot.
func (t *QUICSyntheticTransport) Snapshot() QUICSyntheticTransportSnapshot {
	var snapshot QUICSyntheticTransportSnapshot
	if t == nil {
		return snapshot
	}
	if t.Pressure != nil {
		snapshot.RoutePressure = t.Pressure.SnapshotPressure()
	}
	if t.Health != nil {
		snapshot.RouteHealth = t.Health.Snapshot(time.Now().UTC())
	}
	return snapshot
}
// sendSyntheticOnSession sends payload as one reliable data frame on the
// synthetic forward stream and waits for the data frame on that stream that
// carries the same sequence number.
//
// It returns the matching frame and the milliseconds elapsed between the send
// completing and the response arriving. Frames that do not match are skipped.
// The wait ends early with the context's error on cancellation (or after
// timeout, when timeout > 0), with a non-nil error read from the session's
// error channel, or with ErrSyntheticPeerUnavailable when either session
// channel is closed.
func (t *QUICSyntheticTransport) sendSyntheticOnSession(ctx context.Context, session FabricTransportSession, payload []byte, timeout time.Duration) (fabricproto.Frame, int64, error) {
	seq := t.sequence.Add(1)
	request := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     SyntheticForwardQUICStreamID,
		Sequence:     seq,
		Payload:      payload,
	}
	// The send itself runs under the caller's context; the timeout only
	// bounds the wait for the response below.
	if sendErr := session.Send(ctx, request); sendErr != nil {
		return fabricproto.Frame{}, 0, sendErr
	}

	waitCtx := ctx
	if timeout > 0 {
		bounded, cancel := context.WithTimeout(ctx, timeout)
		defer cancel()
		waitCtx = bounded
	}

	sentAt := time.Now()
	for {
		select {
		case <-waitCtx.Done():
			return fabricproto.Frame{}, 0, waitCtx.Err()
		case sessionErr, open := <-session.Errors():
			if !open {
				return fabricproto.Frame{}, 0, ErrSyntheticPeerUnavailable
			}
			if sessionErr != nil {
				return fabricproto.Frame{}, 0, sessionErr
			}
		case reply, open := <-session.Frames():
			if !open {
				return fabricproto.Frame{}, 0, ErrSyntheticPeerUnavailable
			}
			isResponse := reply.Type == fabricproto.FrameData &&
				reply.StreamID == SyntheticForwardQUICStreamID &&
				reply.Sequence == seq
			if isResponse {
				return reply, time.Since(sentAt).Milliseconds(), nil
			}
		}
	}
}
// decodeQUICSyntheticForwardResponse parses a JSON synthetic forward response
// and returns the envelope it carries. A response whose error text is not
// blank is returned as an error wrapping ErrSyntheticPeerUnavailable; malformed
// JSON returns the unmarshal error.
func decodeQUICSyntheticForwardResponse(payload []byte) (SyntheticEnvelope, error) {
	var decoded quicSyntheticForwardResponse
	unmarshalErr := json.Unmarshal(payload, &decoded)
	if unmarshalErr != nil {
		return SyntheticEnvelope{}, unmarshalErr
	}
	// Blankness is judged on the trimmed text, but the error carries it untrimmed.
	if remoteFailed := strings.TrimSpace(decoded.Error) != ""; remoteFailed {
		return SyntheticEnvelope{}, fmt.Errorf("%w: %s", ErrSyntheticPeerUnavailable, decoded.Error)
	}
	return decoded.Envelope, nil
}
@@ -0,0 +1,223 @@
package mesh
import (
"context"
"crypto/tls"
"encoding/json"
"sync"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestQUICSyntheticTransportReroutesOnConnectFailure verifies that when
// connecting to the first target fails, the send still succeeds via the second
// target, and that each endpoint is dialled exactly once.
func TestQUICSyntheticTransportReroutesOnConnectFailure(t *testing.T) {
	const (
		deadEndpoint = "quic://dead.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)

	fake := newFakeSyntheticFabricTransport()
	fake.failConnect[deadEndpoint] = true
	fake.responses[fastEndpoint] = testSyntheticAckEnvelope("route-1", 1)

	candidates := []FabricTransportTarget{
		{EndpointID: "dead", PeerID: "node-b", Endpoint: deadEndpoint, Transport: "quic"},
		{EndpointID: "fast", PeerID: "node-b", Endpoint: fastEndpoint, Transport: "quic"},
	}
	routeSets := map[string]FabricRouteSet{
		"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", candidates),
	}
	forward := NewQUICSyntheticTransportFromRouteSets(routeSets, fake)
	forward.Timeout = time.Second

	ack, sendErr := forward.SendSynthetic(context.Background(), "node-b", testSyntheticEnvelope("route-1", 1))
	if sendErr != nil {
		t.Fatalf("send synthetic: %v", sendErr)
	}
	if ack.RouteID != "route-1" || ack.MessageType != SyntheticMessageRouteHealthAck {
		t.Fatalf("ack = %+v", ack)
	}

	if deadCount := fake.connectCount(deadEndpoint); deadCount != 1 {
		t.Fatalf("dead connect count = %d, want 1", deadCount)
	}
	if fastCount := fake.connectCount(fastEndpoint); fastCount != 1 {
		t.Fatalf("fast connect count = %d, want 1", fastCount)
	}
}
func TestQUICFabricServerHandlesSyntheticFrames(t *testing.T) {
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: testQUICTLSConfig(t),
SyntheticForwardHandler: func(_ context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
return testSyntheticAckEnvelope(envelope.RouteID, envelope.Sequence), nil
},
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
session, err := NewQUICFabricTransport(nil).Connect(ctx, FabricTransportTarget{
Endpoint: server.Addr().String(),
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
NextProtos: []string{fabricQUICNextProto},
},
Timeout: time.Second,
InboundBuffer: 4,
ErrorBuffer: 4,
})
if err != nil {
t.Fatalf("connect: %v", err)
}
defer session.Close()
payload, err := json.Marshal(testSyntheticEnvelope("route-1", 7))
if err != nil {
t.Fatalf("marshal envelope: %v", err)
}
if err := session.Send(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
TrafficClass: fabricproto.TrafficClassReliable,
StreamID: SyntheticForwardQUICStreamID,
Sequence: 42,
Payload: payload,
}); err != nil {
t.Fatalf("send synthetic frame: %v", err)
}
select {
case frame := <-session.Frames():
if frame.StreamID != SyntheticForwardQUICStreamID || frame.Sequence != 42 {
t.Fatalf("frame = %+v", frame)
}
ack, err := decodeQUICSyntheticForwardResponse(frame.Payload)
if err != nil {
t.Fatalf("decode response: %v", err)
}
if ack.RouteID != "route-1" || ack.MessageType != SyntheticMessageRouteHealthAck || ack.Sequence != 7 {
t.Fatalf("ack = %+v", ack)
}
case err := <-session.Errors():
t.Fatalf("session error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
}
// fakeSyntheticFabricTransport is an in-memory test double whose Connect
// method never touches the network: per endpoint it either fails or hands
// back a fakeSyntheticFabricSession preloaded with a canned response.
type fakeSyntheticFabricTransport struct {
// mu is held by Connect and connectCount while they access the maps below.
mu sync.Mutex
// failConnect marks endpoints for which Connect returns
// ErrSyntheticPeerUnavailable.
failConnect map[string]bool
// responses holds, per endpoint, the envelope given to sessions that
// Connect creates for that endpoint.
responses map[string]SyntheticEnvelope
// connects counts Connect calls per endpoint, including failed ones.
connects map[string]int
}
// newFakeSyntheticFabricTransport returns a fake transport with all of its
// maps allocated and empty, so tests can populate failConnect and responses
// directly without hitting a nil map.
func newFakeSyntheticFabricTransport() *fakeSyntheticFabricTransport {
	fake := new(fakeSyntheticFabricTransport)
	fake.failConnect = make(map[string]bool)
	fake.responses = make(map[string]SyntheticEnvelope)
	fake.connects = make(map[string]int)
	return fake
}
// Connect records an attempt against target.Endpoint and then either fails
// with ErrSyntheticPeerUnavailable (when the endpoint is flagged in
// failConnect) or returns a new fake session carrying the response configured
// for that endpoint. The attempt is counted before the failure check, so
// failed connects are counted too. The context is ignored.
func (t *fakeSyntheticFabricTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	key := target.Endpoint

	// Snapshot everything needed under the lock, then release it before
	// building the session.
	shouldFail, canned := func() (bool, SyntheticEnvelope) {
		t.mu.Lock()
		defer t.mu.Unlock()
		t.connects[key]++
		return t.failConnect[key], t.responses[key]
	}()

	if shouldFail {
		return nil, ErrSyntheticPeerUnavailable
	}
	return &fakeSyntheticFabricSession{
		response: canned,
		frames:   make(chan fabricproto.Frame, 16),
		errors:   make(chan error, 1),
		done:     make(chan struct{}),
	}, nil
}
// Close is a no-op for the fake transport; it always returns nil.
func (t *fakeSyntheticFabricTransport) Close() error {
return nil
}
// connectCount reports how many times Connect has been called for endpoint.
// An endpoint that was never dialed reports zero.
func (t *fakeSyntheticFabricTransport) connectCount(endpoint string) int {
	t.mu.Lock()
	attempts := t.connects[endpoint]
	t.mu.Unlock()
	return attempts
}
// fakeSyntheticFabricSession is the session half of the fake transport. It
// answers every data frame passed to Send with a frame wrapping response.
type fakeSyntheticFabricSession struct {
// response is the envelope wrapped into every reply produced by Send.
response SyntheticEnvelope
// frames carries the reply frames; it is exposed through Frames.
frames chan fabricproto.Frame
// errors is exposed through Errors; none of the session's methods send on it.
errors chan error
// done is closed by Close and releases reply goroutines still waiting to
// deliver a frame.
done chan struct{}
// once ensures done is closed only a single time.
once sync.Once
}
// Send simulates a peer that replies to every data frame with the session's
// canned response. Frames of any other type are accepted and ignored.
//
// The reply is a data frame that copies the request's traffic class, stream
// ID and sequence, and whose payload is the JSON encoding of a
// quicSyntheticForwardResponse wrapping s.response. It is delivered on the
// frames channel from a separate goroutine so Send itself never blocks; that
// goroutine gives up once the session is closed. The context is ignored.
//
// Send returns an error only when the canned response cannot be encoded; in
// that case no reply is queued.
func (s *fakeSyntheticFabricSession) Send(_ context.Context, frame fabricproto.Frame) error {
	if frame.Type != fabricproto.FrameData {
		return nil
	}

	// Report encoding failures to the caller instead of delivering an empty
	// payload that would only fail later, and more confusingly, at decode time.
	responsePayload, err := json.Marshal(quicSyntheticForwardResponse{Envelope: s.response})
	if err != nil {
		return err
	}

	go func() {
		select {
		case <-s.done:
			// Session closed before the reply could be delivered; drop it.
		case s.frames <- fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: frame.TrafficClass,
			StreamID:     frame.StreamID,
			Sequence:     frame.Sequence,
			Payload:      responsePayload,
		}:
		}
	}()
	return nil
}
// Frames returns the receive-only side of the channel on which replies
// generated by Send are delivered.
func (s *fakeSyntheticFabricSession) Frames() <-chan fabricproto.Frame {
return s.frames
}
// Errors returns the receive-only side of the session's error channel. None
// of the fake session's methods send on it.
func (s *fakeSyntheticFabricSession) Errors() <-chan error {
return s.errors
}
// Close marks the session as closed by closing the done channel. It may be
// called any number of times; only the first call closes the channel. It
// always returns nil.
func (s *fakeSyntheticFabricSession) Close() error {
	markClosed := func() { close(s.done) }
	s.once.Do(markClosed)
	return nil
}
// Closed reports whether Close has been called, determined by a non-blocking
// receive on the done channel.
func (s *fakeSyntheticFabricSession) Closed() bool {
	closed := false
	select {
	case <-s.done:
		closed = true
	default:
		// done is still open: the session has not been closed.
	}
	return closed
}
// testSyntheticEnvelope builds a route-health envelope on the fabric-control
// channel, addressed from node-a to node-b within cluster-a, for the given
// route and sequence. It has already made one hop (node-a is the only visited
// node), carries a TTL of 8, and is stamped with the current UTC time.
func testSyntheticEnvelope(routeID string, sequence uint64) SyntheticEnvelope {
	const clusterID = "cluster-a"

	sender := PeerIdentity{ClusterID: clusterID, NodeID: "node-a"}
	receiver := PeerIdentity{ClusterID: clusterID, NodeID: "node-b"}

	return SyntheticEnvelope{
		ProtocolVersion: ProtocolVersion,
		RouteID:         routeID,
		ClusterID:       clusterID,
		From:            sender,
		To:              receiver,
		Channel:         SyntheticChannelFabricControl,
		MessageType:     SyntheticMessageRouteHealth,
		TTL:             8,
		HopCount:        1,
		Visited:         []string{"node-a"},
		Sequence:        sequence,
		SentAt:          time.Now().UTC(),
	}
}
// testSyntheticAckEnvelope builds the route-health ack matching
// testSyntheticEnvelope(routeID, sequence): it starts from that envelope,
// addresses it from node-b back to node-a, switches the message type to the
// ack, and lists both nodes as visited.
func testSyntheticAckEnvelope(routeID string, sequence uint64) SyntheticEnvelope {
	reply := testSyntheticEnvelope(routeID, sequence)
	reply.MessageType = SyntheticMessageRouteHealthAck
	reply.Visited = []string{"node-a", "node-b"}
	reply.From, reply.To = PeerIdentity{ClusterID: "cluster-a", NodeID: "node-b"},
		PeerIdentity{ClusterID: "cluster-a", NodeID: "node-a"}
	return reply
}