Refactor RDP proxy handling and update related tests
This commit is contained in:
@@ -11,8 +11,9 @@ import (
|
||||
|
||||
func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
@@ -37,8 +38,9 @@ func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
|
||||
|
||||
func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
@@ -80,8 +82,9 @@ func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
|
||||
|
||||
func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
@@ -135,8 +138,9 @@ func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
|
||||
|
||||
func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
@@ -202,8 +206,9 @@ func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
|
||||
|
||||
func TestClientFabricSessionReportsRejectedStatus(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
|
||||
@@ -72,6 +72,10 @@ const (
|
||||
MaxProductionEnvelopePayloadBytes = 4096
|
||||
MaxProductionVPNPacketPayloadBytes = 256 * 1024
|
||||
MaxProductionEnvelopeFutureSkew = time.Minute
|
||||
ProductionForwardQUICStreamID = 1
|
||||
WebIngressForwardQUICStreamID = 2
|
||||
FabricControlForwardQUICStreamID = 3
|
||||
SyntheticForwardQUICStreamID = 1001
|
||||
)
|
||||
|
||||
type PeerIdentity struct {
|
||||
|
||||
@@ -47,6 +47,9 @@ func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts Endpoin
|
||||
}
|
||||
out := make([]ScoredPeerEndpointCandidate, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
if endpointHasUnspecifiedHost(candidate.Address) {
|
||||
continue
|
||||
}
|
||||
out = append(out, scorePeerEndpointCandidate(candidate, opts))
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
@@ -68,25 +71,25 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
|
||||
score := 100
|
||||
reasons := []string{"base"}
|
||||
|
||||
switch candidate.Transport {
|
||||
switch strings.ToLower(strings.TrimSpace(candidate.Transport)) {
|
||||
case "quic", "direct_quic", "udp_quic", "quic_udp":
|
||||
score += 45
|
||||
reasons = append(reasons, "transport:quic")
|
||||
case "direct_tcp_tls", "direct_http", "direct_https":
|
||||
score += 35
|
||||
reasons = append(reasons, "transport:direct")
|
||||
case "wss":
|
||||
score += 25
|
||||
reasons = append(reasons, "transport:wss")
|
||||
case "outbound_reverse":
|
||||
score += 10
|
||||
reasons = append(reasons, "transport:outbound_reverse")
|
||||
case "relay":
|
||||
case "lan_quic":
|
||||
score += 42
|
||||
reasons = append(reasons, "transport:lan_quic")
|
||||
case "ice_quic":
|
||||
score += 38
|
||||
reasons = append(reasons, "transport:ice_quic")
|
||||
case "reverse_quic":
|
||||
score += 15
|
||||
reasons = append(reasons, "transport:reverse_quic")
|
||||
case "relay_quic":
|
||||
score += 5
|
||||
reasons = append(reasons, "transport:relay")
|
||||
reasons = append(reasons, "transport:relay_quic")
|
||||
default:
|
||||
score -= 100
|
||||
reasons = append(reasons, "transport:unknown")
|
||||
reasons = append(reasons, "transport:non_quic_rejected")
|
||||
}
|
||||
|
||||
switch candidate.Reachability {
|
||||
@@ -173,7 +176,8 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
|
||||
score += 8
|
||||
reasons = append(reasons, "channel:control-direct")
|
||||
}
|
||||
if candidate.Transport == "relay" {
|
||||
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
|
||||
if transport == "relay" || transport == "relay_quic" {
|
||||
score -= 8
|
||||
reasons = append(reasons, "channel:control-relay-penalty")
|
||||
}
|
||||
@@ -234,14 +238,20 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
|
||||
}
|
||||
switch {
|
||||
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50:
|
||||
score += 18
|
||||
score += 24
|
||||
reasons = append(reasons, "latency:low")
|
||||
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 150:
|
||||
score += 8
|
||||
reasons = append(reasons, "latency:moderate")
|
||||
case observation.LastLatencyMs > 0:
|
||||
score -= 10
|
||||
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 300:
|
||||
score -= 12
|
||||
reasons = append(reasons, "latency:high")
|
||||
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 750:
|
||||
score -= 32
|
||||
reasons = append(reasons, "latency:very_high")
|
||||
case observation.LastLatencyMs > 0:
|
||||
score -= 60
|
||||
reasons = append(reasons, "latency:extreme")
|
||||
}
|
||||
if observation.ReliabilityScore > 0 {
|
||||
switch {
|
||||
|
||||
@@ -13,7 +13,7 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Transport: "relay_quic",
|
||||
Address: "relay.example.test/node-b",
|
||||
Reachability: "relay",
|
||||
NATType: "symmetric",
|
||||
@@ -25,8 +25,8 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -38,8 +38,8 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-private-stale",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "10.0.0.5:443",
|
||||
Transport: "lan_quic",
|
||||
Address: "quic://10.0.0.5:19443",
|
||||
Reachability: "private",
|
||||
NATType: "restricted",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -74,8 +74,8 @@ func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
|
||||
{
|
||||
EndpointID: "endpoint-b",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.21:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.21:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -84,8 +84,8 @@ func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
|
||||
{
|
||||
EndpointID: "endpoint-a",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -103,10 +103,10 @@ func TestRankPeerEndpointCandidatesPrefersQUICFastPath(t *testing.T) {
|
||||
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-wss",
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "wss://node-b.example.test",
|
||||
Transport: "relay_quic",
|
||||
Address: "quic://relay.example.test:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -138,14 +138,44 @@ func TestRankPeerEndpointCandidatesPrefersQUICFastPath(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesDropsUnspecifiedQUICEndpoint(t *testing.T) {
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-unspecified",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://[::]:19131",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19131",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
}
|
||||
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{})
|
||||
if len(ranked) != 1 || ranked[0].Candidate.EndpointID != "node-b-public" {
|
||||
t.Fatalf("unspecified endpoint was not dropped: %+v", ranked)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -155,8 +185,8 @@ func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T)
|
||||
{
|
||||
EndpointID: "node-b-corp-lan",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "10.24.10.20:19001",
|
||||
Transport: "lan_quic",
|
||||
Address: "quic://10.24.10.20:19443",
|
||||
Reachability: "private",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -184,7 +214,7 @@ func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T
|
||||
{
|
||||
EndpointID: "node-b-outbound",
|
||||
NodeID: "node-b",
|
||||
Transport: "outbound_reverse",
|
||||
Transport: "reverse_quic",
|
||||
Address: "node-b.reverse.local",
|
||||
Reachability: "outbound_only",
|
||||
NATType: "symmetric",
|
||||
@@ -194,7 +224,7 @@ func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Transport: "relay_quic",
|
||||
Address: "relay.example.test/node-b",
|
||||
Reachability: "relay",
|
||||
NATType: "blocked",
|
||||
@@ -222,18 +252,18 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-direct",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-wss",
|
||||
EndpointID: "node-b-ice",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "node-b.example.test",
|
||||
Transport: "ice_quic",
|
||||
Address: "quic://node-b.example.test:19443",
|
||||
Reachability: "public",
|
||||
NATType: "restricted",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -253,8 +283,8 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
|
||||
ReliabilityScore: 50,
|
||||
ObservedAt: now.Add(-time.Minute),
|
||||
},
|
||||
"node-b-wss": {
|
||||
EndpointID: "node-b-wss",
|
||||
"node-b-ice": {
|
||||
EndpointID: "node-b-ice",
|
||||
LastLatencyMs: 35,
|
||||
SuccessCount: 8,
|
||||
ReliabilityScore: 95,
|
||||
@@ -262,8 +292,8 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
|
||||
},
|
||||
},
|
||||
})
|
||||
if ranked[0].Candidate.EndpointID != "node-b-wss" {
|
||||
t.Fatalf("top endpoint = %q, want node-b-wss: %+v", ranked[0].Candidate.EndpointID, ranked)
|
||||
if ranked[0].Candidate.EndpointID != "node-b-ice" {
|
||||
t.Fatalf("top endpoint = %q, want node-b-ice: %+v", ranked[0].Candidate.EndpointID, ranked)
|
||||
}
|
||||
if !containsReason(ranked[0].Reasons, "latency:low") || !containsReason(ranked[0].Reasons, "reliability:high") {
|
||||
t.Fatalf("top reasons missing health hints: %+v", ranked[0].Reasons)
|
||||
@@ -279,8 +309,8 @@ func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T)
|
||||
{
|
||||
EndpointID: "node-b-direct",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -321,10 +351,10 @@ func TestRankPeerEndpointCandidatesDoesNotRewardZeroLatencyFailure(t *testing.T)
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-wss",
|
||||
EndpointID: "node-b-ice",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "https://node-b.example.test:443",
|
||||
Transport: "ice_quic",
|
||||
Address: "quic://node-b.example.test:19444",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
@@ -345,14 +375,81 @@ func TestRankPeerEndpointCandidatesDoesNotRewardZeroLatencyFailure(t *testing.T)
|
||||
},
|
||||
MaxObservationAge: time.Minute,
|
||||
})
|
||||
if ranked[0].Candidate.EndpointID != "node-b-wss" {
|
||||
t.Fatalf("top endpoint = %q, want wss after repeated quic failures: %+v", ranked[0].Candidate.EndpointID, ranked)
|
||||
if ranked[0].Candidate.EndpointID != "node-b-ice" {
|
||||
t.Fatalf("top endpoint = %q, want ice_quic after repeated direct QUIC failures: %+v", ranked[0].Candidate.EndpointID, ranked)
|
||||
}
|
||||
if containsReason(ranked[1].Reasons, "latency:moderate") {
|
||||
t.Fatalf("zero latency failure was rewarded as moderate latency: %+v", ranked[1].Reasons)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesPenalizesSevereLatencyGradient(t *testing.T) {
|
||||
now := time.Date(2026, 5, 17, 6, 0, 0, 0, time.UTC)
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-lan",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://10.0.0.2:19443",
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "direct",
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-wan",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-bad-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay_quic",
|
||||
Address: "quic://relay.example.test:19443",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
}
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
Now: now,
|
||||
MaxVerificationAge: time.Minute,
|
||||
MaxObservationAge: time.Minute,
|
||||
Observations: map[string]EndpointCandidateHealthObservation{
|
||||
"node-b-lan": {
|
||||
EndpointID: "node-b-lan",
|
||||
LastLatencyMs: 4,
|
||||
ReliabilityScore: 95,
|
||||
ObservedAt: now,
|
||||
},
|
||||
"node-b-wan": {
|
||||
EndpointID: "node-b-wan",
|
||||
LastLatencyMs: 420,
|
||||
ReliabilityScore: 95,
|
||||
ObservedAt: now,
|
||||
},
|
||||
"node-b-bad-relay": {
|
||||
EndpointID: "node-b-bad-relay",
|
||||
LastLatencyMs: 900,
|
||||
ReliabilityScore: 95,
|
||||
ObservedAt: now,
|
||||
},
|
||||
},
|
||||
})
|
||||
if ranked[0].Candidate.EndpointID != "node-b-lan" || ranked[1].Candidate.EndpointID != "node-b-wan" || ranked[2].Candidate.EndpointID != "node-b-bad-relay" {
|
||||
t.Fatalf("ranked endpoints = %+v, want lan, wan, bad relay", ranked)
|
||||
}
|
||||
if !containsReason(ranked[1].Reasons, "latency:very_high") {
|
||||
t.Fatalf("wan reasons = %+v, want latency:very_high", ranked[1].Reasons)
|
||||
}
|
||||
if !containsReason(ranked[2].Reasons, "latency:extreme") {
|
||||
t.Fatalf("relay reasons = %+v, want latency:extreme", ranked[2].Reasons)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesTreatsCapacityAsSoftPressure(t *testing.T) {
|
||||
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
|
||||
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
|
||||
|
||||
@@ -0,0 +1,217 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FabricChannelRouteEventType labels the kind of routing decision carried by a
// FabricChannelRouteEvent.
type FabricChannelRouteEventType string

const (
	// FabricChannelRouteEventNone is reported by ObserveChannel when an
	// observation does not lead to a route change.
	FabricChannelRouteEventNone FabricChannelRouteEventType = ""

	// FabricChannelRouteEventOpened is reported by OpenChannel once a route has
	// been chosen for a new channel.
	FabricChannelRouteEventOpened FabricChannelRouteEventType = "opened"

	// FabricChannelRouteEventReroute is reported by ObserveChannel when the
	// channel is moved onto an alternative route.
	FabricChannelRouteEventReroute FabricChannelRouteEventType = "reroute"
)

// ErrFabricRouteRerouteSuppressed is a sentinel error for a suppressed reroute.
// NOTE(review): nothing in the visible router code returns it — confirm where
// it is produced and consumed.
var ErrFabricRouteRerouteSuppressed = errors.New("fabric route reroute suppressed")
|
||||
|
||||
// FabricChannelRouterConfig tunes when a FabricChannelRouter moves a channel
// to another route. Defaults are applied by normalizeFabricChannelRouterConfig.
type FabricChannelRouterConfig struct {
	// SchedulerConfig is handed to NewFabricRouteScheduler. Its
	// ProjectedChannelCost is filled from ProjectedChannelCost when it is
	// zero or negative.
	SchedulerConfig FabricRouteSchedulerConfig

	// MaxAckLatencyMs is the ack-latency threshold; an observation above it
	// triggers a reroute. Zero or negative disables the check.
	MaxAckLatencyMs int64

	// MaxRoutePressure is the pressure percentage above which the current
	// route is abandoned. Zero or negative is normalized to 90.
	MaxRoutePressure int

	// MinRerouteInterval is the minimum time between two reroutes of the same
	// channel; observations inside the window never cause a reroute. Zero or
	// negative disables the window.
	MinRerouteInterval time.Duration

	// ProjectedChannelCost is passed to fabricRoutePressurePercent when
	// evaluating route pressure. Zero or negative is normalized to 1.
	ProjectedChannelCost int
}
|
||||
|
||||
// FabricChannelRouter opens channels on a scheduled route and decides, from
// per-channel observations, when a channel should move to another route.
type FabricChannelRouter struct {
	// Config holds the reroute thresholds; it is re-normalized on use.
	Config FabricChannelRouterConfig

	// Scheduler picks a route out of a FabricRouteSet.
	Scheduler FabricRouteScheduler
}
|
||||
|
||||
// FabricChannelObservation is one measurement reported for a channel and fed
// to FabricChannelRouter.ObserveChannel.
type FabricChannelObservation struct {
	// ChannelID and RouteID identify what was observed.
	// NOTE(review): the visible router code does not read these two fields;
	// it uses the channel argument instead — confirm intended use.
	ChannelID string

	RouteID string

	// AckLatencyMs is compared against FabricChannelRouterConfig.MaxAckLatencyMs.
	AckLatencyMs int64

	// Failed requests a reroute unless the reroute interval suppresses it.
	Failed bool

	// BytesSent, BytesRecv, FramesSent and FramesRecv are added to the
	// channel's running counters by ObserveChannel.
	BytesSent uint64

	BytesRecv uint64

	FramesSent uint64

	FramesRecv uint64

	// Reason, when not blank, is used verbatim as the reroute event reason.
	Reason string

	// ObservedAt defaults to the observation time passed to ObserveChannel
	// when left zero.
	ObservedAt time.Time
}
|
||||
|
||||
// FabricChannelRouteEvent describes the outcome of OpenChannel or
// ObserveChannel for a single channel.
type FabricChannelRouteEvent struct {
	// Type says whether the channel was opened, rerouted, or left unchanged.
	Type FabricChannelRouteEventType

	// Reason is the scheduler's reason for an open event, or the observation
	// reason (or a derived one) for a reroute event.
	Reason string

	// PreviousRoute is the route the channel left; it is only set on reroute
	// events and is zero-valued when the old route is not in the route set.
	PreviousRoute FabricRoute

	// NextRoute is the route selected for the channel.
	NextRoute FabricRoute

	// Choice is the scheduler result that produced NextRoute.
	Choice FabricRouteChoice

	// Observation is the (normalized) observation that was evaluated; it is
	// not set on open events.
	Observation FabricChannelObservation

	// Channel is the channel state as returned alongside the event.
	Channel FabricChannel

	// OccurredAt is the time used for the decision.
	OccurredAt time.Time
}
|
||||
|
||||
func NewFabricChannelRouter(cfg FabricChannelRouterConfig) FabricChannelRouter {
|
||||
cfg = normalizeFabricChannelRouterConfig(cfg)
|
||||
return FabricChannelRouter{
|
||||
Config: cfg,
|
||||
Scheduler: NewFabricRouteScheduler(cfg.SchedulerConfig),
|
||||
}
|
||||
}
|
||||
|
||||
func (r FabricChannelRouter) OpenChannel(spec FabricChannelSpec, routeSet FabricRouteSet, now time.Time) (FabricChannel, FabricChannelRouteEvent, error) {
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
choice, err := r.Scheduler.ChooseRoute(spec, routeSet, now)
|
||||
if err != nil {
|
||||
return FabricChannel{}, FabricChannelRouteEvent{}, err
|
||||
}
|
||||
channel := FabricChannel{
|
||||
Spec: spec,
|
||||
State: FabricChannelOpen,
|
||||
RouteID: choice.Route.RouteID,
|
||||
TargetNode: choice.Route.DestinationNodeID,
|
||||
OpenedAt: now,
|
||||
}
|
||||
event := FabricChannelRouteEvent{
|
||||
Type: FabricChannelRouteEventOpened,
|
||||
Reason: choice.Reason,
|
||||
NextRoute: choice.Route,
|
||||
Choice: choice,
|
||||
Channel: channel,
|
||||
OccurredAt: now,
|
||||
}
|
||||
return channel, event, nil
|
||||
}
|
||||
|
||||
func (r FabricChannelRouter) ObserveChannel(channel FabricChannel, routeSet FabricRouteSet, observation FabricChannelObservation, now time.Time) (FabricChannel, FabricChannelRouteEvent, error) {
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
if observation.ObservedAt.IsZero() {
|
||||
observation.ObservedAt = now
|
||||
}
|
||||
channel.BytesSent += observation.BytesSent
|
||||
channel.BytesRecv += observation.BytesRecv
|
||||
channel.FramesSent += observation.FramesSent
|
||||
channel.FramesRecv += observation.FramesRecv
|
||||
if channel.State == "" {
|
||||
channel.State = FabricChannelOpen
|
||||
}
|
||||
if !r.shouldReroute(channel, observation, routeSet, now) {
|
||||
return channel, FabricChannelRouteEvent{Type: FabricChannelRouteEventNone, Observation: observation, Channel: channel, OccurredAt: now}, nil
|
||||
}
|
||||
previous, _ := findFabricRoute(routeSet, channel.RouteID)
|
||||
choice, err := r.chooseAlternativeRoute(channel.Spec, routeSet, channel.RouteID, now)
|
||||
if err != nil {
|
||||
return channel, FabricChannelRouteEvent{}, err
|
||||
}
|
||||
channel.RouteID = choice.Route.RouteID
|
||||
channel.TargetNode = choice.Route.DestinationNodeID
|
||||
channel.LastReroute = now
|
||||
channel.RerouteCount++
|
||||
reason := observation.Reason
|
||||
if strings.TrimSpace(reason) == "" {
|
||||
reason = rerouteReason(r.Config, observation, previous)
|
||||
}
|
||||
event := FabricChannelRouteEvent{
|
||||
Type: FabricChannelRouteEventReroute,
|
||||
Reason: reason,
|
||||
PreviousRoute: previous,
|
||||
NextRoute: choice.Route,
|
||||
Choice: choice,
|
||||
Observation: observation,
|
||||
Channel: channel,
|
||||
OccurredAt: now,
|
||||
}
|
||||
return channel, event, nil
|
||||
}
|
||||
|
||||
func (r FabricChannelRouter) shouldReroute(channel FabricChannel, observation FabricChannelObservation, routeSet FabricRouteSet, now time.Time) bool {
|
||||
cfg := normalizeFabricChannelRouterConfig(r.Config)
|
||||
if cfg.MinRerouteInterval > 0 && !channel.LastReroute.IsZero() && now.Sub(channel.LastReroute) < cfg.MinRerouteInterval {
|
||||
return false
|
||||
}
|
||||
if observation.Failed {
|
||||
return true
|
||||
}
|
||||
if cfg.MaxAckLatencyMs > 0 && observation.AckLatencyMs > cfg.MaxAckLatencyMs {
|
||||
return true
|
||||
}
|
||||
if cfg.MaxRoutePressure > 0 {
|
||||
if route, ok := findFabricRoute(routeSet, channel.RouteID); ok && fabricRoutePressurePercent(route, cfg.ProjectedChannelCost) > cfg.MaxRoutePressure {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (r FabricChannelRouter) chooseAlternativeRoute(spec FabricChannelSpec, routeSet FabricRouteSet, currentRouteID string, now time.Time) (FabricRouteChoice, error) {
|
||||
routes := flattenFabricRouteSet(routeSet)
|
||||
alternatives := make([]FabricRoute, 0, len(routes))
|
||||
for _, route := range routes {
|
||||
if route.RouteID == currentRouteID {
|
||||
continue
|
||||
}
|
||||
alternatives = append(alternatives, route)
|
||||
}
|
||||
if len(alternatives) == 0 {
|
||||
return FabricRouteChoice{}, ErrFabricRouteNotFound
|
||||
}
|
||||
return r.Scheduler.ChooseRoute(spec, routeSetFromRoutes(routeSet, alternatives), now)
|
||||
}
|
||||
|
||||
func normalizeFabricChannelRouterConfig(cfg FabricChannelRouterConfig) FabricChannelRouterConfig {
|
||||
if cfg.ProjectedChannelCost <= 0 {
|
||||
cfg.ProjectedChannelCost = 1
|
||||
}
|
||||
if cfg.SchedulerConfig.ProjectedChannelCost <= 0 {
|
||||
cfg.SchedulerConfig.ProjectedChannelCost = cfg.ProjectedChannelCost
|
||||
}
|
||||
if cfg.MaxRoutePressure <= 0 {
|
||||
cfg.MaxRoutePressure = 90
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func rerouteReason(cfg FabricChannelRouterConfig, observation FabricChannelObservation, route FabricRoute) string {
|
||||
cfg = normalizeFabricChannelRouterConfig(cfg)
|
||||
switch {
|
||||
case observation.Failed:
|
||||
return "route_failure"
|
||||
case cfg.MaxAckLatencyMs > 0 && observation.AckLatencyMs > cfg.MaxAckLatencyMs:
|
||||
return "ack_latency_threshold"
|
||||
case cfg.MaxRoutePressure > 0 && fabricRoutePressurePercent(route, cfg.ProjectedChannelCost) > cfg.MaxRoutePressure:
|
||||
return "route_capacity_pressure"
|
||||
default:
|
||||
return "route_degraded"
|
||||
}
|
||||
}
|
||||
|
||||
func findFabricRoute(routeSet FabricRouteSet, routeID string) (FabricRoute, bool) {
|
||||
routeID = strings.TrimSpace(routeID)
|
||||
if routeID == "" {
|
||||
return FabricRoute{}, false
|
||||
}
|
||||
for _, route := range flattenFabricRouteSet(routeSet) {
|
||||
if route.RouteID == routeID {
|
||||
return route, true
|
||||
}
|
||||
}
|
||||
return FabricRoute{}, false
|
||||
}
|
||||
|
||||
func routeSetFromRoutes(template FabricRouteSet, routes []FabricRoute) FabricRouteSet {
|
||||
out := FabricRouteSet{TargetKind: template.TargetKind, TargetID: template.TargetID}
|
||||
if len(routes) == 0 {
|
||||
return out
|
||||
}
|
||||
out.Primary = routes[0]
|
||||
if len(routes) > 1 {
|
||||
out.WarmStandby = append(out.WarmStandby, routes[1:]...)
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -0,0 +1,151 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestFabricChannelRouterOpensOnBestRoute(t *testing.T) {
|
||||
router := NewFabricChannelRouter(FabricChannelRouterConfig{})
|
||||
now := time.Now()
|
||||
channel, event, err := router.OpenChannel(testFabricChannelSpec(FabricChannelTargetNode, "node-b"), FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testFabricRoute("route-slow", "node-b", 80, 100, 0, true),
|
||||
WarmStandby: []FabricRoute{
|
||||
testFabricRoute("route-fast", "node-b", 15, 100, 0, true),
|
||||
},
|
||||
}, now)
|
||||
if err != nil {
|
||||
t.Fatalf("open channel: %v", err)
|
||||
}
|
||||
if channel.RouteID != "route-fast" || channel.State != FabricChannelOpen {
|
||||
t.Fatalf("channel = %+v, want route-fast open", channel)
|
||||
}
|
||||
if event.Type != FabricChannelRouteEventOpened || event.NextRoute.RouteID != "route-fast" {
|
||||
t.Fatalf("event = %+v", event)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRouterReroutesOnSlowAck(t *testing.T) {
|
||||
router := NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 30})
|
||||
now := time.Now()
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testFabricRoute("route-primary", "node-b", 10, 100, 0, true),
|
||||
WarmStandby: []FabricRoute{
|
||||
testFabricRoute("route-standby", "node-b", 20, 100, 0, true),
|
||||
},
|
||||
}
|
||||
channel := FabricChannel{
|
||||
Spec: testFabricChannelSpec(FabricChannelTargetNode, "node-b"),
|
||||
State: FabricChannelOpen,
|
||||
RouteID: "route-primary",
|
||||
OpenedAt: now.Add(-time.Minute),
|
||||
}
|
||||
updated, event, err := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: channel.Spec.ChannelID,
|
||||
RouteID: channel.RouteID,
|
||||
AckLatencyMs: 120,
|
||||
BytesSent: 4096,
|
||||
FramesSent: 4,
|
||||
}, now)
|
||||
if err != nil {
|
||||
t.Fatalf("observe channel: %v", err)
|
||||
}
|
||||
if event.Type != FabricChannelRouteEventReroute || event.Reason != "ack_latency_threshold" {
|
||||
t.Fatalf("event = %+v", event)
|
||||
}
|
||||
if updated.RouteID != "route-standby" || updated.RerouteCount != 1 || updated.BytesSent != 4096 || updated.FramesSent != 4 {
|
||||
t.Fatalf("updated = %+v", updated)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRouterReroutesPoolTargetOnFailure(t *testing.T) {
|
||||
router := NewFabricChannelRouter(FabricChannelRouterConfig{})
|
||||
now := time.Now()
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetPool,
|
||||
TargetID: "pool-egress",
|
||||
Primary: testFabricPoolRoute("route-node-b", "node-b", 10, true),
|
||||
WarmStandby: []FabricRoute{
|
||||
testFabricPoolRoute("route-node-c", "node-c", 20, true),
|
||||
},
|
||||
}
|
||||
channel := FabricChannel{
|
||||
Spec: testFabricChannelSpec(FabricChannelTargetPool, "pool-egress"),
|
||||
State: FabricChannelOpen,
|
||||
RouteID: "route-node-b",
|
||||
TargetNode: "node-b",
|
||||
OpenedAt: now.Add(-time.Minute),
|
||||
}
|
||||
updated, event, err := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: channel.Spec.ChannelID,
|
||||
RouteID: channel.RouteID,
|
||||
Failed: true,
|
||||
Reason: "target_failed",
|
||||
}, now)
|
||||
if err != nil {
|
||||
t.Fatalf("observe channel: %v", err)
|
||||
}
|
||||
if event.Type != FabricChannelRouteEventReroute || event.PreviousRoute.RouteID != "route-node-b" || event.NextRoute.RouteID != "route-node-c" {
|
||||
t.Fatalf("event = %+v", event)
|
||||
}
|
||||
if updated.TargetNode != "node-c" || updated.RouteID != "route-node-c" {
|
||||
t.Fatalf("updated = %+v", updated)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRouterSuppressesRerouteInsideHysteresis(t *testing.T) {
|
||||
router := NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 30, MinRerouteInterval: time.Minute})
|
||||
now := time.Now()
|
||||
channel := FabricChannel{
|
||||
Spec: testFabricChannelSpec(FabricChannelTargetNode, "node-b"),
|
||||
State: FabricChannelOpen,
|
||||
RouteID: "route-primary",
|
||||
LastReroute: now.Add(-10 * time.Second),
|
||||
}
|
||||
updated, event, err := router.ObserveChannel(channel, FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testFabricRoute("route-primary", "node-b", 10, 100, 0, true),
|
||||
WarmStandby: []FabricRoute{testFabricRoute("route-standby", "node-b", 20, 100, 0, true)},
|
||||
}, FabricChannelObservation{AckLatencyMs: 120}, now)
|
||||
if err != nil {
|
||||
t.Fatalf("observe channel: %v", err)
|
||||
}
|
||||
if event.Type != FabricChannelRouteEventNone || updated.RouteID != "route-primary" {
|
||||
t.Fatalf("event=%+v updated=%+v", event, updated)
|
||||
}
|
||||
}
|
||||
|
||||
func testFabricChannelSpec(kind FabricChannelTargetKind, targetID string) FabricChannelSpec {
|
||||
return FabricChannelSpec{
|
||||
ChannelID: "channel-1",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
TargetKind: kind,
|
||||
TargetID: targetID,
|
||||
}
|
||||
}
|
||||
|
||||
func testFabricRoute(routeID string, destination string, latency int, capacity int, active int, healthy bool) FabricRoute {
|
||||
return FabricRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: destination,
|
||||
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: destination}},
|
||||
BaseLatencyMs: latency,
|
||||
Capacity: capacity,
|
||||
ActiveChannels: active,
|
||||
Healthy: healthy,
|
||||
}
|
||||
}
|
||||
|
||||
func testFabricPoolRoute(routeID string, destination string, latency int, healthy bool) FabricRoute {
|
||||
route := testFabricRoute(routeID, destination, latency, 100, 0, healthy)
|
||||
route.PoolID = "pool-egress"
|
||||
return route
|
||||
}
|
||||
@@ -0,0 +1,487 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
// FabricChannelRuntimeConfig configures a FabricChannelRuntime. Zero values
// are replaced with defaults by NewFabricChannelRuntime.
type FabricChannelRuntimeConfig struct {
	// RouterConfig is passed to NewFabricChannelRouter.
	RouterConfig FabricChannelRouterConfig

	// StreamID defaults to 2 when zero.
	StreamID uint64

	// TrafficClass defaults to fabricproto.TrafficClassBulk when zero.
	TrafficClass fabricproto.TrafficClass

	// Timeout defaults to 30 seconds when zero or negative.
	Timeout time.Duration

	// MaxPayload defaults to fabricproto.DefaultMaxPayload when zero or negative.
	MaxPayload int

	// RouteHealthTTL is passed unchanged to NewFabricRouteHealthTracker.
	RouteHealthTTL time.Duration
}
|
||||
|
||||
type FabricChannelRuntime struct {
|
||||
Transport FabricTransport
|
||||
Router FabricChannelRouter
|
||||
Pressure *FabricRoutePressureTracker
|
||||
Health *FabricRouteHealthTracker
|
||||
Config FabricChannelRuntimeConfig
|
||||
}
|
||||
|
||||
type FabricChannelRuntimeResult struct {
|
||||
Channel FabricChannel
|
||||
BytesSent uint64
|
||||
BytesRecv uint64
|
||||
FramesSent uint64
|
||||
FramesRecv uint64
|
||||
AcksReceived uint64
|
||||
RouteEvents []FabricChannelRouteEvent
|
||||
RouteAttempts []string
|
||||
MigrationEvents int
|
||||
RoutePressure FabricRoutePressureSnapshot
|
||||
RouteHealth FabricRouteHealthSnapshot
|
||||
}
|
||||
|
||||
type FabricChannelRequestResponseResult struct {
|
||||
FabricChannelRuntimeResult
|
||||
ResponsePayload []byte
|
||||
}
|
||||
|
||||
func NewFabricChannelRuntime(transport FabricTransport, cfg FabricChannelRuntimeConfig) *FabricChannelRuntime {
|
||||
if cfg.StreamID == 0 {
|
||||
cfg.StreamID = 2
|
||||
}
|
||||
if cfg.TrafficClass == 0 {
|
||||
cfg.TrafficClass = fabricproto.TrafficClassBulk
|
||||
}
|
||||
if cfg.Timeout <= 0 {
|
||||
cfg.Timeout = 30 * time.Second
|
||||
}
|
||||
if cfg.MaxPayload <= 0 {
|
||||
cfg.MaxPayload = fabricproto.DefaultMaxPayload
|
||||
}
|
||||
return &FabricChannelRuntime{
|
||||
Transport: transport,
|
||||
Router: NewFabricChannelRouter(cfg.RouterConfig),
|
||||
Pressure: NewFabricRoutePressureTracker(),
|
||||
Health: NewFabricRouteHealthTracker(cfg.RouteHealthTTL),
|
||||
Config: cfg,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) SendReliable(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payloads [][]byte) (FabricChannelRuntimeResult, error) {
|
||||
if r == nil || r.Transport == nil {
|
||||
return FabricChannelRuntimeResult{}, ErrForwardRuntimeUnavailable
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
routeSet = r.routeSetForScheduling(routeSet)
|
||||
channel, event, err := r.Router.OpenChannel(spec, routeSet, now)
|
||||
if err != nil {
|
||||
return FabricChannelRuntimeResult{}, err
|
||||
}
|
||||
result := FabricChannelRuntimeResult{Channel: channel, RouteEvents: []FabricChannelRouteEvent{event}}
|
||||
sequence := uint64(0)
|
||||
index := 0
|
||||
for index < len(payloads) {
|
||||
routeSet = r.routeSetForScheduling(routeSet)
|
||||
route, ok := findFabricRoute(routeSet, channel.RouteID)
|
||||
if !ok {
|
||||
return result, ErrFabricRouteNotFound
|
||||
}
|
||||
result.RouteAttempts = append(result.RouteAttempts, route.RouteID)
|
||||
target, err := FabricTransportTargetForRoute(route)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
releaseRoute := r.acquireRoute(route.RouteID)
|
||||
session, err := r.Transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
releaseRoute()
|
||||
r.markRouteFailure(route.RouteID, err)
|
||||
updated, event, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "connect_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
result.Channel = channel
|
||||
if event.Type == FabricChannelRouteEventReroute {
|
||||
result.RouteEvents = append(result.RouteEvents, event)
|
||||
result.MigrationEvents++
|
||||
continue
|
||||
}
|
||||
if rerouteErr != nil {
|
||||
return result, rerouteErr
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
migrated, sendErr := r.sendOnSession(ctx, session, &channel, routeSet, route, payloads, &index, &sequence, &result)
|
||||
_ = session.Close()
|
||||
releaseRoute()
|
||||
result.Channel = channel
|
||||
if sendErr != nil {
|
||||
return result, sendErr
|
||||
}
|
||||
if !migrated {
|
||||
break
|
||||
}
|
||||
}
|
||||
result.Channel = channel
|
||||
result.RoutePressure = r.snapshotRoutePressure()
|
||||
result.RouteHealth = r.snapshotRouteHealth()
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) SendRequestResponse(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (FabricChannelRequestResponseResult, error) {
|
||||
if r == nil || r.Transport == nil {
|
||||
return FabricChannelRequestResponseResult{}, ErrForwardRuntimeUnavailable
|
||||
}
|
||||
if len(payload) > r.Config.MaxPayload {
|
||||
return FabricChannelRequestResponseResult{}, fmt.Errorf("%w: %d > %d", fabricproto.ErrInvalidPayloadLen, len(payload), r.Config.MaxPayload)
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
routeSet = r.routeSetForScheduling(routeSet)
|
||||
channel, event, err := r.Router.OpenChannel(spec, routeSet, now)
|
||||
if err != nil {
|
||||
return FabricChannelRequestResponseResult{}, err
|
||||
}
|
||||
result := FabricChannelRequestResponseResult{
|
||||
FabricChannelRuntimeResult: FabricChannelRuntimeResult{Channel: channel, RouteEvents: []FabricChannelRouteEvent{event}},
|
||||
}
|
||||
sequence := uint64(1)
|
||||
for {
|
||||
routeSet = r.routeSetForScheduling(routeSet)
|
||||
route, ok := findFabricRoute(routeSet, channel.RouteID)
|
||||
if !ok {
|
||||
return result, ErrFabricRouteNotFound
|
||||
}
|
||||
result.RouteAttempts = append(result.RouteAttempts, route.RouteID)
|
||||
target, err := FabricTransportTargetForRoute(route)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
releaseRoute := r.acquireRoute(route.RouteID)
|
||||
session, err := r.Transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
releaseRoute()
|
||||
r.markRouteFailure(route.RouteID, err)
|
||||
updated, routeEvent, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "connect_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
result.Channel = channel
|
||||
if routeEvent.Type == FabricChannelRouteEventReroute {
|
||||
result.RouteEvents = append(result.RouteEvents, routeEvent)
|
||||
result.MigrationEvents++
|
||||
continue
|
||||
}
|
||||
if rerouteErr != nil {
|
||||
return result, rerouteErr
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
response, ackMs, sendErr := r.sendRequestResponseOnSession(ctx, session, route.RouteID, spec.ChannelID, payload, sequence)
|
||||
_ = session.Close()
|
||||
releaseRoute()
|
||||
result.Channel = channel
|
||||
if sendErr == nil {
|
||||
r.markRouteSuccess(route.RouteID)
|
||||
result.BytesSent += uint64(len(payload))
|
||||
result.FramesSent++
|
||||
result.BytesRecv += uint64(len(response))
|
||||
result.FramesRecv++
|
||||
result.AcksReceived++
|
||||
updated, routeEvent, observeErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
AckLatencyMs: ackMs,
|
||||
BytesSent: uint64(len(payload)),
|
||||
FramesSent: 1,
|
||||
BytesRecv: uint64(len(response)),
|
||||
FramesRecv: 1,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
result.Channel = channel
|
||||
if observeErr != nil {
|
||||
return result, observeErr
|
||||
}
|
||||
if routeEvent.Type == FabricChannelRouteEventReroute {
|
||||
result.RouteEvents = append(result.RouteEvents, routeEvent)
|
||||
result.MigrationEvents++
|
||||
}
|
||||
result.ResponsePayload = response
|
||||
result.RoutePressure = r.snapshotRoutePressure()
|
||||
result.RouteHealth = r.snapshotRouteHealth()
|
||||
return result, nil
|
||||
}
|
||||
r.markRouteFailure(route.RouteID, sendErr)
|
||||
updated, routeEvent, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "response_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
result.Channel = channel
|
||||
if routeEvent.Type == FabricChannelRouteEventReroute {
|
||||
result.RouteEvents = append(result.RouteEvents, routeEvent)
|
||||
result.MigrationEvents++
|
||||
continue
|
||||
}
|
||||
if rerouteErr != nil {
|
||||
return result, rerouteErr
|
||||
}
|
||||
return result, sendErr
|
||||
}
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
|
||||
if r != nil && r.Health != nil {
|
||||
routeSet = r.Health.Apply(routeSet, time.Now().UTC())
|
||||
}
|
||||
return r.routeSetWithActiveChannels(routeSet)
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) routeSetWithActiveChannels(routeSet FabricRouteSet) FabricRouteSet {
|
||||
if r == nil || r.Pressure == nil {
|
||||
return routeSet
|
||||
}
|
||||
return r.Pressure.Apply(routeSet)
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) acquireRoute(routeID string) func() {
|
||||
if r == nil || r.Pressure == nil {
|
||||
return func() {}
|
||||
}
|
||||
return r.Pressure.Acquire(routeID)
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) snapshotRoutePressure() FabricRoutePressureSnapshot {
|
||||
if r == nil || r.Pressure == nil {
|
||||
return FabricRoutePressureSnapshot{}
|
||||
}
|
||||
return r.Pressure.SnapshotPressure()
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) snapshotRouteHealth() FabricRouteHealthSnapshot {
|
||||
if r == nil || r.Health == nil {
|
||||
return FabricRouteHealthSnapshot{}
|
||||
}
|
||||
return r.Health.Snapshot(time.Now().UTC())
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) markRouteFailure(routeID string, err error) {
|
||||
if r == nil || r.Health == nil || err == nil {
|
||||
return
|
||||
}
|
||||
r.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) markRouteSuccess(routeID string) {
|
||||
if r == nil || r.Health == nil {
|
||||
return
|
||||
}
|
||||
r.Health.MarkSuccess(routeID)
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) sendOnSession(ctx context.Context, session FabricTransportSession, channel *FabricChannel, routeSet FabricRouteSet, route FabricRoute, payloads [][]byte, index *int, sequence *uint64, result *FabricChannelRuntimeResult) (bool, error) {
|
||||
cfg := r.Config
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameOpenStream,
|
||||
TrafficClass: cfg.TrafficClass,
|
||||
StreamID: cfg.StreamID,
|
||||
}); err != nil {
|
||||
r.markRouteFailure(route.RouteID, err)
|
||||
return false, err
|
||||
}
|
||||
for *index < len(payloads) {
|
||||
payload := payloads[*index]
|
||||
if len(payload) > cfg.MaxPayload {
|
||||
return false, fmt.Errorf("%w: %d > %d", fabricproto.ErrInvalidPayloadLen, len(payload), cfg.MaxPayload)
|
||||
}
|
||||
(*sequence)++
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: cfg.TrafficClass,
|
||||
StreamID: cfg.StreamID,
|
||||
Sequence: *sequence,
|
||||
Payload: payload,
|
||||
}); err != nil {
|
||||
r.markRouteFailure(route.RouteID, err)
|
||||
return false, err
|
||||
}
|
||||
ackOK, ackMs := waitForFabricRuntimeAck(ctx, session, cfg.StreamID, *sequence, cfg.Timeout)
|
||||
if !ackOK {
|
||||
r.markRouteFailure(route.RouteID, fmt.Errorf("ack_failed"))
|
||||
updated, event, err := r.Router.ObserveChannel(*channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: channel.Spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "ack_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
*channel = updated
|
||||
if event.Type == FabricChannelRouteEventReroute {
|
||||
result.RouteEvents = append(result.RouteEvents, event)
|
||||
result.MigrationEvents++
|
||||
return true, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
r.markRouteSuccess(route.RouteID)
|
||||
*index++
|
||||
result.BytesSent += uint64(len(payload))
|
||||
result.FramesSent++
|
||||
result.AcksReceived++
|
||||
updated, event, err := r.Router.ObserveChannel(*channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: channel.Spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
AckLatencyMs: ackMs,
|
||||
BytesSent: uint64(len(payload)),
|
||||
FramesSent: 1,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
*channel = updated
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if event.Type == FabricChannelRouteEventReroute {
|
||||
result.RouteEvents = append(result.RouteEvents, event)
|
||||
result.MigrationEvents++
|
||||
return true, nil
|
||||
}
|
||||
}
|
||||
_ = session.Send(context.Background(), fabricproto.Frame{
|
||||
Type: fabricproto.FrameCloseStream,
|
||||
TrafficClass: cfg.TrafficClass,
|
||||
StreamID: cfg.StreamID,
|
||||
})
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func (r *FabricChannelRuntime) sendRequestResponseOnSession(ctx context.Context, session FabricTransportSession, routeID string, channelID string, payload []byte, sequence uint64) ([]byte, int64, error) {
|
||||
cfg := r.Config
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameOpenStream,
|
||||
TrafficClass: cfg.TrafficClass,
|
||||
StreamID: cfg.StreamID,
|
||||
}); err != nil {
|
||||
r.markRouteFailure(routeID, err)
|
||||
return nil, 0, err
|
||||
}
|
||||
started := time.Now()
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: cfg.TrafficClass,
|
||||
StreamID: cfg.StreamID,
|
||||
Sequence: sequence,
|
||||
Payload: payload,
|
||||
}); err != nil {
|
||||
r.markRouteFailure(routeID, err)
|
||||
return nil, 0, err
|
||||
}
|
||||
waitCtx := ctx
|
||||
if cfg.Timeout > 0 {
|
||||
var cancel context.CancelFunc
|
||||
waitCtx, cancel = context.WithTimeout(ctx, cfg.Timeout)
|
||||
defer cancel()
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case <-waitCtx.Done():
|
||||
return nil, 0, waitCtx.Err()
|
||||
case err, ok := <-session.Errors():
|
||||
if !ok {
|
||||
return nil, 0, ErrForwardPeerUnavailable
|
||||
}
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
case frame, ok := <-session.Frames():
|
||||
if !ok {
|
||||
return nil, 0, ErrForwardPeerUnavailable
|
||||
}
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != cfg.StreamID || frame.Sequence != sequence {
|
||||
continue
|
||||
}
|
||||
_ = session.Send(context.Background(), fabricproto.Frame{
|
||||
Type: fabricproto.FrameCloseStream,
|
||||
TrafficClass: cfg.TrafficClass,
|
||||
StreamID: cfg.StreamID,
|
||||
})
|
||||
return append([]byte(nil), frame.Payload...), time.Since(started).Milliseconds(), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func FabricTransportTargetForRoute(route FabricRoute) (FabricTransportTarget, error) {
|
||||
if strings.TrimSpace(route.RouteID) == "" {
|
||||
return FabricTransportTarget{}, ErrFabricRouteNotFound
|
||||
}
|
||||
if route.RelayCount > 0 {
|
||||
for _, hop := range route.Hops {
|
||||
if hop.Mode != FabricRouteRelay {
|
||||
continue
|
||||
}
|
||||
if target, ok := fabricTransportTargetForHop(hop); ok {
|
||||
return target, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
for i := len(route.Hops) - 1; i >= 0; i-- {
|
||||
if target, ok := fabricTransportTargetForHop(route.Hops[i]); ok {
|
||||
return target, nil
|
||||
}
|
||||
}
|
||||
return FabricTransportTarget{}, fmt.Errorf("%w: route %s has no transport endpoint", ErrFabricRouteNotFound, route.RouteID)
|
||||
}
|
||||
|
||||
func fabricTransportTargetForHop(hop FabricRouteHop) (FabricTransportTarget, bool) {
|
||||
endpoint := strings.TrimSpace(hop.Address)
|
||||
if endpoint == "" {
|
||||
return FabricTransportTarget{}, false
|
||||
}
|
||||
transport := string(hop.Mode)
|
||||
if transport == "" {
|
||||
transport = "quic"
|
||||
}
|
||||
return FabricTransportTarget{
|
||||
EndpointID: hop.EndpointID,
|
||||
PeerID: strings.TrimSpace(hop.NodeID),
|
||||
Endpoint: endpoint,
|
||||
Transport: transport,
|
||||
PeerCertSHA256: strings.TrimSpace(hop.PeerCertSHA256),
|
||||
}, true
|
||||
}
|
||||
|
||||
func waitForFabricRuntimeAck(ctx context.Context, session FabricTransportSession, streamID uint64, sequence uint64, timeout time.Duration) (bool, int64) {
|
||||
started := time.Now()
|
||||
if timeout > 0 {
|
||||
var cancel context.CancelFunc
|
||||
ctx, cancel = context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return false, 0
|
||||
case err, ok := <-session.Errors():
|
||||
if !ok || err != nil {
|
||||
return false, 0
|
||||
}
|
||||
case frame, ok := <-session.Frames():
|
||||
if !ok {
|
||||
return false, 0
|
||||
}
|
||||
if frame.Type == fabricproto.FrameAck && frame.StreamID == streamID && frame.Sequence == sequence {
|
||||
return true, time.Since(started).Milliseconds()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,495 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
func TestFabricChannelRuntimeMigratesSlowAckToStandbyRoute(t *testing.T) {
|
||||
transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
|
||||
"quic://slow.example.test:19443": 60 * time.Millisecond,
|
||||
"quic://fast.example.test:19443": 0,
|
||||
})
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
|
||||
RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
|
||||
StreamID: 9,
|
||||
})
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testRuntimeRoute("route-slow", "node-b", "quic://slow.example.test:19443", 10),
|
||||
WarmStandby: []FabricRoute{
|
||||
testRuntimeRoute("route-fast", "node-b", "quic://fast.example.test:19443", 20),
|
||||
},
|
||||
}
|
||||
result, err := runtime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{
|
||||
[]byte("one"),
|
||||
[]byte("two"),
|
||||
[]byte("three"),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("send reliable: %v", err)
|
||||
}
|
||||
if result.MigrationEvents != 1 {
|
||||
t.Fatalf("migration events = %d, want 1: %+v", result.MigrationEvents, result.RouteEvents)
|
||||
}
|
||||
if result.Channel.RouteID != "route-fast" || result.Channel.RerouteCount != 1 {
|
||||
t.Fatalf("channel = %+v", result.Channel)
|
||||
}
|
||||
if result.BytesSent != uint64(len("one")+len("two")+len("three")) || result.AcksReceived != 3 {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if got := transport.connectCount("quic://slow.example.test:19443"); got != 1 {
|
||||
t.Fatalf("slow connect count = %d, want 1", got)
|
||||
}
|
||||
if got := transport.connectCount("quic://fast.example.test:19443"); got != 1 {
|
||||
t.Fatalf("fast connect count = %d, want 1", got)
|
||||
}
|
||||
if result.RoutePressure.AcquiredTotal != 2 || result.RoutePressure.ReleasedTotal != 2 || result.RoutePressure.MaxActiveTotal == 0 {
|
||||
t.Fatalf("route pressure = %+v", result.RoutePressure)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRuntimeReroutesOnConnectFailure(t *testing.T) {
|
||||
transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
|
||||
"quic://fast.example.test:19443": 0,
|
||||
})
|
||||
transport.failConnect["quic://dead.example.test:19443"] = true
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
|
||||
RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
|
||||
StreamID: 9,
|
||||
})
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testRuntimeRoute("route-dead", "node-b", "quic://dead.example.test:19443", 10),
|
||||
WarmStandby: []FabricRoute{
|
||||
testRuntimeRoute("route-fast", "node-b", "quic://fast.example.test:19443", 20),
|
||||
},
|
||||
}
|
||||
result, err := runtime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("payload")})
|
||||
if err != nil {
|
||||
t.Fatalf("send reliable: %v", err)
|
||||
}
|
||||
if result.MigrationEvents != 1 || result.Channel.RouteID != "route-fast" || result.BytesSent != uint64(len("payload")) {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRuntimeQuarantinesFailedRouteAcrossChannels(t *testing.T) {
|
||||
transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
|
||||
"quic://fast.example.test:19443": 0,
|
||||
})
|
||||
transport.failConnect["quic://dead.example.test:19443"] = true
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
|
||||
RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
|
||||
StreamID: 9,
|
||||
RouteHealthTTL: time.Minute,
|
||||
})
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testRuntimeRoute("route-dead", "node-b", "quic://dead.example.test:19443", 10),
|
||||
WarmStandby: []FabricRoute{
|
||||
testRuntimeRoute("route-fast", "node-b", "quic://fast.example.test:19443", 20),
|
||||
},
|
||||
}
|
||||
|
||||
first, err := runtime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("first")})
|
||||
if err != nil {
|
||||
t.Fatalf("first send reliable: %v", err)
|
||||
}
|
||||
if first.Channel.RouteID != "route-fast" || first.RouteHealth.Quarantined["route-dead"].Failures != 1 {
|
||||
t.Fatalf("first result = %+v", first)
|
||||
}
|
||||
second, err := runtime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("second")})
|
||||
if err != nil {
|
||||
t.Fatalf("second send reliable: %v", err)
|
||||
}
|
||||
if second.Channel.RouteID != "route-fast" {
|
||||
t.Fatalf("second route = %s, want route-fast", second.Channel.RouteID)
|
||||
}
|
||||
if got := transport.connectCount("quic://dead.example.test:19443"); got != 1 {
|
||||
t.Fatalf("dead connect count = %d, want one attempt before quarantine", got)
|
||||
}
|
||||
if got := transport.connectCount("quic://fast.example.test:19443"); got != 2 {
|
||||
t.Fatalf("fast connect count = %d, want both channels on healthy route", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRuntimeReroutesOnAckTimeout(t *testing.T) {
|
||||
transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
|
||||
"quic://slow.example.test:19443": 100 * time.Millisecond,
|
||||
"quic://fast.example.test:19443": 0,
|
||||
})
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
|
||||
RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
|
||||
StreamID: 9,
|
||||
Timeout: 10 * time.Millisecond,
|
||||
})
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testRuntimeRoute("route-slow", "node-b", "quic://slow.example.test:19443", 10),
|
||||
WarmStandby: []FabricRoute{
|
||||
testRuntimeRoute("route-fast", "node-b", "quic://fast.example.test:19443", 20),
|
||||
},
|
||||
}
|
||||
result, err := runtime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("payload")})
|
||||
if err != nil {
|
||||
t.Fatalf("send reliable: %v", err)
|
||||
}
|
||||
if result.MigrationEvents != 1 || result.Channel.RouteID != "route-fast" || result.BytesSent != uint64(len("payload")) {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRuntimeSpreadsConcurrentChannelsBySharedPressure(t *testing.T) {
|
||||
transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
|
||||
"quic://route-a.example.test:19443": 80 * time.Millisecond,
|
||||
"quic://route-b.example.test:19443": 0,
|
||||
})
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{StreamID: 9})
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testRuntimeRoute("route-a", "node-b", "quic://route-a.example.test:19443", 10),
|
||||
WarmStandby: []FabricRoute{
|
||||
testRuntimeRoute("route-b", "node-b", "quic://route-b.example.test:19443", 11),
|
||||
},
|
||||
}
|
||||
|
||||
firstDone := make(chan error, 1)
|
||||
go func() {
|
||||
_, err := runtime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("one")})
|
||||
firstDone <- err
|
||||
}()
|
||||
transport.waitForConnect(t, "quic://route-a.example.test:19443", 1)
|
||||
result, err := runtime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("two")})
|
||||
if err != nil {
|
||||
t.Fatalf("second send reliable: %v", err)
|
||||
}
|
||||
if result.Channel.RouteID != "route-b" {
|
||||
t.Fatalf("second route = %s, want route-b", result.Channel.RouteID)
|
||||
}
|
||||
if got := transport.connectCount("quic://route-b.example.test:19443"); got != 1 {
|
||||
t.Fatalf("route-b connect count = %d, want 1", got)
|
||||
}
|
||||
if err := <-firstDone; err != nil {
|
||||
t.Fatalf("first send reliable: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRuntimeRequestResponseReturnsPayload(t *testing.T) {
|
||||
transport := newFakeFabricRequestResponseTransport(map[string][]byte{
|
||||
"quic://runtime.example.test:19443": []byte(`{"status":"ok"}`),
|
||||
})
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
|
||||
RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
|
||||
StreamID: 9,
|
||||
})
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetPool,
|
||||
TargetID: "pool-admin-runtime",
|
||||
Primary: testRuntimePoolRoute("route-runtime", "pool-admin-runtime", "node-runtime", "quic://runtime.example.test:19443", 10),
|
||||
}
|
||||
|
||||
result, err := runtime.SendRequestResponse(context.Background(), FabricChannelSpec{
|
||||
ChannelID: "channel-web-1",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
TargetKind: FabricChannelTargetPool,
|
||||
TargetID: "pool-admin-runtime",
|
||||
TrafficClass: "control",
|
||||
CreatedAt: time.Now().UTC(),
|
||||
}, routeSet, []byte(`{"request":true}`))
|
||||
if err != nil {
|
||||
t.Fatalf("request response: %v", err)
|
||||
}
|
||||
if string(result.ResponsePayload) != `{"status":"ok"}` {
|
||||
t.Fatalf("response payload = %s", string(result.ResponsePayload))
|
||||
}
|
||||
if result.Channel.RouteID != "route-runtime" ||
|
||||
result.BytesSent != uint64(len(`{"request":true}`)) ||
|
||||
result.BytesRecv != uint64(len(`{"status":"ok"}`)) ||
|
||||
result.FramesSent != 1 ||
|
||||
result.FramesRecv != 1 ||
|
||||
result.AcksReceived != 1 {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelRuntimeRequestResponseReroutesOnResponseFailure(t *testing.T) {
|
||||
transport := newFakeFabricRequestResponseTransport(map[string][]byte{
|
||||
"quic://fast.example.test:19443": []byte(`{"status":"ok"}`),
|
||||
})
|
||||
transport.failResponse["quic://slow.example.test:19443"] = true
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
|
||||
RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
|
||||
StreamID: 9,
|
||||
Timeout: 10 * time.Millisecond,
|
||||
})
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-runtime",
|
||||
Primary: testRuntimeRoute("route-slow", "node-runtime", "quic://slow.example.test:19443", 10),
|
||||
WarmStandby: []FabricRoute{
|
||||
testRuntimeRoute("route-fast", "node-runtime", "quic://fast.example.test:19443", 20),
|
||||
},
|
||||
}
|
||||
|
||||
result, err := runtime.SendRequestResponse(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-runtime"), routeSet, []byte(`{"request":true}`))
|
||||
if err != nil {
|
||||
t.Fatalf("request response: %v", err)
|
||||
}
|
||||
if result.MigrationEvents != 1 || result.Channel.RouteID != "route-fast" || string(result.ResponsePayload) != `{"status":"ok"}` {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricTransportTargetForRouteUsesLastAddressedHop(t *testing.T) {
|
||||
target, err := FabricTransportTargetForRoute(FabricRoute{
|
||||
RouteID: "route-1",
|
||||
Hops: []FabricRouteHop{
|
||||
{NodeID: "node-a"},
|
||||
{NodeID: "node-r", Mode: FabricRouteRelay, EndpointID: "relay-1", Address: "quic://relay.example.test:19443"},
|
||||
{NodeID: "node-b", Mode: FabricRouteDirect, EndpointID: "node-b-quic", Address: "quic://node-b.example.test:19443"},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("target for route: %v", err)
|
||||
}
|
||||
if target.PeerID != "node-b" || target.EndpointID != "node-b-quic" || target.Endpoint != "quic://node-b.example.test:19443" || target.Transport != string(FabricRouteDirect) {
|
||||
t.Fatalf("target = %+v", target)
|
||||
}
|
||||
}
|
||||
|
||||
// fakeFabricRequestResponseTransport is a test transport whose sessions reply
// to data frames with a canned payload chosen by endpoint.
type fakeFabricRequestResponseTransport struct {
	mu sync.Mutex // held by Connect while it reads and updates the maps
	// responses maps an endpoint to the payload its sessions reply with.
	responses map[string][]byte
	// failResponse marks endpoints whose sessions never reply.
	failResponse map[string]bool
	// connects counts Connect calls per endpoint.
	connects map[string]int
}
|
||||
|
||||
func newFakeFabricRequestResponseTransport(responses map[string][]byte) *fakeFabricRequestResponseTransport {
|
||||
return &fakeFabricRequestResponseTransport{
|
||||
responses: responses,
|
||||
failResponse: map[string]bool{},
|
||||
connects: map[string]int{},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *fakeFabricRequestResponseTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
|
||||
endpoint := target.Endpoint
|
||||
t.mu.Lock()
|
||||
t.connects[endpoint]++
|
||||
response := append([]byte(nil), t.responses[endpoint]...)
|
||||
failResponse := t.failResponse[endpoint]
|
||||
t.mu.Unlock()
|
||||
return &fakeFabricRequestResponseSession{
|
||||
response: response,
|
||||
failResponse: failResponse,
|
||||
frames: make(chan fabricproto.Frame, 16),
|
||||
errors: make(chan error, 1),
|
||||
done: make(chan struct{}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (t *fakeFabricRequestResponseTransport) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeFabricRequestResponseSession struct {
|
||||
response []byte
|
||||
failResponse bool
|
||||
frames chan fabricproto.Frame
|
||||
errors chan error
|
||||
done chan struct{}
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (s *fakeFabricRequestResponseSession) Send(_ context.Context, frame fabricproto.Frame) error {
|
||||
if frame.Type != fabricproto.FrameData || s.failResponse {
|
||||
return nil
|
||||
}
|
||||
response := append([]byte(nil), s.response...)
|
||||
go func() {
|
||||
select {
|
||||
case <-s.done:
|
||||
case s.frames <- fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: frame.TrafficClass, StreamID: frame.StreamID, Sequence: frame.Sequence, Payload: response}:
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeFabricRequestResponseSession) Frames() <-chan fabricproto.Frame {
|
||||
return s.frames
|
||||
}
|
||||
|
||||
func (s *fakeFabricRequestResponseSession) Errors() <-chan error {
|
||||
return s.errors
|
||||
}
|
||||
|
||||
func (s *fakeFabricRequestResponseSession) Close() error {
|
||||
s.once.Do(func() {
|
||||
close(s.done)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeFabricRequestResponseSession) Closed() bool {
|
||||
select {
|
||||
case <-s.done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricTransportTargetForRouteUsesRelayHopForRelayRoute(t *testing.T) {
|
||||
target, err := FabricTransportTargetForRoute(FabricRoute{
|
||||
RouteID: "route-relay",
|
||||
RelayCount: 1,
|
||||
Hops: []FabricRouteHop{
|
||||
{NodeID: "node-a"},
|
||||
{NodeID: "node-r", Mode: FabricRouteRelay, EndpointID: "relay-1", Address: "quic://relay.example.test:19443", PeerCertSHA256: "relay-cert"},
|
||||
{NodeID: "node-b", Mode: FabricRouteRelay, EndpointID: "node-b-private", Address: "quic://10.0.0.2:19443", PeerCertSHA256: "node-b-cert"},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("target for relay route: %v", err)
|
||||
}
|
||||
if target.PeerID != "node-r" || target.EndpointID != "relay-1" || target.Endpoint != "quic://relay.example.test:19443" || target.PeerCertSHA256 != "relay-cert" {
|
||||
t.Fatalf("target = %+v", target)
|
||||
}
|
||||
}
|
||||
|
||||
// fakeFabricRuntimeTransport is a test transport whose sessions acknowledge
// data frames. All maps are keyed by endpoint address.
type fakeFabricRuntimeTransport struct {
	mu sync.Mutex // guards the maps below while Connect and the counters run

	delays      map[string]time.Duration // ack delay applied by sessions for the endpoint
	failConnect map[string]bool          // endpoints whose Connect must fail
	connects    map[string]int           // number of Connect calls seen per endpoint
}
|
||||
|
||||
func newFakeFabricRuntimeTransport(delays map[string]time.Duration) *fakeFabricRuntimeTransport {
|
||||
return &fakeFabricRuntimeTransport{
|
||||
delays: delays,
|
||||
failConnect: map[string]bool{},
|
||||
connects: map[string]int{},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *fakeFabricRuntimeTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
|
||||
endpoint := target.Endpoint
|
||||
t.mu.Lock()
|
||||
t.connects[endpoint]++
|
||||
fail := t.failConnect[endpoint]
|
||||
delay := t.delays[endpoint]
|
||||
t.mu.Unlock()
|
||||
if fail {
|
||||
return nil, ErrForwardPeerUnavailable
|
||||
}
|
||||
return &fakeFabricRuntimeSession{
|
||||
endpoint: endpoint,
|
||||
delay: delay,
|
||||
frames: make(chan fabricproto.Frame, 64),
|
||||
errors: make(chan error, 1),
|
||||
done: make(chan struct{}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (t *fakeFabricRuntimeTransport) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *fakeFabricRuntimeTransport) connectCount(endpoint string) int {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
return t.connects[endpoint]
|
||||
}
|
||||
|
||||
func (t *fakeFabricRuntimeTransport) waitForConnect(tb testing.TB, endpoint string, count int) {
|
||||
tb.Helper()
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for {
|
||||
t.mu.Lock()
|
||||
got := t.connects[endpoint]
|
||||
t.mu.Unlock()
|
||||
if got >= count {
|
||||
return
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
tb.Fatalf("timed out waiting for %s connect count %d, got %d", endpoint, count, got)
|
||||
}
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
type fakeFabricRuntimeSession struct {
|
||||
endpoint string
|
||||
delay time.Duration
|
||||
frames chan fabricproto.Frame
|
||||
errors chan error
|
||||
done chan struct{}
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (s *fakeFabricRuntimeSession) Send(_ context.Context, frame fabricproto.Frame) error {
|
||||
if frame.Type != fabricproto.FrameData {
|
||||
return nil
|
||||
}
|
||||
delay := s.delay
|
||||
go func() {
|
||||
if delay > 0 {
|
||||
time.Sleep(delay)
|
||||
}
|
||||
select {
|
||||
case <-s.done:
|
||||
case s.frames <- fabricproto.Frame{Type: fabricproto.FrameAck, TrafficClass: frame.TrafficClass, StreamID: frame.StreamID, Sequence: frame.Sequence}:
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeFabricRuntimeSession) Frames() <-chan fabricproto.Frame {
|
||||
return s.frames
|
||||
}
|
||||
|
||||
func (s *fakeFabricRuntimeSession) Errors() <-chan error {
|
||||
return s.errors
|
||||
}
|
||||
|
||||
func (s *fakeFabricRuntimeSession) Close() error {
|
||||
s.once.Do(func() {
|
||||
close(s.done)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeFabricRuntimeSession) Closed() bool {
|
||||
select {
|
||||
case <-s.done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func testRuntimeRoute(routeID string, destination string, endpoint string, latency int) FabricRoute {
|
||||
route := testFabricRoute(routeID, destination, latency, 100, 0, true)
|
||||
route.Hops[len(route.Hops)-1].Address = endpoint
|
||||
route.Hops[len(route.Hops)-1].EndpointID = strings.TrimPrefix(routeID, "route-")
|
||||
route.Hops[len(route.Hops)-1].Mode = FabricRouteDirect
|
||||
return route
|
||||
}
|
||||
|
||||
func testRuntimePoolRoute(routeID string, poolID string, destination string, endpoint string, latency int) FabricRoute {
|
||||
route := testRuntimeRoute(routeID, destination, endpoint, latency)
|
||||
route.PoolID = poolID
|
||||
return route
|
||||
}
|
||||
@@ -0,0 +1,390 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FabricChannelTargetKind says whether a channel is addressed to a single
// node or to a pool.
type FabricChannelTargetKind string

// Target kinds accepted by ValidateFabricChannelSpec.
const (
	// FabricChannelTargetNode addresses one node; the target ID is matched
	// against a route's DestinationNodeID.
	FabricChannelTargetNode FabricChannelTargetKind = "node"
	// FabricChannelTargetPool addresses a pool; the target ID is matched
	// against a route's PoolID.
	FabricChannelTargetPool FabricChannelTargetKind = "pool"
)
|
||||
|
||||
// FabricChannelLifecycleState names a stage in a fabric channel's life.
type FabricChannelLifecycleState string

// Lifecycle states recorded in FabricChannel.State.
const (
	FabricChannelOpening  FabricChannelLifecycleState = "opening"
	FabricChannelOpen     FabricChannelLifecycleState = "open"
	FabricChannelDraining FabricChannelLifecycleState = "draining"
	FabricChannelClosed   FabricChannelLifecycleState = "closed"
)
|
||||
|
||||
// FabricRouteMode names the way a hop or adjacency is reached.
type FabricRouteMode string

// Route modes used on FabricRouteHop.Mode and FabricAdjacency.Mode.
const (
	FabricRouteDirect  FabricRouteMode = "direct_quic"
	FabricRouteLAN     FabricRouteMode = "lan_quic"
	FabricRouteReverse FabricRouteMode = "reverse_quic"
	FabricRouteRelay   FabricRouteMode = "relay_quic"
	FabricRouteICE     FabricRouteMode = "ice_quic"
)
|
||||
|
||||
// Sentinel errors returned by channel validation and route selection; compare
// with errors.Is.
var (
	// ErrFabricChannelInvalid reports a channel spec or request that is
	// missing a required identifier or names an unknown target kind.
	ErrFabricChannelInvalid = errors.New("fabric channel request is invalid")
	// ErrFabricRouteNotFound reports that no candidate route was admissible.
	ErrFabricRouteNotFound = errors.New("fabric route not found")
)
|
||||
|
||||
type FabricChannelSpec struct {
|
||||
ChannelID string
|
||||
ClusterID string
|
||||
SourceNodeID string
|
||||
TargetKind FabricChannelTargetKind
|
||||
TargetID string
|
||||
TrafficClass string
|
||||
MinBandwidth int64
|
||||
StickyKey string
|
||||
CreatedAt time.Time
|
||||
ForbiddenHops []string
|
||||
}
|
||||
|
||||
type FabricServiceChannelTarget struct {
|
||||
Kind FabricChannelTargetKind
|
||||
PoolIDs []string
|
||||
NodeIDs []string
|
||||
SelectedNodeID string
|
||||
ServiceRole string
|
||||
SelectionPolicy string
|
||||
SingleMemberPool bool
|
||||
}
|
||||
|
||||
type FabricServiceChannelRequest struct {
|
||||
SchemaVersion string
|
||||
ChannelID string
|
||||
ClusterID string
|
||||
OrganizationID string
|
||||
UserID string
|
||||
ResourceID string
|
||||
SourceNodeID string
|
||||
SourceRole string
|
||||
ServiceClass string
|
||||
Target FabricServiceChannelTarget
|
||||
TrafficClass string
|
||||
CreatedAt time.Time
|
||||
}
|
||||
|
||||
type FabricChannel struct {
|
||||
Spec FabricChannelSpec
|
||||
State FabricChannelLifecycleState
|
||||
RouteID string
|
||||
TargetNode string
|
||||
OpenedAt time.Time
|
||||
LastReroute time.Time
|
||||
BytesSent uint64
|
||||
BytesRecv uint64
|
||||
FramesSent uint64
|
||||
FramesRecv uint64
|
||||
RerouteCount uint64
|
||||
}
|
||||
|
||||
type FabricRouteHop struct {
|
||||
NodeID string
|
||||
Mode FabricRouteMode
|
||||
EndpointID string
|
||||
Address string
|
||||
PeerCertSHA256 string
|
||||
}
|
||||
|
||||
type FabricRoute struct {
|
||||
RouteID string
|
||||
ClusterID string
|
||||
SourceNodeID string
|
||||
DestinationNodeID string
|
||||
PoolID string
|
||||
Hops []FabricRouteHop
|
||||
BaseLatencyMs int
|
||||
JitterMs int
|
||||
LossPermille int
|
||||
Capacity int
|
||||
ActiveChannels int
|
||||
RelayCount int
|
||||
LastUpdatedAt time.Time
|
||||
Healthy bool
|
||||
Degraded bool
|
||||
}
|
||||
|
||||
type FabricRouteSet struct {
|
||||
TargetKind FabricChannelTargetKind
|
||||
TargetID string
|
||||
Primary FabricRoute
|
||||
WarmStandby []FabricRoute
|
||||
ColdFallbacks []FabricRoute
|
||||
}
|
||||
|
||||
type FabricAdjacency struct {
|
||||
FromNodeID string
|
||||
ToNodeID string
|
||||
Mode FabricRouteMode
|
||||
RTTMs int
|
||||
JitterMs int
|
||||
LossPermille int
|
||||
Capacity int
|
||||
ActiveChannels int
|
||||
ThroughputBps int64
|
||||
PressurePercent int
|
||||
Healthy bool
|
||||
PassiveOutbound bool
|
||||
LocalSegmentID string
|
||||
NATGroupID string
|
||||
LastObservedAt time.Time
|
||||
LastFailureReason string
|
||||
}
|
||||
|
||||
type FabricRouteChoice struct {
|
||||
Route FabricRoute
|
||||
Score int
|
||||
Reason string
|
||||
PressureBefore int
|
||||
PressureAfter int
|
||||
}
|
||||
|
||||
// FabricRouteSchedulerConfig tunes route scoring. Non-positive weights,
// penalties and ProjectedChannelCost are replaced with built-in defaults by
// normalizeFabricRouteSchedulerConfig.
type FabricRouteSchedulerConfig struct {
	// Per-unit weights applied to the route's measurements.
	LatencyWeight  int
	JitterWeight   int
	LossWeight     int
	PressureWeight int

	// Flat penalties.
	HopPenalty      int // per hop
	RelayPenalty    int // per relay
	DegradedPenalty int // once, when the route is degraded

	// ProjectedChannelCost is added to a route's active channels when
	// computing its pressure after admission.
	ProjectedChannelCost int
	// HardMaxRoutePressure, when positive, rejects routes whose projected
	// pressure percentage exceeds it; zero disables the limit.
	HardMaxRoutePressure int
}
|
||||
|
||||
type FabricRouteScheduler struct {
|
||||
Config FabricRouteSchedulerConfig
|
||||
}
|
||||
|
||||
func NewFabricRouteScheduler(cfg FabricRouteSchedulerConfig) FabricRouteScheduler {
|
||||
return FabricRouteScheduler{Config: normalizeFabricRouteSchedulerConfig(cfg)}
|
||||
}
|
||||
|
||||
func (s FabricRouteScheduler) ChooseRoute(spec FabricChannelSpec, routeSet FabricRouteSet, now time.Time) (FabricRouteChoice, error) {
|
||||
if err := ValidateFabricChannelSpec(spec); err != nil {
|
||||
return FabricRouteChoice{}, err
|
||||
}
|
||||
routes := flattenFabricRouteSet(routeSet)
|
||||
if len(routes) == 0 {
|
||||
return FabricRouteChoice{}, ErrFabricRouteNotFound
|
||||
}
|
||||
forbidden := stringSet(spec.ForbiddenHops)
|
||||
choices := make([]FabricRouteChoice, 0, len(routes))
|
||||
for _, route := range routes {
|
||||
if !fabricRouteUsable(spec, route, forbidden, now) {
|
||||
continue
|
||||
}
|
||||
choice := s.scoreRoute(route)
|
||||
if s.Config.HardMaxRoutePressure > 0 && choice.PressureAfter > s.Config.HardMaxRoutePressure {
|
||||
continue
|
||||
}
|
||||
choice.Route = route
|
||||
choices = append(choices, choice)
|
||||
}
|
||||
if len(choices) == 0 {
|
||||
return FabricRouteChoice{}, ErrFabricRouteNotFound
|
||||
}
|
||||
sort.SliceStable(choices, func(i, j int) bool {
|
||||
if choices[i].Score != choices[j].Score {
|
||||
return choices[i].Score < choices[j].Score
|
||||
}
|
||||
if choices[i].PressureAfter != choices[j].PressureAfter {
|
||||
return choices[i].PressureAfter < choices[j].PressureAfter
|
||||
}
|
||||
if choices[i].Route.BaseLatencyMs != choices[j].Route.BaseLatencyMs {
|
||||
return choices[i].Route.BaseLatencyMs < choices[j].Route.BaseLatencyMs
|
||||
}
|
||||
return choices[i].Route.RouteID < choices[j].Route.RouteID
|
||||
})
|
||||
return choices[0], nil
|
||||
}
|
||||
|
||||
func ValidateFabricChannelSpec(spec FabricChannelSpec) error {
|
||||
if strings.TrimSpace(spec.ChannelID) == "" || strings.TrimSpace(spec.ClusterID) == "" || strings.TrimSpace(spec.SourceNodeID) == "" || strings.TrimSpace(spec.TargetID) == "" {
|
||||
return ErrFabricChannelInvalid
|
||||
}
|
||||
switch spec.TargetKind {
|
||||
case FabricChannelTargetNode, FabricChannelTargetPool:
|
||||
return nil
|
||||
default:
|
||||
return ErrFabricChannelInvalid
|
||||
}
|
||||
}
|
||||
|
||||
func FabricChannelSpecFromServiceRequest(req FabricServiceChannelRequest, localNodeID string, now time.Time) (FabricChannelSpec, error) {
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
sourceNodeID := firstNonEmpty(strings.TrimSpace(req.SourceNodeID), strings.TrimSpace(localNodeID))
|
||||
targetKind := req.Target.Kind
|
||||
if targetKind == "" {
|
||||
targetKind = FabricChannelTargetPool
|
||||
}
|
||||
targetID := firstNonEmpty(firstString(req.Target.PoolIDs), strings.TrimSpace(req.Target.SelectedNodeID), firstString(req.Target.NodeIDs))
|
||||
if targetKind == FabricChannelTargetNode {
|
||||
targetID = firstNonEmpty(strings.TrimSpace(req.Target.SelectedNodeID), firstString(req.Target.NodeIDs), targetID)
|
||||
}
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: firstNonEmpty(strings.TrimSpace(req.ChannelID), strings.TrimSpace(req.ResourceID)),
|
||||
ClusterID: strings.TrimSpace(req.ClusterID),
|
||||
SourceNodeID: sourceNodeID,
|
||||
TargetKind: targetKind,
|
||||
TargetID: targetID,
|
||||
TrafficClass: firstNonEmpty(strings.TrimSpace(req.TrafficClass), serviceClassDefaultTrafficClass(req.ServiceClass)),
|
||||
StickyKey: strings.TrimSpace(req.ResourceID),
|
||||
CreatedAt: now,
|
||||
}
|
||||
if err := ValidateFabricChannelSpec(spec); err != nil {
|
||||
return FabricChannelSpec{}, err
|
||||
}
|
||||
return spec, nil
|
||||
}
|
||||
|
||||
func serviceClassDefaultTrafficClass(serviceClass string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(serviceClass)) {
|
||||
case FabricServiceClassVPNPackets:
|
||||
return FabricServiceChannelBulk
|
||||
case FabricServiceClassRemoteWorkspace:
|
||||
return FabricServiceChannelInteractive
|
||||
default:
|
||||
return FabricServiceChannelReliable
|
||||
}
|
||||
}
|
||||
|
||||
// firstString returns the first element of values that is non-empty after
// trimming surrounding whitespace, in its trimmed form. It returns "" when
// values is nil, empty, or contains only blank strings.
func firstString(values []string) string {
	for _, value := range values {
		// Trim once and reuse the result for both the check and the return.
		if trimmed := strings.TrimSpace(value); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
||||
|
||||
func (s FabricRouteScheduler) scoreRoute(route FabricRoute) FabricRouteChoice {
|
||||
cfg := normalizeFabricRouteSchedulerConfig(s.Config)
|
||||
pressureBefore := fabricRoutePressurePercent(route, 0)
|
||||
pressureAfter := fabricRoutePressurePercent(route, cfg.ProjectedChannelCost)
|
||||
score := route.BaseLatencyMs*cfg.LatencyWeight +
|
||||
route.JitterMs*cfg.JitterWeight +
|
||||
route.LossPermille*cfg.LossWeight +
|
||||
pressureAfter*cfg.PressureWeight +
|
||||
len(route.Hops)*cfg.HopPenalty +
|
||||
route.RelayCount*cfg.RelayPenalty
|
||||
if route.Degraded {
|
||||
score += cfg.DegradedPenalty
|
||||
}
|
||||
reason := "latency_load_score"
|
||||
if pressureAfter >= 90 {
|
||||
reason = "capacity_pressure_avoidance"
|
||||
}
|
||||
if route.RelayCount > 0 {
|
||||
reason = "relay_fallback_available"
|
||||
}
|
||||
return FabricRouteChoice{Score: score, Reason: reason, PressureBefore: pressureBefore, PressureAfter: pressureAfter}
|
||||
}
|
||||
|
||||
func normalizeFabricRouteSchedulerConfig(cfg FabricRouteSchedulerConfig) FabricRouteSchedulerConfig {
|
||||
if cfg.LatencyWeight <= 0 {
|
||||
cfg.LatencyWeight = 10
|
||||
}
|
||||
if cfg.JitterWeight <= 0 {
|
||||
cfg.JitterWeight = 4
|
||||
}
|
||||
if cfg.LossWeight <= 0 {
|
||||
cfg.LossWeight = 8
|
||||
}
|
||||
if cfg.PressureWeight <= 0 {
|
||||
cfg.PressureWeight = 12
|
||||
}
|
||||
if cfg.HopPenalty <= 0 {
|
||||
cfg.HopPenalty = 5
|
||||
}
|
||||
if cfg.RelayPenalty <= 0 {
|
||||
cfg.RelayPenalty = 25
|
||||
}
|
||||
if cfg.DegradedPenalty <= 0 {
|
||||
cfg.DegradedPenalty = 500
|
||||
}
|
||||
if cfg.ProjectedChannelCost <= 0 {
|
||||
cfg.ProjectedChannelCost = 1
|
||||
}
|
||||
if cfg.HardMaxRoutePressure < 0 {
|
||||
cfg.HardMaxRoutePressure = 0
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func flattenFabricRouteSet(routeSet FabricRouteSet) []FabricRoute {
|
||||
routes := make([]FabricRoute, 0, 1+len(routeSet.WarmStandby)+len(routeSet.ColdFallbacks))
|
||||
if strings.TrimSpace(routeSet.Primary.RouteID) != "" {
|
||||
routes = append(routes, routeSet.Primary)
|
||||
}
|
||||
routes = append(routes, routeSet.WarmStandby...)
|
||||
routes = append(routes, routeSet.ColdFallbacks...)
|
||||
return routes
|
||||
}
|
||||
|
||||
func fabricRouteUsable(spec FabricChannelSpec, route FabricRoute, forbidden map[string]struct{}, now time.Time) bool {
|
||||
if strings.TrimSpace(route.RouteID) == "" || !route.Healthy {
|
||||
return false
|
||||
}
|
||||
if route.ClusterID != "" && spec.ClusterID != "" && route.ClusterID != spec.ClusterID {
|
||||
return false
|
||||
}
|
||||
if route.SourceNodeID != "" && route.SourceNodeID != spec.SourceNodeID {
|
||||
return false
|
||||
}
|
||||
switch spec.TargetKind {
|
||||
case FabricChannelTargetNode:
|
||||
if route.DestinationNodeID != "" && route.DestinationNodeID != spec.TargetID {
|
||||
return false
|
||||
}
|
||||
case FabricChannelTargetPool:
|
||||
if route.PoolID != "" && route.PoolID != spec.TargetID {
|
||||
return false
|
||||
}
|
||||
}
|
||||
for _, hop := range route.Hops {
|
||||
if _, blocked := forbidden[hop.NodeID]; blocked {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func fabricRoutePressurePercent(route FabricRoute, projected int) int {
|
||||
if route.Capacity <= 0 {
|
||||
return 100
|
||||
}
|
||||
active := route.ActiveChannels + projected
|
||||
if active <= 0 {
|
||||
return 0
|
||||
}
|
||||
pressure := (active * 100) / route.Capacity
|
||||
if pressure > 100 {
|
||||
return 100
|
||||
}
|
||||
return pressure
|
||||
}
|
||||
|
||||
// stringSet builds a set from values, trimming surrounding whitespace from
// each entry and dropping entries that are blank. The result is never nil.
func stringSet(values []string) map[string]struct{} {
	set := make(map[string]struct{}, len(values))
	for _, raw := range values {
		key := strings.TrimSpace(raw)
		if key == "" {
			continue
		}
		set[key] = struct{}{}
	}
	return set
}
|
||||
@@ -0,0 +1,244 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestFabricRouteSchedulerAvoidsSaturatedShortestRoute(t *testing.T) {
|
||||
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: "channel-1",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
}
|
||||
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: FabricRoute{
|
||||
RouteID: "short-saturated",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-b"}},
|
||||
BaseLatencyMs: 10,
|
||||
Capacity: 10,
|
||||
ActiveChannels: 10,
|
||||
Healthy: true,
|
||||
},
|
||||
WarmStandby: []FabricRoute{{
|
||||
RouteID: "slightly-longer-free",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-r"}, {NodeID: "node-b"}},
|
||||
BaseLatencyMs: 18,
|
||||
Capacity: 100,
|
||||
ActiveChannels: 5,
|
||||
RelayCount: 1,
|
||||
Healthy: true,
|
||||
}},
|
||||
}, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("choose route: %v", err)
|
||||
}
|
||||
if choice.Route.RouteID != "slightly-longer-free" {
|
||||
t.Fatalf("route = %q, want slightly-longer-free score=%d pressure=%d", choice.Route.RouteID, choice.Score, choice.PressureAfter)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelSpecFromServiceRequestTargetsPool(t *testing.T) {
|
||||
spec, err := FabricChannelSpecFromServiceRequest(FabricServiceChannelRequest{
|
||||
ChannelID: "vpn-1",
|
||||
ClusterID: "cluster-1",
|
||||
ResourceID: "vpn-1",
|
||||
ServiceClass: FabricServiceClassVPNPackets,
|
||||
Target: FabricServiceChannelTarget{
|
||||
Kind: FabricChannelTargetPool,
|
||||
PoolIDs: []string{"home-ipv4"},
|
||||
ServiceRole: "ipv4-egress",
|
||||
},
|
||||
}, "android-node", time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("service request spec: %v", err)
|
||||
}
|
||||
if spec.SourceNodeID != "android-node" || spec.TargetKind != FabricChannelTargetPool || spec.TargetID != "home-ipv4" || spec.TrafficClass != FabricServiceChannelBulk {
|
||||
t.Fatalf("unexpected spec: %+v", spec)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricChannelSpecFromServiceRequestKeepsServiceOutOfEndpointSelection(t *testing.T) {
|
||||
_, err := FabricChannelSpecFromServiceRequest(FabricServiceChannelRequest{
|
||||
ChannelID: "rdp-1",
|
||||
ClusterID: "cluster-1",
|
||||
ServiceClass: FabricServiceClassRemoteWorkspace,
|
||||
Target: FabricServiceChannelTarget{
|
||||
Kind: FabricChannelTargetPool,
|
||||
ServiceRole: "rdp-gateway",
|
||||
},
|
||||
}, "client-node", time.Now())
|
||||
if !errors.Is(err, ErrFabricChannelInvalid) {
|
||||
t.Fatalf("err = %v, want invalid without pool/node target id", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSchedulerPoolSkipsFailedEndpoint(t *testing.T) {
|
||||
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: "channel-pool",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
TargetKind: FabricChannelTargetPool,
|
||||
TargetID: "pool-egress",
|
||||
}
|
||||
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetPool,
|
||||
TargetID: "pool-egress",
|
||||
Primary: FabricRoute{
|
||||
RouteID: "pool-node-dead",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
PoolID: "pool-egress",
|
||||
Capacity: 100,
|
||||
Healthy: false,
|
||||
},
|
||||
WarmStandby: []FabricRoute{{
|
||||
RouteID: "pool-node-live",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-c",
|
||||
PoolID: "pool-egress",
|
||||
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-c"}},
|
||||
BaseLatencyMs: 25,
|
||||
Capacity: 100,
|
||||
Healthy: true,
|
||||
}},
|
||||
}, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("choose route: %v", err)
|
||||
}
|
||||
if choice.Route.DestinationNodeID != "node-c" {
|
||||
t.Fatalf("destination = %q, want node-c", choice.Route.DestinationNodeID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSchedulerHonorsForbiddenHops(t *testing.T) {
|
||||
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: "channel-1",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
ForbiddenHops: []string{"node-r"},
|
||||
}
|
||||
_, err := scheduler.ChooseRoute(spec, FabricRouteSet{
|
||||
Primary: FabricRoute{
|
||||
RouteID: "blocked",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-r"}, {NodeID: "node-b"}},
|
||||
Capacity: 100,
|
||||
Healthy: true,
|
||||
},
|
||||
}, time.Now())
|
||||
if !errors.Is(err, ErrFabricRouteNotFound) {
|
||||
t.Fatalf("err = %v, want ErrFabricRouteNotFound", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSchedulerRejectsRoutesAboveHardPressureLimit(t *testing.T) {
|
||||
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{HardMaxRoutePressure: 80})
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: "channel-pressure",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
}
|
||||
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
|
||||
Primary: FabricRoute{
|
||||
RouteID: "too-busy",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
Capacity: 10,
|
||||
ActiveChannels: 9,
|
||||
Healthy: true,
|
||||
},
|
||||
WarmStandby: []FabricRoute{{
|
||||
RouteID: "admissible",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
Capacity: 10,
|
||||
ActiveChannels: 5,
|
||||
Healthy: true,
|
||||
}},
|
||||
}, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("choose route: %v", err)
|
||||
}
|
||||
if choice.Route.RouteID != "admissible" {
|
||||
t.Fatalf("route = %q, want admissible", choice.Route.RouteID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSchedulerKeepsHighLatencyRouteAsFallbackUntilFastRouteSaturates(t *testing.T) {
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: "channel-latency-aware",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
TargetKind: FabricChannelTargetPool,
|
||||
TargetID: "pool-egress",
|
||||
}
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetPool,
|
||||
TargetID: "pool-egress",
|
||||
Primary: FabricRoute{
|
||||
RouteID: "lan-fast",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-lan",
|
||||
PoolID: "pool-egress",
|
||||
BaseLatencyMs: 4,
|
||||
Capacity: 100,
|
||||
ActiveChannels: 85,
|
||||
Healthy: true,
|
||||
},
|
||||
WarmStandby: []FabricRoute{{
|
||||
RouteID: "wan-slow",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-wan",
|
||||
PoolID: "pool-egress",
|
||||
BaseLatencyMs: 420,
|
||||
Capacity: 100,
|
||||
ActiveChannels: 0,
|
||||
Healthy: true,
|
||||
}},
|
||||
}
|
||||
|
||||
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{HardMaxRoutePressure: 90})
|
||||
choice, err := scheduler.ChooseRoute(spec, routeSet, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("choose route: %v", err)
|
||||
}
|
||||
if choice.Route.RouteID != "lan-fast" {
|
||||
t.Fatalf("route = %q, want fast LAN before hard pressure limit", choice.Route.RouteID)
|
||||
}
|
||||
|
||||
routeSet.Primary.ActiveChannels = 90
|
||||
choice, err = scheduler.ChooseRoute(spec, routeSet, time.Now())
|
||||
if err != nil {
|
||||
t.Fatalf("choose fallback route: %v", err)
|
||||
}
|
||||
if choice.Route.RouteID != "wan-slow" {
|
||||
t.Fatalf("route = %q, want WAN only after LAN reaches hard pressure limit", choice.Route.RouteID)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,130 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
type FabricOverlayTransportConfig struct {
|
||||
ClusterID string
|
||||
LocalNodeID string
|
||||
RouterConfig FabricChannelRouterConfig
|
||||
Timeout time.Duration
|
||||
}
|
||||
|
||||
type FabricOverlayTransport struct {
|
||||
Runtime *FabricChannelRuntime
|
||||
RouteSets map[string]FabricRouteSet
|
||||
Config FabricOverlayTransportConfig
|
||||
sequence atomic.Uint64
|
||||
}
|
||||
|
||||
type FabricOverlayTransportSnapshot struct {
|
||||
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
|
||||
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
|
||||
}
|
||||
|
||||
type FabricOverlaySendRequest struct {
|
||||
ChannelID string
|
||||
TargetKind FabricChannelTargetKind
|
||||
TargetID string
|
||||
TrafficClass fabricproto.TrafficClass
|
||||
Payloads [][]byte
|
||||
StickyKey string
|
||||
}
|
||||
|
||||
func NewFabricOverlayTransport(transport FabricTransport, routeSets map[string]FabricRouteSet, cfg FabricOverlayTransportConfig) *FabricOverlayTransport {
|
||||
if cfg.Timeout <= 0 {
|
||||
cfg.Timeout = 30 * time.Second
|
||||
}
|
||||
runtime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
|
||||
RouterConfig: cfg.RouterConfig,
|
||||
Timeout: cfg.Timeout,
|
||||
})
|
||||
normalized := make(map[string]FabricRouteSet, len(routeSets))
|
||||
for targetID, routeSet := range routeSets {
|
||||
targetID = strings.TrimSpace(targetID)
|
||||
if targetID != "" {
|
||||
normalized[targetID] = routeSet
|
||||
}
|
||||
}
|
||||
return &FabricOverlayTransport{
|
||||
Runtime: runtime,
|
||||
RouteSets: normalized,
|
||||
Config: cfg,
|
||||
}
|
||||
}
|
||||
|
||||
func (t *FabricOverlayTransport) Send(ctx context.Context, req FabricOverlaySendRequest) (FabricChannelRuntimeResult, error) {
|
||||
if t == nil || t.Runtime == nil {
|
||||
return FabricChannelRuntimeResult{}, ErrForwardRuntimeUnavailable
|
||||
}
|
||||
targetID := strings.TrimSpace(req.TargetID)
|
||||
if targetID == "" {
|
||||
return FabricChannelRuntimeResult{}, ErrFabricChannelInvalid
|
||||
}
|
||||
routeSet, ok := t.RouteSets[targetID]
|
||||
if !ok {
|
||||
return FabricChannelRuntimeResult{}, ErrFabricRouteNotFound
|
||||
}
|
||||
targetKind := req.TargetKind
|
||||
if targetKind == "" {
|
||||
targetKind = routeSet.TargetKind
|
||||
}
|
||||
if targetKind == "" {
|
||||
targetKind = FabricChannelTargetNode
|
||||
}
|
||||
trafficClass := req.TrafficClass
|
||||
if trafficClass == 0 {
|
||||
trafficClass = fabricproto.TrafficClassReliable
|
||||
}
|
||||
t.Runtime.Config.TrafficClass = trafficClass
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: firstNonEmpty(strings.TrimSpace(req.ChannelID), fmt.Sprintf("fabric-overlay-%d", t.sequence.Add(1))),
|
||||
ClusterID: strings.TrimSpace(t.Config.ClusterID),
|
||||
SourceNodeID: strings.TrimSpace(t.Config.LocalNodeID),
|
||||
TargetKind: targetKind,
|
||||
TargetID: targetID,
|
||||
TrafficClass: loadFabricTrafficClassName(trafficClass),
|
||||
StickyKey: strings.TrimSpace(req.StickyKey),
|
||||
CreatedAt: time.Now().UTC(),
|
||||
}
|
||||
return t.Runtime.SendReliable(ctx, spec, routeSet, req.Payloads)
|
||||
}
|
||||
|
||||
func (t *FabricOverlayTransport) SnapshotPressure() FabricRoutePressureSnapshot {
|
||||
if t == nil || t.Runtime == nil || t.Runtime.Pressure == nil {
|
||||
return FabricRoutePressureSnapshot{}
|
||||
}
|
||||
return t.Runtime.Pressure.SnapshotPressure()
|
||||
}
|
||||
|
||||
func (t *FabricOverlayTransport) Snapshot() FabricOverlayTransportSnapshot {
|
||||
if t == nil || t.Runtime == nil {
|
||||
return FabricOverlayTransportSnapshot{}
|
||||
}
|
||||
return FabricOverlayTransportSnapshot{
|
||||
RoutePressure: t.Runtime.snapshotRoutePressure(),
|
||||
RouteHealth: t.Runtime.snapshotRouteHealth(),
|
||||
}
|
||||
}
|
||||
|
||||
func loadFabricTrafficClassName(trafficClass fabricproto.TrafficClass) string {
|
||||
switch trafficClass {
|
||||
case fabricproto.TrafficClassControl:
|
||||
return "control"
|
||||
case fabricproto.TrafficClassInteractive:
|
||||
return "interactive"
|
||||
case fabricproto.TrafficClassBulk:
|
||||
return "bulk"
|
||||
case fabricproto.TrafficClassReliable:
|
||||
return "reliable"
|
||||
default:
|
||||
return fmt.Sprintf("traffic_class_%d", trafficClass)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
func TestFabricOverlayTransportSendsThroughRouteSet(t *testing.T) {
|
||||
transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
|
||||
"quic://node-b:19443": 0,
|
||||
})
|
||||
overlay := NewFabricOverlayTransport(transport, map[string]FabricRouteSet{
|
||||
"node-b": {
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: FabricRoute{
|
||||
RouteID: "node-b-direct",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
Hops: []FabricRouteHop{{NodeID: "node-b", Mode: FabricRouteDirect, EndpointID: "node-b-direct", Address: "quic://node-b:19443"}},
|
||||
Capacity: 100,
|
||||
Healthy: true,
|
||||
},
|
||||
},
|
||||
}, FabricOverlayTransportConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||
defer cancel()
|
||||
result, err := overlay.Send(ctx, FabricOverlaySendRequest{
|
||||
TargetID: "node-b",
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
Payloads: [][]byte{[]byte("payload")},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("send: %v", err)
|
||||
}
|
||||
if result.BytesSent != uint64(len("payload")) || result.AcksReceived != 1 {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if pressure := overlay.SnapshotPressure(); pressure.ActiveTotal != 0 || pressure.AcquiredTotal != pressure.ReleasedTotal {
|
||||
t.Fatalf("pressure leak: %+v", pressure)
|
||||
}
|
||||
if snapshot := overlay.Snapshot(); snapshot.RoutePressure.AcquiredTotal != 1 || len(snapshot.RouteHealth.Quarantined) != 0 {
|
||||
t.Fatalf("snapshot = %+v", snapshot)
|
||||
}
|
||||
}
|
||||
@@ -3,28 +3,50 @@ package mesh
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
"github.com/quic-go/quic-go"
|
||||
)
|
||||
|
||||
type QUICFabricServer struct {
|
||||
listener *quic.Listener
|
||||
logger FabricSessionEventLogger
|
||||
done chan struct{}
|
||||
closeOnce sync.Once
|
||||
listener *quic.Listener
|
||||
logger FabricSessionEventLogger
|
||||
reverseMu sync.RWMutex
|
||||
reverseTransport *QUICFabricTransport
|
||||
fabricFrameHandler FabricFrameHandler
|
||||
productionForwardHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
|
||||
webIngressForwardHandler func(context.Context, []byte) ([]byte, error)
|
||||
fabricControlHandler func(context.Context, []byte) ([]byte, error)
|
||||
syntheticForwardHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
|
||||
done chan struct{}
|
||||
closeOnce sync.Once
|
||||
}
|
||||
|
||||
type QUICFabricServerConfig struct {
|
||||
ListenAddr string
|
||||
TLSConfig *tls.Config
|
||||
QUICConfig *quic.Config
|
||||
Logger FabricSessionEventLogger
|
||||
ListenAddr string
|
||||
TLSConfig *tls.Config
|
||||
QUICConfig *quic.Config
|
||||
Logger FabricSessionEventLogger
|
||||
ReverseTransport *QUICFabricTransport
|
||||
FabricFrameHandler FabricFrameHandler
|
||||
ProductionForwardHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
|
||||
WebIngressForwardHandler func(context.Context, []byte) ([]byte, error)
|
||||
FabricControlHandler func(context.Context, []byte) ([]byte, error)
|
||||
SyntheticForwardHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
|
||||
}
|
||||
|
||||
type FabricFrameSender interface {
|
||||
SendFrame(context.Context, fabricproto.Frame) error
|
||||
}
|
||||
|
||||
type FabricFrameHandler func(context.Context, FabricFrameSender, fabricproto.Frame) (bool, error)
|
||||
|
||||
func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QUICFabricServer, error) {
|
||||
if cfg.ListenAddr == "" {
|
||||
return nil, fmt.Errorf("quic fabric listen addr is required")
|
||||
@@ -42,9 +64,15 @@ func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QU
|
||||
return nil, err
|
||||
}
|
||||
server := &QUICFabricServer{
|
||||
listener: listener,
|
||||
logger: cfg.Logger,
|
||||
done: make(chan struct{}),
|
||||
listener: listener,
|
||||
logger: cfg.Logger,
|
||||
reverseTransport: cfg.ReverseTransport,
|
||||
fabricFrameHandler: cfg.FabricFrameHandler,
|
||||
productionForwardHandler: cfg.ProductionForwardHandler,
|
||||
webIngressForwardHandler: cfg.WebIngressForwardHandler,
|
||||
fabricControlHandler: cfg.FabricControlHandler,
|
||||
syntheticForwardHandler: cfg.SyntheticForwardHandler,
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
go server.acceptLoop(ctx)
|
||||
return server, nil
|
||||
@@ -57,6 +85,15 @@ func (s *QUICFabricServer) Addr() net.Addr {
|
||||
return s.listener.Addr()
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) SetReverseTransport(transport *QUICFabricTransport) {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
s.reverseMu.Lock()
|
||||
s.reverseTransport = transport
|
||||
s.reverseMu.Unlock()
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) Close() error {
|
||||
if s == nil {
|
||||
return nil
|
||||
@@ -95,6 +132,8 @@ func (s *QUICFabricServer) handleConn(ctx context.Context, conn *quic.Conn) {
|
||||
|
||||
func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
|
||||
session := fabricproto.NewSession(fabricproto.SessionConfig{})
|
||||
sender := quicStreamFrameSender{stream: stream}
|
||||
defer func() { _ = stream.Close() }()
|
||||
s.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_quic_stream_opened",
|
||||
AcceptedBy: "quic",
|
||||
@@ -116,6 +155,29 @@ func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, st
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
s.registerReverseHelloFrame(conn, frame)
|
||||
if s.handleProductionForwardFrame(ctx, stream, frame) {
|
||||
continue
|
||||
}
|
||||
if s.handleWebIngressForwardFrame(ctx, stream, frame) {
|
||||
continue
|
||||
}
|
||||
if s.handleFabricControlForwardFrame(ctx, stream, frame) {
|
||||
continue
|
||||
}
|
||||
if s.handleSyntheticForwardFrame(ctx, conn, stream, frame) {
|
||||
continue
|
||||
}
|
||||
if s.fabricFrameHandler != nil {
|
||||
handled, err := s.fabricFrameHandler(ctx, sender, frame)
|
||||
if err != nil {
|
||||
_ = conn.CloseWithError(2, err.Error())
|
||||
return
|
||||
}
|
||||
if handled {
|
||||
continue
|
||||
}
|
||||
}
|
||||
event, responses, err := session.HandleFrame(frame)
|
||||
if err != nil {
|
||||
_ = conn.CloseWithError(2, err.Error())
|
||||
@@ -140,6 +202,196 @@ func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, st
|
||||
}
|
||||
}
|
||||
|
||||
type quicStreamFrameSender struct {
|
||||
stream *quic.Stream
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
func (s quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
|
||||
if s.stream == nil {
|
||||
return fmt.Errorf("quic fabric stream is closed")
|
||||
}
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
if deadline, ok := ctx.Deadline(); ok {
|
||||
_ = s.stream.SetWriteDeadline(deadline)
|
||||
} else {
|
||||
_ = s.stream.SetWriteDeadline(time.Now().Add(30 * time.Second))
|
||||
}
|
||||
return fabricproto.WriteFrame(s.stream, frame)
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) registerReverseHelloFrame(conn *quic.Conn, frame fabricproto.Frame) {
|
||||
reverseTransport := s.getReverseTransport()
|
||||
if s == nil || reverseTransport == nil || conn == nil || frame.Type != fabricproto.FramePing {
|
||||
return
|
||||
}
|
||||
payload := string(frame.Payload)
|
||||
if !strings.HasPrefix(payload, fabricQUICReverseHelloPrefix) {
|
||||
return
|
||||
}
|
||||
peerID := strings.TrimPrefix(payload, fabricQUICReverseHelloPrefix)
|
||||
reverseTransport.RegisterReverseConn(peerID, conn)
|
||||
s.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_quic_reverse_registered",
|
||||
AcceptedBy: "quic_reverse_hello",
|
||||
RemoteAddr: conn.RemoteAddr().String(),
|
||||
PeerID: peerID,
|
||||
})
|
||||
}
|
||||
|
||||
type quicProductionForwardResponse struct {
|
||||
Result ProductionForwardResult `json:"result,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type quicSyntheticForwardResponse struct {
|
||||
Envelope SyntheticEnvelope `json:"envelope,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type quicWebIngressForwardResponse struct {
|
||||
Payload json.RawMessage `json:"payload,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type quicFabricControlForwardResponse struct {
|
||||
Payload json.RawMessage `json:"payload,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) handleProductionForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != ProductionForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicProductionForwardResponse{}
|
||||
if s == nil || s.productionForwardHandler == nil {
|
||||
response.Error = ErrForwardRuntimeUnavailable.Error()
|
||||
} else {
|
||||
var envelope ProductionEnvelope
|
||||
if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
|
||||
response.Error = "invalid production mesh envelope"
|
||||
} else if result, err := s.productionForwardHandler(ctx, envelope); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
response.Result = result
|
||||
}
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: ProductionForwardQUICStreamID,
|
||||
Sequence: frame.Sequence,
|
||||
Payload: payload,
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) handleWebIngressForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicWebIngressForwardResponse{}
|
||||
if s == nil || s.webIngressForwardHandler == nil {
|
||||
response.Error = ErrForwardRuntimeUnavailable.Error()
|
||||
} else if payload, err := s.webIngressForwardHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
response.Payload = append(json.RawMessage(nil), payload...)
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: WebIngressForwardQUICStreamID,
|
||||
Sequence: frame.Sequence,
|
||||
Payload: payload,
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) handleFabricControlForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicFabricControlForwardResponse{}
|
||||
if s == nil || s.fabricControlHandler == nil {
|
||||
response.Error = ErrForwardRuntimeUnavailable.Error()
|
||||
} else if payload, err := s.fabricControlHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
response.Payload = append(json.RawMessage(nil), payload...)
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: FabricControlForwardQUICStreamID,
|
||||
Sequence: frame.Sequence,
|
||||
Payload: payload,
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) handleSyntheticForwardFrame(ctx context.Context, conn *quic.Conn, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != SyntheticForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicSyntheticForwardResponse{}
|
||||
if s == nil || s.syntheticForwardHandler == nil {
|
||||
response.Error = ErrMeshRuntimeDisabled.Error()
|
||||
} else {
|
||||
var envelope SyntheticEnvelope
|
||||
if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
|
||||
response.Error = "invalid synthetic mesh envelope"
|
||||
} else if ack, err := s.syntheticForwardHandler(ctx, envelope); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
s.registerReversePeerConn(envelope.From.NodeID, conn)
|
||||
response.Envelope = ack
|
||||
}
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err != nil {
|
||||
return true
|
||||
}
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: SyntheticForwardQUICStreamID,
|
||||
Sequence: frame.Sequence,
|
||||
Payload: payload,
|
||||
})
|
||||
return true
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) registerReversePeerConn(peerID string, conn *quic.Conn) {
|
||||
reverseTransport := s.getReverseTransport()
|
||||
if s == nil || reverseTransport == nil || conn == nil {
|
||||
return
|
||||
}
|
||||
reverseTransport.RegisterReverseConn(peerID, conn)
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) getReverseTransport() *QUICFabricTransport {
|
||||
if s == nil {
|
||||
return nil
|
||||
}
|
||||
s.reverseMu.RLock()
|
||||
defer s.reverseMu.RUnlock()
|
||||
return s.reverseTransport
|
||||
}
|
||||
|
||||
func (s *QUICFabricServer) logFabricSession(entry FabricSessionEventLogEntry) {
|
||||
if s != nil && s.logger != nil {
|
||||
s.logger(entry)
|
||||
|
||||
@@ -6,7 +6,9 @@ import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -17,6 +19,7 @@ import (
|
||||
)
|
||||
|
||||
const fabricQUICNextProto = "rap-fabric-data-session-v1"
|
||||
const fabricQUICReverseHelloPrefix = "rap-fabric-reverse-hello-v1:"
|
||||
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
|
||||
const defaultQUICFabricMaxStreamsPerConn = 64
|
||||
const ErrQUICFabricStreamLimitReached = quicFabricError("quic fabric stream limit reached")
|
||||
@@ -28,17 +31,29 @@ func (e quicFabricError) Error() string {
|
||||
}
|
||||
|
||||
type QUICFabricTransport struct {
|
||||
Config *quic.Config
|
||||
IdleTTL time.Duration
|
||||
MaxStreamsPerConn int
|
||||
mu sync.Mutex
|
||||
conns map[string]*quicFabricConnEntry
|
||||
stats QUICFabricTransportStats
|
||||
Config *quic.Config
|
||||
LocalPeerID string
|
||||
IdleTTL time.Duration
|
||||
MaxStreamsPerConn int
|
||||
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
|
||||
mu sync.Mutex
|
||||
conns map[string]*quicFabricConnEntry
|
||||
reverseConns map[string]*quicFabricConnEntry
|
||||
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
|
||||
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
|
||||
inboundFabricControlHandler func(context.Context, []byte) ([]byte, error)
|
||||
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
|
||||
logger FabricSessionEventLogger
|
||||
stats QUICFabricTransportStats
|
||||
}
|
||||
|
||||
type QUICFabricTransportStats struct {
|
||||
Opens uint64 `json:"opens"`
|
||||
Reuses uint64 `json:"reuses"`
|
||||
ReverseHelloSent uint64 `json:"reverse_hello_sent"`
|
||||
ReverseHelloFailed uint64 `json:"reverse_hello_failed"`
|
||||
ReverseRegisters uint64 `json:"reverse_registers"`
|
||||
ReverseReuses uint64 `json:"reverse_reuses"`
|
||||
OpenFailures uint64 `json:"open_failures"`
|
||||
ClosedEvicted uint64 `json:"closed_evicted"`
|
||||
CloseAllCalls uint64 `json:"close_all_calls"`
|
||||
@@ -50,6 +65,7 @@ type QUICFabricTransportStats struct {
|
||||
|
||||
type QUICFabricTransportSnapshot struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
LocalPeerID string `json:"local_peer_id,omitempty"`
|
||||
ActiveCount int `json:"active_count"`
|
||||
ActiveStreams int `json:"active_streams"`
|
||||
MaxStreamsPerConn int `json:"max_streams_per_conn"`
|
||||
@@ -63,6 +79,7 @@ type QUICFabricConnSnapshot struct {
|
||||
PeerID string `json:"peer_id,omitempty"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
CertSHA256 string `json:"cert_sha256,omitempty"`
|
||||
Direction string `json:"direction,omitempty"`
|
||||
ActiveStreams int `json:"active_streams"`
|
||||
MaxStreams int `json:"max_streams"`
|
||||
CapacityPressurePercent int `json:"capacity_pressure_percent"`
|
||||
@@ -92,7 +109,41 @@ type quicFabricConnEntry struct {
|
||||
}
|
||||
|
||||
func NewQUICFabricTransport(config *quic.Config) *QUICFabricTransport {
|
||||
return &QUICFabricTransport{Config: config, IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}}
|
||||
return &QUICFabricTransport{Config: config, IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) SetInboundHandlers(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
|
||||
t.SetInboundHandlersWithWebIngress(production, nil, synthetic, logger)
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) SetInboundHandlersWithWebIngress(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), webIngress func(context.Context, []byte) ([]byte, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
t.inboundProductionHandler = production
|
||||
t.inboundWebIngressHandler = webIngress
|
||||
t.inboundSyntheticHandler = synthetic
|
||||
t.logger = logger
|
||||
t.mu.Unlock()
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) SetInboundFabricControlHandler(handler func(context.Context, []byte) ([]byte, error)) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
t.inboundFabricControlHandler = handler
|
||||
t.mu.Unlock()
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) SetLocalPeerID(peerID string) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
t.LocalPeerID = strings.TrimSpace(peerID)
|
||||
t.mu.Unlock()
|
||||
}
|
||||
|
||||
func quicTLSConfigForTarget(target FabricTransportTarget) *tls.Config {
|
||||
@@ -186,9 +237,12 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
|
||||
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, nil)
|
||||
return conn, "", true, err
|
||||
}
|
||||
if conn, key, ok := t.reverseConnForTarget(target); ok {
|
||||
return conn, key, false, nil
|
||||
}
|
||||
key := quicFabricConnKey(target)
|
||||
if key == "" {
|
||||
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, t.Config)
|
||||
conn, err := t.dialAddr(ctx, target.Endpoint, tlsConfig)
|
||||
return conn, "", true, err
|
||||
}
|
||||
t.mu.Lock()
|
||||
@@ -207,7 +261,7 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
|
||||
}
|
||||
t.mu.Unlock()
|
||||
|
||||
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, t.Config)
|
||||
conn, err := t.dialAddr(ctx, target.Endpoint, tlsConfig)
|
||||
if err != nil {
|
||||
t.mu.Lock()
|
||||
t.stats.OpenFailures++
|
||||
@@ -235,16 +289,339 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
|
||||
t.conns[key] = &quicFabricConnEntry{conn: conn, lastUsed: time.Now()}
|
||||
t.stats.Opens++
|
||||
t.mu.Unlock()
|
||||
go t.acceptInboundStreams(context.Background(), conn)
|
||||
go t.sendReverseHello(context.Background(), conn)
|
||||
return conn, key, false, nil
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) dialAddr(ctx context.Context, endpoint string, tlsConfig *tls.Config) (*quic.Conn, error) {
|
||||
if t != nil && t.DialAddr != nil {
|
||||
return t.DialAddr(ctx, endpoint, tlsConfig, t.Config)
|
||||
}
|
||||
return quic.DialAddr(ctx, endpoint, tlsConfig, t.Config)
|
||||
}
|
||||
|
||||
func DialQUICAddrWithPacketConn(ctx context.Context, endpoint string, packetConn net.PacketConn, tlsConfig *tls.Config, config *quic.Config) (*quic.Conn, error) {
|
||||
if packetConn == nil {
|
||||
return nil, fmt.Errorf("quic packet connection is required")
|
||||
}
|
||||
addr, err := net.ResolveUDPAddr("udp", strings.TrimPrefix(strings.TrimSpace(endpoint), "quic://"))
|
||||
if err != nil {
|
||||
_ = packetConn.Close()
|
||||
return nil, err
|
||||
}
|
||||
transport := &quic.Transport{Conn: packetConn}
|
||||
conn, err := transport.Dial(ctx, addr, tlsConfig, config)
|
||||
if err != nil {
|
||||
_ = transport.Close()
|
||||
return nil, err
|
||||
}
|
||||
go func() {
|
||||
<-conn.Context().Done()
|
||||
_ = transport.Close()
|
||||
}()
|
||||
return conn, nil
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) sendReverseHello(ctx context.Context, conn *quic.Conn) {
|
||||
if t == nil || conn == nil {
|
||||
return
|
||||
}
|
||||
localPeerID := t.localPeerID()
|
||||
if localPeerID == "" {
|
||||
t.mu.Lock()
|
||||
t.stats.ReverseHelloFailed++
|
||||
t.mu.Unlock()
|
||||
return
|
||||
}
|
||||
helloCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
|
||||
defer cancel()
|
||||
stream, err := conn.OpenStreamSync(helloCtx)
|
||||
if err != nil {
|
||||
t.mu.Lock()
|
||||
t.stats.ReverseHelloFailed++
|
||||
t.mu.Unlock()
|
||||
return
|
||||
}
|
||||
defer func() { _ = stream.Close() }()
|
||||
if err := fabricproto.WriteFrame(stream, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: 1,
|
||||
Payload: []byte(fabricQUICReverseHelloPrefix + localPeerID),
|
||||
}); err != nil {
|
||||
t.mu.Lock()
|
||||
t.stats.ReverseHelloFailed++
|
||||
t.mu.Unlock()
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
t.stats.ReverseHelloSent++
|
||||
t.mu.Unlock()
|
||||
_, _ = fabricproto.ReadFrame(stream, fabricproto.DefaultMaxPayload)
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) acceptInboundStreams(ctx context.Context, conn *quic.Conn) {
|
||||
if t == nil || conn == nil {
|
||||
return
|
||||
}
|
||||
for {
|
||||
stream, err := conn.AcceptStream(ctx)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
go t.handleInboundStream(ctx, conn, stream)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) handleInboundStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
|
||||
session := fabricproto.NewSession(fabricproto.SessionConfig{})
|
||||
defer func() { _ = stream.Close() }()
|
||||
t.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_quic_reverse_stream_opened",
|
||||
AcceptedBy: "quic_reverse",
|
||||
RemoteAddr: conn.RemoteAddr().String(),
|
||||
})
|
||||
defer t.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_quic_reverse_stream_closed",
|
||||
AcceptedBy: "quic_reverse",
|
||||
RemoteAddr: conn.RemoteAddr().String(),
|
||||
})
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
_ = stream.Close()
|
||||
return
|
||||
default:
|
||||
}
|
||||
frame, err := fabricproto.ReadFrame(stream, fabricproto.DefaultMaxPayload)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
t.registerReverseHelloFrame(conn, frame)
|
||||
if t.handleInboundProductionForwardFrame(ctx, stream, frame) {
|
||||
continue
|
||||
}
|
||||
if t.handleInboundWebIngressForwardFrame(ctx, stream, frame) {
|
||||
continue
|
||||
}
|
||||
if t.handleInboundFabricControlForwardFrame(ctx, stream, frame) {
|
||||
continue
|
||||
}
|
||||
if t.handleInboundSyntheticForwardFrame(ctx, stream, frame) {
|
||||
continue
|
||||
}
|
||||
event, responses, err := session.HandleFrame(frame)
|
||||
if err != nil {
|
||||
_ = stream.Close()
|
||||
return
|
||||
}
|
||||
if event.Type != fabricproto.SessionEventNone {
|
||||
t.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_reverse_event",
|
||||
SessionEvent: event.Type,
|
||||
StreamID: event.StreamID,
|
||||
Sequence: event.Sequence,
|
||||
TrafficClass: event.TrafficClass,
|
||||
AcceptedBy: "quic_reverse",
|
||||
RemoteAddr: conn.RemoteAddr().String(),
|
||||
})
|
||||
}
|
||||
for _, response := range responses {
|
||||
if err := fabricproto.WriteFrame(stream, response); err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) registerReverseHelloFrame(conn *quic.Conn, frame fabricproto.Frame) {
|
||||
if t == nil || conn == nil || frame.Type != fabricproto.FramePing {
|
||||
return
|
||||
}
|
||||
payload := string(frame.Payload)
|
||||
if !strings.HasPrefix(payload, fabricQUICReverseHelloPrefix) {
|
||||
return
|
||||
}
|
||||
peerID := strings.TrimPrefix(payload, fabricQUICReverseHelloPrefix)
|
||||
t.RegisterReverseConn(peerID, conn)
|
||||
t.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_quic_reverse_registered",
|
||||
AcceptedBy: "quic_reverse_hello",
|
||||
RemoteAddr: conn.RemoteAddr().String(),
|
||||
PeerID: peerID,
|
||||
})
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) handleInboundProductionForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != ProductionForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicProductionForwardResponse{}
|
||||
productionHandler, _, _, _, _ := t.inboundHandlers()
|
||||
if productionHandler == nil {
|
||||
response.Error = ErrForwardRuntimeUnavailable.Error()
|
||||
} else {
|
||||
var envelope ProductionEnvelope
|
||||
if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
|
||||
response.Error = "invalid production mesh envelope"
|
||||
} else if result, err := productionHandler(ctx, envelope); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
response.Result = result
|
||||
}
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err == nil {
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: ProductionForwardQUICStreamID, Sequence: frame.Sequence, Payload: payload})
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) handleInboundWebIngressForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicWebIngressForwardResponse{}
|
||||
_, webIngressHandler, _, _, _ := t.inboundHandlers()
|
||||
if webIngressHandler == nil {
|
||||
response.Error = ErrForwardRuntimeUnavailable.Error()
|
||||
} else if payload, err := webIngressHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
response.Payload = append(json.RawMessage(nil), payload...)
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err == nil {
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: WebIngressForwardQUICStreamID, Sequence: frame.Sequence, Payload: payload})
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) handleInboundFabricControlForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicFabricControlForwardResponse{}
|
||||
_, _, fabricControlHandler, _, _ := t.inboundHandlers()
|
||||
if fabricControlHandler == nil {
|
||||
response.Error = ErrForwardRuntimeUnavailable.Error()
|
||||
} else if payload, err := fabricControlHandler(ctx, append([]byte(nil), frame.Payload...)); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
response.Payload = append(json.RawMessage(nil), payload...)
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err == nil {
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: FabricControlForwardQUICStreamID, Sequence: frame.Sequence, Payload: payload})
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) handleInboundSyntheticForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != SyntheticForwardQUICStreamID {
|
||||
return false
|
||||
}
|
||||
response := quicSyntheticForwardResponse{}
|
||||
_, _, _, syntheticHandler, _ := t.inboundHandlers()
|
||||
if syntheticHandler == nil {
|
||||
response.Error = ErrMeshRuntimeDisabled.Error()
|
||||
} else {
|
||||
var envelope SyntheticEnvelope
|
||||
if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
|
||||
response.Error = "invalid synthetic mesh envelope"
|
||||
} else if ack, err := syntheticHandler(ctx, envelope); err != nil {
|
||||
response.Error = err.Error()
|
||||
} else {
|
||||
response.Envelope = ack
|
||||
}
|
||||
}
|
||||
payload, err := json.Marshal(response)
|
||||
if err == nil {
|
||||
_ = fabricproto.WriteFrame(stream, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: SyntheticForwardQUICStreamID, Sequence: frame.Sequence, Payload: payload})
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) inboundHandlers() (func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), func(context.Context, []byte) ([]byte, error), func(context.Context, []byte) ([]byte, error), func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), FabricSessionEventLogger) {
|
||||
if t == nil {
|
||||
return nil, nil, nil, nil, nil
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
return t.inboundProductionHandler, t.inboundWebIngressHandler, t.inboundFabricControlHandler, t.inboundSyntheticHandler, t.logger
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) localPeerID() string {
|
||||
if t == nil {
|
||||
return ""
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
return strings.TrimSpace(t.LocalPeerID)
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) logFabricSession(entry FabricSessionEventLogEntry) {
|
||||
_, _, _, _, logger := t.inboundHandlers()
|
||||
if logger != nil {
|
||||
logger(entry)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) RegisterReverseConn(peerID string, conn *quic.Conn) {
|
||||
if t == nil || conn == nil {
|
||||
return
|
||||
}
|
||||
peerID = strings.TrimSpace(peerID)
|
||||
if peerID == "" {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
if t.reverseConns == nil {
|
||||
t.reverseConns = map[string]*quicFabricConnEntry{}
|
||||
}
|
||||
if existing := t.reverseConns[peerID]; existing != nil && existing.conn != nil && existing.conn != conn {
|
||||
select {
|
||||
case <-existing.conn.Context().Done():
|
||||
default:
|
||||
_ = existing.conn.CloseWithError(0, "reverse connection replaced")
|
||||
}
|
||||
}
|
||||
t.reverseConns[peerID] = &quicFabricConnEntry{conn: conn, lastUsed: time.Now()}
|
||||
t.stats.ReverseRegisters++
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) reverseConnForTarget(target FabricTransportTarget) (*quic.Conn, string, bool) {
|
||||
peerID := strings.TrimSpace(target.PeerID)
|
||||
if t == nil || peerID == "" || !fabricTransportPrefersReverseConn(target.Transport) {
|
||||
return nil, "", false
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
t.pruneIdleLocked(time.Now())
|
||||
entry := t.reverseConns[peerID]
|
||||
if entry == nil || entry.conn == nil {
|
||||
return nil, "", false
|
||||
}
|
||||
select {
|
||||
case <-entry.conn.Context().Done():
|
||||
delete(t.reverseConns, peerID)
|
||||
t.stats.ClosedEvicted++
|
||||
return nil, "", false
|
||||
default:
|
||||
entry.lastUsed = time.Now()
|
||||
t.stats.ReverseReuses++
|
||||
return entry.conn, quicFabricReverseConnKey(peerID), true
|
||||
}
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) reserveStream(key string, conn *quic.Conn) error {
|
||||
if t == nil || key == "" {
|
||||
return nil
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
entry := t.conns[key]
|
||||
entry := t.connEntryLocked(key)
|
||||
if entry == nil || entry.conn != conn {
|
||||
return fmt.Errorf("quic fabric connection is not cached")
|
||||
}
|
||||
@@ -267,16 +644,26 @@ func (t *QUICFabricTransport) releaseStream(key string) {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
if entry := t.conns[key]; entry != nil {
|
||||
if entry := t.connEntryLocked(key); entry != nil {
|
||||
if entry.activeStreams > 0 {
|
||||
entry.activeStreams--
|
||||
}
|
||||
entry.lastUsed = time.Now()
|
||||
t.stats.StreamCloses++
|
||||
}
|
||||
t.stats.StreamCloses++
|
||||
t.mu.Unlock()
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) connEntryLocked(key string) *quicFabricConnEntry {
|
||||
if t == nil || key == "" {
|
||||
return nil
|
||||
}
|
||||
if strings.HasPrefix(key, "reverse\x00") {
|
||||
return t.reverseConns[strings.TrimPrefix(key, "reverse\x00")]
|
||||
}
|
||||
return t.conns[key]
|
||||
}
|
||||
|
||||
func (t *QUICFabricTransport) evictConn(target FabricTransportTarget, conn *quic.Conn) {
|
||||
if t == nil || conn == nil {
|
||||
return
|
||||
@@ -315,6 +702,20 @@ func (t *QUICFabricTransport) pruneIdleLocked(now time.Time) {
|
||||
t.stats.IdleEvicted++
|
||||
}
|
||||
}
|
||||
for peerID, entry := range t.reverseConns {
|
||||
if entry == nil || entry.conn == nil {
|
||||
delete(t.reverseConns, peerID)
|
||||
continue
|
||||
}
|
||||
if !entry.lastUsed.IsZero() && now.Sub(entry.lastUsed) > ttl {
|
||||
if entry.activeStreams > 0 {
|
||||
continue
|
||||
}
|
||||
_ = entry.conn.CloseWithError(0, "idle reverse")
|
||||
delete(t.reverseConns, peerID)
|
||||
t.stats.IdleEvicted++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func quicFabricConnKey(target FabricTransportTarget) string {
|
||||
@@ -340,6 +741,23 @@ func parseQUICFabricConnKey(key string) (peerID string, endpoint string, certSHA
|
||||
return peerID, endpoint, certSHA256
|
||||
}
|
||||
|
||||
// quicFabricReverseConnKey builds the cache key for a peer's reverse
// connection: the literal prefix "reverse\x00" followed by the trimmed peer
// ID. A blank peer ID yields the empty string.
func quicFabricReverseConnKey(peerID string) string {
	if trimmed := strings.TrimSpace(peerID); trimmed != "" {
		return "reverse\x00" + trimmed
	}
	return ""
}
|
||||
|
||||
// fabricTransportPrefersReverseConn reports whether the transport name, after
// trimming and lower-casing, is "reverse_quic" or "relay_quic".
func fabricTransportPrefersReverseConn(transport string) bool {
	normalized := strings.ToLower(strings.TrimSpace(transport))
	return normalized == "reverse_quic" || normalized == "relay_quic"
}
|
||||
|
||||
func (t *QUICFabricTransport) Close() error {
|
||||
if t == nil {
|
||||
return nil
|
||||
@@ -348,12 +766,19 @@ func (t *QUICFabricTransport) Close() error {
|
||||
t.stats.CloseAllCalls++
|
||||
conns := t.conns
|
||||
t.conns = map[string]*quicFabricConnEntry{}
|
||||
reverseConns := t.reverseConns
|
||||
t.reverseConns = map[string]*quicFabricConnEntry{}
|
||||
t.mu.Unlock()
|
||||
for _, entry := range conns {
|
||||
if entry != nil && entry.conn != nil {
|
||||
_ = entry.conn.CloseWithError(0, "closed")
|
||||
}
|
||||
}
|
||||
for _, entry := range reverseConns {
|
||||
if entry != nil && entry.conn != nil {
|
||||
_ = entry.conn.CloseWithError(0, "closed")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -370,6 +795,7 @@ func (t *QUICFabricTransport) Snapshot() QUICFabricTransportSnapshot {
|
||||
}
|
||||
snapshot := QUICFabricTransportSnapshot{
|
||||
SchemaVersion: "rap.quic_fabric_transport.v1",
|
||||
LocalPeerID: strings.TrimSpace(t.LocalPeerID),
|
||||
MaxStreamsPerConn: limit,
|
||||
Stats: t.stats,
|
||||
}
|
||||
@@ -391,6 +817,40 @@ func (t *QUICFabricTransport) Snapshot() QUICFabricTransportSnapshot {
|
||||
PeerID: peerID,
|
||||
Endpoint: endpoint,
|
||||
CertSHA256: certSHA256,
|
||||
Direction: "outbound",
|
||||
ActiveStreams: entry.activeStreams,
|
||||
MaxStreams: limit,
|
||||
Saturated: entry.activeStreams >= limit,
|
||||
}
|
||||
if !entry.lastUsed.IsZero() {
|
||||
connSnapshot.LastUsedUnixSec = entry.lastUsed.UTC().Unix()
|
||||
}
|
||||
if limit > 0 {
|
||||
connSnapshot.CapacityPressurePercent = (entry.activeStreams * 100) / limit
|
||||
}
|
||||
snapshot.Connections = append(snapshot.Connections, connSnapshot)
|
||||
if entry.activeStreams >= limit {
|
||||
snapshot.SaturatedConnections++
|
||||
}
|
||||
}
|
||||
}
|
||||
for peerID, entry := range t.reverseConns {
|
||||
if entry == nil || entry.conn == nil {
|
||||
delete(t.reverseConns, peerID)
|
||||
continue
|
||||
}
|
||||
select {
|
||||
case <-entry.conn.Context().Done():
|
||||
delete(t.reverseConns, peerID)
|
||||
t.stats.ClosedEvicted++
|
||||
snapshot.Stats.ClosedEvicted++
|
||||
default:
|
||||
snapshot.ActiveCount++
|
||||
snapshot.ActiveStreams += entry.activeStreams
|
||||
connSnapshot := QUICFabricConnSnapshot{
|
||||
PeerID: peerID,
|
||||
Endpoint: entry.conn.RemoteAddr().String(),
|
||||
Direction: "reverse",
|
||||
ActiveStreams: entry.activeStreams,
|
||||
MaxStreams: limit,
|
||||
Saturated: entry.activeStreams >= limit,
|
||||
@@ -462,6 +922,7 @@ func (s *quicFabricSession) Close() error {
|
||||
s.closeOnce.Do(func() {
|
||||
close(s.done)
|
||||
if s.stream != nil {
|
||||
s.stream.CancelRead(0)
|
||||
err = s.stream.Close()
|
||||
}
|
||||
if s.transport != nil {
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"encoding/pem"
|
||||
"math/big"
|
||||
"strings"
|
||||
@@ -341,6 +342,119 @@ func TestQUICFabricTransportLimitsStreamsPerConnection(t *testing.T) {
|
||||
defer second.Close()
|
||||
}
|
||||
|
||||
func TestQUICFabricTransportReusesInboundConnectionForReverseStream(t *testing.T) {
|
||||
reverseTransport := NewQUICFabricTransport(nil)
|
||||
defer reverseTransport.Close()
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: testQUICTLSConfig(t),
|
||||
ReverseTransport: reverseTransport,
|
||||
SyntheticForwardHandler: func(_ context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
|
||||
envelope.To, envelope.From = envelope.From, PeerIdentity{ClusterID: envelope.ClusterID, NodeID: "node-r"}
|
||||
return envelope, nil
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
clientTransport := NewQUICFabricTransport(nil)
|
||||
defer clientTransport.Close()
|
||||
clientTransport.SetLocalPeerID("node-a")
|
||||
clientTransport.SetInboundHandlers(func(_ context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
|
||||
return ProductionForwardResult{
|
||||
Accepted: true,
|
||||
Delivered: true,
|
||||
Forwarded: true,
|
||||
By: PeerIdentity{ClusterID: envelope.ClusterID, NodeID: "node-a"},
|
||||
MessageID: envelope.MessageID,
|
||||
RouteID: envelope.RouteID,
|
||||
}, nil
|
||||
}, nil, nil)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
defer cancel()
|
||||
session, err := clientTransport.Connect(ctx, FabricTransportTarget{
|
||||
PeerID: "node-r",
|
||||
Endpoint: server.Addr().String(),
|
||||
TLSConfig: &tls.Config{
|
||||
InsecureSkipVerify: true,
|
||||
NextProtos: []string{fabricQUICNextProto},
|
||||
},
|
||||
Timeout: time.Second,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("client connect: %v", err)
|
||||
}
|
||||
defer session.Close()
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for {
|
||||
if reverseTransport.Snapshot().Stats.ReverseRegisters > 0 {
|
||||
break
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
t.Fatalf("reverse hello did not register connection: %+v", reverseTransport.Snapshot())
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
|
||||
reverseSession, err := reverseTransport.Connect(ctx, FabricTransportTarget{
|
||||
PeerID: "node-a",
|
||||
Endpoint: "10.0.0.2:19443",
|
||||
Transport: "relay_quic",
|
||||
Timeout: time.Second,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("reverse connect: %v", err)
|
||||
}
|
||||
defer reverseSession.Close()
|
||||
productionPayload, err := json.Marshal(ProductionEnvelope{
|
||||
FabricProtocolVersion: ProtocolVersion,
|
||||
MessageID: "msg-1",
|
||||
RouteID: "route-r-a",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "node-r",
|
||||
DestinationNodeID: "node-a",
|
||||
CurrentHopNodeID: "node-a",
|
||||
NextHopNodeID: "node-a",
|
||||
ChannelClass: ProductionChannelFabricControl,
|
||||
MessageType: ProductionMessageFabricControl,
|
||||
TTL: 4,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
ExpiresAt: time.Now().UTC().Add(time.Minute),
|
||||
PayloadHash: "unused-by-test-handler",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal production: %v", err)
|
||||
}
|
||||
if err := reverseSession.Send(ctx, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: ProductionForwardQUICStreamID, Sequence: 2, Payload: productionPayload}); err != nil {
|
||||
t.Fatalf("send reverse production: %v", err)
|
||||
}
|
||||
select {
|
||||
case frame := <-reverseSession.Frames():
|
||||
var response quicProductionForwardResponse
|
||||
if err := json.Unmarshal(frame.Payload, &response); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if !response.Result.Accepted || !response.Result.Delivered || response.Result.By.NodeID != "node-a" {
|
||||
t.Fatalf("response = %+v", response)
|
||||
}
|
||||
case err := <-reverseSession.Errors():
|
||||
t.Fatalf("reverse session error: %v", err)
|
||||
case <-ctx.Done():
|
||||
t.Fatal(ctx.Err())
|
||||
}
|
||||
snapshot := reverseTransport.Snapshot()
|
||||
if snapshot.Stats.ReverseRegisters == 0 || snapshot.Stats.ReverseReuses == 0 {
|
||||
t.Fatalf("reverse connection was not registered/reused: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQUICFabricServerHandlesFabricFrames(t *testing.T) {
|
||||
var events []FabricSessionEventLogEntry
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
@@ -389,6 +503,68 @@ func TestQUICFabricServerHandlesFabricFrames(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestQUICFabricServerHandlesWebIngressForwardFrames(t *testing.T) {
|
||||
var received []byte
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: testQUICTLSConfig(t),
|
||||
WebIngressForwardHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
received = append([]byte(nil), payload...)
|
||||
return []byte(`{"schema_version":"rap.web_ingress.fabric_runtime_response.v1","status_code":200,"body_b64":"b2s="}`), nil
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
defer cancel()
|
||||
session, err := NewQUICFabricTransport(nil).Connect(ctx, FabricTransportTarget{
|
||||
Endpoint: server.Addr().String(),
|
||||
TLSConfig: &tls.Config{
|
||||
InsecureSkipVerify: true,
|
||||
NextProtos: []string{fabricQUICNextProto},
|
||||
},
|
||||
Timeout: time.Second,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("connect quic fabric: %v", err)
|
||||
}
|
||||
defer session.Close()
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: WebIngressForwardQUICStreamID,
|
||||
Sequence: 44,
|
||||
Payload: []byte(`{"envelope":true}`),
|
||||
}); err != nil {
|
||||
t.Fatalf("send web ingress frame: %v", err)
|
||||
}
|
||||
select {
|
||||
case frame := <-session.Frames():
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID || frame.Sequence != 44 {
|
||||
t.Fatalf("frame = %+v", frame)
|
||||
}
|
||||
var response quicWebIngressForwardResponse
|
||||
if err := json.Unmarshal(frame.Payload, &response); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if string(response.Payload) != `{"schema_version":"rap.web_ingress.fabric_runtime_response.v1","status_code":200,"body_b64":"b2s="}` || response.Error != "" {
|
||||
t.Fatalf("response = %+v", response)
|
||||
}
|
||||
case err := <-session.Errors():
|
||||
t.Fatalf("session error: %v", err)
|
||||
case <-ctx.Done():
|
||||
t.Fatal(ctx.Err())
|
||||
}
|
||||
if string(received) != `{"envelope":true}` {
|
||||
t.Fatalf("received = %s", string(received))
|
||||
}
|
||||
}
|
||||
|
||||
func startQUICFabricEchoServer(t *testing.T) *quic.Listener {
|
||||
t.Helper()
|
||||
return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t))
|
||||
|
||||
@@ -0,0 +1,128 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
type FabricRouteHealthTracker struct {
|
||||
mu sync.Mutex
|
||||
QuarantineTTL time.Duration
|
||||
routes map[string]FabricRouteHealthEntry
|
||||
}
|
||||
|
||||
// FabricRouteHealthEntry is the failure record kept for one route.
type FabricRouteHealthEntry struct {
	// Reason is the trimmed reason passed with the most recent failure.
	Reason string `json:"reason,omitempty"`
	// Failures counts failures recorded since the entry was created.
	Failures uint64 `json:"failures"`
	// LastFailure is the time of the most recent recorded failure.
	LastFailure time.Time `json:"last_failure,omitempty"`
	// RetryAfter is the instant from which the route is no longer quarantined.
	RetryAfter time.Time `json:"retry_after,omitempty"`
}
|
||||
|
||||
type FabricRouteHealthSnapshot struct {
|
||||
Quarantined map[string]FabricRouteHealthEntry `json:"quarantined,omitempty"`
|
||||
}
|
||||
|
||||
func NewFabricRouteHealthTracker(ttl time.Duration) *FabricRouteHealthTracker {
|
||||
if ttl <= 0 {
|
||||
ttl = 30 * time.Second
|
||||
}
|
||||
return &FabricRouteHealthTracker{QuarantineTTL: ttl, routes: map[string]FabricRouteHealthEntry{}}
|
||||
}
|
||||
|
||||
func (t *FabricRouteHealthTracker) MarkFailure(routeID string, reason string, now time.Time) {
|
||||
routeID = strings.TrimSpace(routeID)
|
||||
if t == nil || routeID == "" {
|
||||
return
|
||||
}
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
ttl := t.QuarantineTTL
|
||||
if ttl <= 0 {
|
||||
ttl = 30 * time.Second
|
||||
}
|
||||
t.mu.Lock()
|
||||
entry := t.routes[routeID]
|
||||
entry.Failures++
|
||||
entry.Reason = strings.TrimSpace(reason)
|
||||
entry.LastFailure = now
|
||||
entry.RetryAfter = now.Add(ttl)
|
||||
if t.routes == nil {
|
||||
t.routes = map[string]FabricRouteHealthEntry{}
|
||||
}
|
||||
t.routes[routeID] = entry
|
||||
t.mu.Unlock()
|
||||
}
|
||||
|
||||
func (t *FabricRouteHealthTracker) MarkSuccess(routeID string) {
|
||||
routeID = strings.TrimSpace(routeID)
|
||||
if t == nil || routeID == "" {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
delete(t.routes, routeID)
|
||||
t.mu.Unlock()
|
||||
}
|
||||
|
||||
func (t *FabricRouteHealthTracker) Apply(routeSet FabricRouteSet, now time.Time) FabricRouteSet {
|
||||
if t == nil {
|
||||
return routeSet
|
||||
}
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
if len(t.routes) == 0 {
|
||||
return routeSet
|
||||
}
|
||||
return mapFabricRouteSet(routeSet, func(route FabricRoute) FabricRoute {
|
||||
entry, ok := t.routes[route.RouteID]
|
||||
if !ok {
|
||||
return route
|
||||
}
|
||||
if !entry.RetryAfter.IsZero() && !now.Before(entry.RetryAfter) {
|
||||
delete(t.routes, route.RouteID)
|
||||
return route
|
||||
}
|
||||
route.Healthy = false
|
||||
route.Degraded = true
|
||||
return route
|
||||
})
|
||||
}
|
||||
|
||||
func (t *FabricRouteHealthTracker) Snapshot(now time.Time) FabricRouteHealthSnapshot {
|
||||
if t == nil {
|
||||
return FabricRouteHealthSnapshot{}
|
||||
}
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
out := map[string]FabricRouteHealthEntry{}
|
||||
for routeID, entry := range t.routes {
|
||||
if !entry.RetryAfter.IsZero() && !now.Before(entry.RetryAfter) {
|
||||
continue
|
||||
}
|
||||
out[routeID] = entry
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return FabricRouteHealthSnapshot{}
|
||||
}
|
||||
return FabricRouteHealthSnapshot{Quarantined: out}
|
||||
}
|
||||
|
||||
func mapFabricRouteSet(routeSet FabricRouteSet, fn func(FabricRoute) FabricRoute) FabricRouteSet {
|
||||
if strings.TrimSpace(routeSet.Primary.RouteID) != "" {
|
||||
routeSet.Primary = fn(routeSet.Primary)
|
||||
}
|
||||
for i := range routeSet.WarmStandby {
|
||||
routeSet.WarmStandby[i] = fn(routeSet.WarmStandby[i])
|
||||
}
|
||||
for i := range routeSet.ColdFallbacks {
|
||||
routeSet.ColdFallbacks[i] = fn(routeSet.ColdFallbacks[i])
|
||||
}
|
||||
return routeSet
|
||||
}
|
||||
@@ -0,0 +1,322 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Reachability values carried by a peer endpoint candidate. The route planner
// compares them case-insensitively after trimming.
const (
	FabricCandidateReachabilityPublic       = "public"
	FabricCandidateReachabilityPrivate      = "private"
	FabricCandidateReachabilityRelay        = "relay"
	FabricCandidateReachabilityOutboundOnly = "outbound_only"
)

// Connectivity modes carried by a peer endpoint candidate.
const (
	FabricConnectivityDirect        = "direct"
	FabricConnectivityOutboundOnly  = "outbound_only"
	FabricConnectivityRelayRequired = "relay_required"
)
|
||||
|
||||
type FabricRoutePlannerConfig struct {
|
||||
ClusterID string
|
||||
LocalNodeID string
|
||||
LocalSegmentID string
|
||||
LocalNATGroupID string
|
||||
DefaultCapacity int
|
||||
RelayCapacity int
|
||||
ReverseCapacity int
|
||||
Observations map[string]EndpointCandidateHealthObservation
|
||||
CapacityPressure map[string]EndpointCandidateCapacityPressure
|
||||
Now time.Time
|
||||
MaxObservationAge time.Duration
|
||||
MaxCapacityPressureAge time.Duration
|
||||
}
|
||||
|
||||
// FabricCandidateMetadata is the subset of a candidate's JSON metadata that
// the route planner reads.
type FabricCandidateMetadata struct {
	// LocalSegmentID and NATGroupID are matched against the planner's local
	// segment and NAT group to detect LAN candidates.
	LocalSegmentID string `json:"local_segment_id,omitempty"`
	NATGroupID     string `json:"nat_group_id,omitempty"`
	// RelayNodeID (falling back to ViaNodeID) names the intermediate relay hop;
	// RelayEndpoint is that hop's address, and a non-blank value marks the
	// candidate as a relay route.
	RelayNodeID   string `json:"relay_node_id,omitempty"`
	RelayEndpoint string `json:"relay_endpoint,omitempty"`
	ViaNodeID     string `json:"via_node_id,omitempty"`
	// A non-blank STUNServer or ICEFoundation classifies the candidate as ICE.
	STUNServer    string `json:"stun_server,omitempty"`
	ICEFoundation string `json:"ice_foundation,omitempty"`
}
|
||||
|
||||
func FabricRouteSetForPeerEndpointCandidates(targetNodeID string, candidates []PeerEndpointCandidate, cfg FabricRoutePlannerConfig) FabricRouteSet {
|
||||
targetNodeID = strings.TrimSpace(targetNodeID)
|
||||
if targetNodeID == "" && len(candidates) > 0 {
|
||||
targetNodeID = strings.TrimSpace(candidates[0].NodeID)
|
||||
}
|
||||
routeSet := FabricRouteSet{TargetKind: FabricChannelTargetNode, TargetID: targetNodeID}
|
||||
if len(candidates) == 0 {
|
||||
return routeSet
|
||||
}
|
||||
now := cfg.Now
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
Now: now,
|
||||
Observations: cfg.Observations,
|
||||
MaxObservationAge: firstNonZeroDuration(cfg.MaxObservationAge, 30*time.Second),
|
||||
CapacityPressure: cfg.CapacityPressure,
|
||||
MaxCapacityPressureAge: firstNonZeroDuration(cfg.MaxCapacityPressureAge, 10*time.Second),
|
||||
})
|
||||
routes := make([]FabricRoute, 0, len(ranked))
|
||||
for index, scored := range ranked {
|
||||
route, ok := fabricRouteForPeerEndpointCandidate(scored.Candidate, cfg, scored.Score, index, now)
|
||||
if ok {
|
||||
routes = append(routes, route)
|
||||
}
|
||||
}
|
||||
return routeSetFromRoutes(routeSet, routes)
|
||||
}
|
||||
|
||||
func FabricRouteSetsForPeerEndpointCandidates(candidatesByNode map[string][]PeerEndpointCandidate, cfg FabricRoutePlannerConfig) map[string]FabricRouteSet {
|
||||
out := make(map[string]FabricRouteSet, len(candidatesByNode))
|
||||
for nodeID, candidates := range candidatesByNode {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
if nodeID == "" {
|
||||
continue
|
||||
}
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates(nodeID, candidates, cfg)
|
||||
if strings.TrimSpace(routeSet.Primary.RouteID) != "" || len(routeSet.WarmStandby) > 0 || len(routeSet.ColdFallbacks) > 0 {
|
||||
out[nodeID] = routeSet
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func fabricRouteForPeerEndpointCandidate(candidate PeerEndpointCandidate, cfg FabricRoutePlannerConfig, score int, index int, now time.Time) (FabricRoute, bool) {
|
||||
candidate.EndpointID = strings.TrimSpace(candidate.EndpointID)
|
||||
candidate.NodeID = strings.TrimSpace(candidate.NodeID)
|
||||
candidate.Address = strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
|
||||
if candidate.EndpointID == "" || candidate.NodeID == "" || candidate.Address == "" || !isQUICOnlyCandidateTransport(candidate.Transport) {
|
||||
return FabricRoute{}, false
|
||||
}
|
||||
metadata := decodeFabricCandidateMetadata(candidate.Metadata)
|
||||
mode := fabricRouteModeForPeerEndpointCandidate(candidate, metadata, cfg)
|
||||
hops := fabricRouteHopsForCandidate(candidate, metadata, mode, cfg)
|
||||
if len(hops) == 0 {
|
||||
return FabricRoute{}, false
|
||||
}
|
||||
relayCount := 0
|
||||
for _, hop := range hops {
|
||||
if hop.Mode == FabricRouteRelay {
|
||||
relayCount++
|
||||
}
|
||||
}
|
||||
latency := fabricRouteLatencyFromCandidate(candidate, cfg, score, index)
|
||||
capacity := fabricRouteCapacityForMode(mode, cfg)
|
||||
if capacity <= 0 {
|
||||
capacity = 100
|
||||
}
|
||||
healthy := true
|
||||
degraded := false
|
||||
if observation, ok := cfg.Observations[candidate.EndpointID]; ok {
|
||||
healthy = observation.ReliabilityScore == 0 || observation.ReliabilityScore >= 50
|
||||
degraded = observation.LastLatencyMs > 0 && observation.LastLatencyMs >= 250
|
||||
}
|
||||
return FabricRoute{
|
||||
RouteID: candidate.EndpointID,
|
||||
ClusterID: strings.TrimSpace(cfg.ClusterID),
|
||||
SourceNodeID: strings.TrimSpace(cfg.LocalNodeID),
|
||||
DestinationNodeID: candidate.NodeID,
|
||||
Hops: hops,
|
||||
BaseLatencyMs: latency,
|
||||
Capacity: capacity,
|
||||
ActiveChannels: int(candidatePressureCount(candidate.EndpointID, cfg)),
|
||||
RelayCount: relayCount,
|
||||
Healthy: healthy,
|
||||
Degraded: degraded,
|
||||
LastUpdatedAt: now,
|
||||
}, true
|
||||
}
|
||||
|
||||
func fabricRouteModeForPeerEndpointCandidate(candidate PeerEndpointCandidate, metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) FabricRouteMode {
|
||||
transportMode := fabricRouteModeForTransportTarget(FabricTransportTarget{Transport: candidate.Transport})
|
||||
if transportMode == FabricRouteRelay || transportMode == FabricRouteReverse || transportMode == FabricRouteICE || transportMode == FabricRouteLAN {
|
||||
return transportMode
|
||||
}
|
||||
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
|
||||
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
|
||||
if sameLocalSegment(metadata, cfg) || sameNATGroup(metadata, cfg) {
|
||||
return FabricRouteLAN
|
||||
}
|
||||
if reachability == FabricCandidateReachabilityRelay || connectivity == FabricConnectivityRelayRequired || strings.TrimSpace(metadata.RelayEndpoint) != "" {
|
||||
return FabricRouteRelay
|
||||
}
|
||||
if connectivity == FabricConnectivityOutboundOnly || reachability == FabricCandidateReachabilityOutboundOnly {
|
||||
return FabricRouteReverse
|
||||
}
|
||||
if strings.TrimSpace(metadata.STUNServer) != "" || strings.TrimSpace(metadata.ICEFoundation) != "" || candidate.NATType != "" {
|
||||
return FabricRouteICE
|
||||
}
|
||||
return FabricRouteDirect
|
||||
}
|
||||
|
||||
func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata FabricCandidateMetadata, mode FabricRouteMode, cfg FabricRoutePlannerConfig) []FabricRouteHop {
|
||||
localNodeID := strings.TrimSpace(cfg.LocalNodeID)
|
||||
targetNodeID := strings.TrimSpace(candidate.NodeID)
|
||||
endpoint := strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
|
||||
switch mode {
|
||||
case FabricRouteRelay:
|
||||
relayNodeID := firstNonEmpty(strings.TrimSpace(metadata.RelayNodeID), strings.TrimSpace(metadata.ViaNodeID))
|
||||
relayEndpoint := firstNonEmpty(strings.TrimRight(strings.TrimSpace(metadata.RelayEndpoint), "/"), endpoint)
|
||||
hops := []FabricRouteHop{}
|
||||
if localNodeID != "" {
|
||||
hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: FabricRouteDirect})
|
||||
}
|
||||
if relayNodeID == "" {
|
||||
hops = append(hops, FabricRouteHop{NodeID: targetNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)})
|
||||
return hops
|
||||
}
|
||||
hops = append(hops,
|
||||
FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint},
|
||||
FabricRouteHop{NodeID: targetNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)},
|
||||
)
|
||||
return hops
|
||||
case FabricRouteLAN, FabricRouteICE, FabricRouteReverse, FabricRouteDirect:
|
||||
hops := []FabricRouteHop{}
|
||||
if localNodeID != "" {
|
||||
hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: mode})
|
||||
}
|
||||
hops = append(hops, FabricRouteHop{NodeID: targetNodeID, Mode: mode, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)})
|
||||
return hops
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func isQUICOnlyCandidateTransport(transport string) bool {
|
||||
switch strings.ToLower(strings.TrimSpace(transport)) {
|
||||
case "quic", "direct_quic", "udp_quic", "quic_udp",
|
||||
string(FabricRouteLAN), string(FabricRouteReverse), string(FabricRouteRelay), string(FabricRouteICE):
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func fabricRouteLatencyFromCandidate(candidate PeerEndpointCandidate, cfg FabricRoutePlannerConfig, score int, index int) int {
|
||||
if observation, ok := cfg.Observations[candidate.EndpointID]; ok && observation.LastLatencyMs > 0 {
|
||||
if observation.LastLatencyMs > int64(^uint(0)>>1) {
|
||||
return int(^uint(0) >> 1)
|
||||
}
|
||||
return int(observation.LastLatencyMs)
|
||||
}
|
||||
base := 10 + index
|
||||
switch strings.ToLower(strings.TrimSpace(candidate.Reachability)) {
|
||||
case FabricCandidateReachabilityPrivate:
|
||||
base = 3 + index
|
||||
case FabricCandidateReachabilityOutboundOnly:
|
||||
base = 25 + index
|
||||
case FabricCandidateReachabilityRelay:
|
||||
base = 40 + index
|
||||
}
|
||||
if score < 100 {
|
||||
base += (100 - score) / 10
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
func fabricRouteCapacityForMode(mode FabricRouteMode, cfg FabricRoutePlannerConfig) int {
|
||||
switch mode {
|
||||
case FabricRouteRelay:
|
||||
return firstPositiveInt(cfg.RelayCapacity, cfg.DefaultCapacity, 100)
|
||||
case FabricRouteReverse:
|
||||
return firstPositiveInt(cfg.ReverseCapacity, cfg.DefaultCapacity, 100)
|
||||
default:
|
||||
return firstPositiveInt(cfg.DefaultCapacity, 100)
|
||||
}
|
||||
}
|
||||
|
||||
func candidatePressureCount(endpointID string, cfg FabricRoutePlannerConfig) int64 {
|
||||
if pressure, ok := cfg.CapacityPressure[endpointID]; ok {
|
||||
return pressure.Count
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func sameLocalSegment(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
|
||||
localSegment := strings.TrimSpace(cfg.LocalSegmentID)
|
||||
if localSegment == "" {
|
||||
return false
|
||||
}
|
||||
return strings.EqualFold(strings.TrimSpace(metadata.LocalSegmentID), localSegment)
|
||||
}
|
||||
|
||||
func sameNATGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
|
||||
localNATGroup := strings.TrimSpace(cfg.LocalNATGroupID)
|
||||
if localNATGroup == "" {
|
||||
return false
|
||||
}
|
||||
return strings.EqualFold(strings.TrimSpace(metadata.NATGroupID), localNATGroup)
|
||||
}
|
||||
|
||||
func decodeFabricCandidateMetadata(raw json.RawMessage) FabricCandidateMetadata {
|
||||
if len(raw) == 0 {
|
||||
return FabricCandidateMetadata{}
|
||||
}
|
||||
var metadata FabricCandidateMetadata
|
||||
if err := json.Unmarshal(raw, &metadata); err != nil {
|
||||
return FabricCandidateMetadata{}
|
||||
}
|
||||
return metadata
|
||||
}
|
||||
|
||||
func candidatePeerCertSHA256(candidate PeerEndpointCandidate) string {
|
||||
var metadata struct {
|
||||
PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
|
||||
TLSCertSHA256 string `json:"tls_cert_sha256,omitempty"`
|
||||
}
|
||||
if len(candidate.Metadata) == 0 {
|
||||
return ""
|
||||
}
|
||||
if err := json.Unmarshal(candidate.Metadata, &metadata); err != nil {
|
||||
return ""
|
||||
}
|
||||
return firstNonEmpty(strings.TrimSpace(metadata.PeerCertSHA256), strings.TrimSpace(metadata.TLSCertSHA256))
|
||||
}
|
||||
|
||||
// firstPositiveInt returns the first argument greater than zero, or 0 when
// there is none.
func firstPositiveInt(values ...int) int {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
|
||||
|
||||
// firstNonZeroDuration returns the first strictly positive duration among its
// arguments, or 0 when there is none. Despite the name, negative durations are
// skipped as well as zero ones.
func firstNonZeroDuration(values ...time.Duration) time.Duration {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
|
||||
|
||||
func FabricRouteSetForRelayFallback(clusterID string, sourceNodeID string, targetNodeID string, relayNodeID string, relayEndpoint string, targetEndpoint string) FabricRouteSet {
|
||||
relayEndpoint = strings.TrimRight(strings.TrimSpace(relayEndpoint), "/")
|
||||
targetEndpoint = strings.TrimRight(strings.TrimSpace(targetEndpoint), "/")
|
||||
candidate := PeerEndpointCandidate{
|
||||
EndpointID: fmt.Sprintf("%s-via-%s-relay", strings.TrimSpace(targetNodeID), strings.TrimSpace(relayNodeID)),
|
||||
NodeID: strings.TrimSpace(targetNodeID),
|
||||
Transport: string(FabricRouteRelay),
|
||||
Address: targetEndpoint,
|
||||
Reachability: FabricCandidateReachabilityRelay,
|
||||
ConnectivityMode: FabricConnectivityRelayRequired,
|
||||
Metadata: mustMarshalFabricCandidateMetadata(FabricCandidateMetadata{RelayNodeID: relayNodeID, RelayEndpoint: relayEndpoint}),
|
||||
}
|
||||
return FabricRouteSetForPeerEndpointCandidates(targetNodeID, []PeerEndpointCandidate{candidate}, FabricRoutePlannerConfig{
|
||||
ClusterID: clusterID,
|
||||
LocalNodeID: sourceNodeID,
|
||||
})
|
||||
}
|
||||
|
||||
func mustMarshalFabricCandidateMetadata(metadata FabricCandidateMetadata) json.RawMessage {
|
||||
raw, _ := json.Marshal(metadata)
|
||||
return raw
|
||||
}
|
||||
@@ -0,0 +1,187 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
|
||||
metadata, _ := json.Marshal(FabricCandidateMetadata{LocalSegmentID: "site-a", NATGroupID: "nat-a"})
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "quic",
|
||||
Address: "quic://203.0.113.10:19443",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-lan",
|
||||
NodeID: "node-b",
|
||||
Transport: "quic",
|
||||
Address: "quic://10.10.0.12:19443",
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "direct",
|
||||
PolicyTags: []string{"private-lan"},
|
||||
Metadata: metadata,
|
||||
},
|
||||
}, FabricRoutePlannerConfig{
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
LocalSegmentID: "site-a",
|
||||
DefaultCapacity: 200,
|
||||
Now: time.Unix(100, 0).UTC(),
|
||||
})
|
||||
if routeSet.Primary.RouteID != "node-b-lan" {
|
||||
t.Fatalf("primary route = %q, want node-b-lan", routeSet.Primary.RouteID)
|
||||
}
|
||||
if routeSet.Primary.Hops[len(routeSet.Primary.Hops)-1].Mode != FabricRouteLAN {
|
||||
t.Fatalf("primary mode = %q, want lan", routeSet.Primary.Hops[len(routeSet.Primary.Hops)-1].Mode)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T) {
|
||||
metadata, _ := json.Marshal(FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"})
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "quic",
|
||||
Address: "quic://node-b-passive:19443",
|
||||
Reachability: "outbound_only",
|
||||
ConnectivityMode: "relay_required",
|
||||
NATType: "symmetric",
|
||||
Metadata: metadata,
|
||||
}}, FabricRoutePlannerConfig{
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
RelayCapacity: 50,
|
||||
Now: time.Unix(100, 0).UTC(),
|
||||
})
|
||||
if routeSet.Primary.RouteID != "node-b-relay" {
|
||||
t.Fatalf("primary route = %q", routeSet.Primary.RouteID)
|
||||
}
|
||||
if routeSet.Primary.RelayCount != 2 {
|
||||
t.Fatalf("relay count = %d, want 2", routeSet.Primary.RelayCount)
|
||||
}
|
||||
if got := routeSet.Primary.Hops[1].NodeID; got != "node-r" {
|
||||
t.Fatalf("relay hop = %q, want node-r", got)
|
||||
}
|
||||
if routeSet.Primary.Capacity != 50 {
|
||||
t.Fatalf("capacity = %d, want 50", routeSet.Primary.Capacity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSetForPeerEndpointCandidatesUsesTargetWhenRelayMetadataIsAbsent(t *testing.T) {
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay_quic",
|
||||
Address: "quic://node-b:19443",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
Metadata: json.RawMessage(`{"tls_cert_sha256":"abc123"}`),
|
||||
}}, FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
|
||||
if routeSet.Primary.RouteID != "node-b-relay" {
|
||||
t.Fatalf("primary route = %q", routeSet.Primary.RouteID)
|
||||
}
|
||||
if len(routeSet.Primary.Hops) != 2 {
|
||||
t.Fatalf("hops = %+v, want local + target only", routeSet.Primary.Hops)
|
||||
}
|
||||
targetHop := routeSet.Primary.Hops[1]
|
||||
if targetHop.NodeID != "node-b" || targetHop.Mode != FabricRouteRelay || targetHop.PeerCertSHA256 != "abc123" {
|
||||
t.Fatalf("target hop = %+v, want relay-mode target with cert", targetHop)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSetForPeerEndpointCandidatesAcceptsExplicitQUICModes(t *testing.T) {
|
||||
for _, tc := range []struct {
|
||||
name string
|
||||
transport string
|
||||
wantMode FabricRouteMode
|
||||
}{
|
||||
{name: "lan", transport: "lan_quic", wantMode: FabricRouteLAN},
|
||||
{name: "reverse", transport: "reverse_quic", wantMode: FabricRouteReverse},
|
||||
{name: "relay", transport: "relay_quic", wantMode: FabricRouteRelay},
|
||||
{name: "ice", transport: "ice_quic", wantMode: FabricRouteICE},
|
||||
} {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
|
||||
EndpointID: "node-b-" + tc.name,
|
||||
NodeID: "node-b",
|
||||
Transport: tc.transport,
|
||||
Address: "quic://node-b:19443",
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "direct",
|
||||
Metadata: json.RawMessage(`{"tls_cert_sha256":"abc123"}`),
|
||||
}}, FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
|
||||
if routeSet.Primary.RouteID == "" {
|
||||
t.Fatalf("%s candidate produced empty route set", tc.transport)
|
||||
}
|
||||
hop := routeSet.Primary.Hops[len(routeSet.Primary.Hops)-1]
|
||||
if hop.Mode != tc.wantMode {
|
||||
t.Fatalf("mode = %q, want %q", hop.Mode, tc.wantMode)
|
||||
}
|
||||
if hop.PeerCertSHA256 != "abc123" {
|
||||
t.Fatalf("peer cert = %q, want abc123", hop.PeerCertSHA256)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSetForPeerEndpointCandidatesTreatsSameNATGroupAsLAN(t *testing.T) {
|
||||
metadata, _ := json.Marshal(FabricCandidateMetadata{NATGroupID: "nat-a"})
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
|
||||
EndpointID: "node-b-nat-lan",
|
||||
NodeID: "node-b",
|
||||
Transport: "quic",
|
||||
Address: "quic://10.44.0.12:19443",
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "direct",
|
||||
NATType: "symmetric",
|
||||
Metadata: metadata,
|
||||
}}, FabricRoutePlannerConfig{
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
LocalNATGroupID: "nat-a",
|
||||
})
|
||||
if routeSet.Primary.Hops[len(routeSet.Primary.Hops)-1].Mode != FabricRouteLAN {
|
||||
t.Fatalf("route = %+v, want LAN mode for same NAT group", routeSet.Primary)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
|
||||
for _, candidate := range []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-http",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: "http://node-b:8080",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-legacy-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Address: "quic://node-r:19443",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-legacy-reverse",
|
||||
NodeID: "node-b",
|
||||
Transport: "outbound_reverse",
|
||||
Address: "quic://node-b:19443",
|
||||
Reachability: "outbound_only",
|
||||
ConnectivityMode: "outbound_only",
|
||||
},
|
||||
} {
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{candidate}, FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
|
||||
if routeSet.Primary.RouteID != "" || len(routeSet.WarmStandby) != 0 {
|
||||
t.Fatalf("non-quic candidate produced route set: %+v", routeSet)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// FabricRoutePressureTracker counts the channels currently held per route ID,
// along with per-route and overall high-water marks and lifetime
// acquire/release totals. Its methods take mu before touching the counters.
type FabricRoutePressureTracker struct {
	mu sync.Mutex

	// active is the live count per route ID; maxActive is the highest count
	// each route ID has reached.
	active, maxActive map[string]int

	// Lifetime counters and the peak of the summed active counts.
	acquiredTotal, releasedTotal uint64
	maxActiveTotal               int

	// Route IDs of the most recent acquire and release.
	lastAcquiredRoute, lastReleasedRoute string
}
|
||||
|
||||
// FabricRoutePressureSnapshot is the JSON-serializable view of a
// FabricRoutePressureTracker returned by SnapshotPressure.
type FabricRoutePressureSnapshot struct {
	// Per-route counts: current and highest observed.
	Active    map[string]int `json:"active"`
	MaxActive map[string]int `json:"max_active"`

	// Sum of Active and the highest value that sum has reached.
	ActiveTotal    int `json:"active_total"`
	MaxActiveTotal int `json:"max_active_total"`

	// Lifetime acquire/release counters.
	AcquiredTotal uint64 `json:"acquired_total"`
	ReleasedTotal uint64 `json:"released_total"`

	// Route IDs of the most recent acquire and release; omitted when empty.
	LastAcquiredRoute string `json:"last_acquired_route,omitempty"`
	LastReleasedRoute string `json:"last_released_route,omitempty"`
}
|
||||
|
||||
func NewFabricRoutePressureTracker() *FabricRoutePressureTracker {
|
||||
return &FabricRoutePressureTracker{
|
||||
active: map[string]int{},
|
||||
maxActive: map[string]int{},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *FabricRoutePressureTracker) Apply(routeSet FabricRouteSet) FabricRouteSet {
|
||||
if t == nil {
|
||||
return routeSet
|
||||
}
|
||||
active := t.Snapshot()
|
||||
if len(active) == 0 {
|
||||
return routeSet
|
||||
}
|
||||
apply := func(route FabricRoute) FabricRoute {
|
||||
if count := active[route.RouteID]; count > 0 {
|
||||
route.ActiveChannels += count
|
||||
}
|
||||
return route
|
||||
}
|
||||
routeSet.Primary = apply(routeSet.Primary)
|
||||
for i := range routeSet.WarmStandby {
|
||||
routeSet.WarmStandby[i] = apply(routeSet.WarmStandby[i])
|
||||
}
|
||||
for i := range routeSet.ColdFallbacks {
|
||||
routeSet.ColdFallbacks[i] = apply(routeSet.ColdFallbacks[i])
|
||||
}
|
||||
return routeSet
|
||||
}
|
||||
|
||||
func (t *FabricRoutePressureTracker) Acquire(routeID string) func() {
|
||||
routeID = strings.TrimSpace(routeID)
|
||||
if t == nil || routeID == "" {
|
||||
return func() {}
|
||||
}
|
||||
t.mu.Lock()
|
||||
if t.active == nil {
|
||||
t.active = map[string]int{}
|
||||
}
|
||||
if t.maxActive == nil {
|
||||
t.maxActive = map[string]int{}
|
||||
}
|
||||
t.active[routeID]++
|
||||
if t.active[routeID] > t.maxActive[routeID] {
|
||||
t.maxActive[routeID] = t.active[routeID]
|
||||
}
|
||||
t.acquiredTotal++
|
||||
t.lastAcquiredRoute = routeID
|
||||
if activeTotal := activeTotalLocked(t.active); activeTotal > t.maxActiveTotal {
|
||||
t.maxActiveTotal = activeTotal
|
||||
}
|
||||
t.mu.Unlock()
|
||||
var released atomic.Bool
|
||||
return func() {
|
||||
if released.Swap(true) {
|
||||
return
|
||||
}
|
||||
t.mu.Lock()
|
||||
if t.active[routeID] <= 1 {
|
||||
delete(t.active, routeID)
|
||||
} else {
|
||||
t.active[routeID]--
|
||||
}
|
||||
t.releasedTotal++
|
||||
t.lastReleasedRoute = routeID
|
||||
t.mu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
func (t *FabricRoutePressureTracker) Snapshot() map[string]int {
|
||||
return t.SnapshotPressure().Active
|
||||
}
|
||||
|
||||
func (t *FabricRoutePressureTracker) SnapshotPressure() FabricRoutePressureSnapshot {
|
||||
if t == nil {
|
||||
return FabricRoutePressureSnapshot{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
active := make(map[string]int, len(t.active))
|
||||
for routeID, count := range t.active {
|
||||
active[routeID] = count
|
||||
}
|
||||
maxActive := make(map[string]int, len(t.maxActive))
|
||||
for routeID, count := range t.maxActive {
|
||||
maxActive[routeID] = count
|
||||
}
|
||||
return FabricRoutePressureSnapshot{
|
||||
Active: active,
|
||||
MaxActive: maxActive,
|
||||
ActiveTotal: activeTotalLocked(active),
|
||||
MaxActiveTotal: t.maxActiveTotal,
|
||||
AcquiredTotal: t.acquiredTotal,
|
||||
ReleasedTotal: t.releasedTotal,
|
||||
LastAcquiredRoute: t.lastAcquiredRoute,
|
||||
LastReleasedRoute: t.lastReleasedRoute,
|
||||
}
|
||||
}
|
||||
|
||||
// activeTotalLocked sums every count in active. It does no locking itself;
// the callers in this file invoke it while holding the tracker's mutex.
func activeTotalLocked(active map[string]int) int {
	var sum int
	for routeID := range active {
		sum += active[routeID]
	}
	return sum
}
|
||||
@@ -0,0 +1,44 @@
|
||||
package mesh
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestFabricRoutePressureTrackerAppliesAndReleasesActiveChannels(t *testing.T) {
|
||||
tracker := NewFabricRoutePressureTracker()
|
||||
releaseA := tracker.Acquire("route-a")
|
||||
releaseAAgain := tracker.Acquire("route-a")
|
||||
releaseB := tracker.Acquire("route-b")
|
||||
routeSet := FabricRouteSet{
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: "node-b",
|
||||
Primary: testFabricRoute("route-a", "node-b", 10, 100, 3, true),
|
||||
WarmStandby: []FabricRoute{
|
||||
testFabricRoute("route-b", "node-b", 10, 100, 0, true),
|
||||
},
|
||||
}
|
||||
|
||||
withPressure := tracker.Apply(routeSet)
|
||||
if withPressure.Primary.ActiveChannels != 5 {
|
||||
t.Fatalf("primary active = %d, want 5", withPressure.Primary.ActiveChannels)
|
||||
}
|
||||
if withPressure.WarmStandby[0].ActiveChannels != 1 {
|
||||
t.Fatalf("standby active = %d, want 1", withPressure.WarmStandby[0].ActiveChannels)
|
||||
}
|
||||
|
||||
releaseA()
|
||||
releaseA()
|
||||
releaseAAgain()
|
||||
releaseB()
|
||||
snapshot := tracker.SnapshotPressure()
|
||||
if len(snapshot.Active) != 0 || snapshot.ActiveTotal != 0 {
|
||||
t.Fatalf("snapshot after release = %+v, want inactive", snapshot)
|
||||
}
|
||||
if snapshot.AcquiredTotal != 3 || snapshot.ReleasedTotal != 3 {
|
||||
t.Fatalf("snapshot totals = %+v, want acquired/released 3", snapshot)
|
||||
}
|
||||
if snapshot.MaxActive["route-a"] != 2 || snapshot.MaxActive["route-b"] != 1 || snapshot.MaxActiveTotal != 3 {
|
||||
t.Fatalf("snapshot max = %+v", snapshot)
|
||||
}
|
||||
if snapshot.LastAcquiredRoute != "route-b" || snapshot.LastReleasedRoute != "route-b" {
|
||||
t.Fatalf("snapshot last routes = %+v", snapshot)
|
||||
}
|
||||
}
|
||||
@@ -12,8 +12,9 @@ import (
|
||||
func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
@@ -83,8 +84,9 @@ func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
|
||||
func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
@@ -131,8 +133,9 @@ func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
|
||||
func TestFabricSessionPeerManagerReopensClosedPump(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
|
||||
@@ -40,73 +40,22 @@ type FabricTransportTarget struct {
|
||||
ErrorBuffer int
|
||||
}
|
||||
|
||||
func FabricTransportForTarget(target FabricTransportTarget, websocket *WebSocketFabricTransport, quicTransport *QUICFabricTransport) (FabricTransport, FabricTransportTarget, error) {
|
||||
func FabricTransportForTarget(target FabricTransportTarget, quicTransport *QUICFabricTransport) (FabricTransport, FabricTransportTarget, error) {
|
||||
transportLabel := strings.ToLower(strings.TrimSpace(target.Transport))
|
||||
endpoint := strings.TrimSpace(target.Endpoint)
|
||||
if strings.HasPrefix(strings.ToLower(endpoint), "quic://") {
|
||||
transportLabel = "quic"
|
||||
if transportLabel == "" {
|
||||
transportLabel = "quic"
|
||||
}
|
||||
target.Endpoint = strings.TrimPrefix(endpoint, "quic://")
|
||||
}
|
||||
switch transportLabel {
|
||||
case "quic", "direct_quic", "udp_quic", "quic_udp":
|
||||
case "quic", "direct_quic", "udp_quic", "quic_udp", "lan_quic", "reverse_quic", "relay_quic", "ice_quic":
|
||||
if quicTransport == nil {
|
||||
quicTransport = NewQUICFabricTransport(nil)
|
||||
}
|
||||
return quicTransport, target, nil
|
||||
case "", "websocket", "ws", "wss", "direct_http", "direct_https", "direct_tcp_tls":
|
||||
if websocket == nil {
|
||||
websocket = NewWebSocketFabricTransport(nil)
|
||||
}
|
||||
return websocket, target, nil
|
||||
default:
|
||||
return nil, target, fmt.Errorf("unsupported fabric transport %q", target.Transport)
|
||||
return nil, target, fmt.Errorf("unsupported fabric transport %q: quic is required", target.Transport)
|
||||
}
|
||||
}
|
||||
|
||||
type WebSocketFabricTransport struct {
|
||||
Manager *FabricSessionPeerManager
|
||||
}
|
||||
|
||||
func NewWebSocketFabricTransport(manager *FabricSessionPeerManager) *WebSocketFabricTransport {
|
||||
if manager == nil {
|
||||
manager = NewFabricSessionPeerManager()
|
||||
}
|
||||
return &WebSocketFabricTransport{Manager: manager}
|
||||
}
|
||||
|
||||
func (t *WebSocketFabricTransport) Connect(ctx context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
|
||||
manager := t.Manager
|
||||
if manager == nil {
|
||||
manager = NewFabricSessionPeerManager()
|
||||
t.Manager = manager
|
||||
}
|
||||
return manager.Get(ctx, FabricSessionPeerTarget{
|
||||
PeerID: target.PeerID,
|
||||
BaseURL: target.Endpoint,
|
||||
Options: FabricSessionDialOptions{
|
||||
Token: target.Token,
|
||||
Header: target.Header,
|
||||
Timeout: target.Timeout,
|
||||
MaxPayload: target.MaxPayload,
|
||||
},
|
||||
Pump: FabricSessionPumpOptions{
|
||||
OutboundBuffer: target.OutboundBuffer,
|
||||
InboundBuffer: target.InboundBuffer,
|
||||
ErrorBuffer: target.ErrorBuffer,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func (t *WebSocketFabricTransport) Close() error {
|
||||
if t == nil || t.Manager == nil {
|
||||
return nil
|
||||
}
|
||||
return t.Manager.Close()
|
||||
}
|
||||
|
||||
func (t *WebSocketFabricTransport) Snapshot() FabricSessionPeerManagerSnapshot {
|
||||
if t == nil || t.Manager == nil {
|
||||
return FabricSessionPeerManagerSnapshot{SchemaVersion: "rap.fabric_session_peer_manager.v1"}
|
||||
}
|
||||
return t.Manager.Snapshot()
|
||||
}
|
||||
|
||||
@@ -1,117 +1,27 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
func TestWebSocketFabricTransportConnectsAndReusesSession(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
}
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
transport := NewWebSocketFabricTransport(nil)
|
||||
defer transport.Close()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
target := FabricTransportTarget{
|
||||
PeerID: "node-a",
|
||||
Endpoint: server.URL,
|
||||
Token: "rap_fsn_transport",
|
||||
Timeout: time.Second,
|
||||
OutboundBuffer: 4,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
}
|
||||
|
||||
first, err := transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("first connect: %v", err)
|
||||
}
|
||||
second, err := transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("second connect: %v", err)
|
||||
}
|
||||
if first != second {
|
||||
t.Fatal("transport did not reuse session")
|
||||
}
|
||||
if opened != 1 {
|
||||
t.Fatalf("opened = %d, want 1", opened)
|
||||
}
|
||||
if err := first.Send(ctx, fabricproto.Frame{Type: fabricproto.FramePing, Sequence: 1, Payload: []byte("transport")}); err != nil {
|
||||
t.Fatalf("send ping: %v", err)
|
||||
}
|
||||
select {
|
||||
case frame := <-first.Frames():
|
||||
if frame.Type != fabricproto.FramePong || frame.Sequence != 1 || string(frame.Payload) != "transport" {
|
||||
t.Fatalf("frame = %+v", frame)
|
||||
func TestFabricTransportRejectsWebSocketTransport(t *testing.T) {
|
||||
for _, target := range []FabricTransportTarget{
|
||||
{Transport: "wss", Endpoint: "wss://node-a.example/fabric/session"},
|
||||
{Transport: "relay", Endpoint: "quic://node-r.example:19443"},
|
||||
{Transport: "outbound_reverse", Endpoint: "quic://node-b.example:19443"},
|
||||
} {
|
||||
_, _, err := FabricTransportForTarget(target, nil)
|
||||
if err == nil || !strings.Contains(err.Error(), "quic is required") {
|
||||
t.Fatalf("target = %+v err = %v, want quic-only rejection", target, err)
|
||||
}
|
||||
case err := <-first.Errors():
|
||||
t.Fatalf("session error: %v", err)
|
||||
case <-ctx.Done():
|
||||
t.Fatal(ctx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
func TestWebSocketFabricTransportReopensClosedSession(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
}
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
transport := NewWebSocketFabricTransport(nil)
|
||||
defer transport.Close()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
target := FabricTransportTarget{
|
||||
PeerID: "node-a",
|
||||
Endpoint: server.URL,
|
||||
Token: "rap_fsn_transport_reopen",
|
||||
Timeout: time.Second,
|
||||
}
|
||||
|
||||
first, err := transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("first connect: %v", err)
|
||||
}
|
||||
if err := first.Close(); err != nil {
|
||||
t.Fatalf("close first session: %v", err)
|
||||
}
|
||||
second, err := transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("second connect: %v", err)
|
||||
}
|
||||
if first == second {
|
||||
t.Fatal("transport reused closed session")
|
||||
}
|
||||
if opened != 2 {
|
||||
t.Fatalf("opened = %d, want 2", opened)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricTransportForTargetSelectsQUICByScheme(t *testing.T) {
|
||||
transport, target, err := FabricTransportForTarget(FabricTransportTarget{
|
||||
Endpoint: "quic://127.0.0.1:4433",
|
||||
}, nil, nil)
|
||||
}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("select transport: %v", err)
|
||||
}
|
||||
@@ -123,15 +33,12 @@ func TestFabricTransportForTargetSelectsQUICByScheme(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricTransportForTargetSelectsWebSocketByDefault(t *testing.T) {
|
||||
transport, target, err := FabricTransportForTarget(FabricTransportTarget{
|
||||
func TestFabricTransportForTargetRejectsNonQUICByDefault(t *testing.T) {
|
||||
_, target, err := FabricTransportForTarget(FabricTransportTarget{
|
||||
Endpoint: "https://node.example",
|
||||
}, nil, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("select transport: %v", err)
|
||||
}
|
||||
if _, ok := transport.(*WebSocketFabricTransport); !ok {
|
||||
t.Fatalf("transport = %T, want websocket", transport)
|
||||
}, nil)
|
||||
if err == nil {
|
||||
t.Fatal("non-QUIC target unexpectedly selected a transport")
|
||||
}
|
||||
if target.Endpoint != "https://node.example" {
|
||||
t.Fatalf("endpoint = %q", target.Endpoint)
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// HTTPPeerTransport delivers synthetic mesh envelopes to peer endpoints that
// were configured explicitly. Its scope is deliberately narrow: production
// forwarding stays disabled, and only SyntheticRuntime messages go through it.
type HTTPPeerTransport struct {
	// PeerURLs maps a node ID to that peer's base URL.
	PeerURLs map[string]string
	// HTTPClient, when non-nil, replaces the HTTP client of the per-send client.
	HTTPClient *http.Client
}

// NewHTTPPeerTransport builds a transport from peerURLs. Node IDs and URLs are
// whitespace-trimmed, trailing slashes are removed from URLs, and entries
// whose ID or URL ends up empty are dropped. HTTPClient is left nil.
func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport {
	transport := &HTTPPeerTransport{PeerURLs: make(map[string]string, len(peerURLs))}
	for rawID, rawURL := range peerURLs {
		nodeID := strings.TrimSpace(rawID)
		baseURL := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if nodeID == "" || baseURL == "" {
			continue
		}
		transport.PeerURLs[nodeID] = baseURL
	}
	return transport
}
|
||||
|
||||
func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
|
||||
if t == nil {
|
||||
return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
|
||||
if baseURL == "" {
|
||||
return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
client := NewClient(baseURL)
|
||||
if t.HTTPClient != nil {
|
||||
client.HTTPClient = t.HTTPClient
|
||||
}
|
||||
return client.SendSynthetic(ctx, envelope)
|
||||
}
|
||||
@@ -1,130 +0,0 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) {
|
||||
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
defer nodeA.Close()
|
||||
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
|
||||
defer nodeB.Close()
|
||||
|
||||
route := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"})
|
||||
routes := []SyntheticRoute{route}
|
||||
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-b": nodeB.URL})
|
||||
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
|
||||
|
||||
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-direct")
|
||||
if err != nil {
|
||||
t.Fatalf("send live direct probe: %v", err)
|
||||
}
|
||||
if ack.MessageType != SyntheticMessageProbeAck {
|
||||
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
|
||||
}
|
||||
payload := decodeAckPayload(t, ack)
|
||||
if got, want := payload.Path, []string{"node-a", "node-b"}; !sameStrings(got, want) {
|
||||
t.Fatalf("path = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) {
|
||||
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
defer nodeA.Close()
|
||||
nodeR := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
|
||||
defer nodeR.Close()
|
||||
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
|
||||
defer nodeB.Close()
|
||||
|
||||
route := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"})
|
||||
routes := []SyntheticRoute{route}
|
||||
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-r": nodeR.URL})
|
||||
nodeR.Runtime = newLiveRuntime(nodeR.Local, routes, map[string]string{"node-b": nodeB.URL})
|
||||
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
|
||||
|
||||
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-relay")
|
||||
if err != nil {
|
||||
t.Fatalf("send live relay probe: %v", err)
|
||||
}
|
||||
if ack.MessageType != SyntheticMessageProbeAck {
|
||||
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
|
||||
}
|
||||
payload := decodeAckPayload(t, ack)
|
||||
if got, want := payload.Path, []string{"node-a", "node-r", "node-b"}; !sameStrings(got, want) {
|
||||
t.Fatalf("path = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHTTPPeerTransportMissingPeer(t *testing.T) {
|
||||
transport := NewHTTPPeerTransport(map[string]string{})
|
||||
_, err := transport.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{})
|
||||
if !errors.Is(err, ErrSyntheticPeerUnavailable) {
|
||||
t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
|
||||
}
|
||||
}
|
||||
|
||||
type liveSyntheticNode struct {
|
||||
Local PeerIdentity
|
||||
Runtime *SyntheticRuntime
|
||||
URL string
|
||||
server *httptest.Server
|
||||
}
|
||||
|
||||
func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode {
|
||||
t.Helper()
|
||||
node := &liveSyntheticNode{Local: local}
|
||||
node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r)
|
||||
}))
|
||||
node.URL = node.server.URL
|
||||
return node
|
||||
}
|
||||
|
||||
func (n *liveSyntheticNode) Close() {
|
||||
if n.server != nil {
|
||||
n.server.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime {
|
||||
return NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: local,
|
||||
Routes: routes,
|
||||
Transport: NewHTTPPeerTransport(peers),
|
||||
})
|
||||
}
|
||||
|
||||
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
|
||||
return SyntheticRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: hops[0],
|
||||
DestinationNodeID: hops[len(hops)-1],
|
||||
Hops: hops,
|
||||
AllowedChannels: []string{SyntheticChannelFabricControl},
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
RouteVersion: "route-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
}
|
||||
}
|
||||
|
||||
// sameStrings reports whether left and right have the same length and equal
// elements at every index. Nil and empty slices compare equal.
func sameStrings(left, right []string) bool {
	equal := len(left) == len(right)
	for i := 0; equal && i < len(left); i++ {
		equal = left[i] == right[i]
	}
	return equal
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -53,9 +54,11 @@ type PeerCacheEntry struct {
|
||||
BestReachability string `json:"best_reachability,omitempty"`
|
||||
BestConnectivity string `json:"best_connectivity,omitempty"`
|
||||
BestNATType string `json:"best_nat_type,omitempty"`
|
||||
BestRegion string `json:"best_region,omitempty"`
|
||||
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
|
||||
BestCandidateScore int `json:"best_candidate_score,omitempty"`
|
||||
BestScoreReasons []string `json:"best_score_reasons,omitempty"`
|
||||
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
|
||||
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
@@ -132,9 +135,11 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
entry.BestReachability = scored[0].Candidate.Reachability
|
||||
entry.BestConnectivity = scored[0].Candidate.ConnectivityMode
|
||||
entry.BestNATType = scored[0].Candidate.NATType
|
||||
entry.BestRegion = scored[0].Candidate.Region
|
||||
entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...)
|
||||
entry.BestCandidateScore = scored[0].Score
|
||||
entry.BestScoreReasons = append([]string{}, scored[0].Reasons...)
|
||||
entry.BestPeerCertSHA256 = candidatePeerCertSHA256(scored[0].Candidate)
|
||||
entry.bestScore = scored[0].Score
|
||||
if strings.TrimSpace(scored[0].Candidate.Address) != "" {
|
||||
entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address)
|
||||
@@ -188,6 +193,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
if lease.PeerNodeID != cfg.Local.NodeID {
|
||||
entry := peerCacheEntry(entries, lease.PeerNodeID)
|
||||
useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry)
|
||||
localRelay := lease.RelayNodeID == cfg.Local.NodeID
|
||||
entry.RendezvousLeaseID = lease.LeaseID
|
||||
entry.RelayNodeID = lease.RelayNodeID
|
||||
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
||||
@@ -195,12 +201,21 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
|
||||
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
|
||||
if useLeaseEndpoint {
|
||||
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control")
|
||||
if localRelay {
|
||||
entry.BestTransport = "reverse_quic"
|
||||
} else {
|
||||
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_quic")
|
||||
}
|
||||
entry.BestReachability = "relay"
|
||||
entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required")
|
||||
entry.Endpoint = entry.RelayEndpoint
|
||||
entry.BestCandidateID = lease.LeaseID
|
||||
entry.BestCandidateAddr = entry.RelayEndpoint
|
||||
if !localRelay {
|
||||
entry.Endpoint = entry.RelayEndpoint
|
||||
entry.BestCandidateID = lease.LeaseID
|
||||
entry.BestCandidateAddr = entry.RelayEndpoint
|
||||
entry.BestPeerCertSHA256 = rendezvousLeasePeerCertSHA256(lease)
|
||||
} else if strings.TrimSpace(entry.Endpoint) == "" {
|
||||
entry.Endpoint = firstNonEmpty(entry.BestCandidateAddr, entry.RelayEndpoint)
|
||||
}
|
||||
entry.bestScore = maxInt(entry.bestScore, 500)
|
||||
}
|
||||
}
|
||||
@@ -262,6 +277,20 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
}}
|
||||
}
|
||||
|
||||
func rendezvousLeasePeerCertSHA256(lease PeerRendezvousLease) string {
|
||||
var metadata struct {
|
||||
PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
|
||||
TLSCertSHA256 string `json:"tls_cert_sha256,omitempty"`
|
||||
}
|
||||
if len(lease.Metadata) == 0 {
|
||||
return ""
|
||||
}
|
||||
if err := json.Unmarshal(lease.Metadata, &metadata); err != nil {
|
||||
return ""
|
||||
}
|
||||
return firstNonEmpty(strings.TrimSpace(metadata.PeerCertSHA256), strings.TrimSpace(metadata.TLSCertSHA256))
|
||||
}
|
||||
|
||||
func (c *PeerCache) Snapshot() PeerCacheSnapshot {
|
||||
if c == nil {
|
||||
return PeerCacheSnapshot{}
|
||||
|
||||
@@ -10,15 +10,15 @@ func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) {
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-a": "http://node-a:19000",
|
||||
"node-r": "http://node-r:19000",
|
||||
"node-c": "http://node-c:19000",
|
||||
"node-a": "quic://node-a:19443",
|
||||
"node-r": "quic://node-r:19443",
|
||||
"node-c": "quic://node-c:19443",
|
||||
},
|
||||
Routes: []SyntheticRoute{
|
||||
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"}),
|
||||
},
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10},
|
||||
{NodeID: "node-seed", Endpoint: "quic://seed.example.test:19443", Transport: "direct_quic", Priority: 10},
|
||||
},
|
||||
WarmPeerLimit: 2,
|
||||
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
|
||||
@@ -42,7 +42,7 @@ func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) {
|
||||
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r"}),
|
||||
},
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1},
|
||||
{NodeID: "node-seed", Endpoint: "quic://seed.example.test:19443", Transport: "direct_quic", ConnectivityMode: "direct", Priority: 1},
|
||||
},
|
||||
WarmPeerLimit: 3,
|
||||
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
|
||||
@@ -68,7 +68,7 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Transport: "relay_quic",
|
||||
Address: "relay.example.test",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
@@ -77,8 +77,8 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -119,10 +119,10 @@ func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-wss",
|
||||
EndpointID: "node-b-ice",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "https://node-b.example.test:443",
|
||||
Transport: "ice_quic",
|
||||
Address: "quic://node-b.example.test:19444",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -148,10 +148,10 @@ func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatal("node-b missing from cache")
|
||||
}
|
||||
if entry.BestCandidateID != "node-b-wss" || entry.Endpoint != "https://node-b.example.test:443" {
|
||||
if entry.BestCandidateID != "node-b-ice" || entry.Endpoint != "quic://node-b.example.test:19444" {
|
||||
t.Fatalf("peer cache did not apply endpoint observations: %+v", entry)
|
||||
}
|
||||
if !containsString(entry.BestScoreReasons, "transport:wss") {
|
||||
if !containsString(entry.BestScoreReasons, "transport:ice_quic") {
|
||||
t.Fatalf("peer cache did not expose score reasons: %+v", entry.BestScoreReasons)
|
||||
}
|
||||
}
|
||||
@@ -161,15 +161,15 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-b": "https://node-b.public.example.test:443",
|
||||
"node-b": "quic://node-b.public.example.test:19443",
|
||||
},
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "https://node-b.public.example.test:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://node-b.public.example.test:19443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -179,8 +179,8 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-corp-lan",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "http://10.24.10.20:19001",
|
||||
Transport: "lan_quic",
|
||||
Address: "quic://10.24.10.20:19443",
|
||||
Reachability: "private",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -199,7 +199,7 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatal("node-b missing from peer cache")
|
||||
}
|
||||
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" {
|
||||
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "quic://10.24.10.20:19443" {
|
||||
t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,6 +29,7 @@ type PeerConnectionIntentPlanConfig struct {
|
||||
PeerCache PeerCacheSnapshot
|
||||
RecoveryPlan PeerRecoveryPlan
|
||||
RendezvousLeases []PeerRendezvousLease
|
||||
PreferredRegion string
|
||||
Now time.Time
|
||||
}
|
||||
|
||||
@@ -62,12 +63,14 @@ type PeerConnectionIntent struct {
|
||||
Reachability string `json:"reachability,omitempty"`
|
||||
ConnectivityMode string `json:"connectivity_mode,omitempty"`
|
||||
NATType string `json:"nat_type,omitempty"`
|
||||
Region string `json:"region,omitempty"`
|
||||
PolicyTags []string `json:"policy_tags,omitempty"`
|
||||
RequiresRendezvous bool `json:"requires_rendezvous"`
|
||||
RendezvousResolved bool `json:"rendezvous_resolved"`
|
||||
DirectCandidate bool `json:"direct_candidate"`
|
||||
RelayCandidate bool `json:"relay_candidate"`
|
||||
BestCandidateID string `json:"best_candidate_id,omitempty"`
|
||||
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
@@ -94,33 +97,35 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
|
||||
}
|
||||
entry := entryByNode[candidate.NodeID]
|
||||
intent := PeerConnectionIntent{
|
||||
NodeID: candidate.NodeID,
|
||||
Action: connectionIntentAction(candidate),
|
||||
Reason: candidate.Reason,
|
||||
Endpoint: candidate.Endpoint,
|
||||
ConnectionState: candidate.ConnectionState,
|
||||
Transport: firstNonEmpty(candidate.BestTransport, entry.BestTransport),
|
||||
Reachability: entry.BestReachability,
|
||||
ConnectivityMode: entry.BestConnectivity,
|
||||
NATType: entry.BestNATType,
|
||||
PolicyTags: append([]string{}, entry.BestPolicyTags...),
|
||||
BestCandidateID: firstNonEmpty(candidate.BestCandidateID, entry.BestCandidateID),
|
||||
RendezvousLeaseID: entry.RendezvousLeaseID,
|
||||
RelayNodeID: entry.RelayNodeID,
|
||||
RelayEndpoint: entry.RelayEndpoint,
|
||||
RelayCandidate: entry.RelayControl,
|
||||
ControlPlaneOnly: entry.RelayControl,
|
||||
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
|
||||
Priority: candidate.Priority,
|
||||
GeneratedAt: now,
|
||||
NodeID: candidate.NodeID,
|
||||
Action: connectionIntentAction(candidate),
|
||||
Reason: candidate.Reason,
|
||||
Endpoint: candidate.Endpoint,
|
||||
ConnectionState: candidate.ConnectionState,
|
||||
Transport: firstNonEmpty(candidate.BestTransport, entry.BestTransport),
|
||||
Reachability: entry.BestReachability,
|
||||
ConnectivityMode: entry.BestConnectivity,
|
||||
NATType: entry.BestNATType,
|
||||
Region: entry.BestRegion,
|
||||
PolicyTags: append([]string{}, entry.BestPolicyTags...),
|
||||
BestCandidateID: firstNonEmpty(candidate.BestCandidateID, entry.BestCandidateID),
|
||||
BestPeerCertSHA256: entry.BestPeerCertSHA256,
|
||||
RendezvousLeaseID: entry.RendezvousLeaseID,
|
||||
RelayNodeID: entry.RelayNodeID,
|
||||
RelayEndpoint: entry.RelayEndpoint,
|
||||
RelayCandidate: entry.RelayControl,
|
||||
ControlPlaneOnly: entry.RelayControl,
|
||||
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
|
||||
Priority: candidate.Priority,
|
||||
GeneratedAt: now,
|
||||
}
|
||||
mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent)
|
||||
mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent, cfg.PreferredRegion)
|
||||
intent.TransportMode = mode
|
||||
intent.RequiresRendezvous = requiresRendezvous
|
||||
intent.DirectCandidate = directCandidate
|
||||
if intent.RequiresRendezvous {
|
||||
if lease, ok := rendezvousLeaseForPeer(cfg.RendezvousLeases, intent.NodeID, now); ok {
|
||||
applyRendezvousLease(&intent, lease)
|
||||
applyRendezvousLease(&intent, lease, cfg.PeerCache.LocalNodeID)
|
||||
}
|
||||
}
|
||||
intents = append(intents, intent)
|
||||
@@ -185,10 +190,12 @@ func connectionIntentAction(candidate PeerRecoveryCandidate) string {
|
||||
}
|
||||
}
|
||||
|
||||
func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
|
||||
func classifyPeerTransport(intent PeerConnectionIntent, preferredRegion string) (string, bool, bool) {
|
||||
transport := strings.ToLower(strings.TrimSpace(intent.Transport))
|
||||
connectivity := strings.ToLower(strings.TrimSpace(intent.ConnectivityMode))
|
||||
reachability := strings.ToLower(strings.TrimSpace(intent.Reachability))
|
||||
region := strings.TrimSpace(intent.Region)
|
||||
preferredRegion = strings.TrimSpace(preferredRegion)
|
||||
tags := lowerStringSet(intent.PolicyTags)
|
||||
|
||||
if strings.Contains(transport, "relay") || connectivity == "relay_required" || reachability == "relay" {
|
||||
@@ -201,6 +208,9 @@ func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
|
||||
return PeerTransportModeCorporateLAN, false, true
|
||||
}
|
||||
if tags["private-lan"] || reachability == "private" || endpointHasPrivateHost(intent.Endpoint) {
|
||||
if preferredRegion != "" && region != "" && !strings.EqualFold(region, preferredRegion) {
|
||||
return PeerTransportModeRelayRequired, true, false
|
||||
}
|
||||
return PeerTransportModePrivateLAN, false, true
|
||||
}
|
||||
if strings.Contains(transport, "direct") || reachability == "public" || connectivity == "direct" {
|
||||
@@ -246,9 +256,16 @@ func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now
|
||||
return candidates[0], true
|
||||
}
|
||||
|
||||
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) {
|
||||
intent.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
||||
intent.Transport = firstNonEmpty(lease.Transport, "relay_control")
|
||||
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease, localNodeID string) {
|
||||
localRelay := strings.TrimSpace(lease.RelayNodeID) == strings.TrimSpace(localNodeID)
|
||||
if !localRelay {
|
||||
intent.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
||||
}
|
||||
if localRelay {
|
||||
intent.Transport = "reverse_quic"
|
||||
} else {
|
||||
intent.Transport = firstNonEmpty(lease.Transport, "relay_quic")
|
||||
}
|
||||
intent.TransportMode = PeerTransportModeRelayControl
|
||||
intent.RequiresRendezvous = false
|
||||
intent.RendezvousResolved = true
|
||||
@@ -256,17 +273,33 @@ func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLeas
|
||||
intent.RelayCandidate = true
|
||||
intent.RendezvousLeaseID = lease.LeaseID
|
||||
intent.RelayNodeID = lease.RelayNodeID
|
||||
intent.RelayEndpoint = intent.Endpoint
|
||||
intent.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
||||
intent.ControlPlaneOnly = true
|
||||
if certSHA256 := rendezvousLeasePeerCertSHA256(lease); certSHA256 != "" && !localRelay {
|
||||
intent.BestPeerCertSHA256 = certSHA256
|
||||
}
|
||||
if lease.ConnectivityMode != "" {
|
||||
intent.ConnectivityMode = lease.ConnectivityMode
|
||||
}
|
||||
}
|
||||
|
||||
func endpointHasPrivateHost(rawEndpoint string) bool {
|
||||
addr, ok := endpointHostAddr(rawEndpoint)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return addr.IsPrivate() || addr.IsLoopback() || addr.IsLinkLocalUnicast()
|
||||
}
|
||||
|
||||
func endpointHasUnspecifiedHost(rawEndpoint string) bool {
|
||||
addr, ok := endpointHostAddr(rawEndpoint)
|
||||
return ok && addr.IsUnspecified()
|
||||
}
|
||||
|
||||
func endpointHostAddr(rawEndpoint string) (netip.Addr, bool) {
|
||||
rawEndpoint = strings.TrimSpace(rawEndpoint)
|
||||
if rawEndpoint == "" {
|
||||
return false
|
||||
return netip.Addr{}, false
|
||||
}
|
||||
host := rawEndpoint
|
||||
if parsed, err := url.Parse(rawEndpoint); err == nil && parsed.Host != "" {
|
||||
@@ -277,9 +310,9 @@ func endpointHasPrivateHost(rawEndpoint string) bool {
|
||||
}
|
||||
addr, err := netip.ParseAddr(strings.Trim(host, "[]"))
|
||||
if err != nil {
|
||||
return false
|
||||
return netip.Addr{}, false
|
||||
}
|
||||
return addr.IsPrivate() || addr.IsLoopback() || addr.IsLinkLocalUnicast()
|
||||
return addr, true
|
||||
}
|
||||
|
||||
func lowerStringSet(values []string) map[string]bool {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -11,8 +12,8 @@ func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "http://10.24.10.20:19001",
|
||||
BestTransport: "direct_tcp_tls",
|
||||
Endpoint: "quic://10.24.10.20:19443",
|
||||
BestTransport: "lan_quic",
|
||||
BestReachability: "private",
|
||||
BestConnectivity: "direct",
|
||||
BestPolicyTags: []string{"corp-lan", "same-site"},
|
||||
@@ -23,7 +24,7 @@ func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
|
||||
Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "http://10.24.10.20:19001",
|
||||
Endpoint: "quic://10.24.10.20:19443",
|
||||
ConnectionState: PeerConnectionReady,
|
||||
Reason: "maintain_ready",
|
||||
Priority: 100,
|
||||
@@ -48,15 +49,15 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "https://node-b.example.test:443",
|
||||
BestTransport: "direct_tcp_tls",
|
||||
Endpoint: "quic://node-b.example.test:19443",
|
||||
BestTransport: "reverse_quic",
|
||||
BestReachability: "outbound_only",
|
||||
BestConnectivity: "outbound_only",
|
||||
},
|
||||
{
|
||||
NodeID: "node-c",
|
||||
Endpoint: "relay://fabric-relay/node-c",
|
||||
BestTransport: "relay",
|
||||
BestTransport: "relay_quic",
|
||||
BestReachability: "relay",
|
||||
BestConnectivity: "relay_required",
|
||||
},
|
||||
@@ -66,7 +67,7 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
|
||||
Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "https://node-b.example.test:443",
|
||||
Endpoint: "quic://node-b.example.test:19443",
|
||||
ConnectionState: PeerConnectionDisconnected,
|
||||
Reason: "recover_warm",
|
||||
Priority: 90,
|
||||
@@ -91,6 +92,42 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionIntentsRequireRendezvousForRemotePrivateRegion(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PreferredRegion: "ifcm",
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "quic://192.168.200.61:19132",
|
||||
BestTransport: "direct_quic",
|
||||
BestReachability: "private",
|
||||
BestConnectivity: "private_lan",
|
||||
BestRegion: "docker-test",
|
||||
},
|
||||
}},
|
||||
RecoveryPlan: PeerRecoveryPlan{
|
||||
Mode: PeerRecoveryModeRecovery,
|
||||
Candidates: []PeerRecoveryCandidate{{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "quic://192.168.200.61:19132",
|
||||
ConnectionState: PeerConnectionDisconnected,
|
||||
Reason: "recover_warm",
|
||||
Priority: 100,
|
||||
}},
|
||||
},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.IntentCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 1 {
|
||||
t.Fatalf("unexpected remote private plan counts: %+v", plan)
|
||||
}
|
||||
intent := plan.Intents[0]
|
||||
if intent.DirectCandidate || !intent.RequiresRendezvous || intent.TransportMode != PeerTransportModeRelayRequired {
|
||||
t.Fatalf("unexpected remote private intent: %+v", intent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
@@ -120,13 +157,14 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://node-r:19000",
|
||||
Transport: "relay_control",
|
||||
RelayEndpoint: "quic://node-r:19443",
|
||||
Transport: "relay_quic",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 10,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
Metadata: peerConnectionIntentLeaseMetadata(t, "abc123"),
|
||||
},
|
||||
},
|
||||
Now: now,
|
||||
@@ -137,9 +175,10 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
|
||||
}
|
||||
intent := plan.Intents[0]
|
||||
if intent.TransportMode != PeerTransportModeRelayControl ||
|
||||
intent.Endpoint != "http://node-r:19000" ||
|
||||
intent.Endpoint != "quic://node-r:19443" ||
|
||||
intent.RelayNodeID != "node-r" ||
|
||||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
|
||||
intent.BestPeerCertSHA256 != "abc123" ||
|
||||
!intent.RelayCandidate ||
|
||||
!intent.RendezvousResolved ||
|
||||
intent.RequiresRendezvous {
|
||||
@@ -176,8 +215,8 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
|
||||
LeaseID: "lease-expired-preferred",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r-old",
|
||||
RelayEndpoint: "http://node-r-old:19000",
|
||||
Transport: "relay_control",
|
||||
RelayEndpoint: "quic://node-r-old:19443",
|
||||
Transport: "relay_quic",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 1,
|
||||
ControlPlaneOnly: true,
|
||||
@@ -188,8 +227,8 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
|
||||
LeaseID: "lease-active-reselected",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r-new",
|
||||
RelayEndpoint: "http://node-r-new:19000",
|
||||
Transport: "relay_control",
|
||||
RelayEndpoint: "quic://node-r-new:19443",
|
||||
Transport: "relay_quic",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 20,
|
||||
ControlPlaneOnly: true,
|
||||
@@ -206,20 +245,29 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
|
||||
intent := plan.Intents[0]
|
||||
if intent.RendezvousLeaseID != "lease-active-reselected" ||
|
||||
intent.RelayNodeID != "node-r-new" ||
|
||||
intent.Endpoint != "http://node-r-new:19000" {
|
||||
intent.Endpoint != "quic://node-r-new:19443" {
|
||||
t.Fatalf("expired lease was not skipped: %+v", intent)
|
||||
}
|
||||
}
|
||||
|
||||
// peerConnectionIntentLeaseMetadata is a test helper that encodes certSHA256
// as the JSON object {"peer_cert_sha256": <certSHA256>} for use as lease
// metadata. A marshal failure aborts the calling test.
func peerConnectionIntentLeaseMetadata(t *testing.T, certSHA256 string) json.RawMessage {
	t.Helper()
	metadata := map[string]string{"peer_cert_sha256": certSHA256}
	encoded, err := json.Marshal(metadata)
	if err != nil {
		t.Fatalf("marshal metadata: %v", err)
	}
	return json.RawMessage(encoded)
}
|
||||
|
||||
func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) {
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"},
|
||||
{NodeID: "node-b", Endpoint: "quic://192.168.10.20:19443"},
|
||||
}},
|
||||
RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "http://192.168.10.20:19001",
|
||||
Endpoint: "quic://192.168.10.20:19443",
|
||||
ConnectionState: PeerConnectionDisconnected,
|
||||
Reason: "recover_peer",
|
||||
Priority: 10,
|
||||
|
||||
@@ -2,6 +2,7 @@ package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -25,6 +26,8 @@ type PeerConnectionManagerConfig struct {
|
||||
Tracker *PeerConnectionTracker
|
||||
RendezvousLeases []PeerRendezvousLease
|
||||
HTTPClient *http.Client
|
||||
QUICTransport *QUICFabricTransport
|
||||
PreferredRegion string
|
||||
ProbeTimeout time.Duration
|
||||
Now func() time.Time
|
||||
}
|
||||
@@ -35,6 +38,8 @@ type PeerConnectionManager struct {
|
||||
tracker *PeerConnectionTracker
|
||||
rendezvousLeases []PeerRendezvousLease
|
||||
httpClient *http.Client
|
||||
quicTransport *QUICFabricTransport
|
||||
preferredRegion string
|
||||
probeTimeout time.Duration
|
||||
now func() time.Time
|
||||
|
||||
@@ -101,9 +106,10 @@ type PeerConnectionCandidateProbeResult struct {
|
||||
}
|
||||
|
||||
// peerConnectionProbeTarget is one endpoint a probe attempt is directed at.
// Each field is declared exactly once; the superseded three-field list that
// was interleaved with this one is dropped.
type peerConnectionProbeTarget struct {
	CandidateID    string // identifier of the endpoint candidate being probed
	Endpoint       string // address the probe connects to
	Transport      string // transport label associated with the endpoint
	PeerCertSHA256 string // certificate fingerprint forwarded with the probe, if known
}
|
||||
|
||||
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
|
||||
@@ -132,6 +138,8 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
|
||||
tracker: cfg.Tracker,
|
||||
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
|
||||
httpClient: httpClient,
|
||||
quicTransport: cfg.QUICTransport,
|
||||
preferredRegion: strings.TrimSpace(cfg.PreferredRegion),
|
||||
probeTimeout: probeTimeout,
|
||||
now: now,
|
||||
}
|
||||
@@ -155,6 +163,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
|
||||
PeerCache: peerSnapshot,
|
||||
RecoveryPlan: recoveryPlan,
|
||||
RendezvousLeases: rendezvousLeases,
|
||||
PreferredRegion: m.preferredRegion,
|
||||
Now: startedAt,
|
||||
})
|
||||
entriesByNode := map[string]PeerCacheEntry{}
|
||||
@@ -215,6 +224,15 @@ func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvou
|
||||
m.rendezvousLeases = append([]PeerRendezvousLease{}, rendezvousLeases...)
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) UpdateQUICTransport(transport *QUICFabricTransport) {
|
||||
if m == nil {
|
||||
return
|
||||
}
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.quicTransport = transport
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
|
||||
if m == nil {
|
||||
return nil, nil
|
||||
@@ -242,17 +260,18 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
StartedAt: startedAt,
|
||||
}
|
||||
peer := PeerCacheEntry{
|
||||
NodeID: intent.NodeID,
|
||||
Endpoint: intent.Endpoint,
|
||||
Warm: true,
|
||||
WarmReason: intent.Reason,
|
||||
RecoverySeed: intent.RecoverySeed,
|
||||
BestCandidateID: intent.BestCandidateID,
|
||||
BestTransport: intent.Transport,
|
||||
RendezvousLeaseID: intent.RendezvousLeaseID,
|
||||
RelayNodeID: intent.RelayNodeID,
|
||||
RelayEndpoint: intent.RelayEndpoint,
|
||||
RelayControl: intent.RelayCandidate,
|
||||
NodeID: intent.NodeID,
|
||||
Endpoint: intent.Endpoint,
|
||||
Warm: true,
|
||||
WarmReason: intent.Reason,
|
||||
RecoverySeed: intent.RecoverySeed,
|
||||
BestCandidateID: intent.BestCandidateID,
|
||||
BestTransport: intent.Transport,
|
||||
RendezvousLeaseID: intent.RendezvousLeaseID,
|
||||
RelayNodeID: intent.RelayNodeID,
|
||||
RelayEndpoint: intent.RelayEndpoint,
|
||||
RelayControl: intent.RelayCandidate,
|
||||
BestPeerCertSHA256: firstNonEmpty(intent.BestPeerCertSHA256, cacheEntry.BestPeerCertSHA256),
|
||||
}
|
||||
if intent.RequiresRendezvous {
|
||||
result.LinkStatus = PeerConnectionProbeDeferred
|
||||
@@ -282,13 +301,12 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
ClusterID: m.local.ClusterID,
|
||||
NodeID: intent.NodeID,
|
||||
}
|
||||
if intent.RelayCandidate && intent.RelayNodeID != "" {
|
||||
target.NodeID = intent.RelayNodeID
|
||||
}
|
||||
target.NodeID = peerConnectionProbeTargetNodeID(intent, m.local.NodeID)
|
||||
targets := []peerConnectionProbeTarget{{
|
||||
CandidateID: intent.BestCandidateID,
|
||||
Endpoint: intent.Endpoint,
|
||||
Transport: intent.Transport,
|
||||
CandidateID: intent.BestCandidateID,
|
||||
Endpoint: intent.Endpoint,
|
||||
Transport: intent.Transport,
|
||||
PeerCertSHA256: intent.BestPeerCertSHA256,
|
||||
}}
|
||||
if intent.DirectCandidate {
|
||||
targets = peerConnectionProbeTargets(intent, cacheEntry)
|
||||
@@ -300,13 +318,14 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
|
||||
probePeer.BestCandidateAddr = probePeer.Endpoint
|
||||
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
|
||||
probePeer.BestPeerCertSHA256 = firstNonEmpty(probeTarget.PeerCertSHA256, probePeer.BestPeerCertSHA256)
|
||||
if probePeer.Endpoint == "" {
|
||||
continue
|
||||
}
|
||||
candidateStartedAt := normalizedNow(m.now())
|
||||
m.tracker.BeginProbe(probePeer, candidateStartedAt)
|
||||
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
|
||||
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
|
||||
err := m.probePeerTarget(probeCtx, probePeer, target)
|
||||
cancel()
|
||||
completedAt := normalizedNow(m.now())
|
||||
candidateResult := PeerConnectionCandidateProbeResult{
|
||||
@@ -354,47 +373,97 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
return result
|
||||
}
|
||||
|
||||
func peerConnectionProbeTargetNodeID(intent PeerConnectionIntent, localNodeID string) string {
|
||||
if intent.RelayCandidate && strings.TrimSpace(intent.RelayNodeID) != "" && strings.TrimSpace(intent.RelayNodeID) != strings.TrimSpace(localNodeID) {
|
||||
return intent.RelayNodeID
|
||||
}
|
||||
return intent.NodeID
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer PeerCacheEntry, target PeerIdentity) error {
|
||||
endpoint := strings.TrimRight(strings.TrimSpace(probePeer.Endpoint), "/")
|
||||
transport := strings.TrimSpace(probePeer.BestTransport)
|
||||
if hasLegacyEndpointScheme(endpoint) {
|
||||
return fmt.Errorf("non_quic_probe_rejected")
|
||||
}
|
||||
if peerConnectionTargetIsQUIC(transport, endpoint) {
|
||||
carrier, selectedTarget, err := FabricTransportForTarget(FabricTransportTarget{
|
||||
EndpointID: probePeer.BestCandidateID,
|
||||
PeerID: target.NodeID,
|
||||
Endpoint: endpoint,
|
||||
Transport: transport,
|
||||
Timeout: m.probeTimeout,
|
||||
PeerCertSHA256: strings.TrimSpace(probePeer.BestPeerCertSHA256),
|
||||
}, m.quicTransport)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
session, err := carrier.Connect(ctx, selectedTarget)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return session.Close()
|
||||
}
|
||||
return fmt.Errorf("non_quic_probe_rejected")
|
||||
}
|
||||
|
||||
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
|
||||
seen := map[string]struct{}{}
|
||||
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
|
||||
add := func(candidateID, endpoint, transport string) {
|
||||
add := func(candidateID, endpoint, transport, peerCertSHA256 string) {
|
||||
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
|
||||
if endpoint == "" {
|
||||
return
|
||||
}
|
||||
if endpointHasUnspecifiedHost(endpoint) {
|
||||
return
|
||||
}
|
||||
key := candidateID + "|" + endpoint
|
||||
if _, ok := seen[key]; ok {
|
||||
return
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
out = append(out, peerConnectionProbeTarget{
|
||||
CandidateID: strings.TrimSpace(candidateID),
|
||||
Endpoint: endpoint,
|
||||
Transport: strings.TrimSpace(transport),
|
||||
CandidateID: strings.TrimSpace(candidateID),
|
||||
Endpoint: endpoint,
|
||||
Transport: strings.TrimSpace(transport),
|
||||
PeerCertSHA256: strings.TrimSpace(peerCertSHA256),
|
||||
})
|
||||
}
|
||||
for _, candidate := range cacheEntry.EndpointCandidates {
|
||||
if !candidateUsableForDirectProbe(candidate) {
|
||||
continue
|
||||
}
|
||||
add(candidate.EndpointID, candidate.Address, candidate.Transport)
|
||||
add(candidate.EndpointID, candidate.Address, candidate.Transport, candidatePeerCertSHA256(candidate))
|
||||
}
|
||||
add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
|
||||
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, cacheEntry.BestPeerCertSHA256)
|
||||
return out
|
||||
}
|
||||
|
||||
func peerConnectionTargetIsQUIC(transport string, endpoint string) bool {
|
||||
return isQUICOnlyCandidateTransport(transport) || strings.HasPrefix(strings.ToLower(strings.TrimSpace(endpoint)), "quic://")
|
||||
}
|
||||
|
||||
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
|
||||
endpoint := strings.TrimSpace(candidate.Address)
|
||||
if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
|
||||
return false
|
||||
}
|
||||
if endpointHasUnspecifiedHost(endpoint) {
|
||||
return false
|
||||
}
|
||||
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
|
||||
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
|
||||
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
|
||||
if connectivity == "outbound_only" || connectivity == "relay_required" || reachability == "outbound_only" || reachability == "relay" {
|
||||
return false
|
||||
}
|
||||
return transport == "" || strings.Contains(transport, "direct") || transport == "wss" || strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://")
|
||||
return transport == "" ||
|
||||
strings.Contains(transport, "direct_quic") ||
|
||||
transport == "quic" ||
|
||||
transport == "lan_quic" ||
|
||||
transport == "ice_quic" ||
|
||||
strings.HasPrefix(endpoint, "quic://")
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
|
||||
|
||||
@@ -2,8 +2,8 @@ package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@@ -11,12 +11,18 @@ import (
|
||||
func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
|
||||
}.Handler())
|
||||
tlsConfig := testQUICTLSConfig(t)
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
certSHA256 := testQUICCertSHA256(t, tlsConfig)
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
@@ -24,19 +30,20 @@ func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-direct",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: server.URL,
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://" + server.Addr().String(),
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "direct",
|
||||
PolicyTags: []string{"corp-lan", "same-site"},
|
||||
Priority: 1,
|
||||
Metadata: peerConnectionProbeMetadata(t, certSHA256),
|
||||
},
|
||||
},
|
||||
"node-c": {
|
||||
{
|
||||
EndpointID: "node-c-relay",
|
||||
NodeID: "node-c",
|
||||
Transport: "relay",
|
||||
Transport: "relay_quic",
|
||||
Address: "relay://fabric/node-c",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
@@ -49,10 +56,11 @@ func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
ProbeTimeout: time.Second,
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
QUICTransport: NewQUICFabricTransport(nil),
|
||||
ProbeTimeout: time.Second,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
@@ -116,24 +124,31 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
|
||||
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
|
||||
}.Handler())
|
||||
tlsConfig := testQUICTLSConfig(t)
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
certSHA256 := testQUICCertSHA256(t, tlsConfig)
|
||||
leases := []PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: server.URL,
|
||||
Transport: "relay_control",
|
||||
RelayEndpoint: "quic://" + server.Addr().String(),
|
||||
Transport: "relay_quic",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 10,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
Metadata: peerConnectionProbeMetadata(t, certSHA256),
|
||||
},
|
||||
}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
@@ -143,7 +158,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Transport: "relay_quic",
|
||||
Address: "relay://fabric/node-b",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
@@ -161,6 +176,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
RendezvousLeases: leases,
|
||||
QUICTransport: NewQUICFabricTransport(nil),
|
||||
ProbeTimeout: time.Second,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
@@ -189,15 +205,37 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC(t *testing.T) {
|
||||
intent := PeerConnectionIntent{
|
||||
NodeID: "node-b",
|
||||
RelayCandidate: true,
|
||||
RelayNodeID: "node-a",
|
||||
Transport: "reverse_quic",
|
||||
}
|
||||
if got := peerConnectionProbeTargetNodeID(intent, "node-a"); got != "node-b" {
|
||||
t.Fatalf("local relay reverse probe target = %q, want peer node-b", got)
|
||||
}
|
||||
intent.RelayNodeID = "node-r"
|
||||
if got := peerConnectionProbeTargetNodeID(intent, "node-a"); got != "node-r" {
|
||||
t.Fatalf("remote relay probe target = %q, want relay node-r", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
|
||||
}.Handler())
|
||||
tlsConfig := testQUICTLSConfig(t)
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
certSHA256 := testQUICCertSHA256(t, tlsConfig)
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
@@ -205,8 +243,8 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-dead",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: "http://127.0.0.1:1",
|
||||
Transport: "lan_quic",
|
||||
Address: "quic://127.0.0.1:1",
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "private_lan",
|
||||
Priority: 1,
|
||||
@@ -214,11 +252,12 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
{
|
||||
EndpointID: "node-b-live",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: server.URL,
|
||||
Transport: "lan_quic",
|
||||
Address: "quic://" + server.Addr().String(),
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "private_lan",
|
||||
Priority: 2,
|
||||
Metadata: peerConnectionProbeMetadata(t, certSHA256),
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -227,11 +266,11 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
|
||||
ProbeTimeout: 100 * time.Millisecond,
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
QUICTransport: NewQUICFabricTransport(nil),
|
||||
ProbeTimeout: 100 * time.Millisecond,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
@@ -243,7 +282,7 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
t.Fatalf("unexpected cycle: %+v", cycle)
|
||||
}
|
||||
result := cycle.Results[0]
|
||||
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
|
||||
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
|
||||
t.Fatalf("fallback did not select live candidate: %+v", result)
|
||||
}
|
||||
if len(result.CandidateResults) != 2 ||
|
||||
@@ -252,7 +291,85 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
|
||||
}
|
||||
snapshot := tracker.Snapshot()
|
||||
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
|
||||
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != "quic://"+server.Addr().String() {
|
||||
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionManagerSkipsUnspecifiedQUICCandidates(t *testing.T) {
|
||||
now := time.Date(2026, 5, 17, 6, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
tlsConfig := testQUICTLSConfig(t)
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
certSHA256 := testQUICCertSHA256(t, tlsConfig)
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-unspecified-v6",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://[::]:19131",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-live",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://" + server.Addr().String(),
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 2,
|
||||
Metadata: peerConnectionProbeMetadata(t, certSHA256),
|
||||
},
|
||||
},
|
||||
},
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
QUICTransport: NewQUICFabricTransport(nil),
|
||||
ProbeTimeout: time.Second,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
},
|
||||
})
|
||||
|
||||
cycle := manager.ProbeOnce(context.Background())
|
||||
if cycle.Attempted != 1 || cycle.Succeeded != 1 || len(cycle.Results) != 1 {
|
||||
t.Fatalf("unexpected cycle: %+v", cycle)
|
||||
}
|
||||
result := cycle.Results[0]
|
||||
if result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
|
||||
t.Fatalf("manager did not skip unspecified endpoint: %+v", result)
|
||||
}
|
||||
if len(result.CandidateResults) != 1 || result.CandidateResults[0].CandidateID != "node-b-live" {
|
||||
t.Fatalf("unspecified endpoint should not be probed: %+v", result.CandidateResults)
|
||||
}
|
||||
}
|
||||
|
||||
// peerConnectionProbeMetadata returns candidate metadata JSON of the form
// {"peer_cert_sha256": certSHA256}. It fails the test if encoding fails.
func peerConnectionProbeMetadata(t *testing.T, certSHA256 string) json.RawMessage {
	t.Helper()
	metadata := struct {
		PeerCertSHA256 string `json:"peer_cert_sha256"`
	}{PeerCertSHA256: certSHA256}
	encoded, err := json.Marshal(metadata)
	if err != nil {
		t.Fatalf("marshal probe metadata: %v", err)
	}
	return json.RawMessage(encoded)
}
|
||||
|
||||
@@ -9,7 +9,7 @@ func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
|
||||
Entries: []PeerCacheEntry{
|
||||
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"},
|
||||
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "quic://node-b:19443"},
|
||||
},
|
||||
}, now)
|
||||
|
||||
|
||||
@@ -76,12 +76,12 @@ func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
|
||||
Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-c",
|
||||
Endpoint: "http://relay:19001",
|
||||
Endpoint: "quic://relay:19443",
|
||||
Warm: true,
|
||||
WarmReason: "rendezvous_lease",
|
||||
RendezvousLeaseID: "lease-1",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://relay:19001",
|
||||
RelayEndpoint: "quic://relay:19443",
|
||||
RelayControl: true,
|
||||
},
|
||||
},
|
||||
@@ -121,7 +121,7 @@ func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
|
||||
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
|
||||
return PeerCacheEntry{
|
||||
NodeID: nodeID,
|
||||
Endpoint: "http://" + nodeID + ":19001",
|
||||
Endpoint: "quic://" + nodeID + ":19443",
|
||||
Warm: warm,
|
||||
WarmReason: warmReason,
|
||||
RecoverySeed: recoverySeed,
|
||||
|
||||
@@ -2,42 +2,369 @@ package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
type ProductionForwardTransport interface {
|
||||
SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error)
|
||||
}
|
||||
|
||||
type HTTPProductionForwardTransport struct {
|
||||
PeerURLs map[string]string
|
||||
HTTPClient *http.Client
|
||||
type QUICProductionForwardTransport struct {
|
||||
Targets map[string]FabricTransportTarget
|
||||
RouteSets map[string]FabricRouteSet
|
||||
Transport FabricTransport
|
||||
Router FabricChannelRouter
|
||||
Timeout time.Duration
|
||||
Pressure *FabricRoutePressureTracker
|
||||
Health *FabricRouteHealthTracker
|
||||
sequence atomic.Uint64
|
||||
}
|
||||
|
||||
func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport {
|
||||
normalized := make(map[string]string, len(peerURLs))
|
||||
for nodeID, baseURL := range peerURLs {
|
||||
type QUICProductionForwardTransportSnapshot struct {
|
||||
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
|
||||
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
|
||||
}
|
||||
|
||||
func NewQUICProductionForwardTransport(targets map[string]FabricTransportTarget, transport *QUICFabricTransport) *QUICProductionForwardTransport {
|
||||
routeSets := make(map[string]FabricRouteSet, len(targets))
|
||||
for nodeID, target := range targets {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
|
||||
if nodeID != "" && baseURL != "" {
|
||||
normalized[nodeID] = baseURL
|
||||
target.Endpoint = strings.TrimRight(strings.TrimSpace(target.Endpoint), "/")
|
||||
target.Transport = strings.TrimSpace(target.Transport)
|
||||
if nodeID != "" && target.Endpoint != "" {
|
||||
target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), nodeID)
|
||||
routeSets[nodeID] = FabricRouteSetForTransportTargets("", "", nodeID, []FabricTransportTarget{target})
|
||||
}
|
||||
}
|
||||
return &HTTPProductionForwardTransport{PeerURLs: normalized}
|
||||
if transport == nil {
|
||||
transport = NewQUICFabricTransport(nil)
|
||||
}
|
||||
return NewQUICProductionForwardTransportFromRouteSets(routeSets, transport)
|
||||
}
|
||||
|
||||
func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
|
||||
if t == nil {
|
||||
return ProductionForwardResult{}, ErrForwardPeerUnavailable
|
||||
func NewQUICProductionForwardTransportFromRouteSets(routeSets map[string]FabricRouteSet, transport FabricTransport) *QUICProductionForwardTransport {
|
||||
normalizedRouteSets := make(map[string]FabricRouteSet, len(routeSets))
|
||||
targets := make(map[string]FabricTransportTarget, len(routeSets))
|
||||
for nodeID, routeSet := range routeSets {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
if nodeID == "" {
|
||||
continue
|
||||
}
|
||||
normalizedRouteSets[nodeID] = routeSet
|
||||
if target, err := FabricTransportTargetForRoute(routeSet.Primary); err == nil {
|
||||
targets[nodeID] = target
|
||||
}
|
||||
}
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
|
||||
if baseURL == "" {
|
||||
return ProductionForwardResult{}, ErrForwardPeerUnavailable
|
||||
if transport == nil {
|
||||
transport = NewQUICFabricTransport(nil)
|
||||
}
|
||||
client := NewClient(baseURL)
|
||||
if t.HTTPClient != nil {
|
||||
client.HTTPClient = t.HTTPClient
|
||||
return &QUICProductionForwardTransport{
|
||||
Targets: targets,
|
||||
RouteSets: normalizedRouteSets,
|
||||
Transport: transport,
|
||||
Router: NewFabricChannelRouter(FabricChannelRouterConfig{
|
||||
MaxAckLatencyMs: 2000,
|
||||
MinRerouteInterval: 50 * time.Millisecond,
|
||||
}),
|
||||
Timeout: 30 * time.Second,
|
||||
Pressure: NewFabricRoutePressureTracker(),
|
||||
Health: NewFabricRouteHealthTracker(30 * time.Second),
|
||||
}
|
||||
return client.SendProduction(ctx, envelope)
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
|
||||
if t == nil || t.Transport == nil {
|
||||
return ProductionForwardResult{}, ErrForwardPeerUnavailable
|
||||
}
|
||||
nextNodeID = strings.TrimSpace(nextNodeID)
|
||||
routeSet, ok := t.RouteSets[nextNodeID]
|
||||
if !ok {
|
||||
target, targetOK := t.Targets[nextNodeID]
|
||||
if !targetOK || strings.TrimSpace(target.Endpoint) == "" {
|
||||
return ProductionForwardResult{}, ErrForwardPeerUnavailable
|
||||
}
|
||||
routeSet = FabricRouteSetForTransportTargets(envelope.ClusterID, envelope.CurrentHopNodeID, nextNodeID, []FabricTransportTarget{target})
|
||||
}
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: firstNonEmpty(strings.TrimSpace(envelope.MessageID), fmt.Sprintf("production-%d", t.sequence.Add(1))),
|
||||
ClusterID: envelope.ClusterID,
|
||||
SourceNodeID: firstNonEmpty(productionRouteSetSourceNodeID(routeSet), envelope.CurrentHopNodeID),
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: nextNodeID,
|
||||
TrafficClass: FabricServiceChannelReliable,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
}
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
result, err := t.sendProductionWithRouteSet(ctx, spec, routeSet, payload)
|
||||
if err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func productionRouteSetSourceNodeID(routeSet FabricRouteSet) string {
|
||||
for _, route := range flattenFabricRouteSet(routeSet) {
|
||||
if sourceNodeID := strings.TrimSpace(route.SourceNodeID); sourceNodeID != "" {
|
||||
return sourceNodeID
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) sendProductionWithRouteSet(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (ProductionForwardResult, error) {
|
||||
router := t.Router
|
||||
if router.Config.MaxRoutePressure == 0 {
|
||||
router = NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 2000, MinRerouteInterval: 50 * time.Millisecond})
|
||||
}
|
||||
routeSet = t.routeSetForScheduling(routeSet)
|
||||
channel, _, err := router.OpenChannel(spec, routeSet, time.Now().UTC())
|
||||
if err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
timeout := t.Timeout
|
||||
if timeout <= 0 {
|
||||
timeout = 30 * time.Second
|
||||
}
|
||||
for {
|
||||
routeSet = t.routeSetForScheduling(routeSet)
|
||||
route, ok := findFabricRoute(routeSet, channel.RouteID)
|
||||
if !ok {
|
||||
return ProductionForwardResult{}, ErrFabricRouteNotFound
|
||||
}
|
||||
target, err := FabricTransportTargetForRoute(route)
|
||||
if err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), spec.TargetID)
|
||||
target.MaxPayload = fabricproto.DefaultMaxPayload
|
||||
releaseRoute := t.acquireProductionRoute(route.RouteID)
|
||||
session, err := t.Transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
releaseRoute()
|
||||
t.markProductionRouteFailure(route.RouteID, err)
|
||||
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "connect_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
if event.Type == FabricChannelRouteEventReroute {
|
||||
continue
|
||||
}
|
||||
if rerouteErr != nil {
|
||||
return ProductionForwardResult{}, rerouteErr
|
||||
}
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
response, ackMs, err := t.sendProductionOnSession(ctx, session, payload, timeout)
|
||||
_ = session.Close()
|
||||
releaseRoute()
|
||||
if err == nil {
|
||||
t.markProductionRouteSuccess(route.RouteID)
|
||||
_, _, _ = router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
AckLatencyMs: ackMs,
|
||||
BytesSent: uint64(len(payload)),
|
||||
FramesSent: 1,
|
||||
BytesRecv: uint64(len(response.Payload)),
|
||||
FramesRecv: 1,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
return decodeQUICProductionForwardResponse(response.Payload)
|
||||
}
|
||||
t.markProductionRouteFailure(route.RouteID, err)
|
||||
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "response_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
if event.Type == FabricChannelRouteEventReroute {
|
||||
continue
|
||||
}
|
||||
if rerouteErr != nil {
|
||||
return ProductionForwardResult{}, rerouteErr
|
||||
}
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) routeSetWithActiveChannels(routeSet FabricRouteSet) FabricRouteSet {
|
||||
if t == nil || t.Pressure == nil {
|
||||
return routeSet
|
||||
}
|
||||
return t.Pressure.Apply(routeSet)
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
|
||||
if t != nil && t.Health != nil {
|
||||
routeSet = t.Health.Apply(routeSet, time.Now().UTC())
|
||||
}
|
||||
return t.routeSetWithActiveChannels(routeSet)
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) acquireProductionRoute(routeID string) func() {
|
||||
if t == nil || t.Pressure == nil {
|
||||
return func() {}
|
||||
}
|
||||
return t.Pressure.Acquire(routeID)
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) markProductionRouteFailure(routeID string, err error) {
|
||||
if t == nil || t.Health == nil || err == nil {
|
||||
return
|
||||
}
|
||||
t.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) markProductionRouteSuccess(routeID string) {
|
||||
if t == nil || t.Health == nil {
|
||||
return
|
||||
}
|
||||
t.Health.MarkSuccess(routeID)
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) Snapshot() QUICProductionForwardTransportSnapshot {
|
||||
if t == nil {
|
||||
return QUICProductionForwardTransportSnapshot{}
|
||||
}
|
||||
var pressure FabricRoutePressureSnapshot
|
||||
if t.Pressure != nil {
|
||||
pressure = t.Pressure.SnapshotPressure()
|
||||
}
|
||||
var health FabricRouteHealthSnapshot
|
||||
if t.Health != nil {
|
||||
health = t.Health.Snapshot(time.Now().UTC())
|
||||
}
|
||||
return QUICProductionForwardTransportSnapshot{RoutePressure: pressure, RouteHealth: health}
|
||||
}
|
||||
|
||||
func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Context, session FabricTransportSession, payload []byte, timeout time.Duration) (fabricproto.Frame, int64, error) {
|
||||
sequence := t.sequence.Add(1)
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: ProductionForwardQUICStreamID,
|
||||
Sequence: sequence,
|
||||
Payload: payload,
|
||||
}); err != nil {
|
||||
return fabricproto.Frame{}, 0, err
|
||||
}
|
||||
waitCtx := ctx
|
||||
if timeout > 0 {
|
||||
var cancel context.CancelFunc
|
||||
waitCtx, cancel = context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
}
|
||||
started := time.Now()
|
||||
for {
|
||||
select {
|
||||
case <-waitCtx.Done():
|
||||
return fabricproto.Frame{}, 0, waitCtx.Err()
|
||||
case err, ok := <-session.Errors():
|
||||
if !ok {
|
||||
return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
|
||||
}
|
||||
if err != nil {
|
||||
return fabricproto.Frame{}, 0, err
|
||||
}
|
||||
case frame, ok := <-session.Frames():
|
||||
if !ok {
|
||||
return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
|
||||
}
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != ProductionForwardQUICStreamID || frame.Sequence != sequence {
|
||||
continue
|
||||
}
|
||||
return frame, time.Since(started).Milliseconds(), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func decodeQUICProductionForwardResponse(payload []byte) (ProductionForwardResult, error) {
|
||||
var response quicProductionForwardResponse
|
||||
if err := json.Unmarshal(payload, &response); err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
if strings.TrimSpace(response.Error) != "" {
|
||||
return ProductionForwardResult{}, fmt.Errorf("%w: %s", ErrForwardPeerUnavailable, response.Error)
|
||||
}
|
||||
return response.Result, nil
|
||||
}
|
||||
|
||||
func FabricRouteSetForTransportTargets(clusterID string, sourceNodeID string, targetNodeID string, targets []FabricTransportTarget) FabricRouteSet {
|
||||
routeSet := FabricRouteSet{TargetKind: FabricChannelTargetNode, TargetID: strings.TrimSpace(targetNodeID)}
|
||||
routes := make([]FabricRoute, 0, len(targets))
|
||||
for index, target := range targets {
|
||||
target.Endpoint = strings.TrimRight(strings.TrimSpace(target.Endpoint), "/")
|
||||
if strings.TrimSpace(target.Endpoint) == "" {
|
||||
continue
|
||||
}
|
||||
peerID := firstNonEmpty(strings.TrimSpace(target.PeerID), strings.TrimSpace(targetNodeID))
|
||||
routeID := strings.TrimSpace(target.EndpointID)
|
||||
if routeID == "" {
|
||||
routeID = fmt.Sprintf("%s-quic-%d", peerID, index)
|
||||
}
|
||||
routes = append(routes, FabricRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: strings.TrimSpace(clusterID),
|
||||
SourceNodeID: strings.TrimSpace(sourceNodeID),
|
||||
DestinationNodeID: peerID,
|
||||
Hops: []FabricRouteHop{{
|
||||
NodeID: peerID,
|
||||
Mode: fabricRouteModeForTransportTarget(target),
|
||||
EndpointID: strings.TrimSpace(target.EndpointID),
|
||||
Address: target.Endpoint,
|
||||
PeerCertSHA256: strings.TrimSpace(target.PeerCertSHA256),
|
||||
}},
|
||||
BaseLatencyMs: routeLatencyForIndex(index),
|
||||
Capacity: 100,
|
||||
ActiveChannels: 0,
|
||||
Healthy: true,
|
||||
LastUpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
}
|
||||
if len(routes) == 0 {
|
||||
return routeSet
|
||||
}
|
||||
routeSet.Primary = routes[0]
|
||||
if len(routes) > 1 {
|
||||
routeSet.WarmStandby = append(routeSet.WarmStandby, routes[1:]...)
|
||||
}
|
||||
return routeSet
|
||||
}
|
||||
|
||||
func fabricRouteModeForTransportTarget(target FabricTransportTarget) FabricRouteMode {
|
||||
switch strings.ToLower(strings.TrimSpace(target.Transport)) {
|
||||
case string(FabricRouteLAN):
|
||||
return FabricRouteLAN
|
||||
case string(FabricRouteReverse):
|
||||
return FabricRouteReverse
|
||||
case string(FabricRouteRelay):
|
||||
return FabricRouteRelay
|
||||
case string(FabricRouteICE):
|
||||
return FabricRouteICE
|
||||
default:
|
||||
return FabricRouteDirect
|
||||
}
|
||||
}
|
||||
|
||||
// routeLatencyForIndex returns the base latency assigned to the route at the
// given position: 10 for index <= 0, and 10 + index otherwise.
func routeLatencyForIndex(index int) int {
	latency := 10
	if index > 0 {
		latency += index
	}
	return latency
}
|
||||
|
||||
@@ -0,0 +1,339 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
func TestQUICProductionForwardTransportReroutesOnConnectFailure(t *testing.T) {
|
||||
transport := newFakeProductionForwardFabricTransport()
|
||||
transport.failConnect["quic://dead.example.test:19443"] = true
|
||||
transport.results["quic://fast.example.test:19443"] = ProductionForwardResult{
|
||||
Delivered: true,
|
||||
MessageID: "message-1",
|
||||
RouteID: "route-1",
|
||||
}
|
||||
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
|
||||
"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
|
||||
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
|
||||
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
|
||||
}),
|
||||
}, transport)
|
||||
forward.Timeout = time.Second
|
||||
|
||||
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
|
||||
if err != nil {
|
||||
t.Fatalf("send production: %v", err)
|
||||
}
|
||||
if !result.Delivered || result.MessageID != "message-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if got := transport.connectCount("quic://dead.example.test:19443"); got != 1 {
|
||||
t.Fatalf("dead connect count = %d, want 1", got)
|
||||
}
|
||||
if got := transport.connectCount("quic://fast.example.test:19443"); got != 1 {
|
||||
t.Fatalf("fast connect count = %d, want 1", got)
|
||||
}
|
||||
snapshot := forward.Snapshot()
|
||||
if snapshot.RoutePressure.AcquiredTotal != 2 || snapshot.RoutePressure.ReleasedTotal != 2 || snapshot.RoutePressure.MaxActiveTotal == 0 {
|
||||
t.Fatalf("route pressure snapshot = %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQUICProductionForwardTransportQuarantinesFailedRoute(t *testing.T) {
|
||||
transport := newFakeProductionForwardFabricTransport()
|
||||
transport.failConnect["quic://dead.example.test:19443"] = true
|
||||
transport.results["quic://fast.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
|
||||
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
|
||||
"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
|
||||
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
|
||||
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
|
||||
}),
|
||||
}, transport)
|
||||
forward.Timeout = time.Second
|
||||
|
||||
for i := 0; i < 2; i++ {
|
||||
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
|
||||
if err != nil {
|
||||
t.Fatalf("send production #%d: %v", i+1, err)
|
||||
}
|
||||
if !result.Delivered {
|
||||
t.Fatalf("result #%d = %+v", i+1, result)
|
||||
}
|
||||
}
|
||||
if got := transport.connectCount("quic://dead.example.test:19443"); got != 1 {
|
||||
t.Fatalf("dead connect count = %d, want quarantine after first failure", got)
|
||||
}
|
||||
if got := transport.connectCount("quic://fast.example.test:19443"); got != 2 {
|
||||
t.Fatalf("fast connect count = %d, want both sends on healthy route", got)
|
||||
}
|
||||
snapshot := forward.Snapshot()
|
||||
if snapshot.RouteHealth.Quarantined["dead"].Failures != 1 {
|
||||
t.Fatalf("route health snapshot = %+v, want dead route quarantined", snapshot.RouteHealth)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRouteHealthTrackerExpiresQuarantine(t *testing.T) {
|
||||
routeSet := FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
|
||||
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
|
||||
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
|
||||
})
|
||||
tracker := NewFabricRouteHealthTracker(time.Second)
|
||||
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
|
||||
|
||||
tracker.MarkFailure("dead", "connect failed", now)
|
||||
applied := tracker.Apply(routeSet, now.Add(500*time.Millisecond))
|
||||
if applied.Primary.Healthy || !applied.Primary.Degraded {
|
||||
t.Fatalf("primary after quarantine = %+v, want unhealthy degraded route", applied.Primary)
|
||||
}
|
||||
if len(tracker.Snapshot(now.Add(500*time.Millisecond)).Quarantined) != 1 {
|
||||
t.Fatalf("route health snapshot = %+v, want one quarantined route", tracker.Snapshot(now.Add(500*time.Millisecond)))
|
||||
}
|
||||
|
||||
applied = tracker.Apply(routeSet, now.Add(2*time.Second))
|
||||
if !applied.Primary.Healthy || applied.Primary.Degraded {
|
||||
t.Fatalf("primary after ttl = %+v, want route restored", applied.Primary)
|
||||
}
|
||||
if snapshot := tracker.Snapshot(now.Add(2 * time.Second)); len(snapshot.Quarantined) != 0 {
|
||||
t.Fatalf("route health snapshot after ttl = %+v, want empty quarantine", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQUICProductionForwardTransportReroutesOnResponseTimeout(t *testing.T) {
|
||||
transport := newFakeProductionForwardFabricTransport()
|
||||
transport.delays["quic://slow.example.test:19443"] = 100 * time.Millisecond
|
||||
transport.results["quic://slow.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
|
||||
transport.results["quic://fast.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
|
||||
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
|
||||
"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
|
||||
{EndpointID: "slow", PeerID: "node-b", Endpoint: "quic://slow.example.test:19443", Transport: "quic"},
|
||||
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
|
||||
}),
|
||||
}, transport)
|
||||
forward.Timeout = 10 * time.Millisecond
|
||||
|
||||
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
|
||||
if err != nil {
|
||||
t.Fatalf("send production: %v", err)
|
||||
}
|
||||
if !result.Delivered || result.MessageID != "message-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if got := transport.connectCount("quic://slow.example.test:19443"); got != 1 {
|
||||
t.Fatalf("slow connect count = %d, want 1", got)
|
||||
}
|
||||
if got := transport.connectCount("quic://fast.example.test:19443"); got != 1 {
|
||||
t.Fatalf("fast connect count = %d, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQUICProductionForwardTransportSchedulesWithRouteSetSourceForForwardedEnvelope(t *testing.T) {
|
||||
transport := newFakeProductionForwardFabricTransport()
|
||||
transport.results["quic://node-c.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-forwarded"}
|
||||
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{
|
||||
"node-c": FabricRouteSetForTransportTargets("cluster-a", "node-b", "node-c", []FabricTransportTarget{
|
||||
{EndpointID: "node-c-direct", PeerID: "node-c", Endpoint: "quic://node-c.example.test:19443", Transport: "quic"},
|
||||
}),
|
||||
}, transport)
|
||||
forward.Timeout = time.Second
|
||||
envelope := testProductionForwardEnvelope("message-forwarded")
|
||||
envelope.ClusterID = "cluster-a"
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = "node-c"
|
||||
envelope.CurrentHopNodeID = "node-c"
|
||||
envelope.NextHopNodeID = "node-c"
|
||||
|
||||
result, err := forward.SendProduction(context.Background(), "node-c", envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("send production: %v", err)
|
||||
}
|
||||
if !result.Delivered || result.MessageID != "message-forwarded" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if got := transport.connectCount("quic://node-c.example.test:19443"); got != 1 {
|
||||
t.Fatalf("connect count = %d, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQUICProductionForwardTransportSpreadsConcurrentChannelsByActivePressure(t *testing.T) {
|
||||
transport := newFakeProductionForwardFabricTransport()
|
||||
transport.delays["quic://route-a.example.test:19443"] = 80 * time.Millisecond
|
||||
transport.results["quic://route-a.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
|
||||
transport.results["quic://route-b.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-2"}
|
||||
routeSet := FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
|
||||
{EndpointID: "route-a", PeerID: "node-b", Endpoint: "quic://route-a.example.test:19443", Transport: "quic"},
|
||||
{EndpointID: "route-b", PeerID: "node-b", Endpoint: "quic://route-b.example.test:19443", Transport: "quic"},
|
||||
})
|
||||
routeSet.Primary.Capacity = 100
|
||||
routeSet.WarmStandby[0].Capacity = 100
|
||||
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{"node-b": routeSet}, transport)
|
||||
forward.Timeout = time.Second
|
||||
|
||||
firstDone := make(chan error, 1)
|
||||
go func() {
|
||||
_, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
|
||||
firstDone <- err
|
||||
}()
|
||||
transport.waitForConnect(t, "quic://route-a.example.test:19443", 1)
|
||||
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-2"))
|
||||
if err != nil {
|
||||
t.Fatalf("second send production: %v", err)
|
||||
}
|
||||
if !result.Delivered || result.MessageID != "message-2" {
|
||||
t.Fatalf("second result = %+v", result)
|
||||
}
|
||||
if got := transport.connectCount("quic://route-b.example.test:19443"); got != 1 {
|
||||
t.Fatalf("route-b connect count = %d, want 1", got)
|
||||
}
|
||||
if err := <-firstDone; err != nil {
|
||||
t.Fatalf("first send production: %v", err)
|
||||
}
|
||||
snapshot := forward.Snapshot()
|
||||
if snapshot.RoutePressure.MaxActive["route-a"] != 1 || snapshot.RoutePressure.MaxActive["route-b"] != 1 || snapshot.RoutePressure.AcquiredTotal != 2 {
|
||||
t.Fatalf("route pressure snapshot = %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
type fakeProductionForwardFabricTransport struct {
|
||||
mu sync.Mutex
|
||||
failConnect map[string]bool
|
||||
delays map[string]time.Duration
|
||||
results map[string]ProductionForwardResult
|
||||
connects map[string]int
|
||||
}
|
||||
|
||||
func newFakeProductionForwardFabricTransport() *fakeProductionForwardFabricTransport {
|
||||
return &fakeProductionForwardFabricTransport{
|
||||
failConnect: map[string]bool{},
|
||||
delays: map[string]time.Duration{},
|
||||
results: map[string]ProductionForwardResult{},
|
||||
connects: map[string]int{},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *fakeProductionForwardFabricTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
|
||||
endpoint := target.Endpoint
|
||||
t.mu.Lock()
|
||||
t.connects[endpoint]++
|
||||
fail := t.failConnect[endpoint]
|
||||
delay := t.delays[endpoint]
|
||||
result := t.results[endpoint]
|
||||
t.mu.Unlock()
|
||||
if fail {
|
||||
return nil, ErrForwardPeerUnavailable
|
||||
}
|
||||
return &fakeProductionForwardFabricSession{
|
||||
delay: delay,
|
||||
result: result,
|
||||
frames: make(chan fabricproto.Frame, 16),
|
||||
errors: make(chan error, 1),
|
||||
done: make(chan struct{}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (t *fakeProductionForwardFabricTransport) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *fakeProductionForwardFabricTransport) connectCount(endpoint string) int {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
return t.connects[endpoint]
|
||||
}
|
||||
|
||||
func (t *fakeProductionForwardFabricTransport) waitForConnect(tb testing.TB, endpoint string, count int) {
|
||||
tb.Helper()
|
||||
deadline := time.Now().Add(time.Second)
|
||||
for {
|
||||
t.mu.Lock()
|
||||
got := t.connects[endpoint]
|
||||
t.mu.Unlock()
|
||||
if got >= count {
|
||||
return
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
tb.Fatalf("timed out waiting for %s connect count %d, got %d", endpoint, count, got)
|
||||
}
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
type fakeProductionForwardFabricSession struct {
|
||||
delay time.Duration
|
||||
result ProductionForwardResult
|
||||
frames chan fabricproto.Frame
|
||||
errors chan error
|
||||
done chan struct{}
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (s *fakeProductionForwardFabricSession) Send(_ context.Context, frame fabricproto.Frame) error {
|
||||
if frame.Type != fabricproto.FrameData {
|
||||
return nil
|
||||
}
|
||||
responsePayload, _ := json.Marshal(quicProductionForwardResponse{Result: s.result})
|
||||
go func() {
|
||||
if s.delay > 0 {
|
||||
time.Sleep(s.delay)
|
||||
}
|
||||
select {
|
||||
case <-s.done:
|
||||
case s.frames <- fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: frame.TrafficClass,
|
||||
StreamID: frame.StreamID,
|
||||
Sequence: frame.Sequence,
|
||||
Payload: responsePayload,
|
||||
}:
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeProductionForwardFabricSession) Frames() <-chan fabricproto.Frame {
|
||||
return s.frames
|
||||
}
|
||||
|
||||
func (s *fakeProductionForwardFabricSession) Errors() <-chan error {
|
||||
return s.errors
|
||||
}
|
||||
|
||||
func (s *fakeProductionForwardFabricSession) Close() error {
|
||||
s.once.Do(func() {
|
||||
close(s.done)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeProductionForwardFabricSession) Closed() bool {
|
||||
select {
|
||||
case <-s.done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func testProductionForwardEnvelope(messageID string) ProductionEnvelope {
|
||||
now := time.Now().UTC()
|
||||
return ProductionEnvelope{
|
||||
FabricProtocolVersion: ProtocolVersion,
|
||||
MessageID: messageID,
|
||||
RouteID: "route-1",
|
||||
ClusterID: "cluster-a",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
CurrentHopNodeID: "node-a",
|
||||
NextHopNodeID: "node-b",
|
||||
ChannelClass: ProductionChannelFabricControl,
|
||||
MessageType: ProductionMessageFabricControl,
|
||||
TTL: 8,
|
||||
CreatedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}
|
||||
}
|
||||
@@ -106,6 +106,9 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
|
||||
if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
|
||||
}
|
||||
if hasLegacyEndpointScheme(endpoint) {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint")
|
||||
}
|
||||
}
|
||||
for nodeID, candidates := range cfg.PeerEndpointCandidates {
|
||||
if strings.TrimSpace(nodeID) == "" {
|
||||
@@ -121,6 +124,9 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
|
||||
strings.TrimSpace(candidate.ConnectivityMode) == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
|
||||
}
|
||||
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasLegacyEndpointScheme(candidate.Address) {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint candidate")
|
||||
}
|
||||
}
|
||||
}
|
||||
for endpointID, observation := range cfg.PeerEndpointObservations {
|
||||
@@ -179,6 +185,14 @@ func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) err
|
||||
return nil
|
||||
}
|
||||
|
||||
// hasLegacyEndpointScheme reports whether endpoint, after trimming surrounding
// whitespace and lower-casing, begins with one of the http://, https://,
// ws:// or wss:// URL schemes.
func hasLegacyEndpointScheme(endpoint string) bool {
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	for _, scheme := range [...]string{"http://", "https://", "ws://", "wss://"} {
		if strings.HasPrefix(normalized, scheme) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
|
||||
if len(seeds) > 20 {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds")
|
||||
@@ -191,6 +205,9 @@ func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
|
||||
strings.TrimSpace(seed.Transport) == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
|
||||
}
|
||||
if !isQUICOnlyCandidateTransport(seed.Transport) || hasLegacyEndpointScheme(seed.Endpoint) {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC recovery seed")
|
||||
}
|
||||
if _, duplicate := seen[key]; duplicate {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed")
|
||||
}
|
||||
@@ -224,6 +241,9 @@ func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRo
|
||||
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
|
||||
}
|
||||
if !isQUICOnlyCandidateTransport(lease.Transport) || hasLegacyEndpointScheme(lease.RelayEndpoint) {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC rendezvous lease")
|
||||
}
|
||||
if _, duplicate := seen[lease.LeaseID]; duplicate {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease")
|
||||
}
|
||||
|
||||
@@ -18,14 +18,14 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
|
||||
ConfigVersion: "config-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"},
|
||||
PeerEndpoints: map[string]string{"node-b": "quic://127.0.0.1:19443"},
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
NATType: "restricted",
|
||||
ConnectivityMode: "direct",
|
||||
@@ -55,8 +55,8 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "https://node-b.example.test:443",
|
||||
Transport: "direct_tcp_tls",
|
||||
Endpoint: "quic://node-b.example.test:19443",
|
||||
Transport: "direct_quic",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
@@ -66,8 +66,8 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://node-r:19000",
|
||||
Transport: "relay_control",
|
||||
RelayEndpoint: "quic://node-r:19443",
|
||||
Transport: "relay_quic",
|
||||
ConnectivityMode: "relay_required",
|
||||
RouteIDs: []string{"route-a-b"},
|
||||
AllowedChannels: []string{"fabric_control", "route_control"},
|
||||
@@ -158,8 +158,8 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-c",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
},
|
||||
@@ -174,6 +174,73 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
PeerEndpoints: map[string]string{"node-b": "https://node-b.example.test:443"},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected non-QUIC peer endpoint error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-websocket",
|
||||
NodeID: "node-b",
|
||||
Transport: "websocket",
|
||||
Address: "quic://203.0.113.20:19443",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
},
|
||||
},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected non-QUIC peer endpoint candidate error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateScheme(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-https",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "https://node-b.example.test:443",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
},
|
||||
},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected non-QUIC peer endpoint candidate error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointObservation(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
@@ -217,7 +284,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"},
|
||||
{NodeID: "node-b", Endpoint: "", Transport: "direct_quic"},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
@@ -228,6 +295,23 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsLegacyRecoverySeed(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{NodeID: "node-b", Endpoint: "https://node-b.example.test:443", Transport: "direct_quic"},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected non-QUIC recovery seed error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17z12.synthetic.v1",
|
||||
@@ -238,8 +322,8 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://node-r:19000",
|
||||
Transport: "relay_control",
|
||||
RelayEndpoint: "quic://node-r:19443",
|
||||
Transport: "relay_quic",
|
||||
RouteIDs: []string{"route-a-b"},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
},
|
||||
@@ -253,6 +337,36 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsLegacyRendezvousLease(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17z12.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
RendezvousLeases: []PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "https://node-r.example.test:443",
|
||||
Transport: "relay_quic",
|
||||
ConnectivityMode: "relay_required",
|
||||
RouteIDs: []string{"route-a-b"},
|
||||
AllowedChannels: []string{"fabric_control", "route_control"},
|
||||
Priority: 10,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: time.Now().UTC().Add(-time.Minute),
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected non-QUIC rendezvous lease error")
|
||||
}
|
||||
}
|
||||
|
||||
func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
|
||||
t.Helper()
|
||||
payload, err := json.Marshal(cfg)
|
||||
@@ -265,3 +379,32 @@ func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
|
||||
}
|
||||
return path
|
||||
}
|
||||
|
||||
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
|
||||
return SyntheticRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: hops[0],
|
||||
DestinationNodeID: hops[len(hops)-1],
|
||||
Hops: hops,
|
||||
AllowedChannels: []string{SyntheticChannelFabricControl},
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
RouteVersion: "route-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
}
|
||||
}
|
||||
|
||||
// sameStrings reports whether left and right have the same length and hold
// equal strings at every index. A nil slice and an empty slice compare equal.
func sameStrings(left, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for idx, value := range left {
		if right[idx] != value {
			return false
		}
	}
	return true
}
|
||||
|
||||
@@ -69,22 +69,24 @@ type VPNPacketIngressRoutePreference interface {
|
||||
}
|
||||
|
||||
type Server struct {
|
||||
Local PeerIdentity
|
||||
SyntheticRuntime *SyntheticRuntime
|
||||
ProductionForwardingEnabled bool
|
||||
ProductionEnvelopeObserver ProductionEnvelopeObserver
|
||||
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
|
||||
ProductionForwardTransport ProductionForwardTransport
|
||||
ProductionForwardLogger ProductionForwardLogger
|
||||
FabricServiceChannelLogger FabricServiceChannelAccessLogger
|
||||
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
|
||||
ProductionRoutes []SyntheticRoute
|
||||
VPNPacketIngress VPNPacketIngress
|
||||
BackendProxyBaseURL string
|
||||
ClusterAuthorityPublicKey string
|
||||
ServiceChannelIntrospection bool
|
||||
FabricSessionEnabled bool
|
||||
FabricSessionLogger FabricSessionEventLogger
|
||||
Local PeerIdentity
|
||||
SyntheticRuntime *SyntheticRuntime
|
||||
ProductionForwardingEnabled bool
|
||||
ProductionEnvelopeObserver ProductionEnvelopeObserver
|
||||
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
|
||||
ProductionForwardTransport ProductionForwardTransport
|
||||
ProductionForwardLogger ProductionForwardLogger
|
||||
DisableHTTPDataPlane bool
|
||||
FabricServiceChannelLogger FabricServiceChannelAccessLogger
|
||||
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
|
||||
ProductionRoutes []SyntheticRoute
|
||||
VPNPacketIngress VPNPacketIngress
|
||||
BackendProxyBaseURL string
|
||||
ClusterAuthorityPublicKey string
|
||||
ServiceChannelIntrospection bool
|
||||
FabricSessionEnabled bool
|
||||
FabricSessionWebSocketEnabled bool
|
||||
FabricSessionLogger FabricSessionEventLogger
|
||||
}
|
||||
|
||||
func (s Server) Handler() http.Handler {
|
||||
@@ -92,7 +94,7 @@ func (s Server) Handler() http.Handler {
|
||||
mux.HandleFunc("/mesh/v1/health", s.handleHealth)
|
||||
mux.HandleFunc("/mesh/v1/forward", s.handleForward)
|
||||
mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe)
|
||||
if s.FabricSessionEnabled {
|
||||
if s.FabricSessionEnabled && s.FabricSessionWebSocketEnabled {
|
||||
mux.HandleFunc("/mesh/v1/fabric/session/ws", s.handleFabricSessionWebSocket)
|
||||
}
|
||||
if s.RemoteWorkspaceFrameSink != nil {
|
||||
@@ -198,6 +200,7 @@ type FabricSessionEventLogEntry struct {
|
||||
Event string `json:"event"`
|
||||
ClusterID string `json:"cluster_id,omitempty"`
|
||||
NodeID string `json:"node_id,omitempty"`
|
||||
PeerID string `json:"peer_id,omitempty"`
|
||||
AcceptedBy string `json:"accepted_by,omitempty"`
|
||||
SessionID string `json:"session_id,omitempty"`
|
||||
SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
|
||||
@@ -2079,16 +2082,12 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
if s.DisableHTTPDataPlane {
|
||||
http.Error(w, "mesh data-plane forwarding requires QUIC fabric transport", http.StatusGone)
|
||||
return
|
||||
}
|
||||
if !s.ProductionForwardingEnabled {
|
||||
s.logProductionForward(ProductionForwardLogEntry{
|
||||
Event: "production_forward_rejected",
|
||||
ClusterID: s.Local.ClusterID,
|
||||
LocalNodeID: s.Local.NodeID,
|
||||
Reason: ErrForwardDisabled.Error(),
|
||||
StatusCode: http.StatusNotImplemented,
|
||||
OccurredAt: time.Now().UTC(),
|
||||
})
|
||||
http.Error(w, ErrForwardDisabled.Error(), http.StatusNotImplemented)
|
||||
s.rejectProductionForward(w, ProductionEnvelope{}, ErrForwardDisabled, forwardStatusCode(ErrForwardDisabled))
|
||||
return
|
||||
}
|
||||
var envelope ProductionEnvelope
|
||||
@@ -2104,54 +2103,57 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "invalid production mesh envelope", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
|
||||
result, err := s.ForwardProduction(r.Context(), envelope)
|
||||
if err != nil {
|
||||
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
|
||||
return
|
||||
}
|
||||
writeProductionForwardResult(w, result)
|
||||
}
|
||||
|
||||
func (s Server) ForwardProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
|
||||
if !s.ProductionForwardingEnabled {
|
||||
return ProductionForwardResult{}, ErrForwardDisabled
|
||||
}
|
||||
if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
if err := ValidateProductionEnvelopeRouteConfig(s.Local, envelope, s.ProductionRoutes, time.Now().UTC()); err != nil {
|
||||
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
|
||||
return
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_accepted", s.Local, envelope, "", 0))
|
||||
if s.ProductionEnvelopeObserver != nil {
|
||||
observation := NewProductionEnvelopeObservation(envelope, time.Now().UTC())
|
||||
if err := observeProductionEnvelope(r.Context(), s.ProductionEnvelopeObserver, observation); err != nil {
|
||||
if err := observeProductionEnvelope(ctx, s.ProductionEnvelopeObserver, observation); err != nil {
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardObservationFailed.Error(), http.StatusInternalServerError))
|
||||
http.Error(w, ErrForwardObservationFailed.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
return ProductionForwardResult{}, ErrForwardObservationFailed
|
||||
}
|
||||
}
|
||||
if envelope.DestinationNodeID == s.Local.NodeID {
|
||||
if err := deliverProductionEnvelope(r.Context(), s.ProductionEnvelopeDelivery, envelope); err != nil {
|
||||
if err := deliverProductionEnvelope(ctx, s.ProductionEnvelopeDelivery, envelope); err != nil {
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardDeliveryFailed.Error(), http.StatusInternalServerError))
|
||||
http.Error(w, ErrForwardDeliveryFailed.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
return ProductionForwardResult{}, ErrForwardDeliveryFailed
|
||||
}
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_delivered", s.Local, envelope, "", http.StatusOK))
|
||||
writeProductionForwardResult(w, ProductionForwardResult{
|
||||
return ProductionForwardResult{
|
||||
Accepted: true,
|
||||
Delivered: true,
|
||||
By: s.Local,
|
||||
MessageID: envelope.MessageID,
|
||||
RouteID: envelope.RouteID,
|
||||
})
|
||||
return
|
||||
}, nil
|
||||
}
|
||||
if envelope.NextHopNodeID == s.Local.NodeID {
|
||||
s.rejectProductionForward(w, envelope, ErrLoopDetected, forwardStatusCode(ErrLoopDetected))
|
||||
return
|
||||
return ProductionForwardResult{}, ErrLoopDetected
|
||||
}
|
||||
if len(envelope.RoutePath) == 0 && envelope.NextHopNodeID != envelope.DestinationNodeID {
|
||||
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
|
||||
return
|
||||
return ProductionForwardResult{}, ErrForwardRuntimeUnavailable
|
||||
}
|
||||
if s.ProductionForwardTransport == nil {
|
||||
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
|
||||
return
|
||||
return ProductionForwardResult{}, ErrForwardRuntimeUnavailable
|
||||
}
|
||||
if envelope.TTL <= 1 {
|
||||
s.rejectProductionForward(w, envelope, ErrTTLExhausted, forwardStatusCode(ErrTTLExhausted))
|
||||
return
|
||||
return ProductionForwardResult{}, ErrTTLExhausted
|
||||
}
|
||||
forwarded := envelope
|
||||
forwarded.CurrentHopNodeID = envelope.NextHopNodeID
|
||||
@@ -2159,10 +2161,9 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
|
||||
forwarded.TTL = envelope.TTL - 1
|
||||
forwarded.HopCount = envelope.HopCount + 1
|
||||
forwarded.VisitedNodeIDs = append(append([]string{}, envelope.VisitedNodeIDs...), s.Local.NodeID)
|
||||
result, err := s.ProductionForwardTransport.SendProduction(r.Context(), envelope.NextHopNodeID, forwarded)
|
||||
result, err := s.ProductionForwardTransport.SendProduction(ctx, envelope.NextHopNodeID, forwarded)
|
||||
if err != nil {
|
||||
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
|
||||
return
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_forwarded", s.Local, envelope, "", http.StatusOK))
|
||||
result.Accepted = true
|
||||
@@ -2171,7 +2172,7 @@ func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
|
||||
result.MessageID = envelope.MessageID
|
||||
result.RouteID = envelope.RouteID
|
||||
result.NextNodeID = envelope.NextHopNodeID
|
||||
writeProductionForwardResult(w, result)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s Server) rejectProductionForward(w http.ResponseWriter, envelope ProductionEnvelope, err error, statusCode int) {
|
||||
@@ -2262,6 +2263,10 @@ func (s Server) handleSyntheticProbe(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
if s.DisableHTTPDataPlane {
|
||||
http.Error(w, "mesh synthetic probes require QUIC fabric transport", http.StatusGone)
|
||||
return
|
||||
}
|
||||
if s.SyntheticRuntime == nil {
|
||||
http.Error(w, ErrMeshRuntimeDisabled.Error(), http.StatusServiceUnavailable)
|
||||
return
|
||||
@@ -2307,17 +2312,19 @@ func syntheticStatusCode(err error) int {
|
||||
}
|
||||
|
||||
func forwardStatusCode(err error) int {
|
||||
switch err {
|
||||
case ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected:
|
||||
switch {
|
||||
case errors.Is(err, ErrClusterMismatch), errors.Is(err, ErrNodeMismatch), errors.Is(err, ErrUnauthorizedChannel), errors.Is(err, ErrLoopDetected):
|
||||
return http.StatusForbidden
|
||||
case ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrRouteIDRequired:
|
||||
case errors.Is(err, ErrRouteExpired), errors.Is(err, ErrTTLExhausted), errors.Is(err, ErrInvalidRoutePath), errors.Is(err, ErrRouteIDRequired), errors.Is(err, ErrForwardEnvelopeInvalid):
|
||||
return http.StatusBadRequest
|
||||
case ErrForwardRuntimeUnavailable:
|
||||
case errors.Is(err, ErrForwardRuntimeUnavailable), errors.Is(err, ErrForwardDisabled):
|
||||
return http.StatusNotImplemented
|
||||
case ErrRouteNotFound:
|
||||
case errors.Is(err, ErrRouteNotFound):
|
||||
return http.StatusNotFound
|
||||
case ErrForwardPeerUnavailable:
|
||||
case errors.Is(err, ErrForwardPeerUnavailable):
|
||||
return http.StatusBadGateway
|
||||
case errors.Is(err, ErrForwardObservationFailed), errors.Is(err, ErrForwardDeliveryFailed):
|
||||
return http.StatusInternalServerError
|
||||
default:
|
||||
return http.StatusBadRequest
|
||||
}
|
||||
|
||||
@@ -23,6 +23,18 @@ import (
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
type testProductionForwardTransport struct {
|
||||
targets map[string]Server
|
||||
}
|
||||
|
||||
func (t testProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
|
||||
target, ok := t.targets[strings.TrimSpace(nextNodeID)]
|
||||
if !ok {
|
||||
return ProductionForwardResult{}, ErrForwardPeerUnavailable
|
||||
}
|
||||
return target.ForwardProduction(ctx, envelope)
|
||||
}
|
||||
|
||||
func TestMeshHealthAcceptsSameCluster(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{Local: local}.Handler())
|
||||
@@ -92,8 +104,9 @@ func TestFabricSessionWebSocketDisabledByDefault(t *testing.T) {
|
||||
func TestFabricSessionWebSocketPingPongAndEvents(t *testing.T) {
|
||||
var events []FabricSessionEventLogEntry
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
events = append(events, entry)
|
||||
},
|
||||
@@ -119,8 +132,9 @@ func TestFabricSessionWebSocketPingPongAndEvents(t *testing.T) {
|
||||
|
||||
func TestFabricSessionWebSocketOpenStreamDataAck(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
@@ -151,8 +165,9 @@ func TestFabricSessionWebSocketOpenStreamDataAck(t *testing.T) {
|
||||
|
||||
func TestFabricSessionWebSocketRequiresToken(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
@@ -172,9 +187,10 @@ func TestFabricSessionWebSocketRequiresSignedAuthorityWhenConfigured(t *testing.
|
||||
t.Fatalf("generate key: %v", err)
|
||||
}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
@@ -196,9 +212,10 @@ func TestFabricSessionWebSocketAcceptsSignedAuthority(t *testing.T) {
|
||||
token := "rap_fsn_signedtest"
|
||||
var events []FabricSessionEventLogEntry
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
events = append(events, entry)
|
||||
},
|
||||
@@ -360,23 +377,20 @@ func TestMeshForwardingGateDeliversFabricControlAtDestination(t *testing.T) {
|
||||
func TestMeshForwardingGateForwardsDirectFabricControlToNextHop(t *testing.T) {
|
||||
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
|
||||
var deliveredObservation ProductionEnvelopeObservation
|
||||
serverC := httptest.NewServer(Server{
|
||||
serverC := Server{
|
||||
Local: nodeC,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
|
||||
deliveredObservation = observation
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer serverC.Close()
|
||||
}
|
||||
|
||||
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
serverB := httptest.NewServer(Server{
|
||||
Local: nodeB,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeC.NodeID: serverC.URL,
|
||||
}),
|
||||
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeC.NodeID: serverC}},
|
||||
}.Handler())
|
||||
defer serverB.Close()
|
||||
|
||||
@@ -414,36 +428,30 @@ func TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath(t *testing.T
|
||||
var deliveredObservation ProductionEnvelopeObservation
|
||||
var nodeREvents []ProductionForwardLogEntry
|
||||
var nodeBEvents []ProductionForwardLogEntry
|
||||
serverC := httptest.NewServer(Server{
|
||||
serverC := Server{
|
||||
Local: nodeC,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
|
||||
deliveredObservation = observation
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer serverC.Close()
|
||||
}
|
||||
|
||||
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
|
||||
serverR := httptest.NewServer(Server{
|
||||
serverR := Server{
|
||||
Local: nodeR,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeC.NodeID: serverC.URL,
|
||||
}),
|
||||
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeC.NodeID: serverC}},
|
||||
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
|
||||
nodeREvents = append(nodeREvents, entry)
|
||||
},
|
||||
}.Handler())
|
||||
defer serverR.Close()
|
||||
}
|
||||
|
||||
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
serverB := httptest.NewServer(Server{
|
||||
Local: nodeB,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeR.NodeID: serverR.URL,
|
||||
}),
|
||||
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeR.NodeID: serverR}},
|
||||
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
|
||||
nodeBEvents = append(nodeBEvents, entry)
|
||||
},
|
||||
@@ -490,7 +498,7 @@ func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
|
||||
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
|
||||
route := configuredProductionRoute("route-1", []string{"node-a", "node-b", "node-r", nodeC.NodeID})
|
||||
var deliveredObservation ProductionEnvelopeObservation
|
||||
serverC := httptest.NewServer(Server{
|
||||
serverC := Server{
|
||||
Local: nodeC,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{route},
|
||||
@@ -498,28 +506,22 @@ func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
|
||||
deliveredObservation = observation
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer serverC.Close()
|
||||
}
|
||||
|
||||
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
|
||||
serverR := httptest.NewServer(Server{
|
||||
serverR := Server{
|
||||
Local: nodeR,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{route},
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeC.NodeID: serverC.URL,
|
||||
}),
|
||||
}.Handler())
|
||||
defer serverR.Close()
|
||||
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeC.NodeID: serverC}},
|
||||
}
|
||||
|
||||
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
serverB := httptest.NewServer(Server{
|
||||
Local: nodeB,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{route},
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeR.NodeID: serverR.URL,
|
||||
}),
|
||||
ProductionForwardTransport: testProductionForwardTransport{targets: map[string]Server{nodeR.NodeID: serverR}},
|
||||
}.Handler())
|
||||
defer serverB.Close()
|
||||
|
||||
@@ -5016,3 +5018,30 @@ func TestSyntheticEndpointDisabledByDefault(t *testing.T) {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusServiceUnavailable)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHTTPDataPlaneDisabledRequiresQUIC(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
|
||||
SyntheticRuntime: NewSyntheticRuntime(SyntheticRuntimeConfig{Enabled: true, Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}),
|
||||
DisableHTTPDataPlane: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/synthetic/probe", "application/json", bytes.NewReader([]byte(`{}`)))
|
||||
if err != nil {
|
||||
t.Fatalf("post synthetic probe: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusGone {
|
||||
t.Fatalf("synthetic status = %d, want %d", resp.StatusCode, http.StatusGone)
|
||||
}
|
||||
|
||||
resp, err = http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader([]byte(`{}`)))
|
||||
if err != nil {
|
||||
t.Fatalf("post production forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusGone {
|
||||
t.Fatalf("forward status = %d, want %d", resp.StatusCode, http.StatusGone)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,268 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
type QUICSyntheticTransport struct {
|
||||
Targets map[string]FabricTransportTarget
|
||||
RouteSets map[string]FabricRouteSet
|
||||
Transport FabricTransport
|
||||
Router FabricChannelRouter
|
||||
Timeout time.Duration
|
||||
Pressure *FabricRoutePressureTracker
|
||||
Health *FabricRouteHealthTracker
|
||||
sequence atomic.Uint64
|
||||
}
|
||||
|
||||
type QUICSyntheticTransportSnapshot struct {
|
||||
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
|
||||
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
|
||||
}
|
||||
|
||||
func NewQUICSyntheticTransportFromRouteSets(routeSets map[string]FabricRouteSet, transport FabricTransport) *QUICSyntheticTransport {
|
||||
normalizedRouteSets := make(map[string]FabricRouteSet, len(routeSets))
|
||||
targets := make(map[string]FabricTransportTarget, len(routeSets))
|
||||
for nodeID, routeSet := range routeSets {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
if nodeID == "" {
|
||||
continue
|
||||
}
|
||||
normalizedRouteSets[nodeID] = routeSet
|
||||
if target, err := FabricTransportTargetForRoute(routeSet.Primary); err == nil {
|
||||
targets[nodeID] = target
|
||||
}
|
||||
}
|
||||
if transport == nil {
|
||||
transport = NewQUICFabricTransport(nil)
|
||||
}
|
||||
return &QUICSyntheticTransport{
|
||||
Targets: targets,
|
||||
RouteSets: normalizedRouteSets,
|
||||
Transport: transport,
|
||||
Router: NewFabricChannelRouter(FabricChannelRouterConfig{
|
||||
MaxAckLatencyMs: 2000,
|
||||
MinRerouteInterval: 50 * time.Millisecond,
|
||||
}),
|
||||
Timeout: 10 * time.Second,
|
||||
Pressure: NewFabricRoutePressureTracker(),
|
||||
Health: NewFabricRouteHealthTracker(30 * time.Second),
|
||||
}
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
|
||||
if t == nil || t.Transport == nil {
|
||||
return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
nextNodeID = strings.TrimSpace(nextNodeID)
|
||||
routeSet, ok := t.RouteSets[nextNodeID]
|
||||
if !ok {
|
||||
target, targetOK := t.Targets[nextNodeID]
|
||||
if !targetOK || strings.TrimSpace(target.Endpoint) == "" {
|
||||
return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
routeSet = FabricRouteSetForTransportTargets(envelope.ClusterID, envelope.From.NodeID, nextNodeID, []FabricTransportTarget{target})
|
||||
}
|
||||
spec := FabricChannelSpec{
|
||||
ChannelID: fmt.Sprintf("synthetic-%d", t.sequence.Add(1)),
|
||||
ClusterID: envelope.ClusterID,
|
||||
SourceNodeID: envelope.From.NodeID,
|
||||
TargetKind: FabricChannelTargetNode,
|
||||
TargetID: nextNodeID,
|
||||
TrafficClass: FabricServiceChannelReliable,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
}
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
return t.sendSyntheticWithRouteSet(ctx, spec, routeSet, payload)
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) sendSyntheticWithRouteSet(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (SyntheticEnvelope, error) {
|
||||
router := t.Router
|
||||
if router.Config.MaxRoutePressure == 0 {
|
||||
router = NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 2000, MinRerouteInterval: 50 * time.Millisecond})
|
||||
}
|
||||
routeSet = t.routeSetForScheduling(routeSet)
|
||||
channel, _, err := router.OpenChannel(spec, routeSet, time.Now().UTC())
|
||||
if err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
timeout := t.Timeout
|
||||
if timeout <= 0 {
|
||||
timeout = 10 * time.Second
|
||||
}
|
||||
for {
|
||||
routeSet = t.routeSetForScheduling(routeSet)
|
||||
route, ok := findFabricRoute(routeSet, channel.RouteID)
|
||||
if !ok {
|
||||
return SyntheticEnvelope{}, ErrFabricRouteNotFound
|
||||
}
|
||||
target, err := FabricTransportTargetForRoute(route)
|
||||
if err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), spec.TargetID)
|
||||
target.MaxPayload = fabricproto.DefaultMaxPayload
|
||||
releaseRoute := t.acquireSyntheticRoute(route.RouteID)
|
||||
session, err := t.Transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
releaseRoute()
|
||||
t.markSyntheticRouteFailure(route.RouteID, err)
|
||||
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "connect_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
if event.Type == FabricChannelRouteEventReroute {
|
||||
continue
|
||||
}
|
||||
if rerouteErr != nil {
|
||||
return SyntheticEnvelope{}, rerouteErr
|
||||
}
|
||||
return SyntheticEnvelope{}, fmt.Errorf("%w: %v", ErrSyntheticPeerUnavailable, err)
|
||||
}
|
||||
response, ackMs, err := t.sendSyntheticOnSession(ctx, session, payload, timeout)
|
||||
_ = session.Close()
|
||||
releaseRoute()
|
||||
if err == nil {
|
||||
t.markSyntheticRouteSuccess(route.RouteID)
|
||||
_, _, _ = router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
AckLatencyMs: ackMs,
|
||||
BytesSent: uint64(len(payload)),
|
||||
FramesSent: 1,
|
||||
BytesRecv: uint64(len(response.Payload)),
|
||||
FramesRecv: 1,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
return decodeQUICSyntheticForwardResponse(response.Payload)
|
||||
}
|
||||
t.markSyntheticRouteFailure(route.RouteID, err)
|
||||
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
|
||||
ChannelID: spec.ChannelID,
|
||||
RouteID: route.RouteID,
|
||||
Failed: true,
|
||||
Reason: "response_failed",
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}, time.Now().UTC())
|
||||
channel = updated
|
||||
if event.Type == FabricChannelRouteEventReroute {
|
||||
continue
|
||||
}
|
||||
if rerouteErr != nil {
|
||||
return SyntheticEnvelope{}, rerouteErr
|
||||
}
|
||||
return SyntheticEnvelope{}, fmt.Errorf("%w: %v", ErrSyntheticPeerUnavailable, err)
|
||||
}
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
|
||||
if t != nil && t.Health != nil {
|
||||
routeSet = t.Health.Apply(routeSet, time.Now().UTC())
|
||||
}
|
||||
if t != nil && t.Pressure != nil {
|
||||
routeSet = t.Pressure.Apply(routeSet)
|
||||
}
|
||||
return routeSet
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) acquireSyntheticRoute(routeID string) func() {
|
||||
if t == nil || t.Pressure == nil {
|
||||
return func() {}
|
||||
}
|
||||
return t.Pressure.Acquire(routeID)
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) markSyntheticRouteFailure(routeID string, err error) {
|
||||
if t == nil || t.Health == nil || err == nil {
|
||||
return
|
||||
}
|
||||
t.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) markSyntheticRouteSuccess(routeID string) {
|
||||
if t == nil || t.Health == nil {
|
||||
return
|
||||
}
|
||||
t.Health.MarkSuccess(routeID)
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) Snapshot() QUICSyntheticTransportSnapshot {
|
||||
if t == nil {
|
||||
return QUICSyntheticTransportSnapshot{}
|
||||
}
|
||||
var pressure FabricRoutePressureSnapshot
|
||||
if t.Pressure != nil {
|
||||
pressure = t.Pressure.SnapshotPressure()
|
||||
}
|
||||
var health FabricRouteHealthSnapshot
|
||||
if t.Health != nil {
|
||||
health = t.Health.Snapshot(time.Now().UTC())
|
||||
}
|
||||
return QUICSyntheticTransportSnapshot{RoutePressure: pressure, RouteHealth: health}
|
||||
}
|
||||
|
||||
func (t *QUICSyntheticTransport) sendSyntheticOnSession(ctx context.Context, session FabricTransportSession, payload []byte, timeout time.Duration) (fabricproto.Frame, int64, error) {
|
||||
sequence := t.sequence.Add(1)
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: SyntheticForwardQUICStreamID,
|
||||
Sequence: sequence,
|
||||
Payload: payload,
|
||||
}); err != nil {
|
||||
return fabricproto.Frame{}, 0, err
|
||||
}
|
||||
waitCtx := ctx
|
||||
if timeout > 0 {
|
||||
var cancel context.CancelFunc
|
||||
waitCtx, cancel = context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
}
|
||||
started := time.Now()
|
||||
for {
|
||||
select {
|
||||
case <-waitCtx.Done():
|
||||
return fabricproto.Frame{}, 0, waitCtx.Err()
|
||||
case err, ok := <-session.Errors():
|
||||
if !ok {
|
||||
return fabricproto.Frame{}, 0, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
if err != nil {
|
||||
return fabricproto.Frame{}, 0, err
|
||||
}
|
||||
case frame, ok := <-session.Frames():
|
||||
if !ok {
|
||||
return fabricproto.Frame{}, 0, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != SyntheticForwardQUICStreamID || frame.Sequence != sequence {
|
||||
continue
|
||||
}
|
||||
return frame, time.Since(started).Milliseconds(), nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func decodeQUICSyntheticForwardResponse(payload []byte) (SyntheticEnvelope, error) {
|
||||
var response quicSyntheticForwardResponse
|
||||
if err := json.Unmarshal(payload, &response); err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
if strings.TrimSpace(response.Error) != "" {
|
||||
return SyntheticEnvelope{}, fmt.Errorf("%w: %s", ErrSyntheticPeerUnavailable, response.Error)
|
||||
}
|
||||
return response.Envelope, nil
|
||||
}
|
||||
@@ -0,0 +1,223 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/json"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
func TestQUICSyntheticTransportReroutesOnConnectFailure(t *testing.T) {
|
||||
transport := newFakeSyntheticFabricTransport()
|
||||
transport.failConnect["quic://dead.example.test:19443"] = true
|
||||
transport.responses["quic://fast.example.test:19443"] = testSyntheticAckEnvelope("route-1", 1)
|
||||
forward := NewQUICSyntheticTransportFromRouteSets(map[string]FabricRouteSet{
|
||||
"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
|
||||
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
|
||||
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
|
||||
}),
|
||||
}, transport)
|
||||
forward.Timeout = time.Second
|
||||
|
||||
ack, err := forward.SendSynthetic(context.Background(), "node-b", testSyntheticEnvelope("route-1", 1))
|
||||
if err != nil {
|
||||
t.Fatalf("send synthetic: %v", err)
|
||||
}
|
||||
if ack.RouteID != "route-1" || ack.MessageType != SyntheticMessageRouteHealthAck {
|
||||
t.Fatalf("ack = %+v", ack)
|
||||
}
|
||||
if got := transport.connectCount("quic://dead.example.test:19443"); got != 1 {
|
||||
t.Fatalf("dead connect count = %d, want 1", got)
|
||||
}
|
||||
if got := transport.connectCount("quic://fast.example.test:19443"); got != 1 {
|
||||
t.Fatalf("fast connect count = %d, want 1", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestQUICFabricServerHandlesSyntheticFrames(t *testing.T) {
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: testQUICTLSConfig(t),
|
||||
SyntheticForwardHandler: func(_ context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
|
||||
return testSyntheticAckEnvelope(envelope.RouteID, envelope.Sequence), nil
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
defer cancel()
|
||||
session, err := NewQUICFabricTransport(nil).Connect(ctx, FabricTransportTarget{
|
||||
Endpoint: server.Addr().String(),
|
||||
TLSConfig: &tls.Config{
|
||||
InsecureSkipVerify: true,
|
||||
NextProtos: []string{fabricQUICNextProto},
|
||||
},
|
||||
Timeout: time.Second,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("connect: %v", err)
|
||||
}
|
||||
defer session.Close()
|
||||
|
||||
payload, err := json.Marshal(testSyntheticEnvelope("route-1", 7))
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: SyntheticForwardQUICStreamID,
|
||||
Sequence: 42,
|
||||
Payload: payload,
|
||||
}); err != nil {
|
||||
t.Fatalf("send synthetic frame: %v", err)
|
||||
}
|
||||
select {
|
||||
case frame := <-session.Frames():
|
||||
if frame.StreamID != SyntheticForwardQUICStreamID || frame.Sequence != 42 {
|
||||
t.Fatalf("frame = %+v", frame)
|
||||
}
|
||||
ack, err := decodeQUICSyntheticForwardResponse(frame.Payload)
|
||||
if err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if ack.RouteID != "route-1" || ack.MessageType != SyntheticMessageRouteHealthAck || ack.Sequence != 7 {
|
||||
t.Fatalf("ack = %+v", ack)
|
||||
}
|
||||
case err := <-session.Errors():
|
||||
t.Fatalf("session error: %v", err)
|
||||
case <-ctx.Done():
|
||||
t.Fatal(ctx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
type fakeSyntheticFabricTransport struct {
|
||||
mu sync.Mutex
|
||||
failConnect map[string]bool
|
||||
responses map[string]SyntheticEnvelope
|
||||
connects map[string]int
|
||||
}
|
||||
|
||||
func newFakeSyntheticFabricTransport() *fakeSyntheticFabricTransport {
|
||||
return &fakeSyntheticFabricTransport{
|
||||
failConnect: map[string]bool{},
|
||||
responses: map[string]SyntheticEnvelope{},
|
||||
connects: map[string]int{},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *fakeSyntheticFabricTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
|
||||
endpoint := target.Endpoint
|
||||
t.mu.Lock()
|
||||
t.connects[endpoint]++
|
||||
fail := t.failConnect[endpoint]
|
||||
response := t.responses[endpoint]
|
||||
t.mu.Unlock()
|
||||
if fail {
|
||||
return nil, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
return &fakeSyntheticFabricSession{
|
||||
response: response,
|
||||
frames: make(chan fabricproto.Frame, 16),
|
||||
errors: make(chan error, 1),
|
||||
done: make(chan struct{}),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (t *fakeSyntheticFabricTransport) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (t *fakeSyntheticFabricTransport) connectCount(endpoint string) int {
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
return t.connects[endpoint]
|
||||
}
|
||||
|
||||
type fakeSyntheticFabricSession struct {
|
||||
response SyntheticEnvelope
|
||||
frames chan fabricproto.Frame
|
||||
errors chan error
|
||||
done chan struct{}
|
||||
once sync.Once
|
||||
}
|
||||
|
||||
func (s *fakeSyntheticFabricSession) Send(_ context.Context, frame fabricproto.Frame) error {
|
||||
if frame.Type != fabricproto.FrameData {
|
||||
return nil
|
||||
}
|
||||
responsePayload, _ := json.Marshal(quicSyntheticForwardResponse{Envelope: s.response})
|
||||
go func() {
|
||||
select {
|
||||
case <-s.done:
|
||||
case s.frames <- fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: frame.TrafficClass,
|
||||
StreamID: frame.StreamID,
|
||||
Sequence: frame.Sequence,
|
||||
Payload: responsePayload,
|
||||
}:
|
||||
}
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeSyntheticFabricSession) Frames() <-chan fabricproto.Frame {
|
||||
return s.frames
|
||||
}
|
||||
|
||||
func (s *fakeSyntheticFabricSession) Errors() <-chan error {
|
||||
return s.errors
|
||||
}
|
||||
|
||||
func (s *fakeSyntheticFabricSession) Close() error {
|
||||
s.once.Do(func() {
|
||||
close(s.done)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *fakeSyntheticFabricSession) Closed() bool {
|
||||
select {
|
||||
case <-s.done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func testSyntheticEnvelope(routeID string, sequence uint64) SyntheticEnvelope {
|
||||
now := time.Now().UTC()
|
||||
return SyntheticEnvelope{
|
||||
ProtocolVersion: ProtocolVersion,
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-a",
|
||||
From: PeerIdentity{ClusterID: "cluster-a", NodeID: "node-a"},
|
||||
To: PeerIdentity{ClusterID: "cluster-a", NodeID: "node-b"},
|
||||
Channel: SyntheticChannelFabricControl,
|
||||
MessageType: SyntheticMessageRouteHealth,
|
||||
TTL: 8,
|
||||
HopCount: 1,
|
||||
Visited: []string{"node-a"},
|
||||
Sequence: sequence,
|
||||
SentAt: now,
|
||||
}
|
||||
}
|
||||
|
||||
func testSyntheticAckEnvelope(routeID string, sequence uint64) SyntheticEnvelope {
|
||||
ack := testSyntheticEnvelope(routeID, sequence)
|
||||
ack.From = PeerIdentity{ClusterID: "cluster-a", NodeID: "node-b"}
|
||||
ack.To = PeerIdentity{ClusterID: "cluster-a", NodeID: "node-a"}
|
||||
ack.MessageType = SyntheticMessageRouteHealthAck
|
||||
ack.Visited = []string{"node-a", "node-b"}
|
||||
return ack
|
||||
}
|
||||
Reference in New Issue
Block a user