Refactor RDP proxy handling and update related tests

This commit is contained in:
2026-05-17 20:38:35 +03:00
parent 8e9402580f
commit d551e57fd5
172 changed files with 22117 additions and 2509 deletions
@@ -0,0 +1,17 @@
# Build stage: compiles the fabric-loadtest command from the rap-node-agent module.
FROM golang:1.25-bookworm AS build
WORKDIR /src
# The module manifests are copied and downloaded before the sources are copied,
# so the dependency download is a separate layer from the source tree.
COPY agents/rap-node-agent/go.mod ./
COPY agents/rap-node-agent/go.sum ./
RUN go mod download
COPY agents/rap-node-agent/ ./
# Build with cgo disabled for linux/amd64 and write the binary to /out.
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/fabric-loadtest ./cmd/fabric-loadtest
# Runtime stage: Debian slim image with CA certificates plus the iproute2,
# iptables, ping and procps tools; the apt package lists are removed afterwards.
FROM debian:bookworm-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends ca-certificates iproute2 iptables iputils-ping procps \
&& rm -rf /var/lib/apt/lists/*
# Only the compiled binary is carried over from the build stage.
COPY --from=build /out/fabric-loadtest /usr/local/bin/fabric-loadtest
ENTRYPOINT ["/usr/local/bin/fabric-loadtest"]
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,760 @@
package main
import (
"bytes"
"context"
"strings"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
)
func TestRouteModeCoverageVerdictRequiresMixedModes(t *testing.T) {
report := loadtestReport{
Config: loadtestConfig{
TopologyProfile: "mixed-public-nat-lan-relay",
Targets: []string{"a", "b", "c", "d"},
FailTarget: -1,
ImpairTarget: -1,
},
SuccessfulStreams: 4,
TargetStats: map[string]targetStats{
"a": {RouteModes: map[string]int{string(mesh.FabricRouteLAN): 1}},
"b": {RouteModes: map[string]int{string(mesh.FabricRouteICE): 1}},
"c": {RouteModes: map[string]int{string(mesh.FabricRouteReverse): 1}},
"d": {RouteModes: map[string]int{}},
},
}
reasons := routeModeCoverageVerdictReasons(report)
if len(reasons) != 1 || !strings.Contains(reasons[0], string(mesh.FabricRouteRelay)) {
t.Fatalf("reasons = %v, want missing relay route mode", reasons)
}
report.TargetStats["d"] = targetStats{RouteModes: map[string]int{string(mesh.FabricRouteRelay): 1}}
if reasons := routeModeCoverageVerdictReasons(report); len(reasons) != 0 {
t.Fatalf("reasons = %v, want full coverage pass", reasons)
}
}
// TestLegacyRouteModeVerdictRejectsNonQUICModes expects
// legacyRouteModeVerdictReasons to return one reason listing the "relay",
// "outbound_reverse" and "wss" counts for a target that recorded them, and no
// reasons when the target recorded only the mesh.FabricRoute* modes.
func TestLegacyRouteModeVerdictRejectsNonQUICModes(t *testing.T) {
	legacyMix := map[string]int{
		"direct_quic":      4,
		"relay":            1,
		"outbound_reverse": 2,
		"wss":              3,
	}
	report := loadtestReport{
		TargetStats: map[string]targetStats{"a": {RouteModes: legacyMix}},
	}

	rejected := legacyRouteModeVerdictReasons(report)
	if len(rejected) != 1 {
		t.Fatalf("reasons = %v, want legacy route mode failure", rejected)
	}
	for _, fragment := range []string{"relay:1", "outbound_reverse:2", "wss:3"} {
		if !strings.Contains(rejected[0], fragment) {
			t.Fatalf("reasons = %v, want legacy route mode failure", rejected)
		}
	}

	report.TargetStats["a"] = targetStats{RouteModes: map[string]int{
		string(mesh.FabricRouteDirect):  1,
		string(mesh.FabricRouteLAN):     1,
		string(mesh.FabricRouteICE):     1,
		string(mesh.FabricRouteReverse): 1,
		string(mesh.FabricRouteRelay):   1,
	}}
	accepted := legacyRouteModeVerdictReasons(report)
	if len(accepted) > 0 {
		t.Fatalf("reasons = %v, want QUIC modes accepted", accepted)
	}
}
// TestTargetEndpointPolicyVerdictRejectsNonQUICTargets expects
// targetEndpointPolicyVerdictReasons to return one reason naming every target
// that is not a quic:// endpoint (the empty target shown as "<empty>"), and no
// reasons for quic targets, including one with surrounding spaces and an
// upper-case scheme.
func TestTargetEndpointPolicyVerdictRejectsNonQUICTargets(t *testing.T) {
	var report loadtestReport
	report.Config.Targets = []string{
		"quic://a:19443",
		"http://b:19443",
		"ws://c:19443",
		"d:19443",
		"",
	}

	violations := targetEndpointPolicyVerdictReasons(report)
	if len(violations) != 1 {
		t.Fatalf("reasons = %v, want non-QUIC target failure", violations)
	}
	for _, offender := range []string{"http://b:19443", "ws://c:19443", "d:19443", "<empty>"} {
		if !strings.Contains(violations[0], offender) {
			t.Fatalf("reasons = %v, want non-QUIC target failure", violations)
		}
	}

	report.Config.Targets = []string{"quic://a:19443", " QUIC://b:19443 "}
	if accepted := targetEndpointPolicyVerdictReasons(report); len(accepted) > 0 {
		t.Fatalf("reasons = %v, want QUIC targets accepted", accepted)
	}
}
func TestRunClientRejectsNonQUICTargetBeforeDial(t *testing.T) {
_, err := runClient(context.Background(), loadtestConfig{
Targets: []string{"http://127.0.0.1:19443"},
Streams: 1,
Concurrency: 1,
BytesPerStream: 1,
PayloadSize: 1,
})
if err == nil || !strings.Contains(err.Error(), "non_quic_targets=http://127.0.0.1:19443") {
t.Fatalf("err = %v, want non-QUIC target validation error", err)
}
}
// TestFillLoadtestPayloadVariesByStreamAndSequence fills three 128-byte
// buffers with different fillLoadtestPayload arguments and checks that the
// contents differ between them and that the first buffer is not one repeated
// byte.
func TestFillLoadtestPayloadVariesByStreamAndSequence(t *testing.T) {
	const size = 128
	base := make([]byte, size)
	nextChunk := make([]byte, size)
	otherStream := make([]byte, size)

	fillLoadtestPayload(base, 7, 9, 1, 0)
	fillLoadtestPayload(nextChunk, 7, 9, 2, int64(len(base)))
	fillLoadtestPayload(otherStream, 8, 10, 1, 0)

	switch {
	case bytes.Equal(base, nextChunk):
		t.Fatal("payload did not vary by sequence/offset")
	case bytes.Equal(base, otherStream):
		t.Fatal("payload did not vary by stream")
	case bytes.Count(base, []byte{base[0]}) == len(base):
		t.Fatal("payload collapsed to a constant byte")
	}
}
// TestFillLoadtestPayloadIsDeterministic fills two buffers with identical
// fillLoadtestPayload arguments and requires identical contents.
func TestFillLoadtestPayloadIsDeterministic(t *testing.T) {
	var buffers [2][]byte
	for i := range buffers {
		buffers[i] = make([]byte, 128)
		fillLoadtestPayload(buffers[i], 7, 9, 1, 0)
	}
	if bytes.Equal(buffers[0], buffers[1]) {
		return
	}
	t.Fatal("payload is not deterministic")
}
// TestFillLoadtestPayloadHandlesShortFinalChunk fills a 17-byte buffer and
// requires that it does not remain all zeroes.
func TestFillLoadtestPayloadHandlesShortFinalChunk(t *testing.T) {
	const tail = 17
	short := make([]byte, tail)
	fillLoadtestPayload(short, 7, 9, 3, 256)

	untouched := make([]byte, len(short))
	if bytes.Equal(short, untouched) {
		t.Fatal("short payload chunk stayed zeroed")
	}
}
// TestVerdictFailsSuccessfulStreamAckMismatch expects verdict to return "fail"
// with the reason "ack_mismatched_streams=1" for a report whose only stream
// succeeded but is counted as ACK-mismatched.
func TestVerdictFailsSuccessfulStreamAckMismatch(t *testing.T) {
	var report loadtestReport
	report.Config = loadtestConfig{FailTarget: -1, ImpairTarget: -1, Concurrency: 1}
	report.TotalStreams = 1
	report.SuccessfulStreams = 1
	report.BytesSent = 1024
	report.FramesSent = 2
	report.AcksReceived = 1
	report.AckMismatchedStreams = 1
	report.ChannelOpens = 1
	report.ChannelCloses = 1
	report.RoutePressure = mesh.FabricRoutePressureSnapshot{AcquiredTotal: 1, ReleasedTotal: 1, MaxActiveTotal: 1}

	outcome, reasons := verdict(report)
	if outcome != "fail" {
		t.Fatalf("verdict = %q, want fail", outcome)
	}

	sawMismatch := false
	for i := 0; i < len(reasons) && !sawMismatch; i++ {
		sawMismatch = reasons[i] == "ack_mismatched_streams=1"
	}
	if !sawMismatch {
		t.Fatalf("reasons = %v, want ack mismatch reason", reasons)
	}
}
// TestVerdictFailsAckIntegrityError expects verdict to return "fail" with the
// reason "ack_integrity_errors=1" for a report carrying one ACK integrity
// error.
func TestVerdictFailsAckIntegrityError(t *testing.T) {
	var report loadtestReport
	report.Config = loadtestConfig{FailTarget: -1, ImpairTarget: -1, Concurrency: 1}
	report.TotalStreams = 1
	report.FailedStreams = 1
	report.BytesSent = 1024
	report.FramesSent = 1
	report.AcksReceived = 1
	report.AckIntegrityErrors = 1
	report.ChannelOpens = 1
	report.ChannelCloses = 1
	report.RoutePressure = mesh.FabricRoutePressureSnapshot{AcquiredTotal: 1, ReleasedTotal: 1, MaxActiveTotal: 1}

	outcome, reasons := verdict(report)
	if outcome != "fail" {
		t.Fatalf("verdict = %q, want fail", outcome)
	}

	sawIntegrity := false
	for i := 0; i < len(reasons) && !sawIntegrity; i++ {
		sawIntegrity = reasons[i] == "ack_integrity_errors=1"
	}
	if !sawIntegrity {
		t.Fatalf("reasons = %v, want ack integrity reason", reasons)
	}
}
func TestVerdictFailsBelowMinimumThroughput(t *testing.T) {
report := loadtestReport{
Config: loadtestConfig{
FailTarget: -1,
ImpairTarget: -1,
Concurrency: 1,
MinThroughputMbps: 100,
},
TotalStreams: 1,
SuccessfulStreams: 1,
BytesSent: 1024,
FramesSent: 1,
AcksReceived: 1,
ThroughputBps: 99 * 1000 * 1000,
ChannelOpens: 1,
ChannelCloses: 1,
RoutePressure: mesh.FabricRoutePressureSnapshot{AcquiredTotal: 1, ReleasedTotal: 1, MaxActiveTotal: 1},
}
gotVerdict, reasons := verdict(report)
if gotVerdict != "fail" {
t.Fatalf("verdict = %q, want fail", gotVerdict)
}
found := false
for _, reason := range reasons {
if strings.HasPrefix(reason, "throughput_bps=") {
found = true
}
}
if !found {
t.Fatalf("reasons = %v, want throughput reason", reasons)
}
report.ThroughputBps = 100 * 1000 * 1000
if gotVerdict, reasons := verdict(report); gotVerdict != "pass" {
t.Fatalf("verdict = %q reasons=%v, want pass at threshold", gotVerdict, reasons)
}
}
func TestVerdictFailsBelowMinimumChannelChurn(t *testing.T) {
report := loadtestReport{
Config: loadtestConfig{
FailTarget: -1,
ImpairTarget: -1,
Concurrency: 1,
MinChannelChurn: 1000,
},
TotalStreams: 1,
SuccessfulStreams: 1,
BytesSent: 1024,
FramesSent: 1,
AcksReceived: 1,
ChannelOpens: 1,
ChannelCloses: 1,
ChannelChurnPerSec: 999,
RoutePressure: mesh.FabricRoutePressureSnapshot{AcquiredTotal: 1, ReleasedTotal: 1, MaxActiveTotal: 1},
}
gotVerdict, reasons := verdict(report)
if gotVerdict != "fail" {
t.Fatalf("verdict = %q, want fail", gotVerdict)
}
found := false
for _, reason := range reasons {
if strings.HasPrefix(reason, "channel_churn_per_sec=") {
found = true
}
}
if !found {
t.Fatalf("reasons = %v, want channel churn reason", reasons)
}
report.ChannelChurnPerSec = 1000
if gotVerdict, reasons := verdict(report); gotVerdict != "pass" {
t.Fatalf("verdict = %q reasons=%v, want pass at threshold", gotVerdict, reasons)
}
}
// TestTargetByteDistributionVerdictDetectsSkew expects
// targetByteDistributionVerdictReasons to return one
// "target_byte_distribution_skew=" reason when one of four equally-streamed
// targets carried 2500 of 4000 bytes, and none when each carried 1000.
func TestTargetByteDistributionVerdictDetectsSkew(t *testing.T) {
	report := loadtestReport{
		Config: loadtestConfig{
			Targets:        []string{"a", "b", "c", "d"},
			FailTarget:     -1,
			ImpairTarget:   -1,
			Concurrency:    1,
			BytesPerStream: 100,
		},
		SuccessfulStreams: 40,
		BytesSent:         4000,
		TargetStreams:     map[string]int{"a": 10, "b": 10, "c": 10, "d": 10},
		TargetBytes:       map[string]int64{"a": 2500, "b": 500, "c": 500, "d": 500},
	}

	skewed := targetByteDistributionVerdictReasons(report)
	if !(len(skewed) == 1 && strings.HasPrefix(skewed[0], "target_byte_distribution_skew=")) {
		t.Fatalf("reasons = %v, want byte skew reason", skewed)
	}

	report.TargetBytes = map[string]int64{"a": 1000, "b": 1000, "c": 1000, "d": 1000}
	balanced := targetByteDistributionVerdictReasons(report)
	if len(balanced) > 0 {
		t.Fatalf("reasons = %v, want balanced bytes pass", balanced)
	}
}
// TestDistributionVerdictChecksSurvivingTargetsAfterFailure sets FailTarget to
// index 0 of four targets. It expects targetDistributionVerdictReasons to
// report "target_distribution_collapsed=1/3_targets_used" when all 90 streams
// landed on one remaining target, and nothing when the other three targets
// took 30 streams each.
func TestDistributionVerdictChecksSurvivingTargetsAfterFailure(t *testing.T) {
	var report loadtestReport
	report.Config = loadtestConfig{
		Targets:      []string{"quic://a:1", "quic://b:1", "quic://c:1", "quic://d:1"},
		FailTarget:   0,
		ImpairTarget: -1,
		Concurrency:  8,
	}
	report.SuccessfulStreams = 90
	report.TargetStreams = map[string]int{"quic://b:1": 90}

	collapsed := targetDistributionVerdictReasons(report)
	if !(len(collapsed) == 1 && strings.HasPrefix(collapsed[0], "target_distribution_collapsed=1/3_targets_used")) {
		t.Fatalf("reasons = %v, want surviving-target collapse", collapsed)
	}

	report.TargetStreams = map[string]int{"quic://b:1": 30, "quic://c:1": 30, "quic://d:1": 30}
	balanced := targetDistributionVerdictReasons(report)
	if len(balanced) > 0 {
		t.Fatalf("reasons = %v, want balanced surviving targets pass", balanced)
	}
}
// TestRoutePressureVerdictChecksSurvivingTargetsAfterFailure sets FailTarget to
// index 0 of four targets. It expects routePressureDistributionVerdictReasons
// to report "route_pressure_distribution_collapsed=1/3_targets_used" when only
// one route shows active pressure, and nothing when the three remaining routes
// peak at 4 each.
func TestRoutePressureVerdictChecksSurvivingTargetsAfterFailure(t *testing.T) {
	targets := []string{"quic://a:1", "quic://b:1", "quic://c:1", "quic://d:1"}

	var report loadtestReport
	report.Config = loadtestConfig{Targets: targets, FailTarget: 0, ImpairTarget: -1, Concurrency: 12}
	report.RoutePressure = mesh.FabricRoutePressureSnapshot{
		MaxActive:      map[string]int{loadtestRouteID(1, targets[1]): 12},
		MaxActiveTotal: 12,
	}

	collapsed := routePressureDistributionVerdictReasons(report)
	if !(len(collapsed) == 1 && strings.HasPrefix(collapsed[0], "route_pressure_distribution_collapsed=1/3_targets_used")) {
		t.Fatalf("reasons = %v, want surviving-route-pressure collapse", collapsed)
	}

	report.RoutePressure.MaxActive = map[string]int{
		loadtestRouteID(1, targets[1]): 4,
		loadtestRouteID(2, targets[2]): 4,
		loadtestRouteID(3, targets[3]): 4,
	}
	balanced := routePressureDistributionVerdictReasons(report)
	if len(balanced) > 0 {
		t.Fatalf("reasons = %v, want balanced surviving route pressure pass", balanced)
	}
}
func TestVerdictFailsOverallAckLatencySLO(t *testing.T) {
report := loadtestReport{
Config: loadtestConfig{
FailTarget: -1,
ImpairTarget: -1,
Concurrency: 1,
MaxAckP95Ms: 10,
MaxAckP99Ms: 20,
},
TotalStreams: 1,
SuccessfulStreams: 1,
BytesSent: 1024,
FramesSent: 1,
AcksReceived: 1,
AckP95Ms: 11,
AckP99Ms: 21,
ChannelOpens: 1,
ChannelCloses: 1,
RoutePressure: mesh.FabricRoutePressureSnapshot{AcquiredTotal: 1, ReleasedTotal: 1, MaxActiveTotal: 1},
}
gotVerdict, reasons := verdict(report)
if gotVerdict != "fail" {
t.Fatalf("verdict = %q, want fail", gotVerdict)
}
foundP95 := false
foundP99 := false
for _, reason := range reasons {
if strings.HasPrefix(reason, "ack_p95_ms=") {
foundP95 = true
}
if strings.HasPrefix(reason, "ack_p99_ms=") {
foundP99 = true
}
}
if !foundP95 || !foundP99 {
t.Fatalf("reasons = %v, want ACK p95 and p99 reasons", reasons)
}
}
// TestTargetAckVerdictDetectsSlowHealthyTarget sets MaxTargetAckMs to 10 and
// expects targetAckVerdictReasons to flag target "b" at 11 ms with a reason
// starting "target_ack_ms=b:11>10", and to return nothing at exactly 10 ms.
func TestTargetAckVerdictDetectsSlowHealthyTarget(t *testing.T) {
	var report loadtestReport
	report.Config = loadtestConfig{
		Targets:        []string{"a", "b"},
		FailTarget:     -1,
		ImpairTarget:   -1,
		MaxTargetAckMs: 10,
	}
	report.TargetStats = map[string]targetStats{
		"a": {Streams: 10, MaxAckMs: 4},
		"b": {Streams: 10, MaxAckMs: 11},
	}

	slow := targetAckVerdictReasons(report)
	if !(len(slow) == 1 && strings.HasPrefix(slow[0], "target_ack_ms=b:11>10")) {
		t.Fatalf("reasons = %v, want slow target ack reason", slow)
	}

	report.TargetStats["b"] = targetStats{Streams: 10, MaxAckMs: 10}
	atLimit := targetAckVerdictReasons(report)
	if len(atLimit) > 0 {
		t.Fatalf("reasons = %v, want target ack pass at threshold", atLimit)
	}
}
func TestVerdictFailsSetupLatencySLO(t *testing.T) {
report := loadtestReport{
Config: loadtestConfig{
FailTarget: -1,
ImpairTarget: -1,
Concurrency: 1,
MaxSetupP95Ms: 10,
MaxSetupP99Ms: 20,
},
TotalStreams: 1,
SuccessfulStreams: 1,
BytesSent: 1024,
FramesSent: 1,
AcksReceived: 1,
SetupLatencyP95Ms: 11,
SetupLatencyP99Ms: 21,
ChannelOpens: 1,
ChannelCloses: 1,
RoutePressure: mesh.FabricRoutePressureSnapshot{AcquiredTotal: 1, ReleasedTotal: 1, MaxActiveTotal: 1},
}
gotVerdict, reasons := verdict(report)
if gotVerdict != "fail" {
t.Fatalf("verdict = %q, want fail", gotVerdict)
}
foundP95 := false
foundP99 := false
for _, reason := range reasons {
if strings.HasPrefix(reason, "setup_p95_ms=") {
foundP95 = true
}
if strings.HasPrefix(reason, "setup_p99_ms=") {
foundP99 = true
}
}
if !foundP95 || !foundP99 {
t.Fatalf("reasons = %v, want setup p95 and p99 reasons", reasons)
}
}
func TestVerdictFailsRerouteLatencySLO(t *testing.T) {
report := loadtestReport{
Config: loadtestConfig{
FailTarget: -1,
ImpairTarget: -1,
Concurrency: 1,
MaxRerouteP95Ms: 10,
MaxRerouteP99Ms: 20,
},
TotalStreams: 1,
SuccessfulStreams: 1,
BytesSent: 1024,
FramesSent: 1,
AcksReceived: 1,
RerouteLatencyP95Ms: 11,
RerouteLatencyP99Ms: 21,
ChannelOpens: 1,
ChannelCloses: 1,
RoutePressure: mesh.FabricRoutePressureSnapshot{AcquiredTotal: 1, ReleasedTotal: 1, MaxActiveTotal: 1},
}
gotVerdict, reasons := verdict(report)
if gotVerdict != "fail" {
t.Fatalf("verdict = %q, want fail", gotVerdict)
}
foundP95 := false
foundP99 := false
for _, reason := range reasons {
if strings.HasPrefix(reason, "reroute_p95_ms=") {
foundP95 = true
}
if strings.HasPrefix(reason, "reroute_p99_ms=") {
foundP99 = true
}
}
if !foundP95 || !foundP99 {
t.Fatalf("reasons = %v, want reroute p95 and p99 reasons", reasons)
}
}
// TestShouldQuarantineTarget lists failure reasons for which
// shouldQuarantineTarget must return true, and checks that a checksum mismatch
// and a "context deadline exceeded" reason return false.
func TestShouldQuarantineTarget(t *testing.T) {
	for _, reason := range []string{
		"ack timeout or session closed",
		"deadline exceeded",
		"connection refused",
		"connection reset by peer",
		"no route to host",
	} {
		if shouldQuarantineTarget(reason) {
			continue
		}
		t.Fatalf("shouldQuarantineTarget(%q) = false, want true", reason)
	}

	switch {
	case shouldQuarantineTarget("ack payload checksum mismatch"):
		t.Fatal("checksum mismatch should not quarantine a target")
	case shouldQuarantineTarget("context deadline exceeded"):
		t.Fatal("context deadline should not quarantine a target")
	}
}
// TestSpreadStartDistributesQuarantinedSlot marks target "a" degraded and
// walks every fourth stream index below 40 through loadtestSpreadStart and
// loadtestPreferredTargetIndex. Each of "b", "c" and "d" must be chosen at
// least once.
func TestSpreadStartDistributesQuarantinedSlot(t *testing.T) {
	targets := []string{"a", "b", "c", "d"}
	health := newTargetHealthTracker()
	health.MarkDegraded("a", "connection refused", time.Minute)

	picks := make(map[string]int)
	stride := len(targets)
	for index := 0; index < 40; index += stride {
		initial, spread := loadtestSpreadStart(index, stride)
		chosen := loadtestPreferredTargetIndex(targets, initial, spread, health, -1)
		picks[targets[chosen]]++
	}

	for _, survivor := range []string{"b", "c", "d"} {
		if picks[survivor] == 0 {
			t.Fatalf("counts = %v, want degraded slot spread across surviving targets", picks)
		}
	}
}
// TestSpreadUsableTargetDistributesRetries marks target "a" degraded and calls
// loadtestSpreadUsableTargetIndex for cohorts 0 through 89. Each of "b", "c"
// and "d" must be chosen exactly 30 times.
func TestSpreadUsableTargetDistributesRetries(t *testing.T) {
	targets := []string{"a", "b", "c", "d"}
	health := newTargetHealthTracker()
	health.MarkDegraded("a", "connection refused", time.Minute)

	picks := make(map[string]int)
	const cohorts = 90
	for cohort := 0; cohort < cohorts; cohort++ {
		chosen := loadtestSpreadUsableTargetIndex(targets, cohort, health, 0)
		picks[targets[chosen]]++
	}

	for _, survivor := range []string{"b", "c", "d"} {
		if picks[survivor] != 30 {
			t.Fatalf("counts = %v, want retry load spread evenly across surviving targets", picks)
		}
	}
}
// TestLoadtestLogicalStreamIDAvoidsReservedTransportStreams checks, for a set
// of indexes including a negative one, that loadtestLogicalStreamID returns an
// ID that differs from both reserved mesh QUIC stream IDs and is at least
// 10_000.
func TestLoadtestLogicalStreamIDAvoidsReservedTransportStreams(t *testing.T) {
	indexes := []int{-1, 0, 1, 999, 1000, 10_000}
	for _, index := range indexes {
		id := loadtestLogicalStreamID(index)
		switch {
		case id == mesh.ProductionForwardQUICStreamID, id == mesh.SyntheticForwardQUICStreamID:
			t.Fatalf("loadtestLogicalStreamID(%d) = %d, collides with reserved transport stream", index, id)
		case id < 10_000:
			t.Fatalf("loadtestLogicalStreamID(%d) = %d, want loadtest stream range", index, id)
		}
	}
}
func TestLatencyAwareTargetIndexKeepsSlowWANFromOwningPool(t *testing.T) {
targets := []string{"lan-a", "lan-b", "wan"}
health := newTargetHealthTracker()
health.RecordProbes([]targetProbeResult{
{Target: "lan-a", RTTMs: 4, Usable: true},
{Target: "lan-b", RTTMs: 5, Usable: true},
{Target: "wan", RTTMs: 400, Usable: true},
})
counts := map[string]int{}
for index := 0; index < 300; index++ {
targetIndex := loadtestSpreadUsableTargetIndex(targets, index, health, -1)
counts[targets[targetIndex]]++
}
if counts["wan"] == 0 {
t.Fatalf("counts = %v, want slow WAN to stay represented", counts)
}
if counts["wan"] >= counts["lan-a"] || counts["wan"] >= counts["lan-b"] {
t.Fatalf("counts = %v, want latency-aware placement to prefer LAN capacity", counts)
}
}
func TestLatencyAwarePreferredTargetUsesAbsolutePlacementOrdinal(t *testing.T) {
targets := []string{"lan-a", "lan-b", "lan-c", "wan"}
health := newTargetHealthTracker()
health.RecordProbes([]targetProbeResult{
{Target: "lan-a", RTTMs: 4, Usable: true},
{Target: "lan-b", RTTMs: 4, Usable: true},
{Target: "lan-c", RTTMs: 4, Usable: true},
{Target: "wan", RTTMs: 400, Usable: true},
})
counts := map[string]int{}
for index := 0; index < 500; index++ {
preferred, spread := loadtestSpreadStart(index, len(targets))
targetIndex := loadtestPreferredTargetIndex(targets, preferred, spread, health, -1)
counts[targets[targetIndex]]++
}
if len(counts) < len(targets) {
t.Fatalf("counts = %v, want every probed target represented", counts)
}
if counts["wan"] >= counts["lan-a"] || counts["wan"] >= counts["lan-b"] || counts["wan"] >= counts["lan-c"] {
t.Fatalf("counts = %v, want slow WAN weighted below LAN targets", counts)
}
}
// TestHeterogeneousProbeRTTRelaxesEqualDistributionVerdict builds a two-target
// report in which the 4 ms "lan" target carried 96% of streams, bytes and
// route pressure and the 400 ms "wan" target the rest. It expects the stream,
// byte and route-pressure distribution verdicts to return no reasons.
func TestHeterogeneousProbeRTTRelaxesEqualDistributionVerdict(t *testing.T) {
	report := loadtestReport{
		Config: loadtestConfig{
			Targets: []string{"lan", "wan"},
			// Other tests in this file use index 0 to mean "target 0 failed"
			// and -1 to mean "none". Leaving these at the zero value would mark
			// "lan" as failed/impaired and leave a single surviving target.
			// NOTE(review): confirm the distribution helpers treat -1 as
			// "no failed/impaired target", as the sibling tests assume.
			FailTarget:   -1,
			ImpairTarget: -1,
			Concurrency:  64,
		},
		SuccessfulStreams: 100,
		BytesSent:         100 * 1024,
		TargetStreams: map[string]int{
			"lan": 96,
			"wan": 4,
		},
		TargetBytes: map[string]int64{
			"lan": 96 * 1024,
			"wan": 4 * 1024,
		},
		TargetProbes: []targetProbeResult{
			{Target: "lan", RTTMs: 4, Usable: true},
			{Target: "wan", RTTMs: 400, Usable: true},
		},
		RoutePressure: mesh.FabricRoutePressureSnapshot{
			MaxActive: map[string]int{
				loadtestRouteID(0, "lan"): 32,
				loadtestRouteID(1, "wan"): 1,
			},
			MaxActiveTotal: 32,
		},
	}

	if reasons := targetDistributionVerdictReasons(report); len(reasons) != 0 {
		t.Fatalf("targetDistributionVerdictReasons = %v, want heterogeneous RTT tolerated", reasons)
	}
	if reasons := targetByteDistributionVerdictReasons(report); len(reasons) != 0 {
		t.Fatalf("targetByteDistributionVerdictReasons = %v, want heterogeneous RTT tolerated", reasons)
	}
	if reasons := routePressureDistributionVerdictReasons(report); len(reasons) != 0 {
		t.Fatalf("routePressureDistributionVerdictReasons = %v, want heterogeneous RTT tolerated", reasons)
	}
}
// TestTargetHealthQuarantineExpiresButSnapshotKeepsObservation marks target
// "a" degraded for one nanosecond, checks it is reported degraded right away
// and no longer degraded after a one-millisecond sleep, and checks that
// Snapshot still returns the "ack timeout" reason for it.
func TestTargetHealthQuarantineExpiresButSnapshotKeepsObservation(t *testing.T) {
health := newTargetHealthTracker()
health.MarkDegraded("a", "ack timeout", time.Nanosecond)
// NOTE(review): this check runs after the one-nanosecond window has very
// likely elapsed; presumably the tracker still reports degraded here —
// confirm against MarkDegraded/IsDegraded, otherwise this can be flaky.
if !health.IsDegraded("a") {
t.Fatal("target should be degraded immediately")
}
// Wait well past the quarantine duration before re-checking.
time.Sleep(time.Millisecond)
if health.IsDegraded("a") {
t.Fatal("target quarantine did not expire")
}
snapshot := health.Snapshot()
if snapshot["a"] != "ack timeout" {
t.Fatalf("snapshot = %v, want historical degraded observation", snapshot)
}
}
// TestRoutePressureDistributionVerdictDetectsCollapse expects
// routePressureDistributionVerdictReasons to return one
// "route_pressure_distribution_collapsed=" reason when only one of four
// targets shows any active route pressure.
func TestRoutePressureDistributionVerdictDetectsCollapse(t *testing.T) {
	var report loadtestReport
	report.Config = loadtestConfig{
		Targets:      []string{"a", "b", "c", "d"},
		FailTarget:   -1,
		ImpairTarget: -1,
		Concurrency:  16,
	}
	report.RoutePressure = mesh.FabricRoutePressureSnapshot{
		MaxActive:      map[string]int{loadtestRouteID(0, "a"): 16},
		MaxActiveTotal: 16,
	}

	collapsed := routePressureDistributionVerdictReasons(report)
	if !(len(collapsed) == 1 && strings.HasPrefix(collapsed[0], "route_pressure_distribution_collapsed=")) {
		t.Fatalf("reasons = %v, want collapsed route pressure reason", collapsed)
	}
}
// TestRoutePressureDistributionVerdictDetectsSkew expects
// routePressureDistributionVerdictReasons to return one
// "route_pressure_distribution_skew=" reason for per-route peaks of 14/2/2/2,
// and none for peaks of 6/6/5/5.
func TestRoutePressureDistributionVerdictDetectsSkew(t *testing.T) {
	var report loadtestReport
	report.Config = loadtestConfig{
		Targets:      []string{"a", "b", "c", "d"},
		FailTarget:   -1,
		ImpairTarget: -1,
		Concurrency:  16,
	}
	report.RoutePressure = mesh.FabricRoutePressureSnapshot{
		MaxActive: map[string]int{
			loadtestRouteID(0, "a"): 14,
			loadtestRouteID(1, "b"): 2,
			loadtestRouteID(2, "c"): 2,
			loadtestRouteID(3, "d"): 2,
		},
		MaxActiveTotal: 16,
	}

	skewed := routePressureDistributionVerdictReasons(report)
	if !(len(skewed) == 1 && strings.HasPrefix(skewed[0], "route_pressure_distribution_skew=")) {
		t.Fatalf("reasons = %v, want route pressure skew reason", skewed)
	}

	report.RoutePressure.MaxActive = map[string]int{
		loadtestRouteID(0, "a"): 6,
		loadtestRouteID(1, "b"): 6,
		loadtestRouteID(2, "c"): 5,
		loadtestRouteID(3, "d"): 5,
	}
	balanced := routePressureDistributionVerdictReasons(report)
	if len(balanced) > 0 {
		t.Fatalf("reasons = %v, want balanced route pressure pass", balanced)
	}
}
@@ -0,0 +1,199 @@
package main
import (
"context"
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"encoding/json"
"errors"
"flag"
"fmt"
"os"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
)
// smokeOutput is the JSON document this command prints on stdout: whether the
// smoke succeeded, which endpoint/route/hops were used, how long the send
// took, and the forward result or error text.
type smokeOutput struct {
// OK is true only when the send returned no error and the result was accepted.
OK bool `json:"ok"`
Endpoint string `json:"endpoint"`
EntryNodeID string `json:"entry_node_id"`
NextHopID string `json:"next_hop_node_id"`
RouteID string `json:"route_id"`
// ElapsedMS is the wall-clock duration of the send, in milliseconds.
ElapsedMS int64 `json:"elapsed_ms"`
Result mesh.ProductionForwardResult `json:"result"`
// Error is omitted from the JSON when empty.
Error string `json:"error,omitempty"`
// EnvelopePath is the parsed -route-path value; omitted when empty.
EnvelopePath []string `json:"envelope_path,omitempty"`
}
// productionForwardResponse is the JSON shape decoded from the payload of the
// reply frame: either a forward result or an error string.
type productionForwardResponse struct {
Result mesh.ProductionForwardResult `json:"result,omitempty"`
// Error, when non-blank, is turned into the error returned to the caller.
Error string `json:"error,omitempty"`
}
// main runs the production-forward smoke and exits with the code returned by
// runProductionSmoke.
func main() {
	os.Exit(runProductionSmoke())
}

// runProductionSmoke parses the command-line flags, builds one production
// envelope, sends it to the entry node over QUIC and prints a smokeOutput JSON
// document on stdout.
//
// It returns the process exit code: 2 when a required flag or the payload is
// invalid, 1 when sending the envelope fails, and 0 otherwise. Returning the
// code (instead of calling os.Exit here) lets the deferred context cancel run
// before the process terminates.
func runProductionSmoke() int {
	var (
		endpoint      = flag.String("endpoint", "", "QUIC fabric endpoint for the entry node, for example quic://host:19131.")
		peerCert      = flag.String("peer-cert-sha256", "", "Expected entry node QUIC TLS certificate SHA-256 fingerprint.")
		clusterID     = flag.String("cluster-id", "", "Cluster ID.")
		routeID       = flag.String("route-id", "", "Configured production route ID.")
		sourceNodeID  = flag.String("source-node-id", "", "Route source node ID.")
		destNodeID    = flag.String("destination-node-id", "", "Route destination node ID.")
		currentNodeID = flag.String("current-hop-node-id", "", "Current hop node ID expected by the entry node.")
		nextHopNodeID = flag.String("next-hop-node-id", "", "Next hop node ID from the entry node.")
		routePath     = flag.String("route-path", "", "Comma-separated route path.")
		channel       = flag.String("channel", mesh.ProductionChannelFabricControl, "Production channel class.")
		timeout       = flag.Duration("timeout", 10*time.Second, "Smoke request timeout.")
		payloadText   = flag.String("payload", `{"kind":"fabric-production-smoke"}`, "JSON payload string.")
		payloadB64    = flag.String("payload-b64", "", "Base64-encoded JSON payload string.")
	)
	flag.Parse()

	// Trim once, up front, so the required-flag check sees the same values that
	// end up in the envelope; a whitespace-only flag counts as missing.
	entryEndpoint := strings.TrimSpace(*endpoint)
	cluster := strings.TrimSpace(*clusterID)
	route := strings.TrimSpace(*routeID)
	sourceNode := strings.TrimSpace(*sourceNodeID)
	destNode := strings.TrimSpace(*destNodeID)
	currentHop := strings.TrimSpace(*currentNodeID)
	nextHop := strings.TrimSpace(*nextHopNodeID)
	for _, required := range []string{entryEndpoint, cluster, route, sourceNode, destNode, currentHop, nextHop} {
		if required == "" {
			writeOutput(smokeOutput{OK: false, Error: "endpoint, cluster-id, route-id, source-node-id, destination-node-id, current-hop-node-id and next-hop-node-id are required"})
			return 2
		}
	}

	path := splitRoutePath(*routePath)

	// -payload-b64, when set, takes precedence over -payload.
	payloadSource := strings.TrimSpace(*payloadText)
	if encoded := strings.TrimSpace(*payloadB64); encoded != "" {
		decoded, err := base64.StdEncoding.DecodeString(encoded)
		if err != nil {
			writeOutput(smokeOutput{OK: false, Error: "payload-b64 must be valid base64"})
			return 2
		}
		payloadSource = string(decoded)
	}
	payload := json.RawMessage(strings.TrimSpace(payloadSource))
	if !json.Valid(payload) {
		writeOutput(smokeOutput{OK: false, Error: "payload must be valid JSON"})
		return 2
	}

	now := time.Now().UTC()
	channelClass := strings.TrimSpace(*channel)
	messageType := mesh.ProductionMessageFabricControl
	if channelClass == mesh.ProductionChannelVPNPacket {
		messageType = mesh.ProductionMessageVPNPacketBatch
	}
	sum := sha256.Sum256(payload)
	envelope := mesh.ProductionEnvelope{
		FabricProtocolVersion: mesh.ProtocolVersion,
		MessageID:             fmt.Sprintf("fabric-production-smoke-%d", now.UnixNano()),
		RouteID:               route,
		ClusterID:             cluster,
		SourceNodeID:          sourceNode,
		DestinationNodeID:     destNode,
		CurrentHopNodeID:      currentHop,
		NextHopNodeID:         nextHop,
		RoutePath:             path,
		ChannelClass:          channelClass,
		MessageType:           messageType,
		TTL:                   8,
		HopCount:              0,
		CreatedAt:             now,
		ExpiresAt:             now.Add(time.Minute),
		PayloadLength:         len(payload),
		PayloadHash:           hex.EncodeToString(sum[:]),
		Payload:               payload,
	}

	transport := mesh.NewQUICFabricTransport(nil)
	ctx, cancel := context.WithTimeout(context.Background(), *timeout)
	defer cancel()

	started := time.Now()
	result, err := sendProductionEnvelope(ctx, transport, mesh.FabricTransportTarget{
		EndpointID:     "fabric-production-smoke-entry",
		PeerID:         envelope.CurrentHopNodeID,
		Endpoint:       entryEndpoint,
		Transport:      "quic",
		PeerCertSHA256: strings.TrimSpace(*peerCert),
		Timeout:        *timeout,
		InboundBuffer:  8,
		ErrorBuffer:    4,
	}, envelope)

	output := smokeOutput{
		OK:           err == nil && result.Accepted,
		Endpoint:     entryEndpoint, // report the endpoint that was actually dialled
		EntryNodeID:  envelope.CurrentHopNodeID,
		NextHopID:    envelope.NextHopNodeID,
		RouteID:      envelope.RouteID,
		ElapsedMS:    time.Since(started).Milliseconds(),
		Result:       result,
		EnvelopePath: path,
	}
	if err != nil {
		output.Error = err.Error()
		writeOutput(output)
		return 1
	}
	// NOTE(review): a result that was not accepted prints "ok": false but still
	// exits 0, as before; confirm whether callers want a non-zero status here.
	writeOutput(output)
	return 0
}
// sendProductionEnvelope connects to target, sends envelope as a single
// reliable data frame on the production-forward stream, and waits for the
// matching reply frame.
//
// It returns the decoded forward result, or an error when connecting,
// marshalling, sending or decoding fails, when the session reports an error,
// when the reply carries a non-blank error string, when ctx ends first, or when
// the session closes both of its channels before replying. The session is
// closed before returning.
func sendProductionEnvelope(ctx context.Context, transport *mesh.QUICFabricTransport, target mesh.FabricTransportTarget, envelope mesh.ProductionEnvelope) (mesh.ProductionForwardResult, error) {
	session, err := transport.Connect(ctx, target)
	if err != nil {
		return mesh.ProductionForwardResult{}, err
	}
	defer session.Close()

	payload, err := json.Marshal(envelope)
	if err != nil {
		return mesh.ProductionForwardResult{}, err
	}
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     mesh.ProductionForwardQUICStreamID,
		Sequence:     1,
		Payload:      payload,
	}); err != nil {
		return mesh.ProductionForwardResult{}, err
	}

	// A closed channel is always ready to receive, so without the ok checks
	// below the loop would spin on zero values until ctx expired. A closed
	// channel is replaced by nil, which blocks forever in select.
	errs := session.Errors()
	frames := session.Frames()
	for {
		if errs == nil && frames == nil {
			return mesh.ProductionForwardResult{}, errors.New("fabric session closed before production forward response")
		}
		select {
		case <-ctx.Done():
			return mesh.ProductionForwardResult{}, ctx.Err()
		case err, ok := <-errs:
			if !ok {
				errs = nil
				continue
			}
			if err != nil {
				return mesh.ProductionForwardResult{}, err
			}
		case frame, ok := <-frames:
			if !ok {
				frames = nil
				continue
			}
			// Skip anything that is not the reply to the frame sent above.
			if frame.Type != fabricproto.FrameData || frame.StreamID != mesh.ProductionForwardQUICStreamID || frame.Sequence != 1 {
				continue
			}
			var response productionForwardResponse
			if err := json.Unmarshal(frame.Payload, &response); err != nil {
				return mesh.ProductionForwardResult{}, err
			}
			if strings.TrimSpace(response.Error) != "" {
				return mesh.ProductionForwardResult{}, errors.New(response.Error)
			}
			return response.Result, nil
		}
	}
}
// splitRoutePath parses a comma-separated route path into its hops, trimming
// whitespace around each hop and dropping empty hops. It returns nil when
// value is blank, and a non-nil (possibly empty) slice otherwise.
func splitRoutePath(value string) []string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return nil
	}

	segments := strings.Split(trimmed, ",")
	hops := make([]string, 0, len(segments))
	for i := range segments {
		hop := strings.TrimSpace(segments[i])
		if hop == "" {
			continue
		}
		hops = append(hops, hop)
	}
	return hops
}
// writeOutput prints output on stdout as indented JSON followed by a newline.
// If marshalling fails it writes the error to stderr and prints nothing on
// stdout.
func writeOutput(output smokeOutput) {
	encoded, err := json.MarshalIndent(output, "", " ")
	if err == nil {
		fmt.Println(string(encoded))
		return
	}
	fmt.Fprintf(os.Stderr, "marshal smoke output: %v\n", err)
}
@@ -28,6 +28,18 @@ type smokeNode struct {
server *httptest.Server
}
// smokeSyntheticTransport sends synthetic envelopes to peers by looking up the
// peer's base URL in peers.
type smokeSyntheticTransport struct {
// peers maps a node ID to the base URL passed to mesh.NewClient.
peers map[string]string
}
// SendSynthetic forwards envelope to nextNodeID through a mesh client built
// from that node's base URL. It returns mesh.ErrSyntheticPeerUnavailable when
// no non-empty URL is registered for the node.
func (t smokeSyntheticTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope mesh.SyntheticEnvelope) (mesh.SyntheticEnvelope, error) {
	if peerURL := t.peers[nextNodeID]; peerURL != "" {
		return mesh.NewClient(peerURL).SendSynthetic(ctx, envelope)
	}
	return mesh.SyntheticEnvelope{}, mesh.ErrSyntheticPeerUnavailable
}
type smokeReport struct {
Stage string `json:"stage"`
ProductionForwarding bool `json:"production_forwarding"`
@@ -433,7 +445,7 @@ func writeSmokeScopedConfig(local mesh.PeerIdentity, peers map[string]string, ro
func newSmokeNode(local mesh.PeerIdentity) *smokeNode {
node := &smokeNode{Local: local}
node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime, FabricSessionEnabled: true}.Handler().ServeHTTP(w, r)
mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime, FabricSessionEnabled: true, FabricSessionWebSocketEnabled: true}.Handler().ServeHTTP(w, r)
}))
node.URL = node.server.URL
return node
@@ -454,7 +466,7 @@ func smokeRuntime(local mesh.PeerIdentity, routes []mesh.SyntheticRoute, peers m
mesh.SyntheticChannelFabricControl,
mesh.SyntheticChannelRouteControl,
},
Transport: mesh.NewHTTPPeerTransport(peers),
Transport: smokeSyntheticTransport{peers: peers},
})
}
@@ -217,7 +217,7 @@ func runInstallLinux(ctx context.Context, args []string) error {
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session WebSocket endpoint.")
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
@@ -230,7 +230,7 @@ func runInstallLinux(ctx context.Context, args []string) error {
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "linux"), "Region/site hint.")
@@ -305,7 +305,7 @@ func runInstallWindows(ctx context.Context, args []string) error {
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session WebSocket endpoint.")
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
@@ -318,7 +318,7 @@ func runInstallWindows(ctx context.Context, args []string) error {
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "windows"), "Region/site hint.")
@@ -799,7 +799,7 @@ func parseInstall(args []string) (installCommandConfig, error) {
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session WebSocket endpoint.")
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
@@ -812,7 +812,7 @@ func parseInstall(args []string) (installCommandConfig, error) {
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", ""), "Advertised transport.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", ""), "Connectivity mode hint.")
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", ""), "NAT type hint.")
fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint.")
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+12 -6
View File
@@ -2,15 +2,21 @@ module github.com/example/remote-access-platform/agents/rap-node-agent
go 1.25.5
require golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb
require (
github.com/gorilla/websocket v1.5.3
github.com/quic-go/quic-go v0.59.1
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb
)
require (
github.com/gorilla/websocket v1.5.3 // indirect
github.com/quic-go/quic-go v0.59.1 // indirect
golang.org/x/crypto v0.50.0 // indirect
golang.org/x/net v0.53.0 // indirect
golang.org/x/sys v0.43.0 // indirect
golang.org/x/crypto v0.51.0 // indirect
golang.org/x/mobile v0.0.0-20260514233045-7de0a8fa7f4d // indirect
golang.org/x/mod v0.36.0 // indirect
golang.org/x/net v0.54.0 // indirect
golang.org/x/sync v0.20.0 // indirect
golang.org/x/sys v0.44.0 // indirect
golang.org/x/time v0.15.0 // indirect
golang.org/x/tools v0.45.0 // indirect
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943 // indirect
)
+24 -6
View File
@@ -1,20 +1,38 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/quic-go/quic-go v0.59.1 h1:0Gmua0HW1Tv7ANR7hUYwRyD0MG5OJfgvYSZasGZzBic=
github.com/quic-go/quic-go v0.59.1/go.mod h1:upnsH4Ju1YkqpLXC305eW3yDZ4NfnNbmQRCMWS58IKU=
golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI=
golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
go.uber.org/mock v0.5.2 h1:LbtPTcP8A5k9WPXj54PPPbjcI4Y6lhyOZXn+VS7wNko=
go.uber.org/mock v0.5.2/go.mod h1:wLlUxC2vVTPTaE3UD51E0BGOAElKrILxhVSDYQLld5o=
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/mobile v0.0.0-20260514233045-7de0a8fa7f4d h1:XNPSUMmnREiyj6HdYfJjTJVQIC5c1b3+qV7mbxUjzwk=
golang.org/x/mobile v0.0.0-20260514233045-7de0a8fa7f4d/go.mod h1:ltIbhcRzKgwHa4ZxKJeiv0nyzcXUUYCqMyO0Y+vPmXw=
golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4=
golang.org/x/mod v0.36.0/go.mod h1:moc6ELqsWcOw5Ef3xVprK5ul/MvtVvkIXLziUOICjUQ=
golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w=
golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
golang.org/x/tools v0.45.0 h1:18qN3FAooORvApf5XjCXgsuayZOEtXf6JK18I3+ONa8=
golang.org/x/tools v0.45.0/go.mod h1:LuUGqqaXcXMEFEruIVJVm5mgDD8vww/z/SR1gQ4uE/0=
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg=
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI=
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb h1:whnFRlWMcXI9d+ZbWg+4sHnLp52d5yiIPUxMBSt4X9A=
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943 h1:YUPk0vGbex2+Jk7XXIgLIPG6oEAD9ml0x7wd6i/bmA4=
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943/go.mod h1:xQ2PWgHmWJA/Ph4i1q1jBm39BKhc3W0DXqWoDSyuBOY=
@@ -7,7 +7,7 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
const Version = "0.2.280-fabricsession"
const Version = "0.2.309-latencyaware"
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
return client.EnrollRequest{
@@ -38,9 +38,12 @@ func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) cli
"vpn_local_gateway_shortcut": false,
"vpn_farm_owned_dataplane": true,
"fabric_data_session_v1": true,
"fabric_session_websocket_smoke": true,
"fabric_session_quic_smoke": true,
"vpn_backend_relay_fallback": false,
"fabric_service_channel_required": true,
"web_ingress_workload_contract": "rap.web_ingress.workload_contract.v1",
"web_ingress_real_listener_gate": "RAP_WEB_INGRESS_RUNTIME_ENABLED",
"web_ingress_runtime_enabled": false,
"external_backend_entry_proxy": true,
},
ReportedFacts: map[string]any{
@@ -67,9 +70,12 @@ func HeartbeatPayload() client.HeartbeatRequest {
"vpn_local_gateway_shortcut": false,
"vpn_farm_owned_dataplane": true,
"fabric_data_session_v1": true,
"fabric_session_websocket_smoke": true,
"fabric_session_quic_smoke": true,
"vpn_backend_relay_fallback": false,
"fabric_service_channel_required": true,
"web_ingress_workload_contract": "rap.web_ingress.workload_contract.v1",
"web_ingress_real_listener_gate": "RAP_WEB_INGRESS_RUNTIME_ENABLED",
"web_ingress_runtime_enabled": false,
"external_backend_entry_proxy": true,
},
ServiceStates: map[string]any{
@@ -14,6 +14,8 @@ import (
// Schema and algorithm identifiers carried in cluster-authority documents.
const (
// AuthoritySchemaVersion names the cluster authority schema.
// NOTE(review): its consumer is outside this view — confirm where it is checked.
AuthoritySchemaVersion = "rap.cluster_authority.v1"
// SignatureSchemaVersion is the schema_version VerifyRaw requires on a Signature.
SignatureSchemaVersion = "rap.cluster_authority.signature.v1"
// QuorumSchemaVersion is the schema_version VerifyQuorumRaw requires on a QuorumDescriptor.
QuorumSchemaVersion = "rap.cluster_authority.quorum.v1"
// QuorumEnvelopeVersion is the schema_version VerifyQuorumRaw requires on a QuorumEnvelope.
QuorumEnvelopeVersion = "rap.cluster_authority.quorum_envelope.v1"
// AlgorithmEd25519 is the algorithm label placed in Signature.Algorithm for Ed25519 signatures.
AlgorithmEd25519 = "ed25519"
)
@@ -30,6 +32,34 @@ type Signature struct {
Signature string `json:"signature"`
}
// QuorumMember is one entry of a QuorumDescriptor: a signer identified by its
// public key and the scopes it may sign for.
type QuorumMember struct {
// NodeID and Role are descriptive labels; VerifyQuorumRaw does not read them.
NodeID string `json:"node_id,omitempty"`
Role string `json:"role,omitempty"`
// PublicKey is the encoded key handed to decodePublicKey / VerifyRaw
// (the package tests encode it with base64.StdEncoding).
PublicKey string `json:"public_key"`
// PublicKeyFingerprint keys the member during quorum verification. When it is
// blank, VerifyQuorumRaw derives it from PublicKey via Fingerprint.
PublicKeyFingerprint string `json:"public_key_fingerprint"`
// Scopes lists the scopes this member may sign for; "*" matches any required scope.
Scopes []string `json:"scopes,omitempty"`
}
// QuorumDescriptor declares the signer set for a cluster epoch and how many of
// those signers must sign. Its JSON encoding is hashed (see QuorumDescriptorHash)
// and compared against QuorumEnvelope.QuorumSHA256 during verification.
type QuorumDescriptor struct {
// SchemaVersion must equal QuorumSchemaVersion.
SchemaVersion string `json:"schema_version"`
// ClusterID and Epoch must be non-blank and match the envelope exactly.
ClusterID string `json:"cluster_id"`
Epoch string `json:"epoch"`
// Threshold is the minimum number of distinct valid member signatures;
// VerifyQuorumRaw uses the larger of this value and the envelope's threshold.
Threshold int `json:"threshold"`
// Members is the set of permitted signers.
Members []QuorumMember `json:"members"`
}
// QuorumEnvelope carries the signatures collected for one payload together with
// the hashes binding it to that payload and to a specific QuorumDescriptor.
type QuorumEnvelope struct {
// SchemaVersion must equal QuorumEnvelopeVersion.
SchemaVersion string `json:"schema_version"`
// ClusterID and Epoch must match the descriptor being verified against.
ClusterID string `json:"cluster_id"`
Epoch string `json:"epoch"`
// Threshold can only raise the descriptor's threshold, never lower it.
Threshold int `json:"threshold"`
// PayloadSHA256 must equal HashRaw of the signed payload.
PayloadSHA256 string `json:"payload_sha256"`
// QuorumSHA256 must equal the hash of the descriptor (see QuorumDescriptorHash).
QuorumSHA256 string `json:"quorum_sha256"`
// Signatures are the per-member signatures over the payload.
Signatures []Signature `json:"signatures"`
// AllowedScopes and DecisionReason are not read by VerifyQuorumRaw.
// NOTE(review): presumably informational — confirm with the producer.
AllowedScopes []string `json:"allowed_scopes,omitempty"`
DecisionReason string `json:"decision_reason,omitempty"`
}
func VerifyRaw(publicKeyB64 string, payload json.RawMessage, signature Signature) error {
if signature.SchemaVersion != SignatureSchemaVersion {
return fmt.Errorf("%w: schema_version must be %s", ErrInvalidSignature, SignatureSchemaVersion)
@@ -58,6 +88,86 @@ func VerifyRaw(publicKeyB64 string, payload json.RawMessage, signature Signature
return nil
}
// VerifyQuorumRaw checks that envelope proves a quorum of descriptor's members
// signed payload.
//
// The checks run in this order, and the first failure is returned:
//   - descriptor and envelope schema versions;
//   - cluster ID and epoch are non-blank on the descriptor and equal on both sides;
//   - the effective threshold (the larger of the descriptor's and the envelope's)
//     is positive and does not exceed the member count;
//   - envelope.PayloadSHA256 equals HashRaw(payload);
//   - envelope.QuorumSHA256 equals the hash of the JSON-encoded descriptor;
//   - members have unique fingerprints (a blank fingerprint is derived from the key);
//   - each signature comes from a member, that member allows requiredScope (when it
//     is non-empty), and the signature passes VerifyRaw.
//
// Repeated signatures from the same fingerprint are counted once. A signature
// from a non-member, or one that fails verification, aborts the whole check
// instead of being skipped. Apart from errors passed through from HashRaw,
// decodePublicKey and VerifyRaw, every failure wraps ErrInvalidSignature.
func VerifyQuorumRaw(descriptor QuorumDescriptor, payload json.RawMessage, envelope QuorumEnvelope, requiredScope string) error {
	// Structural agreement between descriptor and envelope.
	switch {
	case descriptor.SchemaVersion != QuorumSchemaVersion:
		return fmt.Errorf("%w: quorum schema_version must be %s", ErrInvalidSignature, QuorumSchemaVersion)
	case envelope.SchemaVersion != QuorumEnvelopeVersion:
		return fmt.Errorf("%w: quorum envelope schema_version must be %s", ErrInvalidSignature, QuorumEnvelopeVersion)
	case strings.TrimSpace(descriptor.ClusterID) == "" || descriptor.ClusterID != envelope.ClusterID:
		return fmt.Errorf("%w: quorum cluster mismatch", ErrInvalidSignature)
	case strings.TrimSpace(descriptor.Epoch) == "" || descriptor.Epoch != envelope.Epoch:
		return fmt.Errorf("%w: quorum epoch mismatch", ErrInvalidSignature)
	}

	// The envelope may only tighten the requirement, never relax it.
	required := envelope.Threshold
	if descriptor.Threshold >= required {
		required = descriptor.Threshold
	}
	if required <= 0 || required > len(descriptor.Members) {
		return fmt.Errorf("%w: invalid quorum threshold", ErrInvalidSignature)
	}

	// Bind the envelope to this exact payload and this exact descriptor.
	wantPayloadHash, err := HashRaw(payload)
	if err != nil {
		return err
	}
	if envelope.PayloadSHA256 != wantPayloadHash {
		return fmt.Errorf("%w: quorum payload hash mismatch", ErrInvalidSignature)
	}
	wantDescriptorHash, err := HashRaw(mustMarshalQuorumDescriptor(descriptor))
	if err != nil {
		return err
	}
	if envelope.QuorumSHA256 != wantDescriptorHash {
		return fmt.Errorf("%w: quorum descriptor hash mismatch", ErrInvalidSignature)
	}

	// Index members by fingerprint, deriving it from the key when not declared.
	roster := map[string]QuorumMember{}
	for _, candidate := range descriptor.Members {
		key := strings.TrimSpace(candidate.PublicKeyFingerprint)
		if key == "" {
			decoded, err := decodePublicKey(candidate.PublicKey)
			if err != nil {
				return err
			}
			key = Fingerprint(decoded)
		}
		if _, duplicate := roster[key]; duplicate {
			return fmt.Errorf("%w: duplicate quorum member", ErrInvalidSignature)
		}
		candidate.PublicKeyFingerprint = key
		roster[key] = candidate
	}

	// Count each member at most once, however many signatures it supplied.
	counted := map[string]struct{}{}
	for _, sig := range envelope.Signatures {
		key := strings.TrimSpace(sig.KeyFingerprint)
		if _, already := counted[key]; already {
			continue
		}
		signer, known := roster[key]
		if !known {
			return fmt.Errorf("%w: quorum signer is not a member", ErrInvalidSignature)
		}
		if requiredScope != "" && !memberAllowsScope(signer, requiredScope) {
			return fmt.Errorf("%w: quorum signer scope mismatch", ErrInvalidSignature)
		}
		if err := VerifyRaw(signer.PublicKey, payload, sig); err != nil {
			return err
		}
		counted[key] = struct{}{}
	}
	if len(counted) < required {
		return fmt.Errorf("%w: quorum threshold not met", ErrInvalidSignature)
	}
	return nil
}
// QuorumDescriptorHash returns HashRaw of the JSON-encoded descriptor, the same
// value VerifyQuorumRaw compares against QuorumEnvelope.QuorumSHA256.
func QuorumDescriptorHash(descriptor QuorumDescriptor) (string, error) {
	encoded := mustMarshalQuorumDescriptor(descriptor)
	return HashRaw(encoded)
}
func Fingerprint(publicKey ed25519.PublicKey) string {
sum := sha256.Sum256(publicKey)
return "rap-ca-ed25519-" + hex.EncodeToString(sum[:16])
@@ -72,6 +182,28 @@ func HashRaw(raw json.RawMessage) (string, error) {
return hex.EncodeToString(sum[:]), nil
}
// mustMarshalQuorumDescriptor JSON-encodes descriptor. Despite the name it does
// not panic: a marshal failure yields nil, leaving the caller's hashing step to
// deal with the empty input.
func mustMarshalQuorumDescriptor(descriptor QuorumDescriptor) json.RawMessage {
	if encoded, err := json.Marshal(descriptor); err == nil {
		return encoded
	}
	return nil
}
// memberAllowsScope reports whether member may sign for requiredScope.
// A blank requiredScope is always allowed. Otherwise one of the member's scopes
// (compared after trimming whitespace) must be the wildcard "*" or equal the
// trimmed requiredScope.
func memberAllowsScope(member QuorumMember, requiredScope string) bool {
	wanted := strings.TrimSpace(requiredScope)
	if wanted == "" {
		return true
	}
	for i := range member.Scopes {
		switch strings.TrimSpace(member.Scopes[i]) {
		case "*", wanted:
			return true
		}
	}
	return false
}
func CanonicalJSON(raw json.RawMessage) ([]byte, error) {
if len(raw) == 0 {
return nil, fmt.Errorf("%w: empty payload", ErrInvalidPayload)
@@ -5,6 +5,7 @@ import (
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"testing"
)
@@ -50,3 +51,114 @@ func TestVerifyRawRejectsTamperedPayload(t *testing.T) {
t.Fatalf("err = %v, want ErrInvalidSignature", err)
}
}
// TestVerifyQuorumRawAcceptsThreshold signs a payload with two of three members
// (threshold 2) and expects verification to succeed.
func TestVerifyQuorumRawAcceptsThreshold(t *testing.T) {
	payload := json.RawMessage(`{"schema_version":"rap.node_update_plan_authority.v1","cluster_id":"cluster-1","action":"update"}`)
	descriptor, signers := testQuorumDescriptor(t, 3, 2)

	hashOfPayload, err := HashRaw(payload)
	if err != nil {
		t.Fatalf("payload hash: %v", err)
	}
	hashOfQuorum, err := QuorumDescriptorHash(descriptor)
	if err != nil {
		t.Fatalf("quorum hash: %v", err)
	}

	// Exactly threshold-many distinct members sign.
	var collected []Signature
	for _, key := range signers[:2] {
		collected = append(collected, signTestPayload(t, payload, key))
	}

	envelope := QuorumEnvelope{
		SchemaVersion: QuorumEnvelopeVersion,
		ClusterID:     "cluster-1",
		Epoch:         "epoch-1",
		Threshold:     2,
		PayloadSHA256: hashOfPayload,
		QuorumSHA256:  hashOfQuorum,
		Signatures:    collected,
	}
	err = VerifyQuorumRaw(descriptor, payload, envelope, "update-authority")
	if err != nil {
		t.Fatalf("VerifyQuorumRaw: %v", err)
	}
}
// TestVerifyQuorumRawRejectsBelowThreshold supplies a single valid signature
// where two are required and expects ErrInvalidSignature.
func TestVerifyQuorumRawRejectsBelowThreshold(t *testing.T) {
	payload := json.RawMessage(`{"schema_version":"rap.node_update_plan_authority.v1","cluster_id":"cluster-1","action":"update"}`)
	descriptor, privateKeys := testQuorumDescriptor(t, 3, 2)
	// Fail fast on fixture errors: with empty hashes the envelope would be
	// rejected for a hash mismatch and this negative test would pass for the
	// wrong reason.
	payloadHash, err := HashRaw(payload)
	if err != nil {
		t.Fatalf("payload hash: %v", err)
	}
	quorumHash, err := QuorumDescriptorHash(descriptor)
	if err != nil {
		t.Fatalf("quorum hash: %v", err)
	}
	envelope := QuorumEnvelope{
		SchemaVersion: QuorumEnvelopeVersion,
		ClusterID:     "cluster-1",
		Epoch:         "epoch-1",
		Threshold:     2,
		PayloadSHA256: payloadHash,
		QuorumSHA256:  quorumHash,
		Signatures:    []Signature{signTestPayload(t, payload, privateKeys[0])},
	}
	if err := VerifyQuorumRaw(descriptor, payload, envelope, "update-authority"); !errors.Is(err, ErrInvalidSignature) {
		t.Fatalf("err = %v, want ErrInvalidSignature", err)
	}
}
// TestVerifyQuorumRawRejectsTamperedDescriptor hashes the descriptor, then
// lowers its threshold, and expects verification to fail because the envelope's
// quorum hash no longer matches the descriptor presented.
func TestVerifyQuorumRawRejectsTamperedDescriptor(t *testing.T) {
	payload := json.RawMessage(`{"schema_version":"rap.node_update_plan_authority.v1","cluster_id":"cluster-1","action":"update"}`)
	descriptor, privateKeys := testQuorumDescriptor(t, 3, 2)
	// Fail fast on fixture errors so the rejection below can only come from
	// the tampering, not from an empty hash.
	payloadHash, err := HashRaw(payload)
	if err != nil {
		t.Fatalf("payload hash: %v", err)
	}
	quorumHash, err := QuorumDescriptorHash(descriptor)
	if err != nil {
		t.Fatalf("quorum hash: %v", err)
	}
	// Tamper after hashing: the envelope still commits to the original descriptor.
	descriptor.Threshold = 1
	envelope := QuorumEnvelope{
		SchemaVersion: QuorumEnvelopeVersion,
		ClusterID:     "cluster-1",
		Epoch:         "epoch-1",
		Threshold:     2,
		PayloadSHA256: payloadHash,
		QuorumSHA256:  quorumHash,
		Signatures: []Signature{
			signTestPayload(t, payload, privateKeys[0]),
			signTestPayload(t, payload, privateKeys[1]),
		},
	}
	if err := VerifyQuorumRaw(descriptor, payload, envelope, "update-authority"); !errors.Is(err, ErrInvalidSignature) {
		t.Fatalf("err = %v, want ErrInvalidSignature", err)
	}
}
// testQuorumDescriptor builds a descriptor for cluster-1 / epoch-1 with the
// given number of freshly generated Ed25519 members, each scoped to
// "update-authority", and returns it with the members' private keys in the
// same order.
func testQuorumDescriptor(t *testing.T, members int, threshold int) (QuorumDescriptor, []ed25519.PrivateKey) {
	t.Helper()
	// roster is left nil when members is 0 so the descriptor's JSON encoding
	// (and therefore its hash) is the same as when appending to a zero-value field.
	var roster []QuorumMember
	keys := make([]ed25519.PrivateKey, 0, members)
	for ordinal := 1; ordinal <= members; ordinal++ {
		pub, priv, err := ed25519.GenerateKey(nil)
		if err != nil {
			t.Fatalf("GenerateKey: %v", err)
		}
		entry := QuorumMember{
			NodeID:               fmt.Sprintf("authority-%d", ordinal),
			Role:                 "update-authority",
			PublicKey:            base64.StdEncoding.EncodeToString(pub),
			PublicKeyFingerprint: Fingerprint(pub),
			Scopes:               []string{"update-authority"},
		}
		roster = append(roster, entry)
		keys = append(keys, priv)
	}
	return QuorumDescriptor{
		SchemaVersion: QuorumSchemaVersion,
		ClusterID:     "cluster-1",
		Epoch:         "epoch-1",
		Threshold:     threshold,
		Members:       roster,
	}, keys
}
// signTestPayload signs the canonical JSON form of payload with privateKey and
// wraps the result in a Signature labelled with the key's fingerprint.
func signTestPayload(t *testing.T, payload json.RawMessage, privateKey ed25519.PrivateKey) Signature {
	t.Helper()
	message, err := CanonicalJSON(payload)
	if err != nil {
		t.Fatalf("CanonicalJSON: %v", err)
	}
	verifyingKey := privateKey.Public().(ed25519.PublicKey)
	rawSignature := ed25519.Sign(privateKey, message)

	var signed Signature
	signed.SchemaVersion = SignatureSchemaVersion
	signed.Algorithm = AlgorithmEd25519
	signed.KeyFingerprint = Fingerprint(verifyingKey)
	signed.Signature = base64.StdEncoding.EncodeToString(rawSignature)
	return signed
}
@@ -9,6 +9,7 @@ import (
"io"
"net/http"
"net/url"
"strings"
"time"
)
@@ -17,6 +18,17 @@ type Client struct {
httpClient *http.Client
}
// RawControlRequest describes an arbitrary control-plane call issued through
// Client.RawControl.
type RawControlRequest struct {
// Method is the HTTP method; RawControl trims and upper-cases it and falls back to GET when blank.
Method string `json:"method"`
// Path is appended to the client's base URL and must begin with "/".
Path string `json:"path"`
// Body, when non-empty and not the literal null, is sent as the JSON request body.
Body json.RawMessage `json:"body,omitempty"`
}
// RawControlResponse is the result of a successful (2xx) Client.RawControl call.
type RawControlResponse struct {
// StatusCode is the HTTP status returned by the backend.
StatusCode int `json:"status_code"`
// Body holds the response bytes as read from the backend; RawControl does not validate them as JSON.
Body json.RawMessage `json:"body,omitempty"`
}
type EnrollRequest struct {
ClusterID string `json:"cluster_id"`
JoinToken string `json:"join_token"`
@@ -52,6 +64,7 @@ type NodeBootstrap struct {
Certificate map[string]any `json:"certificate"`
HeartbeatEndpoint string `json:"heartbeat_endpoint"`
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
ClusterAuthorityQuorum json.RawMessage `json:"cluster_authority_quorum,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
}
@@ -123,6 +136,7 @@ type NodeUpdatePlan struct {
Artifact *ReleaseArtifact `json:"artifact,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
AuthorityQuorum *QuorumEnvelope `json:"authority_quorum,omitempty"`
ProductionForwarding bool `json:"production_forwarding"`
}
@@ -293,6 +307,26 @@ type SyntheticMeshConfig struct {
ProductionForwarding bool `json:"production_forwarding"`
}
// AdminRuntimeProjectionRequest is the JSON body that Client.AdminRuntimeProjection
// posts to the node's admin-runtime projection endpoint.
// NOTE(review): field meanings below are inferred from names and JSON tags —
// confirm against the backend contract.
type AdminRuntimeProjectionRequest struct {
SchemaVersion string `json:"schema_version"`
// Method, Path, Query and Host presumably describe the HTTP request being projected.
Method string `json:"method"`
Path string `json:"path"`
Query string `json:"query,omitempty"`
Host string `json:"host,omitempty"`
Scope string `json:"scope"`
ServiceClass string `json:"service_class"`
// ObservedAt is carried as a string; its timestamp format is not fixed by this type.
ObservedAt string `json:"observed_at"`
}
// AdminRuntimeProjectionResponse is the decoded reply returned by
// Client.AdminRuntimeProjection.
// NOTE(review): field meanings are inferred from names and JSON tags — confirm
// against the backend contract.
type AdminRuntimeProjectionResponse struct {
SchemaVersion string `json:"schema_version"`
// Status and Reason are backend-provided outcome fields.
Status string `json:"status"`
Reason string `json:"reason,omitempty"`
// StatusCode, Headers and Body presumably describe the projected HTTP response.
StatusCode int `json:"status_code"`
Headers map[string]string `json:"headers,omitempty"`
Body json.RawMessage `json:"body,omitempty"`
}
func (c *SyntheticMeshConfig) UnmarshalJSON(data []byte) error {
type syntheticMeshConfigAlias SyntheticMeshConfig
var decoded syntheticMeshConfigAlias
@@ -448,6 +482,18 @@ type ClusterSignature struct {
SignedAt time.Time `json:"signed_at"`
}
// QuorumEnvelope is the wire form of a quorum signature envelope as delivered by
// the backend (for example in NodeUpdatePlan.AuthorityQuorum). It carries the
// cluster/epoch it applies to, the hashes of the signed payload and of the
// quorum descriptor, and the collected signatures.
type QuorumEnvelope struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
Epoch string `json:"epoch"`
// Threshold is the number of signatures the envelope claims to require.
Threshold int `json:"threshold"`
PayloadSHA256 string `json:"payload_sha256"`
QuorumSHA256 string `json:"quorum_sha256"`
// Signatures uses the client's ClusterSignature representation.
Signatures []ClusterSignature `json:"signatures"`
AllowedScopes []string `json:"allowed_scopes,omitempty"`
DecisionReason string `json:"decision_reason,omitempty"`
}
type PeerDirectoryEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
@@ -744,6 +790,50 @@ func (c *Client) SyntheticMeshConfig(ctx context.Context, clusterID, nodeID stri
return response.Config, nil
}
// AdminRuntimeProjection posts request to
// /clusters/{clusterID}/nodes/{nodeID}/admin-runtime/projection and returns the
// decoded response. On any error from the POST helper the zero response is
// returned together with that error.
func (c *Client) AdminRuntimeProjection(ctx context.Context, clusterID, nodeID string, request AdminRuntimeProjectionRequest) (AdminRuntimeProjectionResponse, error) {
	endpoint := fmt.Sprintf("/clusters/%s/nodes/%s/admin-runtime/projection", clusterID, nodeID)
	projection := AdminRuntimeProjectionResponse{}
	err := c.postJSON(ctx, endpoint, request, &projection)
	if err != nil {
		return AdminRuntimeProjectionResponse{}, err
	}
	return projection, nil
}
// RawControl issues an arbitrary HTTP request against the backend base URL and
// returns the raw response body.
//
// The method is trimmed and upper-cased and defaults to GET when blank. The path
// is trimmed and must begin with "/"; it is appended to the client's base URL.
// A non-empty body other than the literal null is sent with
// Content-Type: application/json.
//
// A non-2xx status is returned as an error that includes the (size-capped)
// response text. A response larger than 2 MiB is rejected with an error rather
// than being silently truncated, because a cut-off body would be handed to the
// caller as if it were a complete JSON document.
func (c *Client) RawControl(ctx context.Context, request RawControlRequest) (RawControlResponse, error) {
	// Upper bound on the response bytes buffered in memory.
	const maxResponseBytes = 2 * 1024 * 1024

	method := strings.ToUpper(strings.TrimSpace(request.Method))
	if method == "" {
		method = http.MethodGet
	}
	path := strings.TrimSpace(request.Path)
	if !strings.HasPrefix(path, "/") {
		return RawControlResponse{}, fmt.Errorf("control path must be relative")
	}
	// body stays an untyped nil interface when there is nothing to send, so
	// http.NewRequestWithContext sees a genuinely absent body.
	var body io.Reader
	if len(request.Body) > 0 && string(request.Body) != "null" {
		body = bytes.NewReader(request.Body)
	}
	httpReq, err := http.NewRequestWithContext(ctx, method, c.baseURL+path, body)
	if err != nil {
		return RawControlResponse{}, err
	}
	if body != nil {
		httpReq.Header.Set("Content-Type", "application/json")
	}
	httpResp, err := c.httpClient.Do(httpReq)
	if err != nil {
		return RawControlResponse{}, err
	}
	defer httpResp.Body.Close()
	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	payload, err := io.ReadAll(io.LimitReader(httpResp.Body, maxResponseBytes+1))
	if err != nil {
		return RawControlResponse{}, err
	}
	oversized := len(payload) > maxResponseBytes
	if oversized {
		payload = payload[:maxResponseBytes]
	}
	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
		return RawControlResponse{}, fmt.Errorf("backend returned status %d: %s", httpResp.StatusCode, string(payload))
	}
	if oversized {
		return RawControlResponse{}, fmt.Errorf("backend response exceeds %d bytes", maxResponseBytes)
	}
	return RawControlResponse{StatusCode: httpResp.StatusCode, Body: json.RawMessage(payload)}, nil
}
func (c *Client) getJSON(ctx context.Context, path string, response any) error {
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
if err != nil {
@@ -21,6 +21,11 @@ type Config struct {
NodeName string
StateDir string
WorkloadSupervisionEnabled bool
WebIngressRuntimeEnabled bool
WebIngressSigningPrivateKey string
WebIngressSigningKeyID string
WebIngressTrustedKeysJSON string
WebIngressRuntimeServiceClasses string
HeartbeatInterval time.Duration
EnrollmentPollInterval time.Duration
EnrollmentPollTimeout time.Duration
@@ -43,6 +48,12 @@ type Config struct {
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
MeshLocalSegmentID string
MeshNATGroupID string
MeshSTUNReflexiveEndpoint string
MeshSTUNServer string
MeshRelayNodeID string
MeshRelayEndpoint string
MeshRegion string
MeshSyntheticConfigPath string
MeshPeerEndpointsJSON string
@@ -68,9 +79,14 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.StringVar(&cfg.NodeName, "node-name", getEnv(env, "RAP_NODE_NAME", hostnameOrDefault()), "Node display name.")
fs.StringVar(&cfg.StateDir, "state-dir", getEnv(env, "RAP_NODE_STATE_DIR", defaultStateDir), "Local node-agent state directory.")
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getEnvBool(env, "RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable desired workload polling and status reporting. Disabled by default while service runtime is not implemented.")
fs.BoolVar(&cfg.WebIngressRuntimeEnabled, "web-ingress-runtime-enabled", getEnvBool(env, "RAP_WEB_INGRESS_RUNTIME_ENABLED", false), "Enable the future real 80/443 web ingress listener runtime. Disabled by default; contract probe remains safe without it.")
fs.StringVar(&cfg.WebIngressSigningPrivateKey, "web-ingress-signing-private-key", getEnv(env, "RAP_WEB_INGRESS_SIGNING_PRIVATE_KEY", ""), "Base64 Ed25519 private key used to sign web ingress fabric envelopes. Empty keeps signing disabled.")
fs.StringVar(&cfg.WebIngressSigningKeyID, "web-ingress-signing-key-id", getEnv(env, "RAP_WEB_INGRESS_SIGNING_KEY_ID", ""), "Optional key id for web ingress envelope signatures.")
fs.StringVar(&cfg.WebIngressTrustedKeysJSON, "web-ingress-trusted-keys-json", getEnv(env, "RAP_WEB_INGRESS_TRUSTED_KEYS_JSON", ""), "JSON map or array of trusted Ed25519 public keys for web ingress runtime receiver.")
fs.StringVar(&cfg.WebIngressRuntimeServiceClasses, "web-ingress-runtime-service-classes", getEnv(env, "RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES", ""), "Optional comma-separated allow-list of web ingress runtime service classes accepted by this node.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getEnvBool(env, "RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session WebSocket endpoint. Disabled by default.")
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getEnvBool(env, "RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint. Disabled by default.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getEnvBool(env, "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric session when explicitly enabled. Disabled by default.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getEnvBool(env, "RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener. Disabled by default.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getEnv(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "Listen address for QUIC/UDP fabric endpoint, for example :19443.")
@@ -84,9 +100,15 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls"), "Transport label for the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Transport label for the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshLocalSegmentID, "mesh-local-segment-id", getEnv(env, "RAP_MESH_LOCAL_SEGMENT_ID", ""), "Optional local LAN/site segment ID advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshNATGroupID, "mesh-nat-group-id", getEnv(env, "RAP_MESH_NAT_GROUP_ID", ""), "Optional NAT group ID advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshSTUNReflexiveEndpoint, "mesh-stun-reflexive-endpoint", getEnv(env, "RAP_MESH_STUN_REFLEXIVE_ENDPOINT", ""), "Optional STUN-discovered reflexive QUIC endpoint, for example quic://203.0.113.10:19443.")
fs.StringVar(&cfg.MeshSTUNServer, "mesh-stun-server", getEnv(env, "RAP_MESH_STUN_SERVER", ""), "Optional STUN server name used to discover the reflexive endpoint.")
fs.StringVar(&cfg.MeshRelayNodeID, "mesh-relay-node-id", getEnv(env, "RAP_MESH_RELAY_NODE_ID", ""), "Optional relay node ID for relay-required QUIC fallback candidates.")
fs.StringVar(&cfg.MeshRelayEndpoint, "mesh-relay-endpoint", getEnv(env, "RAP_MESH_RELAY_ENDPOINT", ""), "Optional relay QUIC endpoint for relay-required fallback candidates.")
fs.StringVar(&cfg.MeshRegion, "mesh-region", getEnv(env, "RAP_MESH_REGION", ""), "Optional region/site hint for the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshSyntheticConfigPath, "mesh-synthetic-config", getEnv(env, "RAP_MESH_SYNTHETIC_CONFIG", ""), "Path to scoped synthetic mesh config snapshot. Preferred over debug JSON env.")
fs.StringVar(&cfg.MeshPeerEndpointsJSON, "mesh-peer-endpoints-json", getEnv(env, "RAP_MESH_PEER_ENDPOINTS_JSON", ""), "JSON object mapping peer node_id to synthetic mesh endpoint URL.")
@@ -129,12 +151,27 @@ func Load(args []string, env map[string]string) (Config, error) {
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
if cfg.MeshAdvertiseTransport == "" {
cfg.MeshAdvertiseTransport = "quic"
}
cfg.MeshAdvertiseTransport = normalizeLegacyAdvertiseTransport(cfg.MeshAdvertiseTransport)
cfg.MeshAdvertiseEndpoint = normalizeLegacyEndpointSchemeToQUIC(cfg.MeshAdvertiseEndpoint)
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
cfg.MeshLocalSegmentID = strings.TrimSpace(cfg.MeshLocalSegmentID)
cfg.MeshNATGroupID = strings.TrimSpace(cfg.MeshNATGroupID)
cfg.MeshSTUNReflexiveEndpoint = normalizeLegacyEndpointSchemeToQUIC(strings.TrimRight(strings.TrimSpace(cfg.MeshSTUNReflexiveEndpoint), "/"))
cfg.MeshSTUNServer = strings.TrimSpace(cfg.MeshSTUNServer)
cfg.MeshRelayNodeID = strings.TrimSpace(cfg.MeshRelayNodeID)
cfg.MeshRelayEndpoint = normalizeLegacyEndpointSchemeToQUIC(strings.TrimRight(strings.TrimSpace(cfg.MeshRelayEndpoint), "/"))
cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
cfg.MeshSyntheticConfigPath = strings.TrimSpace(cfg.MeshSyntheticConfigPath)
cfg.MeshPeerEndpointsJSON = strings.TrimSpace(cfg.MeshPeerEndpointsJSON)
cfg.MeshSyntheticRoutesJSON = strings.TrimSpace(cfg.MeshSyntheticRoutesJSON)
cfg.WebIngressSigningPrivateKey = strings.TrimSpace(cfg.WebIngressSigningPrivateKey)
cfg.WebIngressSigningKeyID = strings.TrimSpace(cfg.WebIngressSigningKeyID)
cfg.WebIngressTrustedKeysJSON = strings.TrimSpace(cfg.WebIngressTrustedKeysJSON)
cfg.WebIngressRuntimeServiceClasses = strings.TrimSpace(cfg.WebIngressRuntimeServiceClasses)
cfg.RemoteWorkspaceRealAdapterCommand = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterCommand)
cfg.RemoteWorkspaceRealAdapterArgsJSON = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterArgsJSON)
cfg.RemoteWorkspaceRealAdapterWorkDir = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterWorkDir)
@@ -176,9 +213,62 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return Config{}, errors.New("mesh listen auto port start must be less than or equal to end")
}
if !isQUICAdvertiseTransport(cfg.MeshAdvertiseTransport) {
return Config{}, errors.New("mesh advertise transport must be a QUIC transport label")
}
if hasLegacyEndpointScheme(cfg.MeshAdvertiseEndpoint) {
return Config{}, errors.New("mesh advertise endpoint must be a QUIC endpoint")
}
if cfg.MeshSTUNReflexiveEndpoint != "" && hasLegacyEndpointScheme(cfg.MeshSTUNReflexiveEndpoint) {
return Config{}, errors.New("mesh STUN reflexive endpoint must be a QUIC endpoint")
}
if cfg.MeshRelayEndpoint != "" && hasLegacyEndpointScheme(cfg.MeshRelayEndpoint) {
return Config{}, errors.New("mesh relay endpoint must be a QUIC endpoint")
}
return cfg, nil
}
// isQUICAdvertiseTransport reports whether label names one of the accepted
// QUIC transport labels. The comparison ignores surrounding whitespace and
// letter case.
func isQUICAdvertiseTransport(label string) bool {
	normalized := strings.ToLower(strings.TrimSpace(label))
	for _, accepted := range [...]string{
		"quic", "direct_quic", "udp_quic", "quic_udp",
		"lan_quic", "reverse_quic", "relay_quic", "ice_quic",
	} {
		if normalized == accepted {
			return true
		}
	}
	return false
}
// normalizeLegacyAdvertiseTransport rewrites pre-QUIC transport labels to
// their QUIC replacements. Legacy names are matched case-insensitively after
// trimming whitespace; any other label is returned trimmed with its original
// letter case intact.
func normalizeLegacyAdvertiseTransport(label string) string {
	trimmed := strings.TrimSpace(label)
	replacements := map[string]string{
		"direct_http":      "direct_quic",
		"direct_https":     "direct_quic",
		"direct_tcp_tls":   "direct_quic",
		"http":             "direct_quic",
		"https":            "direct_quic",
		"ws":               "direct_quic",
		"wss":              "direct_quic",
		"websocket":        "direct_quic",
		"outbound_reverse": "reverse_quic",
		"reverse":          "reverse_quic",
		"reverse_outbound": "reverse_quic",
		"relay":            "relay_quic",
		"relay_control":    "relay_quic",
	}
	if quicLabel, legacy := replacements[strings.ToLower(trimmed)]; legacy {
		return quicLabel
	}
	return trimmed
}
// normalizeLegacyEndpointSchemeToQUIC trims whitespace and trailing slashes
// from endpoint and, when it starts with an http, https, ws or wss scheme
// (in any letter case), swaps that scheme for "quic://". Endpoints without a
// legacy scheme are returned trimmed but otherwise unchanged.
func normalizeLegacyEndpointSchemeToQUIC(endpoint string) string {
	trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/")
	folded := strings.ToLower(trimmed)

	// The four legacy prefixes are mutually exclusive, so at most one case fires.
	schemeLen := 0
	switch {
	case strings.HasPrefix(folded, "http://"):
		schemeLen = len("http://")
	case strings.HasPrefix(folded, "https://"):
		schemeLen = len("https://")
	case strings.HasPrefix(folded, "ws://"):
		schemeLen = len("ws://")
	case strings.HasPrefix(folded, "wss://"):
		schemeLen = len("wss://")
	}
	if schemeLen == 0 {
		return trimmed
	}
	return "quic://" + trimmed[schemeLen:]
}
// hasLegacyEndpointScheme reports whether endpoint, after trimming whitespace
// and lowercasing, begins with an http, https, ws or wss URL scheme.
func hasLegacyEndpointScheme(endpoint string) bool {
	candidate := strings.ToLower(strings.TrimSpace(endpoint))
	for _, scheme := range []string{"http://", "https://", "ws://", "wss://"} {
		if strings.HasPrefix(candidate, scheme) {
			return true
		}
	}
	return false
}
func readEnv() map[string]string {
out := map[string]string{}
for _, pair := range os.Environ() {
@@ -15,6 +15,11 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_NODE_NAME": "node-a",
"RAP_NODE_STATE_DIR": "/tmp/rap-node",
"RAP_WORKLOAD_SUPERVISION_ENABLED": "true",
"RAP_WEB_INGRESS_RUNTIME_ENABLED": "true",
"RAP_WEB_INGRESS_SIGNING_PRIVATE_KEY": " private-key-b64 ",
"RAP_WEB_INGRESS_SIGNING_KEY_ID": " web-key-1 ",
"RAP_WEB_INGRESS_TRUSTED_KEYS_JSON": ` {"web-key-1":"public-key-b64"} `,
"RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES": " platform_admin, cluster_admin ",
"RAP_HEARTBEAT_INTERVAL_SECONDS": "7",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS": "3",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
@@ -32,11 +37,17 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_MESH_LISTEN_PORT_MODE": "auto",
"RAP_MESH_LISTEN_AUTO_PORT_START": "19010",
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
"RAP_MESH_NAT_TYPE": "symmetric",
"RAP_MESH_LOCAL_SEGMENT_ID": "site-a",
"RAP_MESH_NAT_GROUP_ID": "nat-a",
"RAP_MESH_STUN_REFLEXIVE_ENDPOINT": "quic://203.0.113.20:19443/",
"RAP_MESH_STUN_SERVER": "stun.example.test:3478",
"RAP_MESH_RELAY_NODE_ID": "node-r",
"RAP_MESH_RELAY_ENDPOINT": "quic://node-r.example.test:19443/",
"RAP_MESH_REGION": "eu",
"RAP_MESH_SYNTHETIC_CONFIG": "/tmp/rap-node/mesh-synthetic.json",
"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"http://127.0.0.1:19002"}`,
@@ -67,6 +78,15 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if !cfg.WorkloadSupervisionEnabled {
t.Fatal("WorkloadSupervisionEnabled = false, want true")
}
if !cfg.WebIngressRuntimeEnabled {
t.Fatal("WebIngressRuntimeEnabled = false, want true")
}
if cfg.WebIngressSigningPrivateKey != "private-key-b64" ||
cfg.WebIngressSigningKeyID != "web-key-1" ||
cfg.WebIngressTrustedKeysJSON != `{"web-key-1":"public-key-b64"}` ||
cfg.WebIngressRuntimeServiceClasses != "platform_admin, cluster_admin" {
t.Fatalf("unexpected web ingress key config: %+v", cfg)
}
if !cfg.MeshSyntheticRuntimeEnabled {
t.Fatal("MeshSyntheticRuntimeEnabled = false, want true")
}
@@ -100,11 +120,17 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if cfg.MeshListenPortMode != "auto" || cfg.MeshListenAutoPortStart != 19010 || cfg.MeshListenAutoPortEnd != 19020 {
t.Fatalf("unexpected mesh listen port config: %+v", cfg)
}
if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" ||
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:19443" ||
cfg.MeshAdvertiseEndpointsJSON == "" ||
cfg.MeshAdvertiseTransport != "wss" ||
cfg.MeshAdvertiseTransport != "direct_quic" ||
cfg.MeshConnectivityMode != "outbound_only" ||
cfg.MeshNATType != "symmetric" ||
cfg.MeshLocalSegmentID != "site-a" ||
cfg.MeshNATGroupID != "nat-a" ||
cfg.MeshSTUNReflexiveEndpoint != "quic://203.0.113.20:19443" ||
cfg.MeshSTUNServer != "stun.example.test:3478" ||
cfg.MeshRelayNodeID != "node-r" ||
cfg.MeshRelayEndpoint != "quic://node-r.example.test:19443" ||
cfg.MeshRegion != "eu" {
t.Fatalf("unexpected mesh advertise config: %+v", cfg)
}
@@ -139,6 +165,9 @@ func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
cfg.RemoteWorkspaceRealAdapterWorkDir != "" {
t.Fatalf("real adapter config should default disabled and empty: %+v", cfg)
}
if cfg.WebIngressRuntimeEnabled {
t.Fatalf("web ingress runtime should default disabled: %+v", cfg)
}
}
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
@@ -162,3 +191,33 @@ func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T
t.Fatal("Load returned nil error for too-large sink capacity")
}
}
// TestLoadConfigNormalizesLegacyMeshAdvertiseTransport checks that Load accepts
// the legacy "wss" transport label and stores it as "direct_quic".
func TestLoadConfigNormalizesLegacyMeshAdvertiseTransport(t *testing.T) {
	env := map[string]string{
		"RAP_BACKEND_URL":              "http://backend/api/v1",
		"RAP_NODE_NAME":                "node-a",
		"RAP_MESH_ADVERTISE_ENDPOINT":  "quic://node-a.example.test:19443",
		"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
	}

	cfg, err := Load(nil, env)
	if err != nil {
		t.Fatalf("Load returned error for legacy mesh advertise transport migration: %v", err)
	}
	if got := cfg.MeshAdvertiseTransport; got != "direct_quic" {
		t.Fatalf("transport = %q, want direct_quic", got)
	}
}
// TestLoadConfigNormalizesLegacyMeshAdvertiseEndpointScheme checks that Load
// rewrites an https:// advertise endpoint to the quic:// scheme instead of
// rejecting it.
func TestLoadConfigNormalizesLegacyMeshAdvertiseEndpointScheme(t *testing.T) {
	env := map[string]string{
		"RAP_BACKEND_URL":              "http://backend/api/v1",
		"RAP_NODE_NAME":                "node-a",
		"RAP_MESH_ADVERTISE_ENDPOINT":  "https://node-a.example.test:443",
		"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
	}

	cfg, err := Load(nil, env)
	if err != nil {
		t.Fatalf("Load returned error for legacy mesh advertise endpoint migration: %v", err)
	}
	if got := cfg.MeshAdvertiseEndpoint; got != "quic://node-a.example.test:443" {
		t.Fatalf("endpoint = %q, want quic scheme", got)
	}
}
@@ -1,6 +1,9 @@
package fabricproto
import "errors"
import (
"crypto/sha256"
"errors"
)
var (
ErrUnsupportedSessionFrame = errors.New("unsupported fabric session frame")
@@ -62,6 +65,7 @@ func (s *Session) HandleFrame(frame Frame) (SessionEvent, []Frame, error) {
TrafficClass: frame.TrafficClass,
StreamID: frame.StreamID,
Sequence: frame.Sequence,
Payload: DataAckPayload(frame.Payload),
}}, nil
case FrameAck:
if err := s.Ack(frame.StreamID, frame.Sequence); err != nil {
@@ -103,6 +107,11 @@ func (s *Session) HandleFrame(frame Frame) (SessionEvent, []Frame, error) {
}
}
// DataAckPayload returns the SHA-256 digest of payload. HandleFrame places it
// in the Payload of the ack frame it emits for a data frame.
func DataAckPayload(payload []byte) []byte {
	hasher := sha256.New()
	hasher.Write(payload) // hash.Hash.Write never returns an error
	return hasher.Sum(nil)
}
func (s *Session) handleDataFrame(frame Frame) (SessionEvent, error) {
s.mu.Lock()
defer s.mu.Unlock()
@@ -1,6 +1,7 @@
package fabricproto
import (
"bytes"
"errors"
"testing"
)
@@ -36,6 +37,9 @@ func TestHandleFrameOpensStreamAndReceivesData(t *testing.T) {
if len(responses) != 1 || responses[0].Type != FrameAck || responses[0].StreamID != 7 || responses[0].Sequence != 11 {
t.Fatalf("responses = %+v, want ack for stream 7 seq 11", responses)
}
if !bytes.Equal(responses[0].Payload, DataAckPayload([]byte("rdp-input"))) {
t.Fatalf("ack checksum = %x, want sha256 payload checksum", responses[0].Payload)
}
snapshot := session.Snapshot()
if snapshot.FramesReceived != 1 || snapshot.Streams[7].Received != 1 {
t.Fatalf("received metrics = %+v stream=%+v", snapshot, snapshot.Streams[7])
@@ -136,6 +136,12 @@ func (cfg RuntimeConfig) ValidateInstall() error {
if cfg.MeshListenAutoPortStart > 0 && cfg.MeshListenAutoPortEnd > 0 && cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return errors.New("mesh listen auto port start must be less than or equal to end")
}
if cfg.MeshAdvertiseTransport != "" && !isQUICAdvertiseTransport(cfg.MeshAdvertiseTransport) {
return errors.New("mesh advertise transport must be a QUIC transport label")
}
if hasLegacyEndpointScheme(cfg.MeshAdvertiseEndpoint) {
return errors.New("mesh advertise endpoint must be a QUIC endpoint")
}
if cfg.ProductionObservationSinkCap < 0 {
return errors.New("production observation sink capacity must not be negative")
}
@@ -153,3 +159,20 @@ func firstNonEmpty(value, fallback string) string {
}
return strings.TrimSpace(value)
}
// isQUICAdvertiseTransport reports whether label, once trimmed and lowercased,
// is one of the transport labels ValidateInstall accepts for a QUIC mesh
// endpoint.
func isQUICAdvertiseTransport(label string) bool {
	normalized := strings.ToLower(strings.TrimSpace(label))
	switch normalized {
	case "quic", "udp_quic", "quic_udp":
		// Generic QUIC-over-UDP spellings.
		return true
	case "direct_quic", "lan_quic", "ice_quic":
		return true
	case "reverse_quic", "relay_quic":
		return true
	}
	return false
}
// hasLegacyEndpointScheme reports whether endpoint carries an http, https, ws
// or wss URL scheme. Surrounding whitespace and letter case are ignored.
func hasLegacyEndpointScheme(endpoint string) bool {
	scheme, _, found := strings.Cut(strings.ToLower(strings.TrimSpace(endpoint)), "://")
	if !found {
		return false
	}
	switch scheme {
	case "http", "https", "ws", "wss":
		return true
	}
	return false
}
@@ -73,7 +73,8 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
VPNFabricQUICMaxStreamsPerConn: 24,
VPNFabricQUICIdleTTLSeconds: 120,
MeshListenAddr: ":19131",
MeshAdvertiseEndpoint: "http://10.0.0.11:19131/",
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443/",
MeshAdvertiseTransport: "direct_quic",
MeshConnectivityMode: "private_lan",
})
@@ -94,7 +95,8 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
"RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN=24",
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=120",
"RAP_MESH_LISTEN_ADDR=:19131",
"RAP_MESH_ADVERTISE_ENDPOINT=http://10.0.0.11:19131",
"RAP_MESH_ADVERTISE_ENDPOINT=quic://10.0.0.11:19443",
"RAP_MESH_ADVERTISE_TRANSPORT=direct_quic",
"RAP_MESH_CONNECTIVITY_MODE=private_lan",
"rap-node-agent:test",
} {
@@ -384,3 +386,35 @@ func TestValidateRequiresJoinTokenUnlessReplacingExistingState(t *testing.T) {
t.Fatalf("replace update should allow missing join token: %v", err)
}
}
// TestValidateRejectsLegacyMeshAdvertiseTransport checks that ValidateInstall
// refuses a non-QUIC transport label ("wss") even when the endpoint itself is
// a quic:// URL.
func TestValidateRejectsLegacyMeshAdvertiseTransport(t *testing.T) {
	cfg := RuntimeConfig{
		BackendURL:               "http://control/api/v1",
		ClusterID:                "cluster-1",
		JoinToken:                "join-secret",
		NodeName:                 "node-a",
		MeshAdvertiseEndpoint:    "quic://10.0.0.11:19443",
		MeshAdvertiseTransport:   "wss",
		MeshQUICFabricEnabled:    true,
		MeshQUICFabricListenAddr: ":19443",
	}

	err := cfg.ValidateInstall()
	if err != nil && strings.Contains(err.Error(), "QUIC transport") {
		return
	}
	t.Fatalf("expected QUIC transport validation error, got %v", err)
}
// TestValidateRejectsLegacyMeshAdvertiseEndpointScheme checks that
// ValidateInstall refuses an http:// advertise endpoint even when the
// transport label is a QUIC one.
func TestValidateRejectsLegacyMeshAdvertiseEndpointScheme(t *testing.T) {
	cfg := RuntimeConfig{
		BackendURL:               "http://control/api/v1",
		ClusterID:                "cluster-1",
		JoinToken:                "join-secret",
		NodeName:                 "node-a",
		MeshAdvertiseEndpoint:    "http://10.0.0.11:19131",
		MeshAdvertiseTransport:   "direct_quic",
		MeshQUICFabricEnabled:    true,
		MeshQUICFabricListenAddr: ":19443",
	}

	err := cfg.ValidateInstall()
	if err != nil && strings.Contains(err.Error(), "QUIC endpoint") {
		return
	}
	t.Fatalf("expected QUIC endpoint validation error, got %v", err)
}
@@ -16,6 +16,7 @@ import (
"strings"
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
@@ -119,6 +120,21 @@ type NodeUpdatePlan struct {
Artifact *ReleaseArtifact `json:"artifact,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature json.RawMessage `json:"authority_signature,omitempty"`
AuthorityQuorum *clusterauth.QuorumEnvelope `json:"authority_quorum,omitempty"`
ProductionForwarding bool `json:"production_forwarding"`
}
// nodeUpdatePlanAuthorityPayload is the decoded form of
// NodeUpdatePlan.AuthorityPayload, the JSON document the cluster authority
// signs. verifyNodeUpdatePlanAuthorityPayload unmarshals into this type and
// compares its fields against the unsigned plan fields.
type nodeUpdatePlanAuthorityPayload struct {
// SchemaVersion must equal "rap.node_update_plan_authority.v1" to be accepted.
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
Product string `json:"product"`
CurrentVersion string `json:"current_version,omitempty"`
Action string `json:"action"`
TargetVersion string `json:"target_version,omitempty"`
// ArtifactSHA256 and ArtifactURL are compared only when the plan carries an artifact.
ArtifactSHA256 string `json:"artifact_sha256,omitempty"`
ArtifactURL string `json:"artifact_url,omitempty"`
// ControlPlaneOnly is decoded but not compared by verifyNodeUpdatePlanAuthorityPayload.
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
}
@@ -516,9 +532,87 @@ func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
return NodeUpdatePlan{}, err
}
if err := verifyNodeUpdatePlanAuthority(req, out.Plan); err != nil {
return NodeUpdatePlan{}, err
}
return out.Plan, nil
}
// verifyNodeUpdatePlanAuthority checks a fetched update plan against the
// cluster authority recorded in the node's local identity state.
//
// When pinnedUpdatePlanAuthority reports no pinned authority, the plan is
// accepted without verification. Otherwise the plan must carry a quorum
// envelope (when the identity holds a quorum descriptor) or a single authority
// signature, and the signed payload must agree with the plan's own fields.
func verifyNodeUpdatePlanAuthority(req UpdateRequest, plan NodeUpdatePlan) error {
	identity, pinned := pinnedUpdatePlanAuthority(req)
	if !pinned {
		return nil
	}

	if len(identity.ClusterAuthorityQuorum) == 0 {
		// Single-key authority: one signature over the payload is required.
		if len(plan.AuthorityPayload) == 0 || len(plan.AuthoritySignature) == 0 {
			return errors.New("update plan authority signature is required by pinned cluster authority")
		}
		var signature clusterauth.Signature
		if err := json.Unmarshal(plan.AuthoritySignature, &signature); err != nil {
			return fmt.Errorf("invalid update plan authority signature: %w", err)
		}
		// The fingerprint is only enforced when the identity recorded one.
		if pinnedFingerprint := identity.ClusterAuthorityFingerprint; pinnedFingerprint != "" && pinnedFingerprint != signature.KeyFingerprint {
			return errors.New("update plan authority fingerprint mismatch")
		}
		if err := clusterauth.VerifyRaw(identity.ClusterAuthorityPublicKey, plan.AuthorityPayload, signature); err != nil {
			return fmt.Errorf("update plan authority signature rejected: %w", err)
		}
		return verifyNodeUpdatePlanAuthorityPayload(plan)
	}

	// Quorum authority: the plan must carry an envelope that satisfies the
	// pinned quorum descriptor for the "update-authority" scope.
	if plan.AuthorityQuorum == nil {
		return errors.New("update plan quorum authority is required by pinned cluster quorum")
	}
	var descriptor clusterauth.QuorumDescriptor
	if err := json.Unmarshal(identity.ClusterAuthorityQuorum, &descriptor); err != nil {
		return fmt.Errorf("invalid pinned cluster authority quorum: %w", err)
	}
	if len(plan.AuthorityPayload) == 0 {
		return errors.New("update plan authority payload is required by pinned cluster quorum")
	}
	if err := clusterauth.VerifyQuorumRaw(descriptor, plan.AuthorityPayload, *plan.AuthorityQuorum, "update-authority"); err != nil {
		return fmt.Errorf("update plan quorum authority rejected: %w", err)
	}
	return verifyNodeUpdatePlanAuthorityPayload(plan)
}
// verifyNodeUpdatePlanAuthorityPayload decodes the plan's signed authority
// payload and confirms that it describes the same update as the plan's
// unsigned fields. It returns an error when the payload cannot be decoded or
// when any compared field differs. Artifact hash and URL are compared only
// when the plan carries an artifact.
func verifyNodeUpdatePlanAuthorityPayload(plan NodeUpdatePlan) error {
	var signed nodeUpdatePlanAuthorityPayload
	if err := json.Unmarshal(plan.AuthorityPayload, &signed); err != nil {
		return fmt.Errorf("invalid update plan authority payload: %w", err)
	}

	fieldsAgree := signed.SchemaVersion == "rap.node_update_plan_authority.v1" &&
		signed.ClusterID == plan.ClusterID &&
		signed.NodeID == plan.NodeID &&
		signed.Product == plan.Product &&
		signed.CurrentVersion == plan.CurrentVersion &&
		signed.Action == plan.Action &&
		signed.TargetVersion == plan.TargetVersion &&
		signed.ProductionForwarding == plan.ProductionForwarding
	if !fieldsAgree {
		return errors.New("update plan authority payload mismatch")
	}

	if artifact := plan.Artifact; artifact != nil &&
		(signed.ArtifactSHA256 != artifact.SHA256 || signed.ArtifactURL != artifact.URL) {
		return errors.New("update plan artifact authority payload mismatch")
	}
	return nil
}
// pinnedUpdatePlanAuthority loads the node identity from req.StateDir and
// reports whether it pins a cluster authority that update plans must be
// verified against.
//
// An identity counts as pinned when it records either a single authority
// public key or a quorum descriptor. Checking only the public key would let a
// quorum-only identity skip verification entirely, leaving the quorum branch
// of verifyNodeUpdatePlanAuthority unreachable.
//
// It returns the zero Identity and false when no state directory is
// configured, when the identity cannot be loaded, or when neither form of
// authority is recorded.
func pinnedUpdatePlanAuthority(req UpdateRequest) (state.Identity, bool) {
	stateDir := strings.TrimSpace(req.StateDir)
	if stateDir == "" {
		return state.Identity{}, false
	}
	identity, err := state.Load(filepath.Join(stateDir, state.FileName))
	if err != nil {
		// NOTE(review): any load failure (not only a missing file) is treated
		// as "nothing pinned", so verification fails open here. Confirm that
		// this is intended for corrupt or unreadable state files.
		return state.Identity{}, false
	}
	hasPinnedKey := strings.TrimSpace(identity.ClusterAuthorityPublicKey) != ""
	hasPinnedQuorum := len(identity.ClusterAuthorityQuorum) > 0
	if !hasPinnedKey && !hasPinnedQuorum {
		return state.Identity{}, false
	}
	return identity, true
}
func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
req = req.Normalize()
if err := req.Validate(); err != nil {
@@ -2,6 +2,9 @@ package hostagent
import (
"context"
"crypto/ed25519"
cryptorand "crypto/rand"
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
@@ -12,6 +15,7 @@ import (
"testing"
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
@@ -21,6 +25,101 @@ type updateRunner struct {
inspectJSON string
}
// writePinnedAuthorityIdentity generates a fresh Ed25519 authority key pair,
// saves a node identity pinning that key (and its fingerprint) into a new
// temporary state directory, and returns the directory with both keys.
func writePinnedAuthorityIdentity(t *testing.T) (string, ed25519.PublicKey, ed25519.PrivateKey) {
	t.Helper()

	pub, priv, err := ed25519.GenerateKey(cryptorand.Reader)
	if err != nil {
		t.Fatalf("generate authority key: %v", err)
	}

	stateDir := t.TempDir()
	identityPath := filepath.Join(stateDir, state.FileName)
	err = state.Save(identityPath, state.Identity{
		NodeID:                      "node-1",
		ClusterID:                   "cluster-1",
		NodeName:                    "node-a",
		IdentityStatus:              "active",
		ClusterAuthorityPublicKey:   base64.StdEncoding.EncodeToString(pub),
		ClusterAuthorityFingerprint: clusterauth.Fingerprint(pub),
	})
	if err != nil {
		t.Fatalf("save identity: %v", err)
	}
	return stateDir, pub, priv
}
// writePinnedQuorumIdentity builds a 2-of-3 "update-authority" quorum
// descriptor with freshly generated Ed25519 members, saves a node identity
// pinning that descriptor into a new temporary state directory, and returns
// the directory, the descriptor, and the members' private keys in member order.
func writePinnedQuorumIdentity(t *testing.T) (string, clusterauth.QuorumDescriptor, []ed25519.PrivateKey) {
	t.Helper()

	const memberCount = 3
	descriptor := clusterauth.QuorumDescriptor{
		SchemaVersion: clusterauth.QuorumSchemaVersion,
		ClusterID:     "cluster-1",
		Epoch:         "epoch-1",
		Threshold:     2,
	}
	signers := make([]ed25519.PrivateKey, 0, memberCount)
	for ordinal := 1; ordinal <= memberCount; ordinal++ {
		pub, priv, err := ed25519.GenerateKey(cryptorand.Reader)
		if err != nil {
			t.Fatalf("generate authority key: %v", err)
		}
		member := clusterauth.QuorumMember{
			NodeID:               fmt.Sprintf("authority-%d", ordinal),
			Role:                 "update-authority",
			PublicKey:            base64.StdEncoding.EncodeToString(pub),
			PublicKeyFingerprint: clusterauth.Fingerprint(pub),
			Scopes:               []string{"update-authority"},
		}
		descriptor.Members = append(descriptor.Members, member)
		signers = append(signers, priv)
	}

	encodedQuorum, err := json.Marshal(descriptor)
	if err != nil {
		t.Fatalf("marshal quorum: %v", err)
	}

	stateDir := t.TempDir()
	err = state.Save(filepath.Join(stateDir, state.FileName), state.Identity{
		NodeID:                 "node-1",
		ClusterID:              "cluster-1",
		NodeName:               "node-a",
		IdentityStatus:         "active",
		ClusterAuthorityQuorum: encodedQuorum,
	})
	if err != nil {
		t.Fatalf("save identity: %v", err)
	}
	return stateDir, descriptor, signers
}
// signedAuthorityPayload marshals payload to JSON and signs its canonical form
// with privateKey. It returns the raw (non-canonical) JSON together with a
// signature whose fingerprint is derived from publicKey.
func signedAuthorityPayload(t *testing.T, publicKey ed25519.PublicKey, privateKey ed25519.PrivateKey, payload any) (json.RawMessage, clusterauth.Signature) {
	t.Helper()

	encoded, err := json.Marshal(payload)
	if err != nil {
		t.Fatalf("marshal payload: %v", err)
	}
	canonicalForm, err := clusterauth.CanonicalJSON(encoded)
	if err != nil {
		t.Fatalf("canonical payload: %v", err)
	}

	sigBytes := ed25519.Sign(privateKey, canonicalForm)
	signature := clusterauth.Signature{
		SchemaVersion:  clusterauth.SignatureSchemaVersion,
		Algorithm:      clusterauth.AlgorithmEd25519,
		KeyFingerprint: clusterauth.Fingerprint(publicKey),
		Signature:      base64.StdEncoding.EncodeToString(sigBytes),
	}
	return encoded, signature
}
// signHostAgentPayload signs the canonical form of an already-encoded JSON
// payload with privateKey and returns the signature, fingerprinted with the
// public key derived from privateKey.
func signHostAgentPayload(t *testing.T, payload json.RawMessage, privateKey ed25519.PrivateKey) clusterauth.Signature {
	t.Helper()

	canonicalForm, err := clusterauth.CanonicalJSON(payload)
	if err != nil {
		t.Fatalf("canonical payload: %v", err)
	}

	derivedPublic := privateKey.Public().(ed25519.PublicKey)
	sigBytes := ed25519.Sign(privateKey, canonicalForm)
	return clusterauth.Signature{
		SchemaVersion:  clusterauth.SignatureSchemaVersion,
		Algorithm:      clusterauth.AlgorithmEd25519,
		KeyFingerprint: clusterauth.Fingerprint(derivedPublic),
		Signature:      base64.StdEncoding.EncodeToString(sigBytes),
	}
}
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
urls := artifactURLsForBackend(ReleaseArtifact{
URL: "/downloads/rap-node-agent-0.2.92.tar",
@@ -41,6 +140,161 @@ func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.
}
}
// TestFetchNodeUpdatePlanRejectsUnsignedPlanWithPinnedAuthority checks that a
// node whose identity pins a cluster authority key refuses an update plan that
// carries no authority payload or signature.
func TestFetchNodeUpdatePlanRejectsUnsignedPlanWithPinnedAuthority(t *testing.T) {
	stateDir, _, _ := writePinnedAuthorityIdentity(t)

	unsignedPlan := map[string]any{
		"schema_version":        "rap.node_update_plan.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"reason":                "already_current",
		"production_forwarding": false,
	}
	handler := func(w http.ResponseWriter, r *http.Request) {
		_ = json.NewEncoder(w).Encode(map[string]any{
			"node_update_plan": unsignedPlan,
		})
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	request := UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		StateDir:       stateDir,
		CurrentVersion: "0.1.0",
		OS:             "linux",
		Arch:           "amd64",
		InstallType:    "docker",
	}
	_, err := FetchNodeUpdatePlan(context.Background(), request)
	if err != nil && strings.Contains(err.Error(), "authority signature is required") {
		return
	}
	t.Fatalf("expected pinned authority rejection, got %v", err)
}
// TestFetchNodeUpdatePlanAcceptsSignedPlanWithPinnedAuthority checks that a
// plan whose authority payload is signed by the pinned cluster authority key,
// and whose payload matches the plan fields, is returned to the caller.
func TestFetchNodeUpdatePlanAcceptsSignedPlanWithPinnedAuthority(t *testing.T) {
	stateDir, publicKey, privateKey := writePinnedAuthorityIdentity(t)

	// Sign the authority payload first so the plan can be built in one literal.
	rawPayload, signature := signedAuthorityPayload(t, publicKey, privateKey, map[string]any{
		"schema_version":        "rap.node_update_plan_authority.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"target_version":        "",
		"artifact_sha256":       "",
		"control_plane_only":    true,
		"production_forwarding": false,
	})
	plan := map[string]any{
		"schema_version":        "rap.node_update_plan.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"reason":                "already_current",
		"production_forwarding": false,
		"authority_payload":     json.RawMessage(rawPayload),
		"authority_signature":   signature,
	}

	handler := func(w http.ResponseWriter, r *http.Request) {
		_ = json.NewEncoder(w).Encode(map[string]any{"node_update_plan": plan})
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	request := UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		StateDir:       stateDir,
		CurrentVersion: "0.1.0",
		OS:             "linux",
		Arch:           "amd64",
		InstallType:    "docker",
	}
	got, err := FetchNodeUpdatePlan(context.Background(), request)
	if err != nil {
		t.Fatalf("fetch signed plan: %v", err)
	}
	if !(got.Action == "none" && got.Reason == "already_current") {
		t.Fatalf("unexpected plan: %+v", got)
	}
}
// TestFetchNodeUpdatePlanAcceptsQuorumSignedPlan serves a plan whose authority
// payload is wrapped in a quorum envelope signed by two of the three pinned
// quorum members, and expects FetchNodeUpdatePlan to return it.
func TestFetchNodeUpdatePlanAcceptsQuorumSignedPlan(t *testing.T) {
	stateDir, descriptor, privateKeys := writePinnedQuorumIdentity(t)

	rawPayload, err := json.Marshal(map[string]any{
		"schema_version":        "rap.node_update_plan_authority.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"target_version":        "",
		"artifact_sha256":       "",
		"control_plane_only":    true,
		"production_forwarding": false,
	})
	if err != nil {
		t.Fatalf("marshal payload: %v", err)
	}
	payloadHash, err := clusterauth.HashRaw(rawPayload)
	if err != nil {
		t.Fatalf("payload hash: %v", err)
	}
	quorumHash, err := clusterauth.QuorumDescriptorHash(descriptor)
	if err != nil {
		t.Fatalf("quorum hash: %v", err)
	}

	// Two signatures meet the descriptor's threshold of 2.
	envelope := clusterauth.QuorumEnvelope{
		SchemaVersion: clusterauth.QuorumEnvelopeVersion,
		ClusterID:     "cluster-1",
		Epoch:         "epoch-1",
		Threshold:     2,
		PayloadSHA256: payloadHash,
		QuorumSHA256:  quorumHash,
		Signatures: []clusterauth.Signature{
			signHostAgentPayload(t, rawPayload, privateKeys[0]),
			signHostAgentPayload(t, rawPayload, privateKeys[1]),
		},
	}
	plan := map[string]any{
		"schema_version":        "rap.node_update_plan.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"reason":                "already_current",
		"production_forwarding": false,
		"authority_payload":     json.RawMessage(rawPayload),
		"authority_quorum":      envelope,
	}

	handler := func(w http.ResponseWriter, r *http.Request) {
		_ = json.NewEncoder(w).Encode(map[string]any{"node_update_plan": plan})
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	request := UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		StateDir:       stateDir,
		CurrentVersion: "0.1.0",
		OS:             "linux",
		Arch:           "amd64",
		InstallType:    "docker",
	}
	got, err := FetchNodeUpdatePlan(context.Background(), request)
	if err != nil {
		t.Fatalf("fetch quorum plan: %v", err)
	}
	if got.Action != "none" {
		t.Fatalf("unexpected plan: %+v", got)
	}
}
func (r *updateRunner) Run(_ context.Context, name string, args ...string) (string, error) {
r.calls = append(r.calls, append([]string{name}, args...))
if len(args) >= 2 && args[0] == "inspect" && args[1] == "--format" {
@@ -13,6 +13,7 @@ func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -39,6 +40,7 @@ func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -82,6 +84,7 @@ func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -137,6 +140,7 @@ func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -204,6 +208,7 @@ func TestClientFabricSessionReportsRejectedStatus(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
@@ -72,6 +72,10 @@ const (
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
ProductionForwardQUICStreamID = 1
WebIngressForwardQUICStreamID = 2
FabricControlForwardQUICStreamID = 3
SyntheticForwardQUICStreamID = 1001
)
type PeerIdentity struct {
@@ -47,6 +47,9 @@ func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts Endpoin
}
out := make([]ScoredPeerEndpointCandidate, 0, len(candidates))
for _, candidate := range candidates {
if endpointHasUnspecifiedHost(candidate.Address) {
continue
}
out = append(out, scorePeerEndpointCandidate(candidate, opts))
}
sort.SliceStable(out, func(i, j int) bool {
@@ -68,25 +71,25 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
score := 100
reasons := []string{"base"}
switch candidate.Transport {
switch strings.ToLower(strings.TrimSpace(candidate.Transport)) {
case "quic", "direct_quic", "udp_quic", "quic_udp":
score += 45
reasons = append(reasons, "transport:quic")
case "direct_tcp_tls", "direct_http", "direct_https":
score += 35
reasons = append(reasons, "transport:direct")
case "wss":
score += 25
reasons = append(reasons, "transport:wss")
case "outbound_reverse":
score += 10
reasons = append(reasons, "transport:outbound_reverse")
case "relay":
case "lan_quic":
score += 42
reasons = append(reasons, "transport:lan_quic")
case "ice_quic":
score += 38
reasons = append(reasons, "transport:ice_quic")
case "reverse_quic":
score += 15
reasons = append(reasons, "transport:reverse_quic")
case "relay_quic":
score += 5
reasons = append(reasons, "transport:relay")
reasons = append(reasons, "transport:relay_quic")
default:
score -= 100
reasons = append(reasons, "transport:unknown")
reasons = append(reasons, "transport:non_quic_rejected")
}
switch candidate.Reachability {
@@ -173,7 +176,8 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
score += 8
reasons = append(reasons, "channel:control-direct")
}
if candidate.Transport == "relay" {
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
if transport == "relay" || transport == "relay_quic" {
score -= 8
reasons = append(reasons, "channel:control-relay-penalty")
}
@@ -234,14 +238,20 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
}
switch {
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50:
score += 18
score += 24
reasons = append(reasons, "latency:low")
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 150:
score += 8
reasons = append(reasons, "latency:moderate")
case observation.LastLatencyMs > 0:
score -= 10
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 300:
score -= 12
reasons = append(reasons, "latency:high")
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 750:
score -= 32
reasons = append(reasons, "latency:very_high")
case observation.LastLatencyMs > 0:
score -= 60
reasons = append(reasons, "latency:extreme")
}
if observation.ReliabilityScore > 0 {
switch {
@@ -13,7 +13,7 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay.example.test/node-b",
Reachability: "relay",
NATType: "symmetric",
@@ -25,8 +25,8 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -38,8 +38,8 @@ func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
{
EndpointID: "node-b-private-stale",
NodeID: "node-b",
Transport: "wss",
Address: "10.0.0.5:443",
Transport: "lan_quic",
Address: "quic://10.0.0.5:19443",
Reachability: "private",
NATType: "restricted",
ConnectivityMode: "direct",
@@ -74,8 +74,8 @@ func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
{
EndpointID: "endpoint-b",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.21:443",
Transport: "direct_quic",
Address: "quic://203.0.113.21:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -84,8 +84,8 @@ func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
{
EndpointID: "endpoint-a",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -103,10 +103,10 @@ func TestRankPeerEndpointCandidatesPrefersQUICFastPath(t *testing.T) {
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-wss",
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "wss",
Address: "wss://node-b.example.test",
Transport: "relay_quic",
Address: "quic://relay.example.test:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -138,14 +138,44 @@ func TestRankPeerEndpointCandidatesPrefersQUICFastPath(t *testing.T) {
}
}
// TestRankPeerEndpointCandidatesDropsUnspecifiedQUICEndpoint checks that a
// candidate advertising the unspecified address ([::]) is removed from the
// ranking, so only the routable public endpoint remains.
func TestRankPeerEndpointCandidatesDropsUnspecifiedQUICEndpoint(t *testing.T) {
	unspecified := PeerEndpointCandidate{
		EndpointID:       "node-b-unspecified",
		NodeID:           "node-b",
		Transport:        "direct_quic",
		Address:          "quic://[::]:19131",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         1,
	}
	routable := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "direct_quic",
		Address:          "quic://203.0.113.20:19131",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         10,
	}

	ranked := RankPeerEndpointCandidates(
		[]PeerEndpointCandidate{unspecified, routable},
		EndpointCandidateScoreOptions{},
	)

	// Exactly one survivor is expected, and it must be the routable endpoint.
	onlyRoutable := len(ranked) == 1 && ranked[0].Candidate.EndpointID == "node-b-public"
	if !onlyRoutable {
		t.Fatalf("unspecified endpoint was not dropped: %+v", ranked)
	}
}
func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -155,8 +185,8 @@ func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T)
{
EndpointID: "node-b-corp-lan",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "10.24.10.20:19001",
Transport: "lan_quic",
Address: "quic://10.24.10.20:19443",
Reachability: "private",
NATType: "none",
ConnectivityMode: "direct",
@@ -184,7 +214,7 @@ func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T
{
EndpointID: "node-b-outbound",
NodeID: "node-b",
Transport: "outbound_reverse",
Transport: "reverse_quic",
Address: "node-b.reverse.local",
Reachability: "outbound_only",
NATType: "symmetric",
@@ -194,7 +224,7 @@ func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay.example.test/node-b",
Reachability: "relay",
NATType: "blocked",
@@ -222,18 +252,18 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 10,
},
{
EndpointID: "node-b-wss",
EndpointID: "node-b-ice",
NodeID: "node-b",
Transport: "wss",
Address: "node-b.example.test",
Transport: "ice_quic",
Address: "quic://node-b.example.test:19443",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
@@ -253,8 +283,8 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
ReliabilityScore: 50,
ObservedAt: now.Add(-time.Minute),
},
"node-b-wss": {
EndpointID: "node-b-wss",
"node-b-ice": {
EndpointID: "node-b-ice",
LastLatencyMs: 35,
SuccessCount: 8,
ReliabilityScore: 95,
@@ -262,8 +292,8 @@ func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
},
},
})
if ranked[0].Candidate.EndpointID != "node-b-wss" {
t.Fatalf("top endpoint = %q, want node-b-wss: %+v", ranked[0].Candidate.EndpointID, ranked)
if ranked[0].Candidate.EndpointID != "node-b-ice" {
t.Fatalf("top endpoint = %q, want node-b-ice: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[0].Reasons, "latency:low") || !containsReason(ranked[0].Reasons, "reliability:high") {
t.Fatalf("top reasons missing health hints: %+v", ranked[0].Reasons)
@@ -279,8 +309,8 @@ func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T)
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -321,10 +351,10 @@ func TestRankPeerEndpointCandidatesDoesNotRewardZeroLatencyFailure(t *testing.T)
LastVerifiedAt: &now,
},
{
EndpointID: "node-b-wss",
EndpointID: "node-b-ice",
NodeID: "node-b",
Transport: "wss",
Address: "https://node-b.example.test:443",
Transport: "ice_quic",
Address: "quic://node-b.example.test:19444",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 10,
@@ -345,14 +375,81 @@ func TestRankPeerEndpointCandidatesDoesNotRewardZeroLatencyFailure(t *testing.T)
},
MaxObservationAge: time.Minute,
})
if ranked[0].Candidate.EndpointID != "node-b-wss" {
t.Fatalf("top endpoint = %q, want wss after repeated quic failures: %+v", ranked[0].Candidate.EndpointID, ranked)
if ranked[0].Candidate.EndpointID != "node-b-ice" {
t.Fatalf("top endpoint = %q, want ice_quic after repeated direct QUIC failures: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if containsReason(ranked[1].Reasons, "latency:moderate") {
t.Fatalf("zero latency failure was rewarded as moderate latency: %+v", ranked[1].Reasons)
}
}
// TestRankPeerEndpointCandidatesPenalizesSevereLatencyGradient feeds three
// equally reliable candidates with 4 ms, 420 ms and 900 ms observed latency and
// expects them to rank in that order, with the two slow ones tagged
// latency:very_high and latency:extreme respectively.
func TestRankPeerEndpointCandidatesPenalizesSevereLatencyGradient(t *testing.T) {
	now := time.Date(2026, 5, 17, 6, 0, 0, 0, time.UTC)

	// All candidates share the node, verification time and (where set) the
	// direct connectivity mode; only the varying fields are passed in.
	candidate := func(id, transport, address, reachability, mode string) PeerEndpointCandidate {
		return PeerEndpointCandidate{
			EndpointID:       id,
			NodeID:           "node-b",
			Transport:        transport,
			Address:          address,
			Reachability:     reachability,
			ConnectivityMode: mode,
			LastVerifiedAt:   &now,
		}
	}
	// Every observation is fresh and highly reliable, so latency is the only
	// differentiator.
	observed := func(id string, latencyMs int) EndpointCandidateHealthObservation {
		return EndpointCandidateHealthObservation{
			EndpointID:       id,
			LastLatencyMs:    latencyMs,
			ReliabilityScore: 95,
			ObservedAt:       now,
		}
	}

	candidates := []PeerEndpointCandidate{
		candidate("node-b-lan", "direct_quic", "quic://10.0.0.2:19443", "private", "direct"),
		candidate("node-b-wan", "direct_quic", "quic://203.0.113.20:19443", "public", "direct"),
		candidate("node-b-bad-relay", "relay_quic", "quic://relay.example.test:19443", "relay", "relay_required"),
	}
	opts := EndpointCandidateScoreOptions{
		Now:                now,
		MaxVerificationAge: time.Minute,
		MaxObservationAge:  time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"node-b-lan":       observed("node-b-lan", 4),
			"node-b-wan":       observed("node-b-wan", 420),
			"node-b-bad-relay": observed("node-b-bad-relay", 900),
		},
	}

	ranked := RankPeerEndpointCandidates(candidates, opts)

	wantOrder := []string{"node-b-lan", "node-b-wan", "node-b-bad-relay"}
	for position, wantID := range wantOrder {
		if ranked[position].Candidate.EndpointID != wantID {
			t.Fatalf("ranked endpoints = %+v, want lan, wan, bad relay", ranked)
		}
	}
	if wanReasons := ranked[1].Reasons; !containsReason(wanReasons, "latency:very_high") {
		t.Fatalf("wan reasons = %+v, want latency:very_high", ranked[1].Reasons)
	}
	if relayReasons := ranked[2].Reasons; !containsReason(relayReasons, "latency:extreme") {
		t.Fatalf("relay reasons = %+v, want latency:extreme", ranked[2].Reasons)
	}
}
func TestRankPeerEndpointCandidatesTreatsCapacityAsSoftPressure(t *testing.T) {
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
@@ -0,0 +1,217 @@
package mesh
import (
"errors"
"strings"
"time"
)
// FabricChannelRouteEventType names the kind of routing decision carried by a
// FabricChannelRouteEvent.
type FabricChannelRouteEventType string
const (
	// FabricChannelRouteEventNone is the zero value; ObserveChannel reports it
	// when the observation did not trigger a reroute.
	FabricChannelRouteEventNone FabricChannelRouteEventType = ""
	// FabricChannelRouteEventOpened is reported by OpenChannel after a route
	// has been chosen for a new channel.
	FabricChannelRouteEventOpened FabricChannelRouteEventType = "opened"
	// FabricChannelRouteEventReroute is reported by ObserveChannel when the
	// channel was moved to a different route.
	FabricChannelRouteEventReroute FabricChannelRouteEventType = "reroute"
)
// ErrFabricRouteRerouteSuppressed is a sentinel error for a reroute that was
// deliberately not performed.
// NOTE(review): nothing in this file returns it; confirm where callers are
// expected to produce or match it.
var ErrFabricRouteRerouteSuppressed = errors.New("fabric route reroute suppressed")
// FabricChannelRouterConfig tunes when a FabricChannelRouter moves a channel
// to another route. Non-positive ProjectedChannelCost and MaxRoutePressure are
// replaced with defaults by normalizeFabricChannelRouterConfig.
type FabricChannelRouterConfig struct {
	// SchedulerConfig is handed to NewFabricRouteScheduler. Its
	// ProjectedChannelCost is filled from ProjectedChannelCost below when it
	// is not positive.
	SchedulerConfig FabricRouteSchedulerConfig
	// MaxAckLatencyMs is the ack latency above which an observation triggers
	// a reroute. Values <= 0 disable the latency check.
	MaxAckLatencyMs int64
	// MaxRoutePressure is the threshold compared against
	// fabricRoutePressurePercent for the current route. Defaults to 90 when
	// not positive.
	MaxRoutePressure int
	// MinRerouteInterval is the minimum time since the channel's last reroute
	// before another reroute is allowed, including for failed observations.
	// Values <= 0 disable the hold-off.
	MinRerouteInterval time.Duration
	// ProjectedChannelCost is passed to fabricRoutePressurePercent when
	// evaluating route pressure. Defaults to 1 when not positive.
	ProjectedChannelCost int
}
// FabricChannelRouter opens channels on a scheduler-chosen route and decides,
// from per-channel observations, when to move a channel to another route.
// Its methods use value receivers and return the updated channel rather than
// mutating the caller's copy.
type FabricChannelRouter struct {
	// Config holds the reroute thresholds; NewFabricChannelRouter stores the
	// normalized form.
	Config FabricChannelRouterConfig
	// Scheduler picks a route from a route set.
	Scheduler FabricRouteScheduler
}
// FabricChannelObservation is one measurement reported for a channel, passed
// to FabricChannelRouter.ObserveChannel.
type FabricChannelObservation struct {
	// ChannelID and RouteID identify what was observed. ObserveChannel itself
	// does not read them; it works from the channel argument.
	ChannelID string
	RouteID string
	// AckLatencyMs is compared against FabricChannelRouterConfig.MaxAckLatencyMs.
	AckLatencyMs int64
	// Failed requests a reroute regardless of latency (still subject to the
	// MinRerouteInterval hold-off).
	Failed bool
	// The byte and frame counts are added to the channel's running totals.
	BytesSent uint64
	BytesRecv uint64
	FramesSent uint64
	FramesRecv uint64
	// Reason, when not blank, becomes the reason of the resulting reroute
	// event; otherwise the router derives one.
	Reason string
	// ObservedAt defaults to the "now" given to ObserveChannel when zero.
	ObservedAt time.Time
}
// FabricChannelRouteEvent describes the outcome of OpenChannel or
// ObserveChannel for one channel.
type FabricChannelRouteEvent struct {
	// Type is the kind of event (none, opened, reroute).
	Type FabricChannelRouteEventType
	// Reason is the scheduler's reason for an opened event, or the
	// observation/derived reason for a reroute event.
	Reason string
	// PreviousRoute is the route the channel was on before a reroute; it is
	// the zero value when that route was not found in the route set.
	PreviousRoute FabricRoute
	// NextRoute is the route the channel is on after the event.
	NextRoute FabricRoute
	// Choice is the scheduler result that selected NextRoute.
	Choice FabricRouteChoice
	// Observation is the observation that produced the event (unset for
	// opened events).
	Observation FabricChannelObservation
	// Channel is a copy of the channel state after the event.
	Channel FabricChannel
	// OccurredAt is the time the router used for the decision.
	OccurredAt time.Time
}
// NewFabricChannelRouter builds a router from cfg. The configuration is
// normalized first, and the scheduler is created from the normalized
// scheduler settings.
func NewFabricChannelRouter(cfg FabricChannelRouterConfig) FabricChannelRouter {
	normalized := normalizeFabricChannelRouterConfig(cfg)

	var router FabricChannelRouter
	router.Config = normalized
	router.Scheduler = NewFabricRouteScheduler(normalized.SchedulerConfig)
	return router
}
// OpenChannel asks the scheduler for a route in routeSet and returns a new
// open channel bound to it, together with an "opened" event describing the
// choice. A zero now is replaced with the current UTC time. Scheduler errors
// are returned unchanged with zero-valued channel and event.
func (r FabricChannelRouter) OpenChannel(spec FabricChannelSpec, routeSet FabricRouteSet, now time.Time) (FabricChannel, FabricChannelRouteEvent, error) {
	openedAt := now
	if openedAt.IsZero() {
		openedAt = time.Now().UTC()
	}

	choice, err := r.Scheduler.ChooseRoute(spec, routeSet, openedAt)
	if err != nil {
		return FabricChannel{}, FabricChannelRouteEvent{}, err
	}
	selected := choice.Route

	var channel FabricChannel
	channel.Spec = spec
	channel.State = FabricChannelOpen
	channel.RouteID = selected.RouteID
	channel.TargetNode = selected.DestinationNodeID
	channel.OpenedAt = openedAt

	var opened FabricChannelRouteEvent
	opened.Type = FabricChannelRouteEventOpened
	opened.Reason = choice.Reason
	opened.NextRoute = selected
	opened.Choice = choice
	opened.Channel = channel
	opened.OccurredAt = openedAt

	return channel, opened, nil
}
// ObserveChannel folds observation into channel and, if the router decides the
// current route should be abandoned, moves the channel to an alternative
// route.
//
// The returned channel always carries the accumulated byte/frame counters. The
// event has type "none" when no reroute happened and "reroute" when one did.
// If a reroute was wanted but no alternative could be chosen, the error from
// the selection is returned with a zero-valued event and the channel left on
// its current route.
func (r FabricChannelRouter) ObserveChannel(channel FabricChannel, routeSet FabricRouteSet, observation FabricChannelObservation, now time.Time) (FabricChannel, FabricChannelRouteEvent, error) {
	if now.IsZero() {
		now = time.Now().UTC()
	}
	if observation.ObservedAt.IsZero() {
		observation.ObservedAt = now
	}

	// Account for the traffic reported by this observation.
	channel.FramesRecv += observation.FramesRecv
	channel.FramesSent += observation.FramesSent
	channel.BytesRecv += observation.BytesRecv
	channel.BytesSent += observation.BytesSent
	if channel.State == "" {
		channel.State = FabricChannelOpen
	}

	if !r.shouldReroute(channel, observation, routeSet, now) {
		var unchanged FabricChannelRouteEvent
		unchanged.Type = FabricChannelRouteEventNone
		unchanged.Observation = observation
		unchanged.Channel = channel
		unchanged.OccurredAt = now
		return channel, unchanged, nil
	}

	// The previous route may be missing from the set; the zero value is used
	// in that case.
	previous, _ := findFabricRoute(routeSet, channel.RouteID)
	choice, err := r.chooseAlternativeRoute(channel.Spec, routeSet, channel.RouteID, now)
	if err != nil {
		return channel, FabricChannelRouteEvent{}, err
	}

	next := choice.Route
	channel.RouteID = next.RouteID
	channel.TargetNode = next.DestinationNodeID
	channel.LastReroute = now
	channel.RerouteCount++

	// Prefer the caller-supplied reason; derive one only when it is blank.
	var reason string
	if strings.TrimSpace(observation.Reason) != "" {
		reason = observation.Reason
	} else {
		reason = rerouteReason(r.Config, observation, previous)
	}

	var rerouted FabricChannelRouteEvent
	rerouted.Type = FabricChannelRouteEventReroute
	rerouted.Reason = reason
	rerouted.PreviousRoute = previous
	rerouted.NextRoute = next
	rerouted.Choice = choice
	rerouted.Observation = observation
	rerouted.Channel = channel
	rerouted.OccurredAt = now
	return channel, rerouted, nil
}
// shouldReroute reports whether observation warrants leaving the channel's
// current route. The hold-off after a previous reroute is checked first and
// suppresses everything else; after that a failure, an ack latency above the
// limit, or route pressure above the limit each trigger a reroute.
func (r FabricChannelRouter) shouldReroute(channel FabricChannel, observation FabricChannelObservation, routeSet FabricRouteSet, now time.Time) bool {
	cfg := normalizeFabricChannelRouterConfig(r.Config)

	holdOffActive := cfg.MinRerouteInterval > 0 &&
		!channel.LastReroute.IsZero() &&
		now.Sub(channel.LastReroute) < cfg.MinRerouteInterval

	switch {
	case holdOffActive:
		return false
	case observation.Failed:
		return true
	case cfg.MaxAckLatencyMs > 0 && observation.AckLatencyMs > cfg.MaxAckLatencyMs:
		return true
	case cfg.MaxRoutePressure <= 0:
		return false
	}

	// Pressure can only be evaluated when the current route is still present
	// in the route set.
	current, found := findFabricRoute(routeSet, channel.RouteID)
	return found && fabricRoutePressurePercent(current, cfg.ProjectedChannelCost) > cfg.MaxRoutePressure
}
// chooseAlternativeRoute lets the scheduler pick among every route in routeSet
// except currentRouteID. It returns ErrFabricRouteNotFound when no other route
// exists.
func (r FabricChannelRouter) chooseAlternativeRoute(spec FabricChannelSpec, routeSet FabricRouteSet, currentRouteID string, now time.Time) (FabricRouteChoice, error) {
	all := flattenFabricRouteSet(routeSet)
	others := make([]FabricRoute, 0, len(all))
	for i := range all {
		if all[i].RouteID != currentRouteID {
			others = append(others, all[i])
		}
	}
	if len(others) == 0 {
		return FabricRouteChoice{}, ErrFabricRouteNotFound
	}

	reduced := routeSetFromRoutes(routeSet, others)
	return r.Scheduler.ChooseRoute(spec, reduced, now)
}
// normalizeFabricChannelRouterConfig returns cfg with defaults applied:
// ProjectedChannelCost becomes 1 when not positive, the scheduler's projected
// cost inherits that value when it is not positive itself, and
// MaxRoutePressure becomes 90 when not positive.
func normalizeFabricChannelRouterConfig(cfg FabricChannelRouterConfig) FabricChannelRouterConfig {
	normalized := cfg
	if normalized.ProjectedChannelCost < 1 {
		normalized.ProjectedChannelCost = 1
	}
	if scheduler := &normalized.SchedulerConfig; scheduler.ProjectedChannelCost <= 0 {
		scheduler.ProjectedChannelCost = normalized.ProjectedChannelCost
	}
	if normalized.MaxRoutePressure < 1 {
		normalized.MaxRoutePressure = 90
	}
	return normalized
}
// rerouteReason derives a reason string for a reroute when the observation did
// not supply one. Checks run in the same priority as the reroute decision:
// failure, then ack latency, then pressure on the given route.
func rerouteReason(cfg FabricChannelRouterConfig, observation FabricChannelObservation, route FabricRoute) string {
	effective := normalizeFabricChannelRouterConfig(cfg)

	if observation.Failed {
		return "route_failure"
	}
	if limit := effective.MaxAckLatencyMs; limit > 0 && observation.AckLatencyMs > limit {
		return "ack_latency_threshold"
	}
	if limit := effective.MaxRoutePressure; limit > 0 && fabricRoutePressurePercent(route, effective.ProjectedChannelCost) > limit {
		return "route_capacity_pressure"
	}
	return "route_degraded"
}
// findFabricRoute looks up the route whose RouteID equals routeID (after
// trimming surrounding whitespace from routeID) in the flattened route set.
// The boolean is false when routeID is blank or no route matches.
func findFabricRoute(routeSet FabricRouteSet, routeID string) (FabricRoute, bool) {
	wanted := strings.TrimSpace(routeID)
	if wanted != "" {
		for _, candidate := range flattenFabricRouteSet(routeSet) {
			if candidate.RouteID == wanted {
				return candidate, true
			}
		}
	}
	return FabricRoute{}, false
}
// routeSetFromRoutes builds a route set that keeps template's target kind and
// ID, uses the first route as primary and the remaining ones as warm standby.
// An empty routes slice yields a set with only the target fields populated.
func routeSetFromRoutes(template FabricRouteSet, routes []FabricRoute) FabricRouteSet {
	result := FabricRouteSet{TargetKind: template.TargetKind, TargetID: template.TargetID}
	switch len(routes) {
	case 0:
		// Nothing to place.
	case 1:
		result.Primary = routes[0]
	default:
		result.Primary = routes[0]
		result.WarmStandby = append(result.WarmStandby, routes[1:]...)
	}
	return result
}
@@ -0,0 +1,151 @@
package mesh
import (
"testing"
"time"
)
func TestFabricChannelRouterOpensOnBestRoute(t *testing.T) {
router := NewFabricChannelRouter(FabricChannelRouterConfig{})
now := time.Now()
channel, event, err := router.OpenChannel(testFabricChannelSpec(FabricChannelTargetNode, "node-b"), FabricRouteSet{
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
Primary: testFabricRoute("route-slow", "node-b", 80, 100, 0, true),
WarmStandby: []FabricRoute{
testFabricRoute("route-fast", "node-b", 15, 100, 0, true),
},
}, now)
if err != nil {
t.Fatalf("open channel: %v", err)
}
if channel.RouteID != "route-fast" || channel.State != FabricChannelOpen {
t.Fatalf("channel = %+v, want route-fast open", channel)
}
if event.Type != FabricChannelRouteEventOpened || event.NextRoute.RouteID != "route-fast" {
t.Fatalf("event = %+v", event)
}
}
func TestFabricChannelRouterReroutesOnSlowAck(t *testing.T) {
router := NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 30})
now := time.Now()
routeSet := FabricRouteSet{
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
Primary: testFabricRoute("route-primary", "node-b", 10, 100, 0, true),
WarmStandby: []FabricRoute{
testFabricRoute("route-standby", "node-b", 20, 100, 0, true),
},
}
channel := FabricChannel{
Spec: testFabricChannelSpec(FabricChannelTargetNode, "node-b"),
State: FabricChannelOpen,
RouteID: "route-primary",
OpenedAt: now.Add(-time.Minute),
}
updated, event, err := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: channel.Spec.ChannelID,
RouteID: channel.RouteID,
AckLatencyMs: 120,
BytesSent: 4096,
FramesSent: 4,
}, now)
if err != nil {
t.Fatalf("observe channel: %v", err)
}
if event.Type != FabricChannelRouteEventReroute || event.Reason != "ack_latency_threshold" {
t.Fatalf("event = %+v", event)
}
if updated.RouteID != "route-standby" || updated.RerouteCount != 1 || updated.BytesSent != 4096 || updated.FramesSent != 4 {
t.Fatalf("updated = %+v", updated)
}
}
func TestFabricChannelRouterReroutesPoolTargetOnFailure(t *testing.T) {
router := NewFabricChannelRouter(FabricChannelRouterConfig{})
now := time.Now()
routeSet := FabricRouteSet{
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
Primary: testFabricPoolRoute("route-node-b", "node-b", 10, true),
WarmStandby: []FabricRoute{
testFabricPoolRoute("route-node-c", "node-c", 20, true),
},
}
channel := FabricChannel{
Spec: testFabricChannelSpec(FabricChannelTargetPool, "pool-egress"),
State: FabricChannelOpen,
RouteID: "route-node-b",
TargetNode: "node-b",
OpenedAt: now.Add(-time.Minute),
}
updated, event, err := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: channel.Spec.ChannelID,
RouteID: channel.RouteID,
Failed: true,
Reason: "target_failed",
}, now)
if err != nil {
t.Fatalf("observe channel: %v", err)
}
if event.Type != FabricChannelRouteEventReroute || event.PreviousRoute.RouteID != "route-node-b" || event.NextRoute.RouteID != "route-node-c" {
t.Fatalf("event = %+v", event)
}
if updated.TargetNode != "node-c" || updated.RouteID != "route-node-c" {
t.Fatalf("updated = %+v", updated)
}
}
func TestFabricChannelRouterSuppressesRerouteInsideHysteresis(t *testing.T) {
router := NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 30, MinRerouteInterval: time.Minute})
now := time.Now()
channel := FabricChannel{
Spec: testFabricChannelSpec(FabricChannelTargetNode, "node-b"),
State: FabricChannelOpen,
RouteID: "route-primary",
LastReroute: now.Add(-10 * time.Second),
}
updated, event, err := router.ObserveChannel(channel, FabricRouteSet{
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
Primary: testFabricRoute("route-primary", "node-b", 10, 100, 0, true),
WarmStandby: []FabricRoute{testFabricRoute("route-standby", "node-b", 20, 100, 0, true)},
}, FabricChannelObservation{AckLatencyMs: 120}, now)
if err != nil {
t.Fatalf("observe channel: %v", err)
}
if event.Type != FabricChannelRouteEventNone || updated.RouteID != "route-primary" {
t.Fatalf("event=%+v updated=%+v", event, updated)
}
}
// testFabricChannelSpec returns a channel spec with fixed channel, cluster and
// source-node identifiers, targeting targetID with the given kind.
func testFabricChannelSpec(kind FabricChannelTargetKind, targetID string) FabricChannelSpec {
	var spec FabricChannelSpec
	spec.ChannelID = "channel-1"
	spec.ClusterID = "cluster-1"
	spec.SourceNodeID = "node-a"
	spec.TargetKind = kind
	spec.TargetID = targetID
	return spec
}
// testFabricRoute returns a two-hop route from node-a to destination in
// cluster-1 with the given latency, capacity, active channel count and health.
func testFabricRoute(routeID string, destination string, latency int, capacity int, active int, healthy bool) FabricRoute {
	hops := []FabricRouteHop{
		{NodeID: "node-a"},
		{NodeID: destination},
	}

	var route FabricRoute
	route.RouteID = routeID
	route.ClusterID = "cluster-1"
	route.SourceNodeID = "node-a"
	route.DestinationNodeID = destination
	route.Hops = hops
	route.BaseLatencyMs = latency
	route.Capacity = capacity
	route.ActiveChannels = active
	route.Healthy = healthy
	return route
}
// testFabricPoolRoute returns an idle route with capacity 100 that belongs to
// the "pool-egress" pool.
func testFabricPoolRoute(routeID string, destination string, latency int, healthy bool) FabricRoute {
	const capacity, active = 100, 0
	pooled := testFabricRoute(routeID, destination, latency, capacity, active, healthy)
	pooled.PoolID = "pool-egress"
	return pooled
}
@@ -0,0 +1,487 @@
package mesh
import (
"context"
"fmt"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// FabricChannelRuntimeConfig configures a FabricChannelRuntime. The defaults
// noted below are applied by NewFabricChannelRuntime.
type FabricChannelRuntimeConfig struct {
	// RouterConfig is passed to NewFabricChannelRouter.
	RouterConfig FabricChannelRouterConfig
	// StreamID is the stream identifier placed on every frame; defaults to 2
	// when zero.
	StreamID uint64
	// TrafficClass is placed on every frame; defaults to
	// fabricproto.TrafficClassBulk when zero.
	TrafficClass fabricproto.TrafficClass
	// Timeout bounds the wait for each ack or response; defaults to 30s when
	// not positive.
	Timeout time.Duration
	// MaxPayload is the largest payload accepted per frame; defaults to
	// fabricproto.DefaultMaxPayload when not positive.
	MaxPayload int
	// RouteHealthTTL is passed to NewFabricRouteHealthTracker.
	RouteHealthTTL time.Duration
}
// FabricChannelRuntime sends channel traffic over a FabricTransport, using the
// router to pick and change routes and the trackers to feed route pressure and
// health back into scheduling. Methods tolerate a nil receiver and nil
// Pressure/Health trackers.
type FabricChannelRuntime struct {
	// Transport opens sessions to route targets; sends fail with
	// ErrForwardRuntimeUnavailable when it is nil.
	Transport FabricTransport
	// Router chooses the initial route and decides on reroutes.
	Router FabricChannelRouter
	// Pressure tracks routes currently in use by this runtime.
	Pressure *FabricRoutePressureTracker
	// Health records route successes and failures.
	Health *FabricRouteHealthTracker
	// Config holds the frame and timeout settings.
	Config FabricChannelRuntimeConfig
}
// FabricChannelRuntimeResult summarizes one send operation of a
// FabricChannelRuntime.
type FabricChannelRuntimeResult struct {
	// Channel is the latest channel state known to the runtime.
	Channel FabricChannel
	// BytesSent and FramesSent count acknowledged payload data.
	BytesSent uint64
	// BytesRecv and FramesRecv count response data; in this file only
	// SendRequestResponse updates them.
	BytesRecv uint64
	FramesSent uint64
	FramesRecv uint64
	// AcksReceived counts acknowledged frames (or received responses).
	AcksReceived uint64
	// RouteEvents holds the opened event followed by any reroute events.
	RouteEvents []FabricChannelRouteEvent
	// RouteAttempts lists the route ID of every connection attempt, in order.
	RouteAttempts []string
	// MigrationEvents counts the reroute events recorded in RouteEvents.
	MigrationEvents int
	// RoutePressure and RouteHealth are tracker snapshots taken when the
	// operation succeeds; they stay zero on error returns.
	RoutePressure FabricRoutePressureSnapshot
	RouteHealth FabricRouteHealthSnapshot
}
// FabricChannelRequestResponseResult is the result of SendRequestResponse: the
// common runtime result plus the response payload.
type FabricChannelRequestResponseResult struct {
	FabricChannelRuntimeResult
	// ResponsePayload is a copy of the payload of the matching response frame.
	ResponsePayload []byte
}
// NewFabricChannelRuntime creates a runtime that sends over transport. Unset
// fields of cfg get defaults (stream ID 2, bulk traffic class, 30s timeout,
// fabricproto.DefaultMaxPayload), and fresh pressure and health trackers are
// attached.
func NewFabricChannelRuntime(transport FabricTransport, cfg FabricChannelRuntimeConfig) *FabricChannelRuntime {
	effective := cfg
	if effective.StreamID == 0 {
		effective.StreamID = 2
	}
	if effective.TrafficClass == 0 {
		effective.TrafficClass = fabricproto.TrafficClassBulk
	}
	if effective.Timeout <= 0 {
		effective.Timeout = 30 * time.Second
	}
	if effective.MaxPayload <= 0 {
		effective.MaxPayload = fabricproto.DefaultMaxPayload
	}

	rt := new(FabricChannelRuntime)
	rt.Transport = transport
	rt.Router = NewFabricChannelRouter(effective.RouterConfig)
	rt.Pressure = NewFabricRoutePressureTracker()
	rt.Health = NewFabricRouteHealthTracker(effective.RouteHealthTTL)
	rt.Config = effective
	return rt
}
// SendReliable sends payloads in order over a channel opened for spec, waiting
// for an ack after each one and moving to another route when the router
// decides to reroute.
//
// It returns ErrForwardRuntimeUnavailable when the runtime or its transport is
// nil. On success the result includes route pressure and health snapshots; on
// error the partially filled result is returned without them.
func (r *FabricChannelRuntime) SendReliable(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payloads [][]byte) (FabricChannelRuntimeResult, error) {
	if r == nil || r.Transport == nil {
		return FabricChannelRuntimeResult{}, ErrForwardRuntimeUnavailable
	}
	now := time.Now().UTC()
	// Overlay tracked health and in-flight pressure before the first choice.
	routeSet = r.routeSetForScheduling(routeSet)
	channel, event, err := r.Router.OpenChannel(spec, routeSet, now)
	if err != nil {
		return FabricChannelRuntimeResult{}, err
	}
	result := FabricChannelRuntimeResult{Channel: channel, RouteEvents: []FabricChannelRouteEvent{event}}
	// sequence and index are shared with sendOnSession so progress survives a
	// migration to another route.
	sequence := uint64(0)
	index := 0
	for index < len(payloads) {
		// Refresh health/pressure on every attempt; earlier failures may have
		// changed them.
		routeSet = r.routeSetForScheduling(routeSet)
		route, ok := findFabricRoute(routeSet, channel.RouteID)
		if !ok {
			return result, ErrFabricRouteNotFound
		}
		result.RouteAttempts = append(result.RouteAttempts, route.RouteID)
		target, err := FabricTransportTargetForRoute(route)
		if err != nil {
			return result, err
		}
		// The route counts as in use from before Connect until releaseRoute.
		releaseRoute := r.acquireRoute(route.RouteID)
		session, err := r.Transport.Connect(ctx, target)
		if err != nil {
			releaseRoute()
			r.markRouteFailure(route.RouteID, err)
			updated, event, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID: spec.ChannelID,
				RouteID: route.RouteID,
				Failed: true,
				Reason: "connect_failed",
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			channel = updated
			result.Channel = channel
			if event.Type == FabricChannelRouteEventReroute {
				// Retry the same payload index on the newly chosen route.
				result.RouteEvents = append(result.RouteEvents, event)
				result.MigrationEvents++
				continue
			}
			if rerouteErr != nil {
				return result, rerouteErr
			}
			// No reroute and no router error: surface the connect error.
			return result, err
		}
		migrated, sendErr := r.sendOnSession(ctx, session, &channel, routeSet, route, payloads, &index, &sequence, &result)
		// Close errors are ignored; the session is finished either way.
		_ = session.Close()
		releaseRoute()
		result.Channel = channel
		if sendErr != nil {
			return result, sendErr
		}
		if !migrated {
			break
		}
	}
	result.Channel = channel
	result.RoutePressure = r.snapshotRoutePressure()
	result.RouteHealth = r.snapshotRouteHealth()
	return result, nil
}
// SendRequestResponse sends a single payload over a channel opened for spec
// and waits for the matching response frame, retrying on another route each
// time the router reroutes after a connect or response failure.
//
// It returns ErrForwardRuntimeUnavailable when the runtime or its transport is
// nil, and an error wrapping fabricproto.ErrInvalidPayloadLen when payload is
// larger than Config.MaxPayload. On success the result carries the response
// payload plus route pressure and health snapshots.
func (r *FabricChannelRuntime) SendRequestResponse(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (FabricChannelRequestResponseResult, error) {
	if r == nil || r.Transport == nil {
		return FabricChannelRequestResponseResult{}, ErrForwardRuntimeUnavailable
	}
	if len(payload) > r.Config.MaxPayload {
		return FabricChannelRequestResponseResult{}, fmt.Errorf("%w: %d > %d", fabricproto.ErrInvalidPayloadLen, len(payload), r.Config.MaxPayload)
	}
	now := time.Now().UTC()
	// Overlay tracked health and in-flight pressure before the first choice.
	routeSet = r.routeSetForScheduling(routeSet)
	channel, event, err := r.Router.OpenChannel(spec, routeSet, now)
	if err != nil {
		return FabricChannelRequestResponseResult{}, err
	}
	result := FabricChannelRequestResponseResult{
		FabricChannelRuntimeResult: FabricChannelRuntimeResult{Channel: channel, RouteEvents: []FabricChannelRouteEvent{event}},
	}
	// The same sequence number is reused on every retry of this request.
	sequence := uint64(1)
	for {
		routeSet = r.routeSetForScheduling(routeSet)
		route, ok := findFabricRoute(routeSet, channel.RouteID)
		if !ok {
			return result, ErrFabricRouteNotFound
		}
		result.RouteAttempts = append(result.RouteAttempts, route.RouteID)
		target, err := FabricTransportTargetForRoute(route)
		if err != nil {
			return result, err
		}
		// The route counts as in use from before Connect until releaseRoute.
		releaseRoute := r.acquireRoute(route.RouteID)
		session, err := r.Transport.Connect(ctx, target)
		if err != nil {
			releaseRoute()
			r.markRouteFailure(route.RouteID, err)
			updated, routeEvent, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID: spec.ChannelID,
				RouteID: route.RouteID,
				Failed: true,
				Reason: "connect_failed",
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			channel = updated
			result.Channel = channel
			if routeEvent.Type == FabricChannelRouteEventReroute {
				result.RouteEvents = append(result.RouteEvents, routeEvent)
				result.MigrationEvents++
				continue
			}
			if rerouteErr != nil {
				return result, rerouteErr
			}
			// No reroute and no router error: surface the connect error.
			return result, err
		}
		response, ackMs, sendErr := r.sendRequestResponseOnSession(ctx, session, route.RouteID, spec.ChannelID, payload, sequence)
		// Close errors are ignored; the session is finished either way.
		_ = session.Close()
		releaseRoute()
		result.Channel = channel
		if sendErr == nil {
			r.markRouteSuccess(route.RouteID)
			result.BytesSent += uint64(len(payload))
			result.FramesSent++
			result.BytesRecv += uint64(len(response))
			result.FramesRecv++
			result.AcksReceived++
			// Report the round trip so the router can account for it; a
			// reroute here only affects the returned channel state, since the
			// request has already completed.
			updated, routeEvent, observeErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
				ChannelID: spec.ChannelID,
				RouteID: route.RouteID,
				AckLatencyMs: ackMs,
				BytesSent: uint64(len(payload)),
				FramesSent: 1,
				BytesRecv: uint64(len(response)),
				FramesRecv: 1,
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			channel = updated
			result.Channel = channel
			if observeErr != nil {
				return result, observeErr
			}
			if routeEvent.Type == FabricChannelRouteEventReroute {
				result.RouteEvents = append(result.RouteEvents, routeEvent)
				result.MigrationEvents++
			}
			result.ResponsePayload = response
			result.RoutePressure = r.snapshotRoutePressure()
			result.RouteHealth = r.snapshotRouteHealth()
			return result, nil
		}
		// The exchange failed after connecting: record it and try to reroute.
		r.markRouteFailure(route.RouteID, sendErr)
		updated, routeEvent, rerouteErr := r.Router.ObserveChannel(channel, routeSet, FabricChannelObservation{
			ChannelID: spec.ChannelID,
			RouteID: route.RouteID,
			Failed: true,
			Reason: "response_failed",
			ObservedAt: time.Now().UTC(),
		}, time.Now().UTC())
		channel = updated
		result.Channel = channel
		if routeEvent.Type == FabricChannelRouteEventReroute {
			result.RouteEvents = append(result.RouteEvents, routeEvent)
			result.MigrationEvents++
			continue
		}
		if rerouteErr != nil {
			return result, rerouteErr
		}
		return result, sendErr
	}
}
// routeSetForScheduling returns routeSet with the health tracker's view
// applied first (when a tracker is present) and the pressure tracker's view
// applied on top. A nil runtime returns routeSet unchanged.
func (r *FabricChannelRuntime) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
	if r == nil {
		return routeSet
	}
	adjusted := routeSet
	if tracker := r.Health; tracker != nil {
		adjusted = tracker.Apply(adjusted, time.Now().UTC())
	}
	return r.routeSetWithActiveChannels(adjusted)
}
// routeSetWithActiveChannels returns routeSet as adjusted by the pressure
// tracker, or routeSet unchanged when the runtime or tracker is nil.
func (r *FabricChannelRuntime) routeSetWithActiveChannels(routeSet FabricRouteSet) FabricRouteSet {
	if r != nil && r.Pressure != nil {
		return r.Pressure.Apply(routeSet)
	}
	return routeSet
}
// acquireRoute registers routeID as in use with the pressure tracker and
// returns the function that releases it. Without a tracker it returns a no-op
// release function.
func (r *FabricChannelRuntime) acquireRoute(routeID string) func() {
	if r != nil && r.Pressure != nil {
		return r.Pressure.Acquire(routeID)
	}
	return func() {}
}
// snapshotRoutePressure returns the pressure tracker's current snapshot, or a
// zero snapshot when the runtime or tracker is nil.
func (r *FabricChannelRuntime) snapshotRoutePressure() FabricRoutePressureSnapshot {
	if r != nil && r.Pressure != nil {
		return r.Pressure.SnapshotPressure()
	}
	return FabricRoutePressureSnapshot{}
}
// snapshotRouteHealth returns the health tracker's snapshot as of the current
// UTC time, or a zero snapshot when the runtime or tracker is nil.
func (r *FabricChannelRuntime) snapshotRouteHealth() FabricRouteHealthSnapshot {
	if r != nil && r.Health != nil {
		return r.Health.Snapshot(time.Now().UTC())
	}
	return FabricRouteHealthSnapshot{}
}
// markRouteFailure records err against routeID in the health tracker, stamped
// with the current UTC time. It does nothing when err is nil or when the
// runtime or tracker is nil.
func (r *FabricChannelRuntime) markRouteFailure(routeID string, err error) {
	if err != nil && r != nil && r.Health != nil {
		r.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
	}
}
// markRouteSuccess records a success for routeID in the health tracker; it is
// a no-op when the runtime or tracker is nil.
func (r *FabricChannelRuntime) markRouteSuccess(routeID string) {
	if r != nil && r.Health != nil {
		r.Health.MarkSuccess(routeID)
	}
}
// sendOnSession streams payloads[*index:] over an already connected session on
// route, waiting for an ack after every data frame.
//
// index and sequence are owned by the caller and advanced in place so that
// progress survives a migration: index moves forward only after a payload has
// been acknowledged, while sequence is incremented for every data frame sent,
// so a payload that is resent after a failed ack goes out with a new sequence
// number. Acknowledged traffic and reroute events are accumulated into result,
// and *channel is replaced with the router's updated channel state.
//
// It returns (true, nil) when the router moved the channel to another route
// and the caller should reconnect, (false, nil) when every remaining payload
// was acknowledged on this session, and (false, err) on failure. A failed ack
// that the router declines to reroute (for example inside the
// MinRerouteInterval hold-off) is reported as an error; returning nil there
// would make the caller treat the undelivered payloads as sent.
func (r *FabricChannelRuntime) sendOnSession(ctx context.Context, session FabricTransportSession, channel *FabricChannel, routeSet FabricRouteSet, route FabricRoute, payloads [][]byte, index *int, sequence *uint64, result *FabricChannelRuntimeResult) (bool, error) {
	cfg := r.Config
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameOpenStream,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
	}); err != nil {
		r.markRouteFailure(route.RouteID, err)
		return false, err
	}
	for *index < len(payloads) {
		payload := payloads[*index]
		if len(payload) > cfg.MaxPayload {
			return false, fmt.Errorf("%w: %d > %d", fabricproto.ErrInvalidPayloadLen, len(payload), cfg.MaxPayload)
		}
		(*sequence)++
		if err := session.Send(ctx, fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: cfg.TrafficClass,
			StreamID:     cfg.StreamID,
			Sequence:     *sequence,
			Payload:      payload,
		}); err != nil {
			r.markRouteFailure(route.RouteID, err)
			return false, err
		}
		ackOK, ackMs := waitForFabricRuntimeAck(ctx, session, cfg.StreamID, *sequence, cfg.Timeout)
		if !ackOK {
			ackErr := fmt.Errorf("ack_failed")
			r.markRouteFailure(route.RouteID, ackErr)
			updated, event, err := r.Router.ObserveChannel(*channel, routeSet, FabricChannelObservation{
				ChannelID:  channel.Spec.ChannelID,
				RouteID:    route.RouteID,
				Failed:     true,
				Reason:     "ack_failed",
				ObservedAt: time.Now().UTC(),
			}, time.Now().UTC())
			*channel = updated
			if event.Type == FabricChannelRouteEventReroute {
				result.RouteEvents = append(result.RouteEvents, event)
				result.MigrationEvents++
				return true, nil
			}
			if err == nil {
				// The router kept the channel on this route without reporting
				// an error. The payload is still unacknowledged, so this must
				// not look like success to the caller.
				err = fmt.Errorf("route %s: reroute suppressed after %w", route.RouteID, ackErr)
			}
			return false, err
		}
		r.markRouteSuccess(route.RouteID)
		*index++
		result.BytesSent += uint64(len(payload))
		result.FramesSent++
		result.AcksReceived++
		// Report the ack latency; the router may still decide to migrate the
		// remaining payloads to a better route.
		updated, event, err := r.Router.ObserveChannel(*channel, routeSet, FabricChannelObservation{
			ChannelID:    channel.Spec.ChannelID,
			RouteID:      route.RouteID,
			AckLatencyMs: ackMs,
			BytesSent:    uint64(len(payload)),
			FramesSent:   1,
			ObservedAt:   time.Now().UTC(),
		}, time.Now().UTC())
		*channel = updated
		if err != nil {
			return false, err
		}
		if event.Type == FabricChannelRouteEventReroute {
			result.RouteEvents = append(result.RouteEvents, event)
			result.MigrationEvents++
			return true, nil
		}
	}
	// Best-effort close: sent on a background context so it still goes out if
	// ctx has just been cancelled, and its error is intentionally ignored.
	_ = session.Send(context.Background(), fabricproto.Frame{
		Type:         fabricproto.FrameCloseStream,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
	})
	return false, nil
}
// sendRequestResponseOnSession performs one request/response exchange on an
// already connected session: it opens the stream, sends payload as a data
// frame with the given sequence, and waits for a data frame on the same stream
// and sequence.
//
// It returns a copy of the response payload and the elapsed time in
// milliseconds measured from just before the data frame was sent. Errors are
// returned as-is: a send error, the context error on timeout or cancellation,
// an error delivered on the session's error channel, or
// ErrForwardPeerUnavailable when either session channel is closed.
//
// Route health is not touched here. The caller records the failure for every
// error this function returns, so marking send errors here as well would
// record the same failure twice. routeID and channelID are kept so the
// signature stays unchanged.
func (r *FabricChannelRuntime) sendRequestResponseOnSession(ctx context.Context, session FabricTransportSession, routeID string, channelID string, payload []byte, sequence uint64) ([]byte, int64, error) {
	cfg := r.Config
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameOpenStream,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
	}); err != nil {
		return nil, 0, err
	}
	started := time.Now()
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: cfg.TrafficClass,
		StreamID:     cfg.StreamID,
		Sequence:     sequence,
		Payload:      payload,
	}); err != nil {
		return nil, 0, err
	}
	// Bound the wait for the response by the configured timeout, if any.
	waitCtx := ctx
	if cfg.Timeout > 0 {
		var cancel context.CancelFunc
		waitCtx, cancel = context.WithTimeout(ctx, cfg.Timeout)
		defer cancel()
	}
	for {
		select {
		case <-waitCtx.Done():
			return nil, 0, waitCtx.Err()
		case err, ok := <-session.Errors():
			if !ok {
				return nil, 0, ErrForwardPeerUnavailable
			}
			if err != nil {
				return nil, 0, err
			}
		case frame, ok := <-session.Frames():
			if !ok {
				return nil, 0, ErrForwardPeerUnavailable
			}
			// Skip anything that is not the response to this request.
			if frame.Type != fabricproto.FrameData || frame.StreamID != cfg.StreamID || frame.Sequence != sequence {
				continue
			}
			// Best-effort close; the response is already in hand, so the
			// error is intentionally ignored.
			_ = session.Send(context.Background(), fabricproto.Frame{
				Type:         fabricproto.FrameCloseStream,
				TrafficClass: cfg.TrafficClass,
				StreamID:     cfg.StreamID,
			})
			return append([]byte(nil), frame.Payload...), time.Since(started).Milliseconds(), nil
		}
	}
}
// FabricTransportTargetForRoute picks the hop of route that the local node
// should dial.
//
// A route with a blank RouteID yields ErrFabricRouteNotFound. When the route
// reports at least one relay, the first relay-mode hop that has an address is
// chosen. Otherwise (or when no relay hop is addressable) the hops are scanned
// from last to first and the first addressable one wins. If no hop carries an
// address, an error wrapping ErrFabricRouteNotFound is returned.
func FabricTransportTargetForRoute(route FabricRoute) (FabricTransportTarget, error) {
	if strings.TrimSpace(route.RouteID) == "" {
		return FabricTransportTarget{}, ErrFabricRouteNotFound
	}

	if route.RelayCount > 0 {
		for _, hop := range route.Hops {
			if hop.Mode == FabricRouteRelay {
				if target, ok := fabricTransportTargetForHop(hop); ok {
					return target, nil
				}
			}
		}
	}

	// Walk backwards so the hop closest to the destination is preferred.
	hopCount := len(route.Hops)
	for offset := 1; offset <= hopCount; offset++ {
		target, ok := fabricTransportTargetForHop(route.Hops[hopCount-offset])
		if ok {
			return target, nil
		}
	}

	return FabricTransportTarget{}, fmt.Errorf("%w: route %s has no transport endpoint", ErrFabricRouteNotFound, route.RouteID)
}
// fabricTransportTargetForHop converts a route hop into a dialable transport
// target. It reports false when the hop has no (non-blank) address. The
// transport name is the hop's mode string, falling back to "quic" when the
// mode is empty.
func fabricTransportTargetForHop(hop FabricRouteHop) (FabricTransportTarget, bool) {
	address := strings.TrimSpace(hop.Address)
	if address == "" {
		return FabricTransportTarget{}, false
	}

	target := FabricTransportTarget{
		EndpointID:     hop.EndpointID,
		PeerID:         strings.TrimSpace(hop.NodeID),
		Endpoint:       address,
		Transport:      "quic",
		PeerCertSHA256: strings.TrimSpace(hop.PeerCertSHA256),
	}
	if mode := string(hop.Mode); mode != "" {
		target.Transport = mode
	}
	return target, true
}
// waitForFabricRuntimeAck blocks until session delivers an ack frame for the
// given stream and sequence. It returns true plus the elapsed milliseconds
// since the call started when the ack arrives. It returns (false, 0) when the
// context ends, the optional timeout (applied only if positive) expires, the
// session reports a non-nil error, or either session channel is closed.
func waitForFabricRuntimeAck(ctx context.Context, session FabricTransportSession, streamID uint64, sequence uint64, timeout time.Duration) (bool, int64) {
	started := time.Now()

	waitCtx := ctx
	if timeout > 0 {
		bounded, cancel := context.WithTimeout(ctx, timeout)
		defer cancel()
		waitCtx = bounded
	}

	for {
		select {
		case <-waitCtx.Done():
			return false, 0
		case sessionErr, open := <-session.Errors():
			if open && sessionErr == nil {
				// A nil error carries no information; keep waiting.
				continue
			}
			return false, 0
		case frame, open := <-session.Frames():
			if !open {
				return false, 0
			}
			if frame.Type != fabricproto.FrameAck || frame.StreamID != streamID || frame.Sequence != sequence {
				continue
			}
			return true, time.Since(started).Milliseconds()
		}
	}
}
@@ -0,0 +1,495 @@
package mesh
import (
"context"
"strings"
"sync"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestFabricChannelRuntimeMigratesSlowAckToStandbyRoute sends three payloads
// over a primary route whose fake session acks after 60ms while the router is
// configured with a 30ms ack-latency ceiling. It expects exactly one migration
// to the standby route, all three payloads acknowledged, one connect per
// endpoint, and balanced route-pressure accounting.
func TestFabricChannelRuntimeMigratesSlowAckToStandbyRoute(t *testing.T) {
	const (
		slowEndpoint = "quic://slow.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
		slowEndpoint: 60 * time.Millisecond,
		fastEndpoint: 0,
	})
	channelRuntime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-slow", "node-b", slowEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}
	payloads := [][]byte{[]byte("one"), []byte("two"), []byte("three")}

	result, err := channelRuntime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, payloads)
	if err != nil {
		t.Fatalf("send reliable: %v", err)
	}

	if result.MigrationEvents != 1 {
		t.Fatalf("migration events = %d, want 1: %+v", result.MigrationEvents, result.RouteEvents)
	}
	if result.Channel.RouteID != "route-fast" || result.Channel.RerouteCount != 1 {
		t.Fatalf("channel = %+v", result.Channel)
	}
	wantBytes := uint64(len("one") + len("two") + len("three"))
	if result.BytesSent != wantBytes || result.AcksReceived != 3 {
		t.Fatalf("result = %+v", result)
	}
	if got := transport.connectCount(slowEndpoint); got != 1 {
		t.Fatalf("slow connect count = %d, want 1", got)
	}
	if got := transport.connectCount(fastEndpoint); got != 1 {
		t.Fatalf("fast connect count = %d, want 1", got)
	}
	pressure := result.RoutePressure
	if pressure.AcquiredTotal != 2 || pressure.ReleasedTotal != 2 || pressure.MaxActiveTotal == 0 {
		t.Fatalf("route pressure = %+v", result.RoutePressure)
	}
}
// TestFabricChannelRuntimeReroutesOnConnectFailure makes the primary route's
// endpoint refuse connections and expects the payload to be delivered over the
// standby route after a single migration.
func TestFabricChannelRuntimeReroutesOnConnectFailure(t *testing.T) {
	const (
		deadEndpoint = "quic://dead.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{fastEndpoint: 0})
	transport.failConnect[deadEndpoint] = true

	channelRuntime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-dead", "node-b", deadEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}

	result, err := channelRuntime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("payload")})
	if err != nil {
		t.Fatalf("send reliable: %v", err)
	}

	migratedOnce := result.MigrationEvents == 1
	onFastRoute := result.Channel.RouteID == "route-fast"
	sentAll := result.BytesSent == uint64(len("payload"))
	if !(migratedOnce && onFastRoute && sentAll) {
		t.Fatalf("result = %+v", result)
	}
}
// TestFabricChannelRuntimeQuarantinesFailedRouteAcrossChannels runs two sends
// through one runtime with a one-minute route-health TTL. The first send hits
// the dead primary and falls over to the standby; the second send is expected
// to skip the dead route entirely, so the dead endpoint sees only one connect.
func TestFabricChannelRuntimeQuarantinesFailedRouteAcrossChannels(t *testing.T) {
	const (
		deadEndpoint = "quic://dead.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{fastEndpoint: 0})
	transport.failConnect[deadEndpoint] = true

	channelRuntime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig:   FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:       9,
		RouteHealthTTL: time.Minute,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-dead", "node-b", deadEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}

	first, err := channelRuntime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("first")})
	if err != nil {
		t.Fatalf("first send reliable: %v", err)
	}
	if first.Channel.RouteID != "route-fast" || first.RouteHealth.Quarantined["route-dead"].Failures != 1 {
		t.Fatalf("first result = %+v", first)
	}

	second, err := channelRuntime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("second")})
	if err != nil {
		t.Fatalf("second send reliable: %v", err)
	}
	if second.Channel.RouteID != "route-fast" {
		t.Fatalf("second route = %s, want route-fast", second.Channel.RouteID)
	}

	if got := transport.connectCount(deadEndpoint); got != 1 {
		t.Fatalf("dead connect count = %d, want one attempt before quarantine", got)
	}
	if got := transport.connectCount(fastEndpoint); got != 2 {
		t.Fatalf("fast connect count = %d, want both channels on healthy route", got)
	}
}
// TestFabricChannelRuntimeReroutesOnAckTimeout configures a 10ms runtime
// timeout against a primary route whose fake session acks after 100ms, and
// expects the payload to be delivered via the standby after one migration.
func TestFabricChannelRuntimeReroutesOnAckTimeout(t *testing.T) {
	const (
		slowEndpoint = "quic://slow.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
		slowEndpoint: 100 * time.Millisecond,
		fastEndpoint: 0,
	})
	channelRuntime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
		Timeout:      10 * time.Millisecond,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-slow", "node-b", slowEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-b", fastEndpoint, 20)},
	}

	result, err := channelRuntime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("payload")})
	if err != nil {
		t.Fatalf("send reliable: %v", err)
	}

	migratedOnce := result.MigrationEvents == 1
	onFastRoute := result.Channel.RouteID == "route-fast"
	sentAll := result.BytesSent == uint64(len("payload"))
	if !(migratedOnce && onFastRoute && sentAll) {
		t.Fatalf("result = %+v", result)
	}
}
// TestFabricChannelRuntimeSpreadsConcurrentChannelsBySharedPressure starts one
// send in the background on route-a (whose fake session acks after 80ms),
// waits until that connect has been observed, and then issues a second send.
// The second send is expected to land on route-b.
func TestFabricChannelRuntimeSpreadsConcurrentChannelsBySharedPressure(t *testing.T) {
	const (
		routeAEndpoint = "quic://route-a.example.test:19443"
		routeBEndpoint = "quic://route-b.example.test:19443"
	)
	transport := newFakeFabricRuntimeTransport(map[string]time.Duration{
		routeAEndpoint: 80 * time.Millisecond,
		routeBEndpoint: 0,
	})
	channelRuntime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{StreamID: 9})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-b",
		Primary:     testRuntimeRoute("route-a", "node-b", routeAEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-b", "node-b", routeBEndpoint, 11)},
	}

	// Buffered so the background goroutine never blocks on reporting its result.
	background := make(chan error, 1)
	go func() {
		_, sendErr := channelRuntime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("one")})
		background <- sendErr
	}()
	// Do not start the second send until the first is known to hold route-a.
	transport.waitForConnect(t, routeAEndpoint, 1)

	second, err := channelRuntime.SendReliable(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-b"), routeSet, [][]byte{[]byte("two")})
	if err != nil {
		t.Fatalf("second send reliable: %v", err)
	}
	if second.Channel.RouteID != "route-b" {
		t.Fatalf("second route = %s, want route-b", second.Channel.RouteID)
	}
	if got := transport.connectCount(routeBEndpoint); got != 1 {
		t.Fatalf("route-b connect count = %d, want 1", got)
	}
	if firstErr := <-background; firstErr != nil {
		t.Fatalf("first send reliable: %v", firstErr)
	}
}
// TestFabricChannelRuntimeRequestResponseReturnsPayload sends one request to a
// pool-targeted route backed by a fake transport with a canned response, and
// checks the returned payload along with the byte, frame and ack counters.
func TestFabricChannelRuntimeRequestResponseReturnsPayload(t *testing.T) {
	const runtimeEndpoint = "quic://runtime.example.test:19443"

	transport := newFakeFabricRequestResponseTransport(map[string][]byte{
		runtimeEndpoint: []byte(`{"status":"ok"}`),
	})
	channelRuntime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
	})
	routeSet := FabricRouteSet{
		TargetKind: FabricChannelTargetPool,
		TargetID:   "pool-admin-runtime",
		Primary:    testRuntimePoolRoute("route-runtime", "pool-admin-runtime", "node-runtime", runtimeEndpoint, 10),
	}
	spec := FabricChannelSpec{
		ChannelID:    "channel-web-1",
		ClusterID:    "cluster-1",
		SourceNodeID: "node-a",
		TargetKind:   FabricChannelTargetPool,
		TargetID:     "pool-admin-runtime",
		TrafficClass: "control",
		CreatedAt:    time.Now().UTC(),
	}

	result, err := channelRuntime.SendRequestResponse(context.Background(), spec, routeSet, []byte(`{"request":true}`))
	if err != nil {
		t.Fatalf("request response: %v", err)
	}
	if string(result.ResponsePayload) != `{"status":"ok"}` {
		t.Fatalf("response payload = %s", string(result.ResponsePayload))
	}

	routeOK := result.Channel.RouteID == "route-runtime"
	bytesOK := result.BytesSent == uint64(len(`{"request":true}`)) &&
		result.BytesRecv == uint64(len(`{"status":"ok"}`))
	framesOK := result.FramesSent == 1 && result.FramesRecv == 1
	acksOK := result.AcksReceived == 1
	if !(routeOK && bytesOK && framesOK && acksOK) {
		t.Fatalf("result = %+v", result)
	}
}
// TestFabricChannelRuntimeRequestResponseReroutesOnResponseFailure makes the
// primary route's fake session swallow requests without replying, with a 10ms
// runtime timeout, and expects the response to arrive over the standby route
// after exactly one migration.
func TestFabricChannelRuntimeRequestResponseReroutesOnResponseFailure(t *testing.T) {
	const (
		slowEndpoint = "quic://slow.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)
	transport := newFakeFabricRequestResponseTransport(map[string][]byte{
		fastEndpoint: []byte(`{"status":"ok"}`),
	})
	transport.failResponse[slowEndpoint] = true

	channelRuntime := NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
		RouterConfig: FabricChannelRouterConfig{MaxAckLatencyMs: 30},
		StreamID:     9,
		Timeout:      10 * time.Millisecond,
	})
	routeSet := FabricRouteSet{
		TargetKind:  FabricChannelTargetNode,
		TargetID:    "node-runtime",
		Primary:     testRuntimeRoute("route-slow", "node-runtime", slowEndpoint, 10),
		WarmStandby: []FabricRoute{testRuntimeRoute("route-fast", "node-runtime", fastEndpoint, 20)},
	}

	result, err := channelRuntime.SendRequestResponse(context.Background(), testFabricChannelSpec(FabricChannelTargetNode, "node-runtime"), routeSet, []byte(`{"request":true}`))
	if err != nil {
		t.Fatalf("request response: %v", err)
	}

	migratedOnce := result.MigrationEvents == 1
	onFastRoute := result.Channel.RouteID == "route-fast"
	gotResponse := string(result.ResponsePayload) == `{"status":"ok"}`
	if !(migratedOnce && onFastRoute && gotResponse) {
		t.Fatalf("result = %+v", result)
	}
}
func TestFabricTransportTargetForRouteUsesLastAddressedHop(t *testing.T) {
target, err := FabricTransportTargetForRoute(FabricRoute{
RouteID: "route-1",
Hops: []FabricRouteHop{
{NodeID: "node-a"},
{NodeID: "node-r", Mode: FabricRouteRelay, EndpointID: "relay-1", Address: "quic://relay.example.test:19443"},
{NodeID: "node-b", Mode: FabricRouteDirect, EndpointID: "node-b-quic", Address: "quic://node-b.example.test:19443"},
},
})
if err != nil {
t.Fatalf("target for route: %v", err)
}
if target.PeerID != "node-b" || target.EndpointID != "node-b-quic" || target.Endpoint != "quic://node-b.example.test:19443" || target.Transport != string(FabricRouteDirect) {
t.Fatalf("target = %+v", target)
}
}
// fakeFabricRequestResponseTransport is a test transport whose sessions answer
// each data frame with a canned response chosen by endpoint.
type fakeFabricRequestResponseTransport struct {
// mu is held by Connect while it reads the maps below and bumps connects.
mu sync.Mutex
// responses maps an endpoint to the payload its sessions reply with.
responses map[string][]byte
// failResponse marks endpoints whose sessions never reply.
failResponse map[string]bool
// connects counts Connect calls per endpoint.
connects map[string]int
}
// newFakeFabricRequestResponseTransport builds a fake transport that replies
// with the given per-endpoint responses. The responses map is stored as-is
// (not copied); the failure and connect-count maps start empty.
func newFakeFabricRequestResponseTransport(responses map[string][]byte) *fakeFabricRequestResponseTransport {
	transport := new(fakeFabricRequestResponseTransport)
	transport.responses = responses
	transport.failResponse = make(map[string]bool)
	transport.connects = make(map[string]int)
	return transport
}
// Connect records the attempt against target.Endpoint and returns a fresh fake
// session carrying a private copy of that endpoint's canned response and its
// failResponse flag. It never returns an error.
func (t *fakeFabricRequestResponseTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	key := target.Endpoint

	// Snapshot everything under the lock so the session owns its own data.
	t.mu.Lock()
	t.connects[key]++
	canned := append([]byte(nil), t.responses[key]...)
	silent := t.failResponse[key]
	t.mu.Unlock()

	session := &fakeFabricRequestResponseSession{
		response:     canned,
		failResponse: silent,
		frames:       make(chan fabricproto.Frame, 16),
		errors:       make(chan error, 1),
		done:         make(chan struct{}),
	}
	return session, nil
}
// Close is a no-op for the fake transport and always returns nil.
func (t *fakeFabricRequestResponseTransport) Close() error {
return nil
}
// fakeFabricRequestResponseSession is the session handed out by
// fakeFabricRequestResponseTransport. It echoes each data frame back as a data
// frame carrying the canned response, unless failResponse is set.
type fakeFabricRequestResponseSession struct {
// response is the payload placed in every reply frame.
response []byte
// failResponse, when true, makes Send accept frames without replying.
failResponse bool
// frames delivers reply frames to the code under test.
frames chan fabricproto.Frame
// errors is exposed through Errors; nothing in this type writes to it.
errors chan error
// done is closed by Close and lets pending reply goroutines exit.
done chan struct{}
// once guards the close of done so Close can be called repeatedly.
once sync.Once
}
// Send accepts every frame and always returns nil. For a data frame (and only
// when failResponse is unset) it asynchronously queues a data reply that
// mirrors the request's traffic class, stream ID and sequence and carries a
// copy of the canned response. The reply is dropped if the session is closed
// before it can be queued.
func (s *fakeFabricRequestResponseSession) Send(_ context.Context, frame fabricproto.Frame) error {
	if s.failResponse || frame.Type != fabricproto.FrameData {
		return nil
	}

	reply := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: frame.TrafficClass,
		StreamID:     frame.StreamID,
		Sequence:     frame.Sequence,
		Payload:      append([]byte(nil), s.response...),
	}
	go func() {
		select {
		case s.frames <- reply:
		case <-s.done:
		}
	}()
	return nil
}
// Frames returns the receive-only view of the session's reply channel.
func (s *fakeFabricRequestResponseSession) Frames() <-chan fabricproto.Frame {
return s.frames
}
// Errors returns the receive-only view of the session's error channel.
func (s *fakeFabricRequestResponseSession) Errors() <-chan error {
return s.errors
}
// Close marks the session closed by closing done exactly once; repeated calls
// are safe. It always returns nil.
func (s *fakeFabricRequestResponseSession) Close() error {
	closeDone := func() { close(s.done) }
	s.once.Do(closeDone)
	return nil
}
// Closed reports, without blocking, whether Close has been called.
func (s *fakeFabricRequestResponseSession) Closed() bool {
	select {
	case <-s.done:
		return true
	default:
	}
	return false
}
func TestFabricTransportTargetForRouteUsesRelayHopForRelayRoute(t *testing.T) {
target, err := FabricTransportTargetForRoute(FabricRoute{
RouteID: "route-relay",
RelayCount: 1,
Hops: []FabricRouteHop{
{NodeID: "node-a"},
{NodeID: "node-r", Mode: FabricRouteRelay, EndpointID: "relay-1", Address: "quic://relay.example.test:19443", PeerCertSHA256: "relay-cert"},
{NodeID: "node-b", Mode: FabricRouteRelay, EndpointID: "node-b-private", Address: "quic://10.0.0.2:19443", PeerCertSHA256: "node-b-cert"},
},
})
if err != nil {
t.Fatalf("target for relay route: %v", err)
}
if target.PeerID != "node-r" || target.EndpointID != "relay-1" || target.Endpoint != "quic://relay.example.test:19443" || target.PeerCertSHA256 != "relay-cert" {
t.Fatalf("target = %+v", target)
}
}
// fakeFabricRuntimeTransport is a test transport whose sessions acknowledge
// data frames after a per-endpoint delay and whose connects can be forced to
// fail.
type fakeFabricRuntimeTransport struct {
// mu guards the maps below inside Connect, connectCount and waitForConnect.
mu sync.Mutex
// delays maps an endpoint to how long its sessions wait before acking.
delays map[string]time.Duration
// failConnect marks endpoints for which Connect returns an error.
failConnect map[string]bool
// connects counts Connect calls per endpoint, including failed ones.
connects map[string]int
}
// newFakeFabricRuntimeTransport builds a fake transport with the given
// per-endpoint ack delays. The delays map is stored as-is (not copied); the
// connect-failure and connect-count maps start empty.
func newFakeFabricRuntimeTransport(delays map[string]time.Duration) *fakeFabricRuntimeTransport {
	transport := new(fakeFabricRuntimeTransport)
	transport.delays = delays
	transport.failConnect = make(map[string]bool)
	transport.connects = make(map[string]int)
	return transport
}
// Connect records the attempt against target.Endpoint. It returns
// ErrForwardPeerUnavailable when the endpoint is flagged in failConnect, and
// otherwise a new fake session configured with the endpoint's ack delay.
func (t *fakeFabricRuntimeTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	key := target.Endpoint

	t.mu.Lock()
	t.connects[key]++
	refuse, ackDelay := t.failConnect[key], t.delays[key]
	t.mu.Unlock()

	if refuse {
		return nil, ErrForwardPeerUnavailable
	}
	session := &fakeFabricRuntimeSession{
		endpoint: key,
		delay:    ackDelay,
		frames:   make(chan fabricproto.Frame, 64),
		errors:   make(chan error, 1),
		done:     make(chan struct{}),
	}
	return session, nil
}
// Close is a no-op for the fake transport and always returns nil.
func (t *fakeFabricRuntimeTransport) Close() error {
return nil
}
// connectCount returns how many times Connect has been called for endpoint,
// reading the counter under the transport's mutex.
func (t *fakeFabricRuntimeTransport) connectCount(endpoint string) int {
	t.mu.Lock()
	count := t.connects[endpoint]
	t.mu.Unlock()
	return count
}
// waitForConnect polls once per millisecond until endpoint has been connected
// at least count times, and fails the test if that has not happened within
// one second.
func (t *fakeFabricRuntimeTransport) waitForConnect(tb testing.TB, endpoint string, count int) {
	tb.Helper()
	deadline := time.Now().Add(time.Second)
	for got := t.connectCount(endpoint); got < count; got = t.connectCount(endpoint) {
		if time.Now().After(deadline) {
			tb.Fatalf("timed out waiting for %s connect count %d, got %d", endpoint, count, got)
		}
		time.Sleep(time.Millisecond)
	}
}
// fakeFabricRuntimeSession is the session handed out by
// fakeFabricRuntimeTransport. It answers each data frame with an ack frame
// after an optional delay.
type fakeFabricRuntimeSession struct {
// endpoint is the endpoint this session was connected to.
endpoint string
// delay is how long Send waits before queueing an ack.
delay time.Duration
// frames delivers ack frames to the code under test.
frames chan fabricproto.Frame
// errors is exposed through Errors; nothing in this type writes to it.
errors chan error
// done is closed by Close and lets pending ack goroutines exit.
done chan struct{}
// once guards the close of done so Close can be called repeatedly.
once sync.Once
}
// Send accepts every frame and always returns nil. For a data frame it
// asynchronously waits for the session's delay (if positive) and then queues
// an ack mirroring the frame's traffic class, stream ID and sequence. The ack
// is dropped if the session has been closed by the time it would be queued.
func (s *fakeFabricRuntimeSession) Send(_ context.Context, frame fabricproto.Frame) error {
	if frame.Type != fabricproto.FrameData {
		return nil
	}

	ackDelay := s.delay
	go func() {
		if ackDelay > 0 {
			time.Sleep(ackDelay)
		}
		ack := fabricproto.Frame{
			Type:         fabricproto.FrameAck,
			TrafficClass: frame.TrafficClass,
			StreamID:     frame.StreamID,
			Sequence:     frame.Sequence,
		}
		select {
		case s.frames <- ack:
		case <-s.done:
		}
	}()
	return nil
}
// Frames returns the receive-only view of the session's ack channel.
func (s *fakeFabricRuntimeSession) Frames() <-chan fabricproto.Frame {
return s.frames
}
// Errors returns the receive-only view of the session's error channel.
func (s *fakeFabricRuntimeSession) Errors() <-chan error {
return s.errors
}
// Close marks the session closed by closing done exactly once; repeated calls
// are safe. It always returns nil.
func (s *fakeFabricRuntimeSession) Close() error {
	closeDone := func() { close(s.done) }
	s.once.Do(closeDone)
	return nil
}
// Closed reports, without blocking, whether Close has been called.
func (s *fakeFabricRuntimeSession) Closed() bool {
	select {
	case <-s.done:
		return true
	default:
	}
	return false
}
// testRuntimeRoute builds a route via testFabricRoute and then makes its final
// hop dialable: the hop gets endpoint as its address, the route ID with the
// "route-" prefix removed as its endpoint ID, and direct mode.
func testRuntimeRoute(routeID string, destination string, endpoint string, latency int) FabricRoute {
	route := testFabricRoute(routeID, destination, latency, 100, 0, true)
	lastHop := &route.Hops[len(route.Hops)-1]
	lastHop.Address = endpoint
	lastHop.EndpointID = strings.TrimPrefix(routeID, "route-")
	lastHop.Mode = FabricRouteDirect
	return route
}
// testRuntimePoolRoute is testRuntimeRoute with the resulting route's PoolID
// set to poolID.
func testRuntimePoolRoute(routeID string, poolID string, destination string, endpoint string, latency int) FabricRoute {
route := testRuntimeRoute(routeID, destination, endpoint, latency)
route.PoolID = poolID
return route
}
@@ -0,0 +1,390 @@
package mesh
import (
"errors"
"sort"
"strings"
"time"
)
// FabricChannelTargetKind says what a channel is addressed to. Only the two
// values below pass ValidateFabricChannelSpec.
type FabricChannelTargetKind string
const (
// FabricChannelTargetNode addresses a specific node; route matching then
// compares the route's DestinationNodeID with the spec's TargetID.
FabricChannelTargetNode FabricChannelTargetKind = "node"
// FabricChannelTargetPool addresses a pool; route matching then compares the
// route's PoolID with the spec's TargetID.
FabricChannelTargetPool FabricChannelTargetKind = "pool"
)
// FabricChannelLifecycleState is the state recorded in FabricChannel.State.
// NOTE(review): the transitions between these states are not defined in this
// part of the file.
type FabricChannelLifecycleState string
const (
FabricChannelOpening FabricChannelLifecycleState = "opening"
FabricChannelOpen FabricChannelLifecycleState = "open"
FabricChannelDraining FabricChannelLifecycleState = "draining"
FabricChannelClosed FabricChannelLifecycleState = "closed"
)
// FabricRouteMode labels how a hop or adjacency is reached. Every defined
// value is a QUIC variant.
type FabricRouteMode string
const (
FabricRouteDirect FabricRouteMode = "direct_quic"
FabricRouteLAN FabricRouteMode = "lan_quic"
FabricRouteReverse FabricRouteMode = "reverse_quic"
FabricRouteRelay FabricRouteMode = "relay_quic"
FabricRouteICE FabricRouteMode = "ice_quic"
)
// Sentinel errors for channel validation and route selection; compare with
// errors.Is.
var (
// ErrFabricChannelInvalid is returned by ValidateFabricChannelSpec when a
// required field is blank or the target kind is unknown.
ErrFabricChannelInvalid = errors.New("fabric channel request is invalid")
// ErrFabricRouteNotFound is returned by ChooseRoute when the route set is
// empty or no route survives filtering.
ErrFabricRouteNotFound = errors.New("fabric route not found")
)
// FabricChannelSpec describes a channel to be routed. ValidateFabricChannelSpec
// requires ChannelID, ClusterID, SourceNodeID and TargetID to be non-blank and
// TargetKind to be node or pool.
type FabricChannelSpec struct {
ChannelID string
ClusterID string
SourceNodeID string
TargetKind FabricChannelTargetKind
// TargetID is a node ID or pool ID, depending on TargetKind.
TargetID string
TrafficClass string
// NOTE(review): MinBandwidth is not consulted by the scheduler code in this
// file section — confirm where it is enforced.
MinBandwidth int64
// StickyKey is set to the request's ResourceID by
// FabricChannelSpecFromServiceRequest.
StickyKey string
CreatedAt time.Time
// ForbiddenHops lists node IDs; ChooseRoute rejects any route that has a hop
// on one of them.
ForbiddenHops []string
}
// FabricServiceChannelTarget is the target section of a service channel
// request. FabricChannelSpecFromServiceRequest reads Kind, PoolIDs, NodeIDs and
// SelectedNodeID to derive the spec's target; the remaining fields are not
// read by that conversion.
type FabricServiceChannelTarget struct {
// Kind defaults to pool when left empty.
Kind FabricChannelTargetKind
PoolIDs []string
NodeIDs []string
SelectedNodeID string
ServiceRole string
SelectionPolicy string
SingleMemberPool bool
}
// FabricServiceChannelRequest is a service-level request for a channel that
// FabricChannelSpecFromServiceRequest turns into a FabricChannelSpec.
type FabricServiceChannelRequest struct {
SchemaVersion string
// ChannelID falls back to ResourceID when blank.
ChannelID string
ClusterID string
OrganizationID string
UserID string
// ResourceID also becomes the spec's StickyKey.
ResourceID string
// SourceNodeID falls back to the caller-supplied local node ID when blank.
SourceNodeID string
SourceRole string
// ServiceClass selects the default traffic class when TrafficClass is blank.
ServiceClass string
Target FabricServiceChannelTarget
TrafficClass string
// NOTE(review): CreatedAt is not copied into the spec by the conversion in
// this file; the spec uses the conversion's own timestamp instead.
CreatedAt time.Time
}
// FabricChannel is the runtime record of a channel: its spec, lifecycle state,
// the route it is currently bound to, and traffic/reroute counters.
// NOTE(review): the code that maintains these fields is outside this part of
// the file.
type FabricChannel struct {
Spec FabricChannelSpec
State FabricChannelLifecycleState
RouteID string
TargetNode string
OpenedAt time.Time
LastReroute time.Time
BytesSent uint64
BytesRecv uint64
FramesSent uint64
FramesRecv uint64
RerouteCount uint64
}
// FabricRouteHop is one node along a route. A hop can be dialed only when
// Address is non-blank.
type FabricRouteHop struct {
// NodeID is matched against a spec's ForbiddenHops during route filtering.
NodeID string
Mode FabricRouteMode
EndpointID string
Address string
PeerCertSHA256 string
}
// FabricRoute is a candidate path to a destination together with the metrics
// the scheduler scores it on.
type FabricRoute struct {
// A route with a blank RouteID is never usable.
RouteID string
// ClusterID, SourceNodeID, DestinationNodeID and PoolID are only checked
// against the channel spec when they are non-empty.
ClusterID string
SourceNodeID string
DestinationNodeID string
PoolID string
Hops []FabricRouteHop
// BaseLatencyMs, JitterMs and LossPermille feed the weighted score.
BaseLatencyMs int
JitterMs int
LossPermille int
// Capacity and ActiveChannels determine route pressure; a Capacity of zero
// or less is treated as 100% pressure.
Capacity int
ActiveChannels int
// RelayCount adds a per-relay penalty to the score.
RelayCount int
LastUpdatedAt time.Time
// Healthy must be true for the route to be considered at all.
Healthy bool
// Degraded adds a flat penalty to the score but does not exclude the route.
Degraded bool
}
// FabricRouteSet groups the candidate routes for one target. The scheduler
// considers them in the order Primary, WarmStandby, ColdFallbacks; a Primary
// with a blank RouteID is ignored.
type FabricRouteSet struct {
TargetKind FabricChannelTargetKind
TargetID string
Primary FabricRoute
WarmStandby []FabricRoute
ColdFallbacks []FabricRoute
}
// FabricAdjacency records observed properties of the link from one node to
// another.
// NOTE(review): no code in this part of the file reads this type; field
// semantics below are taken from the names only — confirm against the
// producer.
type FabricAdjacency struct {
FromNodeID string
ToNodeID string
Mode FabricRouteMode
RTTMs int
JitterMs int
LossPermille int
Capacity int
ActiveChannels int
ThroughputBps int64
PressurePercent int
Healthy bool
PassiveOutbound bool
LocalSegmentID string
NATGroupID string
LastObservedAt time.Time
LastFailureReason string
}
// FabricRouteChoice is the scheduler's verdict on one route.
type FabricRouteChoice struct {
Route FabricRoute
// Score is the weighted cost of the route; ChooseRoute prefers lower values.
Score int
// Reason is a short machine-readable label explaining the score.
Reason string
// PressureBefore is the route's pressure percentage as it stands;
// PressureAfter includes the projected cost of the new channel.
PressureBefore int
PressureAfter int
}
// FabricRouteSchedulerConfig holds the scoring weights. Any weight, penalty or
// cost left at zero or below is replaced by the default noted beside it.
type FabricRouteSchedulerConfig struct {
// Default 10.
LatencyWeight int
// Default 4.
JitterWeight int
// Default 8.
LossWeight int
// Default 12.
PressureWeight int
// Default 5, applied per hop.
HopPenalty int
// Default 25, applied per relay.
RelayPenalty int
// Default 500, applied once to degraded routes.
DegradedPenalty int
// Default 1; the number of channels added when projecting pressure.
ProjectedChannelCost int
// HardMaxRoutePressure, when positive, excludes routes whose projected
// pressure exceeds it. Zero disables the limit; negative values become zero.
HardMaxRoutePressure int
}
// FabricRouteScheduler ranks candidate routes for a channel using Config.
type FabricRouteScheduler struct {
Config FabricRouteSchedulerConfig
}
// NewFabricRouteScheduler returns a scheduler whose config has had defaults
// filled in for every non-positive weight.
func NewFabricRouteScheduler(cfg FabricRouteSchedulerConfig) FabricRouteScheduler {
return FabricRouteScheduler{Config: normalizeFabricRouteSchedulerConfig(cfg)}
}
// ChooseRoute picks the best route in routeSet for the channel described by
// spec.
//
// The spec is validated first. Routes are then filtered by fabricRouteUsable
// and, when HardMaxRoutePressure is positive, by projected pressure. Survivors
// are ordered by ascending score, then projected pressure, then base latency,
// then route ID, and the first is returned. ErrFabricRouteNotFound is returned
// when the set is empty or nothing survives filtering.
func (s FabricRouteScheduler) ChooseRoute(spec FabricChannelSpec, routeSet FabricRouteSet, now time.Time) (FabricRouteChoice, error) {
	if err := ValidateFabricChannelSpec(spec); err != nil {
		return FabricRouteChoice{}, err
	}

	candidates := flattenFabricRouteSet(routeSet)
	if len(candidates) == 0 {
		return FabricRouteChoice{}, ErrFabricRouteNotFound
	}

	blockedNodes := stringSet(spec.ForbiddenHops)
	pressureLimit := s.Config.HardMaxRoutePressure
	ranked := make([]FabricRouteChoice, 0, len(candidates))
	for _, candidate := range candidates {
		if !fabricRouteUsable(spec, candidate, blockedNodes, now) {
			continue
		}
		scored := s.scoreRoute(candidate)
		if pressureLimit > 0 && scored.PressureAfter > pressureLimit {
			continue
		}
		// scoreRoute leaves Route unset; attach it here.
		scored.Route = candidate
		ranked = append(ranked, scored)
	}
	if len(ranked) == 0 {
		return FabricRouteChoice{}, ErrFabricRouteNotFound
	}

	sort.SliceStable(ranked, func(i, j int) bool {
		left, right := &ranked[i], &ranked[j]
		switch {
		case left.Score != right.Score:
			return left.Score < right.Score
		case left.PressureAfter != right.PressureAfter:
			return left.PressureAfter < right.PressureAfter
		case left.Route.BaseLatencyMs != right.Route.BaseLatencyMs:
			return left.Route.BaseLatencyMs < right.Route.BaseLatencyMs
		default:
			return left.Route.RouteID < right.Route.RouteID
		}
	})
	return ranked[0], nil
}
// ValidateFabricChannelSpec returns ErrFabricChannelInvalid when the channel,
// cluster, source-node or target ID is blank (after trimming whitespace), or
// when the target kind is neither node nor pool. It returns nil otherwise.
func ValidateFabricChannelSpec(spec FabricChannelSpec) error {
	required := [...]string{spec.ChannelID, spec.ClusterID, spec.SourceNodeID, spec.TargetID}
	for _, value := range required {
		if strings.TrimSpace(value) == "" {
			return ErrFabricChannelInvalid
		}
	}
	if spec.TargetKind == FabricChannelTargetNode || spec.TargetKind == FabricChannelTargetPool {
		return nil
	}
	return ErrFabricChannelInvalid
}
// FabricChannelSpecFromServiceRequest converts a service channel request into
// a validated FabricChannelSpec.
//
// A zero now is replaced with the current UTC time. The source node falls back
// to localNodeID, the channel ID to the resource ID, and the traffic class to
// the service class's default. An empty target kind is treated as pool. For a
// pool target the ID preference is first pool ID, selected node, first node
// ID; for a node target it is selected node, first node ID, then whatever the
// pool preference produced. The resulting spec is validated and any validation
// error is returned with a zero spec.
func FabricChannelSpecFromServiceRequest(req FabricServiceChannelRequest, localNodeID string, now time.Time) (FabricChannelSpec, error) {
	if now.IsZero() {
		now = time.Now().UTC()
	}

	kind := req.Target.Kind
	if kind == "" {
		kind = FabricChannelTargetPool
	}

	firstPool := firstString(req.Target.PoolIDs)
	selectedNode := strings.TrimSpace(req.Target.SelectedNodeID)
	firstNode := firstString(req.Target.NodeIDs)
	targetID := firstNonEmpty(firstPool, selectedNode, firstNode)
	if kind == FabricChannelTargetNode {
		targetID = firstNonEmpty(selectedNode, firstNode, targetID)
	}

	resourceID := strings.TrimSpace(req.ResourceID)
	spec := FabricChannelSpec{
		ChannelID:    firstNonEmpty(strings.TrimSpace(req.ChannelID), resourceID),
		ClusterID:    strings.TrimSpace(req.ClusterID),
		SourceNodeID: firstNonEmpty(strings.TrimSpace(req.SourceNodeID), strings.TrimSpace(localNodeID)),
		TargetKind:   kind,
		TargetID:     targetID,
		TrafficClass: firstNonEmpty(strings.TrimSpace(req.TrafficClass), serviceClassDefaultTrafficClass(req.ServiceClass)),
		StickyKey:    resourceID,
		CreatedAt:    now,
	}
	if err := ValidateFabricChannelSpec(spec); err != nil {
		return FabricChannelSpec{}, err
	}
	return spec, nil
}
// serviceClassDefaultTrafficClass maps a service class (compared
// case-insensitively, ignoring surrounding whitespace) to its default traffic
// class: bulk for VPN packets, interactive for remote workspaces, and reliable
// for everything else.
func serviceClassDefaultTrafficClass(serviceClass string) string {
	normalized := strings.TrimSpace(strings.ToLower(serviceClass))
	if normalized == FabricServiceClassVPNPackets {
		return FabricServiceChannelBulk
	}
	if normalized == FabricServiceClassRemoteWorkspace {
		return FabricServiceChannelInteractive
	}
	return FabricServiceChannelReliable
}
// firstString returns the first element of values that is non-empty after
// trimming whitespace, in its trimmed form. It returns "" when there is none.
func firstString(values []string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// scoreRoute computes the weighted cost of route and a reason label. The score
// sums latency, jitter, loss, projected pressure, hop count and relay count,
// each multiplied by its configured weight, plus a flat penalty when the route
// is degraded. The reason is "relay_fallback_available" for relayed routes,
// otherwise "capacity_pressure_avoidance" when projected pressure is at least
// 90, otherwise "latency_load_score". The returned choice does not have Route
// set.
func (s FabricRouteScheduler) scoreRoute(route FabricRoute) FabricRouteChoice {
	cfg := normalizeFabricRouteSchedulerConfig(s.Config)
	before := fabricRoutePressurePercent(route, 0)
	after := fabricRoutePressurePercent(route, cfg.ProjectedChannelCost)

	total := route.BaseLatencyMs * cfg.LatencyWeight
	total += route.JitterMs * cfg.JitterWeight
	total += route.LossPermille * cfg.LossWeight
	total += after * cfg.PressureWeight
	total += len(route.Hops) * cfg.HopPenalty
	total += route.RelayCount * cfg.RelayPenalty
	if route.Degraded {
		total += cfg.DegradedPenalty
	}

	// The relay label takes precedence over the pressure label.
	var label string
	switch {
	case route.RelayCount > 0:
		label = "relay_fallback_available"
	case after >= 90:
		label = "capacity_pressure_avoidance"
	default:
		label = "latency_load_score"
	}

	return FabricRouteChoice{
		Score:          total,
		Reason:         label,
		PressureBefore: before,
		PressureAfter:  after,
	}
}
// normalizeFabricRouteSchedulerConfig returns cfg with every non-positive
// weight, penalty and projected cost replaced by its default, and a negative
// HardMaxRoutePressure clamped to zero (no limit).
func normalizeFabricRouteSchedulerConfig(cfg FabricRouteSchedulerConfig) FabricRouteSchedulerConfig {
	// cfg is a local copy, so writing through these pointers does not affect
	// the caller's value.
	defaults := []struct {
		field    *int
		fallback int
	}{
		{&cfg.LatencyWeight, 10},
		{&cfg.JitterWeight, 4},
		{&cfg.LossWeight, 8},
		{&cfg.PressureWeight, 12},
		{&cfg.HopPenalty, 5},
		{&cfg.RelayPenalty, 25},
		{&cfg.DegradedPenalty, 500},
		{&cfg.ProjectedChannelCost, 1},
	}
	for _, entry := range defaults {
		if *entry.field <= 0 {
			*entry.field = entry.fallback
		}
	}
	if cfg.HardMaxRoutePressure < 0 {
		cfg.HardMaxRoutePressure = 0
	}
	return cfg
}
// flattenFabricRouteSet lists the routes of routeSet in priority order: the
// primary (only if its RouteID is non-blank), then warm standbys, then cold
// fallbacks. The result is always non-nil.
func flattenFabricRouteSet(routeSet FabricRouteSet) []FabricRoute {
	capacity := 1 + len(routeSet.WarmStandby) + len(routeSet.ColdFallbacks)
	flattened := make([]FabricRoute, 0, capacity)
	if primary := routeSet.Primary; strings.TrimSpace(primary.RouteID) != "" {
		flattened = append(flattened, primary)
	}
	for _, group := range [][]FabricRoute{routeSet.WarmStandby, routeSet.ColdFallbacks} {
		flattened = append(flattened, group...)
	}
	return flattened
}
// fabricRouteUsable reports whether route may carry the channel described by
// spec. A route is rejected when it has a blank ID, is unhealthy, names a
// different cluster (when both sides name one), names a different source node,
// names a different destination node or pool for the spec's target kind, or
// passes through a node in forbidden. Empty route fields act as wildcards.
// The now argument is accepted but not consulted.
func fabricRouteUsable(spec FabricChannelSpec, route FabricRoute, forbidden map[string]struct{}, now time.Time) bool {
	switch {
	case strings.TrimSpace(route.RouteID) == "", !route.Healthy:
		return false
	case route.ClusterID != "" && spec.ClusterID != "" && route.ClusterID != spec.ClusterID:
		return false
	case route.SourceNodeID != "" && route.SourceNodeID != spec.SourceNodeID:
		return false
	}

	if spec.TargetKind == FabricChannelTargetNode {
		if dest := route.DestinationNodeID; dest != "" && dest != spec.TargetID {
			return false
		}
	} else if spec.TargetKind == FabricChannelTargetPool {
		if pool := route.PoolID; pool != "" && pool != spec.TargetID {
			return false
		}
	}

	for i := range route.Hops {
		if _, blocked := forbidden[route.Hops[i].NodeID]; blocked {
			return false
		}
	}
	return true
}
// fabricRoutePressurePercent returns how full route would be, as an integer
// percentage in [0, 100], with projected extra channels added to its active
// count. A route with zero or negative capacity is reported as 100% full.
func fabricRoutePressurePercent(route FabricRoute, projected int) int {
	if route.Capacity <= 0 {
		return 100
	}
	load := route.ActiveChannels + projected
	if load <= 0 {
		return 0
	}
	if percent := load * 100 / route.Capacity; percent < 100 {
		return percent
	}
	return 100
}
// stringSet builds a set from values, trimming whitespace from each entry and
// skipping entries that are blank after trimming. The result is never nil.
func stringSet(values []string) map[string]struct{} {
	set := make(map[string]struct{}, len(values))
	for i := range values {
		trimmed := strings.TrimSpace(values[i])
		if trimmed == "" {
			continue
		}
		set[trimmed] = struct{}{}
	}
	return set
}
@@ -0,0 +1,244 @@
package mesh
import (
"errors"
"testing"
"time"
)
func TestFabricRouteSchedulerAvoidsSaturatedShortestRoute(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
spec := FabricChannelSpec{
ChannelID: "channel-1",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
}
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
Primary: FabricRoute{
RouteID: "short-saturated",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-b"}},
BaseLatencyMs: 10,
Capacity: 10,
ActiveChannels: 10,
Healthy: true,
},
WarmStandby: []FabricRoute{{
RouteID: "slightly-longer-free",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-r"}, {NodeID: "node-b"}},
BaseLatencyMs: 18,
Capacity: 100,
ActiveChannels: 5,
RelayCount: 1,
Healthy: true,
}},
}, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.RouteID != "slightly-longer-free" {
t.Fatalf("route = %q, want slightly-longer-free score=%d pressure=%d", choice.Route.RouteID, choice.Score, choice.PressureAfter)
}
}
func TestFabricChannelSpecFromServiceRequestTargetsPool(t *testing.T) {
spec, err := FabricChannelSpecFromServiceRequest(FabricServiceChannelRequest{
ChannelID: "vpn-1",
ClusterID: "cluster-1",
ResourceID: "vpn-1",
ServiceClass: FabricServiceClassVPNPackets,
Target: FabricServiceChannelTarget{
Kind: FabricChannelTargetPool,
PoolIDs: []string{"home-ipv4"},
ServiceRole: "ipv4-egress",
},
}, "android-node", time.Now())
if err != nil {
t.Fatalf("service request spec: %v", err)
}
if spec.SourceNodeID != "android-node" || spec.TargetKind != FabricChannelTargetPool || spec.TargetID != "home-ipv4" || spec.TrafficClass != FabricServiceChannelBulk {
t.Fatalf("unexpected spec: %+v", spec)
}
}
func TestFabricChannelSpecFromServiceRequestKeepsServiceOutOfEndpointSelection(t *testing.T) {
_, err := FabricChannelSpecFromServiceRequest(FabricServiceChannelRequest{
ChannelID: "rdp-1",
ClusterID: "cluster-1",
ServiceClass: FabricServiceClassRemoteWorkspace,
Target: FabricServiceChannelTarget{
Kind: FabricChannelTargetPool,
ServiceRole: "rdp-gateway",
},
}, "client-node", time.Now())
if !errors.Is(err, ErrFabricChannelInvalid) {
t.Fatalf("err = %v, want invalid without pool/node target id", err)
}
}
func TestFabricRouteSchedulerPoolSkipsFailedEndpoint(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
spec := FabricChannelSpec{
ChannelID: "channel-pool",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
}
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
Primary: FabricRoute{
RouteID: "pool-node-dead",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
PoolID: "pool-egress",
Capacity: 100,
Healthy: false,
},
WarmStandby: []FabricRoute{{
RouteID: "pool-node-live",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-c",
PoolID: "pool-egress",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-c"}},
BaseLatencyMs: 25,
Capacity: 100,
Healthy: true,
}},
}, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.DestinationNodeID != "node-c" {
t.Fatalf("destination = %q, want node-c", choice.Route.DestinationNodeID)
}
}
func TestFabricRouteSchedulerHonorsForbiddenHops(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{})
spec := FabricChannelSpec{
ChannelID: "channel-1",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
ForbiddenHops: []string{"node-r"},
}
_, err := scheduler.ChooseRoute(spec, FabricRouteSet{
Primary: FabricRoute{
RouteID: "blocked",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Hops: []FabricRouteHop{{NodeID: "node-a"}, {NodeID: "node-r"}, {NodeID: "node-b"}},
Capacity: 100,
Healthy: true,
},
}, time.Now())
if !errors.Is(err, ErrFabricRouteNotFound) {
t.Fatalf("err = %v, want ErrFabricRouteNotFound", err)
}
}
func TestFabricRouteSchedulerRejectsRoutesAboveHardPressureLimit(t *testing.T) {
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{HardMaxRoutePressure: 80})
spec := FabricChannelSpec{
ChannelID: "channel-pressure",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
}
choice, err := scheduler.ChooseRoute(spec, FabricRouteSet{
Primary: FabricRoute{
RouteID: "too-busy",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Capacity: 10,
ActiveChannels: 9,
Healthy: true,
},
WarmStandby: []FabricRoute{{
RouteID: "admissible",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
Capacity: 10,
ActiveChannels: 5,
Healthy: true,
}},
}, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.RouteID != "admissible" {
t.Fatalf("route = %q, want admissible", choice.Route.RouteID)
}
}
func TestFabricRouteSchedulerKeepsHighLatencyRouteAsFallbackUntilFastRouteSaturates(t *testing.T) {
spec := FabricChannelSpec{
ChannelID: "channel-latency-aware",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
}
routeSet := FabricRouteSet{
TargetKind: FabricChannelTargetPool,
TargetID: "pool-egress",
Primary: FabricRoute{
RouteID: "lan-fast",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-lan",
PoolID: "pool-egress",
BaseLatencyMs: 4,
Capacity: 100,
ActiveChannels: 85,
Healthy: true,
},
WarmStandby: []FabricRoute{{
RouteID: "wan-slow",
ClusterID: "cluster-1",
SourceNodeID: "node-a",
DestinationNodeID: "node-wan",
PoolID: "pool-egress",
BaseLatencyMs: 420,
Capacity: 100,
ActiveChannels: 0,
Healthy: true,
}},
}
scheduler := NewFabricRouteScheduler(FabricRouteSchedulerConfig{HardMaxRoutePressure: 90})
choice, err := scheduler.ChooseRoute(spec, routeSet, time.Now())
if err != nil {
t.Fatalf("choose route: %v", err)
}
if choice.Route.RouteID != "lan-fast" {
t.Fatalf("route = %q, want fast LAN before hard pressure limit", choice.Route.RouteID)
}
routeSet.Primary.ActiveChannels = 90
choice, err = scheduler.ChooseRoute(spec, routeSet, time.Now())
if err != nil {
t.Fatalf("choose fallback route: %v", err)
}
if choice.Route.RouteID != "wan-slow" {
t.Fatalf("route = %q, want WAN only after LAN reaches hard pressure limit", choice.Route.RouteID)
}
}
@@ -0,0 +1,130 @@
package mesh
import (
"context"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// FabricOverlayTransportConfig carries the identity and runtime settings used
// to build a FabricOverlayTransport.
type FabricOverlayTransportConfig struct {
// ClusterID is trimmed and stamped on every channel spec built by Send.
ClusterID string
// LocalNodeID is trimmed and used as the source node of every channel spec.
LocalNodeID string
// RouterConfig is forwarded unchanged to the channel runtime.
RouterConfig FabricChannelRouterConfig
// Timeout is forwarded to the channel runtime; NewFabricOverlayTransport
// replaces non-positive values with 30 seconds.
Timeout time.Duration
}
// FabricOverlayTransport sends payloads to named targets through a
// FabricChannelRuntime, looking up the candidate routes for each target in a
// per-target route-set table.
type FabricOverlayTransport struct {
// Runtime performs the sends and is the source of the pressure and health
// snapshots exposed by this type.
Runtime *FabricChannelRuntime
// RouteSets maps a target ID to the routes that may be used to reach it.
RouteSets map[string]FabricRouteSet
// Config is the configuration the transport was constructed with.
Config FabricOverlayTransportConfig
// sequence numbers the channel IDs generated by Send.
sequence atomic.Uint64
}
// FabricOverlayTransportSnapshot is the JSON-serialisable view returned by
// FabricOverlayTransport.Snapshot.
type FabricOverlayTransportSnapshot struct {
// RoutePressure is the runtime's route pressure snapshot.
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
// RouteHealth is the runtime's route health snapshot.
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
}
// FabricOverlaySendRequest describes one FabricOverlayTransport.Send call.
type FabricOverlaySendRequest struct {
// ChannelID is optional; when blank Send generates a "fabric-overlay-N" ID.
ChannelID string
// TargetKind is optional; when empty Send falls back to the route set's
// kind and then to FabricChannelTargetNode.
TargetKind FabricChannelTargetKind
// TargetID is required and selects the route set; it is trimmed by Send.
TargetID string
// TrafficClass is optional; the zero value is treated as
// fabricproto.TrafficClassReliable.
TrafficClass fabricproto.TrafficClass
// Payloads is handed to the runtime's SendReliable unchanged.
Payloads [][]byte
// StickyKey is trimmed and copied onto the channel spec.
StickyKey string
}
// NewFabricOverlayTransport builds an overlay transport on top of transport.
//
// A non-positive cfg.Timeout is replaced with 30 seconds. The route-set table
// is copied into a fresh map keyed by the whitespace-trimmed target ID;
// entries whose key is blank after trimming are dropped.
func NewFabricOverlayTransport(transport FabricTransport, routeSets map[string]FabricRouteSet, cfg FabricOverlayTransportConfig) *FabricOverlayTransport {
	if cfg.Timeout <= 0 {
		cfg.Timeout = 30 * time.Second
	}

	byTarget := make(map[string]FabricRouteSet, len(routeSets))
	for rawID, set := range routeSets {
		id := strings.TrimSpace(rawID)
		if id == "" {
			continue
		}
		byTarget[id] = set
	}

	return &FabricOverlayTransport{
		Runtime: NewFabricChannelRuntime(transport, FabricChannelRuntimeConfig{
			RouterConfig: cfg.RouterConfig,
			Timeout:      cfg.Timeout,
		}),
		RouteSets: byTarget,
		Config:    cfg,
	}
}
// Send delivers req.Payloads to the target named by req.TargetID using the
// route set registered for that target.
//
// Defaults applied before the send:
//   - TargetKind: request value, else the route set's kind, else
//     FabricChannelTargetNode.
//   - TrafficClass: request value, else fabricproto.TrafficClassReliable.
//   - ChannelID: trimmed request value, else a generated "fabric-overlay-N".
//
// It returns ErrForwardRuntimeUnavailable when the transport or its runtime is
// nil, ErrFabricChannelInvalid when the target ID is blank, and
// ErrFabricRouteNotFound when no route set is registered for the target.
// Otherwise the result and error come from the runtime's SendReliable.
func (t *FabricOverlayTransport) Send(ctx context.Context, req FabricOverlaySendRequest) (FabricChannelRuntimeResult, error) {
	if t == nil || t.Runtime == nil {
		return FabricChannelRuntimeResult{}, ErrForwardRuntimeUnavailable
	}
	targetID := strings.TrimSpace(req.TargetID)
	if targetID == "" {
		return FabricChannelRuntimeResult{}, ErrFabricChannelInvalid
	}
	routeSet, ok := t.RouteSets[targetID]
	if !ok {
		return FabricChannelRuntimeResult{}, ErrFabricRouteNotFound
	}

	targetKind := req.TargetKind
	if targetKind == "" {
		targetKind = routeSet.TargetKind
	}
	if targetKind == "" {
		targetKind = FabricChannelTargetNode
	}

	trafficClass := req.TrafficClass
	if trafficClass == 0 {
		trafficClass = fabricproto.TrafficClassReliable
	}

	// Only generate (and consume a sequence number for) a fallback ID when the
	// caller did not supply one.
	channelID := strings.TrimSpace(req.ChannelID)
	if channelID == "" {
		channelID = fmt.Sprintf("fabric-overlay-%d", t.sequence.Add(1))
	}

	// NOTE(review): this writes to the shared runtime configuration on every
	// call. If Send is ever invoked concurrently with differing traffic
	// classes this looks like a data race; confirm against the runtime's
	// usage of Config.TrafficClass.
	t.Runtime.Config.TrafficClass = trafficClass

	spec := FabricChannelSpec{
		ChannelID:    channelID,
		ClusterID:    strings.TrimSpace(t.Config.ClusterID),
		SourceNodeID: strings.TrimSpace(t.Config.LocalNodeID),
		TargetKind:   targetKind,
		TargetID:     targetID,
		TrafficClass: loadFabricTrafficClassName(trafficClass),
		StickyKey:    strings.TrimSpace(req.StickyKey),
		CreatedAt:    time.Now().UTC(),
	}
	return t.Runtime.SendReliable(ctx, spec, routeSet, req.Payloads)
}
// SnapshotPressure returns the runtime pressure tracker's snapshot, or the
// zero snapshot when the transport, its runtime, or the tracker is nil.
func (t *FabricOverlayTransport) SnapshotPressure() FabricRoutePressureSnapshot {
	if t != nil && t.Runtime != nil && t.Runtime.Pressure != nil {
		return t.Runtime.Pressure.SnapshotPressure()
	}
	return FabricRoutePressureSnapshot{}
}
// Snapshot returns the runtime's route pressure and route health snapshots.
// A nil transport or runtime yields the zero snapshot.
func (t *FabricOverlayTransport) Snapshot() FabricOverlayTransportSnapshot {
	var out FabricOverlayTransportSnapshot
	if t == nil || t.Runtime == nil {
		return out
	}
	out.RoutePressure = t.Runtime.snapshotRoutePressure()
	out.RouteHealth = t.Runtime.snapshotRouteHealth()
	return out
}
// loadFabricTrafficClassName maps a fabricproto traffic class to its lowercase
// name ("control", "interactive", "bulk", "reliable"). Any other value is
// rendered as "traffic_class_<n>".
func loadFabricTrafficClassName(trafficClass fabricproto.TrafficClass) string {
	if trafficClass == fabricproto.TrafficClassControl {
		return "control"
	}
	if trafficClass == fabricproto.TrafficClassInteractive {
		return "interactive"
	}
	if trafficClass == fabricproto.TrafficClassBulk {
		return "bulk"
	}
	if trafficClass == fabricproto.TrafficClassReliable {
		return "reliable"
	}
	return fmt.Sprintf("traffic_class_%d", trafficClass)
}
@@ -0,0 +1,49 @@
package mesh
import (
"context"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestFabricOverlayTransportSendsThroughRouteSet sends one reliable payload to
// a node target over a single direct route and checks the byte/ack counters,
// that no route pressure is left acquired, and that nothing was quarantined.
func TestFabricOverlayTransportSendsThroughRouteSet(t *testing.T) {
	fake := newFakeFabricRuntimeTransport(map[string]time.Duration{
		"quic://node-b:19443": 0,
	})
	directRoute := FabricRoute{
		RouteID:           "node-b-direct",
		ClusterID:         "cluster-1",
		SourceNodeID:      "node-a",
		DestinationNodeID: "node-b",
		Hops: []FabricRouteHop{{
			NodeID:     "node-b",
			Mode:       FabricRouteDirect,
			EndpointID: "node-b-direct",
			Address:    "quic://node-b:19443",
		}},
		Capacity: 100,
		Healthy:  true,
	}
	routeSets := map[string]FabricRouteSet{
		"node-b": {
			TargetKind: FabricChannelTargetNode,
			TargetID:   "node-b",
			Primary:    directRoute,
		},
	}
	overlay := NewFabricOverlayTransport(fake, routeSets, FabricOverlayTransportConfig{
		ClusterID:   "cluster-1",
		LocalNodeID: "node-a",
	})

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	body := []byte("payload")
	result, err := overlay.Send(ctx, FabricOverlaySendRequest{
		TargetID:     "node-b",
		TrafficClass: fabricproto.TrafficClassReliable,
		Payloads:     [][]byte{body},
	})
	if err != nil {
		t.Fatalf("send: %v", err)
	}
	if result.BytesSent != uint64(len(body)) || result.AcksReceived != 1 {
		t.Fatalf("result = %+v", result)
	}

	pressure := overlay.SnapshotPressure()
	if pressure.ActiveTotal != 0 || pressure.AcquiredTotal != pressure.ReleasedTotal {
		t.Fatalf("pressure leak: %+v", pressure)
	}

	snapshot := overlay.Snapshot()
	if snapshot.RoutePressure.AcquiredTotal != 1 || len(snapshot.RouteHealth.Quarantined) != 0 {
		t.Fatalf("snapshot = %+v", snapshot)
	}
}
@@ -3,9 +3,12 @@ package mesh
import (
"context"
"crypto/tls"
"encoding/json"
"fmt"
"net"
"strings"
"sync"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/quic-go/quic-go"
@@ -14,6 +17,13 @@ import (
// QUICFabricServer accepts QUIC fabric connections and dispatches the frames
// received on their streams to the configured handlers.
type QUICFabricServer struct {
listener *quic.Listener
logger FabricSessionEventLogger
// reverseMu guards reverseTransport, which can be swapped at runtime via
// SetReverseTransport.
reverseMu sync.RWMutex
// reverseTransport, when set, receives accepted connections so they can be
// reused for traffic back to the peer.
reverseTransport *QUICFabricTransport
// fabricFrameHandler is offered frames that none of the forward handlers
// below claimed.
fabricFrameHandler FabricFrameHandler
// The forward handlers serve data frames on their dedicated stream IDs; a
// nil handler causes an error response to be sent instead.
productionForwardHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
webIngressForwardHandler func(context.Context, []byte) ([]byte, error)
fabricControlHandler func(context.Context, []byte) ([]byte, error)
syntheticForwardHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
done chan struct{}
closeOnce sync.Once
}
@@ -23,8 +33,20 @@ type QUICFabricServerConfig struct {
TLSConfig *tls.Config
QUICConfig *quic.Config
Logger FabricSessionEventLogger
ReverseTransport *QUICFabricTransport
FabricFrameHandler FabricFrameHandler
ProductionForwardHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
WebIngressForwardHandler func(context.Context, []byte) ([]byte, error)
FabricControlHandler func(context.Context, []byte) ([]byte, error)
SyntheticForwardHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
}
// FabricFrameSender is the write side handed to a FabricFrameHandler so it can
// send frames back on the stream the inbound frame arrived on.
type FabricFrameSender interface {
// SendFrame writes a single frame, honouring ctx's deadline if it has one.
SendFrame(context.Context, fabricproto.Frame) error
}
// FabricFrameHandler is an optional hook for inbound frames. Returning true
// marks the frame as handled so the server skips its default session
// processing; returning a non-nil error makes the server close the connection.
type FabricFrameHandler func(context.Context, FabricFrameSender, fabricproto.Frame) (bool, error)
func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QUICFabricServer, error) {
if cfg.ListenAddr == "" {
return nil, fmt.Errorf("quic fabric listen addr is required")
@@ -44,6 +66,12 @@ func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QU
server := &QUICFabricServer{
listener: listener,
logger: cfg.Logger,
reverseTransport: cfg.ReverseTransport,
fabricFrameHandler: cfg.FabricFrameHandler,
productionForwardHandler: cfg.ProductionForwardHandler,
webIngressForwardHandler: cfg.WebIngressForwardHandler,
fabricControlHandler: cfg.FabricControlHandler,
syntheticForwardHandler: cfg.SyntheticForwardHandler,
done: make(chan struct{}),
}
go server.acceptLoop(ctx)
@@ -57,6 +85,15 @@ func (s *QUICFabricServer) Addr() net.Addr {
return s.listener.Addr()
}
// SetReverseTransport replaces the transport that accepted connections are
// registered with for reverse use. It is a no-op on a nil server and is safe
// to call while the server is handling streams.
func (s *QUICFabricServer) SetReverseTransport(transport *QUICFabricTransport) {
	if s != nil {
		s.reverseMu.Lock()
		defer s.reverseMu.Unlock()
		s.reverseTransport = transport
	}
}
func (s *QUICFabricServer) Close() error {
if s == nil {
return nil
@@ -95,6 +132,8 @@ func (s *QUICFabricServer) handleConn(ctx context.Context, conn *quic.Conn) {
func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
session := fabricproto.NewSession(fabricproto.SessionConfig{})
sender := quicStreamFrameSender{stream: stream}
defer func() { _ = stream.Close() }()
s.logFabricSession(FabricSessionEventLogEntry{
Event: "fabric_session_quic_stream_opened",
AcceptedBy: "quic",
@@ -116,6 +155,29 @@ func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, st
if err != nil {
return
}
s.registerReverseHelloFrame(conn, frame)
if s.handleProductionForwardFrame(ctx, stream, frame) {
continue
}
if s.handleWebIngressForwardFrame(ctx, stream, frame) {
continue
}
if s.handleFabricControlForwardFrame(ctx, stream, frame) {
continue
}
if s.handleSyntheticForwardFrame(ctx, conn, stream, frame) {
continue
}
if s.fabricFrameHandler != nil {
handled, err := s.fabricFrameHandler(ctx, sender, frame)
if err != nil {
_ = conn.CloseWithError(2, err.Error())
return
}
if handled {
continue
}
}
event, responses, err := session.HandleFrame(frame)
if err != nil {
_ = conn.CloseWithError(2, err.Error())
@@ -140,6 +202,196 @@ func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, st
}
}
// quicStreamFrameSender adapts a QUIC stream to the FabricFrameSender
// interface.
type quicStreamFrameSender struct {
stream *quic.Stream
// mu is locked around each write in SendFrame.
// NOTE(review): SendFrame has a value receiver and the sender is created
// and passed by value, so each call appears to lock its own copy of mu.
mu sync.Mutex
}
// SendFrame writes frame to the wrapped stream. The write deadline is ctx's
// deadline when it has one, otherwise 30 seconds from now. It returns an error
// when the sender has no stream or when the frame write fails.
//
// NOTE(review): the receiver is a value, so the mutex locked here is a copy
// made for this call and does not serialise concurrent senders. Switching to
// a pointer receiver also requires the construction site to pass a pointer.
func (s quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
	if s.stream == nil {
		return fmt.Errorf("quic fabric stream is closed")
	}
	s.mu.Lock()
	defer s.mu.Unlock()

	writeBy, bounded := ctx.Deadline()
	if !bounded {
		writeBy = time.Now().Add(30 * time.Second)
	}
	// Best effort: a failure to arm the deadline surfaces on the write itself.
	_ = s.stream.SetWriteDeadline(writeBy)
	return fabricproto.WriteFrame(s.stream, frame)
}
// registerReverseHelloFrame inspects an inbound frame and, when it is a ping
// whose payload starts with the reverse-hello prefix, registers conn with the
// reverse transport under the peer ID carried after the prefix.
//
// The peer ID is whitespace-trimmed; a blank ID is ignored so that the
// "registered" event is only logged when a registration was actually
// attempted, and the logged ID matches the key used by the transport.
//
// NOTE(review): the peer ID is taken from the frame payload as-is; nothing
// visible here ties it to the connection's TLS identity. Confirm that peers
// are authenticated elsewhere.
func (s *QUICFabricServer) registerReverseHelloFrame(conn *quic.Conn, frame fabricproto.Frame) {
	if s == nil || conn == nil || frame.Type != fabricproto.FramePing {
		return
	}
	payload := string(frame.Payload)
	if !strings.HasPrefix(payload, fabricQUICReverseHelloPrefix) {
		return
	}
	peerID := strings.TrimSpace(strings.TrimPrefix(payload, fabricQUICReverseHelloPrefix))
	if peerID == "" {
		return
	}
	reverseTransport := s.getReverseTransport()
	if reverseTransport == nil {
		return
	}
	reverseTransport.RegisterReverseConn(peerID, conn)
	s.logFabricSession(FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_registered",
		AcceptedBy: "quic_reverse_hello",
		RemoteAddr: conn.RemoteAddr().String(),
		PeerID:     peerID,
	})
}
// quicProductionForwardResponse is the JSON body sent back for a production
// forward frame: either the handler's result or an error message.
type quicProductionForwardResponse struct {
Result ProductionForwardResult `json:"result,omitempty"`
Error string `json:"error,omitempty"`
}
// quicSyntheticForwardResponse is the JSON body sent back for a synthetic
// forward frame: either the handler's acknowledgement envelope or an error
// message.
type quicSyntheticForwardResponse struct {
Envelope SyntheticEnvelope `json:"envelope,omitempty"`
Error string `json:"error,omitempty"`
}
// quicWebIngressForwardResponse is the JSON body sent back for a web-ingress
// forward frame. Payload carries the handler's bytes verbatim as raw JSON.
type quicWebIngressForwardResponse struct {
Payload json.RawMessage `json:"payload,omitempty"`
Error string `json:"error,omitempty"`
}
// quicFabricControlForwardResponse is the JSON body sent back for a fabric
// control forward frame. Payload carries the handler's bytes verbatim as raw
// JSON.
type quicFabricControlForwardResponse struct {
Payload json.RawMessage `json:"payload,omitempty"`
Error string `json:"error,omitempty"`
}
// handleProductionForwardFrame serves a production forward request carried in
// a data frame on ProductionForwardQUICStreamID. It reports false when the
// frame is not such a request, and true once it has been consumed.
//
// The payload is decoded as a ProductionEnvelope and passed to the configured
// handler; the result (or an error string) is JSON-encoded and written back on
// the same stream ID and sequence. Write failures are ignored.
func (s *QUICFabricServer) handleProductionForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != ProductionForwardQUICStreamID {
		return false
	}

	var reply quicProductionForwardResponse
	var envelope ProductionEnvelope
	if s == nil || s.productionForwardHandler == nil {
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	} else if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
		reply.Error = "invalid production mesh envelope"
	} else if result, err := s.productionForwardHandler(ctx, envelope); err != nil {
		reply.Error = err.Error()
	} else {
		reply.Result = result
	}

	if encoded, err := json.Marshal(reply); err == nil {
		_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: fabricproto.TrafficClassReliable,
			StreamID:     ProductionForwardQUICStreamID,
			Sequence:     frame.Sequence,
			Payload:      encoded,
		})
	}
	return true
}
// handleWebIngressForwardFrame serves a web-ingress forward request carried in
// a data frame on WebIngressForwardQUICStreamID. It reports false when the
// frame is not such a request, and true once it has been consumed.
//
// The handler receives a private copy of the frame payload; its output (or an
// error string) is JSON-encoded and written back on the same stream ID and
// sequence. Write failures are ignored.
func (s *QUICFabricServer) handleWebIngressForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID {
		return false
	}

	var reply quicWebIngressForwardResponse
	if s == nil || s.webIngressForwardHandler == nil {
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	} else {
		request := append([]byte(nil), frame.Payload...)
		body, err := s.webIngressForwardHandler(ctx, request)
		if err != nil {
			reply.Error = err.Error()
		} else {
			reply.Payload = append(json.RawMessage(nil), body...)
		}
	}

	if encoded, err := json.Marshal(reply); err == nil {
		_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: fabricproto.TrafficClassReliable,
			StreamID:     WebIngressForwardQUICStreamID,
			Sequence:     frame.Sequence,
			Payload:      encoded,
		})
	}
	return true
}
// handleFabricControlForwardFrame serves a fabric control request carried in a
// data frame on FabricControlForwardQUICStreamID. It reports false when the
// frame is not such a request, and true once it has been consumed.
//
// The handler receives a private copy of the frame payload; its output (or an
// error string) is JSON-encoded and written back on the same stream ID and
// sequence. Write failures are ignored.
func (s *QUICFabricServer) handleFabricControlForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID {
		return false
	}

	var reply quicFabricControlForwardResponse
	if s == nil || s.fabricControlHandler == nil {
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	} else {
		request := append([]byte(nil), frame.Payload...)
		body, err := s.fabricControlHandler(ctx, request)
		if err != nil {
			reply.Error = err.Error()
		} else {
			reply.Payload = append(json.RawMessage(nil), body...)
		}
	}

	if encoded, err := json.Marshal(reply); err == nil {
		_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: fabricproto.TrafficClassReliable,
			StreamID:     FabricControlForwardQUICStreamID,
			Sequence:     frame.Sequence,
			Payload:      encoded,
		})
	}
	return true
}
// handleSyntheticForwardFrame serves a synthetic forward request carried in a
// data frame on SyntheticForwardQUICStreamID. It reports false when the frame
// is not such a request, and true once it has been consumed.
//
// The payload is decoded as a SyntheticEnvelope and passed to the configured
// handler. On success the connection is also registered for reverse use under
// the sender's node ID before the acknowledgement is returned. The response is
// JSON-encoded and written back on the same stream ID and sequence; write
// failures are ignored.
func (s *QUICFabricServer) handleSyntheticForwardFrame(ctx context.Context, conn *quic.Conn, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != SyntheticForwardQUICStreamID {
		return false
	}

	var reply quicSyntheticForwardResponse
	var envelope SyntheticEnvelope
	if s == nil || s.syntheticForwardHandler == nil {
		reply.Error = ErrMeshRuntimeDisabled.Error()
	} else if err := json.Unmarshal(frame.Payload, &envelope); err != nil {
		reply.Error = "invalid synthetic mesh envelope"
	} else if ack, err := s.syntheticForwardHandler(ctx, envelope); err != nil {
		reply.Error = err.Error()
	} else {
		s.registerReversePeerConn(envelope.From.NodeID, conn)
		reply.Envelope = ack
	}

	if encoded, err := json.Marshal(reply); err == nil {
		_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
			Type:         fabricproto.FrameData,
			TrafficClass: fabricproto.TrafficClassReliable,
			StreamID:     SyntheticForwardQUICStreamID,
			Sequence:     frame.Sequence,
			Payload:      encoded,
		})
	}
	return true
}
// registerReversePeerConn registers conn with the current reverse transport
// under peerID. It does nothing when the server, the reverse transport, or the
// connection is nil.
func (s *QUICFabricServer) registerReversePeerConn(peerID string, conn *quic.Conn) {
	registry := s.getReverseTransport()
	if s != nil && registry != nil && conn != nil {
		registry.RegisterReverseConn(peerID, conn)
	}
}
// getReverseTransport returns the currently configured reverse transport under
// a read lock, or nil when the server itself is nil.
func (s *QUICFabricServer) getReverseTransport() *QUICFabricTransport {
	if s == nil {
		return nil
	}
	s.reverseMu.RLock()
	current := s.reverseTransport
	s.reverseMu.RUnlock()
	return current
}
func (s *QUICFabricServer) logFabricSession(entry FabricSessionEventLogEntry) {
if s != nil && s.logger != nil {
s.logger(entry)
@@ -6,7 +6,9 @@ import (
"crypto/tls"
"crypto/x509"
"encoding/hex"
"encoding/json"
"fmt"
"net"
"sort"
"strings"
"sync"
@@ -17,6 +19,7 @@ import (
)
// fabricQUICNextProto names the fabric data-session protocol
// (presumably the TLS ALPN value; confirm where the TLS config is built).
const fabricQUICNextProto = "rap-fabric-data-session-v1"
// fabricQUICReverseHelloPrefix prefixes the payload of the ping frame a dialer
// sends to announce its peer ID; the peer ID follows the prefix.
const fabricQUICReverseHelloPrefix = "rap-fabric-reverse-hello-v1:"
// defaultQUICFabricConnIdleTTL is the IdleTTL given to new transports.
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
// defaultQUICFabricMaxStreamsPerConn is the MaxStreamsPerConn given to new
// transports.
const defaultQUICFabricMaxStreamsPerConn = 64
// ErrQUICFabricStreamLimitReached is the sentinel error for an exhausted
// per-connection stream budget.
const ErrQUICFabricStreamLimitReached = quicFabricError("quic fabric stream limit reached")
@@ -29,16 +32,28 @@ func (e quicFabricError) Error() string {
// QUICFabricTransport dials and caches QUIC connections to fabric peers, and
// can reuse connections registered by peers ("reverse" connections) instead of
// dialing.
type QUICFabricTransport struct {
// Config is the QUIC configuration passed to every dial.
Config *quic.Config
// LocalPeerID is announced to peers in the reverse-hello ping; it is read
// and written under mu by the accessor methods.
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
// DialAddr, when non-nil, replaces quic.DialAddr for outbound dials.
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
// mu guards the connection tables, handlers, logger, and stats below.
mu sync.Mutex
// conns caches dialed connections by connection key.
conns map[string]*quicFabricConnEntry
// reverseConns holds registered reverse connections keyed by peer ID.
reverseConns map[string]*quicFabricConnEntry
// The inbound handlers serve forward requests that arrive on streams the
// remote side opens over a connection this transport dialed.
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
inboundFabricControlHandler func(context.Context, []byte) ([]byte, error)
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
}
type QUICFabricTransportStats struct {
Opens uint64 `json:"opens"`
Reuses uint64 `json:"reuses"`
ReverseHelloSent uint64 `json:"reverse_hello_sent"`
ReverseHelloFailed uint64 `json:"reverse_hello_failed"`
ReverseRegisters uint64 `json:"reverse_registers"`
ReverseReuses uint64 `json:"reverse_reuses"`
OpenFailures uint64 `json:"open_failures"`
ClosedEvicted uint64 `json:"closed_evicted"`
CloseAllCalls uint64 `json:"close_all_calls"`
@@ -50,6 +65,7 @@ type QUICFabricTransportStats struct {
type QUICFabricTransportSnapshot struct {
SchemaVersion string `json:"schema_version"`
LocalPeerID string `json:"local_peer_id,omitempty"`
ActiveCount int `json:"active_count"`
ActiveStreams int `json:"active_streams"`
MaxStreamsPerConn int `json:"max_streams_per_conn"`
@@ -63,6 +79,7 @@ type QUICFabricConnSnapshot struct {
PeerID string `json:"peer_id,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
CertSHA256 string `json:"cert_sha256,omitempty"`
Direction string `json:"direction,omitempty"`
ActiveStreams int `json:"active_streams"`
MaxStreams int `json:"max_streams"`
CapacityPressurePercent int `json:"capacity_pressure_percent"`
@@ -92,7 +109,41 @@ type quicFabricConnEntry struct {
}
// NewQUICFabricTransport returns a transport that dials with config and uses
// the default idle TTL and per-connection stream limit. Both the dialed and
// the reverse connection tables are initialised empty.
//
// A stale earlier return statement that omitted reverseConns (and made this
// one unreachable) has been dropped.
func NewQUICFabricTransport(config *quic.Config) *QUICFabricTransport {
	return &QUICFabricTransport{
		Config:            config,
		IdleTTL:           defaultQUICFabricConnIdleTTL,
		MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn,
		conns:             map[string]*quicFabricConnEntry{},
		reverseConns:      map[string]*quicFabricConnEntry{},
	}
}
// SetInboundHandlers installs the production and synthetic inbound handlers
// and the session logger. It delegates to SetInboundHandlersWithWebIngress
// with a nil web-ingress handler, so any previously installed web-ingress
// handler is cleared.
func (t *QUICFabricTransport) SetInboundHandlers(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
t.SetInboundHandlersWithWebIngress(production, nil, synthetic, logger)
}
// SetInboundHandlersWithWebIngress replaces the production, web-ingress and
// synthetic inbound handlers together with the session logger in a single
// locked update. The fabric control handler is left untouched. It is a no-op
// on a nil transport.
func (t *QUICFabricTransport) SetInboundHandlersWithWebIngress(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), webIngress func(context.Context, []byte) ([]byte, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
	if t != nil {
		t.mu.Lock()
		defer t.mu.Unlock()
		t.inboundProductionHandler, t.inboundWebIngressHandler = production, webIngress
		t.inboundSyntheticHandler, t.logger = synthetic, logger
	}
}
// SetInboundFabricControlHandler replaces the handler used for inbound fabric
// control requests. It is a no-op on a nil transport.
func (t *QUICFabricTransport) SetInboundFabricControlHandler(handler func(context.Context, []byte) ([]byte, error)) {
	if t != nil {
		t.mu.Lock()
		defer t.mu.Unlock()
		t.inboundFabricControlHandler = handler
	}
}
// SetLocalPeerID stores the whitespace-trimmed peer ID that this transport
// announces in reverse-hello pings. It is a no-op on a nil transport.
func (t *QUICFabricTransport) SetLocalPeerID(peerID string) {
	if t != nil {
		trimmed := strings.TrimSpace(peerID)
		t.mu.Lock()
		defer t.mu.Unlock()
		t.LocalPeerID = trimmed
	}
}
func quicTLSConfigForTarget(target FabricTransportTarget) *tls.Config {
@@ -186,9 +237,12 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, nil)
return conn, "", true, err
}
if conn, key, ok := t.reverseConnForTarget(target); ok {
return conn, key, false, nil
}
key := quicFabricConnKey(target)
if key == "" {
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, t.Config)
conn, err := t.dialAddr(ctx, target.Endpoint, tlsConfig)
return conn, "", true, err
}
t.mu.Lock()
@@ -207,7 +261,7 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
}
t.mu.Unlock()
conn, err := quic.DialAddr(ctx, target.Endpoint, tlsConfig, t.Config)
conn, err := t.dialAddr(ctx, target.Endpoint, tlsConfig)
if err != nil {
t.mu.Lock()
t.stats.OpenFailures++
@@ -235,16 +289,339 @@ func (t *QUICFabricTransport) connectConn(ctx context.Context, target FabricTran
t.conns[key] = &quicFabricConnEntry{conn: conn, lastUsed: time.Now()}
t.stats.Opens++
t.mu.Unlock()
go t.acceptInboundStreams(context.Background(), conn)
go t.sendReverseHello(context.Background(), conn)
return conn, key, false, nil
}
// dialAddr opens a QUIC connection to endpoint. The transport's DialAddr
// override is used when set; otherwise quic.DialAddr is called with the
// transport's QUIC config.
//
// A nil receiver dials with quic.DialAddr and a nil config instead of
// dereferencing t, so the nil guard covers both branches.
func (t *QUICFabricTransport) dialAddr(ctx context.Context, endpoint string, tlsConfig *tls.Config) (*quic.Conn, error) {
	if t == nil {
		return quic.DialAddr(ctx, endpoint, tlsConfig, nil)
	}
	if t.DialAddr != nil {
		return t.DialAddr(ctx, endpoint, tlsConfig, t.Config)
	}
	return quic.DialAddr(ctx, endpoint, tlsConfig, t.Config)
}
// DialQUICAddrWithPacketConn dials endpoint over the caller-supplied
// packetConn. endpoint may carry a "quic://" prefix, which is stripped before
// UDP address resolution.
//
// The function takes ownership of packetConn: it is closed when address
// resolution fails, when the dial fails, and after the returned connection's
// context is done. quic.Transport.Close does not close a socket it did not
// create itself, so the socket is closed explicitly on every one of those
// paths rather than only on resolution failure.
//
// It returns an error when packetConn is nil, when the endpoint cannot be
// resolved, or when the QUIC dial fails.
func DialQUICAddrWithPacketConn(ctx context.Context, endpoint string, packetConn net.PacketConn, tlsConfig *tls.Config, config *quic.Config) (*quic.Conn, error) {
	if packetConn == nil {
		return nil, fmt.Errorf("quic packet connection is required")
	}
	addr, err := net.ResolveUDPAddr("udp", strings.TrimPrefix(strings.TrimSpace(endpoint), "quic://"))
	if err != nil {
		_ = packetConn.Close()
		return nil, err
	}
	transport := &quic.Transport{Conn: packetConn}
	// release tears down the transport and then the socket it was built on.
	// Close errors are ignored: this is best-effort cleanup.
	release := func() {
		_ = transport.Close()
		_ = packetConn.Close()
	}
	conn, err := transport.Dial(ctx, addr, tlsConfig, config)
	if err != nil {
		release()
		return nil, err
	}
	// The goroutine lives exactly as long as the connection.
	go func() {
		<-conn.Context().Done()
		release()
	}()
	return conn, nil
}
// sendReverseHello announces this transport's peer ID to the remote side of
// conn by opening a stream and sending a ping frame whose payload is the
// reverse-hello prefix followed by the local peer ID. It then waits for one
// reply frame, which is discarded.
//
// The whole exchange (stream open, write, and reply read) is bounded by a
// 3-second budget so this goroutine cannot outlive an unresponsive peer.
// ReverseHelloFailed is incremented when there is no local peer ID, the stream
// cannot be opened, or the write fails; ReverseHelloSent is incremented once
// the ping has been written.
func (t *QUICFabricTransport) sendReverseHello(ctx context.Context, conn *quic.Conn) {
	if t == nil || conn == nil {
		return
	}
	recordFailure := func() {
		t.mu.Lock()
		t.stats.ReverseHelloFailed++
		t.mu.Unlock()
	}

	localPeerID := t.localPeerID()
	if localPeerID == "" {
		recordFailure()
		return
	}

	helloCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
	defer cancel()
	stream, err := conn.OpenStreamSync(helloCtx)
	if err != nil {
		recordFailure()
		return
	}
	defer func() { _ = stream.Close() }()

	// Apply the hello budget to the stream I/O as well; the context alone only
	// limits OpenStreamSync.
	if deadline, ok := helloCtx.Deadline(); ok {
		_ = stream.SetDeadline(deadline)
	}

	hello := fabricproto.Frame{
		Type:     fabricproto.FramePing,
		Sequence: 1,
		Payload:  []byte(fabricQUICReverseHelloPrefix + localPeerID),
	}
	if err := fabricproto.WriteFrame(stream, hello); err != nil {
		recordFailure()
		return
	}
	t.mu.Lock()
	t.stats.ReverseHelloSent++
	t.mu.Unlock()

	// Best effort: drain the peer's reply so the stream closes cleanly. The
	// reply content and any read error are intentionally ignored.
	_, _ = fabricproto.ReadFrame(stream, fabricproto.DefaultMaxPayload)
}
// acceptInboundStreams accepts streams opened by the remote side of conn and
// serves each one on its own goroutine. It returns when AcceptStream fails,
// and immediately when the transport or connection is nil.
func (t *QUICFabricTransport) acceptInboundStreams(ctx context.Context, conn *quic.Conn) {
	if t == nil || conn == nil {
		return
	}
	for {
		inbound, err := conn.AcceptStream(ctx)
		if err != nil {
			break
		}
		go t.handleInboundStream(ctx, conn, inbound)
	}
}
// handleInboundStream serves one stream that the remote side opened on a
// connection this transport dialed. It logs stream open and close events, then
// reads frames until the context is cancelled, a read fails, the session
// rejects a frame, or a response cannot be written.
//
// Each frame is first checked for a reverse-hello, then offered in order to
// the production, web-ingress, fabric-control and synthetic forward handlers.
// Frames none of them claim are processed by a per-stream fabricproto session,
// whose responses are written back to the stream.
func (t *QUICFabricTransport) handleInboundStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
	session := fabricproto.NewSession(fabricproto.SessionConfig{})
	defer func() { _ = stream.Close() }()

	openedFrom := conn.RemoteAddr().String()
	t.logFabricSession(FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_stream_opened",
		AcceptedBy: "quic_reverse",
		RemoteAddr: openedFrom,
	})
	defer t.logFabricSession(FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_stream_closed",
		AcceptedBy: "quic_reverse",
		RemoteAddr: openedFrom,
	})

	for {
		if ctx.Err() != nil {
			_ = stream.Close()
			return
		}
		frame, err := fabricproto.ReadFrame(stream, fabricproto.DefaultMaxPayload)
		if err != nil {
			return
		}
		t.registerReverseHelloFrame(conn, frame)

		// Short-circuit evaluation preserves the handler order.
		claimed := t.handleInboundProductionForwardFrame(ctx, stream, frame) ||
			t.handleInboundWebIngressForwardFrame(ctx, stream, frame) ||
			t.handleInboundFabricControlForwardFrame(ctx, stream, frame) ||
			t.handleInboundSyntheticForwardFrame(ctx, stream, frame)
		if claimed {
			continue
		}

		event, responses, err := session.HandleFrame(frame)
		if err != nil {
			_ = stream.Close()
			return
		}
		if event.Type != fabricproto.SessionEventNone {
			t.logFabricSession(FabricSessionEventLogEntry{
				Event:        "fabric_session_reverse_event",
				SessionEvent: event.Type,
				StreamID:     event.StreamID,
				Sequence:     event.Sequence,
				TrafficClass: event.TrafficClass,
				AcceptedBy:   "quic_reverse",
				RemoteAddr:   conn.RemoteAddr().String(),
			})
		}
		for i := range responses {
			if err := fabricproto.WriteFrame(stream, responses[i]); err != nil {
				return
			}
		}
	}
}
// registerReverseHelloFrame inspects an inbound frame and, when it is a ping
// whose payload starts with the reverse-hello prefix, registers conn as the
// reverse connection for the peer ID carried after the prefix.
//
// The peer ID is whitespace-trimmed; a blank ID is ignored so that the
// "registered" event is only logged when a registration actually took place,
// and the logged ID matches the key stored by RegisterReverseConn.
//
// NOTE(review): the peer ID is taken from the frame payload as-is; nothing
// visible here ties it to the connection's TLS identity. Confirm that peers
// are authenticated elsewhere.
func (t *QUICFabricTransport) registerReverseHelloFrame(conn *quic.Conn, frame fabricproto.Frame) {
	if t == nil || conn == nil || frame.Type != fabricproto.FramePing {
		return
	}
	payload := string(frame.Payload)
	if !strings.HasPrefix(payload, fabricQUICReverseHelloPrefix) {
		return
	}
	peerID := strings.TrimSpace(strings.TrimPrefix(payload, fabricQUICReverseHelloPrefix))
	if peerID == "" {
		return
	}
	t.RegisterReverseConn(peerID, conn)
	t.logFabricSession(FabricSessionEventLogEntry{
		Event:      "fabric_session_quic_reverse_registered",
		AcceptedBy: "quic_reverse_hello",
		RemoteAddr: conn.RemoteAddr().String(),
		PeerID:     peerID,
	})
}
// handleInboundProductionForwardFrame serves a production forward request that
// arrived as a data frame on ProductionForwardQUICStreamID. It reports false
// when the frame is not such a request, and true once it has been consumed.
//
// The payload is decoded as a ProductionEnvelope and passed to the installed
// production handler; the result (or an error string) is JSON-encoded and
// written back on the same stream ID and sequence. Write failures are ignored.
func (t *QUICFabricTransport) handleInboundProductionForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != ProductionForwardQUICStreamID {
		return false
	}

	var reply quicProductionForwardResponse
	handle, _, _, _, _ := t.inboundHandlers()
	var envelope ProductionEnvelope
	switch {
	case handle == nil:
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	case json.Unmarshal(frame.Payload, &envelope) != nil:
		reply.Error = "invalid production mesh envelope"
	default:
		result, err := handle(ctx, envelope)
		if err != nil {
			reply.Error = err.Error()
		} else {
			reply.Result = result
		}
	}

	encoded, err := json.Marshal(reply)
	if err != nil {
		return true
	}
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     ProductionForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	})
	return true
}
// handleInboundWebIngressForwardFrame serves a web-ingress forward request
// that arrived as a data frame on WebIngressForwardQUICStreamID. It reports
// false when the frame is not such a request, and true once it has been
// consumed.
//
// The installed handler receives a private copy of the payload; its output (or
// an error string) is JSON-encoded and written back on the same stream ID and
// sequence. Write failures are ignored.
func (t *QUICFabricTransport) handleInboundWebIngressForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID {
		return false
	}

	var reply quicWebIngressForwardResponse
	_, handle, _, _, _ := t.inboundHandlers()
	if handle == nil {
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	} else {
		request := append([]byte(nil), frame.Payload...)
		body, err := handle(ctx, request)
		if err != nil {
			reply.Error = err.Error()
		} else {
			reply.Payload = append(json.RawMessage(nil), body...)
		}
	}

	encoded, err := json.Marshal(reply)
	if err != nil {
		return true
	}
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     WebIngressForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	})
	return true
}
// handleInboundFabricControlForwardFrame serves a fabric control request that
// arrived as a data frame on FabricControlForwardQUICStreamID. It reports
// false when the frame is not such a request, and true once it has been
// consumed.
//
// The installed handler receives a private copy of the payload; its output (or
// an error string) is JSON-encoded and written back on the same stream ID and
// sequence. Write failures are ignored.
func (t *QUICFabricTransport) handleInboundFabricControlForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID {
		return false
	}

	var reply quicFabricControlForwardResponse
	_, _, handle, _, _ := t.inboundHandlers()
	if handle == nil {
		reply.Error = ErrForwardRuntimeUnavailable.Error()
	} else {
		request := append([]byte(nil), frame.Payload...)
		body, err := handle(ctx, request)
		if err != nil {
			reply.Error = err.Error()
		} else {
			reply.Payload = append(json.RawMessage(nil), body...)
		}
	}

	encoded, err := json.Marshal(reply)
	if err != nil {
		return true
	}
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     FabricControlForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	})
	return true
}
// handleInboundSyntheticForwardFrame serves a synthetic forward request that
// arrived as a data frame on SyntheticForwardQUICStreamID. It reports false
// when the frame is not such a request, and true once it has been consumed.
//
// The payload is decoded as a SyntheticEnvelope and passed to the installed
// synthetic handler; the acknowledgement (or an error string) is JSON-encoded
// and written back on the same stream ID and sequence. Write failures are
// ignored.
func (t *QUICFabricTransport) handleInboundSyntheticForwardFrame(ctx context.Context, stream *quic.Stream, frame fabricproto.Frame) bool {
	if frame.Type != fabricproto.FrameData || frame.StreamID != SyntheticForwardQUICStreamID {
		return false
	}

	var reply quicSyntheticForwardResponse
	_, _, _, handle, _ := t.inboundHandlers()
	var envelope SyntheticEnvelope
	switch {
	case handle == nil:
		reply.Error = ErrMeshRuntimeDisabled.Error()
	case json.Unmarshal(frame.Payload, &envelope) != nil:
		reply.Error = "invalid synthetic mesh envelope"
	default:
		ack, err := handle(ctx, envelope)
		if err != nil {
			reply.Error = err.Error()
		} else {
			reply.Envelope = ack
		}
	}

	encoded, err := json.Marshal(reply)
	if err != nil {
		return true
	}
	_ = fabricproto.WriteFrame(stream, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     SyntheticForwardQUICStreamID,
		Sequence:     frame.Sequence,
		Payload:      encoded,
	})
	return true
}
// inboundHandlers returns, under the transport lock, the currently installed
// production, web-ingress, fabric-control and synthetic handlers followed by
// the session logger. All five are nil for a nil transport.
func (t *QUICFabricTransport) inboundHandlers() (func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), func(context.Context, []byte) ([]byte, error), func(context.Context, []byte) ([]byte, error), func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), FabricSessionEventLogger) {
	if t == nil {
		return nil, nil, nil, nil, nil
	}
	t.mu.Lock()
	production, webIngress := t.inboundProductionHandler, t.inboundWebIngressHandler
	fabricControl, synthetic := t.inboundFabricControlHandler, t.inboundSyntheticHandler
	logger := t.logger
	t.mu.Unlock()
	return production, webIngress, fabricControl, synthetic, logger
}
// localPeerID returns the whitespace-trimmed LocalPeerID, read under the
// transport lock, or "" for a nil transport.
func (t *QUICFabricTransport) localPeerID() string {
	if t == nil {
		return ""
	}
	t.mu.Lock()
	id := strings.TrimSpace(t.LocalPeerID)
	t.mu.Unlock()
	return id
}
// logFabricSession forwards entry to the installed session logger, if any.
func (t *QUICFabricTransport) logFabricSession(entry FabricSessionEventLogEntry) {
	_, _, _, _, emit := t.inboundHandlers()
	if emit == nil {
		return
	}
	emit(entry)
}
// RegisterReverseConn records conn as the reverse connection for peerID so
// that later reverse/relay targets for that peer can reuse it.
//
// The call is a no-op when t or conn is nil or when peerID is blank after
// trimming. If a different, still-open connection is already registered for
// the peer it is closed with the reason "reverse connection replaced". The
// stored entry is always replaced with a fresh one (lastUsed set to now) and
// the ReverseRegisters counter is incremented.
func (t *QUICFabricTransport) RegisterReverseConn(peerID string, conn *quic.Conn) {
	if t == nil || conn == nil {
		return
	}
	id := strings.TrimSpace(peerID)
	if id == "" {
		return
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	if t.reverseConns == nil {
		t.reverseConns = map[string]*quicFabricConnEntry{}
	}
	previous := t.reverseConns[id]
	replacesOther := previous != nil && previous.conn != nil && previous.conn != conn
	// A nil context error means the previous connection has not been closed
	// yet, so it still has to be torn down here.
	if replacesOther && previous.conn.Context().Err() == nil {
		_ = previous.conn.CloseWithError(0, "reverse connection replaced")
	}
	t.reverseConns[id] = &quicFabricConnEntry{conn: conn, lastUsed: time.Now()}
	t.stats.ReverseRegisters++
}
// reverseConnForTarget looks up a registered reverse connection usable for
// target.
//
// It reports false when t is nil, the target has no peer ID, the target's
// transport does not prefer a reverse connection, or no live connection is
// registered for the peer. Idle entries are pruned first. A registered
// connection whose context is already done is evicted (counted in
// ClosedEvicted). On success the entry's lastUsed is refreshed, ReverseReuses
// is incremented, and the connection is returned together with its
// reverse-connection cache key.
func (t *QUICFabricTransport) reverseConnForTarget(target FabricTransportTarget) (*quic.Conn, string, bool) {
	peerID := strings.TrimSpace(target.PeerID)
	if t == nil || peerID == "" || !fabricTransportPrefersReverseConn(target.Transport) {
		return nil, "", false
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	t.pruneIdleLocked(time.Now())
	cached := t.reverseConns[peerID]
	if cached == nil || cached.conn == nil {
		return nil, "", false
	}
	if cached.conn.Context().Err() != nil {
		// The peer's connection has already been closed; drop the stale entry.
		delete(t.reverseConns, peerID)
		t.stats.ClosedEvicted++
		return nil, "", false
	}
	cached.lastUsed = time.Now()
	t.stats.ReverseReuses++
	return cached.conn, quicFabricReverseConnKey(peerID), true
}
func (t *QUICFabricTransport) reserveStream(key string, conn *quic.Conn) error {
if t == nil || key == "" {
return nil
}
t.mu.Lock()
defer t.mu.Unlock()
entry := t.conns[key]
entry := t.connEntryLocked(key)
if entry == nil || entry.conn != conn {
return fmt.Errorf("quic fabric connection is not cached")
}
@@ -267,16 +644,26 @@ func (t *QUICFabricTransport) releaseStream(key string) {
return
}
t.mu.Lock()
if entry := t.conns[key]; entry != nil {
if entry := t.connEntryLocked(key); entry != nil {
if entry.activeStreams > 0 {
entry.activeStreams--
}
entry.lastUsed = time.Now()
t.stats.StreamCloses++
}
t.stats.StreamCloses++
t.mu.Unlock()
}
// connEntryLocked resolves a connection cache key to its entry. Keys that
// start with "reverse\x00" (the form produced by quicFabricReverseConnKey)
// are looked up by peer ID in reverseConns; every other key is looked up in
// conns. It returns nil for a nil transport, an empty key, or an unknown key.
// As the name indicates, the caller is expected to hold t.mu.
func (t *QUICFabricTransport) connEntryLocked(key string) *quicFabricConnEntry {
	if t == nil || key == "" {
		return nil
	}
	if peerID, isReverse := strings.CutPrefix(key, "reverse\x00"); isReverse {
		return t.reverseConns[peerID]
	}
	return t.conns[key]
}
func (t *QUICFabricTransport) evictConn(target FabricTransportTarget, conn *quic.Conn) {
if t == nil || conn == nil {
return
@@ -315,6 +702,20 @@ func (t *QUICFabricTransport) pruneIdleLocked(now time.Time) {
t.stats.IdleEvicted++
}
}
for peerID, entry := range t.reverseConns {
if entry == nil || entry.conn == nil {
delete(t.reverseConns, peerID)
continue
}
if !entry.lastUsed.IsZero() && now.Sub(entry.lastUsed) > ttl {
if entry.activeStreams > 0 {
continue
}
_ = entry.conn.CloseWithError(0, "idle reverse")
delete(t.reverseConns, peerID)
t.stats.IdleEvicted++
}
}
}
func quicFabricConnKey(target FabricTransportTarget) string {
@@ -340,6 +741,23 @@ func parseQUICFabricConnKey(key string) (peerID string, endpoint string, certSHA
return peerID, endpoint, certSHA256
}
// quicFabricReverseConnKey builds the cache key that identifies the reverse
// connection of a peer: the literal prefix "reverse\x00" followed by the
// trimmed peer ID. A blank peer ID yields the empty key.
func quicFabricReverseConnKey(peerID string) string {
	if trimmed := strings.TrimSpace(peerID); trimmed != "" {
		return "reverse\x00" + trimmed
	}
	return ""
}
// fabricTransportPrefersReverseConn reports whether a target with the given
// transport name should be served over an already registered reverse
// connection. Only "reverse_quic" and "relay_quic" qualify; the comparison
// ignores case and surrounding whitespace.
func fabricTransportPrefersReverseConn(transport string) bool {
	normalized := strings.ToLower(strings.TrimSpace(transport))
	return normalized == "reverse_quic" || normalized == "relay_quic"
}
func (t *QUICFabricTransport) Close() error {
if t == nil {
return nil
@@ -348,12 +766,19 @@ func (t *QUICFabricTransport) Close() error {
t.stats.CloseAllCalls++
conns := t.conns
t.conns = map[string]*quicFabricConnEntry{}
reverseConns := t.reverseConns
t.reverseConns = map[string]*quicFabricConnEntry{}
t.mu.Unlock()
for _, entry := range conns {
if entry != nil && entry.conn != nil {
_ = entry.conn.CloseWithError(0, "closed")
}
}
for _, entry := range reverseConns {
if entry != nil && entry.conn != nil {
_ = entry.conn.CloseWithError(0, "closed")
}
}
return nil
}
@@ -370,6 +795,7 @@ func (t *QUICFabricTransport) Snapshot() QUICFabricTransportSnapshot {
}
snapshot := QUICFabricTransportSnapshot{
SchemaVersion: "rap.quic_fabric_transport.v1",
LocalPeerID: strings.TrimSpace(t.LocalPeerID),
MaxStreamsPerConn: limit,
Stats: t.stats,
}
@@ -391,6 +817,40 @@ func (t *QUICFabricTransport) Snapshot() QUICFabricTransportSnapshot {
PeerID: peerID,
Endpoint: endpoint,
CertSHA256: certSHA256,
Direction: "outbound",
ActiveStreams: entry.activeStreams,
MaxStreams: limit,
Saturated: entry.activeStreams >= limit,
}
if !entry.lastUsed.IsZero() {
connSnapshot.LastUsedUnixSec = entry.lastUsed.UTC().Unix()
}
if limit > 0 {
connSnapshot.CapacityPressurePercent = (entry.activeStreams * 100) / limit
}
snapshot.Connections = append(snapshot.Connections, connSnapshot)
if entry.activeStreams >= limit {
snapshot.SaturatedConnections++
}
}
}
for peerID, entry := range t.reverseConns {
if entry == nil || entry.conn == nil {
delete(t.reverseConns, peerID)
continue
}
select {
case <-entry.conn.Context().Done():
delete(t.reverseConns, peerID)
t.stats.ClosedEvicted++
snapshot.Stats.ClosedEvicted++
default:
snapshot.ActiveCount++
snapshot.ActiveStreams += entry.activeStreams
connSnapshot := QUICFabricConnSnapshot{
PeerID: peerID,
Endpoint: entry.conn.RemoteAddr().String(),
Direction: "reverse",
ActiveStreams: entry.activeStreams,
MaxStreams: limit,
Saturated: entry.activeStreams >= limit,
@@ -462,6 +922,7 @@ func (s *quicFabricSession) Close() error {
s.closeOnce.Do(func() {
close(s.done)
if s.stream != nil {
s.stream.CancelRead(0)
err = s.stream.Close()
}
if s.transport != nil {
@@ -9,6 +9,7 @@ import (
"crypto/x509"
"crypto/x509/pkix"
"encoding/hex"
"encoding/json"
"encoding/pem"
"math/big"
"strings"
@@ -341,6 +342,119 @@ func TestQUICFabricTransportLimitsStreamsPerConnection(t *testing.T) {
defer second.Close()
}
// TestQUICFabricTransportReusesInboundConnectionForReverseStream checks that a
// connection dialed by a client ("node-a") towards a server can afterwards be
// reused by the server-side reverse transport to open a stream back to that
// client, and that a production-forward request sent over that reverse stream
// is answered by the client's inbound production handler.
func TestQUICFabricTransportReusesInboundConnectionForReverseStream(t *testing.T) {
// Server side: the transport handed to the server as ReverseTransport is the
// one whose ReverseRegisters/ReverseReuses counters are asserted below.
reverseTransport := NewQUICFabricTransport(nil)
defer reverseTransport.Close()
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: testQUICTLSConfig(t),
ReverseTransport: reverseTransport,
SyntheticForwardHandler: func(_ context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
envelope.To, envelope.From = envelope.From, PeerIdentity{ClusterID: envelope.ClusterID, NodeID: "node-r"}
return envelope, nil
},
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
// Client side: node-a announces its peer ID and installs only a production
// handler; the two remaining handler arguments are left nil.
clientTransport := NewQUICFabricTransport(nil)
defer clientTransport.Close()
clientTransport.SetLocalPeerID("node-a")
clientTransport.SetInboundHandlers(func(_ context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
return ProductionForwardResult{
Accepted: true,
Delivered: true,
Forwarded: true,
By: PeerIdentity{ClusterID: envelope.ClusterID, NodeID: "node-a"},
MessageID: envelope.MessageID,
RouteID: envelope.RouteID,
}, nil
}, nil, nil)
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
// Outbound dial from node-a to the server ("node-r").
session, err := clientTransport.Connect(ctx, FabricTransportTarget{
PeerID: "node-r",
Endpoint: server.Addr().String(),
TLSConfig: &tls.Config{
InsecureSkipVerify: true,
NextProtos: []string{fabricQUICNextProto},
},
Timeout: time.Second,
InboundBuffer: 4,
ErrorBuffer: 4,
})
if err != nil {
t.Fatalf("client connect: %v", err)
}
defer session.Close()
// Registration happens asynchronously on the server, so poll the snapshot
// for up to one second until the inbound connection has been registered.
deadline := time.Now().Add(time.Second)
for {
if reverseTransport.Snapshot().Stats.ReverseRegisters > 0 {
break
}
if time.Now().After(deadline) {
t.Fatalf("reverse hello did not register connection: %+v", reverseTransport.Snapshot())
}
time.Sleep(10 * time.Millisecond)
}
// Connect back to node-a through the reverse transport. The endpoint is not
// the client's real address; with the "relay_quic" transport the registered
// inbound connection is presumably used instead of dialing it (the
// ReverseReuses assertion at the end verifies the reuse).
reverseSession, err := reverseTransport.Connect(ctx, FabricTransportTarget{
PeerID: "node-a",
Endpoint: "10.0.0.2:19443",
Transport: "relay_quic",
Timeout: time.Second,
InboundBuffer: 4,
ErrorBuffer: 4,
})
if err != nil {
t.Fatalf("reverse connect: %v", err)
}
defer reverseSession.Close()
productionPayload, err := json.Marshal(ProductionEnvelope{
FabricProtocolVersion: ProtocolVersion,
MessageID: "msg-1",
RouteID: "route-r-a",
ClusterID: "cluster-1",
SourceNodeID: "node-r",
DestinationNodeID: "node-a",
CurrentHopNodeID: "node-a",
NextHopNodeID: "node-a",
ChannelClass: ProductionChannelFabricControl,
MessageType: ProductionMessageFabricControl,
TTL: 4,
CreatedAt: time.Now().UTC(),
ExpiresAt: time.Now().UTC().Add(time.Minute),
PayloadHash: "unused-by-test-handler",
})
if err != nil {
t.Fatalf("marshal production: %v", err)
}
if err := reverseSession.Send(ctx, fabricproto.Frame{Type: fabricproto.FrameData, TrafficClass: fabricproto.TrafficClassReliable, StreamID: ProductionForwardQUICStreamID, Sequence: 2, Payload: productionPayload}); err != nil {
t.Fatalf("send reverse production: %v", err)
}
// The reply must come from node-a's production handler installed above.
select {
case frame := <-reverseSession.Frames():
var response quicProductionForwardResponse
if err := json.Unmarshal(frame.Payload, &response); err != nil {
t.Fatalf("decode response: %v", err)
}
if !response.Result.Accepted || !response.Result.Delivered || response.Result.By.NodeID != "node-a" {
t.Fatalf("response = %+v", response)
}
case err := <-reverseSession.Errors():
t.Fatalf("reverse session error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
// Both the registration and the reuse of the inbound connection must be
// visible in the reverse transport's counters.
snapshot := reverseTransport.Snapshot()
if snapshot.Stats.ReverseRegisters == 0 || snapshot.Stats.ReverseReuses == 0 {
t.Fatalf("reverse connection was not registered/reused: %+v", snapshot)
}
}
func TestQUICFabricServerHandlesFabricFrames(t *testing.T) {
var events []FabricSessionEventLogEntry
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
@@ -389,6 +503,68 @@ func TestQUICFabricServerHandlesFabricFrames(t *testing.T) {
}
}
// TestQUICFabricServerHandlesWebIngressForwardFrames sends one data frame on
// the web-ingress forward stream and verifies that the server passes the raw
// payload to WebIngressForwardHandler and replies on the same stream ID and
// sequence number with the handler's bytes wrapped in a
// quicWebIngressForwardResponse.
func TestQUICFabricServerHandlesWebIngressForwardFrames(t *testing.T) {
	const handlerReply = `{"schema_version":"rap.web_ingress.fabric_runtime_response.v1","status_code":200,"body_b64":"b2s="}`

	var received []byte
	server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  testQUICTLSConfig(t),
		WebIngressForwardHandler: func(_ context.Context, payload []byte) ([]byte, error) {
			// Copy the payload: the slice is owned by the server once the
			// handler returns.
			received = append([]byte(nil), payload...)
			return []byte(handlerReply), nil
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer server.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	// Keep a handle on the transport so that it is closed when the test ends;
	// closing only the session would leave the transport itself open.
	transport := NewQUICFabricTransport(nil)
	defer transport.Close()

	session, err := transport.Connect(ctx, FabricTransportTarget{
		Endpoint: server.Addr().String(),
		TLSConfig: &tls.Config{
			InsecureSkipVerify: true,
			NextProtos:         []string{fabricQUICNextProto},
		},
		Timeout:       time.Second,
		InboundBuffer: 4,
		ErrorBuffer:   4,
	})
	if err != nil {
		t.Fatalf("connect quic fabric: %v", err)
	}
	defer session.Close()

	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     WebIngressForwardQUICStreamID,
		Sequence:     44,
		Payload:      []byte(`{"envelope":true}`),
	}); err != nil {
		t.Fatalf("send web ingress frame: %v", err)
	}

	select {
	case frame := <-session.Frames():
		if frame.Type != fabricproto.FrameData || frame.StreamID != WebIngressForwardQUICStreamID || frame.Sequence != 44 {
			t.Fatalf("frame = %+v", frame)
		}
		var response quicWebIngressForwardResponse
		if err := json.Unmarshal(frame.Payload, &response); err != nil {
			t.Fatalf("decode response: %v", err)
		}
		if string(response.Payload) != handlerReply || response.Error != "" {
			t.Fatalf("response = %+v", response)
		}
	case err := <-session.Errors():
		t.Fatalf("session error: %v", err)
	case <-ctx.Done():
		t.Fatal(ctx.Err())
	}

	// The handler must have seen exactly the bytes that were sent.
	if string(received) != `{"envelope":true}` {
		t.Fatalf("received = %s", string(received))
	}
}
func startQUICFabricEchoServer(t *testing.T) *quic.Listener {
t.Helper()
return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t))
@@ -0,0 +1,128 @@
package mesh
import (
"strings"
"sync"
"time"
)
// FabricRouteHealthTracker remembers routes that recently failed so that they
// can be reported as unhealthy until a retry deadline passes. Its methods
// accept a nil receiver and treat it as "nothing tracked".
type FabricRouteHealthTracker struct {
// mu guards routes.
mu sync.Mutex
// QuarantineTTL is how long a route stays quarantined after a failure.
// Non-positive values fall back to 30 seconds when a failure is recorded.
QuarantineTTL time.Duration
// routes holds one entry per quarantined route, keyed by trimmed route ID.
routes map[string]FabricRouteHealthEntry
}
// FabricRouteHealthEntry describes the failure state of a single route.
//
// NOTE(review): encoding/json's omitempty never omits struct values, so the
// two time.Time fields are always emitted — confirm whether that is intended.
type FabricRouteHealthEntry struct {
// Reason is the trimmed reason supplied with the most recent failure.
Reason string `json:"reason,omitempty"`
// Failures counts failures recorded since the entry was created.
Failures uint64 `json:"failures"`
// LastFailure is the time of the most recent recorded failure.
LastFailure time.Time `json:"last_failure,omitempty"`
// RetryAfter is the time from which the route is no longer quarantined.
RetryAfter time.Time `json:"retry_after,omitempty"`
}
// FabricRouteHealthSnapshot is a point-in-time copy of the routes that are
// still quarantined, keyed by route ID.
type FabricRouteHealthSnapshot struct {
// Quarantined is left nil (and omitted from JSON) when nothing is quarantined.
Quarantined map[string]FabricRouteHealthEntry `json:"quarantined,omitempty"`
}
// NewFabricRouteHealthTracker returns an empty tracker that quarantines failed
// routes for ttl. A non-positive ttl selects the default of 30 seconds.
func NewFabricRouteHealthTracker(ttl time.Duration) *FabricRouteHealthTracker {
	tracker := &FabricRouteHealthTracker{
		QuarantineTTL: 30 * time.Second,
		routes:        make(map[string]FabricRouteHealthEntry),
	}
	if ttl > 0 {
		tracker.QuarantineTTL = ttl
	}
	return tracker
}
// MarkFailure records a failure of routeID at time now and (re)starts its
// quarantine, which lasts QuarantineTTL (30 seconds when that is not
// positive).
//
// The route ID and reason are trimmed; a blank route ID or a nil tracker makes
// the call a no-op. A zero now is replaced by the current UTC time. Each call
// increments the route's failure counter and overwrites the stored reason.
func (t *FabricRouteHealthTracker) MarkFailure(routeID string, reason string, now time.Time) {
	id := strings.TrimSpace(routeID)
	if t == nil || id == "" {
		return
	}
	failedAt := now
	if failedAt.IsZero() {
		failedAt = time.Now().UTC()
	}
	quarantine := t.QuarantineTTL
	if quarantine <= 0 {
		quarantine = 30 * time.Second
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	// Tolerate a zero-value tracker that was not built by the constructor.
	if t.routes == nil {
		t.routes = map[string]FabricRouteHealthEntry{}
	}
	record := t.routes[id]
	record.Failures++
	record.Reason = strings.TrimSpace(reason)
	record.LastFailure = failedAt
	record.RetryAfter = failedAt.Add(quarantine)
	t.routes[id] = record
}
// MarkSuccess lifts any quarantine on routeID by dropping its entry, which
// also discards the accumulated failure count. Blank route IDs and nil
// trackers are ignored.
func (t *FabricRouteHealthTracker) MarkSuccess(routeID string) {
	id := strings.TrimSpace(routeID)
	if t == nil || id == "" {
		return
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	delete(t.routes, id)
}
// Apply returns routeSet with every currently quarantined route marked
// Healthy=false and Degraded=true.
//
// A zero now is replaced by the current UTC time. Entries whose RetryAfter has
// been reached are removed from the tracker as they are encountered and the
// matching route is returned untouched. Routes are matched by their RouteID
// exactly as stored on the route. A nil tracker, or one with no entries,
// returns routeSet unchanged.
func (t *FabricRouteHealthTracker) Apply(routeSet FabricRouteSet, now time.Time) FabricRouteSet {
	if t == nil {
		return routeSet
	}
	at := now
	if at.IsZero() {
		at = time.Now().UTC()
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	if len(t.routes) == 0 {
		return routeSet
	}
	quarantine := func(route FabricRoute) FabricRoute {
		entry, tracked := t.routes[route.RouteID]
		switch {
		case !tracked:
			// Never failed, or already cleared: leave the route alone.
		case !entry.RetryAfter.IsZero() && !at.Before(entry.RetryAfter):
			// Quarantine elapsed: forget the entry and let the route through.
			delete(t.routes, route.RouteID)
		default:
			route.Healthy = false
			route.Degraded = true
		}
		return route
	}
	return mapFabricRouteSet(routeSet, quarantine)
}
// Snapshot returns a copy of the entries that are still quarantined at time
// now (the current UTC time when now is zero). Expired entries are skipped but
// not removed from the tracker. The zero snapshot is returned for a nil
// tracker or when nothing is quarantined.
func (t *FabricRouteHealthTracker) Snapshot(now time.Time) FabricRouteHealthSnapshot {
	if t == nil {
		return FabricRouteHealthSnapshot{}
	}
	at := now
	if at.IsZero() {
		at = time.Now().UTC()
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	active := make(map[string]FabricRouteHealthEntry)
	for id, entry := range t.routes {
		expired := !entry.RetryAfter.IsZero() && !at.Before(entry.RetryAfter)
		if !expired {
			active[id] = entry
		}
	}
	if len(active) == 0 {
		return FabricRouteHealthSnapshot{}
	}
	return FabricRouteHealthSnapshot{Quarantined: active}
}
// mapFabricRouteSet returns a copy of routeSet in which fn has been applied to
// the primary route (only when it has a non-blank RouteID) and to every
// warm-standby and cold-fallback route.
//
// The standby and fallback routes are written into freshly allocated slices,
// so the slices held by the caller's route set are never modified. Writing
// the results back in place would leak fn's changes into any route set that
// shares the same backing arrays, because only the slice headers are copied
// when a FabricRouteSet is passed by value. Each route is copied shallowly;
// nested slices inside a route are still shared. Empty or nil slices are
// returned as they were.
func mapFabricRouteSet(routeSet FabricRouteSet, fn func(FabricRoute) FabricRoute) FabricRouteSet {
	mapRoutes := func(routes []FabricRoute) []FabricRoute {
		if len(routes) == 0 {
			return routes
		}
		mapped := make([]FabricRoute, len(routes))
		for i := range routes {
			mapped[i] = fn(routes[i])
		}
		return mapped
	}

	if strings.TrimSpace(routeSet.Primary.RouteID) != "" {
		routeSet.Primary = fn(routeSet.Primary)
	}
	routeSet.WarmStandby = mapRoutes(routeSet.WarmStandby)
	routeSet.ColdFallbacks = mapRoutes(routeSet.ColdFallbacks)
	return routeSet
}
@@ -0,0 +1,322 @@
package mesh
import (
"encoding/json"
"fmt"
"strings"
"time"
)
// Values compared against a peer endpoint candidate's Reachability and
// ConnectivityMode fields when a route mode is chosen for it. The planner
// lower-cases and trims the candidate's values before comparing.
const (
// Reachability values.
FabricCandidateReachabilityPublic = "public"
FabricCandidateReachabilityPrivate = "private"
FabricCandidateReachabilityRelay = "relay"
FabricCandidateReachabilityOutboundOnly = "outbound_only"
// ConnectivityMode values.
FabricConnectivityDirect = "direct"
FabricConnectivityOutboundOnly = "outbound_only"
FabricConnectivityRelayRequired = "relay_required"
)
// FabricRoutePlannerConfig carries the local context and tuning knobs used
// when peer endpoint candidates are turned into fabric routes.
type FabricRoutePlannerConfig struct {
// ClusterID is copied (trimmed) into every produced route.
ClusterID string
// LocalNodeID becomes the route's source node and, when non-blank, its first hop.
LocalNodeID string
// LocalSegmentID and LocalNATGroupID are compared case-insensitively with the
// candidate's metadata; a match on either classifies the candidate as LAN.
LocalSegmentID string
LocalNATGroupID string
// Capacities per route mode. RelayCapacity and ReverseCapacity fall back to
// DefaultCapacity, which in turn falls back to 100.
DefaultCapacity int
RelayCapacity int
ReverseCapacity int
// Observations and CapacityPressure are looked up by candidate endpoint ID.
Observations map[string]EndpointCandidateHealthObservation
CapacityPressure map[string]EndpointCandidateCapacityPressure
// Now is the planning timestamp; the zero value means the current UTC time.
Now time.Time
// Maximum ages passed to candidate ranking; zero selects 30s and 10s respectively.
MaxObservationAge time.Duration
MaxCapacityPressureAge time.Duration
}
// FabricCandidateMetadata is the subset of a candidate's JSON metadata that
// the route planner interprets. All fields are optional.
type FabricCandidateMetadata struct {
// LocalSegmentID and NATGroupID are matched against the planner's local
// segment and NAT group to detect LAN-reachable peers.
LocalSegmentID string `json:"local_segment_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
// RelayNodeID names the relay hop; ViaNodeID is used when it is blank.
RelayNodeID string `json:"relay_node_id,omitempty"`
// RelayEndpoint is the relay hop's address; a non-blank value also marks the
// candidate as relay-mode.
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
// A non-blank STUNServer or ICEFoundation marks the candidate as ICE-mode
// when no earlier rule applies.
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
}
// FabricRouteSetForPeerEndpointCandidates plans the routes towards one target
// node from its advertised endpoint candidates.
//
// A blank targetNodeID is replaced by the node ID of the first candidate. The
// candidates are ranked with the observations and capacity pressure from cfg
// (observation age defaulting to 30s, pressure age to 10s), each ranked
// candidate is converted to a route — candidates that cannot be converted are
// dropped — and the surviving routes are arranged by routeSetFromRoutes. With
// no candidates the result only carries the target kind and ID.
func FabricRouteSetForPeerEndpointCandidates(targetNodeID string, candidates []PeerEndpointCandidate, cfg FabricRoutePlannerConfig) FabricRouteSet {
	targetNodeID = strings.TrimSpace(targetNodeID)
	if targetNodeID == "" && len(candidates) > 0 {
		targetNodeID = strings.TrimSpace(candidates[0].NodeID)
	}
	base := FabricRouteSet{TargetKind: FabricChannelTargetNode, TargetID: targetNodeID}
	if len(candidates) == 0 {
		return base
	}

	planTime := cfg.Now
	if planTime.IsZero() {
		planTime = time.Now().UTC()
	}
	scoring := EndpointCandidateScoreOptions{
		Now:                    planTime,
		Observations:           cfg.Observations,
		MaxObservationAge:      firstNonZeroDuration(cfg.MaxObservationAge, 30*time.Second),
		CapacityPressure:       cfg.CapacityPressure,
		MaxCapacityPressureAge: firstNonZeroDuration(cfg.MaxCapacityPressureAge, 10*time.Second),
	}
	ranked := RankPeerEndpointCandidates(candidates, scoring)

	routes := make([]FabricRoute, 0, len(ranked))
	for rank, scored := range ranked {
		// The rank doubles as a latency tie-breaker in the route conversion.
		if route, usable := fabricRouteForPeerEndpointCandidate(scored.Candidate, cfg, scored.Score, rank, planTime); usable {
			routes = append(routes, route)
		}
	}
	return routeSetFromRoutes(base, routes)
}
// FabricRouteSetsForPeerEndpointCandidates plans a route set for every node in
// candidatesByNode and returns them keyed by trimmed node ID. Nodes with a
// blank ID, and nodes for which planning produced no primary, warm-standby or
// cold-fallback route, are left out of the result.
func FabricRouteSetsForPeerEndpointCandidates(candidatesByNode map[string][]PeerEndpointCandidate, cfg FabricRoutePlannerConfig) map[string]FabricRouteSet {
	planned := make(map[string]FabricRouteSet, len(candidatesByNode))
	for rawNodeID, nodeCandidates := range candidatesByNode {
		nodeID := strings.TrimSpace(rawNodeID)
		if nodeID == "" {
			continue
		}
		routeSet := FabricRouteSetForPeerEndpointCandidates(nodeID, nodeCandidates, cfg)
		hasPrimary := strings.TrimSpace(routeSet.Primary.RouteID) != ""
		if !hasPrimary && len(routeSet.WarmStandby) == 0 && len(routeSet.ColdFallbacks) == 0 {
			continue
		}
		planned[nodeID] = routeSet
	}
	return planned
}
// fabricRouteForPeerEndpointCandidate converts one ranked candidate into a
// FabricRoute whose RouteID is the candidate's endpoint ID.
//
// It reports false when the candidate lacks an endpoint ID, node ID or
// address (after trimming), when its transport is not an accepted QUIC
// transport, or when no hops can be derived for it. score and index are the
// candidate's ranking score and position and only influence the estimated
// latency; now is stamped as LastUpdatedAt.
//
// Health defaults to healthy and not degraded. When cfg.Observations has an
// entry for the endpoint, the route is healthy if the reliability score is 0
// or at least 50, and degraded if the last latency is 250 ms or more.
func fabricRouteForPeerEndpointCandidate(candidate PeerEndpointCandidate, cfg FabricRoutePlannerConfig, score int, index int, now time.Time) (FabricRoute, bool) {
	candidate.EndpointID = strings.TrimSpace(candidate.EndpointID)
	candidate.NodeID = strings.TrimSpace(candidate.NodeID)
	candidate.Address = strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
	switch {
	case candidate.EndpointID == "",
		candidate.NodeID == "",
		candidate.Address == "",
		!isQUICOnlyCandidateTransport(candidate.Transport):
		return FabricRoute{}, false
	}

	metadata := decodeFabricCandidateMetadata(candidate.Metadata)
	mode := fabricRouteModeForPeerEndpointCandidate(candidate, metadata, cfg)
	hops := fabricRouteHopsForCandidate(candidate, metadata, mode, cfg)
	if len(hops) == 0 {
		return FabricRoute{}, false
	}

	relayHops := 0
	for i := range hops {
		if hops[i].Mode == FabricRouteRelay {
			relayHops++
		}
	}

	estimatedLatency := fabricRouteLatencyFromCandidate(candidate, cfg, score, index)
	capacity := fabricRouteCapacityForMode(mode, cfg)
	if capacity <= 0 {
		capacity = 100
	}

	route := FabricRoute{
		RouteID:           candidate.EndpointID,
		ClusterID:         strings.TrimSpace(cfg.ClusterID),
		SourceNodeID:      strings.TrimSpace(cfg.LocalNodeID),
		DestinationNodeID: candidate.NodeID,
		Hops:              hops,
		BaseLatencyMs:     estimatedLatency,
		Capacity:          capacity,
		ActiveChannels:    int(candidatePressureCount(candidate.EndpointID, cfg)),
		RelayCount:        relayHops,
		Healthy:           true,
		Degraded:          false,
		LastUpdatedAt:     now,
	}
	if observation, observed := cfg.Observations[candidate.EndpointID]; observed {
		route.Healthy = observation.ReliabilityScore == 0 || observation.ReliabilityScore >= 50
		route.Degraded = observation.LastLatencyMs > 0 && observation.LastLatencyMs >= 250
	}
	return route, true
}
// fabricRouteModeForPeerEndpointCandidate decides which route mode a candidate
// is planned with. Rules are evaluated in this order and the first match wins:
//
//  1. a transport that already maps to relay, reverse, ICE or LAN mode;
//  2. same local segment or same NAT group as the planner -> LAN;
//  3. relay reachability, relay-required connectivity, or a relay endpoint in
//     the metadata -> relay;
//  4. outbound-only connectivity or reachability -> reverse;
//  5. a STUN server, an ICE foundation, or any NAT type -> ICE;
//  6. otherwise direct.
func fabricRouteModeForPeerEndpointCandidate(candidate PeerEndpointCandidate, metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) FabricRouteMode {
	switch explicit := fabricRouteModeForTransportTarget(FabricTransportTarget{Transport: candidate.Transport}); explicit {
	case FabricRouteRelay, FabricRouteReverse, FabricRouteICE, FabricRouteLAN:
		return explicit
	}

	reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
	connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
	switch {
	case sameLocalSegment(metadata, cfg), sameNATGroup(metadata, cfg):
		return FabricRouteLAN
	case reachability == FabricCandidateReachabilityRelay,
		connectivity == FabricConnectivityRelayRequired,
		strings.TrimSpace(metadata.RelayEndpoint) != "":
		return FabricRouteRelay
	case connectivity == FabricConnectivityOutboundOnly,
		reachability == FabricCandidateReachabilityOutboundOnly:
		return FabricRouteReverse
	case strings.TrimSpace(metadata.STUNServer) != "",
		strings.TrimSpace(metadata.ICEFoundation) != "",
		candidate.NATType != "":
		return FabricRouteICE
	default:
		return FabricRouteDirect
	}
}
// fabricRouteHopsForCandidate lists the hops of a route planned in mode.
//
// Every hop list ends with the target node's hop, which carries the
// candidate's endpoint ID, address and pinned certificate fingerprint. When
// cfg.LocalNodeID is non-blank the list starts with a hop for the local node.
// In relay mode the local hop is marked direct, and an intermediate relay hop
// is inserted when the metadata names a relay node (RelayNodeID, else
// ViaNodeID); that hop uses the metadata's relay endpoint, falling back to the
// candidate's address. Unknown modes yield nil.
func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata FabricCandidateMetadata, mode FabricRouteMode, cfg FabricRoutePlannerConfig) []FabricRouteHop {
	localNodeID := strings.TrimSpace(cfg.LocalNodeID)
	targetNodeID := strings.TrimSpace(candidate.NodeID)
	endpoint := strings.TrimRight(strings.TrimSpace(candidate.Address), "/")

	// targetHop builds the final hop, which is identical across modes apart
	// from the mode label.
	targetHop := func(hopMode FabricRouteMode) FabricRouteHop {
		return FabricRouteHop{NodeID: targetNodeID, Mode: hopMode, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)}
	}

	switch mode {
	case FabricRouteRelay:
		hops := make([]FabricRouteHop, 0, 3)
		if localNodeID != "" {
			hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: FabricRouteDirect})
		}
		relayNodeID := firstNonEmpty(strings.TrimSpace(metadata.RelayNodeID), strings.TrimSpace(metadata.ViaNodeID))
		if relayNodeID != "" {
			relayEndpoint := firstNonEmpty(strings.TrimRight(strings.TrimSpace(metadata.RelayEndpoint), "/"), endpoint)
			hops = append(hops, FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint})
		}
		return append(hops, targetHop(FabricRouteRelay))
	case FabricRouteLAN, FabricRouteICE, FabricRouteReverse, FabricRouteDirect:
		hops := make([]FabricRouteHop, 0, 2)
		if localNodeID != "" {
			hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: mode})
		}
		return append(hops, targetHop(mode))
	}
	return nil
}
// isQUICOnlyCandidateTransport reports whether transport names one of the
// QUIC transports the planner accepts: the generic QUIC aliases or the string
// form of the LAN, reverse, relay and ICE route modes. The comparison ignores
// case and surrounding whitespace.
func isQUICOnlyCandidateTransport(transport string) bool {
	normalized := strings.ToLower(strings.TrimSpace(transport))
	accepted := [...]string{
		"quic", "direct_quic", "udp_quic", "quic_udp",
		string(FabricRouteLAN), string(FabricRouteReverse), string(FabricRouteRelay), string(FabricRouteICE),
	}
	for _, name := range accepted {
		if normalized == name {
			return true
		}
	}
	return false
}
// fabricRouteLatencyFromCandidate estimates a route's base latency in
// milliseconds.
//
// A positive observed latency for the candidate's endpoint wins and is
// returned as is, clamped to the largest int. Otherwise the estimate starts
// from a floor chosen by reachability (private 3, outbound-only 25, relay 40,
// anything else 10), adds the candidate's rank index, and — for scores below
// 100 — adds one millisecond per ten missing score points.
func fabricRouteLatencyFromCandidate(candidate PeerEndpointCandidate, cfg FabricRoutePlannerConfig, score int, index int) int {
	const maxInt = int(^uint(0) >> 1)
	if observed, ok := cfg.Observations[candidate.EndpointID]; ok && observed.LastLatencyMs > 0 {
		if observed.LastLatencyMs > int64(maxInt) {
			return maxInt
		}
		return int(observed.LastLatencyMs)
	}

	floor := 10
	switch strings.ToLower(strings.TrimSpace(candidate.Reachability)) {
	case FabricCandidateReachabilityPrivate:
		floor = 3
	case FabricCandidateReachabilityOutboundOnly:
		floor = 25
	case FabricCandidateReachabilityRelay:
		floor = 40
	}
	estimate := floor + index
	if score < 100 {
		estimate += (100 - score) / 10
	}
	return estimate
}
// fabricRouteCapacityForMode returns the channel capacity for a route of the
// given mode: the mode-specific capacity for relay and reverse routes when it
// is positive, otherwise cfg.DefaultCapacity, otherwise 100.
func fabricRouteCapacityForMode(mode FabricRouteMode, cfg FabricRoutePlannerConfig) int {
	modeCapacity := 0
	switch mode {
	case FabricRouteRelay:
		modeCapacity = cfg.RelayCapacity
	case FabricRouteReverse:
		modeCapacity = cfg.ReverseCapacity
	}
	return firstPositiveInt(modeCapacity, cfg.DefaultCapacity, 100)
}
// candidatePressureCount returns the Count recorded in cfg.CapacityPressure
// for endpointID, or 0 when the endpoint has no pressure entry.
func candidatePressureCount(endpointID string, cfg FabricRoutePlannerConfig) int64 {
	pressure, tracked := cfg.CapacityPressure[endpointID]
	if !tracked {
		return 0
	}
	return pressure.Count
}
// sameLocalSegment reports whether the candidate's metadata names the same
// local segment as the planner. The comparison is case-insensitive and
// ignores surrounding whitespace; an unset local segment never matches.
func sameLocalSegment(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
	if local := strings.TrimSpace(cfg.LocalSegmentID); local != "" {
		return strings.EqualFold(strings.TrimSpace(metadata.LocalSegmentID), local)
	}
	return false
}
// sameNATGroup reports whether the candidate's metadata names the same NAT
// group as the planner. The comparison is case-insensitive and ignores
// surrounding whitespace; an unset local NAT group never matches.
func sameNATGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
	if local := strings.TrimSpace(cfg.LocalNATGroupID); local != "" {
		return strings.EqualFold(strings.TrimSpace(metadata.NATGroupID), local)
	}
	return false
}
// decodeFabricCandidateMetadata parses a candidate's raw JSON metadata.
// Missing or malformed metadata is not an error: both yield the zero value,
// so a partially decoded struct is never returned.
func decodeFabricCandidateMetadata(raw json.RawMessage) FabricCandidateMetadata {
	var decoded FabricCandidateMetadata
	if len(raw) == 0 {
		return decoded
	}
	if json.Unmarshal(raw, &decoded) != nil {
		return FabricCandidateMetadata{}
	}
	return decoded
}
// candidatePeerCertSHA256 extracts the pinned certificate fingerprint from a
// candidate's JSON metadata. "peer_cert_sha256" takes precedence over
// "tls_cert_sha256"; both are trimmed. Absent or malformed metadata yields "".
func candidatePeerCertSHA256(candidate PeerEndpointCandidate) string {
	if len(candidate.Metadata) == 0 {
		return ""
	}
	var fingerprints struct {
		PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
		TLSCertSHA256  string `json:"tls_cert_sha256,omitempty"`
	}
	if json.Unmarshal(candidate.Metadata, &fingerprints) != nil {
		return ""
	}
	preferred := strings.TrimSpace(fingerprints.PeerCertSHA256)
	fallback := strings.TrimSpace(fingerprints.TLSCertSHA256)
	return firstNonEmpty(preferred, fallback)
}
// firstPositiveInt returns the first argument that is greater than zero, or 0
// when there is none (including when called without arguments).
func firstPositiveInt(values ...int) int {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
// firstNonZeroDuration returns the first argument that is greater than zero,
// or 0 when there is none. Despite the name, negative durations are skipped
// as well.
func firstNonZeroDuration(values ...time.Duration) time.Duration {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
// FabricRouteSetForRelayFallback plans a relay-only route set from
// sourceNodeID to targetNodeID through relayNodeID.
//
// It synthesizes a single relay candidate named
// "<target>-via-<relay>-relay" (both IDs trimmed) that points at
// targetEndpoint and carries the relay node and relay endpoint in its
// metadata, then runs the regular planner on it. Trailing slashes and
// surrounding whitespace are stripped from both endpoints.
func FabricRouteSetForRelayFallback(clusterID string, sourceNodeID string, targetNodeID string, relayNodeID string, relayEndpoint string, targetEndpoint string) FabricRouteSet {
	relayEndpoint = strings.TrimRight(strings.TrimSpace(relayEndpoint), "/")
	targetEndpoint = strings.TrimRight(strings.TrimSpace(targetEndpoint), "/")

	trimmedTarget := strings.TrimSpace(targetNodeID)
	relayMetadata := FabricCandidateMetadata{RelayNodeID: relayNodeID, RelayEndpoint: relayEndpoint}
	fallback := PeerEndpointCandidate{
		EndpointID:       fmt.Sprintf("%s-via-%s-relay", trimmedTarget, strings.TrimSpace(relayNodeID)),
		NodeID:           trimmedTarget,
		Transport:        string(FabricRouteRelay),
		Address:          targetEndpoint,
		Reachability:     FabricCandidateReachabilityRelay,
		ConnectivityMode: FabricConnectivityRelayRequired,
		Metadata:         mustMarshalFabricCandidateMetadata(relayMetadata),
	}
	planner := FabricRoutePlannerConfig{
		ClusterID:   clusterID,
		LocalNodeID: sourceNodeID,
	}
	return FabricRouteSetForPeerEndpointCandidates(targetNodeID, []PeerEndpointCandidate{fallback}, planner)
}
// mustMarshalFabricCandidateMetadata encodes metadata as JSON. Despite the
// "must" prefix it does not panic: an encoding error is swallowed and results
// in a nil message.
func mustMarshalFabricCandidateMetadata(metadata FabricCandidateMetadata) json.RawMessage {
	if encoded, err := json.Marshal(metadata); err == nil {
		return encoded
	}
	return nil
}
@@ -0,0 +1,187 @@
package mesh
import (
"encoding/json"
"testing"
"time"
)
// TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN offers a public
// and a private candidate for the same node and expects the private one,
// which shares the planner's local segment, to become the primary route in
// LAN mode.
func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
	lanMetadata, _ := json.Marshal(FabricCandidateMetadata{LocalSegmentID: "site-a", NATGroupID: "nat-a"})
	publicCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "quic",
		Address:          "quic://203.0.113.10:19443",
		Reachability:     "public",
		ConnectivityMode: "direct",
		Priority:         10,
	}
	lanCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-lan",
		NodeID:           "node-b",
		Transport:        "quic",
		Address:          "quic://10.10.0.12:19443",
		Reachability:     "private",
		ConnectivityMode: "direct",
		PolicyTags:       []string{"private-lan"},
		Metadata:         lanMetadata,
	}
	planner := FabricRoutePlannerConfig{
		ClusterID:       "cluster-1",
		LocalNodeID:     "node-a",
		LocalSegmentID:  "site-a",
		DefaultCapacity: 200,
		Now:             time.Unix(100, 0).UTC(),
	}

	routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{publicCandidate, lanCandidate}, planner)

	if routeSet.Primary.RouteID != "node-b-lan" {
		t.Fatalf("primary route = %q, want node-b-lan", routeSet.Primary.RouteID)
	}
	lastHop := len(routeSet.Primary.Hops) - 1
	if routeSet.Primary.Hops[lastHop].Mode != FabricRouteLAN {
		t.Fatalf("primary mode = %q, want lan", routeSet.Primary.Hops[lastHop].Mode)
	}
}
// TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback plans a
// relay-required candidate whose metadata names relay node "node-r" and
// expects a local -> relay -> target route with two relay hops and the
// configured relay capacity.
func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T) {
	relayMetadata, _ := json.Marshal(FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"})
	relayCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "quic",
		Address:          "quic://node-b-passive:19443",
		Reachability:     "outbound_only",
		ConnectivityMode: "relay_required",
		NATType:          "symmetric",
		Metadata:         relayMetadata,
	}
	planner := FabricRoutePlannerConfig{
		ClusterID:     "cluster-1",
		LocalNodeID:   "node-a",
		RelayCapacity: 50,
		Now:           time.Unix(100, 0).UTC(),
	}

	primary := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{relayCandidate}, planner).Primary

	if primary.RouteID != "node-b-relay" {
		t.Fatalf("primary route = %q", primary.RouteID)
	}
	if primary.RelayCount != 2 {
		t.Fatalf("relay count = %d, want 2", primary.RelayCount)
	}
	// Hop 0 is the local node, so the relay is expected at index 1.
	if got := primary.Hops[1].NodeID; got != "node-r" {
		t.Fatalf("relay hop = %q, want node-r", got)
	}
	if primary.Capacity != 50 {
		t.Fatalf("capacity = %d, want 50", primary.Capacity)
	}
}
func TestFabricRouteSetForPeerEndpointCandidatesUsesTargetWhenRelayMetadataIsAbsent(t *testing.T) {
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay_quic",
Address: "quic://node-b:19443",
Reachability: "relay",
ConnectivityMode: "relay_required",
Metadata: json.RawMessage(`{"tls_cert_sha256":"abc123"}`),
}}, FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"})
if routeSet.Primary.RouteID != "node-b-relay" {
t.Fatalf("primary route = %q", routeSet.Primary.RouteID)
}
if len(routeSet.Primary.Hops) != 2 {
t.Fatalf("hops = %+v, want local + target only", routeSet.Primary.Hops)
}
targetHop := routeSet.Primary.Hops[1]
if targetHop.NodeID != "node-b" || targetHop.Mode != FabricRouteRelay || targetHop.PeerCertSHA256 != "abc123" {
t.Fatalf("target hop = %+v, want relay-mode target with cert", targetHop)
}
}
// TestFabricRouteSetForPeerEndpointCandidatesAcceptsExplicitQUICModes checks
// that the mode-specific transport names are accepted by the planner and that
// the transport decides the route mode even though the candidate's
// reachability and connectivity describe a plain private/direct endpoint.
func TestFabricRouteSetForPeerEndpointCandidatesAcceptsExplicitQUICModes(t *testing.T) {
	type modeCase struct {
		name      string
		transport string
		wantMode  FabricRouteMode
	}
	cases := []modeCase{
		{name: "lan", transport: "lan_quic", wantMode: FabricRouteLAN},
		{name: "reverse", transport: "reverse_quic", wantMode: FabricRouteReverse},
		{name: "relay", transport: "relay_quic", wantMode: FabricRouteRelay},
		{name: "ice", transport: "ice_quic", wantMode: FabricRouteICE},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			candidate := PeerEndpointCandidate{
				EndpointID:       "node-b-" + tc.name,
				NodeID:           "node-b",
				Transport:        tc.transport,
				Address:          "quic://node-b:19443",
				Reachability:     "private",
				ConnectivityMode: "direct",
				Metadata:         json.RawMessage(`{"tls_cert_sha256":"abc123"}`),
			}
			planner := FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"}
			primary := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{candidate}, planner).Primary
			if primary.RouteID == "" {
				t.Fatalf("%s candidate produced empty route set", tc.transport)
			}
			// The last hop is the target node's hop.
			targetHop := primary.Hops[len(primary.Hops)-1]
			if targetHop.Mode != tc.wantMode {
				t.Fatalf("mode = %q, want %q", targetHop.Mode, tc.wantMode)
			}
			if targetHop.PeerCertSHA256 != "abc123" {
				t.Fatalf("peer cert = %q, want abc123", targetHop.PeerCertSHA256)
			}
		})
	}
}
// TestFabricRouteSetForPeerEndpointCandidatesTreatsSameNATGroupAsLAN plans a
// candidate that sits behind the same NAT group as the local node and expects
// LAN mode, even though the candidate also reports a NAT type that would
// otherwise select ICE.
func TestFabricRouteSetForPeerEndpointCandidatesTreatsSameNATGroupAsLAN(t *testing.T) {
	metadata, err := json.Marshal(FabricCandidateMetadata{NATGroupID: "nat-a"})
	if err != nil {
		t.Fatalf("marshal metadata: %v", err)
	}
	routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
		EndpointID:       "node-b-nat-lan",
		NodeID:           "node-b",
		Transport:        "quic",
		Address:          "quic://10.44.0.12:19443",
		Reachability:     "private",
		ConnectivityMode: "direct",
		NATType:          "symmetric",
		Metadata:         metadata,
	}}, FabricRoutePlannerConfig{
		ClusterID:       "cluster-1",
		LocalNodeID:     "node-a",
		LocalNATGroupID: "nat-a",
	})
	// Guard the index below: if the planner produced no route, fail with a
	// readable message instead of an index-out-of-range panic.
	hops := routeSet.Primary.Hops
	if len(hops) == 0 {
		t.Fatalf("route = %+v, want LAN mode for same NAT group", routeSet.Primary)
	}
	if hops[len(hops)-1].Mode != FabricRouteLAN {
		t.Fatalf("route = %+v, want LAN mode for same NAT group", routeSet.Primary)
	}
}
// TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC feeds candidates
// whose transport names are not accepted QUIC transports (plain HTTP and the
// legacy "relay" / "outbound_reverse" names) and expects the planner to
// produce neither a primary nor a warm-standby route for any of them.
func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
	rejected := []PeerEndpointCandidate{
		{
			EndpointID:       "node-b-http",
			NodeID:           "node-b",
			Transport:        "direct_http",
			Address:          "http://node-b:8080",
			Reachability:     "public",
			ConnectivityMode: "direct",
		},
		{
			EndpointID:       "node-b-legacy-relay",
			NodeID:           "node-b",
			Transport:        "relay",
			Address:          "quic://node-r:19443",
			Reachability:     "relay",
			ConnectivityMode: "relay_required",
		},
		{
			EndpointID:       "node-b-legacy-reverse",
			NodeID:           "node-b",
			Transport:        "outbound_reverse",
			Address:          "quic://node-b:19443",
			Reachability:     "outbound_only",
			ConnectivityMode: "outbound_only",
		},
	}
	planner := FabricRoutePlannerConfig{ClusterID: "cluster-1", LocalNodeID: "node-a"}
	for i := range rejected {
		routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{rejected[i]}, planner)
		if routeSet.Primary.RouteID != "" || len(routeSet.WarmStandby) != 0 {
			t.Fatalf("non-quic candidate produced route set: %+v", routeSet)
		}
	}
}
@@ -0,0 +1,137 @@
package mesh
import (
"strings"
"sync"
"sync/atomic"
)
// FabricRoutePressureTracker counts the channels currently acquired on each
// fabric route, together with lifetime totals and high-water marks.
type FabricRoutePressureTracker struct {
// mu is held around the reads and writes of the fields below.
mu sync.Mutex
// active maps a route ID to its number of acquired, not yet released channels.
active map[string]int
// maxActive maps a route ID to the highest value active has held for it.
maxActive map[string]int
// acquiredTotal is incremented once per successful Acquire.
acquiredTotal uint64
// releasedTotal is incremented once per effective release.
releasedTotal uint64
// maxActiveTotal is the highest sum over all active counts seen at acquire time.
maxActiveTotal int
// lastAcquiredRoute is the route ID of the most recent Acquire.
lastAcquiredRoute string
// lastReleasedRoute is the route ID of the most recent release.
lastReleasedRoute string
}
// FabricRoutePressureSnapshot is the JSON-serializable copy of a
// FabricRoutePressureTracker's counters returned by SnapshotPressure.
type FabricRoutePressureSnapshot struct {
// Active holds the current per-route channel counts.
Active map[string]int `json:"active"`
// MaxActive holds the per-route high-water marks.
MaxActive map[string]int `json:"max_active"`
// ActiveTotal is the sum of all counts in Active.
ActiveTotal int `json:"active_total"`
// MaxActiveTotal is the high-water mark of the summed active count.
MaxActiveTotal int `json:"max_active_total"`
// AcquiredTotal and ReleasedTotal are the lifetime acquire/release counts.
AcquiredTotal uint64 `json:"acquired_total"`
ReleasedTotal uint64 `json:"released_total"`
// LastAcquiredRoute and LastReleasedRoute name the most recently touched routes.
LastAcquiredRoute string `json:"last_acquired_route,omitempty"`
LastReleasedRoute string `json:"last_released_route,omitempty"`
}
// NewFabricRoutePressureTracker returns a tracker whose per-route maps are
// already allocated and whose counters are all zero.
func NewFabricRoutePressureTracker() *FabricRoutePressureTracker {
	tracker := new(FabricRoutePressureTracker)
	tracker.active = make(map[string]int)
	tracker.maxActive = make(map[string]int)
	return tracker
}
// Apply returns routeSet with the ActiveChannels of the primary, warm standby
// and cold fallback routes increased by the number of channels currently
// acquired on each route's ID.
//
// A nil tracker, or a tracker with no active channels, returns routeSet
// unchanged. The standby and fallback slices of the result are fresh copies:
// the argument shares its backing arrays with the caller, so adjusting the
// elements in place would modify the caller's route set and double-count on
// repeated calls.
func (t *FabricRoutePressureTracker) Apply(routeSet FabricRouteSet) FabricRouteSet {
	if t == nil {
		return routeSet
	}
	active := t.Snapshot()
	if len(active) == 0 {
		return routeSet
	}
	apply := func(route FabricRoute) FabricRoute {
		// Acquire stores whitespace-trimmed route IDs, so look up the same way.
		if count := active[strings.TrimSpace(route.RouteID)]; count > 0 {
			route.ActiveChannels += count
		}
		return route
	}
	applyAll := func(routes []FabricRoute) []FabricRoute {
		if len(routes) == 0 {
			// Keep nil/empty slices as they are (preserves their JSON form).
			return routes
		}
		adjusted := make([]FabricRoute, len(routes))
		for i, route := range routes {
			adjusted[i] = apply(route)
		}
		return adjusted
	}
	routeSet.Primary = apply(routeSet.Primary)
	routeSet.WarmStandby = applyAll(routeSet.WarmStandby)
	routeSet.ColdFallbacks = applyAll(routeSet.ColdFallbacks)
	return routeSet
}
// Acquire registers one more active channel on routeID (after trimming
// surrounding whitespace) and returns the function that releases it again.
//
// The returned function is idempotent: only its first call decrements the
// count. For a nil tracker or a blank route ID nothing is recorded and the
// returned function does nothing.
func (t *FabricRoutePressureTracker) Acquire(routeID string) func() {
	id := strings.TrimSpace(routeID)
	if t == nil || id == "" {
		return func() {}
	}

	t.mu.Lock()
	// Tolerate trackers that were not built with the constructor.
	if t.active == nil {
		t.active = map[string]int{}
	}
	if t.maxActive == nil {
		t.maxActive = map[string]int{}
	}
	current := t.active[id] + 1
	t.active[id] = current
	if current > t.maxActive[id] {
		t.maxActive[id] = current
	}
	t.acquiredTotal++
	t.lastAcquiredRoute = id
	if total := activeTotalLocked(t.active); total > t.maxActiveTotal {
		t.maxActiveTotal = total
	}
	t.mu.Unlock()

	var done atomic.Bool
	return func() {
		// Only the first caller flips the flag and performs the release.
		if !done.CompareAndSwap(false, true) {
			return
		}
		t.mu.Lock()
		defer t.mu.Unlock()
		if remaining := t.active[id] - 1; remaining > 0 {
			t.active[id] = remaining
		} else {
			// Drop idle routes so snapshots only list routes under pressure.
			delete(t.active, id)
		}
		t.releasedTotal++
		t.lastReleasedRoute = id
	}
}
// Snapshot returns a copy of the current per-route active channel counts,
// i.e. the Active field of SnapshotPressure.
func (t *FabricRoutePressureTracker) Snapshot() map[string]int {
	pressure := t.SnapshotPressure()
	return pressure.Active
}
// SnapshotPressure returns a copy of every counter held by the tracker. The
// maps in the result are independent of the tracker's own maps. A nil tracker
// yields the zero snapshot.
func (t *FabricRoutePressureTracker) SnapshotPressure() FabricRoutePressureSnapshot {
	var snapshot FabricRoutePressureSnapshot
	if t == nil {
		return snapshot
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	snapshot.Active = make(map[string]int, len(t.active))
	for id, n := range t.active {
		snapshot.Active[id] = n
	}
	snapshot.MaxActive = make(map[string]int, len(t.maxActive))
	for id, n := range t.maxActive {
		snapshot.MaxActive[id] = n
	}
	snapshot.ActiveTotal = activeTotalLocked(snapshot.Active)
	snapshot.MaxActiveTotal = t.maxActiveTotal
	snapshot.AcquiredTotal = t.acquiredTotal
	snapshot.ReleasedTotal = t.releasedTotal
	snapshot.LastAcquiredRoute = t.lastAcquiredRoute
	snapshot.LastReleasedRoute = t.lastReleasedRoute
	return snapshot
}
// activeTotalLocked sums the per-route counts in active. It takes no lock
// itself; within this file it is called while the tracker mutex is held.
func activeTotalLocked(active map[string]int) int {
	var sum int
	for routeID := range active {
		sum += active[routeID]
	}
	return sum
}
@@ -0,0 +1,44 @@
package mesh
import "testing"
// TestFabricRoutePressureTrackerAppliesAndReleasesActiveChannels acquires
// route-a twice and route-b once, checks that Apply adds those counts to a
// route set, then releases everything and verifies the snapshot counters.
func TestFabricRoutePressureTrackerAppliesAndReleasesActiveChannels(t *testing.T) {
tracker := NewFabricRoutePressureTracker()
releaseA := tracker.Acquire("route-a")
releaseAAgain := tracker.Acquire("route-a")
releaseB := tracker.Acquire("route-b")
routeSet := FabricRouteSet{
TargetKind: FabricChannelTargetNode,
TargetID: "node-b",
// presumably the fifth testFabricRoute argument seeds ActiveChannels
// (3 here, 0 below) — the expectations of 5 and 1 rely on that.
Primary: testFabricRoute("route-a", "node-b", 10, 100, 3, true),
WarmStandby: []FabricRoute{
testFabricRoute("route-b", "node-b", 10, 100, 0, true),
},
}
withPressure := tracker.Apply(routeSet)
if withPressure.Primary.ActiveChannels != 5 {
t.Fatalf("primary active = %d, want 5", withPressure.Primary.ActiveChannels)
}
if withPressure.WarmStandby[0].ActiveChannels != 1 {
t.Fatalf("standby active = %d, want 1", withPressure.WarmStandby[0].ActiveChannels)
}
// releaseA is called twice on purpose: the second call must be a no-op, so
// ReleasedTotal below is expected to be 3, not 4.
releaseA()
releaseA()
releaseAAgain()
releaseB()
snapshot := tracker.SnapshotPressure()
if len(snapshot.Active) != 0 || snapshot.ActiveTotal != 0 {
t.Fatalf("snapshot after release = %+v, want inactive", snapshot)
}
if snapshot.AcquiredTotal != 3 || snapshot.ReleasedTotal != 3 {
t.Fatalf("snapshot totals = %+v, want acquired/released 3", snapshot)
}
// High-water marks survive the releases.
if snapshot.MaxActive["route-a"] != 2 || snapshot.MaxActive["route-b"] != 1 || snapshot.MaxActiveTotal != 3 {
t.Fatalf("snapshot max = %+v", snapshot)
}
// route-b was both the last route acquired and the last one released.
if snapshot.LastAcquiredRoute != "route-b" || snapshot.LastReleasedRoute != "route-b" {
t.Fatalf("snapshot last routes = %+v", snapshot)
}
}
@@ -14,6 +14,7 @@ func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
@@ -85,6 +86,7 @@ func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
@@ -133,6 +135,7 @@ func TestFabricSessionPeerManagerReopensClosedPump(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
@@ -40,73 +40,22 @@ type FabricTransportTarget struct {
ErrorBuffer int
}
func FabricTransportForTarget(target FabricTransportTarget, websocket *WebSocketFabricTransport, quicTransport *QUICFabricTransport) (FabricTransport, FabricTransportTarget, error) {
func FabricTransportForTarget(target FabricTransportTarget, quicTransport *QUICFabricTransport) (FabricTransport, FabricTransportTarget, error) {
transportLabel := strings.ToLower(strings.TrimSpace(target.Transport))
endpoint := strings.TrimSpace(target.Endpoint)
if strings.HasPrefix(strings.ToLower(endpoint), "quic://") {
if transportLabel == "" {
transportLabel = "quic"
}
target.Endpoint = strings.TrimPrefix(endpoint, "quic://")
}
switch transportLabel {
case "quic", "direct_quic", "udp_quic", "quic_udp":
case "quic", "direct_quic", "udp_quic", "quic_udp", "lan_quic", "reverse_quic", "relay_quic", "ice_quic":
if quicTransport == nil {
quicTransport = NewQUICFabricTransport(nil)
}
return quicTransport, target, nil
case "", "websocket", "ws", "wss", "direct_http", "direct_https", "direct_tcp_tls":
if websocket == nil {
websocket = NewWebSocketFabricTransport(nil)
}
return websocket, target, nil
default:
return nil, target, fmt.Errorf("unsupported fabric transport %q", target.Transport)
return nil, target, fmt.Errorf("unsupported fabric transport %q: quic is required", target.Transport)
}
}
// WebSocketFabricTransport obtains fabric sessions from a
// FabricSessionPeerManager.
type WebSocketFabricTransport struct {
// Manager supplies the sessions; Connect creates one lazily when it is nil.
Manager *FabricSessionPeerManager
}
// NewWebSocketFabricTransport wraps manager in a transport, creating a new
// FabricSessionPeerManager when manager is nil.
func NewWebSocketFabricTransport(manager *FabricSessionPeerManager) *WebSocketFabricTransport {
	transport := &WebSocketFabricTransport{Manager: manager}
	if transport.Manager == nil {
		transport.Manager = NewFabricSessionPeerManager()
	}
	return transport
}
// Connect asks the transport's manager for the session to target.PeerID,
// translating the transport target into the manager's peer target (dial
// options and pump buffer sizes). A missing manager is created on first use.
func (t *WebSocketFabricTransport) Connect(ctx context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	if t.Manager == nil {
		t.Manager = NewFabricSessionPeerManager()
	}
	peer := FabricSessionPeerTarget{
		PeerID:  target.PeerID,
		BaseURL: target.Endpoint,
		Options: FabricSessionDialOptions{
			Token:      target.Token,
			Header:     target.Header,
			Timeout:    target.Timeout,
			MaxPayload: target.MaxPayload,
		},
		Pump: FabricSessionPumpOptions{
			OutboundBuffer: target.OutboundBuffer,
			InboundBuffer:  target.InboundBuffer,
			ErrorBuffer:    target.ErrorBuffer,
		},
	}
	return t.Manager.Get(ctx, peer)
}
// Close closes the underlying manager and returns its error. It is a no-op
// returning nil for a nil transport or a transport without a manager.
func (t *WebSocketFabricTransport) Close() error {
	if t != nil && t.Manager != nil {
		return t.Manager.Close()
	}
	return nil
}
// Snapshot returns the manager's snapshot. Without a transport or manager it
// returns an otherwise empty snapshot that carries only the schema version.
func (t *WebSocketFabricTransport) Snapshot() FabricSessionPeerManagerSnapshot {
	if t != nil && t.Manager != nil {
		return t.Manager.Snapshot()
	}
	return FabricSessionPeerManagerSnapshot{SchemaVersion: "rap.fabric_session_peer_manager.v1"}
}
@@ -1,117 +1,27 @@
package mesh
import (
"context"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
func TestWebSocketFabricTransportConnectsAndReusesSession(t *testing.T) {
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
func TestFabricTransportRejectsWebSocketTransport(t *testing.T) {
for _, target := range []FabricTransportTarget{
{Transport: "wss", Endpoint: "wss://node-a.example/fabric/session"},
{Transport: "relay", Endpoint: "quic://node-r.example:19443"},
{Transport: "outbound_reverse", Endpoint: "quic://node-b.example:19443"},
} {
_, _, err := FabricTransportForTarget(target, nil)
if err == nil || !strings.Contains(err.Error(), "quic is required") {
t.Fatalf("target = %+v err = %v, want quic-only rejection", target, err)
}
},
}.Handler())
defer server.Close()
transport := NewWebSocketFabricTransport(nil)
defer transport.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricTransportTarget{
PeerID: "node-a",
Endpoint: server.URL,
Token: "rap_fsn_transport",
Timeout: time.Second,
OutboundBuffer: 4,
InboundBuffer: 4,
ErrorBuffer: 4,
}
first, err := transport.Connect(ctx, target)
if err != nil {
t.Fatalf("first connect: %v", err)
}
second, err := transport.Connect(ctx, target)
if err != nil {
t.Fatalf("second connect: %v", err)
}
if first != second {
t.Fatal("transport did not reuse session")
}
if opened != 1 {
t.Fatalf("opened = %d, want 1", opened)
}
if err := first.Send(ctx, fabricproto.Frame{Type: fabricproto.FramePing, Sequence: 1, Payload: []byte("transport")}); err != nil {
t.Fatalf("send ping: %v", err)
}
select {
case frame := <-first.Frames():
if frame.Type != fabricproto.FramePong || frame.Sequence != 1 || string(frame.Payload) != "transport" {
t.Fatalf("frame = %+v", frame)
}
case err := <-first.Errors():
t.Fatalf("session error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
}
// TestWebSocketFabricTransportReopensClosedSession connects, closes the
// returned session, connects again with the same target, and expects a new
// session object plus a second websocket-opened event on the server.
func TestWebSocketFabricTransportReopensClosedSession(t *testing.T) {
// opened counts "fabric_session_websocket_opened" log events.
// NOTE(review): it is incremented inside the server's logger callback and
// read by the test body without synchronization — confirm whether the
// callback can run on another goroutine.
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
transport := NewWebSocketFabricTransport(nil)
defer transport.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricTransportTarget{
PeerID: "node-a",
Endpoint: server.URL,
Token: "rap_fsn_transport_reopen",
Timeout: time.Second,
}
first, err := transport.Connect(ctx, target)
if err != nil {
t.Fatalf("first connect: %v", err)
}
// Closing the session must make the next Connect open a fresh one.
if err := first.Close(); err != nil {
t.Fatalf("close first session: %v", err)
}
second, err := transport.Connect(ctx, target)
if err != nil {
t.Fatalf("second connect: %v", err)
}
if first == second {
t.Fatal("transport reused closed session")
}
if opened != 2 {
t.Fatalf("opened = %d, want 2", opened)
}
}
func TestFabricTransportForTargetSelectsQUICByScheme(t *testing.T) {
transport, target, err := FabricTransportForTarget(FabricTransportTarget{
Endpoint: "quic://127.0.0.1:4433",
}, nil, nil)
}, nil)
if err != nil {
t.Fatalf("select transport: %v", err)
}
@@ -123,15 +33,12 @@ func TestFabricTransportForTargetSelectsQUICByScheme(t *testing.T) {
}
}
func TestFabricTransportForTargetSelectsWebSocketByDefault(t *testing.T) {
transport, target, err := FabricTransportForTarget(FabricTransportTarget{
func TestFabricTransportForTargetRejectsNonQUICByDefault(t *testing.T) {
_, target, err := FabricTransportForTarget(FabricTransportTarget{
Endpoint: "https://node.example",
}, nil, nil)
if err != nil {
t.Fatalf("select transport: %v", err)
}
if _, ok := transport.(*WebSocketFabricTransport); !ok {
t.Fatalf("transport = %T, want websocket", transport)
}, nil)
if err == nil {
t.Fatal("non-QUIC target unexpectedly selected a transport")
}
if target.Endpoint != "https://node.example" {
t.Fatalf("endpoint = %q", target.Endpoint)
@@ -1,42 +0,0 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// HTTPPeerTransport delivers synthetic mesh envelopes to peers whose base
// URLs were configured explicitly. Its scope is deliberately small:
// production forwarding stays disabled, and only SyntheticRuntime messages
// travel over this transport.
type HTTPPeerTransport struct {
	// PeerURLs maps a peer node ID to that peer's base URL.
	PeerURLs map[string]string
	// HTTPClient, when set, replaces the HTTP client of the per-send mesh client.
	HTTPClient *http.Client
}

// NewHTTPPeerTransport builds a transport from peerURLs. Node IDs and URLs are
// whitespace-trimmed, trailing slashes are removed from URLs, and entries that
// end up with an empty ID or URL are dropped. The input map is not retained.
func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport {
	cleaned := make(map[string]string, len(peerURLs))
	for rawID, rawURL := range peerURLs {
		id := strings.TrimSpace(rawID)
		endpoint := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if id == "" || endpoint == "" {
			continue
		}
		cleaned[id] = endpoint
	}
	return &HTTPPeerTransport{PeerURLs: cleaned}
}
// SendSynthetic forwards envelope to the peer registered under nextNodeID and
// returns that peer's reply. It returns ErrSyntheticPeerUnavailable when the
// transport is nil or no usable URL is configured for the peer.
func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	if t == nil {
		return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
	}
	peerURL := strings.TrimSpace(t.PeerURLs[nextNodeID])
	peerURL = strings.TrimRight(peerURL, "/")
	if peerURL == "" {
		return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
	}
	client := NewClient(peerURL)
	if override := t.HTTPClient; override != nil {
		client.HTTPClient = override
	}
	return client.SendSynthetic(ctx, envelope)
}
@@ -1,130 +0,0 @@
package mesh
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestHTTPPeerTransportDirectSyntheticProbe sends a probe from node-a to
// node-b over live HTTP test servers and expects a probe ack whose recorded
// path is exactly node-a, node-b.
func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) {
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
defer nodeA.Close()
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
defer nodeB.Close()
route := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"})
routes := []SyntheticRoute{route}
// Runtimes are attached after the servers exist because node-a's runtime
// needs node-b's URL; node-b is the destination and gets no peers.
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-b": nodeB.URL})
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-direct")
if err != nil {
t.Fatalf("send live direct probe: %v", err)
}
if ack.MessageType != SyntheticMessageProbeAck {
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
}
payload := decodeAckPayload(t, ack)
if got, want := payload.Path, []string{"node-a", "node-b"}; !sameStrings(got, want) {
t.Fatalf("path = %v, want %v", got, want)
}
}
// TestHTTPPeerTransportSingleRelaySyntheticProbe sends a probe from node-a to
// node-b through the relay node-r over live HTTP test servers and expects a
// probe ack whose recorded path is node-a, node-r, node-b.
func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) {
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
defer nodeA.Close()
nodeR := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
defer nodeR.Close()
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
defer nodeB.Close()
route := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"})
routes := []SyntheticRoute{route}
// Each node is only given the URL of its next hop: a -> r -> b.
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-r": nodeR.URL})
nodeR.Runtime = newLiveRuntime(nodeR.Local, routes, map[string]string{"node-b": nodeB.URL})
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-relay")
if err != nil {
t.Fatalf("send live relay probe: %v", err)
}
if ack.MessageType != SyntheticMessageProbeAck {
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
}
payload := decodeAckPayload(t, ack)
if got, want := payload.Path, []string{"node-a", "node-r", "node-b"}; !sameStrings(got, want) {
t.Fatalf("path = %v, want %v", got, want)
}
}
// TestHTTPPeerTransportMissingPeer expects ErrSyntheticPeerUnavailable when
// sending to a node ID the transport has no URL for.
func TestHTTPPeerTransportMissingPeer(t *testing.T) {
	noPeers := NewHTTPPeerTransport(map[string]string{})
	if _, err := noPeers.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{}); !errors.Is(err, ErrSyntheticPeerUnavailable) {
		t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
	}
}
// liveSyntheticNode is a test fixture pairing a peer identity and synthetic
// runtime with the HTTP test server that serves them.
type liveSyntheticNode struct {
// Local is the identity the node's server answers as.
Local PeerIdentity
// Runtime is assigned by tests after construction; the server handler reads
// it on every request.
Runtime *SyntheticRuntime
// URL is the base URL of server.
URL string
server *httptest.Server
}
// newLiveSyntheticNode starts an HTTP test server for the given identity and
// returns the fixture describing it. The caller is expected to Close it.
func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode {
	t.Helper()
	node := &liveSyntheticNode{Local: local}
	// The mesh Server is rebuilt per request so that a Runtime assigned to the
	// node after this function returns is the one that gets served.
	serve := func(w http.ResponseWriter, r *http.Request) {
		Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r)
	}
	node.server = httptest.NewServer(http.HandlerFunc(serve))
	node.URL = node.server.URL
	return node
}
// Close shuts down the node's test server, if one was started.
func (n *liveSyntheticNode) Close() {
	if n.server == nil {
		return
	}
	n.server.Close()
}
// newLiveRuntime builds an enabled SyntheticRuntime for local that knows the
// given routes and reaches its peers through an HTTPPeerTransport.
func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime {
	cfg := SyntheticRuntimeConfig{
		Enabled:   true,
		Local:     local,
		Routes:    routes,
		Transport: NewHTTPPeerTransport(peers),
	}
	return NewSyntheticRuntime(cfg)
}
// liveSyntheticRoute returns a fabric-control route in cluster-1 that runs
// along hops, from the first hop to the last, and expires one hour from now.
// hops must be non-empty.
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
	source, destination := hops[0], hops[len(hops)-1]
	expiry := time.Now().UTC().Add(time.Hour)
	return SyntheticRoute{
		RouteID:              routeID,
		ClusterID:            "cluster-1",
		SourceNodeID:         source,
		DestinationNodeID:    destination,
		Hops:                 hops,
		AllowedChannels:      []string{SyntheticChannelFabricControl},
		MaxTTL:               8,
		MaxHops:              8,
		ExpiresAt:            expiry,
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
}
// sameStrings reports whether left and right have the same length and hold
// equal strings at every index.
func sameStrings(left, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for i, value := range left {
		if right[i] != value {
			return false
		}
	}
	return true
}
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"sort"
"strings"
"time"
@@ -53,9 +54,11 @@ type PeerCacheEntry struct {
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestRegion string `json:"best_region,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
BestScoreReasons []string `json:"best_score_reasons,omitempty"`
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
@@ -132,9 +135,11 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.BestReachability = scored[0].Candidate.Reachability
entry.BestConnectivity = scored[0].Candidate.ConnectivityMode
entry.BestNATType = scored[0].Candidate.NATType
entry.BestRegion = scored[0].Candidate.Region
entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...)
entry.BestCandidateScore = scored[0].Score
entry.BestScoreReasons = append([]string{}, scored[0].Reasons...)
entry.BestPeerCertSHA256 = candidatePeerCertSHA256(scored[0].Candidate)
entry.bestScore = scored[0].Score
if strings.TrimSpace(scored[0].Candidate.Address) != "" {
entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address)
@@ -188,6 +193,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
if lease.PeerNodeID != cfg.Local.NodeID {
entry := peerCacheEntry(entries, lease.PeerNodeID)
useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry)
localRelay := lease.RelayNodeID == cfg.Local.NodeID
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
@@ -195,12 +201,21 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
if useLeaseEndpoint {
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control")
if localRelay {
entry.BestTransport = "reverse_quic"
} else {
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_quic")
}
entry.BestReachability = "relay"
entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required")
if !localRelay {
entry.Endpoint = entry.RelayEndpoint
entry.BestCandidateID = lease.LeaseID
entry.BestCandidateAddr = entry.RelayEndpoint
entry.BestPeerCertSHA256 = rendezvousLeasePeerCertSHA256(lease)
} else if strings.TrimSpace(entry.Endpoint) == "" {
entry.Endpoint = firstNonEmpty(entry.BestCandidateAddr, entry.RelayEndpoint)
}
entry.bestScore = maxInt(entry.bestScore, 500)
}
}
@@ -262,6 +277,20 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
}}
}
// rendezvousLeasePeerCertSHA256 extracts a certificate fingerprint from the
// lease's JSON metadata, passing the trimmed "peer_cert_sha256" and then the
// trimmed "tls_cert_sha256" value to firstNonEmpty. Missing or undecodable
// metadata yields the empty string.
func rendezvousLeasePeerCertSHA256(lease PeerRendezvousLease) string {
	if len(lease.Metadata) == 0 {
		return ""
	}
	var fingerprints struct {
		PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
		TLSCertSHA256  string `json:"tls_cert_sha256,omitempty"`
	}
	if json.Unmarshal(lease.Metadata, &fingerprints) != nil {
		return ""
	}
	peerCert := strings.TrimSpace(fingerprints.PeerCertSHA256)
	tlsCert := strings.TrimSpace(fingerprints.TLSCertSHA256)
	return firstNonEmpty(peerCert, tlsCert)
}
func (c *PeerCache) Snapshot() PeerCacheSnapshot {
if c == nil {
return PeerCacheSnapshot{}
@@ -10,15 +10,15 @@ func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) {
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-a": "http://node-a:19000",
"node-r": "http://node-r:19000",
"node-c": "http://node-c:19000",
"node-a": "quic://node-a:19443",
"node-r": "quic://node-r:19443",
"node-c": "quic://node-c:19443",
},
Routes: []SyntheticRoute{
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"}),
},
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10},
{NodeID: "node-seed", Endpoint: "quic://seed.example.test:19443", Transport: "direct_quic", Priority: 10},
},
WarmPeerLimit: 2,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
@@ -42,7 +42,7 @@ func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) {
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r"}),
},
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1},
{NodeID: "node-seed", Endpoint: "quic://seed.example.test:19443", Transport: "direct_quic", ConnectivityMode: "direct", Priority: 1},
},
WarmPeerLimit: 3,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
@@ -68,7 +68,7 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay.example.test",
Reachability: "relay",
ConnectivityMode: "relay_required",
@@ -77,8 +77,8 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -119,10 +119,10 @@ func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
LastVerifiedAt: &now,
},
{
EndpointID: "node-b-wss",
EndpointID: "node-b-ice",
NodeID: "node-b",
Transport: "wss",
Address: "https://node-b.example.test:443",
Transport: "ice_quic",
Address: "quic://node-b.example.test:19444",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -148,10 +148,10 @@ func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
if !ok {
t.Fatal("node-b missing from cache")
}
if entry.BestCandidateID != "node-b-wss" || entry.Endpoint != "https://node-b.example.test:443" {
if entry.BestCandidateID != "node-b-ice" || entry.Endpoint != "quic://node-b.example.test:19444" {
t.Fatalf("peer cache did not apply endpoint observations: %+v", entry)
}
if !containsString(entry.BestScoreReasons, "transport:wss") {
if !containsString(entry.BestScoreReasons, "transport:ice_quic") {
t.Fatalf("peer cache did not expose score reasons: %+v", entry.BestScoreReasons)
}
}
@@ -161,15 +161,15 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "https://node-b.public.example.test:443",
"node-b": "quic://node-b.public.example.test:19443",
},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "https://node-b.public.example.test:443",
Transport: "direct_quic",
Address: "quic://node-b.public.example.test:19443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
@@ -179,8 +179,8 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
{
EndpointID: "node-b-corp-lan",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "http://10.24.10.20:19001",
Transport: "lan_quic",
Address: "quic://10.24.10.20:19443",
Reachability: "private",
NATType: "none",
ConnectivityMode: "direct",
@@ -199,7 +199,7 @@ func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
if !ok {
t.Fatal("node-b missing from peer cache")
}
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" {
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "quic://10.24.10.20:19443" {
t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry)
}
}
@@ -29,6 +29,7 @@ type PeerConnectionIntentPlanConfig struct {
PeerCache PeerCacheSnapshot
RecoveryPlan PeerRecoveryPlan
RendezvousLeases []PeerRendezvousLease
PreferredRegion string
Now time.Time
}
@@ -62,12 +63,14 @@ type PeerConnectionIntent struct {
Reachability string `json:"reachability,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
NATType string `json:"nat_type,omitempty"`
Region string `json:"region,omitempty"`
PolicyTags []string `json:"policy_tags,omitempty"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
@@ -103,8 +106,10 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
Reachability: entry.BestReachability,
ConnectivityMode: entry.BestConnectivity,
NATType: entry.BestNATType,
Region: entry.BestRegion,
PolicyTags: append([]string{}, entry.BestPolicyTags...),
BestCandidateID: firstNonEmpty(candidate.BestCandidateID, entry.BestCandidateID),
BestPeerCertSHA256: entry.BestPeerCertSHA256,
RendezvousLeaseID: entry.RendezvousLeaseID,
RelayNodeID: entry.RelayNodeID,
RelayEndpoint: entry.RelayEndpoint,
@@ -114,13 +119,13 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
Priority: candidate.Priority,
GeneratedAt: now,
}
mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent)
mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent, cfg.PreferredRegion)
intent.TransportMode = mode
intent.RequiresRendezvous = requiresRendezvous
intent.DirectCandidate = directCandidate
if intent.RequiresRendezvous {
if lease, ok := rendezvousLeaseForPeer(cfg.RendezvousLeases, intent.NodeID, now); ok {
applyRendezvousLease(&intent, lease)
applyRendezvousLease(&intent, lease, cfg.PeerCache.LocalNodeID)
}
}
intents = append(intents, intent)
@@ -185,10 +190,12 @@ func connectionIntentAction(candidate PeerRecoveryCandidate) string {
}
}
func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
func classifyPeerTransport(intent PeerConnectionIntent, preferredRegion string) (string, bool, bool) {
transport := strings.ToLower(strings.TrimSpace(intent.Transport))
connectivity := strings.ToLower(strings.TrimSpace(intent.ConnectivityMode))
reachability := strings.ToLower(strings.TrimSpace(intent.Reachability))
region := strings.TrimSpace(intent.Region)
preferredRegion = strings.TrimSpace(preferredRegion)
tags := lowerStringSet(intent.PolicyTags)
if strings.Contains(transport, "relay") || connectivity == "relay_required" || reachability == "relay" {
@@ -201,6 +208,9 @@ func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
return PeerTransportModeCorporateLAN, false, true
}
if tags["private-lan"] || reachability == "private" || endpointHasPrivateHost(intent.Endpoint) {
if preferredRegion != "" && region != "" && !strings.EqualFold(region, preferredRegion) {
return PeerTransportModeRelayRequired, true, false
}
return PeerTransportModePrivateLAN, false, true
}
if strings.Contains(transport, "direct") || reachability == "public" || connectivity == "direct" {
@@ -246,9 +256,16 @@ func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now
return candidates[0], true
}
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) {
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease, localNodeID string) {
localRelay := strings.TrimSpace(lease.RelayNodeID) == strings.TrimSpace(localNodeID)
if !localRelay {
intent.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
intent.Transport = firstNonEmpty(lease.Transport, "relay_control")
}
if localRelay {
intent.Transport = "reverse_quic"
} else {
intent.Transport = firstNonEmpty(lease.Transport, "relay_quic")
}
intent.TransportMode = PeerTransportModeRelayControl
intent.RequiresRendezvous = false
intent.RendezvousResolved = true
@@ -256,17 +273,33 @@ func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLeas
intent.RelayCandidate = true
intent.RendezvousLeaseID = lease.LeaseID
intent.RelayNodeID = lease.RelayNodeID
intent.RelayEndpoint = intent.Endpoint
intent.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
intent.ControlPlaneOnly = true
if certSHA256 := rendezvousLeasePeerCertSHA256(lease); certSHA256 != "" && !localRelay {
intent.BestPeerCertSHA256 = certSHA256
}
if lease.ConnectivityMode != "" {
intent.ConnectivityMode = lease.ConnectivityMode
}
}
// endpointHasPrivateHost reports whether rawEndpoint's host resolves, via
// endpointHostAddr, to an IP address that is private, loopback, or link-local
// unicast. Endpoints without such an address yield false.
func endpointHasPrivateHost(rawEndpoint string) bool {
	if addr, ok := endpointHostAddr(rawEndpoint); ok {
		return addr.IsPrivate() || addr.IsLoopback() || addr.IsLinkLocalUnicast()
	}
	return false
}
// endpointHasUnspecifiedHost reports whether rawEndpoint's host is an IP
// address for which IsUnspecified is true. Endpoints for which
// endpointHostAddr finds no address yield false.
func endpointHasUnspecifiedHost(rawEndpoint string) bool {
	addr, ok := endpointHostAddr(rawEndpoint)
	if !ok {
		return false
	}
	return addr.IsUnspecified()
}
func endpointHostAddr(rawEndpoint string) (netip.Addr, bool) {
rawEndpoint = strings.TrimSpace(rawEndpoint)
if rawEndpoint == "" {
return false
return netip.Addr{}, false
}
host := rawEndpoint
if parsed, err := url.Parse(rawEndpoint); err == nil && parsed.Host != "" {
@@ -277,9 +310,9 @@ func endpointHasPrivateHost(rawEndpoint string) bool {
}
addr, err := netip.ParseAddr(strings.Trim(host, "[]"))
if err != nil {
return false
return netip.Addr{}, false
}
return addr.IsPrivate() || addr.IsLoopback() || addr.IsLinkLocalUnicast()
return addr, true
}
func lowerStringSet(values []string) map[string]bool {
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"testing"
"time"
)
@@ -11,8 +12,8 @@ func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
BestTransport: "direct_tcp_tls",
Endpoint: "quic://10.24.10.20:19443",
BestTransport: "lan_quic",
BestReachability: "private",
BestConnectivity: "direct",
BestPolicyTags: []string{"corp-lan", "same-site"},
@@ -23,7 +24,7 @@ func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
Endpoint: "quic://10.24.10.20:19443",
ConnectionState: PeerConnectionReady,
Reason: "maintain_ready",
Priority: 100,
@@ -48,15 +49,15 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
BestTransport: "direct_tcp_tls",
Endpoint: "quic://node-b.example.test:19443",
BestTransport: "reverse_quic",
BestReachability: "outbound_only",
BestConnectivity: "outbound_only",
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
BestTransport: "relay",
BestTransport: "relay_quic",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
@@ -66,7 +67,7 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
Endpoint: "quic://node-b.example.test:19443",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 90,
@@ -91,6 +92,42 @@ func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *te
}
}
func TestPeerConnectionIntentsRequireRendezvousForRemotePrivateRegion(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PreferredRegion: "ifcm",
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "quic://192.168.200.61:19132",
BestTransport: "direct_quic",
BestReachability: "private",
BestConnectivity: "private_lan",
BestRegion: "docker-test",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{{
NodeID: "node-b",
Endpoint: "quic://192.168.200.61:19132",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 100,
}},
},
Now: now,
})
if plan.IntentCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 1 {
t.Fatalf("unexpected remote private plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.DirectCandidate || !intent.RequiresRendezvous || intent.TransportMode != PeerTransportModeRelayRequired {
t.Fatalf("unexpected remote private intent: %+v", intent)
}
}
func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
@@ -120,13 +157,14 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
Metadata: peerConnectionIntentLeaseMetadata(t, "abc123"),
},
},
Now: now,
@@ -137,9 +175,10 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
intent.Endpoint != "http://node-r:19000" ||
intent.Endpoint != "quic://node-r:19443" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
intent.BestPeerCertSHA256 != "abc123" ||
!intent.RelayCandidate ||
!intent.RendezvousResolved ||
intent.RequiresRendezvous {
@@ -176,8 +215,8 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
LeaseID: "lease-expired-preferred",
PeerNodeID: "node-b",
RelayNodeID: "node-r-old",
RelayEndpoint: "http://node-r-old:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r-old:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 1,
ControlPlaneOnly: true,
@@ -188,8 +227,8 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
LeaseID: "lease-active-reselected",
PeerNodeID: "node-b",
RelayNodeID: "node-r-new",
RelayEndpoint: "http://node-r-new:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r-new:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 20,
ControlPlaneOnly: true,
@@ -206,20 +245,29 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
intent := plan.Intents[0]
if intent.RendezvousLeaseID != "lease-active-reselected" ||
intent.RelayNodeID != "node-r-new" ||
intent.Endpoint != "http://node-r-new:19000" {
intent.Endpoint != "quic://node-r-new:19443" {
t.Fatalf("expired lease was not skipped: %+v", intent)
}
}
// peerConnectionIntentLeaseMetadata builds lease metadata for tests: a JSON
// object with the single key "peer_cert_sha256" set to certSHA256. It fails
// the test immediately if encoding fails.
func peerConnectionIntentLeaseMetadata(t *testing.T, certSHA256 string) json.RawMessage {
	t.Helper()
	metadata := struct {
		PeerCertSHA256 string `json:"peer_cert_sha256"`
	}{PeerCertSHA256: certSHA256}
	encoded, err := json.Marshal(metadata)
	if err != nil {
		t.Fatalf("marshal metadata: %v", err)
	}
	return encoded
}
func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) {
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"},
{NodeID: "node-b", Endpoint: "quic://192.168.10.20:19443"},
}},
RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://192.168.10.20:19001",
Endpoint: "quic://192.168.10.20:19443",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_peer",
Priority: 10,
@@ -2,6 +2,7 @@ package mesh
import (
"context"
"fmt"
"net/http"
"strings"
"sync"
@@ -25,6 +26,8 @@ type PeerConnectionManagerConfig struct {
Tracker *PeerConnectionTracker
RendezvousLeases []PeerRendezvousLease
HTTPClient *http.Client
QUICTransport *QUICFabricTransport
PreferredRegion string
ProbeTimeout time.Duration
Now func() time.Time
}
@@ -35,6 +38,8 @@ type PeerConnectionManager struct {
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
quicTransport *QUICFabricTransport
preferredRegion string
probeTimeout time.Duration
now func() time.Time
@@ -104,6 +109,7 @@ type peerConnectionProbeTarget struct {
CandidateID string
Endpoint string
Transport string
PeerCertSHA256 string
}
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
@@ -132,6 +138,8 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
tracker: cfg.Tracker,
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
httpClient: httpClient,
quicTransport: cfg.QUICTransport,
preferredRegion: strings.TrimSpace(cfg.PreferredRegion),
probeTimeout: probeTimeout,
now: now,
}
@@ -155,6 +163,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
PeerCache: peerSnapshot,
RecoveryPlan: recoveryPlan,
RendezvousLeases: rendezvousLeases,
PreferredRegion: m.preferredRegion,
Now: startedAt,
})
entriesByNode := map[string]PeerCacheEntry{}
@@ -215,6 +224,15 @@ func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvou
m.rendezvousLeases = append([]PeerRendezvousLease{}, rendezvousLeases...)
}
// UpdateQUICTransport replaces the QUIC transport stored on the manager while
// holding m.mu. Calling it on a nil manager is a no-op.
func (m *PeerConnectionManager) UpdateQUICTransport(transport *QUICFabricTransport) {
	if m != nil {
		m.mu.Lock()
		m.quicTransport = transport
		m.mu.Unlock()
	}
}
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
if m == nil {
return nil, nil
@@ -253,6 +271,7 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
BestPeerCertSHA256: firstNonEmpty(intent.BestPeerCertSHA256, cacheEntry.BestPeerCertSHA256),
}
if intent.RequiresRendezvous {
result.LinkStatus = PeerConnectionProbeDeferred
@@ -282,13 +301,12 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
}
if intent.RelayCandidate && intent.RelayNodeID != "" {
target.NodeID = intent.RelayNodeID
}
target.NodeID = peerConnectionProbeTargetNodeID(intent, m.local.NodeID)
targets := []peerConnectionProbeTarget{{
CandidateID: intent.BestCandidateID,
Endpoint: intent.Endpoint,
Transport: intent.Transport,
PeerCertSHA256: intent.BestPeerCertSHA256,
}}
if intent.DirectCandidate {
targets = peerConnectionProbeTargets(intent, cacheEntry)
@@ -300,13 +318,14 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
probePeer.BestCandidateAddr = probePeer.Endpoint
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
probePeer.BestPeerCertSHA256 = firstNonEmpty(probeTarget.PeerCertSHA256, probePeer.BestPeerCertSHA256)
if probePeer.Endpoint == "" {
continue
}
candidateStartedAt := normalizedNow(m.now())
m.tracker.BeginProbe(probePeer, candidateStartedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
err := m.probePeerTarget(probeCtx, probePeer, target)
cancel()
completedAt := normalizedNow(m.now())
candidateResult := PeerConnectionCandidateProbeResult{
@@ -354,14 +373,51 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
return result
}
// peerConnectionProbeTargetNodeID picks the node ID a probe should address.
// It returns intent.RelayNodeID (as stored, untrimmed) only when the intent is
// a relay candidate whose relay node ID is non-blank and differs from
// localNodeID after trimming; in every other case it returns intent.NodeID.
func peerConnectionProbeTargetNodeID(intent PeerConnectionIntent, localNodeID string) string {
	if !intent.RelayCandidate {
		return intent.NodeID
	}
	relayID := strings.TrimSpace(intent.RelayNodeID)
	// A blank relay, or a relay that is this node itself, keeps the peer as target.
	if relayID == "" || relayID == strings.TrimSpace(localNodeID) {
		return intent.NodeID
	}
	return intent.RelayNodeID
}
// probePeerTarget checks reachability of probePeer by opening a fabric
// transport session to its endpoint and closing it again.
//
// The endpoint is trimmed of surrounding whitespace and trailing slashes.
// Endpoints with a legacy scheme, and targets that are not QUIC by transport
// or by "quic://" prefix, are rejected with "non_quic_probe_rejected". For
// accepted targets the error from transport selection, from Connect, or from
// closing the session is returned; nil means the session opened and closed
// cleanly.
//
// NOTE(review): m.quicTransport is read here without taking m.mu, while
// UpdateQUICTransport writes it under that lock — confirm callers serialize
// the two.
func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer PeerCacheEntry, target PeerIdentity) error {
	endpoint := strings.TrimRight(strings.TrimSpace(probePeer.Endpoint), "/")
	transport := strings.TrimSpace(probePeer.BestTransport)
	if hasLegacyEndpointScheme(endpoint) || !peerConnectionTargetIsQUIC(transport, endpoint) {
		return fmt.Errorf("non_quic_probe_rejected")
	}

	dialTarget := FabricTransportTarget{
		EndpointID:     probePeer.BestCandidateID,
		PeerID:         target.NodeID,
		Endpoint:       endpoint,
		Transport:      transport,
		Timeout:        m.probeTimeout,
		PeerCertSHA256: strings.TrimSpace(probePeer.BestPeerCertSHA256),
	}
	carrier, selected, err := FabricTransportForTarget(dialTarget, m.quicTransport)
	if err != nil {
		return err
	}
	session, err := carrier.Connect(ctx, selected)
	if err != nil {
		return err
	}
	// The probe only needs the connection to succeed; the session is closed right away.
	return session.Close()
}
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
seen := map[string]struct{}{}
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
add := func(candidateID, endpoint, transport string) {
add := func(candidateID, endpoint, transport, peerCertSHA256 string) {
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
if endpoint == "" {
return
}
if endpointHasUnspecifiedHost(endpoint) {
return
}
key := candidateID + "|" + endpoint
if _, ok := seen[key]; ok {
return
@@ -371,30 +427,43 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
CandidateID: strings.TrimSpace(candidateID),
Endpoint: endpoint,
Transport: strings.TrimSpace(transport),
PeerCertSHA256: strings.TrimSpace(peerCertSHA256),
})
}
for _, candidate := range cacheEntry.EndpointCandidates {
if !candidateUsableForDirectProbe(candidate) {
continue
}
add(candidate.EndpointID, candidate.Address, candidate.Transport)
add(candidate.EndpointID, candidate.Address, candidate.Transport, candidatePeerCertSHA256(candidate))
}
add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, cacheEntry.BestPeerCertSHA256)
return out
}
// peerConnectionTargetIsQUIC reports whether a probe target should be treated
// as QUIC: either its transport is accepted by isQUICOnlyCandidateTransport,
// or its endpoint, trimmed and lower-cased, starts with "quic://".
func peerConnectionTargetIsQUIC(transport string, endpoint string) bool {
	if isQUICOnlyCandidateTransport(transport) {
		return true
	}
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	return strings.HasPrefix(normalized, "quic://")
}
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
endpoint := strings.TrimSpace(candidate.Address)
if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
return false
}
if endpointHasUnspecifiedHost(endpoint) {
return false
}
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
if connectivity == "outbound_only" || connectivity == "relay_required" || reachability == "outbound_only" || reachability == "relay" {
return false
}
return transport == "" || strings.Contains(transport, "direct") || transport == "wss" || strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://")
return transport == "" ||
strings.Contains(transport, "direct_quic") ||
transport == "quic" ||
transport == "lan_quic" ||
transport == "ice_quic" ||
strings.HasPrefix(endpoint, "quic://")
}
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
@@ -2,8 +2,8 @@ package mesh
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
)
@@ -11,12 +11,18 @@ import (
func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
@@ -24,19 +30,20 @@ func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: server.URL,
Transport: "direct_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "private",
ConnectivityMode: "direct",
PolicyTags: []string{"corp-lan", "same-site"},
Priority: 1,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
"node-c": {
{
EndpointID: "node-c-relay",
NodeID: "node-c",
Transport: "relay",
Transport: "relay_quic",
Address: "relay://fabric/node-c",
Reachability: "relay",
ConnectivityMode: "relay_required",
@@ -52,6 +59,7 @@ func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
Local: local,
PeerCache: cache,
Tracker: tracker,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
@@ -116,24 +124,31 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
}.Handler())
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
leases := []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: server.URL,
Transport: "relay_control",
RelayEndpoint: "quic://" + server.Addr().String(),
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
}
cache := NewPeerCache(PeerCacheConfig{
@@ -143,7 +158,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Transport: "relay_quic",
Address: "relay://fabric/node-b",
Reachability: "relay",
ConnectivityMode: "relay_required",
@@ -161,6 +176,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
PeerCache: cache,
Tracker: tracker,
RendezvousLeases: leases,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
@@ -189,15 +205,37 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
}
}
// TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC verifies the
// probe target selection for relay intents: when the relay is the local node
// the peer itself stays the target, and when the relay is another node the
// relay becomes the target.
func TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC(t *testing.T) {
	const localNodeID = "node-a"

	viaLocalRelay := PeerConnectionIntent{
		NodeID:         "node-b",
		RelayCandidate: true,
		RelayNodeID:    localNodeID,
		Transport:      "reverse_quic",
	}
	if got := peerConnectionProbeTargetNodeID(viaLocalRelay, localNodeID); got != "node-b" {
		t.Fatalf("local relay reverse probe target = %q, want peer node-b", got)
	}

	// Same intent, but relayed through a different node.
	viaRemoteRelay := viaLocalRelay
	viaRemoteRelay.RelayNodeID = "node-r"
	if got := peerConnectionProbeTargetNodeID(viaRemoteRelay, localNodeID); got != "node-r" {
		t.Fatalf("remote relay probe target = %q, want relay node-r", got)
	}
}
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
@@ -205,8 +243,8 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
{
EndpointID: "node-b-dead",
NodeID: "node-b",
Transport: "direct_http",
Address: "http://127.0.0.1:1",
Transport: "lan_quic",
Address: "quic://127.0.0.1:1",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
@@ -214,11 +252,12 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
{
EndpointID: "node-b-live",
NodeID: "node-b",
Transport: "direct_http",
Address: server.URL,
Transport: "lan_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 2,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
},
@@ -230,7 +269,7 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: 100 * time.Millisecond,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
@@ -243,7 +282,7 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
t.Fatalf("fallback did not select live candidate: %+v", result)
}
if len(result.CandidateResults) != 2 ||
@@ -252,7 +291,85 @@ func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
}
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != "quic://"+server.Addr().String() {
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
}
}
// TestPeerConnectionManagerSkipsUnspecifiedQUICCandidates gives node-b two
// direct_quic candidates — one on the unspecified IPv6 address "[::]" and one
// pointing at a live local QUIC fabric server — and checks that a probe cycle
// selects the live candidate without ever probing the unspecified one.
func TestPeerConnectionManagerSkipsUnspecifiedQUICCandidates(t *testing.T) {
now := time.Date(2026, 5, 17, 6, 0, 0, 0, time.UTC)
// current is advanced by the manager's Now callback on every call.
current := now
tlsConfig := testQUICTLSConfig(t)
// Port 0 lets the OS choose a free port; the real address is read from server.Addr().
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
// Listed first with Priority 1, but its host is the unspecified address.
{
EndpointID: "node-b-unspecified-v6",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://[::]:19131",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 1,
},
// The only candidate with a dialable address; carries the server's certificate hash in its metadata.
{
EndpointID: "node-b-live",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "public",
ConnectivityMode: "direct",
Priority: 2,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
},
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
// Clock that moves forward 10ms per call.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 || cycle.Succeeded != 1 || len(cycle.Results) != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
t.Fatalf("manager did not skip unspecified endpoint: %+v", result)
}
// Only the live candidate may appear in the probe trail.
if len(result.CandidateResults) != 1 || result.CandidateResults[0].CandidateID != "node-b-live" {
t.Fatalf("unspecified endpoint should not be probed: %+v", result.CandidateResults)
}
}
// peerConnectionProbeMetadata builds candidate or lease metadata for probe
// tests: a JSON object with the single key "peer_cert_sha256" set to
// certSHA256. It fails the test immediately if encoding fails.
func peerConnectionProbeMetadata(t *testing.T, certSHA256 string) json.RawMessage {
	t.Helper()
	metadata := struct {
		PeerCertSHA256 string `json:"peer_cert_sha256"`
	}{PeerCertSHA256: certSHA256}
	encoded, err := json.Marshal(metadata)
	if err != nil {
		t.Fatalf("marshal probe metadata: %v", err)
	}
	return encoded
}
@@ -9,7 +9,7 @@ func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"},
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "quic://node-b:19443"},
},
}, now)
@@ -76,12 +76,12 @@ func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
Entries: []PeerCacheEntry{
{
NodeID: "node-c",
Endpoint: "http://relay:19001",
Endpoint: "quic://relay:19443",
Warm: true,
WarmReason: "rendezvous_lease",
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "http://relay:19001",
RelayEndpoint: "quic://relay:19443",
RelayControl: true,
},
},
@@ -121,7 +121,7 @@ func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
return PeerCacheEntry{
NodeID: nodeID,
Endpoint: "http://" + nodeID + ":19001",
Endpoint: "quic://" + nodeID + ":19443",
Warm: warm,
WarmReason: warmReason,
RecoverySeed: recoverySeed,
@@ -2,42 +2,369 @@ package mesh
import (
"context"
"net/http"
"encoding/json"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// ProductionForwardTransport forwards a production envelope to another node.
type ProductionForwardTransport interface {
// SendProduction delivers envelope to the node identified by nextNodeID and
// returns that hop's forward result, or an error if it could not be sent.
SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error)
}
type HTTPProductionForwardTransport struct {
PeerURLs map[string]string
HTTPClient *http.Client
type QUICProductionForwardTransport struct {
Targets map[string]FabricTransportTarget
RouteSets map[string]FabricRouteSet
Transport FabricTransport
Router FabricChannelRouter
Timeout time.Duration
Pressure *FabricRoutePressureTracker
Health *FabricRouteHealthTracker
sequence atomic.Uint64
}
func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport {
normalized := make(map[string]string, len(peerURLs))
for nodeID, baseURL := range peerURLs {
// QUICProductionForwardTransportSnapshot is the JSON-serializable view
// returned by QUICProductionForwardTransport.Snapshot.
type QUICProductionForwardTransportSnapshot struct {
// RoutePressure is the pressure tracker's snapshot; always emitted in JSON.
RoutePressure FabricRoutePressureSnapshot `json:"route_pressure"`
// RouteHealth is the health tracker's snapshot; tagged omitempty.
RouteHealth FabricRouteHealthSnapshot `json:"route_health,omitempty"`
}
func NewQUICProductionForwardTransport(targets map[string]FabricTransportTarget, transport *QUICFabricTransport) *QUICProductionForwardTransport {
routeSets := make(map[string]FabricRouteSet, len(targets))
for nodeID, target := range targets {
nodeID = strings.TrimSpace(nodeID)
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
if nodeID != "" && baseURL != "" {
normalized[nodeID] = baseURL
target.Endpoint = strings.TrimRight(strings.TrimSpace(target.Endpoint), "/")
target.Transport = strings.TrimSpace(target.Transport)
if nodeID != "" && target.Endpoint != "" {
target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), nodeID)
routeSets[nodeID] = FabricRouteSetForTransportTargets("", "", nodeID, []FabricTransportTarget{target})
}
}
return &HTTPProductionForwardTransport{PeerURLs: normalized}
if transport == nil {
transport = NewQUICFabricTransport(nil)
}
return NewQUICProductionForwardTransportFromRouteSets(routeSets, transport)
}
func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
// NewQUICProductionForwardTransportFromRouteSets builds a forward transport
// from per-node route sets.
//
// Node IDs are trimmed and blank ones are dropped. For each remaining node the
// route set is kept as given, and a transport target is additionally recorded
// when one can be derived from the route set's Primary route. A nil transport
// is replaced with NewQUICFabricTransport(nil). The result gets a channel
// router (2000 ms max ack latency, 50 ms minimum reroute interval), a 30 s
// timeout, and fresh pressure and health trackers.
func NewQUICProductionForwardTransportFromRouteSets(routeSets map[string]FabricRouteSet, transport FabricTransport) *QUICProductionForwardTransport {
	routeSetsByNode := make(map[string]FabricRouteSet, len(routeSets))
	primaryTargets := make(map[string]FabricTransportTarget, len(routeSets))
	for rawNodeID, routeSet := range routeSets {
		nodeID := strings.TrimSpace(rawNodeID)
		if nodeID != "" {
			routeSetsByNode[nodeID] = routeSet
			// A primary route that cannot be converted simply leaves no target entry.
			target, err := FabricTransportTargetForRoute(routeSet.Primary)
			if err == nil {
				primaryTargets[nodeID] = target
			}
		}
	}

	if transport == nil {
		transport = NewQUICFabricTransport(nil)
	}

	router := NewFabricChannelRouter(FabricChannelRouterConfig{
		MaxAckLatencyMs:    2000,
		MinRerouteInterval: 50 * time.Millisecond,
	})
	return &QUICProductionForwardTransport{
		Targets:   primaryTargets,
		RouteSets: routeSetsByNode,
		Transport: transport,
		Router:    router,
		Timeout:   30 * time.Second,
		Pressure:  NewFabricRoutePressureTracker(),
		Health:    NewFabricRouteHealthTracker(30 * time.Second),
	}
}
// SendProduction forwards envelope to nextNodeID over the fabric transport.
//
// The route set registered for the trimmed node ID is used; if none exists, a
// single-target route set is built from t.Targets. ErrForwardPeerUnavailable
// is returned when the receiver or its Transport is nil, or when the node has
// neither a route set nor a target with a non-blank endpoint. The envelope is
// JSON-encoded and sent on a reliable node-targeted channel whose ID is the
// envelope's trimmed MessageID, or "production-<n>" when that is blank. The
// sequence counter advances on every call, whether or not the generated ID is
// used.
func (t *QUICProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	if t == nil || t.Transport == nil {
		return ProductionForwardResult{}, ErrForwardPeerUnavailable
	}

	peerID := strings.TrimSpace(nextNodeID)
	routeSet, known := t.RouteSets[peerID]
	if !known {
		fallback, hasTarget := t.Targets[peerID]
		if !hasTarget || strings.TrimSpace(fallback.Endpoint) == "" {
			return ProductionForwardResult{}, ErrForwardPeerUnavailable
		}
		routeSet = FabricRouteSetForTransportTargets(envelope.ClusterID, envelope.CurrentHopNodeID, peerID, []FabricTransportTarget{fallback})
	}

	channelID := firstNonEmpty(strings.TrimSpace(envelope.MessageID), fmt.Sprintf("production-%d", t.sequence.Add(1)))
	// Prefer a source node recorded on the routes; fall back to the envelope's current hop.
	sourceNodeID := firstNonEmpty(productionRouteSetSourceNodeID(routeSet), envelope.CurrentHopNodeID)
	spec := FabricChannelSpec{
		ChannelID:    channelID,
		ClusterID:    envelope.ClusterID,
		SourceNodeID: sourceNodeID,
		TargetKind:   FabricChannelTargetNode,
		TargetID:     peerID,
		TrafficClass: FabricServiceChannelReliable,
		CreatedAt:    time.Now().UTC(),
	}

	payload, marshalErr := json.Marshal(envelope)
	if marshalErr != nil {
		return ProductionForwardResult{}, marshalErr
	}

	forwarded, sendErr := t.sendProductionWithRouteSet(ctx, spec, routeSet, payload)
	if sendErr != nil {
		// Callers always see a zero result alongside an error.
		return ProductionForwardResult{}, sendErr
	}
	return forwarded, nil
}
// productionRouteSetSourceNodeID returns the trimmed SourceNodeID of the first
// route in the flattened route set that has a non-blank one, or "" when no
// route carries a source node ID.
func productionRouteSetSourceNodeID(routeSet FabricRouteSet) string {
	for _, route := range flattenFabricRouteSet(routeSet) {
		trimmed := strings.TrimSpace(route.SourceNodeID)
		if trimmed == "" {
			continue
		}
		return trimmed
	}
	return ""
}
// sendProductionWithRouteSet opens a router channel for spec over routeSet and
// then, in a loop, connects to the channel's current route and sends payload.
// It returns the decoded response of the first successful send. After a failed
// connect or send it reports the failure to the router and retries only when
// the router answers with a reroute event; otherwise it returns the router's
// error if there is one, else the connect/send error.
//
// NOTE(review): the loop has no explicit ctx check or attempt limit; it ends
// only when the router stops emitting reroute events — confirm the router
// bounds reroutes.
func (t *QUICProductionForwardTransport) sendProductionWithRouteSet(ctx context.Context, spec FabricChannelSpec, routeSet FabricRouteSet, payload []byte) (ProductionForwardResult, error) {
router := t.Router
// A router whose config has MaxRoutePressure == 0 is replaced by a freshly built one.
// NOTE(review): presumably this detects a zero-value Router on a hand-built
// transport; confirm NewFabricChannelRouter fills in MaxRoutePressure,
// otherwise the router is rebuilt on every call.
if router.Config.MaxRoutePressure == 0 {
router = NewFabricChannelRouter(FabricChannelRouterConfig{MaxAckLatencyMs: 2000, MinRerouteInterval: 50 * time.Millisecond})
}
// Apply current health and pressure state before the router picks a route.
routeSet = t.routeSetForScheduling(routeSet)
channel, _, err := router.OpenChannel(spec, routeSet, time.Now().UTC())
if err != nil {
return ProductionForwardResult{}, err
}
// A non-positive Timeout falls back to 30 seconds.
timeout := t.Timeout
if timeout <= 0 {
timeout = 30 * time.Second
}
for {
// Re-apply health and pressure on every attempt.
routeSet = t.routeSetForScheduling(routeSet)
route, ok := findFabricRoute(routeSet, channel.RouteID)
if !ok {
return ProductionForwardResult{}, ErrFabricRouteNotFound
}
target, err := FabricTransportTargetForRoute(route)
if err != nil {
return ProductionForwardResult{}, err
}
// Default the peer ID to the channel's target node when the route did not supply one.
target.PeerID = firstNonEmpty(strings.TrimSpace(target.PeerID), spec.TargetID)
target.MaxPayload = fabricproto.DefaultMaxPayload
// Register the route with the pressure tracker for the duration of the attempt;
// releaseRoute is called on connect failure and after the session is closed.
releaseRoute := t.acquireProductionRoute(route.RouteID)
session, err := t.Transport.Connect(ctx, target)
if err != nil {
releaseRoute()
t.markProductionRouteFailure(route.RouteID, err)
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: spec.ChannelID,
RouteID: route.RouteID,
Failed: true,
Reason: "connect_failed",
ObservedAt: time.Now().UTC(),
}, time.Now().UTC())
channel = updated
// On a reroute event, loop again using the updated channel.
if event.Type == FabricChannelRouteEventReroute {
continue
}
if rerouteErr != nil {
return ProductionForwardResult{}, rerouteErr
}
return ProductionForwardResult{}, err
}
response, ackMs, err := t.sendProductionOnSession(ctx, session, payload, timeout)
// The session is used for a single send; its Close error is discarded.
_ = session.Close()
releaseRoute()
if err == nil {
t.markProductionRouteSuccess(route.RouteID)
// Report the successful exchange to the router; its return values are ignored.
_, _, _ = router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: spec.ChannelID,
RouteID: route.RouteID,
AckLatencyMs: ackMs,
BytesSent: uint64(len(payload)),
FramesSent: 1,
BytesRecv: uint64(len(response.Payload)),
FramesRecv: 1,
ObservedAt: time.Now().UTC(),
}, time.Now().UTC())
return decodeQUICProductionForwardResponse(response.Payload)
}
// Send failed: same handling as a connect failure, with a different reason.
t.markProductionRouteFailure(route.RouteID, err)
updated, event, rerouteErr := router.ObserveChannel(channel, routeSet, FabricChannelObservation{
ChannelID: spec.ChannelID,
RouteID: route.RouteID,
Failed: true,
Reason: "response_failed",
ObservedAt: time.Now().UTC(),
}, time.Now().UTC())
channel = updated
if event.Type == FabricChannelRouteEventReroute {
continue
}
if rerouteErr != nil {
return ProductionForwardResult{}, rerouteErr
}
return ProductionForwardResult{}, err
}
}
// routeSetWithActiveChannels returns the result of applying the transport's
// Pressure tracker to routeSet. With a nil receiver or no Pressure tracker the
// route set is returned unchanged.
func (t *QUICProductionForwardTransport) routeSetWithActiveChannels(routeSet FabricRouteSet) FabricRouteSet {
	if t != nil && t.Pressure != nil {
		return t.Pressure.Apply(routeSet)
	}
	return routeSet
}
// routeSetForScheduling prepares routeSet for route selection: when a Health
// tracker is configured it is applied first (at the current UTC time), and the
// result is then passed through routeSetWithActiveChannels.
func (t *QUICProductionForwardTransport) routeSetForScheduling(routeSet FabricRouteSet) FabricRouteSet {
	if t == nil || t.Health == nil {
		return t.routeSetWithActiveChannels(routeSet)
	}
	withHealth := t.Health.Apply(routeSet, time.Now().UTC())
	return t.routeSetWithActiveChannels(withHealth)
}
// acquireProductionRoute acquires routeID on the Pressure tracker and returns
// the release function the tracker hands back. With a nil receiver or no
// Pressure tracker it returns a no-op release function.
func (t *QUICProductionForwardTransport) acquireProductionRoute(routeID string) func() {
	if t != nil && t.Pressure != nil {
		return t.Pressure.Acquire(routeID)
	}
	noop := func() {}
	return noop
}
// markProductionRouteFailure reports err for routeID to the Health tracker,
// stamped with the current UTC time. It does nothing when the receiver, the
// Health tracker, or err is nil.
func (t *QUICProductionForwardTransport) markProductionRouteFailure(routeID string, err error) {
	if t != nil && t.Health != nil && err != nil {
		t.Health.MarkFailure(routeID, err.Error(), time.Now().UTC())
	}
}
// markProductionRouteSuccess reports a success for routeID to the Health
// tracker. It does nothing when the receiver or the Health tracker is nil.
func (t *QUICProductionForwardTransport) markProductionRouteSuccess(routeID string) {
	if t != nil && t.Health != nil {
		t.Health.MarkSuccess(routeID)
	}
}
func (t *QUICProductionForwardTransport) Snapshot() QUICProductionForwardTransportSnapshot {
if t == nil {
return ProductionForwardResult{}, ErrForwardPeerUnavailable
return QUICProductionForwardTransportSnapshot{}
}
baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
if baseURL == "" {
return ProductionForwardResult{}, ErrForwardPeerUnavailable
var pressure FabricRoutePressureSnapshot
if t.Pressure != nil {
pressure = t.Pressure.SnapshotPressure()
}
client := NewClient(baseURL)
if t.HTTPClient != nil {
client.HTTPClient = t.HTTPClient
var health FabricRouteHealthSnapshot
if t.Health != nil {
health = t.Health.Snapshot(time.Now().UTC())
}
return client.SendProduction(ctx, envelope)
return QUICProductionForwardTransportSnapshot{RoutePressure: pressure, RouteHealth: health}
}
// sendProductionOnSession sends payload as a reliable data frame on the
// production-forward stream and waits for the data frame on that stream that
// carries the same sequence number.
//
// It returns the matching frame and the elapsed milliseconds between the send
// completing and the frame arriving. When timeout is positive the wait is
// bounded by it in addition to ctx. A closed error or frame channel yields
// ErrForwardPeerUnavailable; a non-nil session error is returned as-is; nil
// errors and non-matching frames are skipped.
func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Context, session FabricTransportSession, payload []byte, timeout time.Duration) (fabricproto.Frame, int64, error) {
	seq := t.sequence.Add(1)
	request := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     ProductionForwardQUICStreamID,
		Sequence:     seq,
		Payload:      payload,
	}
	if sendErr := session.Send(ctx, request); sendErr != nil {
		return fabricproto.Frame{}, 0, sendErr
	}

	waitCtx := ctx
	if timeout > 0 {
		bounded, cancel := context.WithTimeout(ctx, timeout)
		defer cancel()
		waitCtx = bounded
	}

	// The latency clock starts only after the request has been handed to the session.
	begin := time.Now()
	for {
		select {
		case <-waitCtx.Done():
			return fabricproto.Frame{}, 0, waitCtx.Err()
		case sessionErr, open := <-session.Errors():
			switch {
			case !open:
				return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
			case sessionErr != nil:
				return fabricproto.Frame{}, 0, sessionErr
			}
		case reply, open := <-session.Frames():
			if !open {
				return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
			}
			isReply := reply.Type == fabricproto.FrameData &&
				reply.StreamID == ProductionForwardQUICStreamID &&
				reply.Sequence == seq
			if isReply {
				return reply, time.Since(begin).Milliseconds(), nil
			}
		}
	}
}
// decodeQUICProductionForwardResponse parses payload as a JSON
// quicProductionForwardResponse. A JSON error is returned unchanged; a
// response whose Error field is non-blank is reported as an error wrapping
// ErrForwardPeerUnavailable; otherwise the embedded Result is returned.
func decodeQUICProductionForwardResponse(payload []byte) (ProductionForwardResult, error) {
	var decoded quicProductionForwardResponse
	err := json.Unmarshal(payload, &decoded)
	switch {
	case err != nil:
		return ProductionForwardResult{}, err
	case strings.TrimSpace(decoded.Error) != "":
		return ProductionForwardResult{}, fmt.Errorf("%w: %s", ErrForwardPeerUnavailable, decoded.Error)
	default:
		return decoded.Result, nil
	}
}
// FabricRouteSetForTransportTargets builds a node-targeted FabricRouteSet from
// a list of transport targets.
//
// Each target with a non-empty endpoint (after trimming whitespace and trailing
// slashes) becomes a single-hop route. The route ID is the target's EndpointID,
// or "<peer>-quic-<index>" when that is blank, where index is the target's
// position in targets. The first resulting route becomes Primary and the rest
// are appended to WarmStandby. If no target is usable, only TargetKind and
// TargetID are set.
func FabricRouteSetForTransportTargets(clusterID string, sourceNodeID string, targetNodeID string, targets []FabricTransportTarget) FabricRouteSet {
	result := FabricRouteSet{TargetKind: FabricChannelTargetNode, TargetID: strings.TrimSpace(targetNodeID)}
	candidates := make([]FabricRoute, 0, len(targets))
	for position := range targets {
		target := targets[position]
		target.Endpoint = strings.TrimRight(strings.TrimSpace(target.Endpoint), "/")
		if strings.TrimSpace(target.Endpoint) == "" {
			continue
		}

		peer := firstNonEmpty(strings.TrimSpace(target.PeerID), strings.TrimSpace(targetNodeID))
		id := strings.TrimSpace(target.EndpointID)
		if id == "" {
			id = fmt.Sprintf("%s-quic-%d", peer, position)
		}

		hop := FabricRouteHop{
			NodeID:         peer,
			Mode:           fabricRouteModeForTransportTarget(target),
			EndpointID:     strings.TrimSpace(target.EndpointID),
			Address:        target.Endpoint,
			PeerCertSHA256: strings.TrimSpace(target.PeerCertSHA256),
		}
		candidates = append(candidates, FabricRoute{
			RouteID:           id,
			ClusterID:         strings.TrimSpace(clusterID),
			SourceNodeID:      strings.TrimSpace(sourceNodeID),
			DestinationNodeID: peer,
			Hops:              []FabricRouteHop{hop},
			BaseLatencyMs:     routeLatencyForIndex(position),
			Capacity:          100,
			ActiveChannels:    0,
			Healthy:           true,
			LastUpdatedAt:     time.Now().UTC(),
		})
	}

	if len(candidates) == 0 {
		return result
	}
	result.Primary = candidates[0]
	if len(candidates) > 1 {
		result.WarmStandby = append(result.WarmStandby, candidates[1:]...)
	}
	return result
}
// fabricRouteModeForTransportTarget maps the target's Transport string
// (trimmed, case-insensitive) onto the LAN, reverse, relay or ICE route mode
// when it equals that mode's string value, and falls back to FabricRouteDirect
// for anything else.
func fabricRouteModeForTransportTarget(target FabricTransportTarget) FabricRouteMode {
	transport := strings.ToLower(strings.TrimSpace(target.Transport))
	recognized := [...]FabricRouteMode{FabricRouteLAN, FabricRouteReverse, FabricRouteRelay, FabricRouteICE}
	for _, mode := range recognized {
		if transport == string(mode) {
			return mode
		}
	}
	return FabricRouteDirect
}
// routeLatencyForIndex returns the base latency assigned to the route at the
// given position: 10 for index zero or below, and 10 plus the index otherwise.
func routeLatencyForIndex(index int) int {
	const floor = 10
	if index > 0 {
		return floor + index
	}
	return floor
}
@@ -0,0 +1,339 @@
package mesh
import (
"context"
"encoding/json"
"sync"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestQUICProductionForwardTransportReroutesOnConnectFailure sends one message
// through a route set whose first endpoint refuses connections and expects
// delivery via the second endpoint, one connect attempt per endpoint, and two
// acquisitions plus two releases in the route pressure snapshot.
func TestQUICProductionForwardTransportReroutesOnConnectFailure(t *testing.T) {
	const (
		deadEndpoint = "quic://dead.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)

	fabric := newFakeProductionForwardFabricTransport()
	fabric.failConnect[deadEndpoint] = true
	fabric.results[fastEndpoint] = ProductionForwardResult{
		Delivered: true,
		MessageID: "message-1",
		RouteID:   "route-1",
	}

	targets := []FabricTransportTarget{
		{EndpointID: "dead", PeerID: "node-b", Endpoint: deadEndpoint, Transport: "quic"},
		{EndpointID: "fast", PeerID: "node-b", Endpoint: fastEndpoint, Transport: "quic"},
	}
	routeSets := map[string]FabricRouteSet{
		"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", targets),
	}
	forward := NewQUICProductionForwardTransportFromRouteSets(routeSets, fabric)
	forward.Timeout = time.Second

	result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
	if err != nil {
		t.Fatalf("send production: %v", err)
	}
	if delivered := result.Delivered && result.MessageID == "message-1"; !delivered {
		t.Fatalf("result = %+v", result)
	}
	if attempts := fabric.connectCount(deadEndpoint); attempts != 1 {
		t.Fatalf("dead connect count = %d, want 1", attempts)
	}
	if attempts := fabric.connectCount(fastEndpoint); attempts != 1 {
		t.Fatalf("fast connect count = %d, want 1", attempts)
	}

	snapshot := forward.Snapshot()
	balanced := snapshot.RoutePressure.AcquiredTotal == 2 &&
		snapshot.RoutePressure.ReleasedTotal == 2 &&
		snapshot.RoutePressure.MaxActiveTotal != 0
	if !balanced {
		t.Fatalf("route pressure snapshot = %+v", snapshot)
	}
}
// TestQUICProductionForwardTransportQuarantinesFailedRoute sends two messages
// through a route set whose first endpoint refuses connections. It expects the
// failing endpoint to be dialed only once, the working endpoint to serve both
// sends, and the health snapshot to list the "dead" route with one failure.
func TestQUICProductionForwardTransportQuarantinesFailedRoute(t *testing.T) {
	const (
		deadEndpoint = "quic://dead.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)

	fabric := newFakeProductionForwardFabricTransport()
	fabric.failConnect[deadEndpoint] = true
	fabric.results[fastEndpoint] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}

	targets := []FabricTransportTarget{
		{EndpointID: "dead", PeerID: "node-b", Endpoint: deadEndpoint, Transport: "quic"},
		{EndpointID: "fast", PeerID: "node-b", Endpoint: fastEndpoint, Transport: "quic"},
	}
	routeSets := map[string]FabricRouteSet{
		"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", targets),
	}
	forward := NewQUICProductionForwardTransportFromRouteSets(routeSets, fabric)
	forward.Timeout = time.Second

	for attempt := 1; attempt <= 2; attempt++ {
		result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
		if err != nil {
			t.Fatalf("send production #%d: %v", attempt, err)
		}
		if !result.Delivered {
			t.Fatalf("result #%d = %+v", attempt, result)
		}
	}

	if dials := fabric.connectCount(deadEndpoint); dials != 1 {
		t.Fatalf("dead connect count = %d, want quarantine after first failure", dials)
	}
	if dials := fabric.connectCount(fastEndpoint); dials != 2 {
		t.Fatalf("fast connect count = %d, want both sends on healthy route", dials)
	}

	snapshot := forward.Snapshot()
	if failures := snapshot.RouteHealth.Quarantined["dead"].Failures; failures != 1 {
		t.Fatalf("route health snapshot = %+v, want dead route quarantined", snapshot.RouteHealth)
	}
}
func TestFabricRouteHealthTrackerExpiresQuarantine(t *testing.T) {
routeSet := FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
{EndpointID: "dead", PeerID: "node-b", Endpoint: "quic://dead.example.test:19443", Transport: "quic"},
{EndpointID: "fast", PeerID: "node-b", Endpoint: "quic://fast.example.test:19443", Transport: "quic"},
})
tracker := NewFabricRouteHealthTracker(time.Second)
now := time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC)
tracker.MarkFailure("dead", "connect failed", now)
applied := tracker.Apply(routeSet, now.Add(500*time.Millisecond))
if applied.Primary.Healthy || !applied.Primary.Degraded {
t.Fatalf("primary after quarantine = %+v, want unhealthy degraded route", applied.Primary)
}
if len(tracker.Snapshot(now.Add(500*time.Millisecond)).Quarantined) != 1 {
t.Fatalf("route health snapshot = %+v, want one quarantined route", tracker.Snapshot(now.Add(500*time.Millisecond)))
}
applied = tracker.Apply(routeSet, now.Add(2*time.Second))
if !applied.Primary.Healthy || applied.Primary.Degraded {
t.Fatalf("primary after ttl = %+v, want route restored", applied.Primary)
}
if snapshot := tracker.Snapshot(now.Add(2 * time.Second)); len(snapshot.Quarantined) != 0 {
t.Fatalf("route health snapshot after ttl = %+v, want empty quarantine", snapshot)
}
}
// TestQUICProductionForwardTransportReroutesOnResponseTimeout configures the
// first endpoint to answer after 100ms while the transport timeout is 10ms,
// and expects the send to succeed with exactly one connect to each endpoint.
func TestQUICProductionForwardTransportReroutesOnResponseTimeout(t *testing.T) {
	const (
		slowEndpoint = "quic://slow.example.test:19443"
		fastEndpoint = "quic://fast.example.test:19443"
	)

	fabric := newFakeProductionForwardFabricTransport()
	fabric.delays[slowEndpoint] = 100 * time.Millisecond
	fabric.results[slowEndpoint] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
	fabric.results[fastEndpoint] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}

	targets := []FabricTransportTarget{
		{EndpointID: "slow", PeerID: "node-b", Endpoint: slowEndpoint, Transport: "quic"},
		{EndpointID: "fast", PeerID: "node-b", Endpoint: fastEndpoint, Transport: "quic"},
	}
	routeSets := map[string]FabricRouteSet{
		"node-b": FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", targets),
	}
	forward := NewQUICProductionForwardTransportFromRouteSets(routeSets, fabric)
	forward.Timeout = 10 * time.Millisecond

	result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
	if err != nil {
		t.Fatalf("send production: %v", err)
	}
	if delivered := result.Delivered && result.MessageID == "message-1"; !delivered {
		t.Fatalf("result = %+v", result)
	}
	if dials := fabric.connectCount(slowEndpoint); dials != 1 {
		t.Fatalf("slow connect count = %d, want 1", dials)
	}
	if dials := fabric.connectCount(fastEndpoint); dials != 1 {
		t.Fatalf("fast connect count = %d, want 1", dials)
	}
}
// TestQUICProductionForwardTransportSchedulesWithRouteSetSourceForForwardedEnvelope
// sends an envelope whose SourceNodeID ("node-a") differs from the route set's
// source node ("node-b") and expects it to be delivered to node-c with a
// single connect.
func TestQUICProductionForwardTransportSchedulesWithRouteSetSourceForForwardedEnvelope(t *testing.T) {
	const nodeCEndpoint = "quic://node-c.example.test:19443"

	fabric := newFakeProductionForwardFabricTransport()
	fabric.results[nodeCEndpoint] = ProductionForwardResult{Delivered: true, MessageID: "message-forwarded"}

	targets := []FabricTransportTarget{
		{EndpointID: "node-c-direct", PeerID: "node-c", Endpoint: nodeCEndpoint, Transport: "quic"},
	}
	routeSets := map[string]FabricRouteSet{
		"node-c": FabricRouteSetForTransportTargets("cluster-a", "node-b", "node-c", targets),
	}
	forward := NewQUICProductionForwardTransportFromRouteSets(routeSets, fabric)
	forward.Timeout = time.Second

	forwarded := testProductionForwardEnvelope("message-forwarded")
	forwarded.ClusterID = "cluster-a"
	forwarded.SourceNodeID = "node-a"
	forwarded.DestinationNodeID = "node-c"
	forwarded.CurrentHopNodeID = "node-c"
	forwarded.NextHopNodeID = "node-c"

	result, err := forward.SendProduction(context.Background(), "node-c", forwarded)
	if err != nil {
		t.Fatalf("send production: %v", err)
	}
	if delivered := result.Delivered && result.MessageID == "message-forwarded"; !delivered {
		t.Fatalf("result = %+v", result)
	}
	if dials := fabric.connectCount(nodeCEndpoint); dials != 1 {
		t.Fatalf("connect count = %d, want 1", dials)
	}
}
// TestQUICProductionForwardTransportSpreadsConcurrentChannelsByActivePressure
// starts one send in the background on a route whose response is delayed, waits
// until that route has been connected, and then issues a second send. It
// expects the second send to connect to route-b and the pressure snapshot to
// record a maximum of one active channel on each route with two acquisitions.
func TestQUICProductionForwardTransportSpreadsConcurrentChannelsByActivePressure(t *testing.T) {
transport := newFakeProductionForwardFabricTransport()
// route-a answers after 80ms, keeping the first send in flight while the second one runs.
transport.delays["quic://route-a.example.test:19443"] = 80 * time.Millisecond
transport.results["quic://route-a.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-1"}
transport.results["quic://route-b.example.test:19443"] = ProductionForwardResult{Delivered: true, MessageID: "message-2"}
routeSet := FabricRouteSetForTransportTargets("cluster-a", "node-a", "node-b", []FabricTransportTarget{
{EndpointID: "route-a", PeerID: "node-b", Endpoint: "quic://route-a.example.test:19443", Transport: "quic"},
{EndpointID: "route-b", PeerID: "node-b", Endpoint: "quic://route-b.example.test:19443", Transport: "quic"},
})
// Both routes are given the same capacity — presumably so that only
// active-channel pressure distinguishes them; confirm against the scheduler.
routeSet.Primary.Capacity = 100
routeSet.WarmStandby[0].Capacity = 100
forward := NewQUICProductionForwardTransportFromRouteSets(map[string]FabricRouteSet{"node-b": routeSet}, transport)
forward.Timeout = time.Second
// Buffered so the background goroutine can finish even if the test has already failed.
firstDone := make(chan error, 1)
go func() {
_, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-1"))
firstDone <- err
}()
// Do not start the second send until the first one has connected to route-a.
transport.waitForConnect(t, "quic://route-a.example.test:19443", 1)
result, err := forward.SendProduction(context.Background(), "node-b", testProductionForwardEnvelope("message-2"))
if err != nil {
t.Fatalf("second send production: %v", err)
}
if !result.Delivered || result.MessageID != "message-2" {
t.Fatalf("second result = %+v", result)
}
if got := transport.connectCount("quic://route-b.example.test:19443"); got != 1 {
t.Fatalf("route-b connect count = %d, want 1", got)
}
// Wait for the background send before inspecting the pressure snapshot.
if err := <-firstDone; err != nil {
t.Fatalf("first send production: %v", err)
}
snapshot := forward.Snapshot()
if snapshot.RoutePressure.MaxActive["route-a"] != 1 || snapshot.RoutePressure.MaxActive["route-b"] != 1 || snapshot.RoutePressure.AcquiredTotal != 2 {
t.Fatalf("route pressure snapshot = %+v", snapshot)
}
}
// fakeProductionForwardFabricTransport is a test double for the fabric
// transport whose behaviour is configured per endpoint string.
type fakeProductionForwardFabricTransport struct {
// mu is held by Connect, connectCount and waitForConnect while they access the maps below.
mu sync.Mutex
// failConnect marks endpoints for which Connect returns ErrForwardPeerUnavailable.
failConnect map[string]bool
// delays is the per-endpoint delay handed to the sessions Connect creates.
delays map[string]time.Duration
// results is the per-endpoint result handed to the sessions Connect creates.
results map[string]ProductionForwardResult
// connects counts Connect calls per endpoint, including failed ones.
connects map[string]int
}
// newFakeProductionForwardFabricTransport returns a fake transport with all of
// its per-endpoint maps allocated and empty.
func newFakeProductionForwardFabricTransport() *fakeProductionForwardFabricTransport {
	fake := new(fakeProductionForwardFabricTransport)
	fake.failConnect = make(map[string]bool)
	fake.delays = make(map[string]time.Duration)
	fake.results = make(map[string]ProductionForwardResult)
	fake.connects = make(map[string]int)
	return fake
}
// Connect records a connect attempt for target.Endpoint and either fails with
// ErrForwardPeerUnavailable (when the endpoint is flagged in failConnect) or
// returns a fake session carrying the endpoint's configured delay and result.
func (t *fakeProductionForwardFabricTransport) Connect(_ context.Context, target FabricTransportTarget) (FabricTransportSession, error) {
	endpoint := target.Endpoint

	t.mu.Lock()
	t.connects[endpoint]++
	shouldFail, delay, result := t.failConnect[endpoint], t.delays[endpoint], t.results[endpoint]
	t.mu.Unlock()

	if shouldFail {
		return nil, ErrForwardPeerUnavailable
	}
	session := &fakeProductionForwardFabricSession{
		delay:  delay,
		result: result,
		frames: make(chan fabricproto.Frame, 16),
		errors: make(chan error, 1),
		done:   make(chan struct{}),
	}
	return session, nil
}
// Close is a no-op on the fake transport and always returns nil.
func (t *fakeProductionForwardFabricTransport) Close() error {
return nil
}
// connectCount reports how many times Connect has been called for endpoint.
func (t *fakeProductionForwardFabricTransport) connectCount(endpoint string) int {
	t.mu.Lock()
	count := t.connects[endpoint]
	t.mu.Unlock()
	return count
}
// waitForConnect polls once per millisecond until endpoint has been connected
// at least count times, failing the test if that has not happened within one
// second.
func (t *fakeProductionForwardFabricTransport) waitForConnect(tb testing.TB, endpoint string, count int) {
	tb.Helper()
	giveUpAt := time.Now().Add(time.Second)
	for {
		seen := t.connectCount(endpoint)
		if seen >= count {
			return
		}
		if time.Now().After(giveUpAt) {
			tb.Fatalf("timed out waiting for %s connect count %d, got %d", endpoint, count, seen)
		}
		time.Sleep(time.Millisecond)
	}
}
// fakeProductionForwardFabricSession is the session handed out by the fake
// transport's Connect; it answers each data frame sent on it with a canned
// response.
type fakeProductionForwardFabricSession struct {
// delay is how long Send's background goroutine sleeps before replying.
delay time.Duration
// result is JSON-encoded into the payload of every response frame.
result ProductionForwardResult
// frames is the channel returned by Frames; Send delivers responses on it.
frames chan fabricproto.Frame
// errors is the channel returned by Errors.
errors chan error
// done is closed by Close; pending responses are dropped once it is closed.
done chan struct{}
// once makes Close safe to call more than once.
once sync.Once
}
// Send ignores non-data frames. For a data frame it schedules, on a background
// goroutine and after the configured delay, a response data frame that echoes
// the request's traffic class, stream ID and sequence and carries the session's
// result as JSON. The response is dropped if the session is closed first.
func (s *fakeProductionForwardFabricSession) Send(_ context.Context, frame fabricproto.Frame) error {
	if frame.Type != fabricproto.FrameData {
		return nil
	}
	responsePayload, _ := json.Marshal(quicProductionForwardResponse{Result: s.result})
	reply := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: frame.TrafficClass,
		StreamID:     frame.StreamID,
		Sequence:     frame.Sequence,
		Payload:      responsePayload,
	}
	deliver := func() {
		if s.delay > 0 {
			time.Sleep(s.delay)
		}
		select {
		case <-s.done:
		case s.frames <- reply:
		}
	}
	go deliver()
	return nil
}
// Frames returns the channel on which Send delivers the fake response frames.
func (s *fakeProductionForwardFabricSession) Frames() <-chan fabricproto.Frame {
return s.frames
}
// Errors returns the session's error channel.
func (s *fakeProductionForwardFabricSession) Errors() <-chan error {
return s.errors
}
// Close closes the session's done channel exactly once, however many times it
// is called, and always returns nil.
func (s *fakeProductionForwardFabricSession) Close() error {
	signalDone := func() { close(s.done) }
	s.once.Do(signalDone)
	return nil
}
// Closed reports, without blocking, whether the done channel has been closed.
func (s *fakeProductionForwardFabricSession) Closed() bool {
	select {
	case <-s.done:
		return true
	default:
	}
	return false
}
// testProductionForwardEnvelope builds a fabric-control envelope from node-a to
// node-b in cluster-a on route-1 with the given message ID, a TTL of 8, and an
// expiry one minute after its creation time.
func testProductionForwardEnvelope(messageID string) ProductionEnvelope {
	createdAt := time.Now().UTC()

	var envelope ProductionEnvelope
	envelope.FabricProtocolVersion = ProtocolVersion
	envelope.MessageID = messageID
	envelope.RouteID = "route-1"
	envelope.ClusterID = "cluster-a"
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = "node-b"
	envelope.CurrentHopNodeID = "node-a"
	envelope.NextHopNodeID = "node-b"
	envelope.ChannelClass = ProductionChannelFabricControl
	envelope.MessageType = ProductionMessageFabricControl
	envelope.TTL = 8
	envelope.CreatedAt = createdAt
	envelope.ExpiresAt = createdAt.Add(time.Minute)
	return envelope
}
@@ -106,6 +106,9 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
}
if hasLegacyEndpointScheme(endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint")
}
}
for nodeID, candidates := range cfg.PeerEndpointCandidates {
if strings.TrimSpace(nodeID) == "" {
@@ -121,6 +124,9 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
strings.TrimSpace(candidate.ConnectivityMode) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
}
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasLegacyEndpointScheme(candidate.Address) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint candidate")
}
}
}
for endpointID, observation := range cfg.PeerEndpointObservations {
@@ -179,6 +185,14 @@ func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) err
return nil
}
// hasLegacyEndpointScheme reports whether endpoint, after trimming whitespace
// and lower-casing, starts with an http, https, ws or wss URL scheme.
func hasLegacyEndpointScheme(endpoint string) bool {
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	for _, scheme := range [...]string{"http://", "https://", "ws://", "wss://"} {
		if strings.HasPrefix(normalized, scheme) {
			return true
		}
	}
	return false
}
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
if len(seeds) > 20 {
return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds")
@@ -191,6 +205,9 @@ func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
strings.TrimSpace(seed.Transport) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
}
if !isQUICOnlyCandidateTransport(seed.Transport) || hasLegacyEndpointScheme(seed.Endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC recovery seed")
}
if _, duplicate := seen[key]; duplicate {
return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed")
}
@@ -224,6 +241,9 @@ func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRo
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
}
if !isQUICOnlyCandidateTransport(lease.Transport) || hasLegacyEndpointScheme(lease.RelayEndpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC rendezvous lease")
}
if _, duplicate := seen[lease.LeaseID]; duplicate {
return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease")
}
@@ -18,14 +18,14 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
ConfigVersion: "config-v1",
PeerDirectoryVersion: "peers-v1",
PolicyVersion: "policy-v1",
PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"},
PeerEndpoints: map[string]string{"node-b": "quic://127.0.0.1:19443"},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
@@ -55,8 +55,8 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
RecoverySeeds: []PeerRecoverySeed{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
Transport: "direct_tcp_tls",
Endpoint: "quic://node-b.example.test:19443",
Transport: "direct_quic",
ConnectivityMode: "direct",
Priority: 10,
},
@@ -66,8 +66,8 @@ func TestLoadScopedSyntheticConfig(t *testing.T) {
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r:19443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
RouteIDs: []string{"route-a-b"},
AllowedChannels: []string{"fabric_control", "route_control"},
@@ -158,8 +158,8 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
{
EndpointID: "node-b-public",
NodeID: "node-c",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Transport: "direct_quic",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
ConnectivityMode: "direct",
},
@@ -174,6 +174,73 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpoints: map[string]string{"node-b": "https://node-b.example.test:443"},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC peer endpoint error")
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-websocket",
NodeID: "node-b",
Transport: "websocket",
Address: "quic://203.0.113.20:19443",
Reachability: "public",
ConnectivityMode: "direct",
},
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC peer endpoint candidate error")
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateScheme(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-https",
NodeID: "node-b",
Transport: "direct_quic",
Address: "https://node-b.example.test:443",
Reachability: "public",
ConnectivityMode: "direct",
},
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC peer endpoint candidate error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointObservation(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
@@ -217,7 +284,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"},
{NodeID: "node-b", Endpoint: "", Transport: "direct_quic"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
@@ -228,6 +295,23 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-b", Endpoint: "https://node-b.example.test:443", Transport: "direct_quic"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC recovery seed error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
@@ -238,8 +322,8 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RelayEndpoint: "quic://node-r:19443",
Transport: "relay_quic",
RouteIDs: []string{"route-a-b"},
ExpiresAt: time.Now().UTC().Add(time.Hour),
},
@@ -253,6 +337,36 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "https://node-r.example.test:443",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
RouteIDs: []string{"route-a-b"},
AllowedChannels: []string{"fabric_control", "route_control"},
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: time.Now().UTC().Add(-time.Minute),
ExpiresAt: time.Now().UTC().Add(time.Hour),
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected non-QUIC rendezvous lease error")
}
}
func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
t.Helper()
payload, err := json.Marshal(cfg)
@@ -265,3 +379,32 @@ func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
}
return path
}
// liveSyntheticRoute returns a cluster-1 fabric-control route over hops that
// expires one hour from now. The first hop is the source node and the last hop
// is the destination; hops must be non-empty.
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]
	expiry := time.Now().UTC().Add(time.Hour)

	var route SyntheticRoute
	route.RouteID = routeID
	route.ClusterID = "cluster-1"
	route.SourceNodeID = source
	route.DestinationNodeID = destination
	route.Hops = hops
	route.AllowedChannels = []string{SyntheticChannelFabricControl}
	route.MaxTTL = 8
	route.MaxHops = 8
	route.ExpiresAt = expiry
	route.RouteVersion = "route-v1"
	route.PolicyVersion = "policy-v1"
	route.PeerDirectoryVersion = "peers-v1"
	return route
}
// sameStrings reports whether left and right have the same length and equal
// elements at every index. Nil and empty slices compare equal.
func sameStrings(left, right []string) bool {
	mismatch := len(left) != len(right)
	for index := 0; !mismatch && index < len(left); index++ {
		mismatch = left[index] != right[index]
	}
	return !mismatch
}

Some files were not shown because too many files have changed in this diff Show More