This commit is contained in:
2026-05-18 21:33:39 +03:00
parent 5096155d83
commit 469fa0e860
94 changed files with 8761 additions and 8003 deletions
@@ -140,15 +140,12 @@ func run(ctx context.Context) (smokeReport, error) {
return smokeReport{}, fmt.Errorf("test service: %w", err) return smokeReport{}, fmt.Errorf("test service: %w", err)
} }
fabricSessionStartedAt := time.Now() fabricSessionStartedAt := time.Now()
fabricSession, _, err := mesh.NewClient(nodeB.URL).OpenFabricSession(ctx, mesh.FabricSessionDialOptions{ fabricSession, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
Token: "rap_fsn_mesh_live_smoke",
Timeout: 3 * time.Second,
})
if err != nil { if err != nil {
return smokeReport{}, fmt.Errorf("fabric session open: %w", err) return smokeReport{}, fmt.Errorf("fabric quic session open: %w", err)
} }
defer fabricSession.Close() defer fabricSession.Close()
firstFabricSessionResponse, err := fabricSession.RoundTrip(ctx, fabricproto.Frame{ firstFabricSessionResponse, err := smokeFabricSessionRoundTrip(ctx, fabricSession, fabricproto.Frame{
Type: fabricproto.FramePing, Type: fabricproto.FramePing,
Sequence: uint64(fabricSessionStartedAt.UnixNano()), Sequence: uint64(fabricSessionStartedAt.UnixNano()),
Payload: []byte("mesh-live-smoke-fabric-session"), Payload: []byte("mesh-live-smoke-fabric-session"),
@@ -156,7 +153,7 @@ func run(ctx context.Context) (smokeReport, error) {
if err != nil { if err != nil {
return smokeReport{}, fmt.Errorf("fabric session first round trip: %w", err) return smokeReport{}, fmt.Errorf("fabric session first round trip: %w", err)
} }
secondFabricSessionResponse, err := fabricSession.RoundTrip(ctx, fabricproto.Frame{ secondFabricSessionResponse, err := smokeFabricSessionRoundTrip(ctx, fabricSession, fabricproto.Frame{
Type: fabricproto.FramePing, Type: fabricproto.FramePing,
Sequence: uint64(fabricSessionStartedAt.UnixNano()) + 1, Sequence: uint64(fabricSessionStartedAt.UnixNano()) + 1,
Payload: []byte("mesh-live-smoke-fabric-session-2"), Payload: []byte("mesh-live-smoke-fabric-session-2"),
@@ -175,13 +172,9 @@ func run(ctx context.Context) (smokeReport, error) {
} }
fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow, fabricVPNPressureLevel, fabricVPNPressureScore, fabricVPNPressureReasons, fabricVPNPressureAction := smokeVPNFlowSchedulerBulkPressure() fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow, fabricVPNPressureLevel, fabricVPNPressureScore, fabricVPNPressureReasons, fabricVPNPressureAction := smokeVPNFlowSchedulerBulkPressure()
fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS, fabricVPNRecoveryReason := smokeVPNFlowSchedulerRouteRecovery() fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS, fabricVPNRecoveryReason := smokeVPNFlowSchedulerRouteRecovery()
fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
if err != nil {
return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err)
}
return smokeReport{ return smokeReport{
Stage: "C17F scoped synthetic config plus live HTTP transport", Stage: "C17F scoped synthetic config plus live QUIC fabric transport",
ProductionForwarding: false, ProductionForwarding: false,
ScopedConfigLoaded: nodeAConfig.ConfigVersion == "smoke-config-v1", ScopedConfigLoaded: nodeAConfig.ConfigVersion == "smoke-config-v1",
DirectProbeAccepted: directAck.MessageType == mesh.SyntheticMessageProbeAck, DirectProbeAccepted: directAck.MessageType == mesh.SyntheticMessageProbeAck,
@@ -210,11 +203,11 @@ func run(ctx context.Context) (smokeReport, error) {
FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS, FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS,
FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS, FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS,
FabricVPNRecoveryReason: fabricVPNRecoveryReason, FabricVPNRecoveryReason: fabricVPNRecoveryReason,
FabricQUICAccepted: fabricQUICAccepted, FabricQUICAccepted: fabricSessionAccepted,
FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICEndpoint: fabricQUICEndpoint,
FabricQUICPressure: fabricQUICPressure, FabricQUICPressure: fabricQUICPressure,
FabricSessionLatencyMS: fabricSessionLatency.Milliseconds(), FabricSessionLatencyMS: fabricSessionLatency.Milliseconds(),
FabricSessionEndpoint: nodeB.URL + "/mesh/v1/fabric/session/ws", FabricSessionEndpoint: "quic://" + fabricQUICEndpoint,
PeerEndpoints: map[string]any{ PeerEndpoints: map[string]any{
"node-a": nodeA.URL, "node-a": nodeA.URL,
"node-r": nodeR.URL, "node-r": nodeR.URL,
@@ -269,18 +262,16 @@ func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64, st
stat.LastRouteSwitchReason stat.LastRouteSwitchReason
} }
func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { func smokeQUICFabricSession(ctx context.Context) (mesh.FabricTransportSession, string, int, error) {
server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{ server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0", ListenAddr: "127.0.0.1:0",
TLSConfig: smokeQUICTLSConfig(), TLSConfig: smokeQUICTLSConfig(),
}) })
if err != nil { if err != nil {
return false, "", 0, err return nil, "", 0, err
} }
defer server.Close()
endpoint := server.Addr().String() endpoint := server.Addr().String()
transport := mesh.NewQUICFabricTransport(nil) transport := mesh.NewQUICFabricTransport(nil)
defer transport.Close()
session, err := transport.Connect(ctx, mesh.FabricTransportTarget{ session, err := transport.Connect(ctx, mesh.FabricTransportTarget{
PeerID: "node-b", PeerID: "node-b",
Endpoint: endpoint, Endpoint: endpoint,
@@ -293,31 +284,12 @@ func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
ErrorBuffer: 4, ErrorBuffer: 4,
}) })
if err != nil { if err != nil {
return false, endpoint, 0, err _ = transport.Close()
_ = server.Close()
return nil, endpoint, 0, err
} }
defer session.Close()
if err := session.Send(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: uint64(time.Now().UnixNano()),
Payload: []byte("mesh-live-smoke-quic"),
}); err != nil {
return false, endpoint, 0, err
}
timer := time.NewTimer(3 * time.Second)
defer timer.Stop()
for {
select {
case frame := <-session.Frames():
snapshot := transport.Snapshot() snapshot := transport.Snapshot()
return frame.Type == fabricproto.FramePong && string(frame.Payload) == "mesh-live-smoke-quic", endpoint, snapshot.CapacityPressurePercent, nil return &smokeManagedFabricSession{session: session, transport: transport, server: server}, endpoint, snapshot.CapacityPressurePercent, nil
case err := <-session.Errors():
return false, endpoint, 0, err
case <-timer.C:
return false, endpoint, 0, fmt.Errorf("timed out waiting for quic pong")
case <-ctx.Done():
return false, endpoint, 0, ctx.Err()
}
}
} }
func smokeQUICTLSConfig() *tls.Config { func smokeQUICTLSConfig() *tls.Config {
@@ -341,25 +313,20 @@ func smokeQUICTLSConfig() *tls.Config {
} }
} }
func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.FabricSessionClient) (bool, bool, int, error) { func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession mesh.FabricTransportSession) (bool, bool, int, error) {
const interactiveStreamID uint64 = 4400 const interactiveStreamID uint64 = 4400
const bulkStreamID uint64 = 4401 const bulkStreamID uint64 = 4401
pump := fabricSession.StartPump(ctx, mesh.FabricSessionPumpOptions{
OutboundBuffer: 4,
InboundBuffer: 4,
ErrorBuffer: 4,
})
defer pump.Close()
for _, frame := range []fabricproto.Frame{ for _, frame := range []fabricproto.Frame{
{Type: fabricproto.FrameOpenStream, StreamID: interactiveStreamID, TrafficClass: fabricproto.TrafficClassInteractive}, {Type: fabricproto.FrameOpenStream, StreamID: interactiveStreamID, TrafficClass: fabricproto.TrafficClassInteractive},
{Type: fabricproto.FrameOpenStream, StreamID: bulkStreamID, TrafficClass: fabricproto.TrafficClassBulk}, {Type: fabricproto.FrameOpenStream, StreamID: bulkStreamID, TrafficClass: fabricproto.TrafficClassBulk},
} { } {
if err := pump.Send(ctx, frame); err != nil { if err := fabricSession.Send(ctx, frame); err != nil {
return false, false, 0, err return false, false, 0, err
} }
} }
transport := &vpnruntime.FabricSessionPacketTransport{ transport := &vpnruntime.FabricSessionPacketTransport{
Sender: pump, Sender: fabricSession,
Receiver: fabricSession,
StreamID: interactiveStreamID, StreamID: interactiveStreamID,
VPNConnectionID: "vpn-smoke", VPNConnectionID: "vpn-smoke",
SendDirection: vpnruntime.FabricDirectionGatewayToClient, SendDirection: vpnruntime.FabricDirectionGatewayToClient,
@@ -378,7 +345,7 @@ func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.Fa
acked := map[uint64]bool{} acked := map[uint64]bool{}
for { for {
select { select {
case frame := <-pump.Frames(): case frame := <-fabricSession.Frames():
if frame.Type == fabricproto.FrameAck && frame.Sequence == 1 { if frame.Type == fabricproto.FrameAck && frame.Sequence == 1 {
acked[frame.StreamID] = true acked[frame.StreamID] = true
if acked[interactiveStreamID] && acked[bulkStreamID] { if acked[interactiveStreamID] && acked[bulkStreamID] {
@@ -393,7 +360,7 @@ func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.Fa
return true, sharded, int(fanout), nil return true, sharded, int(fanout), nil
} }
} }
case err := <-pump.Errors(): case err := <-fabricSession.Errors():
return false, false, 0, err return false, false, 0, err
case <-timer.C: case <-timer.C:
return false, false, 0, fmt.Errorf("timed out waiting for fabric vpn packet ack") return false, false, 0, fmt.Errorf("timed out waiting for fabric vpn packet ack")
@@ -403,6 +370,68 @@ func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.Fa
} }
} }
// smokeManagedFabricSession bundles a fabric transport session with the QUIC
// transport and QUIC server it was created alongside, so that a single Close
// call releases all three.
type smokeManagedFabricSession struct {
	session   mesh.FabricTransportSession
	transport *mesh.QUICFabricTransport
	server    *mesh.QUICFabricServer
}

// Send forwards frame to the wrapped session.
func (m *smokeManagedFabricSession) Send(ctx context.Context, frame fabricproto.Frame) error {
	return m.session.Send(ctx, frame)
}

// Frames returns the wrapped session's inbound frame channel.
func (m *smokeManagedFabricSession) Frames() <-chan fabricproto.Frame {
	return m.session.Frames()
}

// Errors returns the wrapped session's error channel.
func (m *smokeManagedFabricSession) Errors() <-chan error {
	return m.session.Errors()
}

// Closed reports the wrapped session's closed state.
func (m *smokeManagedFabricSession) Closed() bool {
	return m.session.Closed()
}

// Close closes the session, then the transport, then the server, skipping any
// component that is nil. Every present component is closed even when an
// earlier one fails; the first non-nil error encountered is returned.
func (m *smokeManagedFabricSession) Close() error {
	var result error
	// keepFirst records err only while no earlier error has been recorded.
	keepFirst := func(err error) {
		if result == nil {
			result = err
		}
	}
	if m.session != nil {
		keepFirst(m.session.Close())
	}
	if m.transport != nil {
		keepFirst(m.transport.Close())
	}
	if m.server != nil {
		keepFirst(m.server.Close())
	}
	return result
}
// smokeFabricSessionRoundTrip sends frame on session and waits for the first
// inbound frame whose Sequence equals frame.Sequence. Inbound frames with a
// different sequence are discarded.
//
// It returns an error when the send fails, when the session delivers an error,
// when either session channel is closed before a matching frame arrives, when
// no match arrives within 3 seconds, or when ctx is done.
func smokeFabricSessionRoundTrip(ctx context.Context, session mesh.FabricTransportSession, frame fabricproto.Frame) (fabricproto.Frame, error) {
	if err := session.Send(ctx, frame); err != nil {
		return fabricproto.Frame{}, err
	}
	timer := time.NewTimer(3 * time.Second)
	defer timer.Stop()
	for {
		select {
		case response, ok := <-session.Frames():
			if !ok {
				// A closed channel yields zero-value frames forever. Without this
				// check the loop would spin until the timer fires, or accept a
				// zero frame as the reply whenever frame.Sequence is 0.
				return fabricproto.Frame{}, fmt.Errorf("fabric session frames channel closed while waiting for response")
			}
			if response.Sequence == frame.Sequence {
				return response, nil
			}
		case err, ok := <-session.Errors():
			if !ok {
				// A closed channel yields a nil error; returning it would report
				// success with an empty frame.
				return fabricproto.Frame{}, fmt.Errorf("fabric session error channel closed while waiting for response")
			}
			return fabricproto.Frame{}, err
		case <-timer.C:
			return fabricproto.Frame{}, fmt.Errorf("timed out waiting for fabric session response")
		case <-ctx.Done():
			return fabricproto.Frame{}, ctx.Err()
		}
	}
}
func smokeIPv4TCPPacket(src [4]byte, dst [4]byte, srcPort uint16, dstPort uint16, flags byte) []byte { func smokeIPv4TCPPacket(src [4]byte, dst [4]byte, srcPort uint16, dstPort uint16, flags byte) []byte {
packet := make([]byte, 40) packet := make([]byte, 40)
packet[0] = 0x45 packet[0] = 0x45
@@ -445,7 +474,7 @@ func writeSmokeScopedConfig(local mesh.PeerIdentity, peers map[string]string, ro
func newSmokeNode(local mesh.PeerIdentity) *smokeNode { func newSmokeNode(local mesh.PeerIdentity) *smokeNode {
node := &smokeNode{Local: local} node := &smokeNode{Local: local}
node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime, FabricSessionEnabled: true, FabricSessionWebSocketEnabled: true}.Handler().ServeHTTP(w, r) mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r)
})) }))
node.URL = node.server.URL node.URL = node.server.URL
return node return node
@@ -6,7 +6,6 @@ import (
"flag" "flag"
"fmt" "fmt"
"log" "log"
"net/http"
"os" "os"
"os/signal" "os/signal"
"runtime" "runtime"
@@ -15,9 +14,7 @@ import (
"time" "time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent" "github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent" "github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
) )
type installCommandConfig struct { type installCommandConfig struct {
@@ -82,10 +79,6 @@ func main() {
if err := runUpdateHostAgentLoop(ctx, os.Args[2:]); err != nil { if err := runUpdateHostAgentLoop(ctx, os.Args[2:]); err != nil {
log.Fatalf("update-host-agent-loop failed: %v", err) log.Fatalf("update-host-agent-loop failed: %v", err)
} }
case "fabric-session-smoke":
if err := runFabricSessionSmoke(ctx, os.Args[2:]); err != nil {
log.Fatalf("fabric-session-smoke failed: %v", err)
}
default: default:
usage() usage()
os.Exit(2) os.Exit(2)
@@ -117,78 +110,6 @@ func applyStagedSelfUpdate() {
_ = os.Remove(backup) _ = os.Remove(backup)
} }
// runFabricSessionSmoke implements the "fabric-session-smoke" subcommand.
//
// It parses its flags from args (each flag defaults to an environment
// variable), sends a single ping frame through the mesh client, prints a JSON
// result document to standard output, and returns an error when flag parsing
// fails, a required flag is blank, the request fails, or the response is not
// a pong echoing the ping payload.
func runFabricSessionSmoke(ctx context.Context, args []string) error {
	flags := flag.NewFlagSet("fabric-session-smoke", flag.ContinueOnError)
	meshURLFlag := flags.String("mesh-url", getenv("RAP_MESH_SMOKE_URL", ""), "Mesh base URL, for example http://node:19131.")
	tokenFlag := flags.String("token", getenv("RAP_FABRIC_SESSION_TOKEN", ""), "Fabric session token starting with rap_fsn_.")
	timeoutFlag := flags.Int("timeout-seconds", getenvInt("RAP_FABRIC_SESSION_SMOKE_TIMEOUT_SECONDS", 5), "Smoke timeout in seconds.")
	payloadFlag := flags.String("payload", getenv("RAP_FABRIC_SESSION_SMOKE_PAYLOAD", "rap-fabric-session-smoke"), "Ping payload.")
	authorityPayloadFlag := flags.String("authority-payload", getenv("RAP_FABRIC_SESSION_AUTHORITY_PAYLOAD", ""), "Base64 or JSON fabric session authority payload header.")
	authoritySignatureFlag := flags.String("authority-signature", getenv("RAP_FABRIC_SESSION_AUTHORITY_SIGNATURE", ""), "Base64 or JSON fabric session authority signature header.")
	if err := flags.Parse(args); err != nil {
		return err
	}

	meshURL, token, payload := *meshURLFlag, *tokenFlag, *payloadFlag
	trimmedURL := strings.TrimSpace(meshURL)
	switch {
	case trimmedURL == "":
		return fmt.Errorf("mesh-url is required")
	case strings.TrimSpace(token) == "":
		return fmt.Errorf("token is required")
	}

	// Non-positive timeouts fall back to five seconds.
	timeoutSeconds := *timeoutFlag
	if timeoutSeconds <= 0 {
		timeoutSeconds = 5
	}
	timeout := time.Duration(timeoutSeconds) * time.Second
	smokeCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Authority headers are attached only when a non-blank value was supplied.
	authorityPayload := strings.TrimSpace(*authorityPayloadFlag)
	authoritySignature := strings.TrimSpace(*authoritySignatureFlag)
	header := http.Header{}
	if authorityPayload != "" {
		header.Set("X-RAP-Fabric-Session-Authority-Payload", authorityPayload)
	}
	if authoritySignature != "" {
		header.Set("X-RAP-Fabric-Session-Authority-Signature", authoritySignature)
	}

	startedAt := time.Now()
	dialOptions := mesh.FabricSessionDialOptions{
		Token:   token,
		Header:  header,
		Timeout: timeout,
	}
	ping := fabricproto.Frame{
		Type:     fabricproto.FramePing,
		Sequence: uint64(startedAt.UnixNano()),
		Payload:  []byte(payload),
	}
	// The client receives the URL exactly as given; only the report is trimmed.
	response, err := mesh.NewClient(meshURL).SendFabricSessionFrame(smokeCtx, dialOptions, ping)
	elapsed := time.Since(startedAt)

	pongMatches := response.Type == fabricproto.FramePong && string(response.Payload) == payload
	result := map[string]any{
		"schema_version": "rap.fabric_session_smoke_result.v1",
		"mesh_url":       trimmedURL,
		"ok":             err == nil && pongMatches,
		"latency_ms":     elapsed.Milliseconds(),
		"response_type":  response.Type,
		"sequence":       response.Sequence,
		"authority":      authorityPayload != "" || authoritySignature != "",
	}
	if err != nil {
		result["error"] = err.Error()
	}

	// The result document is printed before any failure is returned.
	encoded, marshalErr := json.MarshalIndent(result, "", " ")
	if marshalErr != nil {
		return marshalErr
	}
	fmt.Println(string(encoded))

	if err != nil {
		return err
	}
	if !pongMatches {
		return fmt.Errorf("fabric session smoke returned unexpected response type=%d payload=%q", response.Type, string(response.Payload))
	}
	return nil
}
func runInstallLinux(ctx context.Context, args []string) error { func runInstallLinux(ctx context.Context, args []string) error {
fs := flag.NewFlagSet("install-linux", flag.ContinueOnError) fs := flag.NewFlagSet("install-linux", flag.ContinueOnError)
cfg := hostagent.LinuxInstallConfig{} cfg := hostagent.LinuxInstallConfig{}
@@ -215,16 +136,15 @@ func runInstallLinux(ctx context.Context, args []string) error {
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.") fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.") fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.")
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.") fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.") fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.") fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.") fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.") fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.") fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.") fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.") fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.") fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.") fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.") fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.") fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.") fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
@@ -303,16 +223,15 @@ func runInstallWindows(ctx context.Context, args []string) error {
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.") fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.") fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.")
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.") fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.") fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.") fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.") fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.") fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.") fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.") fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.") fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.") fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.") fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.") fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.") fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.") fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
@@ -517,6 +436,9 @@ func runUpdateLoop(ctx context.Context, args []string) error {
ClusterID: req.ClusterID, ClusterID: req.ClusterID,
NodeID: req.NodeID, NodeID: req.NodeID,
StateDir: req.StateDir, StateDir: req.StateDir,
ClusterAuthorityPublicKey: req.ClusterAuthorityPublicKey,
FabricRegistryRecordsJSON: req.FabricRegistryRecordsJSON,
MeshRegion: req.MeshRegion,
CurrentVersion: hostAgentVersion, CurrentVersion: hostAgentVersion,
Channel: req.Channel, Channel: req.Channel,
OS: firstNonEmptyLocal(req.OS, runtime.GOOS), OS: firstNonEmptyLocal(req.OS, runtime.GOOS),
@@ -569,6 +491,9 @@ func parseMonitor(args []string) (hostagent.MonitorConfig, error) {
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.") fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.") fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.") fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
fs.StringVar(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used to reach update/control services.")
fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint for fabric registry endpoint selection.")
fs.StringVar(&cfg.Product, "product", getenv("RAP_MONITOR_PRODUCT", hostagent.DefaultMonitorProduct), "Status product name.") fs.StringVar(&cfg.Product, "product", getenv("RAP_MONITOR_PRODUCT", hostagent.DefaultMonitorProduct), "Status product name.")
fs.StringVar(&cfg.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version.") fs.StringVar(&cfg.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version.")
fs.StringVar(&cfg.DockerBinary, "docker-binary", getenv("RAP_DOCKER_BINARY", "docker"), "Docker CLI binary.") fs.StringVar(&cfg.DockerBinary, "docker-binary", getenv("RAP_DOCKER_BINARY", "docker"), "Docker CLI binary.")
@@ -716,6 +641,9 @@ func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int,
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.") fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.") fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json.") fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json.")
fs.StringVar(&req.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
fs.StringVar(&req.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used to reach update/control services.")
fs.StringVar(&req.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint for fabric registry endpoint selection.")
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Currently installed rap-host-agent version.") fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Currently installed rap-host-agent version.")
fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.") fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
fs.StringVar(&req.OS, "os", getenv("RAP_HOST_AGENT_UPDATE_OS", runtime.GOOS), "Host-agent artifact OS selector.") fs.StringVar(&req.OS, "os", getenv("RAP_HOST_AGENT_UPDATE_OS", runtime.GOOS), "Host-agent artifact OS selector.")
@@ -739,6 +667,9 @@ func registerUpdateFlags(fs *flag.FlagSet, req *hostagent.UpdateRequest, healthT
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.") fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.") fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet.") fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet.")
fs.StringVar(&req.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
fs.StringVar(&req.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used to reach update/control services.")
fs.StringVar(&req.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint for fabric registry endpoint selection.")
fs.StringVar(&req.Product, "product", getenv("RAP_UPDATE_PRODUCT", hostagent.DefaultUpdateProduct), "Update product name.") fs.StringVar(&req.Product, "product", getenv("RAP_UPDATE_PRODUCT", hostagent.DefaultUpdateProduct), "Update product name.")
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Currently running product version.") fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Currently running product version.")
fs.StringVar(&req.OS, "os", getenv("RAP_UPDATE_OS", runtime.GOOS), "Artifact OS selector.") fs.StringVar(&req.OS, "os", getenv("RAP_UPDATE_OS", runtime.GOOS), "Artifact OS selector.")
@@ -797,16 +728,15 @@ func parseInstall(args []string) (installCommandConfig, error) {
fs.IntVar(&autoUpdate.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.") fs.IntVar(&autoUpdate.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.")
fs.BoolVar(&autoUpdate.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.") fs.BoolVar(&autoUpdate.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.")
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.") fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.") fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.") fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.") fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.") fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.") fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.") fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.") fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.IntVar(&cfg.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.") fs.IntVar(&cfg.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Synthetic mesh HTTP listen address inside container.") fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address inside container.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.") fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.") fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.") fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
@@ -941,13 +871,12 @@ func usage() {
rap-host-agent install -backend-url URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options] rap-host-agent install -backend-url URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options]
rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options] rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options]
rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options] rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options]
rap-host-agent install-updater -backend-url URL -cluster-id ID -state-dir DIR -container-name NAME rap-host-agent install-updater (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR -container-name NAME
rap-host-agent update-host-agent -backend-url URL -cluster-id ID -state-dir DIR rap-host-agent update-host-agent (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR
rap-host-agent update-host-agent-loop -backend-url URL -cluster-id ID -state-dir DIR rap-host-agent update-host-agent-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR
rap-host-agent monitor-loop -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME rap-host-agent monitor-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR --watch-container NAME
rap-host-agent monitor-once -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME rap-host-agent monitor-once (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR --watch-container NAME
rap-host-agent fabric-session-smoke -mesh-url URL -token rap_fsn_TOKEN [-authority-payload VALUE -authority-signature VALUE] rap-host-agent update (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent update -backend-url URL -cluster-id ID -node-id ID [-container-name NAME] rap-host-agent update-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent update-loop -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent status [-container-name NAME]`) rap-host-agent status [-container-name NAME]`)
} }
File diff suppressed because it is too large Load Diff
@@ -21,6 +21,7 @@ import (
"net/http/httptest" "net/http/httptest"
"os" "os"
"path/filepath" "path/filepath"
"reflect"
"strings" "strings"
"testing" "testing"
"time" "time"
@@ -204,7 +205,7 @@ func TestRouteManagerDecisionsFromControlPlaneConsumesRebuildRouteCommand(t *tes
} }
decision := decisions[0] decision := decisions[0]
if decision.RouteID != "route-primary" || if decision.RouteID != "route-primary" ||
decision.RebuildStatus != "pending_degraded_fallback" || decision.RebuildStatus != "pending_degraded_route_state" ||
decision.DecisionSource != "service_channel_remediation_command" || decision.DecisionSource != "service_channel_remediation_command" ||
decision.RebuildRequestID != "cmd-rebuild" { decision.RebuildRequestID != "cmd-rebuild" {
t.Fatalf("unexpected rebuild remediation decision: %+v", decision) t.Fatalf("unexpected rebuild remediation decision: %+v", decision)
@@ -279,7 +280,6 @@ func TestGatewayTransportForAssignmentUsesFabricSessionWhenEnabled(t *testing.T)
&syntheticMeshState{ &syntheticMeshState{
ProductionForwardTransport: noopProductionForwardTransport{}, ProductionForwardTransport: noopProductionForwardTransport{},
VPNFabricInbox: inbox, VPNFabricInbox: inbox,
VPNFabricSessionPeers: mesh.NewFabricSessionPeerManager(),
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{ PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{
"entry-1": {{ "entry-1": {{
EndpointID: "entry-1-quic", EndpointID: "entry-1-quic",
@@ -322,7 +322,6 @@ func TestGatewayTransportForAssignmentFallsBackWhenFabricSessionUnavailable(t *t
&syntheticMeshState{ &syntheticMeshState{
ProductionForwardTransport: noopProductionForwardTransport{}, ProductionForwardTransport: noopProductionForwardTransport{},
VPNFabricInbox: inbox, VPNFabricInbox: inbox,
VPNFabricSessionPeers: mesh.NewFabricSessionPeerManager(),
PeerEndpoints: map[string]string{}, PeerEndpoints: map[string]string{},
Routes: []mesh.SyntheticRoute{{ Routes: []mesh.SyntheticRoute{{
RouteID: "route-exit-entry", RouteID: "route-exit-entry",
@@ -424,6 +423,496 @@ func testMainQUICCertSHA256(t *testing.T, config *tls.Config) string {
return hex.EncodeToString(sum[:]) return hex.EncodeToString(sum[:])
} }
// TestFabricControlForwardHandlerUsesRegistryQUICControlAPI starts a local QUIC
// fabric server whose control handler accepts only the "/auth/login" path,
// advertises it through a signed control-API registry record, and checks that
// the handler built by fabricControlForwardHandlerFromMeshState returns the
// status code and body produced by that server.
func TestFabricControlForwardHandlerUsesRegistryQUICControlAPI(t *testing.T) {
	tlsConfig := testMainQUICTLSConfig(t)
	server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  tlsConfig,
		FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
			var req client.RawControlRequest
			if err := json.Unmarshal(payload, &req); err != nil {
				return nil, err
			}
			if req.Path != "/auth/login" {
				return nil, fmt.Errorf("unexpected path %s", req.Path)
			}
			return json.Marshal(client.RawControlResponse{StatusCode: 200, Body: json.RawMessage(`{"via":"fabric"}`)})
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer server.Close()

	// The signed single-endpoint control-API registry is built by the shared
	// helper, exactly as in the sibling *UsesRegistryQUICControlAPI tests, so
	// the fixture cannot drift between tests.
	registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))

	transport := mesh.NewQUICFabricTransport(nil)
	transport.SetLocalPeerID("node-a")
	handler := fabricControlForwardHandlerFromMeshState(nil, state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, &syntheticMeshState{
		FabricRegistry:         registry,
		VPNFabricQUICTransport: transport,
		ListenerRuntimeConfig:  config.Config{MeshRegion: "test"},
	})

	payload, err := handler(context.Background(), []byte(`{"method":"POST","path":"/auth/login","body":{"user":"a"}}`))
	if err != nil {
		t.Fatalf("fabric control handler: %v", err)
	}
	var response client.RawControlResponse
	if err := json.Unmarshal(payload, &response); err != nil {
		t.Fatalf("decode raw control response: %v", err)
	}
	// The body can only be {"via":"fabric"} if the request reached the test
	// server's FabricControlHandler.
	if response.StatusCode != 200 || string(response.Body) != `{"via":"fabric"}` {
		t.Fatalf("response = %+v", response)
	}
}
// TestHeartbeatViaFabricControlUsesRegistryQUICControlAPI starts a local QUIC
// fabric server that accepts only POST requests to the node heartbeat path,
// advertises it through a signed control-API registry record, and checks that
// heartbeatViaFabricControl reports the call as having gone via the fabric and
// decodes the testing flags and update hint from the server's response body.
func TestHeartbeatViaFabricControlUsesRegistryQUICControlAPI(t *testing.T) {
	tlsConfig := testMainQUICTLSConfig(t)
	server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  tlsConfig,
		FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
			var req client.RawControlRequest
			if err := json.Unmarshal(payload, &req); err != nil {
				return nil, err
			}
			if req.Method != http.MethodPost || req.Path != "/clusters/cluster-1/nodes/node-a/heartbeats" {
				return nil, fmt.Errorf("unexpected request: %+v", req)
			}
			return json.Marshal(client.RawControlResponse{
				StatusCode: 202,
				Body: json.RawMessage(`{
"heartbeat":{"id":"hb-1"},
"testing_flags":{"enabled":true,"synthetic_links_enabled":true,"applied_scopes":["cluster"]},
"update_hint":{"schema_version":"rap.node_update_hint.v1","check_now":true,"generation":"gen-1"}
}`),
			})
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer server.Close()

	// The signed single-endpoint control-API registry is built by the shared
	// helper, exactly as in the sibling *UsesRegistryQUICControlAPI tests, so
	// the fixture cannot drift between tests.
	registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))

	response, viaFabric, err := heartbeatViaFabricControl(context.Background(), state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, &syntheticMeshState{
		FabricRegistry:         registry,
		VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
	}, client.HeartbeatRequest{HealthStatus: "healthy"})
	if err != nil {
		t.Fatalf("heartbeat via fabric: %v", err)
	}
	if !viaFabric || !response.TestingFlags.Enabled || response.UpdateHint == nil || response.UpdateHint.Generation != "gen-1" {
		t.Fatalf("unexpected heartbeat response viaFabric=%t response=%+v", viaFabric, response)
	}
}
// TestSyntheticMeshConfigRefreshUsesRegistryQUICControlAPI serves a synthetic
// mesh config from a local QUIC fabric control endpoint (GET on the node's
// synthetic-config path only) and checks that loadSyntheticMeshConfigRuntime,
// given a mesh state holding the signed registry and a QUIC transport, returns
// that config with source "control_plane".
func TestSyntheticMeshConfigRefreshUsesRegistryQUICControlAPI(t *testing.T) {
	const syntheticConfigPath = "/clusters/cluster-1/nodes/node-a/mesh/synthetic-config"

	serverTLS := testMainQUICTLSConfig(t)
	ctx := context.Background()

	// Control handler: reject anything that is not the expected GET, otherwise
	// answer with a fixed synthetic mesh config document.
	serveControl := func(_ context.Context, raw []byte) ([]byte, error) {
		var request client.RawControlRequest
		if decodeErr := json.Unmarshal(raw, &request); decodeErr != nil {
			return nil, decodeErr
		}
		if request.Method != http.MethodGet || request.Path != syntheticConfigPath {
			return nil, fmt.Errorf("unexpected request: %+v", request)
		}
		reply := client.RawControlResponse{
			StatusCode: 200,
			Body: json.RawMessage(`{
"synthetic_mesh_config":{
"enabled":true,
"config_version":"fabric-gen-1",
"peer_directory_version":"pd-1",
"policy_version":"pol-1",
"peer_endpoints":{},
"routes":[]
}
}`),
		}
		return json.Marshal(reply)
	}

	fabricServer, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
		ListenAddr:           "127.0.0.1:0",
		TLSConfig:            serverTLS,
		FabricControlHandler: serveControl,
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer fabricServer.Close()

	controlRegistry := signedTestControlRegistry(t, "cluster-1", "quic://"+fabricServer.Addr().String(), testMainQUICCertSHA256(t, serverTLS))
	identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}
	meshState := &syntheticMeshState{
		FabricRegistry:         controlRegistry,
		VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
	}

	loaded, err := loadSyntheticMeshConfigRuntime(ctx, config.Config{}, identity, nil, meshState)
	if err != nil {
		t.Fatalf("load synthetic mesh config via fabric: %v", err)
	}
	if loaded.Source != "control_plane" || loaded.ConfigVersion != "fabric-gen-1" {
		t.Fatalf("loaded = %+v", loaded)
	}
}
// TestReportMeshLinkUsesRegistryQUICControlAPI checks that reportMeshLink
// delivers a mesh link observation to a local QUIC fabric control endpoint as
// a POST to the cluster's mesh links path, and that the body received by the
// server carries the observation's target node ID.
func TestReportMeshLinkUsesRegistryQUICControlAPI(t *testing.T) {
	const meshLinksPath = "/clusters/cluster-1/mesh/links"

	serverTLS := testMainQUICTLSConfig(t)
	ctx := context.Background()

	// captured holds the last request decoded by the server-side handler so the
	// test can inspect what was actually sent.
	var captured client.RawControlRequest
	fabricServer, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  serverTLS,
		FabricControlHandler: func(_ context.Context, raw []byte) ([]byte, error) {
			if decodeErr := json.Unmarshal(raw, &captured); decodeErr != nil {
				return nil, decodeErr
			}
			if captured.Method != http.MethodPost || captured.Path != meshLinksPath {
				return nil, fmt.Errorf("unexpected request: %+v", captured)
			}
			accepted := client.RawControlResponse{StatusCode: 202, Body: json.RawMessage(`{"ok":true}`)}
			return json.Marshal(accepted)
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer fabricServer.Close()

	controlRegistry := signedTestControlRegistry(t, "cluster-1", "quic://"+fabricServer.Addr().String(), testMainQUICCertSHA256(t, serverTLS))
	identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}
	meshState := &syntheticMeshState{
		FabricRegistry:         controlRegistry,
		VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
	}
	observation := client.MeshLinkObservationRequest{
		SourceNodeID: "node-a",
		TargetNodeID: "node-b",
		LinkStatus:   "reachable",
	}

	if err := reportMeshLink(ctx, nil, identity, meshState, observation); err != nil {
		t.Fatalf("report mesh link via fabric: %v", err)
	}
	if len(captured.Body) == 0 || !strings.Contains(string(captured.Body), `"target_node_id":"node-b"`) {
		t.Fatalf("unexpected received body: %s", string(captured.Body))
	}
}
// TestReportTelemetryUsesRegistryQUICControlAPI checks that reportTelemetry
// delivers a telemetry payload to a local QUIC fabric control endpoint as a
// POST to the node's telemetry path, and that the body received by the server
// contains the submitted payload entry.
func TestReportTelemetryUsesRegistryQUICControlAPI(t *testing.T) {
	const telemetryPath = "/clusters/cluster-1/nodes/node-a/telemetry"

	serverTLS := testMainQUICTLSConfig(t)
	ctx := context.Background()

	// captured holds the last request decoded by the server-side handler so the
	// test can inspect what was actually sent.
	var captured client.RawControlRequest
	fabricServer, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  serverTLS,
		FabricControlHandler: func(_ context.Context, raw []byte) ([]byte, error) {
			if decodeErr := json.Unmarshal(raw, &captured); decodeErr != nil {
				return nil, decodeErr
			}
			if captured.Method != http.MethodPost || captured.Path != telemetryPath {
				return nil, fmt.Errorf("unexpected request: %+v", captured)
			}
			accepted := client.RawControlResponse{StatusCode: 202, Body: json.RawMessage(`{"ok":true}`)}
			return json.Marshal(accepted)
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer fabricServer.Close()

	controlRegistry := signedTestControlRegistry(t, "cluster-1", "quic://"+fabricServer.Addr().String(), testMainQUICCertSHA256(t, serverTLS))
	identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}
	meshState := &syntheticMeshState{
		FabricRegistry:         controlRegistry,
		VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
	}
	telemetry := client.TelemetryRequest{Payload: map[string]any{"fabric": "quic"}}

	if err := reportTelemetry(ctx, nil, identity, meshState, telemetry); err != nil {
		t.Fatalf("report telemetry via fabric: %v", err)
	}
	if len(captured.Body) == 0 || !strings.Contains(string(captured.Body), `"fabric":"quic"`) {
		t.Fatalf("unexpected received body: %s", string(captured.Body))
	}
}
// TestWorkloadControlUsesRegistryQUICControlAPI drives desiredWorkloads and
// reportSingleWorkloadStatus against a local QUIC fabric control endpoint and
// checks both the decoded desired workload and the exact sequence of
// "METHOD path" pairs the server observed.
func TestWorkloadControlUsesRegistryQUICControlAPI(t *testing.T) {
	const (
		desiredPath = "/clusters/cluster-1/nodes/node-a/workloads/desired"
		statusPath  = "/clusters/cluster-1/nodes/node-a/workloads/vpn-egress/status"
	)

	serverTLS := testMainQUICTLSConfig(t)
	ctx := context.Background()

	// observed records every request the server handler sees, in arrival order.
	var observed []string
	serveControl := func(_ context.Context, raw []byte) ([]byte, error) {
		var request client.RawControlRequest
		if decodeErr := json.Unmarshal(raw, &request); decodeErr != nil {
			return nil, decodeErr
		}
		observed = append(observed, request.Method+" "+request.Path)

		if request.Path == desiredPath {
			return json.Marshal(client.RawControlResponse{
				StatusCode: 200,
				Body:       json.RawMessage(`{"desired_workloads":[{"service_type":"vpn-egress","desired_state":"enabled","runtime_mode":"node"}]}`),
			})
		}
		if request.Path != statusPath {
			return nil, fmt.Errorf("unexpected request: %+v", request)
		}
		// Status reports must carry the reported state in their body.
		if len(request.Body) == 0 || !strings.Contains(string(request.Body), `"reported_state":"running"`) {
			return nil, fmt.Errorf("unexpected status body: %s", string(request.Body))
		}
		return json.Marshal(client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)})
	}

	fabricServer, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
		ListenAddr:           "127.0.0.1:0",
		TLSConfig:            serverTLS,
		FabricControlHandler: serveControl,
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer fabricServer.Close()

	controlRegistry := signedTestControlRegistry(t, "cluster-1", "quic://"+fabricServer.Addr().String(), testMainQUICCertSHA256(t, serverTLS))
	meshState := &syntheticMeshState{
		FabricRegistry:         controlRegistry,
		VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
	}
	identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}

	desired, err := desiredWorkloads(ctx, nil, identity, meshState)
	if err != nil {
		t.Fatalf("desired workloads via fabric: %v", err)
	}
	if len(desired) != 1 || desired[0].ServiceType != "vpn-egress" {
		t.Fatalf("desired = %+v", desired)
	}

	statusReport := client.WorkloadStatusRequest{ReportedState: "running"}
	if err := reportSingleWorkloadStatus(ctx, nil, identity, meshState, "vpn-egress", statusReport); err != nil {
		t.Fatalf("report workload status via fabric: %v", err)
	}

	want := []string{
		"GET /clusters/cluster-1/nodes/node-a/workloads/desired",
		"POST /clusters/cluster-1/nodes/node-a/workloads/vpn-egress/status",
	}
	if !reflect.DeepEqual(observed, want) {
		t.Fatalf("paths = %+v, want %+v", observed, want)
	}
}
// TestAdminRuntimeProjectionUsesRegistryQUICControlAPI checks that
// controlAPIProjectionClient.Project sends the projection request to a local
// QUIC fabric control endpoint as a POST to the node's admin-runtime
// projection path, returns the status code and body from the server's reply,
// and includes the requested service class in the body it sends.
func TestAdminRuntimeProjectionUsesRegistryQUICControlAPI(t *testing.T) {
	const projectionPath = "/clusters/cluster-1/nodes/node-a/admin-runtime/projection"

	serverTLS := testMainQUICTLSConfig(t)
	ctx := context.Background()

	// captured holds the last request decoded by the server-side handler so the
	// test can inspect what was actually sent.
	var captured client.RawControlRequest
	fabricServer, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  serverTLS,
		FabricControlHandler: func(_ context.Context, raw []byte) ([]byte, error) {
			if decodeErr := json.Unmarshal(raw, &captured); decodeErr != nil {
				return nil, decodeErr
			}
			if captured.Method != http.MethodPost || captured.Path != projectionPath {
				return nil, fmt.Errorf("unexpected request: %+v", captured)
			}
			reply := client.RawControlResponse{
				StatusCode: 200,
				Body:       json.RawMessage(`{"schema_version":"rap.admin_runtime_projection.v1","status":"ok","status_code":200,"body":{"page":"cluster"}}`),
			}
			return json.Marshal(reply)
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer fabricServer.Close()

	controlRegistry := signedTestControlRegistry(t, "cluster-1", "quic://"+fabricServer.Addr().String(), testMainQUICCertSHA256(t, serverTLS))
	projector := controlAPIProjectionClient{
		Identity: state.Identity{ClusterID: "cluster-1", NodeID: "node-a"},
		MeshState: &syntheticMeshState{
			FabricRegistry:         controlRegistry,
			VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
		},
	}
	request := webingress.ControlAPIProjectionRequest{
		SchemaVersion: "rap.web_ingress_projection.v1",
		Method:        http.MethodGet,
		Path:          "/cluster-admin",
		Scope:         "cluster",
		ServiceClass:  "cluster_admin",
	}

	projection, err := projector.Project(ctx, request)
	if err != nil {
		t.Fatalf("admin projection via fabric: %v", err)
	}
	if projection.StatusCode != 200 || string(projection.Body) != `{"page":"cluster"}` {
		t.Fatalf("projection = %+v", projection)
	}
	if len(captured.Body) == 0 || !strings.Contains(string(captured.Body), `"service_class":"cluster_admin"`) {
		t.Fatalf("unexpected received body: %s", string(captured.Body))
	}
}
// TestVPNAssignmentControlUsesRegistryQUICControlAPI drives the VPN assignment
// helpers (list assignments, acquire lease, renew lease, report status)
// against a local QUIC fabric control endpoint and checks the decoded results
// as well as the exact sequence of "METHOD path" pairs the server observed.
func TestVPNAssignmentControlUsesRegistryQUICControlAPI(t *testing.T) {
	const (
		assignmentsPath = "/clusters/cluster-1/nodes/node-a/vpn/assignments"
		acquirePath     = "/clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/acquire"
		renewPath       = "/clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/lease-1/renew"
		statusPath      = "/clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/status"
	)

	serverTLS := testMainQUICTLSConfig(t)
	ctx := context.Background()

	// observed records every request the server handler sees, in arrival order.
	var observed []string
	serveControl := func(_ context.Context, raw []byte) ([]byte, error) {
		var request client.RawControlRequest
		if decodeErr := json.Unmarshal(raw, &request); decodeErr != nil {
			return nil, decodeErr
		}
		observed = append(observed, request.Method+" "+request.Path)

		var reply client.RawControlResponse
		switch request.Path {
		case assignmentsPath:
			reply = client.RawControlResponse{
				StatusCode: 200,
				Body:       json.RawMessage(`{"vpn_assignments":[{"vpn_connection_id":"vpn-1","desired_state":"enabled","assignment_reason":"eligible_candidate"}]}`),
			}
		case acquirePath:
			reply = client.RawControlResponse{
				StatusCode: 201,
				Body:       json.RawMessage(`{"lease":{"lease_id":"lease-1","owner_node_id":"node-a","lease_generation":1,"status":"active"}}`),
			}
		case renewPath:
			reply = client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)}
		case statusPath:
			// Status reports must carry the observed status in their body.
			if len(request.Body) == 0 || !strings.Contains(string(request.Body), `"observed_status":"assigned"`) {
				return nil, fmt.Errorf("unexpected status body: %s", string(request.Body))
			}
			reply = client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)}
		default:
			return nil, fmt.Errorf("unexpected request: %+v", request)
		}
		return json.Marshal(reply)
	}

	fabricServer, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
		ListenAddr:           "127.0.0.1:0",
		TLSConfig:            serverTLS,
		FabricControlHandler: serveControl,
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer fabricServer.Close()

	controlRegistry := signedTestControlRegistry(t, "cluster-1", "quic://"+fabricServer.Addr().String(), testMainQUICCertSHA256(t, serverTLS))
	meshState := &syntheticMeshState{
		FabricRegistry:         controlRegistry,
		VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
	}
	identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}

	assignments, err := nodeVPNAssignments(ctx, nil, identity, meshState)
	if err != nil {
		t.Fatalf("vpn assignments via fabric: %v", err)
	}
	if len(assignments) != 1 || assignments[0].VPNConnectionID != "vpn-1" {
		t.Fatalf("assignments = %+v", assignments)
	}

	acquire := client.NodeVPNAssignmentLeaseAcquireRequest{TTLSeconds: 300}
	lease, err := acquireNodeVPNAssignmentLease(ctx, nil, identity, meshState, "vpn-1", acquire)
	if err != nil {
		t.Fatalf("acquire lease via fabric: %v", err)
	}
	if lease == nil || lease.LeaseID != "lease-1" {
		t.Fatalf("lease = %+v", lease)
	}

	renew := client.NodeVPNAssignmentLeaseRenewRequest{TTLSeconds: 300}
	if err := renewNodeVPNAssignmentLease(ctx, nil, identity, meshState, "vpn-1", "lease-1", renew); err != nil {
		t.Fatalf("renew lease via fabric: %v", err)
	}

	status := client.NodeVPNAssignmentStatusRequest{ObservedStatus: "assigned"}
	if err := reportNodeVPNAssignmentStatus(ctx, nil, identity, meshState, "vpn-1", status); err != nil {
		t.Fatalf("report status via fabric: %v", err)
	}

	want := []string{
		"GET /clusters/cluster-1/nodes/node-a/vpn/assignments",
		"POST /clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/acquire",
		"POST /clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/lease-1/renew",
		"POST /clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/status",
	}
	if !reflect.DeepEqual(observed, want) {
		t.Fatalf("paths = %+v, want %+v", observed, want)
	}
}
// signedTestControlRegistry returns a fabric registry seeded with one signed
// control-API gossip record for clusterID.
//
// The record advertises a single "direct_quic" endpoint at the given address
// with PeerCertSHA256 set to certSHA256, is issued one minute in the past and
// expires one hour in the future. It is signed with a freshly generated
// ed25519 key and applied under a policy that trusts only that issuer and
// requires one signature. Any failure aborts the calling test via t.Fatalf.
func signedTestControlRegistry(t *testing.T, clusterID string, endpoint string, certSHA256 string) *mesh.FabricRegistry {
	t.Helper()

	now := time.Now().UTC()
	pub, priv, err := ed25519.GenerateKey(rand.Reader)
	if err != nil {
		t.Fatalf("generate key: %v", err)
	}

	// The same issuer value is used both for signing and as the sole trusted
	// issuer during verification.
	authority := mesh.FabricRegistryTrustedIssuer{
		IssuerID:  "authority-1",
		Role:      mesh.FabricRegistryAuthorityControl,
		PublicKey: pub,
	}
	controlEndpoint := mesh.FabricRegistryEndpoint{
		EndpointID:     "control-a",
		Address:        endpoint,
		Transport:      "direct_quic",
		PeerCertSHA256: certSHA256,
	}
	unsigned := mesh.FabricRegistryGossipRecord{
		SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
		ClusterID:     clusterID,
		Service:       mesh.FabricRegistryServiceControlAPI,
		Scope:         mesh.FabricRegistryScopeCluster,
		Epoch:         1,
		IssuedAt:      now.Add(-time.Minute),
		ExpiresAt:     now.Add(time.Hour),
		IssuerNodeID:  "authority-1",
		IssuerRole:    mesh.FabricRegistryAuthorityControl,
		Endpoints:     []mesh.FabricRegistryEndpoint{controlEndpoint},
	}

	signed, err := mesh.SignFabricRegistryGossipRecord(unsigned, authority, priv)
	if err != nil {
		t.Fatalf("sign registry record: %v", err)
	}

	policy := mesh.FabricRegistryVerificationPolicy{
		LocalClusterID:     clusterID,
		TrustedIssuers:     []mesh.FabricRegistryTrustedIssuer{authority},
		RequiredSignatures: 1,
		Now:                now,
	}
	registry := mesh.NewFabricRegistry()
	if _, _, err := registry.ApplyGossipRecord(signed, policy, true); err != nil {
		t.Fatalf("apply registry record: %v", err)
	}
	return registry
}
func TestRouteManagerDecisionsFromControlPlaneKeepsExplicitRemediationCommand(t *testing.T) { func TestRouteManagerDecisionsFromControlPlaneKeepsExplicitRemediationCommand(t *testing.T) {
now := time.Now().UTC() now := time.Now().UTC()
report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{{ report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{{
@@ -493,9 +982,10 @@ func TestFabricServiceChannelAccessStatsReportsDataPlaneViolations(t *testing.T)
OccurredAt: time.Unix(10, 0).UTC(), OccurredAt: time.Unix(10, 0).UTC(),
}) })
report := stats.Report(time.Unix(20, 0).UTC()) report := stats.Report(time.Unix(20, 0).UTC())
if report["backend_fallback_blocked"] != int64(1) || if report["degraded_compatibility_blocked"] != int64(1) ||
report["fabric_route_send_failure"] != int64(1) || report["fabric_route_send_failure"] != int64(1) ||
report["last_data_plane_violation_status"] != "fabric_route_send_failed_backend_fallback_blocked" || report["last_data_plane_violation_status"] != "degraded_compatibility_blocked" ||
report["last_data_plane_violation_status_raw"] != "fabric_route_send_failed_backend_fallback_blocked" ||
report["last_data_plane_violation_reason"] != "mesh synthetic route not found" { report["last_data_plane_violation_reason"] != "mesh synthetic route not found" {
t.Fatalf("unexpected violation report: %+v", report) t.Fatalf("unexpected violation report: %+v", report)
} }
@@ -790,7 +1280,56 @@ func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
} }
} }
func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t *testing.T) { func TestLoadFabricRegistryBootstrapAcceptsSignedCandidate(t *testing.T) {
// Verifies that a signed registry record supplied through
// FabricRegistryRecordsJSON is loaded as a candidate and only becomes
// active after MarkLiveVerified is called.
now := time.Now().UTC()
publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
if err != nil {
t.Fatalf("GenerateKey: %v", err)
}
record := mesh.FabricRegistryGossipRecord{
SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
ClusterID: "cluster-1",
Service: mesh.FabricRegistryServiceControlAPI,
Scope: mesh.FabricRegistryScopeCluster,
Epoch: 1,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Hour),
IssuerNodeID: "authority-node",
IssuerRole: mesh.FabricRegistryAuthorityControl,
Endpoints: []mesh.FabricRegistryEndpoint{
{EndpointID: "control-a", Address: "quic://control.example.test:19443", Transport: "direct_quic"},
},
}
// The issuer passed to the signer carries no public key; the matching key
// is handed to the loader below via ClusterAuthorityPublicKey.
signed, err := mesh.SignFabricRegistryGossipRecord(record, mesh.FabricRegistryTrustedIssuer{
IssuerID: "cluster-authority",
Role: mesh.FabricRegistryAuthorityControl,
}, privateKey)
if err != nil {
t.Fatalf("sign registry record: %v", err)
}
// Encode the signed record as a one-element JSON array for the config field.
raw, err := json.Marshal([]mesh.FabricRegistryGossipRecord{signed})
if err != nil {
t.Fatalf("marshal registry records: %v", err)
}
registry, report := loadFabricRegistryBootstrap(config.Config{
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
FabricRegistryRecordsJSON: string(raw),
}, state.Identity{ClusterID: "cluster-1"})
// Exactly one record is expected, counted as a candidate with nothing rejected.
if registry == nil || report.Total != 1 || report.Candidate != 1 || report.Rejected != 0 {
t.Fatalf("unexpected registry bootstrap report: %+v registry=%v", report, registry)
}
// Before live verification the record must not be returned by Active.
if _, ok := registry.Active("cluster-1", mesh.FabricRegistryServiceControlAPI, mesh.FabricRegistryScopeCluster, "", now); ok {
t.Fatal("bootstrap record should remain candidate until live verification")
}
// Marking the record live-verified must succeed and make it visible through Active.
if !registry.MarkLiveVerified("cluster-1", mesh.FabricRegistryServiceControlAPI, mesh.FabricRegistryScopeCluster, "", now) {
t.Fatal("MarkLiveVerified = false")
}
if _, ok := registry.Active("cluster-1", mesh.FabricRegistryServiceControlAPI, mesh.FabricRegistryScopeCluster, "", now); !ok {
t.Fatal("expected active record after live verification")
}
}
func TestNormalizeLoadedSyntheticMeshConfigMigratesNonQUICControlPlaneSurfaces(t *testing.T) {
loaded := loadedSyntheticMeshConfig{ loaded := loadedSyntheticMeshConfig{
PeerEndpoints: map[string]string{ PeerEndpoints: map[string]string{
"node-a": "https://node-a.example.test:443", "node-a": "https://node-a.example.test:443",
@@ -798,7 +1337,7 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{ PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{
"node-b": { "node-b": {
{ {
EndpointID: "node-b-legacy", EndpointID: "node-b-http-migration",
NodeID: "node-b", NodeID: "node-b",
Transport: "direct_http", Transport: "direct_http",
Address: "https://node-b.example.test:443", Address: "https://node-b.example.test:443",
@@ -816,7 +1355,7 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
}, },
RendezvousLeases: []mesh.PeerRendezvousLease{ RendezvousLeases: []mesh.PeerRendezvousLease{
{ {
LeaseID: "lease-legacy", LeaseID: "lease-http-migration",
PeerNodeID: "node-b", PeerNodeID: "node-b",
RelayNodeID: "node-r", RelayNodeID: "node-r",
RelayEndpoint: "http://node-r.example.test:19001", RelayEndpoint: "http://node-r.example.test:19001",
@@ -824,7 +1363,7 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
}, },
}, },
RoutePathDecisions: &client.RoutePathDecisionReport{ RoutePathDecisions: &client.RoutePathDecisionReport{
Decisions: []client.RoutePathDecision{{DecisionID: "decision-legacy", SelectedRelayEndpoint: "http://node-r.example.test:19001"}}, Decisions: []client.RoutePathDecision{{DecisionID: "decision-http-migration", SelectedRelayEndpoint: "http://node-r.example.test:19001"}},
}, },
} }
normalizeLoadedSyntheticMeshConfigQUICOnly(&loaded) normalizeLoadedSyntheticMeshConfigQUICOnly(&loaded)
@@ -849,14 +1388,14 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
} }
} }
func TestValidateLoadedSyntheticMeshConfigRejectsUnnormalizedLegacyControlPlaneSurfaces(t *testing.T) { func TestValidateLoadedSyntheticMeshConfigRejectsUnnormalizedNonQUICControlPlaneSurfaces(t *testing.T) {
err := validateLoadedSyntheticMeshConfigQUICOnly(loadedSyntheticMeshConfig{ err := validateLoadedSyntheticMeshConfigQUICOnly(loadedSyntheticMeshConfig{
RoutePathDecisions: &client.RoutePathDecisionReport{ RoutePathDecisions: &client.RoutePathDecisionReport{
Decisions: []client.RoutePathDecision{{DecisionID: "decision-legacy", SelectedRelayEndpoint: "http://node-r.example.test:19001"}}, Decisions: []client.RoutePathDecision{{DecisionID: "decision-http-migration", SelectedRelayEndpoint: "http://node-r.example.test:19001"}},
}, },
}) })
if err == nil || !strings.Contains(err.Error(), "QUIC selected relay endpoint") { if err == nil || !strings.Contains(err.Error(), "QUIC selected relay endpoint") {
t.Fatalf("expected legacy selected relay endpoint rejection, got %v", err) t.Fatalf("expected non-QUIC selected relay endpoint rejection, got %v", err)
} }
} }
@@ -942,7 +1481,6 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
MeshRegion: "eu", MeshRegion: "eu",
MeshSyntheticRuntimeEnabled: true, MeshSyntheticRuntimeEnabled: true,
MeshProductionForwardingEnabled: true, MeshProductionForwardingEnabled: true,
MeshFabricSessionEnabled: true,
VPNFabricSessionTransportEnabled: true, VPNFabricSessionTransportEnabled: true,
VPNFabricSessionStreamShards: 6, VPNFabricSessionStreamShards: 6,
VPNFabricQUICMaxStreamsPerConn: 24, VPNFabricQUICMaxStreamsPerConn: 24,
@@ -952,7 +1490,6 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
ClusterID: "cluster-1", ClusterID: "cluster-1",
NodeID: "node-a", NodeID: "node-a",
}, &syntheticMeshState{ }, &syntheticMeshState{
VPNFabricSessionPeers: mesh.NewFabricSessionPeerManager(),
VPNFabricQUICTransport: func() *mesh.QUICFabricTransport { VPNFabricQUICTransport: func() *mesh.QUICFabricTransport {
transport := mesh.NewQUICFabricTransport(nil) transport := mesh.NewQUICFabricTransport(nil)
transport.MaxStreamsPerConn = 24 transport.MaxStreamsPerConn = 24
@@ -1010,8 +1547,7 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
if report, ok := payload.Metadata["vpn_fabric_session_transport_report"].(map[string]any); !ok || if report, ok := payload.Metadata["vpn_fabric_session_transport_report"].(map[string]any); !ok ||
report["packet_payload"] != "rap.vpn_packet_batch.fabric.v1" || report["packet_payload"] != "rap.vpn_packet_batch.fabric.v1" ||
report["transport"] != "fabric_session_binary_frames" || report["transport"] != "fabric_session_binary_frames" ||
report["stream_shards_per_class"] != 6 || report["stream_shards_per_class"] != 6 {
report["peer_sessions"] == nil {
t.Fatalf("vpn fabric session report missing: %+v", payload.Metadata) t.Fatalf("vpn fabric session report missing: %+v", payload.Metadata)
} else if report["quic_sessions"] == nil || report["quic_max_streams_per_conn"] != 24 { } else if report["quic_sessions"] == nil || report["quic_max_streams_per_conn"] != 24 {
t.Fatalf("vpn fabric quic session report missing: %+v", report) t.Fatalf("vpn fabric quic session report missing: %+v", report)
@@ -1242,14 +1778,14 @@ func TestVPNFabricSessionTargetPrefersRankedQUICCandidate(t *testing.T) {
} }
} }
func TestVPNFabricSessionTargetFallsBackToLegacyPeerEndpoint(t *testing.T) { func TestVPNFabricSessionTargetRejectsNonQUICPeerEndpoint(t *testing.T) {
_, ok := vpnFabricSessionTarget(&syntheticMeshState{ _, ok := vpnFabricSessionTarget(&syntheticMeshState{
PeerEndpoints: map[string]string{ PeerEndpoints: map[string]string{
"node-b": "https://node-b.example.test:443/", "node-b": "https://node-b.example.test:443/",
}, },
}, "node-b") }, "node-b")
if ok { if ok {
t.Fatal("legacy peer endpoint unexpectedly produced a QUIC target") t.Fatal("non-QUIC peer endpoint unexpectedly produced a QUIC target")
} }
} }
@@ -1257,7 +1793,7 @@ func TestVPNFabricSessionTargetsIncludeRankedQUICCandidatesWithoutLegacyFallback
now := time.Now().UTC() now := time.Now().UTC()
targets := vpnFabricSessionTargets(&syntheticMeshState{ targets := vpnFabricSessionTargets(&syntheticMeshState{
PeerEndpoints: map[string]string{ PeerEndpoints: map[string]string{
"node-b": "https://node-b-legacy.example.test:443/", "node-b": "https://node-b-http-migration.example.test:443/",
}, },
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{ PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{
"node-b": { "node-b": {
@@ -2731,7 +3267,7 @@ func TestWebIngressForwardHandlerFromConfigVerifiesSignedEnvelope(t *testing.T)
keyID := "web-key-1" keyID := "web-key-1"
handler := webIngressForwardHandlerFromConfig(config.Config{ handler := webIngressForwardHandlerFromConfig(config.Config{
WebIngressTrustedKeysJSON: webingress.TrustedKeysJSONForPublicKey(keyID, publicKey), WebIngressTrustedKeysJSON: webingress.TrustedKeysJSONForPublicKey(keyID, publicKey),
}, state.Identity{ClusterID: "cluster-1", NodeID: "node-1"}, nil) }, state.Identity{ClusterID: "cluster-1", NodeID: "node-1"}, nil, nil)
if handler == nil { if handler == nil {
t.Fatal("handler is nil") t.Fatal("handler is nil")
} }
@@ -2780,10 +3316,10 @@ func TestWebIngressForwardHandlerFromConfigVerifiesSignedEnvelope(t *testing.T)
} }
func TestWebIngressForwardHandlerFromConfigDisabledWithoutTrustedKeys(t *testing.T) { func TestWebIngressForwardHandlerFromConfigDisabledWithoutTrustedKeys(t *testing.T) {
if handler := webIngressForwardHandlerFromConfig(config.Config{}, state.Identity{}, nil); handler != nil { if handler := webIngressForwardHandlerFromConfig(config.Config{}, state.Identity{}, nil, nil); handler != nil {
t.Fatal("handler should be nil without trusted keys") t.Fatal("handler should be nil without trusted keys")
} }
if handler := webIngressForwardHandlerFromConfig(config.Config{WebIngressTrustedKeysJSON: `{"bad":"key"}`}, state.Identity{}, nil); handler != nil { if handler := webIngressForwardHandlerFromConfig(config.Config{WebIngressTrustedKeysJSON: `{"bad":"key"}`}, state.Identity{}, nil, nil); handler != nil {
t.Fatal("handler should be nil with invalid trusted keys") t.Fatal("handler should be nil with invalid trusted keys")
} }
} }
@@ -7,7 +7,7 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state" "github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
) )
const Version = "0.2.309-latencyaware" const Version = "0.2.321-directreadytarget"
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest { func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
return client.EnrollRequest{ return client.EnrollRequest{
@@ -828,9 +828,6 @@ func (c *Client) RawControl(ctx context.Context, request RawControlRequest) (Raw
if err != nil { if err != nil {
return RawControlResponse{}, err return RawControlResponse{}, err
} }
if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
return RawControlResponse{}, fmt.Errorf("backend returned status %d: %s", httpResp.StatusCode, string(payload))
}
return RawControlResponse{StatusCode: httpResp.StatusCode, Body: json.RawMessage(payload)}, nil return RawControlResponse{StatusCode: httpResp.StatusCode, Body: json.RawMessage(payload)}, nil
} }
@@ -1,6 +1,7 @@
package config package config
import ( import (
"encoding/json"
"errors" "errors"
"flag" "flag"
"os" "os"
@@ -31,7 +32,6 @@ type Config struct {
EnrollmentPollTimeout time.Duration EnrollmentPollTimeout time.Duration
MeshSyntheticRuntimeEnabled bool MeshSyntheticRuntimeEnabled bool
MeshProductionForwardingEnabled bool MeshProductionForwardingEnabled bool
MeshFabricSessionEnabled bool
VPNFabricSessionTransportEnabled bool VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool MeshQUICFabricEnabled bool
MeshQUICFabricListenAddr string MeshQUICFabricListenAddr string
@@ -45,6 +45,7 @@ type Config struct {
MeshListenAutoPortEnd int MeshListenAutoPortEnd int
MeshAdvertiseEndpoint string MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string MeshAdvertiseTransport string
MeshConnectivityMode string MeshConnectivityMode string
MeshNATType string MeshNATType string
@@ -86,7 +87,6 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.StringVar(&cfg.WebIngressRuntimeServiceClasses, "web-ingress-runtime-service-classes", getEnv(env, "RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES", ""), "Optional comma-separated allow-list of web ingress runtime service classes accepted by this node.") fs.StringVar(&cfg.WebIngressRuntimeServiceClasses, "web-ingress-runtime-service-classes", getEnv(env, "RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES", ""), "Optional comma-separated allow-list of web ingress runtime service classes accepted by this node.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.") fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.") fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getEnvBool(env, "RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint. Disabled by default.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getEnvBool(env, "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric session when explicitly enabled. Disabled by default.") fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getEnvBool(env, "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric session when explicitly enabled. Disabled by default.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getEnvBool(env, "RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener. Disabled by default.") fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getEnvBool(env, "RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener. Disabled by default.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getEnv(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "Listen address for QUIC/UDP fabric endpoint, for example :19443.") fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getEnv(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "Listen address for QUIC/UDP fabric endpoint, for example :19443.")
@@ -94,12 +94,13 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getEnvInt(env, "RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.") fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getEnvInt(env, "RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.DurationVar(&cfg.VPNFabricQUICIdleTTL, "vpn-fabric-quic-idle-ttl", time.Duration(getEnvInt(env, "RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300))*time.Second, "Idle TTL for cached VPN QUIC carrier connections.") fs.DurationVar(&cfg.VPNFabricQUICIdleTTL, "vpn-fabric-quic-idle-ttl", time.Duration(getEnvInt(env, "RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300))*time.Second, "Idle TTL for cached VPN QUIC carrier connections.")
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.") fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.") fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default historical synthetic mesh HTTP endpoint.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.") fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.") fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.") fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.") fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.") fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getEnv(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry gossip records used as bootstrap discovery seeds.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Transport label for the advertised mesh endpoint.") fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Transport label for the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.") fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.") fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.")
@@ -150,6 +151,7 @@ func Load(args []string, env map[string]string) (Config, error) {
} }
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/") cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON) cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport) cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
if cfg.MeshAdvertiseTransport == "" { if cfg.MeshAdvertiseTransport == "" {
cfg.MeshAdvertiseTransport = "quic" cfg.MeshAdvertiseTransport = "quic"
@@ -199,6 +201,9 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity { if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum") return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
} }
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
return Config{}, errors.New("fabric registry records must be a JSON array")
}
switch cfg.MeshListenPortMode { switch cfg.MeshListenPortMode {
case "", "manual", "auto", "disabled": case "", "manual", "auto", "disabled":
if cfg.MeshListenPortMode == "" { if cfg.MeshListenPortMode == "" {
@@ -269,6 +274,11 @@ func hasLegacyEndpointScheme(endpoint string) bool {
strings.HasPrefix(endpoint, "wss://") strings.HasPrefix(endpoint, "wss://")
} }
// isJSONArray reports whether value, ignoring surrounding whitespace, is a
// syntactically valid JSON document whose top-level value is an array.
//
// json.Unmarshal accepts the literal null for a slice target and leaves the
// slice nil without returning an error, so a decode-only check would treat
// "null" as an array. The explicit leading-bracket test rejects that case
// (and the empty string) before decoding.
func isJSONArray(value string) bool {
	trimmed := strings.TrimSpace(value)
	if !strings.HasPrefix(trimmed, "[") {
		return false
	}
	var items []json.RawMessage
	return json.Unmarshal([]byte(trimmed), &items) == nil
}
func readEnv() map[string]string { func readEnv() map[string]string {
out := map[string]string{} out := map[string]string{}
for _, pair := range os.Environ() { for _, pair := range os.Environ() {
@@ -25,7 +25,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30", "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true", "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true", "RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
"RAP_MESH_FABRIC_SESSION_ENABLED": "true",
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED": "true", "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED": "true",
"RAP_MESH_QUIC_FABRIC_ENABLED": "true", "RAP_MESH_QUIC_FABRIC_ENABLED": "true",
"RAP_MESH_QUIC_FABRIC_LISTEN_ADDR": ":19443", "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR": ":19443",
@@ -39,6 +38,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020", "RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443/", "RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_FABRIC_REGISTRY_RECORDS_JSON": ` [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}] `,
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic", "RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only", "RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
"RAP_MESH_NAT_TYPE": "symmetric", "RAP_MESH_NAT_TYPE": "symmetric",
@@ -93,9 +93,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if !cfg.MeshProductionForwardingEnabled { if !cfg.MeshProductionForwardingEnabled {
t.Fatal("MeshProductionForwardingEnabled = false, want true") t.Fatal("MeshProductionForwardingEnabled = false, want true")
} }
if !cfg.MeshFabricSessionEnabled {
t.Fatal("MeshFabricSessionEnabled = false, want true")
}
if !cfg.VPNFabricSessionTransportEnabled { if !cfg.VPNFabricSessionTransportEnabled {
t.Fatal("VPNFabricSessionTransportEnabled = false, want true") t.Fatal("VPNFabricSessionTransportEnabled = false, want true")
} }
@@ -122,6 +119,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
} }
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:19443" || if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:19443" ||
cfg.MeshAdvertiseEndpointsJSON == "" || cfg.MeshAdvertiseEndpointsJSON == "" ||
cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]` ||
cfg.MeshAdvertiseTransport != "direct_quic" || cfg.MeshAdvertiseTransport != "direct_quic" ||
cfg.MeshConnectivityMode != "outbound_only" || cfg.MeshConnectivityMode != "outbound_only" ||
cfg.MeshNATType != "symmetric" || cfg.MeshNATType != "symmetric" ||
@@ -1,6 +1,7 @@
package hostagent package hostagent
import ( import (
"encoding/json"
"errors" "errors"
"fmt" "fmt"
"strings" "strings"
@@ -29,7 +30,6 @@ type RuntimeConfig struct {
WorkloadSupervisionEnabled bool WorkloadSupervisionEnabled bool
MeshSyntheticRuntimeEnabled bool MeshSyntheticRuntimeEnabled bool
MeshProductionForwardingEnabled bool MeshProductionForwardingEnabled bool
MeshFabricSessionEnabled bool
VPNFabricSessionTransportEnabled bool VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool MeshQUICFabricEnabled bool
MeshQUICFabricListenAddr string MeshQUICFabricListenAddr string
@@ -42,6 +42,7 @@ type RuntimeConfig struct {
MeshListenAutoPortEnd int MeshListenAutoPortEnd int
MeshAdvertiseEndpoint string MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string MeshAdvertiseTransport string
MeshConnectivityMode string MeshConnectivityMode string
MeshNATType string MeshNATType string
@@ -84,6 +85,7 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode)) cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/") cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON) cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport) cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode) cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType) cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
@@ -145,6 +147,9 @@ func (cfg RuntimeConfig) ValidateInstall() error {
if cfg.ProductionObservationSinkCap < 0 { if cfg.ProductionObservationSinkCap < 0 {
return errors.New("production observation sink capacity must not be negative") return errors.New("production observation sink capacity must not be negative")
} }
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
return errors.New("fabric registry records must be a JSON array")
}
for _, item := range cfg.ExtraEnv { for _, item := range cfg.ExtraEnv {
if !strings.Contains(item, "=") { if !strings.Contains(item, "=") {
return fmt.Errorf("extra env %q must be KEY=VALUE", item) return fmt.Errorf("extra env %q must be KEY=VALUE", item)
@@ -176,3 +181,8 @@ func hasLegacyEndpointScheme(endpoint string) bool {
strings.HasPrefix(endpoint, "ws://") || strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://") strings.HasPrefix(endpoint, "wss://")
} }
// isJSONArray reports whether value, ignoring surrounding whitespace, is a
// syntactically valid JSON document whose top-level value is an array.
//
// A bare json.Unmarshal into a slice also succeeds for the literal null
// (the slice is simply left nil), which would let "null" pass install
// validation as an array. Requiring a leading '[' closes that gap and also
// rejects the empty string without attempting a decode.
func isJSONArray(value string) bool {
	candidate := strings.TrimSpace(value)
	if !strings.HasPrefix(candidate, "[") {
		return false
	}
	var elements []json.RawMessage
	return json.Unmarshal([]byte(candidate), &elements) == nil
}
@@ -264,7 +264,6 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled), "RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled), "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled), "RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
"RAP_MESH_FABRIC_SESSION_ENABLED=" + boolString(cfg.MeshFabricSessionEnabled),
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=" + boolString(cfg.VPNFabricSessionTransportEnabled), "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=" + boolString(cfg.VPNFabricSessionTransportEnabled),
"RAP_MESH_QUIC_FABRIC_ENABLED=" + boolString(cfg.MeshQUICFabricEnabled), "RAP_MESH_QUIC_FABRIC_ENABLED=" + boolString(cfg.MeshQUICFabricEnabled),
"RAP_VPN_FABRIC_SESSION_STREAM_SHARDS=" + strconv.Itoa(cfg.VPNFabricSessionStreamShards), "RAP_VPN_FABRIC_SESSION_STREAM_SHARDS=" + strconv.Itoa(cfg.VPNFabricSessionStreamShards),
@@ -295,6 +294,9 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
if cfg.MeshAdvertiseEndpointsJSON != "" { if cfg.MeshAdvertiseEndpointsJSON != "" {
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON="+cfg.MeshAdvertiseEndpointsJSON) env = append(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON="+cfg.MeshAdvertiseEndpointsJSON)
} }
if cfg.FabricRegistryRecordsJSON != "" {
env = append(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON="+cfg.FabricRegistryRecordsJSON)
}
if cfg.MeshAdvertiseTransport != "" { if cfg.MeshAdvertiseTransport != "" {
env = append(env, "RAP_MESH_ADVERTISE_TRANSPORT="+cfg.MeshAdvertiseTransport) env = append(env, "RAP_MESH_ADVERTISE_TRANSPORT="+cfg.MeshAdvertiseTransport)
} }
@@ -74,6 +74,7 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
VPNFabricQUICIdleTTLSeconds: 120, VPNFabricQUICIdleTTLSeconds: 120,
MeshListenAddr: ":19131", MeshListenAddr: ":19131",
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443/", MeshAdvertiseEndpoint: "quic://10.0.0.11:19443/",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
MeshAdvertiseTransport: "direct_quic", MeshAdvertiseTransport: "direct_quic",
MeshConnectivityMode: "private_lan", MeshConnectivityMode: "private_lan",
}) })
@@ -96,6 +97,7 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=120", "RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=120",
"RAP_MESH_LISTEN_ADDR=:19131", "RAP_MESH_LISTEN_ADDR=:19131",
"RAP_MESH_ADVERTISE_ENDPOINT=quic://10.0.0.11:19443", "RAP_MESH_ADVERTISE_ENDPOINT=quic://10.0.0.11:19443",
`RAP_FABRIC_REGISTRY_RECORDS_JSON=[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT=direct_quic", "RAP_MESH_ADVERTISE_TRANSPORT=direct_quic",
"RAP_MESH_CONNECTIVITY_MODE=private_lan", "RAP_MESH_CONNECTIVITY_MODE=private_lan",
"rap-node-agent:test", "rap-node-agent:test",
@@ -164,6 +166,11 @@ func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
"node_name": "node-a", "node_name": "node-a",
"image": "rap-node-agent:test", "image": "rap-node-agent:test",
"artifact_endpoints": []string{"https://cache.example.test/artifacts"}, "artifact_endpoints": []string{"https://cache.example.test/artifacts"},
"fabric_registry_records": []map[string]any{{
"schema": "rap.fabric.registry.gossip_record.v1",
"service_class": "control-api",
"service_id": "control-a",
}},
"docker_image_artifact": map[string]any{ "docker_image_artifact": map[string]any{
"kind": "docker_image_tar", "kind": "docker_image_tar",
"image": "rap-node-agent:test", "image": "rap-node-agent:test",
@@ -207,6 +214,7 @@ func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
!cfg.MeshQUICFabricEnabled || !cfg.MeshQUICFabricEnabled ||
cfg.MeshQUICFabricListenAddr != ":19443" || cfg.MeshQUICFabricListenAddr != ":19443" ||
cfg.VPNFabricSessionStreamShards != 6 || cfg.VPNFabricSessionStreamShards != 6 ||
cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api","service_id":"control-a"}]` ||
cfg.MeshConnectivityMode != "outbound_only" { cfg.MeshConnectivityMode != "outbound_only" {
t.Fatalf("unexpected cfg: %+v", cfg) t.Fatalf("unexpected cfg: %+v", cfg)
} }
@@ -72,7 +72,6 @@ func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConf
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled, WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled, MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled, MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled, VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled, MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr, MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
@@ -287,7 +286,6 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
args := []string{ args := []string{
result.HostAgentPath, result.HostAgentPath,
"update-loop", "update-loop",
"--backend-url", cfg.RuntimeConfig.BackendURL,
"--cluster-id", cfg.RuntimeConfig.ClusterID, "--cluster-id", cfg.RuntimeConfig.ClusterID,
"--state-dir", result.StateDir, "--state-dir", result.StateDir,
"--current-version", cfg.AutoUpdateCurrentVersion, "--current-version", cfg.AutoUpdateCurrentVersion,
@@ -303,6 +301,10 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"), "--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
"--host-agent-binary-path", result.HostAgentPath, "--host-agent-binary-path", result.HostAgentPath,
} }
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
args = append(args, "--backend-url", strings.TrimSpace(cfg.RuntimeConfig.BackendURL))
}
args = appendFabricUpdateArgs(args, cfg.RuntimeConfig)
if strings.TrimSpace(cfg.NodeID) != "" { if strings.TrimSpace(cfg.NodeID) != "" {
args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID)) args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID))
} }
@@ -363,48 +365,48 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
} }
status.Payload["systemd_unit"] = req.SystemdUnitName status.Payload["systemd_unit"] = req.SystemdUnitName
status.Payload["binary_path"] = req.BinaryPath status.Payload["binary_path"] = req.BinaryPath
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status) _ = ReportNodeUpdateStatusForRequest(ctx, req, status)
} }
return result, nil return result, nil
} }
if plan.ProductionForwarding && !req.AllowProductionMesh { if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled") err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if plan.Artifact == nil { if plan.Artifact == nil {
err := errors.New("update plan has no artifact") err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType { if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType) err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if req.DryRun { if req.DryRun {
return result, nil return result, nil
} }
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL) urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}}) _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes) path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil { if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err return result, err
} }
defer os.Remove(path) defer os.Remove(path)
runner := m.runner() runner := m.runner()
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName) _, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil { if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
return result, err return result, err
} }
result.Replaced = true result.Replaced = true
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil { if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err return result, err
} }
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}}) _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()}) _ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
return result, nil return result, nil
} }
@@ -35,6 +35,9 @@ type MonitorConfig struct {
ClusterID string ClusterID string
NodeID string NodeID string
StateDir string StateDir string
ClusterAuthorityPublicKey string
FabricRegistryRecordsJSON string
MeshRegion string
Product string Product string
CurrentVersion string CurrentVersion string
Interval time.Duration Interval time.Duration
@@ -421,7 +424,18 @@ func reportMonitorStatus(ctx context.Context, cfg MonitorConfig, result MonitorR
if errText != "" { if errText != "" {
req.ErrorMessage = &errText req.ErrorMessage = &errText
} }
return ReportNodeUpdateStatus(ctx, cfg.BackendURL, clusterID, nodeID, req) return ReportNodeUpdateStatusForRequest(ctx, UpdateRequest{
BackendURL: cfg.BackendURL,
ClusterID: clusterID,
NodeID: nodeID,
StateDir: cfg.StateDir,
ClusterAuthorityPublicKey: cfg.ClusterAuthorityPublicKey,
FabricRegistryRecordsJSON: cfg.FabricRegistryRecordsJSON,
MeshRegion: cfg.MeshRegion,
Product: cfg.Product,
CurrentVersion: cfg.CurrentVersion,
InstallType: DefaultUpdateInstallType,
}, req)
} }
func resolveMonitorIdentity(cfg MonitorConfig) (string, string, error) { func resolveMonitorIdentity(cfg MonitorConfig) (string, string, error) {
@@ -16,6 +16,7 @@ type DockerInstallProfile struct {
BackendURL string `json:"backend_url"` BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"` ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"` ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"` DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
JoinToken string `json:"join_token"` JoinToken string `json:"join_token"`
NodeName string `json:"node_name"` NodeName string `json:"node_name"`
@@ -30,7 +31,6 @@ type DockerInstallProfile struct {
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"` WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"` MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"` MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"` VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"` MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"` MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
@@ -70,6 +70,7 @@ type WindowsInstallProfile struct {
BackendURL string `json:"backend_url"` BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"` ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"` ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"` NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
JoinToken string `json:"join_token"` JoinToken string `json:"join_token"`
NodeName string `json:"node_name"` NodeName string `json:"node_name"`
@@ -79,7 +80,6 @@ type WindowsInstallProfile struct {
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"` WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"` MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"` MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"` VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"` MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"` MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
@@ -109,6 +109,7 @@ type LinuxInstallProfile struct {
BackendURL string `json:"backend_url"` BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"` ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"` ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"` NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
JoinToken string `json:"join_token"` JoinToken string `json:"join_token"`
NodeName string `json:"node_name"` NodeName string `json:"node_name"`
@@ -118,7 +119,6 @@ type LinuxInstallProfile struct {
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"` WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"` MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"` MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"` VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"` MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"` MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
@@ -302,7 +302,6 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled, WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled, MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled, MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled, VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled, MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr, MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
@@ -315,6 +314,7 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd, MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint, MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON), MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
FabricRegistryRecordsJSON: string(profile.FabricRegistryRecords),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport, MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode, MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType, MeshNATType: profile.MeshNATType,
@@ -14,6 +14,9 @@ type HostAgentUpdateRequest struct {
ClusterID string ClusterID string
NodeID string NodeID string
StateDir string StateDir string
ClusterAuthorityPublicKey string
FabricRegistryRecordsJSON string
MeshRegion string
CurrentVersion string CurrentVersion string
Channel string Channel string
OS string OS string
@@ -41,6 +44,9 @@ func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
ClusterID: req.ClusterID, ClusterID: req.ClusterID,
NodeID: req.NodeID, NodeID: req.NodeID,
StateDir: req.StateDir, StateDir: req.StateDir,
ClusterAuthorityPublicKey: req.ClusterAuthorityPublicKey,
FabricRegistryRecordsJSON: req.FabricRegistryRecordsJSON,
MeshRegion: req.MeshRegion,
Product: HostAgentUpdateProduct, Product: HostAgentUpdateProduct,
CurrentVersion: req.CurrentVersion, CurrentVersion: req.CurrentVersion,
OS: firstNonEmpty(req.OS, "linux"), OS: firstNonEmpty(req.OS, "linux"),
@@ -79,25 +85,25 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
status.Payload = map[string]any{} status.Payload = map[string]any{}
} }
status.Payload["binary_path"] = binaryPath status.Payload["binary_path"] = binaryPath
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, status) _ = ReportNodeUpdateStatusForRequest(ctx, resolved, status)
} }
return result, nil return result, nil
} }
if plan.Artifact == nil { if plan.Artifact == nil {
err := errors.New("host-agent update plan has no artifact") err := errors.New("host-agent update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err return result, err
} }
if !isBinaryInstallType(plan.Artifact.InstallType) { if !isBinaryInstallType(plan.Artifact.InstallType) {
err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType) err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err return result, err
} }
if req.DryRun { if req.DryRun {
return result, nil return result, nil
} }
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL) urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct, Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion, CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -109,7 +115,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
}) })
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes) path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil { if err != nil {
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "download", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "download", "failed", err))
return result, err return result, err
} }
defer os.Remove(path) defer os.Remove(path)
@@ -125,7 +131,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
Image: binaryPath, Image: binaryPath,
UpdatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(),
}) })
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct, Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion, CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -137,7 +143,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
}) })
return result, nil return result, nil
} }
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr))) _ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
return result, err return result, err
} }
result.Loaded = true result.Loaded = true
@@ -151,7 +157,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
Image: binaryPath, Image: binaryPath,
UpdatedAt: time.Now().UTC(), UpdatedAt: time.Now().UTC(),
}) })
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct, Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion, CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -173,8 +173,8 @@ func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServi
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) { func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize() runtimeCfg := cfg.RuntimeConfig.Normalize()
var missing []string var missing []string
if runtimeCfg.BackendURL == "" { if runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "backend-url") missing = append(missing, "backend-url-or-fabric-registry-records-json")
} }
if runtimeCfg.ClusterID == "" { if runtimeCfg.ClusterID == "" {
missing = append(missing, "cluster-id") missing = append(missing, "cluster-id")
@@ -191,7 +191,6 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
args := []string{ args := []string{
cfg.BinaryInstallPath, cfg.BinaryInstallPath,
"update-loop", "update-loop",
"--backend-url", runtimeCfg.BackendURL,
"--cluster-id", runtimeCfg.ClusterID, "--cluster-id", runtimeCfg.ClusterID,
"--state-dir", runtimeCfg.StateDir, "--state-dir", runtimeCfg.StateDir,
"--container-name", runtimeCfg.ContainerName, "--container-name", runtimeCfg.ContainerName,
@@ -202,9 +201,13 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter), "--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec), "--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
} }
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" { if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel)) args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
} }
args = appendFabricUpdateArgs(args, runtimeCfg)
execStart := systemdJoin(args) execStart := systemdJoin(args)
return fmt.Sprintf(`[Unit] return fmt.Sprintf(`[Unit]
Description=RAP host-agent updater for %s Description=RAP host-agent updater for %s
@@ -225,8 +228,8 @@ WantedBy=multi-user.target
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) { func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize() runtimeCfg := cfg.RuntimeConfig.Normalize()
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" { if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host-agent self updater") return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host-agent self updater")
} }
unitName := "rap-host-agent-self-updater.service" unitName := "rap-host-agent-self-updater.service"
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName) unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
@@ -234,7 +237,6 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
args := []string{ args := []string{
cfg.BinaryInstallPath, cfg.BinaryInstallPath,
"update-host-agent-loop", "update-host-agent-loop",
"--backend-url", runtimeCfg.BackendURL,
"--cluster-id", runtimeCfg.ClusterID, "--cluster-id", runtimeCfg.ClusterID,
"--state-dir", runtimeCfg.StateDir, "--state-dir", runtimeCfg.StateDir,
"--binary-path", firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath), "--binary-path", firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath),
@@ -243,9 +245,13 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30), "--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter), "--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
} }
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" { if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel)) args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
} }
args = appendFabricUpdateArgs(args, runtimeCfg)
return fmt.Sprintf(`[Unit] return fmt.Sprintf(`[Unit]
Description=RAP host-agent self updater Description=RAP host-agent self updater
After=network-online.target docker.service After=network-online.target docker.service
@@ -265,8 +271,8 @@ WantedBy=multi-user.target
func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string, error) { func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize() runtimeCfg := cfg.RuntimeConfig.Normalize()
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" { if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host monitor") return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host monitor")
} }
containers := uniqueTrimmed(append([]string{runtimeCfg.ContainerName}, cfg.MonitorContainers...)) containers := uniqueTrimmed(append([]string{runtimeCfg.ContainerName}, cfg.MonitorContainers...))
if len(containers) == 0 { if len(containers) == 0 {
@@ -277,7 +283,6 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
args := []string{ args := []string{
cfg.BinaryInstallPath, cfg.BinaryInstallPath,
"monitor-loop", "monitor-loop",
"--backend-url", runtimeCfg.BackendURL,
"--cluster-id", runtimeCfg.ClusterID, "--cluster-id", runtimeCfg.ClusterID,
"--state-dir", runtimeCfg.StateDir, "--state-dir", runtimeCfg.StateDir,
"--current-version", firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion), "--current-version", firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion),
@@ -286,6 +291,9 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
"--disk-cleanup-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCleanup, DefaultMonitorDiskCleanupPercent)), "--disk-cleanup-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCleanup, DefaultMonitorDiskCleanupPercent)),
"--disk-critical-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCritical, DefaultMonitorDiskCriticalPercent)), "--disk-critical-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCritical, DefaultMonitorDiskCriticalPercent)),
} }
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if cfg.MonitorCleanupDocker { if cfg.MonitorCleanupDocker {
args = append(args, "--cleanup-docker") args = append(args, "--cleanup-docker")
} }
@@ -295,6 +303,7 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
for _, container := range containers { for _, container := range containers {
args = append(args, "--watch-container", container) args = append(args, "--watch-container", container)
} }
args = appendFabricUpdateArgs(args, runtimeCfg)
return fmt.Sprintf(`[Unit] return fmt.Sprintf(`[Unit]
Description=RAP host-agent monitor for %s Description=RAP host-agent monitor for %s
After=network-online.target docker.service After=network-online.target docker.service
@@ -312,6 +321,16 @@ WantedBy=multi-user.target
`, runtimeCfg.ContainerName, systemdJoin(args)), unitName, unitPath, nil `, runtimeCfg.ContainerName, systemdJoin(args)), unitName, unitPath, nil
} }
// appendFabricUpdateArgs appends the optional fabric-related command-line
// flags to args and returns the extended slice.
//
// Each flag is emitted only when its value in runtimeCfg is non-blank after
// trimming surrounding whitespace; the trimmed value is what gets appended.
// The flags are added in a fixed order: --fabric-registry-records-json first,
// then --mesh-region.
func appendFabricUpdateArgs(args []string, runtimeCfg RuntimeConfig) []string {
	optional := [...]struct {
		flag  string
		value string
	}{
		{flag: "--fabric-registry-records-json", value: runtimeCfg.FabricRegistryRecordsJSON},
		{flag: "--mesh-region", value: runtimeCfg.MeshRegion},
	}
	for _, opt := range optional {
		trimmed := strings.TrimSpace(opt.value)
		if trimmed == "" {
			// Blank values produce no flag at all.
			continue
		}
		args = append(args, opt.flag, trimmed)
	}
	return args
}
func firstNonZero(values ...int) int { func firstNonZero(values ...int) int {
for _, value := range values { for _, value := range values {
if value != 0 { if value != 0 {
@@ -119,7 +119,7 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
for _, want := range []string{ for _, want := range []string{
":loop", ":loop",
"rap-host-agent.exe.next", "rap-host-agent.exe.next",
"update-loop --backend-url", "update-loop --cluster-id",
"--backend-url \"http://control/api/v1\"", "--backend-url \"http://control/api/v1\"",
"--cluster-id \"cluster-1\"", "--cluster-id \"cluster-1\"",
"--node-id \"node-1\"", "--node-id \"node-1\"",
@@ -139,6 +139,35 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
} }
} }
// TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL renders the Windows
// host-agent update script for a configuration that has fabric registry
// records and a mesh region but no backend URL, and checks that the script
// carries the fabric flags while never mentioning --backend-url.
func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
	const hostAgentPath = `C:\Program Files\RAP\win-a\rap-host-agent.exe`

	installCfg := WindowsInstallConfig{
		RuntimeConfig: RuntimeConfig{
			ClusterID:                 "cluster-1",
			FabricRegistryRecordsJSON: `[{"record_id":"r1"}]`,
			MeshRegion:                "ru-msk",
		},
		AutoUpdateCurrentVersion: "0.1.2",
	}
	installed := WindowsInstallResult{
		NodeName:      "win-a",
		StateDir:      `C:\ProgramData\RAP\nodes\win-a`,
		NodeAgentPath: `C:\Program Files\RAP\win-a\rap-node-agent.exe`,
		TaskName:      "RAP Node Agent win-a",
	}

	rendered := windowsHostAgentUpdateScript(hostAgentPath, installCfg, installed)

	// The backend URL was left empty, so its flag must be absent entirely.
	if strings.Contains(rendered, "--backend-url") {
		t.Fatalf("script must not include backend-url when it is empty:\n%s", rendered)
	}

	requiredFragments := []string{
		`--fabric-registry-records-json [{"record_id":"r1"}]`,
		"--mesh-region ru-msk",
	}
	for _, fragment := range requiredFragments {
		if strings.Contains(rendered, fragment) {
			continue
		}
		t.Fatalf("script missing %q:\n%s", fragment, rendered)
	}
}
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) { func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{ result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{ RuntimeConfig: RuntimeConfig{
@@ -3,6 +3,8 @@ package hostagent
import ( import (
"bytes" "bytes"
"context" "context"
"crypto/ed25519"
"encoding/base64"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
@@ -17,6 +19,8 @@ import (
"time" "time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority" clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state" "github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
) )
@@ -37,6 +41,9 @@ type UpdateRequest struct {
ClusterID string ClusterID string
NodeID string NodeID string
StateDir string StateDir string
ClusterAuthorityPublicKey string
FabricRegistryRecordsJSON string
MeshRegion string
Product string Product string
CurrentVersion string CurrentVersion string
OS string OS string
@@ -204,6 +211,9 @@ func (req UpdateRequest) Normalize() UpdateRequest {
req.ClusterID = strings.TrimSpace(req.ClusterID) req.ClusterID = strings.TrimSpace(req.ClusterID)
req.NodeID = strings.TrimSpace(req.NodeID) req.NodeID = strings.TrimSpace(req.NodeID)
req.StateDir = strings.TrimSpace(req.StateDir) req.StateDir = strings.TrimSpace(req.StateDir)
req.ClusterAuthorityPublicKey = strings.TrimSpace(req.ClusterAuthorityPublicKey)
req.FabricRegistryRecordsJSON = strings.TrimSpace(req.FabricRegistryRecordsJSON)
req.MeshRegion = strings.TrimSpace(req.MeshRegion)
req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct) req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct)
req.OS = firstNonEmpty(req.OS, runtime.GOOS) req.OS = firstNonEmpty(req.OS, runtime.GOOS)
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH) req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
@@ -222,8 +232,8 @@ func (req UpdateRequest) Normalize() UpdateRequest {
func (req UpdateRequest) Validate() error { func (req UpdateRequest) Validate() error {
req = req.Normalize() req = req.Normalize()
var missing []string var missing []string
if req.BackendURL == "" { if req.BackendURL == "" && req.FabricRegistryRecordsJSON == "" {
missing = append(missing, "backend-url") missing = append(missing, "backend-url-or-fabric-registry-records-json")
} }
if req.ClusterID == "" { if req.ClusterID == "" {
missing = append(missing, "cluster-id") missing = append(missing, "cluster-id")
@@ -285,30 +295,30 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
} }
if plan.Action != "update" { if plan.Action != "update" {
if !req.DryRun { if !req.DryRun {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromNoopPlan(req, plan)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromNoopPlan(req, plan))
} }
return result, nil return result, nil
} }
if plan.ProductionForwarding && !req.AllowProductionMesh { if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled") err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if plan.Artifact == nil { if plan.Artifact == nil {
err := errors.New("update plan has no artifact") err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType { if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType) err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if req.DryRun { if req.DryRun {
result.NewImage = artifactImage(*plan.Artifact, "") result.NewImage = artifactImage(*plan.Artifact, "")
return result, nil return result, nil
} }
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product, Product: req.Product,
CurrentVersion: req.CurrentVersion, CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -321,7 +331,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName) current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName)
if err != nil { if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "inspect", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "inspect", "failed", err))
return result, err return result, err
} }
result.PreviousImageID = current.Image result.PreviousImageID = current.Image
@@ -339,7 +349,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
cfg.JoinToken = "" cfg.JoinToken = ""
result.NewImage = cfg.Image result.NewImage = cfg.Image
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product, Product: req.Product,
CurrentVersion: req.CurrentVersion, CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -351,7 +361,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
}) })
installed, err := m.Install(ctx, cfg) installed, err := m.Install(ctx, cfg)
if err != nil { if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed) rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed { if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true result.RolledBack = true
@@ -363,14 +373,14 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
result.ContainerID = installed.ContainerID result.ContainerID = installed.ContainerID
if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil { if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "health_check", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "health_check", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed) rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed { if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true result.RolledBack = true
} }
return result, err return result, err
} }
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product, Product: req.Product,
CurrentVersion: req.CurrentVersion, CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -515,7 +525,27 @@ func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan
if req.Channel != "" { if req.Channel != "" {
values.Set("channel", req.Channel) values.Set("channel", req.Channel)
} }
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/plan?%s", req.BackendURL, url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode()) path := fmt.Sprintf("/clusters/%s/nodes/%s/updates/plan?%s", url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
if raw, viaFabric, err := updateControlRawViaFabric(ctx, req, client.RawControlRequest{Method: http.MethodGet, Path: path}); viaFabric {
if err != nil {
return NodeUpdatePlan{}, err
}
if raw.StatusCode < 200 || raw.StatusCode >= 300 {
return NodeUpdatePlan{}, fmt.Errorf("fetch update plan via fabric: status %d", raw.StatusCode)
}
var out NodeUpdatePlanResponse
if err := json.Unmarshal(raw.Body, &out); err != nil {
return NodeUpdatePlan{}, err
}
if err := verifyNodeUpdatePlanAuthority(req, out.Plan); err != nil {
return NodeUpdatePlan{}, err
}
return out.Plan, nil
}
endpoint := req.BackendURL + path
if req.BackendURL == "" {
return NodeUpdatePlan{}, errors.New("update plan control API is unavailable: no active fabric route and backend-url is empty")
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil { if err != nil {
return NodeUpdatePlan{}, err return NodeUpdatePlan{}, err
@@ -538,6 +568,110 @@ func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan
return out.Plan, nil return out.Plan, nil
} }
// updateControlRawViaFabric attempts to send rawReq to the cluster control API
// over the QUIC fabric, using the signed registry records carried in req.
//
// The boolean result reports whether a fabric endpoint actually answered:
//
//   - (response, true, nil): an endpoint returned a decodable response. The
//     HTTP-style status code inside the response is NOT checked here.
//   - (zero, false, nil): req has no registry records, or the registry did not
//     resolve any control-API endpoint. Callers may fall back to backend-url.
//   - (zero, false, err): the authority key could not be decoded, the registry
//     records failed to load, rawReq could not be marshalled, or every
//     resolved endpoint failed (err is the last endpoint's failure).
//
// NOTE(review): the transport created here is never explicitly closed in this
// function — confirm whether the QUIC fabric transport holds resources that
// need releasing.
func updateControlRawViaFabric(ctx context.Context, req UpdateRequest, rawReq client.RawControlRequest) (client.RawControlResponse, bool, error) {
	var unavailable client.RawControlResponse

	if strings.TrimSpace(req.FabricRegistryRecordsJSON) == "" {
		return unavailable, false, nil
	}

	authorityKey, err := decodeUpdateFabricRegistryPublicKey(req)
	if err != nil {
		return unavailable, false, err
	}

	// Only records signed by the pinned cluster authority are trusted, and
	// only for the control-API service.
	controlIssuer := mesh.FabricRegistryTrustedIssuer{
		IssuerID:  "cluster-authority",
		Role:      mesh.FabricRegistryAuthorityControl,
		PublicKey: authorityKey,
		Scopes:    []string{mesh.FabricRegistryScopeFarm, mesh.FabricRegistryScopeCluster, mesh.FabricRegistryScopeOrganization},
		Services:  []string{mesh.FabricRegistryServiceControlAPI},
	}
	policy := mesh.FabricRegistryVerificationPolicy{
		LocalClusterID:     req.ClusterID,
		TrustedIssuers:     []mesh.FabricRegistryTrustedIssuer{controlIssuer},
		RequiredSignatures: 1,
		MaxClockSkew:       2 * time.Minute,
		Now:                time.Now().UTC(),
	}
	registry, _, err := mesh.LoadFabricRegistryBootstrapRecords(req.FabricRegistryRecordsJSON, policy, false)
	if err != nil {
		return unavailable, false, err
	}

	transport := mesh.NewQUICFabricTransport(nil)
	if req.NodeID != "" {
		transport.SetLocalPeerID(req.NodeID)
	}

	// Probe candidates before resolving; the probe's own result is not used.
	probe := mesh.FabricRegistryLiveProbeRequest{
		ClusterID:       req.ClusterID,
		PreferredRegion: req.MeshRegion,
		Timeout:         2 * time.Second,
		MaxCandidates:   8,
		Now:             time.Now().UTC(),
	}
	registry.VerifyCandidates(ctx, transport, probe)

	lookup := mesh.FabricRegistryResolveRequest{
		ClusterID:       req.ClusterID,
		Service:         mesh.FabricRegistryServiceControlAPI,
		Scope:           mesh.FabricRegistryScopeCluster,
		PreferredRegion: req.MeshRegion,
		Now:             time.Now().UTC(),
	}
	resolved := registry.ResolveService(lookup)
	if !resolved.Found || len(resolved.Endpoints) == 0 {
		return unavailable, false, nil
	}

	payload, err := json.Marshal(rawReq)
	if err != nil {
		return unavailable, false, err
	}

	// decodeEnvelope unwraps the forwarded reply: an envelope that carries
	// either an error string or the JSON-encoded raw control response.
	decodeEnvelope := func(body []byte) (client.RawControlResponse, error) {
		var envelope struct {
			Payload json.RawMessage `json:"payload,omitempty"`
			Error   string          `json:"error,omitempty"`
		}
		if decodeErr := json.Unmarshal(body, &envelope); decodeErr != nil {
			return client.RawControlResponse{}, decodeErr
		}
		if strings.TrimSpace(envelope.Error) != "" {
			return client.RawControlResponse{}, errors.New(envelope.Error)
		}
		var response client.RawControlResponse
		if decodeErr := json.Unmarshal(envelope.Payload, &response); decodeErr != nil {
			return client.RawControlResponse{}, decodeErr
		}
		return response, nil
	}

	// Try endpoints in resolution order; the first usable reply wins.
	var lastErr error
	for _, candidate := range resolved.Endpoints {
		forwarded, sendErr := mesh.SendFabricControlForward(ctx, transport, candidate, payload, 5*time.Second)
		if sendErr != nil {
			lastErr = sendErr
			continue
		}
		response, decodeErr := decodeEnvelope(forwarded.Payload)
		if decodeErr != nil {
			lastErr = decodeErr
			continue
		}
		return response, true, nil
	}
	if lastErr == nil {
		lastErr = errors.New("fabric control registry endpoints unavailable")
	}
	return unavailable, false, lastErr
}
// decodeUpdateFabricRegistryPublicKey returns the Ed25519 public key used to
// verify fabric registry records for req.
//
// The key text is taken from req.ClusterAuthorityPublicKey. When that is blank
// and req.StateDir is set, the key pinned in the identity file under StateDir
// is used instead; a failure to load that file is deliberately ignored and
// surfaces as the "key is required" error below.
//
// The text may be base64 in any of the four standard library alphabets
// (standard or URL-safe, padded or unpadded). The first alphabet that decodes
// the text decides the result; it must yield exactly ed25519.PublicKeySize
// bytes.
//
// An error is returned when no key text is available, when the text is not
// valid base64, or when the decoded length is wrong.
func decodeUpdateFabricRegistryPublicKey(req UpdateRequest) (ed25519.PublicKey, error) {
	value := strings.TrimSpace(req.ClusterAuthorityPublicKey)
	if value == "" && strings.TrimSpace(req.StateDir) != "" {
		// Best effort: fall back to the key stored with the node identity.
		if identity, err := state.Load(filepath.Join(req.StateDir, state.FileName)); err == nil {
			value = strings.TrimSpace(identity.ClusterAuthorityPublicKey)
		}
	}
	if value == "" {
		return nil, errors.New("cluster authority public key is required for fabric registry records")
	}

	errInvalid := errors.New("cluster authority public key must be base64 Ed25519 public key")
	// Order matters only for backward compatibility: the previously supported
	// alphabets are tried in their original relative order, with padded
	// URL-safe added so keys produced by URL-safe encoders that keep "="
	// padding are accepted too.
	alphabets := [...]*base64.Encoding{
		base64.StdEncoding,
		base64.RawStdEncoding,
		base64.URLEncoding,
		base64.RawURLEncoding,
	}
	for _, alphabet := range alphabets {
		decoded, err := alphabet.DecodeString(value)
		if err != nil {
			continue
		}
		if len(decoded) != ed25519.PublicKeySize {
			return nil, errInvalid
		}
		return ed25519.PublicKey(decoded), nil
	}
	return nil, errInvalid
}
func verifyNodeUpdatePlanAuthority(req UpdateRequest, plan NodeUpdatePlan) error { func verifyNodeUpdatePlanAuthority(req UpdateRequest, plan NodeUpdatePlan) error {
identity, ok := pinnedUpdatePlanAuthority(req) identity, ok := pinnedUpdatePlanAuthority(req)
if !ok { if !ok {
@@ -642,6 +776,9 @@ func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error { func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
backendURL = strings.TrimRight(strings.TrimSpace(backendURL), "/") backendURL = strings.TrimRight(strings.TrimSpace(backendURL), "/")
if backendURL == "" {
return errors.New("update status control API is unavailable: backend-url is empty")
}
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/status", backendURL, url.PathEscape(clusterID), url.PathEscape(nodeID)) endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/status", backendURL, url.PathEscape(clusterID), url.PathEscape(nodeID))
body, err := json.Marshal(request) body, err := json.Marshal(request)
if err != nil { if err != nil {
@@ -663,6 +800,33 @@ func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID s
return nil return nil
} }
// ReportNodeUpdateStatusForRequest posts an update status report for the node
// described by req, preferring the fabric control route and falling back to
// the plain backend-url HTTP API.
//
// req is first passed through resolveUpdateRequest. The status is then sent
// via updateControlRawViaFabric; when a fabric endpoint answers, its status
// code decides the outcome (any non-2xx code is an error) and no fallback is
// attempted. When no fabric endpoint answered, the report is sent with
// ReportNodeUpdateStatus instead.
//
// If the fabric attempt failed AND the fallback fails as well, the returned
// error wraps the fallback error (so errors.Is/As still match it) and carries
// the fabric failure in its message, so the reason the preferred route was
// not used is not lost. A fabric failure followed by a successful fallback
// still returns nil.
func ReportNodeUpdateStatusForRequest(ctx context.Context, req UpdateRequest, request NodeUpdateStatusRequest) error {
	var err error
	req, err = resolveUpdateRequest(req)
	if err != nil {
		return err
	}
	body, err := json.Marshal(request)
	if err != nil {
		return err
	}
	raw, viaFabric, fabricErr := updateControlRawViaFabric(ctx, req, client.RawControlRequest{
		Method: http.MethodPost,
		Path:   fmt.Sprintf("/clusters/%s/nodes/%s/updates/status", url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID)),
		Body:   body,
	})
	if viaFabric {
		if fabricErr != nil {
			return fabricErr
		}
		if raw.StatusCode < 200 || raw.StatusCode >= 300 {
			return fmt.Errorf("report update status via fabric: status %d", raw.StatusCode)
		}
		return nil
	}

	fallbackErr := ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, request)
	if fallbackErr != nil && fabricErr != nil {
		// Both routes failed: keep the fabric failure visible instead of
		// silently dropping it.
		return fmt.Errorf("%w (fabric control route failed first: %v)", fallbackErr, fabricErr)
	}
	return fallbackErr
}
func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) { func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) {
out, err := runner.Run(ctx, docker, "inspect", containerName) out, err := runner.Run(ctx, docker, "inspect", containerName)
if err != nil { if err != nil {
@@ -686,9 +850,8 @@ func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner Co
Network: firstNonEmpty(inspected[0].HostConfig.NetworkMode, DefaultNetwork), Network: firstNonEmpty(inspected[0].HostConfig.NetworkMode, DefaultNetwork),
RestartPolicy: firstNonEmpty(inspected[0].HostConfig.RestartPolicy.Name, "unless-stopped"), RestartPolicy: firstNonEmpty(inspected[0].HostConfig.RestartPolicy.Name, "unless-stopped"),
WorkloadSupervisionEnabled: parseBool(env["RAP_WORKLOAD_SUPERVISION_ENABLED"]), WorkloadSupervisionEnabled: parseBool(env["RAP_WORKLOAD_SUPERVISION_ENABLED"]),
MeshSyntheticRuntimeEnabled: true, MeshSyntheticRuntimeEnabled: parseBool(env["RAP_MESH_SYNTHETIC_RUNTIME_ENABLED"]),
MeshProductionForwardingEnabled: parseBool(env["RAP_MESH_PRODUCTION_FORWARDING_ENABLED"]), MeshProductionForwardingEnabled: parseBool(env["RAP_MESH_PRODUCTION_FORWARDING_ENABLED"]),
MeshFabricSessionEnabled: parseBool(env["RAP_MESH_FABRIC_SESSION_ENABLED"]),
VPNFabricSessionTransportEnabled: parseBool(env["RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED"]), VPNFabricSessionTransportEnabled: parseBool(env["RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED"]),
MeshQUICFabricEnabled: parseBool(env["RAP_MESH_QUIC_FABRIC_ENABLED"]), MeshQUICFabricEnabled: parseBool(env["RAP_MESH_QUIC_FABRIC_ENABLED"]),
MeshQUICFabricListenAddr: env["RAP_MESH_QUIC_FABRIC_LISTEN_ADDR"], MeshQUICFabricListenAddr: env["RAP_MESH_QUIC_FABRIC_LISTEN_ADDR"],
@@ -4,9 +4,17 @@ import (
"context" "context"
"crypto/ed25519" "crypto/ed25519"
cryptorand "crypto/rand" cryptorand "crypto/rand"
"crypto/rsa"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"crypto/x509/pkix"
"encoding/base64" "encoding/base64"
"encoding/hex"
"encoding/json" "encoding/json"
"fmt" "fmt"
"math/big"
"net"
"net/http" "net/http"
"net/http/httptest" "net/http/httptest"
"os" "os"
@@ -16,6 +24,8 @@ import (
"time" "time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority" clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state" "github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
) )
@@ -120,6 +130,81 @@ func signHostAgentPayload(t *testing.T, payload json.RawMessage, privateKey ed25
} }
} }
// testHostAgentQUICTLSConfig builds a throwaway server TLS configuration for
// QUIC fabric tests.
//
// It generates a fresh 2048-bit RSA key and a self-signed certificate for
// 127.0.0.1 (as both common name and IP SAN) that is valid from one hour ago
// until one hour from now, and advertises the "rap-fabric-data-session-v1"
// ALPN protocol. Any failure aborts the test via t.Fatalf.
func testHostAgentQUICTLSConfig(t *testing.T) *tls.Config {
	t.Helper()

	privateKey, keyErr := rsa.GenerateKey(cryptorand.Reader, 2048)
	if keyErr != nil {
		t.Fatalf("generate rsa key: %v", keyErr)
	}

	loopback := net.ParseIP("127.0.0.1")
	selfSigned := &x509.Certificate{
		SerialNumber: big.NewInt(1),
		Subject:      pkix.Name{CommonName: "127.0.0.1"},
		NotBefore:    time.Now().Add(-time.Hour),
		NotAfter:     time.Now().Add(time.Hour),
		KeyUsage:     x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		ExtKeyUsage:  []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		IPAddresses:  []net.IP{loopback},
	}

	// The template doubles as its own parent, which makes it self-signed.
	certDER, certErr := x509.CreateCertificate(cryptorand.Reader, selfSigned, selfSigned, &privateKey.PublicKey, privateKey)
	if certErr != nil {
		t.Fatalf("create cert: %v", certErr)
	}

	serverCert := tls.Certificate{
		Certificate: [][]byte{certDER},
		PrivateKey:  privateKey,
	}
	return &tls.Config{
		Certificates: []tls.Certificate{serverCert},
		NextProtos:   []string{"rap-fabric-data-session-v1"},
	}
}
// testHostAgentQUICCertSHA256 returns the lowercase hex SHA-256 digest of the
// first DER blob of the first certificate in cfg. The test is aborted with
// t.Fatal when cfg carries no such certificate.
func testHostAgentQUICCertSHA256(t *testing.T, cfg *tls.Config) string {
	t.Helper()

	var leafDER []byte
	if chain := cfg.Certificates; len(chain) > 0 && len(chain[0].Certificate) > 0 {
		leafDER = chain[0].Certificate[0]
	} else {
		t.Fatal("missing test certificate")
	}

	hasher := sha256.New()
	hasher.Write(leafDER)
	return hex.EncodeToString(hasher.Sum(nil))
}
// signedUpdateControlRegistry returns a JSON array holding one signed fabric
// registry gossip record that advertises a single control-API endpoint for
// clusterID.
//
// The record is cluster-scoped, epoch 1, issued one minute ago and expiring in
// one hour, and is signed as the "cluster-authority" control issuer using the
// given key pair. The endpoint uses the "direct_quic" transport at the given
// address and carries certSHA256 as its peer certificate digest. Signing or
// marshalling failures abort the test.
func signedUpdateControlRegistry(t *testing.T, clusterID, endpoint, certSHA256 string, publicKey ed25519.PublicKey, privateKey ed25519.PrivateKey) string {
	t.Helper()

	const authorityID = "cluster-authority"
	issuedBase := time.Now().UTC()

	controlEndpoint := mesh.FabricRegistryEndpoint{
		EndpointID:     "control-a",
		Address:        endpoint,
		Transport:      "direct_quic",
		PeerCertSHA256: certSHA256,
	}
	unsigned := mesh.FabricRegistryGossipRecord{
		SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
		ClusterID:     clusterID,
		Service:       mesh.FabricRegistryServiceControlAPI,
		Scope:         mesh.FabricRegistryScopeCluster,
		Epoch:         1,
		IssuedAt:      issuedBase.Add(-time.Minute),
		ExpiresAt:     issuedBase.Add(time.Hour),
		IssuerNodeID:  authorityID,
		IssuerRole:    mesh.FabricRegistryAuthorityControl,
		Endpoints:     []mesh.FabricRegistryEndpoint{controlEndpoint},
	}
	trustedIssuer := mesh.FabricRegistryTrustedIssuer{
		IssuerID:  authorityID,
		Role:      mesh.FabricRegistryAuthorityControl,
		PublicKey: publicKey,
	}

	signedRecord, signErr := mesh.SignFabricRegistryGossipRecord(unsigned, trustedIssuer, privateKey)
	if signErr != nil {
		t.Fatalf("sign registry record: %v", signErr)
	}

	encoded, marshalErr := json.Marshal([]mesh.FabricRegistryGossipRecord{signedRecord})
	if marshalErr != nil {
		t.Fatalf("marshal registry record: %v", marshalErr)
	}
	return string(encoded)
}
// mustJSONRaw encodes value as JSON and returns the bytes as a
// json.RawMessage, aborting the test with t.Fatalf if encoding fails.
func mustJSONRaw(t *testing.T, value any) json.RawMessage {
	t.Helper()
	encoded, marshalErr := json.Marshal(value)
	if marshalErr == nil {
		return encoded
	}
	t.Fatalf("marshal json: %v", marshalErr)
	return nil // not reached: Fatalf stops the calling goroutine
}
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) { func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
urls := artifactURLsForBackend(ReleaseArtifact{ urls := artifactURLsForBackend(ReleaseArtifact{
URL: "/downloads/rap-node-agent-0.2.92.tar", URL: "/downloads/rap-node-agent-0.2.92.tar",
@@ -223,6 +308,111 @@ func TestFetchNodeUpdatePlanAcceptsSignedPlanWithPinnedAuthority(t *testing.T) {
} }
} }
// TestFetchNodeUpdatePlanUsesFabricRegistryQUICControlAPI checks that
// FetchNodeUpdatePlan obtains the plan through a QUIC fabric control endpoint
// advertised by signed registry records.
//
// The request also carries a backend URL (http://127.0.0.1:1) that the test
// never serves, so the signed plan returned here can only come from the local
// QUIC fabric server. The handler additionally asserts that the forwarded
// request is a GET on the node's update-plan path.
func TestFetchNodeUpdatePlanUsesFabricRegistryQUICControlAPI(t *testing.T) {
	stateDir, publicKey, privateKey := writePinnedAuthorityIdentity(t)
	plan := map[string]any{
		"schema_version":        "rap.node_update_plan.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"reason":                "already_current",
		"production_forwarding": false,
	}
	payload := map[string]any{
		"schema_version":        "rap.node_update_plan_authority.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"target_version":        "",
		"artifact_sha256":       "",
		"control_plane_only":    true,
		"production_forwarding": false,
	}
	rawPayload, signature := signedAuthorityPayload(t, publicKey, privateKey, payload)
	plan["authority_payload"] = json.RawMessage(rawPayload)
	plan["authority_signature"] = signature

	// Encode the response body here, on the test goroutine. mustJSONRaw may
	// call t.Fatalf, which is only valid from the goroutine running the test;
	// the control handler below is invoked by the QUIC server instead.
	planBody := mustJSONRaw(t, map[string]any{"node_update_plan": plan})

	tlsConfig := testHostAgentQUICTLSConfig(t)
	server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  tlsConfig,
		FabricControlHandler: func(_ context.Context, forwarded []byte) ([]byte, error) {
			// Handler-local: nothing outside the handler reads the request.
			var received client.RawControlRequest
			if err := json.Unmarshal(forwarded, &received); err != nil {
				return nil, err
			}
			if received.Method != http.MethodGet || !strings.HasPrefix(received.Path, "/clusters/cluster-1/nodes/node-1/updates/plan?") {
				return nil, fmt.Errorf("unexpected request: %+v", received)
			}
			return json.Marshal(client.RawControlResponse{StatusCode: 200, Body: planBody})
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer server.Close()

	got, err := FetchNodeUpdatePlan(context.Background(), UpdateRequest{
		BackendURL:                "http://127.0.0.1:1",
		ClusterID:                 "cluster-1",
		NodeID:                    "node-1",
		StateDir:                  stateDir,
		FabricRegistryRecordsJSON: signedUpdateControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testHostAgentQUICCertSHA256(t, tlsConfig), publicKey, privateKey),
		CurrentVersion:            "0.1.0",
		OS:                        "linux",
		Arch:                      "amd64",
		InstallType:               "docker",
	})
	if err != nil {
		t.Fatalf("fetch plan via fabric: %v", err)
	}
	if got.Action != "none" || got.Reason != "already_current" {
		t.Fatalf("plan = %+v", got)
	}
}
// TestReportNodeUpdateStatusUsesFabricRegistryQUICControlAPI checks that
// ReportNodeUpdateStatusForRequest delivers the status report through a QUIC
// fabric control endpoint advertised by signed registry records.
//
// The local fabric server accepts only a POST to the node's update-status path
// and answers 204; afterwards the test inspects the captured request body for
// the reported phase. The backend URL in the request (http://127.0.0.1:1) is
// not served by the test.
func TestReportNodeUpdateStatusUsesFabricRegistryQUICControlAPI(t *testing.T) {
	stateDir, publicKey, privateKey := writePinnedAuthorityIdentity(t)
	tlsConfig := testHostAgentQUICTLSConfig(t)

	const statusPath = "/clusters/cluster-1/nodes/node-1/updates/status"

	// received keeps the last request seen by the handler for the final check.
	var received client.RawControlRequest
	serverCfg := mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  tlsConfig,
		FabricControlHandler: func(_ context.Context, forwarded []byte) ([]byte, error) {
			if decodeErr := json.Unmarshal(forwarded, &received); decodeErr != nil {
				return nil, decodeErr
			}
			isStatusPost := received.Method == http.MethodPost && received.Path == statusPath
			if !isStatusPost {
				return nil, fmt.Errorf("unexpected request: %+v", received)
			}
			return json.Marshal(client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)})
		},
	}
	server, err := mesh.StartQUICFabricServer(context.Background(), serverCfg)
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer server.Close()

	registryJSON := signedUpdateControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testHostAgentQUICCertSHA256(t, tlsConfig), publicKey, privateKey)
	updateReq := UpdateRequest{
		BackendURL:                "http://127.0.0.1:1",
		ClusterID:                 "cluster-1",
		NodeID:                    "node-1",
		StateDir:                  stateDir,
		FabricRegistryRecordsJSON: registryJSON,
		CurrentVersion:            "0.1.0",
		OS:                        "linux",
		Arch:                      "amd64",
		InstallType:               "docker",
	}
	status := NodeUpdateStatusRequest{Product: "rap-node-agent", Phase: "download", Status: "started"}

	err = ReportNodeUpdateStatusForRequest(context.Background(), updateReq, status)
	if err != nil {
		t.Fatalf("report status via fabric: %v", err)
	}

	bodyHasPhase := len(received.Body) != 0 && strings.Contains(string(received.Body), `"phase":"download"`)
	if !bodyHasPhase {
		t.Fatalf("unexpected status body: %s", string(received.Body))
	}
}
func TestFetchNodeUpdatePlanAcceptsQuorumSignedPlan(t *testing.T) { func TestFetchNodeUpdatePlanAcceptsQuorumSignedPlan(t *testing.T) {
stateDir, descriptor, privateKeys := writePinnedQuorumIdentity(t) stateDir, descriptor, privateKeys := writePinnedQuorumIdentity(t)
plan := map[string]any{ plan := map[string]any{
@@ -66,7 +66,6 @@ func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInsta
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled, WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled, MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled, MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled, VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled, MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr, MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
@@ -48,29 +48,29 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
} }
status.Payload["task"] = req.WindowsTaskName status.Payload["task"] = req.WindowsTaskName
status.Payload["binary_path"] = req.BinaryPath status.Payload["binary_path"] = req.BinaryPath
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status) _ = ReportNodeUpdateStatusForRequest(ctx, req, status)
} }
return result, nil return result, nil
} }
if plan.ProductionForwarding && !req.AllowProductionMesh { if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled") err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if plan.Artifact == nil { if plan.Artifact == nil {
err := errors.New("update plan has no artifact") err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType { if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType) err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err return result, err
} }
if req.DryRun { if req.DryRun {
return result, nil return result, nil
} }
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product, Product: req.Product,
CurrentVersion: req.CurrentVersion, CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -81,7 +81,7 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName}, Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
}) })
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL) urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product, Product: req.Product,
CurrentVersion: req.CurrentVersion, CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -93,7 +93,7 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
}) })
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes) path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil { if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err return result, err
} }
defer os.Remove(path) defer os.Remove(path)
@@ -101,16 +101,16 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
if err := copyFile(path, req.BinaryPath, 0o755); err != nil { if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath) m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil { if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
return result, err return result, err
} }
} }
result.Replaced = true result.Replaced = true
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil { if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err)) _ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err return result, err
} }
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{ _ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product, Product: req.Product,
CurrentVersion: req.CurrentVersion, CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion,
@@ -290,7 +290,6 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
updateLoopArgs := []string{ updateLoopArgs := []string{
`"` + hostAgentPath + `"`, `"` + hostAgentPath + `"`,
"update-loop", "update-loop",
"--backend-url", `"` + cfg.RuntimeConfig.BackendURL + `"`,
"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`, "--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
"--state-dir", `"` + result.StateDir + `"`, "--state-dir", `"` + result.StateDir + `"`,
"--current-version", currentVersion, "--current-version", currentVersion,
@@ -306,6 +305,10 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
"--host-agent-current-version", currentVersion, "--host-agent-current-version", currentVersion,
"--host-agent-binary-path", `"` + hostAgentPath + `"`, "--host-agent-binary-path", `"` + hostAgentPath + `"`,
} }
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
updateLoopArgs = append(updateLoopArgs, "--backend-url", `"`+strings.TrimSpace(cfg.RuntimeConfig.BackendURL)+`"`)
}
updateLoopArgs = appendFabricUpdateArgs(updateLoopArgs, cfg.RuntimeConfig)
if strings.TrimSpace(cfg.NodeID) != "" { if strings.TrimSpace(cfg.NodeID) != "" {
updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`) updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
} }
@@ -6,13 +6,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"net/http" "net/http"
"net/url"
"strings"
"sync"
"time" "time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/gorilla/websocket"
) )
type Client struct { type Client struct {
@@ -20,38 +14,6 @@ type Client struct {
HTTPClient *http.Client HTTPClient *http.Client
} }
// FabricSessionDialOptions configures how Client dials a fabric session
// WebSocket.
type FabricSessionDialOptions struct {
// Token, when non-blank after trimming, is sent in the
// X-RAP-Fabric-Session-Token handshake header.
Token string
// Header carries extra handshake headers; it is copied before the token is added.
Header http.Header
// Dialer replaces the default WebSocket dialer when non-nil.
Dialer *websocket.Dialer
// Timeout, when positive, becomes the handshake timeout of the default dialer
// and the per-frame deadline used when the call context has no deadline.
Timeout time.Duration
// MaxPayload is handed to the frame decoder; values <= 0 select
// fabricproto.DefaultMaxPayload.
MaxPayload int
}
// FabricSessionClient exchanges fabricproto frames over one WebSocket
// connection.
type FabricSessionClient struct {
// conn is the underlying WebSocket; nil means the client is unusable.
conn *websocket.Conn
// timeout is the fallback read/write deadline when the context has none.
timeout time.Duration
// maxPayload is passed to fabricproto.UnmarshalFrame for every received frame.
maxPayload int
// readMu and writeMu serialize ReadFrame and WriteFrame respectively.
readMu sync.Mutex
writeMu sync.Mutex
}
// FabricSessionPumpOptions sizes the channels of a FabricSessionPump.
// StartPump replaces non-positive values with 64 (outbound, inbound) and 8
// (errors).
type FabricSessionPumpOptions struct {
OutboundBuffer int
InboundBuffer int
ErrorBuffer int
}
// FabricSessionPump runs a write loop and a read loop over a
// FabricSessionClient and exposes them through channels.
type FabricSessionPump struct {
// session is the client both loops operate on; Close closes it.
session *FabricSessionClient
// outbound holds frames queued by Send for the write loop.
outbound chan fabricproto.Frame
// inbound holds frames delivered by the read loop.
inbound chan fabricproto.Frame
// errors receives loop failures; entries are dropped when it is full.
errors chan error
// done is closed exactly once by Close.
done chan struct{}
// cancel cancels the context handed to both loops.
cancel context.CancelFunc
// closeMu guards the one-time shutdown in Close.
closeMu sync.Once
}
func NewClient(baseURL string) Client { func NewClient(baseURL string) Client {
return Client{ return Client{
BaseURL: baseURL, BaseURL: baseURL,
@@ -147,270 +109,3 @@ func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope)
} }
return result, nil return result, nil
} }
// DialFabricSession performs the WebSocket handshake against the fabric
// session endpoint derived from c.BaseURL.
//
// The handshake headers are a copy of opts.Header plus, when opts.Token is
// non-blank, the X-RAP-Fabric-Session-Token header. opts.Dialer is used when
// set; otherwise a copy of websocket.DefaultDialer is used, with its handshake
// timeout replaced by opts.Timeout when that is positive.
func (c Client) DialFabricSession(ctx context.Context, opts FabricSessionDialOptions) (*websocket.Conn, *http.Response, error) {
	wsURL, err := c.fabricSessionWebSocketURL()
	if err != nil {
		return nil, nil, err
	}

	headers := cloneHeader(opts.Header)
	if token := strings.TrimSpace(opts.Token); token != "" {
		headers.Set("X-RAP-Fabric-Session-Token", token)
	}

	if opts.Dialer != nil {
		return opts.Dialer.DialContext(ctx, wsURL, headers)
	}

	// Work on a copy so the package-level default dialer is never mutated.
	defaults := *websocket.DefaultDialer
	if opts.Timeout > 0 {
		defaults.HandshakeTimeout = opts.Timeout
	}
	return defaults.DialContext(ctx, wsURL, headers)
}
// OpenFabricSession dials the fabric session WebSocket and wraps the
// connection in a FabricSessionClient.
//
// When the handshake fails with an HTTP response, the returned error includes
// the status code and wraps the dial error; the response is returned as well.
// opts.MaxPayload <= 0 selects fabricproto.DefaultMaxPayload.
func (c Client) OpenFabricSession(ctx context.Context, opts FabricSessionDialOptions) (*FabricSessionClient, *http.Response, error) {
	conn, resp, err := c.DialFabricSession(ctx, opts)
	switch {
	case err != nil && resp != nil:
		return nil, resp, fmt.Errorf("fabric session websocket rejected with status %d: %w", resp.StatusCode, err)
	case err != nil:
		return nil, resp, err
	}

	session := &FabricSessionClient{
		conn:       conn,
		timeout:    opts.Timeout,
		maxPayload: opts.MaxPayload,
	}
	if session.maxPayload <= 0 {
		session.maxPayload = fabricproto.DefaultMaxPayload
	}
	return session, resp, nil
}
// SendFabricSessionFrame opens a fabric session, performs a single
// write-then-read exchange with frame, and closes the session again.
func (c Client) SendFabricSessionFrame(ctx context.Context, opts FabricSessionDialOptions, frame fabricproto.Frame) (fabricproto.Frame, error) {
	session, _, openErr := c.OpenFabricSession(ctx, opts)
	if openErr != nil {
		return fabricproto.Frame{}, openErr
	}
	defer session.Close()

	reply, err := session.RoundTrip(ctx, frame)
	return reply, err
}
// Close closes the underlying WebSocket connection. It is a no-op returning
// nil on a nil client or a client without a connection.
func (c *FabricSessionClient) Close() error {
	if c != nil && c.conn != nil {
		return c.conn.Close()
	}
	return nil
}
// WriteFrame encodes frame and sends it as one binary WebSocket message.
// Writes are serialized by writeMu; the write deadline is derived from ctx or
// the client timeout before each write.
func (c *FabricSessionClient) WriteFrame(ctx context.Context, frame fabricproto.Frame) error {
	if c == nil || c.conn == nil {
		return fmt.Errorf("fabric session client is closed")
	}

	// Encode outside the lock so a marshal failure never holds up other writers.
	encoded, marshalErr := fabricproto.MarshalFrame(frame)
	if marshalErr != nil {
		return marshalErr
	}

	c.writeMu.Lock()
	defer c.writeMu.Unlock()

	c.applyWriteDeadline(ctx)
	return c.conn.WriteMessage(websocket.BinaryMessage, encoded)
}
// ReadFrame reads the next WebSocket message and decodes it as a frame.
// Reads are serialized by readMu. A message that is not binary is rejected
// with an error naming its message type.
func (c *FabricSessionClient) ReadFrame(ctx context.Context) (fabricproto.Frame, error) {
	if c == nil || c.conn == nil {
		return fabricproto.Frame{}, fmt.Errorf("fabric session client is closed")
	}

	c.readMu.Lock()
	defer c.readMu.Unlock()

	c.applyReadDeadline(ctx)
	kind, raw, readErr := c.conn.ReadMessage()
	switch {
	case readErr != nil:
		return fabricproto.Frame{}, readErr
	case kind != websocket.BinaryMessage:
		return fabricproto.Frame{}, fmt.Errorf("fabric session websocket returned non-binary message type %d", kind)
	}
	return fabricproto.UnmarshalFrame(raw, c.maxPayload)
}
// RoundTrip writes frame and then returns the next frame read from the
// session. The write and the read take their locks separately.
func (c *FabricSessionClient) RoundTrip(ctx context.Context, frame fabricproto.Frame) (fabricproto.Frame, error) {
	writeErr := c.WriteFrame(ctx, frame)
	if writeErr != nil {
		return fabricproto.Frame{}, writeErr
	}
	return c.ReadFrame(ctx)
}
// StartPump starts a write loop and a read loop over c and returns the pump
// that owns them. Non-positive buffer sizes default to 64 (outbound and
// inbound) and 8 (errors). Both loops run under a context derived from ctx
// that the pump's Close cancels.
func (c *FabricSessionClient) StartPump(ctx context.Context, opts FabricSessionPumpOptions) *FabricSessionPump {
	outboundCap, inboundCap, errorCap := opts.OutboundBuffer, opts.InboundBuffer, opts.ErrorBuffer
	if outboundCap <= 0 {
		outboundCap = 64
	}
	if inboundCap <= 0 {
		inboundCap = 64
	}
	if errorCap <= 0 {
		errorCap = 8
	}

	loopCtx, stop := context.WithCancel(ctx)
	p := &FabricSessionPump{
		session:  c,
		outbound: make(chan fabricproto.Frame, outboundCap),
		inbound:  make(chan fabricproto.Frame, inboundCap),
		errors:   make(chan error, errorCap),
		done:     make(chan struct{}),
		cancel:   stop,
	}
	go p.writeLoop(loopCtx)
	go p.readLoop(loopCtx)
	return p
}
// Send queues frame for the write loop.
//
// It returns ctx.Err() when ctx is done, an error when the pump is nil or
// closed, and nil once the frame has been placed on the outbound queue.
// Queuing does not mean the frame was written to the connection.
func (p *FabricSessionPump) Send(ctx context.Context, frame fabricproto.Frame) error {
	if p == nil {
		return fmt.Errorf("fabric session pump is nil")
	}

	// A select with several ready cases picks one at random, so without this
	// pre-check a frame could be queued on a pump that is already closed (or
	// for a context that is already cancelled) whenever the buffer has room.
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-p.done:
		return fmt.Errorf("fabric session pump is closed")
	default:
	}

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-p.done:
		return fmt.Errorf("fabric session pump is closed")
	case p.outbound <- frame:
		return nil
	}
}
// Frames returns the receive-only channel of frames delivered by the read
// loop, or nil for a nil pump.
func (p *FabricSessionPump) Frames() <-chan fabricproto.Frame {
	var frames <-chan fabricproto.Frame
	if p != nil {
		frames = p.inbound
	}
	return frames
}
// Errors returns the receive-only channel of loop failures, or nil for a nil
// pump.
func (p *FabricSessionPump) Errors() <-chan error {
	var failures <-chan error
	if p != nil {
		failures = p.errors
	}
	return failures
}
// Closed reports whether the pump has been closed. A nil pump counts as
// closed. The check never blocks.
func (p *FabricSessionPump) Closed() bool {
	if p != nil {
		select {
		case <-p.done:
			// done is closed: fall through to the closed result.
		default:
			return false
		}
	}
	return true
}
// Close shuts the pump down once: it closes done, cancels the loop context
// and closes the session. Only the first call returns the session's close
// error; later calls, and calls on a nil pump, return nil.
func (p *FabricSessionPump) Close() error {
	var closeErr error
	if p != nil {
		p.closeMu.Do(func() {
			close(p.done)
			p.cancel()
			closeErr = p.session.Close()
		})
	}
	return closeErr
}
// writeLoop drains the outbound queue onto the session until the context is
// cancelled, the pump is closed, or a write fails. It closes the pump on exit.
func (p *FabricSessionPump) writeLoop(ctx context.Context) {
	defer p.Close()
	for {
		select {
		case <-ctx.Done():
			// Close closes done before cancelling ctx, so a cancellation seen
			// on an already closed pump is a deliberate shutdown, not a
			// failure worth surfacing on the error channel.
			if !p.Closed() {
				p.reportError(ctx.Err())
			}
			return
		case <-p.done:
			return
		case frame := <-p.outbound:
			if err := p.session.WriteFrame(ctx, frame); err != nil {
				p.reportError(err)
				return
			}
		}
	}
}
// readLoop reads frames from the session and delivers them on the inbound
// channel until a read fails, the context is cancelled, or the pump is
// closed. It closes the pump on exit.
func (p *FabricSessionPump) readLoop(ctx context.Context) {
	defer p.Close()
	for {
		frame, err := p.session.ReadFrame(ctx)
		if err != nil {
			// Close tears down the connection, which is what unblocks this
			// read; the resulting error is expected and is not reported.
			if !p.Closed() {
				p.reportError(err)
			}
			return
		}
		select {
		case <-ctx.Done():
			if !p.Closed() {
				p.reportError(ctx.Err())
			}
			return
		case <-p.done:
			return
		case p.inbound <- frame:
		}
	}
}
// reportError offers err to the errors channel without blocking. Nil errors
// are ignored, and the error is dropped when the channel buffer is full.
func (p *FabricSessionPump) reportError(err error) {
	if err != nil {
		select {
		case p.errors <- err:
		default:
			// No room left: drop rather than stall the calling loop.
		}
	}
}
// applyReadDeadline sets the connection's read deadline for the next read:
// the context deadline when there is one, otherwise now plus the client
// timeout when that is positive, otherwise no deadline at all.
func (c *FabricSessionClient) applyReadDeadline(ctx context.Context) {
	// The zero time clears the deadline. Without it, a deadline armed by an
	// earlier call would stay on the connection and fail a later read made
	// with a context that has no deadline.
	var deadline time.Time
	if ctxDeadline, ok := ctx.Deadline(); ok {
		deadline = ctxDeadline
	} else if c.timeout > 0 {
		deadline = time.Now().Add(c.timeout)
	}
	// Best effort: a failure here surfaces on the read that follows.
	_ = c.conn.SetReadDeadline(deadline)
}
// applyWriteDeadline sets the connection's write deadline for the next write:
// the context deadline when there is one, otherwise now plus the client
// timeout when that is positive, otherwise no deadline at all.
func (c *FabricSessionClient) applyWriteDeadline(ctx context.Context) {
	// The zero time clears the deadline. Without it, a deadline armed by an
	// earlier call would stay on the connection and fail a later write made
	// with a context that has no deadline.
	var deadline time.Time
	if ctxDeadline, ok := ctx.Deadline(); ok {
		deadline = ctxDeadline
	} else if c.timeout > 0 {
		deadline = time.Now().Add(c.timeout)
	}
	// Best effort: a failure here surfaces on the write that follows.
	_ = c.conn.SetWriteDeadline(deadline)
}
// fabricSessionWebSocketURL derives the fabric session WebSocket URL from
// c.BaseURL.
//
// http and https are mapped to ws and wss; ws and wss are kept. Any other
// scheme, a blank base URL, an unparsable URL, or a URL without a host is an
// error. The session path is appended to the base path (trailing slashes
// removed) and any query or fragment is dropped.
func (c Client) fabricSessionWebSocketURL() (string, error) {
	base := strings.TrimSpace(c.BaseURL)
	if base == "" {
		return "", fmt.Errorf("mesh base url is required")
	}
	parsed, err := url.Parse(base)
	if err != nil {
		return "", fmt.Errorf("parse mesh base url: %w", err)
	}
	switch parsed.Scheme {
	case "http":
		parsed.Scheme = "ws"
	case "https":
		parsed.Scheme = "wss"
	case "ws", "wss":
	default:
		return "", fmt.Errorf("unsupported mesh base url scheme %q", parsed.Scheme)
	}
	// Without a host the result would be "ws:///mesh/..." and only fail later,
	// inside the dialer, with a less helpful message.
	if parsed.Host == "" {
		return "", fmt.Errorf("mesh base url %q has no host", base)
	}
	parsed.Path = strings.TrimRight(parsed.Path, "/") + "/mesh/v1/fabric/session/ws"
	parsed.RawQuery = ""
	parsed.Fragment = ""
	return parsed.String(), nil
}
// cloneHeader returns a copy of header that shares no slices with it. The
// result is never nil, even for a nil input, and keys are canonicalized
// because every value is re-added through http.Header.Add.
func cloneHeader(header http.Header) http.Header {
	copied := make(http.Header, len(header))
	for name := range header {
		for i := range header[name] {
			copied.Add(name, header[name][i])
		}
	}
	return copied
}
@@ -1,243 +0,0 @@
package mesh
import (
"context"
"net/http/httptest"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestClientFabricSessionFrameRoundTrip sends one ping through
// SendFabricSessionFrame against an httptest server and expects a pong that
// carries the same sequence number and payload.
func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
response, err := client.SendFabricSessionFrame(ctx, FabricSessionDialOptions{
Token: "rap_fsn_clienttest",
Timeout: time.Second,
}, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 12,
Payload: []byte("probe"),
})
if err != nil {
t.Fatalf("send fabric session frame: %v", err)
}
if response.Type != fabricproto.FramePong || response.Sequence != 12 || string(response.Payload) != "probe" {
t.Fatalf("response = %+v, want pong seq 12", response)
}
}
// TestClientFabricSessionPersistentRoundTrips opens one session and performs
// two ping round trips on it, expecting a matching pong for each.
func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
Token: "rap_fsn_persistent",
Timeout: time.Second,
})
if err != nil {
t.Fatalf("open fabric session: %v", err)
}
defer session.Close()
// Both round trips reuse the same session.
first, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 1,
Payload: []byte("first"),
})
if err != nil {
t.Fatalf("first round trip: %v", err)
}
second, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 2,
Payload: []byte("second"),
})
if err != nil {
t.Fatalf("second round trip: %v", err)
}
if first.Type != fabricproto.FramePong || first.Sequence != 1 || string(first.Payload) != "first" {
t.Fatalf("first response = %+v, want pong seq 1", first)
}
if second.Type != fabricproto.FramePong || second.Sequence != 2 || string(second.Payload) != "second" {
t.Fatalf("second response = %+v, want pong seq 2", second)
}
}
// TestClientFabricSessionPersistentDataAcks opens stream 77 on one session,
// sends two data frames on it and expects an ack with the matching stream id
// and sequence for each.
func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
Token: "rap_fsn_dataacks",
Timeout: time.Second,
})
if err != nil {
t.Fatalf("open fabric session: %v", err)
}
defer session.Close()
// The open-stream frame is only written; the test does not read a reply for it.
if err := session.WriteFrame(ctx, fabricproto.Frame{
Type: fabricproto.FrameOpenStream,
StreamID: 77,
TrafficClass: fabricproto.TrafficClassInteractive,
}); err != nil {
t.Fatalf("open stream frame: %v", err)
}
first, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
StreamID: 77,
Sequence: 10,
TrafficClass: fabricproto.TrafficClassInteractive,
Payload: []byte("first payload"),
})
if err != nil {
t.Fatalf("first data round trip: %v", err)
}
second, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
StreamID: 77,
Sequence: 11,
TrafficClass: fabricproto.TrafficClassInteractive,
Payload: []byte("second payload"),
})
if err != nil {
t.Fatalf("second data round trip: %v", err)
}
if first.Type != fabricproto.FrameAck || first.StreamID != 77 || first.Sequence != 10 {
t.Fatalf("first ack = %+v, want stream 77 seq 10", first)
}
if second.Type != fabricproto.FrameAck || second.StreamID != 77 || second.Sequence != 11 {
t.Fatalf("second ack = %+v, want stream 77 seq 11", second)
}
}
// TestClientFabricSessionPumpMovesIndependentFrames queues an open-stream
// frame, a bulk data frame and a ping through a pump, then waits until both
// the data ack and the pong have arrived on the pump's frame channel.
func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
Token: "rap_fsn_pump",
Timeout: time.Second,
})
if err != nil {
t.Fatalf("open fabric session: %v", err)
}
pump := session.StartPump(ctx, FabricSessionPumpOptions{
OutboundBuffer: 4,
InboundBuffer: 4,
ErrorBuffer: 4,
})
// Closing the pump also closes the session it wraps.
defer pump.Close()
if err := pump.Send(ctx, fabricproto.Frame{
Type: fabricproto.FrameOpenStream,
StreamID: 900,
TrafficClass: fabricproto.TrafficClassBulk,
}); err != nil {
t.Fatalf("send open bulk stream: %v", err)
}
if err := pump.Send(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
StreamID: 900,
Sequence: 31,
TrafficClass: fabricproto.TrafficClassBulk,
Payload: []byte("bulk payload"),
}); err != nil {
t.Fatalf("send bulk data: %v", err)
}
if err := pump.Send(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 32,
Payload: []byte("control ping"),
}); err != nil {
t.Fatalf("send ping: %v", err)
}
gotAck := false
gotPong := false
// Frames that match neither expectation are ignored; arrival order is not asserted.
for !gotAck || !gotPong {
select {
case frame := <-pump.Frames():
switch {
case frame.Type == fabricproto.FrameAck && frame.StreamID == 900 && frame.Sequence == 31:
gotAck = true
case frame.Type == fabricproto.FramePong && frame.Sequence == 32 && string(frame.Payload) == "control ping":
gotPong = true
}
case err := <-pump.Errors():
t.Fatalf("pump error: %v", err)
case <-ctx.Done():
t.Fatalf("timed out waiting for pump frames: ack=%v pong=%v", gotAck, gotPong)
}
}
}
// TestClientFabricSessionReportsRejectedStatus dials without a session token
// and expects SendFabricSessionFrame to fail. Only the presence of an error is
// asserted, not its content.
func TestClientFabricSessionReportsRejectedStatus(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, err := client.SendFabricSessionFrame(ctx, FabricSessionDialOptions{}, fabricproto.Frame{Type: fabricproto.FramePing})
if err == nil {
t.Fatal("send fabric session without token unexpectedly succeeded")
}
}
// TestClientFabricSessionWebSocketURL checks the scheme mapping (http to ws,
// https to wss, ws kept) and the path joining of fabricSessionWebSocketURL.
func TestClientFabricSessionWebSocketURL(t *testing.T) {
cases := []struct {
base string
want string
}{
{base: "http://node.example", want: "ws://node.example/mesh/v1/fabric/session/ws"},
{base: "https://node.example/base/", want: "wss://node.example/base/mesh/v1/fabric/session/ws"},
{base: "ws://node.example", want: "ws://node.example/mesh/v1/fabric/session/ws"},
}
for _, tc := range cases {
client := NewClient(tc.base)
got, err := client.fabricSessionWebSocketURL()
if err != nil {
t.Fatalf("fabricSessionWebSocketURL(%q): %v", tc.base, err)
}
if got != tc.want {
t.Fatalf("fabricSessionWebSocketURL(%q) = %q, want %q", tc.base, got, tc.want)
}
}
}
@@ -0,0 +1,94 @@
package mesh
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// fabricControlForwardSequence is the process-wide counter that
// SendFabricControlForward increments to number each request frame.
var fabricControlForwardSequence atomic.Uint64
// FabricControlForwardResult is the outcome of one SendFabricControlForward
// call.
type FabricControlForwardResult struct {
// Payload is a copy of the reply frame's payload.
Payload json.RawMessage `json:"payload,omitempty"`
// LatencyMs is the time in milliseconds between the request frame having been
// sent and the matching reply being received.
LatencyMs int64 `json:"latency_ms"`
// Endpoint is the Address of the registry endpoint the request was sent to.
Endpoint string `json:"endpoint,omitempty"`
}
// FabricTransportTargetFromRegistryEndpoint builds a transport target from a
// registry endpoint. String fields are whitespace-trimmed, the endpoint id is
// used for both EndpointID and PeerID, and the target gets a five second
// timeout with inbound and error buffers of four.
func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint) FabricTransportTarget {
	endpointID := strings.TrimSpace(endpoint.EndpointID)

	var target FabricTransportTarget
	target.EndpointID = endpointID
	// NOTE(review): PeerID mirrors the endpoint id here — confirm the registry
	// endpoint carries no separate peer identifier that should be used instead.
	target.PeerID = endpointID
	target.Endpoint = strings.TrimSpace(endpoint.Address)
	target.Transport = strings.TrimSpace(endpoint.Transport)
	target.PeerCertSHA256 = strings.TrimSpace(endpoint.PeerCertSHA256)
	target.Timeout = 5 * time.Second
	target.InboundBuffer = 4
	target.ErrorBuffer = 4
	return target
}
// SendFabricControlForward sends payload as one reliable data frame on the
// fabric control-forward stream of endpoint and waits for the reply.
//
// transport must be non-nil and payload non-empty. A timeout <= 0 selects
// five seconds; the timeout is set on the transport target and also bounds
// the wait for the reply. The reply is the first data frame on the
// control-forward stream whose sequence equals the request's; other frames
// are skipped. The session is closed before returning.
//
// Connect and send failures are wrapped with context. An error read from the
// session, a closed session channel, and expiry or cancellation of the wait
// are returned as errors too.
func SendFabricControlForward(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, payload []byte, timeout time.Duration) (FabricControlForwardResult, error) {
	if transport == nil {
		return FabricControlForwardResult{}, fmt.Errorf("fabric control transport is unavailable")
	}
	if len(payload) == 0 {
		return FabricControlForwardResult{}, fmt.Errorf("fabric control payload is empty")
	}
	if timeout <= 0 {
		timeout = 5 * time.Second
	}

	target := FabricTransportTargetFromRegistryEndpoint(endpoint)
	target.Timeout = timeout
	session, err := transport.Connect(ctx, target)
	if err != nil {
		return FabricControlForwardResult{}, fmt.Errorf("fabric control connect: %w", err)
	}
	defer session.Close()

	sequence := fabricControlForwardSequence.Add(1)
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     FabricControlForwardQUICStreamID,
		Sequence:     sequence,
		// Copy so the caller may reuse its buffer once this call returns.
		Payload: append([]byte(nil), payload...),
	}); err != nil {
		return FabricControlForwardResult{}, fmt.Errorf("fabric control send: %w", err)
	}

	// timeout is always positive at this point, so the wait is always bounded.
	waitCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Latency is measured from the moment the request frame has been handed
	// to the session, so connect and send time are not included.
	startedAt := time.Now()
	for {
		select {
		case <-waitCtx.Done():
			return FabricControlForwardResult{}, waitCtx.Err()
		case err, ok := <-session.Errors():
			if !ok {
				return FabricControlForwardResult{}, fmt.Errorf("fabric control session closed")
			}
			if err != nil {
				return FabricControlForwardResult{}, err
			}
		case frame, ok := <-session.Frames():
			if !ok {
				return FabricControlForwardResult{}, fmt.Errorf("fabric control session closed")
			}
			if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID || frame.Sequence != sequence {
				// Not the reply to this request: keep waiting.
				continue
			}
			return FabricControlForwardResult{
				Payload:   append(json.RawMessage(nil), frame.Payload...),
				LatencyMs: time.Since(startedAt).Milliseconds(),
				Endpoint:  endpoint.Address,
			}, nil
		}
	}
}
@@ -565,6 +565,43 @@ func TestQUICFabricServerHandlesWebIngressForwardFrames(t *testing.T) {
} }
} }
// TestSendFabricControlForwardUsesQUICStream starts a QUIC fabric server with
// a control handler that answers one specific request payload, sends that
// payload through SendFabricControlForward, and checks the decoded reply.
func TestSendFabricControlForwardUsesQUICStream(t *testing.T) {
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
// The handler rejects any payload other than the one sent below.
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
if string(payload) != `{"method":"GET","path":"/auth/login"}` {
return nil, ErrForwardRuntimeUnavailable
}
return []byte(`{"status_code":200,"body":{"ok":true}}`), nil
},
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
result, err := SendFabricControlForward(ctx, NewQUICFabricTransport(nil), FabricRegistryEndpoint{
EndpointID: "control-a",
Address: "quic://" + server.Addr().String(),
Transport: "direct_quic",
PeerCertSHA256: testQUICCertSHA256(t, tlsConfig),
}, []byte(`{"method":"GET","path":"/auth/login"}`), time.Second)
if err != nil {
t.Fatalf("send fabric control forward: %v", err)
}
// The reply payload is an envelope whose Payload field holds the handler's output.
var response quicFabricControlForwardResponse
if err := json.Unmarshal(result.Payload, &response); err != nil {
t.Fatalf("decode response: %v", err)
}
if response.Error != "" || string(response.Payload) != `{"status_code":200,"body":{"ok":true}}` {
t.Fatalf("response = %+v", response)
}
}
func startQUICFabricEchoServer(t *testing.T) *quic.Listener { func startQUICFabricEchoServer(t *testing.T) *quic.Listener {
t.Helper() t.Helper()
return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t)) return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t))
@@ -164,6 +164,7 @@ func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata Fabri
case FabricRouteRelay: case FabricRouteRelay:
relayNodeID := firstNonEmpty(strings.TrimSpace(metadata.RelayNodeID), strings.TrimSpace(metadata.ViaNodeID)) relayNodeID := firstNonEmpty(strings.TrimSpace(metadata.RelayNodeID), strings.TrimSpace(metadata.ViaNodeID))
relayEndpoint := firstNonEmpty(strings.TrimRight(strings.TrimSpace(metadata.RelayEndpoint), "/"), endpoint) relayEndpoint := firstNonEmpty(strings.TrimRight(strings.TrimSpace(metadata.RelayEndpoint), "/"), endpoint)
relayPeerCertSHA256 := candidatePeerCertSHA256(candidate)
hops := []FabricRouteHop{} hops := []FabricRouteHop{}
if localNodeID != "" { if localNodeID != "" {
hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: FabricRouteDirect}) hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: FabricRouteDirect})
@@ -173,7 +174,7 @@ func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata Fabri
return hops return hops
} }
hops = append(hops, hops = append(hops,
FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint}, FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint, PeerCertSHA256: relayPeerCertSHA256},
FabricRouteHop{NodeID: targetNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)}, FabricRouteHop{NodeID: targetNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)},
) )
return hops return hops
@@ -44,7 +44,13 @@ func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
} }
func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T) { func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T) {
metadata, _ := json.Marshal(FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"}) metadata, _ := json.Marshal(struct {
FabricCandidateMetadata
TLSCertSHA256 string `json:"tls_cert_sha256,omitempty"`
}{
FabricCandidateMetadata: FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"},
TLSCertSHA256: "relay-cert",
})
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{ routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
EndpointID: "node-b-relay", EndpointID: "node-b-relay",
NodeID: "node-b", NodeID: "node-b",
@@ -69,6 +75,9 @@ func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T
if got := routeSet.Primary.Hops[1].NodeID; got != "node-r" { if got := routeSet.Primary.Hops[1].NodeID; got != "node-r" {
t.Fatalf("relay hop = %q, want node-r", got) t.Fatalf("relay hop = %q, want node-r", got)
} }
if got := routeSet.Primary.Hops[1].PeerCertSHA256; got != "relay-cert" {
t.Fatalf("relay hop peer cert = %q, want relay-cert", got)
}
if routeSet.Primary.Capacity != 50 { if routeSet.Primary.Capacity != 50 {
t.Fatalf("capacity = %d, want 50", routeSet.Primary.Capacity) t.Fatalf("capacity = %d, want 50", routeSet.Primary.Capacity)
} }
@@ -1,156 +0,0 @@
package mesh
import (
"context"
"fmt"
"strings"
"sync"
)
// FabricSessionPeerManager caches one FabricSessionPump per peer key and
// counts how the cache is used.
type FabricSessionPeerManager struct {
// mu guards sessions and stats.
mu sync.Mutex
// sessions maps the key built by fabricSessionPeerKey to the pump for that peer.
sessions map[string]*FabricSessionPump
stats FabricSessionPeerManagerStats
}
// FabricSessionPeerTarget identifies a peer and says how to reach it.
type FabricSessionPeerTarget struct {
// PeerID and BaseURL together form the cache key; both are required.
PeerID string
BaseURL string
// Options is used when a new session has to be dialed.
Options FabricSessionDialOptions
// Pump sizes the channels of a newly started pump.
Pump FabricSessionPumpOptions
}
// FabricSessionPeerManagerStats holds the counters kept by
// FabricSessionPeerManager.
type FabricSessionPeerManagerStats struct {
// Opens counts newly dialed pumps that were stored in the cache.
Opens uint64 `json:"opens"`
// Reuses counts Get calls answered with an existing live pump.
Reuses uint64 `json:"reuses"`
// ClosedEvicted counts closed pumps removed from the cache during Get.
ClosedEvicted uint64 `json:"closed_evicted"`
// ClosePeerCalls counts ClosePeer calls that reached the cache.
ClosePeerCalls uint64 `json:"close_peer_calls"`
// CloseAllCalls counts Close calls on the manager.
CloseAllCalls uint64 `json:"close_all_calls"`
}
// FabricSessionPeerManagerSnapshot is the view of the manager returned by
// Snapshot.
type FabricSessionPeerManagerSnapshot struct {
// SchemaVersion is set to "rap.fabric_session_peer_manager.v1" by Snapshot.
SchemaVersion string `json:"schema_version"`
// ActiveCount is the number of cached pumps that are not closed.
ActiveCount int `json:"active_count"`
// ClosedCount is the number of cached entries that are nil or closed.
ClosedCount int `json:"closed_count"`
Stats FabricSessionPeerManagerStats `json:"stats"`
}
// NewFabricSessionPeerManager returns a manager with an empty session cache.
func NewFabricSessionPeerManager() *FabricSessionPeerManager {
	manager := new(FabricSessionPeerManager)
	manager.sessions = make(map[string]*FabricSessionPump)
	return manager
}
// Get returns the cached live pump for target, or dials a new session, starts
// a pump on it and caches that. It fails on a nil manager, on a target without
// peer id or base url, and when the dial fails.
//
// The lock is released while dialing, so two callers may dial the same peer at
// once; the one that finishes second closes its own pump and returns the
// cached one.
func (m *FabricSessionPeerManager) Get(ctx context.Context, target FabricSessionPeerTarget) (*FabricSessionPump, error) {
if m == nil {
return nil, fmt.Errorf("fabric session peer manager is nil")
}
key, err := fabricSessionPeerKey(target)
if err != nil {
return nil, err
}
// First check: reuse a live pump, or evict a closed one before dialing.
m.mu.Lock()
if pump := m.sessions[key]; pump != nil {
if pump.Closed() {
delete(m.sessions, key)
m.stats.ClosedEvicted++
} else {
m.stats.Reuses++
m.mu.Unlock()
return pump, nil
}
}
m.mu.Unlock()
session, _, err := NewClient(target.BaseURL).OpenFabricSession(ctx, target.Options)
if err != nil {
return nil, err
}
// The pump runs under context.Background(), not ctx, so it is not cancelled
// when the caller's context ends.
pump := session.StartPump(context.Background(), target.Pump)
// Second check: another caller may have stored a pump while this one dialed.
m.mu.Lock()
if existing := m.sessions[key]; existing != nil {
if existing.Closed() {
delete(m.sessions, key)
m.stats.ClosedEvicted++
} else {
m.stats.Reuses++
m.mu.Unlock()
// Discard the pump that lost the race; its close error is irrelevant to the caller.
_ = pump.Close()
return existing, nil
}
}
if m.sessions == nil {
m.sessions = map[string]*FabricSessionPump{}
}
m.sessions[key] = pump
m.stats.Opens++
m.mu.Unlock()
return pump, nil
}
// ClosePeer removes the pump cached for target and closes it. It returns nil
// on a nil manager or when nothing is cached for the target, and an error
// when the target lacks a peer id or base url.
func (m *FabricSessionPeerManager) ClosePeer(target FabricSessionPeerTarget) error {
	if m == nil {
		return nil
	}
	key, keyErr := fabricSessionPeerKey(target)
	if keyErr != nil {
		return keyErr
	}

	m.mu.Lock()
	m.stats.ClosePeerCalls++
	pump, cached := m.sessions[key]
	delete(m.sessions, key)
	m.mu.Unlock()

	// Close outside the lock so a slow close never blocks other callers.
	if !cached || pump == nil {
		return nil
	}
	return pump.Close()
}
// Close empties the cache and closes every pump that was in it, returning the
// first close error encountered. The manager stays usable afterwards. A nil
// manager yields nil.
func (m *FabricSessionPeerManager) Close() error {
	if m == nil {
		return nil
	}

	// Detach the whole map under the lock, then close the pumps without it.
	m.mu.Lock()
	m.stats.CloseAllCalls++
	detached := m.sessions
	m.sessions = map[string]*FabricSessionPump{}
	m.mu.Unlock()

	var first error
	for key := range detached {
		closeErr := detached[key].Close()
		if first == nil && closeErr != nil {
			first = closeErr
		}
	}
	return first
}
// Snapshot reports the manager's counters and how many tracked pumps are live
// versus closed (nil table entries count as closed).
//
// A nil manager yields a snapshot carrying only the schema version.
func (m *FabricSessionPeerManager) Snapshot() FabricSessionPeerManagerSnapshot {
	var snapshot FabricSessionPeerManagerSnapshot
	snapshot.SchemaVersion = "rap.fabric_session_peer_manager.v1"
	if m == nil {
		return snapshot
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	snapshot.Stats = m.stats
	for _, pump := range m.sessions {
		if pump != nil && !pump.Closed() {
			snapshot.ActiveCount++
		} else {
			snapshot.ClosedCount++
		}
	}
	return snapshot
}
// fabricSessionPeerKey builds the session-table key for target: the trimmed
// peer ID and the trimmed base URL (trailing slashes removed) joined by a NUL
// byte. It fails when either part is empty after trimming.
func fabricSessionPeerKey(target FabricSessionPeerTarget) (string, error) {
	id := strings.TrimSpace(target.PeerID)
	endpoint := strings.TrimRight(strings.TrimSpace(target.BaseURL), "/")
	if len(id) == 0 || len(endpoint) == 0 {
		return "", fmt.Errorf("fabric session peer id and base url are required")
	}
	return strings.Join([]string{id, endpoint}, "\x00"), nil
}
@@ -1,194 +0,0 @@
package mesh
import (
"context"
"net/http/httptest"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestFabricSessionPeerManagerReusesPeerPump checks that two Get calls for the
// same target return the same pump, that only one websocket session is opened,
// that the snapshot reports one open and one reuse, and that the shared pump
// still answers a ping with a matching pong.
func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
// NOTE(review): opened is written by the logger callback and read by the test
// body without synchronization; if the server invokes the logger on another
// goroutine this may be reported by the race detector — confirm.
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
manager := NewFabricSessionPeerManager()
defer manager.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricSessionPeerTarget{
PeerID: "node-a",
BaseURL: server.URL,
Options: FabricSessionDialOptions{
Token: "rap_fsn_manager",
Timeout: time.Second,
},
Pump: FabricSessionPumpOptions{
OutboundBuffer: 4,
InboundBuffer: 4,
},
}
first, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("first get: %v", err)
}
second, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("second get: %v", err)
}
// Pointer equality: the second Get must hand back the very same pump.
if first != second {
t.Fatal("manager did not reuse peer pump")
}
if opened != 1 {
t.Fatalf("opened sessions = %d, want 1", opened)
}
snapshot := manager.Snapshot()
if snapshot.SchemaVersion != "rap.fabric_session_peer_manager.v1" ||
snapshot.ActiveCount != 1 ||
snapshot.ClosedCount != 0 ||
snapshot.Stats.Opens != 1 ||
snapshot.Stats.Reuses != 1 {
t.Fatalf("snapshot = %+v", snapshot)
}
// The reused pump must still carry traffic: send a ping and expect the pong
// to echo the sequence number and payload.
if err := first.Send(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 1,
Payload: []byte("manager"),
}); err != nil {
t.Fatalf("send ping: %v", err)
}
select {
case frame := <-first.Frames():
if frame.Type != fabricproto.FramePong || frame.Sequence != 1 || string(frame.Payload) != "manager" {
t.Fatalf("frame = %+v", frame)
}
case err := <-first.Errors():
t.Fatalf("pump error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
}
// TestFabricSessionPeerManagerClosePeerReopens checks that after ClosePeer the
// next Get for the same target opens a second session and returns a different
// pump, and that the snapshot counts one ClosePeer call and two opens.
func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
// NOTE(review): opened is written by the logger callback and read by the test
// body without synchronization — confirm this is race-free.
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
manager := NewFabricSessionPeerManager()
defer manager.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricSessionPeerTarget{
PeerID: "node-a",
BaseURL: server.URL,
Options: FabricSessionDialOptions{
Token: "rap_fsn_manager_reopen",
Timeout: time.Second,
},
}
first, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("first get: %v", err)
}
if err := manager.ClosePeer(target); err != nil {
t.Fatalf("close peer: %v", err)
}
second, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("second get: %v", err)
}
if first == second {
t.Fatal("manager reused pump after close peer")
}
if opened != 2 {
t.Fatalf("opened sessions = %d, want 2", opened)
}
if snapshot := manager.Snapshot(); snapshot.Stats.ClosePeerCalls != 1 || snapshot.Stats.Opens != 2 {
t.Fatalf("snapshot = %+v", snapshot)
}
}
// TestFabricSessionPeerManagerReopensClosedPump checks that a pump closed
// directly by its holder (not through the manager) is evicted on the next Get,
// which opens a replacement session and records one ClosedEvicted in the stats.
func TestFabricSessionPeerManagerReopensClosedPump(t *testing.T) {
// NOTE(review): opened is written by the logger callback and read by the test
// body without synchronization — confirm this is race-free.
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
manager := NewFabricSessionPeerManager()
defer manager.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricSessionPeerTarget{
PeerID: "node-a",
BaseURL: server.URL,
Options: FabricSessionDialOptions{
Token: "rap_fsn_manager_closed",
Timeout: time.Second,
},
}
first, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("first get: %v", err)
}
// Close the pump behind the manager's back; the manager still tracks it.
if err := first.Close(); err != nil {
t.Fatalf("close first pump: %v", err)
}
if !first.Closed() {
t.Fatal("first pump should report closed")
}
second, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("second get: %v", err)
}
if first == second {
t.Fatal("manager reused closed pump")
}
if opened != 2 {
t.Fatalf("opened sessions = %d, want 2", opened)
}
snapshot := manager.Snapshot()
if snapshot.ActiveCount != 1 ||
snapshot.Stats.Opens != 2 ||
snapshot.Stats.ClosedEvicted != 1 {
t.Fatalf("snapshot = %+v", snapshot)
}
}
// TestFabricSessionPeerManagerRejectsIncompleteTarget checks that Get fails for
// a target that has a peer ID but no base URL.
func TestFabricSessionPeerManagerRejectsIncompleteTarget(t *testing.T) {
manager := NewFabricSessionPeerManager()
_, err := manager.Get(context.Background(), FabricSessionPeerTarget{PeerID: "node-a"})
if err == nil {
t.Fatal("incomplete target unexpectedly succeeded")
}
}
@@ -308,7 +308,7 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
Transport: intent.Transport, Transport: intent.Transport,
PeerCertSHA256: intent.BestPeerCertSHA256, PeerCertSHA256: intent.BestPeerCertSHA256,
}} }}
if intent.DirectCandidate { if intent.DirectCandidate || peerConnectionShouldProbeDirectUpgrade(intent, cacheEntry) {
targets = peerConnectionProbeTargets(intent, cacheEntry) targets = peerConnectionProbeTargets(intent, cacheEntry)
} }
var lastFailure string var lastFailure string
@@ -354,7 +354,9 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
result.SelectedCandidateID = probePeer.BestCandidateID result.SelectedCandidateID = probePeer.BestCandidateID
result.SelectedEndpoint = probePeer.Endpoint result.SelectedEndpoint = probePeer.Endpoint
result.LatencyMs = latency result.LatencyMs = latency
if intent.RelayCandidate { if probeTargetUsesDirectQUIC(probeTarget) {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
} else if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt) result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
} else { } else {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt) result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
@@ -410,6 +412,10 @@ func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer P
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget { func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
seen := map[string]struct{}{} seen := map[string]struct{}{}
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1) out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
fallbackPeerCertSHA256 := firstNonEmpty(
strings.TrimSpace(cacheEntry.BestPeerCertSHA256),
strings.TrimSpace(intent.BestPeerCertSHA256),
)
add := func(candidateID, endpoint, transport, peerCertSHA256 string) { add := func(candidateID, endpoint, transport, peerCertSHA256 string) {
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/") endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
if endpoint == "" { if endpoint == "" {
@@ -423,6 +429,9 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
return return
} }
seen[key] = struct{}{} seen[key] = struct{}{}
if strings.TrimSpace(peerCertSHA256) == "" {
peerCertSHA256 = fallbackPeerCertSHA256
}
out = append(out, peerConnectionProbeTarget{ out = append(out, peerConnectionProbeTarget{
CandidateID: strings.TrimSpace(candidateID), CandidateID: strings.TrimSpace(candidateID),
Endpoint: endpoint, Endpoint: endpoint,
@@ -440,6 +449,31 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
return out return out
} }
// peerConnectionShouldProbeDirectUpgrade reports whether the peer described by
// intent should be probed over its cached endpoint candidates.
//
// A direct candidate always qualifies. Otherwise the intent must be on a relay
// path (relay-ready state, relay candidate, or relay-control transport mode)
// and the cache entry must hold at least one candidate accepted by
// candidateUsableForDirectProbe.
func peerConnectionShouldProbeDirectUpgrade(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) bool {
	if intent.DirectCandidate {
		return true
	}

	onRelayPath := strings.TrimSpace(intent.ConnectionState) == PeerConnectionRelayReady ||
		intent.RelayCandidate ||
		strings.TrimSpace(intent.TransportMode) == PeerTransportModeRelayControl
	if !onRelayPath {
		return false
	}

	candidates := cacheEntry.EndpointCandidates
	for i := 0; i < len(candidates); i++ {
		if candidateUsableForDirectProbe(candidates[i]) {
			return true
		}
	}
	return false
}
// probeTargetUsesDirectQUIC reports whether target is a QUIC target whose
// transport name does not mention "relay", "reverse", or "outbound"
// (case-insensitive).
func probeTargetUsesDirectQUIC(target peerConnectionProbeTarget) bool {
	name := strings.ToLower(strings.TrimSpace(target.Transport))
	for _, indirect := range [...]string{"relay", "reverse", "outbound"} {
		if strings.Contains(name, indirect) {
			return false
		}
	}
	return peerConnectionTargetIsQUIC(target.Transport, target.Endpoint)
}
func peerConnectionTargetIsQUIC(transport string, endpoint string) bool { func peerConnectionTargetIsQUIC(transport string, endpoint string) bool {
return isQUICOnlyCandidateTransport(transport) || strings.HasPrefix(strings.ToLower(strings.TrimSpace(endpoint)), "quic://") return isQUICOnlyCandidateTransport(transport) || strings.HasPrefix(strings.ToLower(strings.TrimSpace(endpoint)), "quic://")
} }
@@ -221,6 +221,125 @@ func TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC(t *testing.T
} }
} }
// TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256 checks that a
// probe target built from a candidate with no certificate pin of its own
// receives the cache entry's BestPeerCertSHA256 ("cache-cert") rather than the
// intent's ("intent-cert").
func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
intent := PeerConnectionIntent{
NodeID: "node-b",
BestPeerCertSHA256: "intent-cert",
}
// The cache entry's best endpoint and its only candidate share one address,
// so exactly one probe target is expected.
cacheEntry := PeerCacheEntry{
NodeID: "node-b",
BestPeerCertSHA256: "cache-cert",
BestCandidateID: "node-b-best",
BestTransport: "direct_quic",
Endpoint: "quic://94.141.118.222:19199",
EndpointCandidates: []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://94.141.118.222:19199",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 1,
},
},
}
targets := peerConnectionProbeTargets(intent, cacheEntry)
if len(targets) != 1 {
t.Fatalf("target count = %d, want 1", len(targets))
}
// NOTE(review): if the single target ever carried a different endpoint string,
// the continue below would skip the certificate assertion and the test would
// pass without checking anything — consider asserting the endpoint instead.
for _, target := range targets {
if target.Endpoint != "quic://94.141.118.222:19199" {
continue
}
if target.PeerCertSHA256 != "cache-cert" {
t.Fatalf("peer cert = %q, want cache-cert", target.PeerCertSHA256)
}
}
}
// TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC runs one
// probe cycle for a peer that has both a relay rendezvous lease and a direct
// QUIC endpoint candidate backed by a real local QUIC fabric server. It expects
// the cycle to select the direct candidate, report the connection as ready, and
// leave the tracker with one ready peer and no relay-ready peers.
func TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
current := now
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
// Relay lease for node-b via node-r; its relay endpoint points at port 1 on
// loopback, while the direct candidate below points at the live server.
leases := []PeerRendezvousLease{{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "quic://127.0.0.1:1",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
}}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "public",
ConnectivityMode: "direct",
Priority: 1,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
},
RendezvousLeases: leases,
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
RendezvousLeases: leases,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
// Fake clock: every call advances time by 10ms so timestamps are deterministic.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 || cycle.Succeeded != 1 || len(cycle.Results) != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.SelectedCandidateID != "node-b-direct" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
t.Fatalf("relay-ready peer did not upgrade to direct candidate: %+v", result)
}
if result.ConnectionState.State != PeerConnectionReady {
t.Fatalf("connection state = %q, want ready", result.ConnectionState.State)
}
if len(result.CandidateResults) == 0 || result.CandidateResults[0].Transport != "direct_quic" || result.CandidateResults[0].LinkStatus != PeerConnectionProbeReachable {
t.Fatalf("candidate trail missing direct probe success: %+v", result.CandidateResults)
}
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || snapshot.RelayReady != 0 {
t.Fatalf("unexpected tracker snapshot after direct upgrade: %+v", snapshot)
}
}
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) { func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC) now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
current := now current := now
@@ -102,8 +102,11 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
continue continue
} }
switch connection.State { switch connection.State {
case PeerConnectionReady, PeerConnectionRelayReady: case PeerConnectionReady:
ready++ ready++
case PeerConnectionRelayReady:
// Relay-ready peers remain valuable for control-plane reachability,
// but they do not satisfy the target for direct-ready transport paths.
case PeerConnectionDegraded: case PeerConnectionDegraded:
degraded++ degraded++
case PeerConnectionBackoff: case PeerConnectionBackoff:
@@ -69,7 +69,7 @@ func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) {
} }
} }
func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) { func TestPeerRecoveryPlanTreatsRelayReadyPeersAsRecoveryGap(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{ plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{ PeerCache: PeerCacheSnapshot{
@@ -92,12 +92,15 @@ func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
Now: now, Now: now,
}) })
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy { if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy {
t.Fatalf("unexpected steady plan: %+v", plan) t.Fatalf("unexpected relay-ready recovery plan: %+v", plan)
} }
if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") { if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") {
t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates) t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates)
} }
if plan.ReadyPeerCount != 0 || plan.Deficit != 1 {
t.Fatalf("relay-ready peer should not satisfy direct-ready target: %+v", plan)
}
} }
func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) { func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
@@ -0,0 +1,713 @@
package mesh
import (
"bytes"
"context"
"crypto/ed25519"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"sort"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// Well-known string values used by fabric registry gossip records.
const (
// FabricRegistryGossipRecordSchema is the only schema_version accepted by
// record validation.
FabricRegistryGossipRecordSchema = "rap.fabric.registry.gossip_record.v1"
// Record scope names.
FabricRegistryScopeFarm = "farm"
FabricRegistryScopeCluster = "cluster"
FabricRegistryScopeOrganization = "organization"
// Service names.
FabricRegistryServiceControlAPI = "control-api"
FabricRegistryServiceUpdateStore = "update-store"
FabricRegistryServiceUpdateCache = "update-cache"
FabricRegistryServiceWebAdmin = "web-admin"
FabricRegistryServiceVPNExitPool = "vpn-egress-pool"
// Authority names; presumably values for issuer roles — TODO confirm against
// the issuer policy code.
FabricRegistryAuthorityControl = "control-authority"
FabricRegistryAuthorityUpdate = "update-authority"
FabricRegistryAuthorityStorage = "storage-authority"
FabricRegistryAuthorityRoute = "route-authority"
)
// FabricRegistryEndpoint is one endpoint advertised by a registry gossip
// record. Validation requires EndpointID, Address, and Transport to be
// non-blank, the transport to be QUIC-only, and Metadata (when present) to be
// valid JSON.
type FabricRegistryEndpoint struct {
EndpointID string `json:"endpoint_id"`
Address string `json:"address"`
Transport string `json:"transport"`
Reachability string `json:"reachability,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
Region string `json:"region,omitempty"`
Priority int `json:"priority,omitempty"`
Weight int `json:"weight,omitempty"`
PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// FabricRegistrySignature is one signature attached to a gossip record.
// Verification only considers signatures whose Alg is "ed25519"
// (case-insensitive); Value is the hex-encoded signature bytes.
type FabricRegistrySignature struct {
KeyID string `json:"key_id"`
IssuerID string `json:"issuer_id"`
Role string `json:"role"`
Alg string `json:"alg"`
Value string `json:"value"`
}
// FabricRegistryGossipRecord is a signed advertisement of the endpoints that
// serve one service within a scope of a cluster. Validation requires a positive
// Epoch, a non-empty validity window (ExpiresAt after IssuedAt), and at least
// one endpoint.
type FabricRegistryGossipRecord struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
Service string `json:"service"`
Scope string `json:"scope"`
OrganizationID string `json:"organization_id,omitempty"`
Epoch int64 `json:"epoch"`
Generation string `json:"generation,omitempty"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
IssuerNodeID string `json:"issuer_node_id"`
IssuerRole string `json:"issuer_role"`
Endpoints []FabricRegistryEndpoint `json:"endpoints"`
Metadata json.RawMessage `json:"metadata,omitempty"`
Signatures []FabricRegistrySignature `json:"signatures,omitempty"`
}
// FabricRegistryTrustedIssuer is a signer the verifier is willing to accept.
// PublicKey must be a full-size ed25519 public key for its signatures to count.
// Scopes and Services are presumably allow-lists evaluated by
// fabricRegistryIssuerAllowed — TODO confirm.
type FabricRegistryTrustedIssuer struct {
IssuerID string
Role string
PublicKey ed25519.PublicKey
Scopes []string
Services []string
}
// FabricRegistryVerificationPolicy controls how gossip records are verified.
type FabricRegistryVerificationPolicy struct {
// LocalClusterID, when non-blank, must equal the record's cluster ID.
LocalClusterID string
TrustedIssuers []FabricRegistryTrustedIssuer
// RequiredSignatures is the number of distinct accepted issuers needed;
// values <= 0 are treated as 1.
RequiredSignatures int
// MaxClockSkew bounds how far in the future IssuedAt may lie; values <= 0
// default to one minute.
MaxClockSkew time.Duration
// Now is the verification time, passed through registryNow.
Now time.Time
}
// FabricRegistryVerificationResult summarizes a verification: how many distinct
// issuers were accepted, their sorted IDs, and the hex SHA-256 of the record's
// canonical payload.
type FabricRegistryVerificationResult struct {
AcceptedSignatureCount int `json:"accepted_signature_count"`
AcceptedIssuers []string `json:"accepted_issuers,omitempty"`
RecordHash string `json:"record_hash"`
}
// FabricRegistryEntryState is the lifecycle state of a registry entry.
type FabricRegistryEntryState string
// Entry states. Verified records start as candidates unless they are applied
// as live-verified, in which case they are stored as active.
const (
FabricRegistryCandidate FabricRegistryEntryState = "candidate"
FabricRegistryActive FabricRegistryEntryState = "active"
FabricRegistryExpired FabricRegistryEntryState = "expired"
FabricRegistryRejected FabricRegistryEntryState = "rejected"
)
// FabricRegistryEntry is a verified gossip record as stored in the registry,
// together with its state, the time it was accepted, the time it was promoted
// to active (nil while it is only a candidate), and the verification result.
type FabricRegistryEntry struct {
Record FabricRegistryGossipRecord `json:"record"`
State FabricRegistryEntryState `json:"state"`
AcceptedAt time.Time `json:"accepted_at"`
PromotedAt *time.Time `json:"promoted_at,omitempty"`
VerifyResult FabricRegistryVerificationResult `json:"verify_result"`
}
// FabricRegistryBootstrapReport summarizes a bootstrap load: Total is the
// number of decoded records, Active and Candidate count records that changed
// the registry, Rejected counts verification failures (messages in Rejects),
// and RecordKeys lists the keys of the records that changed the registry.
type FabricRegistryBootstrapReport struct {
Total int `json:"total"`
Active int `json:"active"`
Candidate int `json:"candidate"`
Rejected int `json:"rejected"`
Rejects []string `json:"rejects,omitempty"`
RecordKeys []string `json:"record_keys,omitempty"`
}
// FabricRegistryResolveRequest is the input to FabricRegistry.ResolveService.
// Service is required; OrganizationID is only applied when the organization
// scope is being looked up.
type FabricRegistryResolveRequest struct {
ClusterID string
Service string
Scope string
OrganizationID string
PreferredRegion string
Now time.Time
}
// FabricRegistryResolvedService is the outcome of ResolveService. When Found is
// false, Reason explains why ("service_required", "no_usable_endpoints", or
// "no_active_record").
type FabricRegistryResolvedService struct {
Found bool `json:"found"`
Service string `json:"service"`
Scope string `json:"scope,omitempty"`
OrganizationID string `json:"organization_id,omitempty"`
RecordEpoch int64 `json:"record_epoch,omitempty"`
RecordHash string `json:"record_hash,omitempty"`
Endpoints []FabricRegistryEndpoint `json:"endpoints,omitempty"`
Reason string `json:"reason,omitempty"`
}
// FabricRegistryLiveProbeRequest configures FabricRegistry.VerifyCandidates.
type FabricRegistryLiveProbeRequest struct {
// ClusterID, when non-blank, restricts probing to candidates of that cluster.
ClusterID string
PreferredRegion string
// Timeout is the per-endpoint probe timeout; values <= 0 default to 2s.
Timeout time.Duration
Now time.Time
// MaxCandidates caps how many candidates are probed; values <= 0 default to 16.
MaxCandidates int
}
// FabricRegistryLiveProbeResult reports the probe outcome for one candidate
// record. Status is "reachable" or "unreachable"; EndpointID and Address refer
// to the last endpoint that was tried; Promoted tells whether the candidate was
// promoted to active.
type FabricRegistryLiveProbeResult struct {
Service string `json:"service"`
Scope string `json:"scope"`
OrganizationID string `json:"organization_id,omitempty"`
EndpointID string `json:"endpoint_id,omitempty"`
Address string `json:"address,omitempty"`
Status string `json:"status"`
LatencyMs int64 `json:"latency_ms,omitempty"`
Promoted bool `json:"promoted"`
Error string `json:"error,omitempty"`
}
// FabricRegistrySnapshot counts the unexpired active and candidate entries of a
// registry and lists their keys in sorted order.
type FabricRegistrySnapshot struct {
Active int `json:"active"`
Candidate int `json:"candidate"`
ActiveKeys []string `json:"active_keys,omitempty"`
CandidateKeys []string `json:"candidate_keys,omitempty"`
}
// FabricRegistry stores verified gossip records keyed by registry key: entries
// holds promoted (active) records and candidates holds records awaiting live
// verification.
// NOTE(review): the struct carries no lock, so concurrent use appears to need
// external synchronization — confirm with callers.
type FabricRegistry struct {
entries map[string]FabricRegistryEntry
candidates map[string]FabricRegistryEntry
}
// NewFabricRegistry returns an empty registry with both the active and the
// candidate tables allocated.
func NewFabricRegistry() *FabricRegistry {
	registry := new(FabricRegistry)
	registry.entries = make(map[string]FabricRegistryEntry)
	registry.candidates = make(map[string]FabricRegistryEntry)
	return registry
}
// LoadFabricRegistryBootstrapRecords builds a registry from a JSON array of
// gossip records.
//
// Blank input yields an empty registry and an empty report. Input that is not a
// valid JSON array of records yields a nil registry and an error. Each decoded
// record is applied with ApplyGossipRecord: rejected records are tallied in the
// report with their error text, records that leave the registry unchanged are
// skipped, and the rest are tallied by resulting state.
func LoadFabricRegistryBootstrapRecords(recordsJSON string, policy FabricRegistryVerificationPolicy, liveVerified bool) (*FabricRegistry, FabricRegistryBootstrapReport, error) {
	registry := NewFabricRegistry()
	trimmed := strings.TrimSpace(recordsJSON)
	if len(trimmed) == 0 {
		return registry, FabricRegistryBootstrapReport{}, nil
	}

	var records []FabricRegistryGossipRecord
	if decodeErr := json.Unmarshal([]byte(trimmed), &records); decodeErr != nil {
		return nil, FabricRegistryBootstrapReport{}, fmt.Errorf("decode fabric registry bootstrap records: %w", decodeErr)
	}

	var report FabricRegistryBootstrapReport
	report.Total = len(records)
	for i := range records {
		entry, changed, applyErr := registry.ApplyGossipRecord(records[i], policy, liveVerified)
		if applyErr != nil {
			report.Rejected++
			report.Rejects = append(report.Rejects, applyErr.Error())
			continue
		}
		if !changed {
			continue
		}
		report.RecordKeys = append(report.RecordKeys, fabricRegistryRecordKey(records[i]))
		if entry.State == FabricRegistryActive {
			report.Active++
		} else if entry.State == FabricRegistryCandidate {
			report.Candidate++
		}
	}
	return registry, report, nil
}
// ApplyGossipRecord verifies record against policy and stores it in the
// registry.
//
// A verified record is stored as active when liveVerified is true (replacing
// the active entry for its key and dropping any pending candidate for that key)
// and as a candidate otherwise. When an active entry already exists for the key
// and record is not newer than it according to fabricRegistryRecordNewer, the
// existing entry is returned and the registry is left untouched.
//
// It returns the stored (or retained) entry, whether the registry changed, and
// an error when r is nil or verification fails.
func (r *FabricRegistry) ApplyGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy, liveVerified bool) (FabricRegistryEntry, bool, error) {
	if r == nil {
		return FabricRegistryEntry{}, false, fmt.Errorf("fabric registry is nil")
	}
	result, err := VerifyFabricRegistryGossipRecord(record, policy)
	if err != nil {
		return FabricRegistryEntry{}, false, err
	}
	now := registryNow(policy.Now)
	key := fabricRegistryRecordKey(record)
	if current, exists := r.entries[key]; exists && !fabricRegistryRecordNewer(record, current.Record, now) {
		return current, false, nil
	}

	entry := FabricRegistryEntry{
		Record:       normalizeFabricRegistryRecord(record),
		State:        FabricRegistryCandidate,
		AcceptedAt:   now,
		VerifyResult: result,
	}
	if liveVerified {
		promotedAt := now
		entry.State = FabricRegistryActive
		entry.PromotedAt = &promotedAt
		// Allocate lazily, as is done for candidates below, so that a zero-value
		// FabricRegistry does not panic on its first live-verified insert.
		if r.entries == nil {
			r.entries = map[string]FabricRegistryEntry{}
		}
		r.entries[key] = entry
		delete(r.candidates, key)
		return entry, true, nil
	}

	if r.candidates == nil {
		r.candidates = map[string]FabricRegistryEntry{}
	}
	r.candidates[key] = entry
	return entry, true, nil
}
// MarkLiveVerified promotes the pending candidate identified by clusterID,
// service, scope, and organizationID to active, stamping it with now (passed
// through registryNow) as its promotion time.
//
// It returns false when r is nil, no candidate exists for the key, or the
// candidate is marked expired or rejected; otherwise the candidate is moved to
// the active table and true is returned.
func (r *FabricRegistry) MarkLiveVerified(clusterID, service, scope, organizationID string, now time.Time) bool {
	if r == nil {
		return false
	}
	key := fabricRegistryKey(clusterID, service, scope, organizationID)
	entry, ok := r.candidates[key]
	if !ok || entry.State == FabricRegistryExpired || entry.State == FabricRegistryRejected {
		return false
	}
	promotedAt := registryNow(now)
	entry.State = FabricRegistryActive
	entry.PromotedAt = &promotedAt
	// A zero-value registry gets its candidate table allocated lazily by
	// ApplyGossipRecord; allocate the active table the same way so promotion
	// cannot panic on a nil map.
	if r.entries == nil {
		r.entries = map[string]FabricRegistryEntry{}
	}
	r.entries[key] = entry
	delete(r.candidates, key)
	return true
}
// Active returns the active record stored for the given cluster, service,
// scope, and organization, provided it has not expired at now (passed through
// registryNow). The boolean is false when r is nil, no entry exists, the entry
// is not in the active state, or the record has expired.
func (r *FabricRegistry) Active(clusterID, service, scope, organizationID string, now time.Time) (FabricRegistryGossipRecord, bool) {
	var none FabricRegistryGossipRecord
	if r == nil {
		return none, false
	}
	key := fabricRegistryKey(clusterID, service, scope, organizationID)
	entry, found := r.entries[key]
	switch {
	case !found, entry.State != FabricRegistryActive:
		return none, false
	case !entry.Record.ExpiresAt.After(registryNow(now)):
		return none, false
	}
	return entry.Record, true
}
// ResolveService looks up the active record for req.Service and returns its
// usable endpoints.
//
// The service name is trimmed and lower-cased. Scopes are tried in the order
// given by fabricRegistryScopeResolutionOrder; the organization ID is only used
// for the organization scope. The first scope with an active, unexpired record
// decides the outcome: if that record has no endpoints selectable for the
// preferred region the result is not-found with reason "no_usable_endpoints"
// (later scopes are not consulted), otherwise the record's epoch, hash, and
// selected endpoints are returned. With no active record in any scope the
// reason is "no_active_record"; a blank service yields "service_required".
func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) FabricRegistryResolvedService {
	service := strings.ToLower(strings.TrimSpace(req.Service))
	if service == "" {
		return FabricRegistryResolvedService{Found: false, Reason: "service_required"}
	}
	scopeOrder := fabricRegistryScopeResolutionOrder(req.Scope, req.OrganizationID)
	for _, scope := range scopeOrder {
		organizationID := strings.TrimSpace(req.OrganizationID)
		if scope != FabricRegistryScopeOrganization {
			organizationID = ""
		}
		record, ok := r.Active(req.ClusterID, service, scope, organizationID, req.Now)
		if !ok {
			continue
		}
		endpoints := selectFabricRegistryEndpoints(record.Endpoints, req.PreferredRegion)
		if len(endpoints) == 0 {
			return FabricRegistryResolvedService{Found: false, Service: service, Scope: scope, OrganizationID: organizationID, Reason: "no_usable_endpoints"}
		}
		resolved := FabricRegistryResolvedService{
			Found:          true,
			Service:        service,
			Scope:          scope,
			OrganizationID: organizationID,
			RecordEpoch:    record.Epoch,
			Endpoints:      endpoints,
		}
		// Only publish a hash that was computed from a real payload. If the
		// canonical encoding fails, the hash is omitted instead of reporting the
		// digest of an empty payload as if it identified the record.
		if payload, err := canonicalFabricRegistryPayload(record); err == nil {
			sum := sha256.Sum256(payload)
			resolved.RecordHash = hex.EncodeToString(sum[:])
		}
		return resolved
	}
	return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
}
// Snapshot counts the entries that are still valid at now (passed through
// registryNow): active entries from the active table and candidate entries
// from the candidate table. Key lists are returned sorted. A nil registry
// yields the zero snapshot.
func (r *FabricRegistry) Snapshot(now time.Time) FabricRegistrySnapshot {
	var snapshot FabricRegistrySnapshot
	if r == nil {
		return snapshot
	}
	at := registryNow(now)

	// liveKeys collects the keys of table entries in the wanted state whose
	// record has not expired at the snapshot time.
	liveKeys := func(table map[string]FabricRegistryEntry, want FabricRegistryEntryState) []string {
		var keys []string
		for key, entry := range table {
			if entry.State != want || !entry.Record.ExpiresAt.After(at) {
				continue
			}
			keys = append(keys, key)
		}
		sort.Strings(keys)
		return keys
	}

	snapshot.ActiveKeys = liveKeys(r.entries, FabricRegistryActive)
	snapshot.Active = len(snapshot.ActiveKeys)
	snapshot.CandidateKeys = liveKeys(r.candidates, FabricRegistryCandidate)
	snapshot.Candidate = len(snapshot.CandidateKeys)
	return snapshot
}
// VerifyCandidates probes pending candidate records over transport and promotes
// each candidate whose endpoint answers.
//
// Only unexpired candidates are considered, optionally restricted to
// req.ClusterID. Candidates are ordered by service, scope, descending epoch,
// organization ID, and cluster ID, then capped at req.MaxCandidates (default
// 16). For each candidate the endpoints selected for req.PreferredRegion are
// probed in turn, each with its own timeout (default 2s), stopping at the first
// success; that candidate is then promoted via MarkLiveVerified.
//
// One result is returned per probed candidate. For an unreachable candidate the
// result carries the last endpoint tried and its error. A nil registry returns
// nil.
func (r *FabricRegistry) VerifyCandidates(ctx context.Context, transport FabricTransport, req FabricRegistryLiveProbeRequest) []FabricRegistryLiveProbeResult {
	if r == nil {
		return nil
	}
	now := registryNow(req.Now)
	timeout := req.Timeout
	if timeout <= 0 {
		timeout = 2 * time.Second
	}
	maxCandidates := req.MaxCandidates
	if maxCandidates <= 0 {
		maxCandidates = 16
	}
	wantCluster := strings.TrimSpace(req.ClusterID)
	candidates := make([]FabricRegistryEntry, 0, len(r.candidates))
	for _, entry := range r.candidates {
		if entry.State != FabricRegistryCandidate || !entry.Record.ExpiresAt.After(now) {
			continue
		}
		if wantCluster != "" && entry.Record.ClusterID != wantCluster {
			continue
		}
		candidates = append(candidates, entry)
	}
	// The candidates come out of a map in random order, so the comparison must
	// break every tie; otherwise both the probe order and the set surviving the
	// maxCandidates cut would vary from run to run.
	sort.SliceStable(candidates, func(i, j int) bool {
		a, b := candidates[i].Record, candidates[j].Record
		if a.Service != b.Service {
			return a.Service < b.Service
		}
		if a.Scope != b.Scope {
			return a.Scope < b.Scope
		}
		if a.Epoch != b.Epoch {
			return a.Epoch > b.Epoch
		}
		if a.OrganizationID != b.OrganizationID {
			return a.OrganizationID < b.OrganizationID
		}
		return a.ClusterID < b.ClusterID
	})
	if len(candidates) > maxCandidates {
		candidates = candidates[:maxCandidates]
	}
	results := make([]FabricRegistryLiveProbeResult, 0, len(candidates))
	for _, entry := range candidates {
		record := entry.Record
		result := FabricRegistryLiveProbeResult{
			Service:        record.Service,
			Scope:          record.Scope,
			OrganizationID: record.OrganizationID,
			Status:         "unreachable",
		}
		endpoints := selectFabricRegistryEndpoints(record.Endpoints, req.PreferredRegion)
		if len(endpoints) == 0 {
			result.Error = "no_usable_endpoints"
			results = append(results, result)
			continue
		}
		for _, endpoint := range endpoints {
			probeCtx, cancel := context.WithTimeout(ctx, timeout)
			latency, err := probeFabricRegistryEndpoint(probeCtx, transport, endpoint, timeout)
			// Release the timer right away rather than deferring inside the loop.
			cancel()
			result.EndpointID = endpoint.EndpointID
			result.Address = endpoint.Address
			if err != nil {
				result.Error = err.Error()
				continue
			}
			result.Status = "reachable"
			result.LatencyMs = latency.Milliseconds()
			result.Promoted = r.MarkLiveVerified(record.ClusterID, record.Service, record.Scope, record.OrganizationID, now)
			result.Error = ""
			break
		}
		results = append(results, result)
	}
	return results
}
// SignFabricRegistryGossipRecord signs the canonical payload of record with
// privateKey and returns a copy of record with the new ed25519 signature
// appended.
//
// The signature's key ID and issuer ID come from issuer.IssuerID, falling back
// to the record's issuer node ID; its role comes from issuer.Role, falling back
// to the record's issuer role. The caller's Signatures slice is never modified.
//
// An error is returned when privateKey does not have the ed25519 private key
// size or the canonical payload cannot be produced; in both cases the input
// record is returned as received.
func SignFabricRegistryGossipRecord(record FabricRegistryGossipRecord, issuer FabricRegistryTrustedIssuer, privateKey ed25519.PrivateKey) (FabricRegistryGossipRecord, error) {
	// ed25519.Sign panics on a key of the wrong length; report it as an error.
	if len(privateKey) != ed25519.PrivateKeySize {
		return record, fmt.Errorf("fabric registry signing key has invalid length %d", len(privateKey))
	}
	payload, err := canonicalFabricRegistryPayload(record)
	if err != nil {
		return record, err
	}
	signerID := firstNonEmpty(issuer.IssuerID, record.IssuerNodeID)
	signature := FabricRegistrySignature{
		KeyID:    signerID,
		IssuerID: signerID,
		Role:     firstNonEmpty(issuer.Role, record.IssuerRole),
		Alg:      "ed25519",
		Value:    hex.EncodeToString(ed25519.Sign(privateKey, payload)),
	}
	// Append onto a private copy: record was passed by value but its Signatures
	// slice still shares a backing array with the caller's, so appending in
	// place could overwrite a signature added to a sibling copy of the record.
	signatures := make([]FabricRegistrySignature, len(record.Signatures), len(record.Signatures)+1)
	copy(signatures, record.Signatures)
	record.Signatures = append(signatures, signature)
	return record, nil
}
// VerifyFabricRegistryGossipRecord validates record against policy and checks
// that enough distinct trusted issuers have signed its canonical payload.
//
// The record is normalized and validated first. Each ed25519 signature is then
// matched to a trusted issuer by (issuer ID, role), falling back to issuer ID
// alone; the issuer must be allowed for the record per
// fabricRegistryIssuerAllowed, and the signature must verify against the
// issuer's public key. Accepted signatures are counted once per trusted issuer,
// and at least policy.RequiredSignatures (minimum 1) issuers are required.
//
// On success the result lists the accepted issuer IDs in sorted order together
// with the hex SHA-256 of the canonical payload. When the quorum is not met the
// result carries only the record hash alongside the error.
func VerifyFabricRegistryGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy) (FabricRegistryVerificationResult, error) {
	record = normalizeFabricRegistryRecord(record)
	if err := validateFabricRegistryGossipRecord(record, policy); err != nil {
		return FabricRegistryVerificationResult{}, err
	}
	payload, err := canonicalFabricRegistryPayload(record)
	if err != nil {
		return FabricRegistryVerificationResult{}, err
	}
	sum := sha256.Sum256(payload)
	recordHash := hex.EncodeToString(sum[:])

	// Index trusted issuers under trimmed keys, because signatures are looked up
	// by their trimmed issuer ID and role.
	trusted := make(map[string]FabricRegistryTrustedIssuer, 2*len(policy.TrustedIssuers))
	for _, issuer := range policy.TrustedIssuers {
		issuerID := strings.TrimSpace(issuer.IssuerID)
		if issuerID == "" {
			continue
		}
		trusted[issuerID] = issuer
		if role := strings.TrimSpace(issuer.Role); role != "" {
			trusted[issuerID+"\x00"+role] = issuer
		}
	}

	accepted := map[string]struct{}{}
	for _, signature := range record.Signatures {
		if strings.ToLower(strings.TrimSpace(signature.Alg)) != "ed25519" {
			continue
		}
		signerID := strings.TrimSpace(signature.IssuerID)
		issuer, ok := trusted[signerID+"\x00"+strings.TrimSpace(signature.Role)]
		if !ok {
			issuer, ok = trusted[signerID]
		}
		if !ok || !fabricRegistryIssuerAllowed(issuer, record) {
			continue
		}
		rawSig, err := hex.DecodeString(strings.TrimSpace(signature.Value))
		if err != nil || len(rawSig) != ed25519.SignatureSize || len(issuer.PublicKey) != ed25519.PublicKeySize {
			continue
		}
		if ed25519.Verify(issuer.PublicKey, payload, rawSig) {
			// Count by the matched trusted issuer, not by the text the signature
			// claims: the same valid signature repeated with a differently
			// spelled issuer ID (padding, or an ID that embeds the role
			// separator) must not count as an additional signer.
			accepted[strings.TrimSpace(issuer.IssuerID)] = struct{}{}
		}
	}

	required := policy.RequiredSignatures
	if required <= 0 {
		required = 1
	}
	if len(accepted) < required {
		return FabricRegistryVerificationResult{RecordHash: recordHash}, fmt.Errorf("fabric registry gossip record lacks required trusted signatures")
	}
	issuers := make([]string, 0, len(accepted))
	for issuer := range accepted {
		issuers = append(issuers, issuer)
	}
	sort.Strings(issuers)
	return FabricRegistryVerificationResult{
		AcceptedSignatureCount: len(accepted),
		AcceptedIssuers:        issuers,
		RecordHash:             recordHash,
	}, nil
}
// validateFabricRegistryGossipRecord checks the structural and temporal
// validity of record under policy, returning the first violation found.
//
// Checks run in this order: schema version, cluster identity, required
// identity fields, epoch and validity window, current validity relative to
// the policy clock (with a clock-skew allowance on IssuedAt that defaults to
// one minute), endpoint shape and QUIC-only constraints, and finally that any
// metadata present is valid JSON.
func validateFabricRegistryGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy) error {
	blank := func(value string) bool { return strings.TrimSpace(value) == "" }

	if record.SchemaVersion != FabricRegistryGossipRecordSchema {
		return fmt.Errorf("fabric registry gossip record schema_version is invalid")
	}
	if blank(record.ClusterID) {
		return ErrClusterMismatch
	}
	// A blank local cluster ID disables the cluster equality check.
	if !blank(policy.LocalClusterID) && record.ClusterID != policy.LocalClusterID {
		return ErrClusterMismatch
	}
	for _, required := range []string{record.Service, record.Scope, record.IssuerNodeID, record.IssuerRole} {
		if blank(required) {
			return fmt.Errorf("fabric registry gossip record is missing service, scope, or issuer")
		}
	}

	windowOK := record.Epoch > 0 &&
		!record.IssuedAt.IsZero() &&
		!record.ExpiresAt.IsZero() &&
		record.ExpiresAt.After(record.IssuedAt)
	if !windowOK {
		return fmt.Errorf("fabric registry gossip record has invalid epoch or validity window")
	}

	now := registryNow(policy.Now)
	allowedSkew := policy.MaxClockSkew
	if allowedSkew <= 0 {
		allowedSkew = time.Minute
	}
	issuedInFuture := record.IssuedAt.After(now.Add(allowedSkew))
	if issuedInFuture || !record.ExpiresAt.After(now) {
		return fmt.Errorf("fabric registry gossip record is not currently valid")
	}

	if len(record.Endpoints) == 0 {
		return fmt.Errorf("fabric registry gossip record has no endpoints")
	}
	for i := range record.Endpoints {
		candidate := record.Endpoints[i]
		if blank(candidate.EndpointID) || blank(candidate.Address) || blank(candidate.Transport) {
			return fmt.Errorf("fabric registry gossip record contains invalid endpoint")
		}
		if !isQUICOnlyCandidateTransport(candidate.Transport) || hasLegacyEndpointScheme(candidate.Address) {
			return fmt.Errorf("fabric registry gossip endpoint must be QUIC-only")
		}
		if len(candidate.Metadata) > 0 && !json.Valid(candidate.Metadata) {
			return fmt.Errorf("fabric registry gossip endpoint metadata is invalid")
		}
	}
	if len(record.Metadata) > 0 && !json.Valid(record.Metadata) {
		return fmt.Errorf("fabric registry gossip metadata is invalid")
	}
	return nil
}
// canonicalFabricRegistryPayload returns the byte sequence that signatures
// are computed over: the normalized record with its Signatures cleared,
// marshalled to JSON and passed through json.Compact.
func canonicalFabricRegistryPayload(record FabricRegistryGossipRecord) ([]byte, error) {
	unsigned := normalizeFabricRegistryRecord(record)
	// Signatures must not be part of the bytes they sign.
	unsigned.Signatures = nil

	encoded, err := json.Marshal(unsigned)
	if err != nil {
		return nil, err
	}
	canonical := new(bytes.Buffer)
	if err = json.Compact(canonical, encoded); err != nil {
		return nil, err
	}
	return canonical.Bytes(), nil
}
// normalizeFabricRegistryRecord returns a copy of record in canonical form:
// identifier fields are whitespace-trimmed, Service and Scope are lowercased,
// endpoint string fields are trimmed, endpoint certificate pins are passed
// through normalizeCertSHA256, endpoints are stably sorted by (Priority,
// EndpointID) and signatures by (IssuerID, KeyID).
//
// The Endpoints and Signatures slices are copied before being rewritten and
// sorted, so the caller's record is left untouched even though slices share
// backing arrays across struct copies.
func normalizeFabricRegistryRecord(record FabricRegistryGossipRecord) FabricRegistryGossipRecord {
	// s[:0:0] has zero capacity, so append must allocate a fresh backing
	// array. A nil slice stays nil and an empty non-nil slice stays non-nil,
	// which keeps the JSON encoding (null vs []) of the canonical payload
	// unchanged.
	record.Endpoints = append(record.Endpoints[:0:0], record.Endpoints...)
	record.Signatures = append(record.Signatures[:0:0], record.Signatures...)

	record.SchemaVersion = strings.TrimSpace(record.SchemaVersion)
	record.ClusterID = strings.TrimSpace(record.ClusterID)
	record.Service = strings.ToLower(strings.TrimSpace(record.Service))
	record.Scope = strings.ToLower(strings.TrimSpace(record.Scope))
	record.OrganizationID = strings.TrimSpace(record.OrganizationID)
	record.IssuerNodeID = strings.TrimSpace(record.IssuerNodeID)
	record.IssuerRole = strings.TrimSpace(record.IssuerRole)
	record.Generation = strings.TrimSpace(record.Generation)

	for i := range record.Endpoints {
		endpoint := &record.Endpoints[i]
		endpoint.EndpointID = strings.TrimSpace(endpoint.EndpointID)
		endpoint.Address = strings.TrimSpace(endpoint.Address)
		endpoint.Transport = strings.TrimSpace(endpoint.Transport)
		endpoint.Reachability = strings.TrimSpace(endpoint.Reachability)
		endpoint.ConnectivityMode = strings.TrimSpace(endpoint.ConnectivityMode)
		endpoint.Region = strings.TrimSpace(endpoint.Region)
		endpoint.PeerCertSHA256 = normalizeCertSHA256(endpoint.PeerCertSHA256)
	}

	sort.SliceStable(record.Endpoints, func(i, j int) bool {
		if record.Endpoints[i].Priority != record.Endpoints[j].Priority {
			return record.Endpoints[i].Priority < record.Endpoints[j].Priority
		}
		return record.Endpoints[i].EndpointID < record.Endpoints[j].EndpointID
	})
	sort.SliceStable(record.Signatures, func(i, j int) bool {
		if record.Signatures[i].IssuerID != record.Signatures[j].IssuerID {
			return record.Signatures[i].IssuerID < record.Signatures[j].IssuerID
		}
		return record.Signatures[i].KeyID < record.Signatures[j].KeyID
	})
	return record
}
// fabricRegistryIssuerAllowed reports whether issuer may publish record.
// Each restriction on the issuer (Role, Scopes, Services) applies only when
// it is set; an unset restriction matches any record.
func fabricRegistryIssuerAllowed(issuer FabricRegistryTrustedIssuer, record FabricRegistryGossipRecord) bool {
	roleOK := strings.TrimSpace(issuer.Role) == "" || issuer.Role == record.IssuerRole
	scopeOK := len(issuer.Scopes) == 0 || stringInSlice(record.Scope, issuer.Scopes)
	serviceOK := len(issuer.Services) == 0 || stringInSlice(record.Service, issuer.Services)
	return roleOK && scopeOK && serviceOK
}
// fabricRegistryRecordKey derives the registry key for record from its
// cluster, service, scope, and organization identifiers.
func fabricRegistryRecordKey(record FabricRegistryGossipRecord) string {
	key := fabricRegistryKey(
		record.ClusterID,
		record.Service,
		record.Scope,
		record.OrganizationID,
	)
	return key
}
// fabricRegistryScopeResolutionOrder returns the scopes to consult, most
// specific first, when resolving a service for the requested scope.
//
// The scope is trimmed and lowercased before matching. An organization
// request without an organization ID skips the organization scope, a farm
// request consults only the farm scope, and an unrecognized scope is tried
// first and then falls back to cluster and farm.
func fabricRegistryScopeResolutionOrder(scope string, organizationID string) []string {
	normalized := strings.ToLower(strings.TrimSpace(scope))

	if normalized == FabricRegistryScopeOrganization {
		if strings.TrimSpace(organizationID) == "" {
			return []string{FabricRegistryScopeCluster, FabricRegistryScopeFarm}
		}
		return []string{FabricRegistryScopeOrganization, FabricRegistryScopeCluster, FabricRegistryScopeFarm}
	}
	if normalized == FabricRegistryScopeFarm {
		return []string{FabricRegistryScopeFarm}
	}
	if normalized == FabricRegistryScopeCluster || normalized == "" {
		return []string{FabricRegistryScopeCluster, FabricRegistryScopeFarm}
	}
	return []string{normalized, FabricRegistryScopeCluster, FabricRegistryScopeFarm}
}
// selectFabricRegistryEndpoints filters endpoints down to those with a
// non-blank address, a QUIC-only transport, and no legacy address scheme, and
// returns them in preference order.
//
// Ordering: endpoints whose Region matches preferredRegion (case-insensitive,
// only when preferredRegion is non-blank) come first, then lower Priority,
// then higher Weight, then lexicographically smaller EndpointID. The input
// slice is not modified.
func selectFabricRegistryEndpoints(endpoints []FabricRegistryEndpoint, preferredRegion string) []FabricRegistryEndpoint {
	region := strings.TrimSpace(preferredRegion)

	usable := make([]FabricRegistryEndpoint, 0, len(endpoints))
	for i := range endpoints {
		candidate := endpoints[i]
		eligible := strings.TrimSpace(candidate.Address) != "" &&
			isQUICOnlyCandidateTransport(candidate.Transport) &&
			!hasLegacyEndpointScheme(candidate.Address)
		if eligible {
			usable = append(usable, candidate)
		}
	}

	sort.SliceStable(usable, func(a, b int) bool {
		left, right := &usable[a], &usable[b]
		if region != "" {
			leftInRegion := strings.EqualFold(left.Region, region)
			if leftInRegion != strings.EqualFold(right.Region, region) {
				return leftInRegion
			}
		}
		switch {
		case left.Priority != right.Priority:
			return left.Priority < right.Priority
		case left.Weight != right.Weight:
			return left.Weight > right.Weight
		default:
			return left.EndpointID < right.EndpointID
		}
	})
	return usable
}
// probeFabricRegistryEndpoint connects to endpoint through transport, sends a
// single reliable ping frame, and returns the time elapsed from just before
// the connect until the matching pong (same sequence number) is received.
//
// A non-positive timeout defaults to two seconds. The timeout is handed to
// the transport via the connect target and additionally bounds the wait for
// the pong, so a peer that accepts the session but never answers cannot block
// the probe indefinitely when ctx carries no deadline. The timeout error
// wraps context.DeadlineExceeded. Cancellation of ctx is reported as
// ctx.Err().
func probeFabricRegistryEndpoint(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, timeout time.Duration) (time.Duration, error) {
	if transport == nil {
		return 0, fmt.Errorf("fabric registry live probe transport is unavailable")
	}
	if timeout <= 0 {
		timeout = 2 * time.Second
	}
	target := FabricTransportTarget{
		EndpointID:     endpoint.EndpointID,
		PeerID:         endpoint.EndpointID,
		Endpoint:       endpoint.Address,
		Transport:      endpoint.Transport,
		PeerCertSHA256: endpoint.PeerCertSHA256,
		Timeout:        timeout,
		InboundBuffer:  2,
		ErrorBuffer:    2,
	}
	startedAt := time.Now()
	session, err := transport.Connect(ctx, target)
	if err != nil {
		return 0, err
	}
	defer session.Close()

	// The start timestamp doubles as the ping sequence number so the pong
	// can be matched to this probe.
	sequence := uint64(startedAt.UnixNano())
	ping := fabricproto.Frame{
		Type:         fabricproto.FramePing,
		TrafficClass: fabricproto.TrafficClassReliable,
		Sequence:     sequence,
		Payload:      []byte("fabric-registry-live-probe"),
	}
	if err := session.Send(ctx, ping); err != nil {
		return 0, err
	}

	// One timer for the whole wait; frames that are not the matching pong
	// must not extend the deadline.
	pongDeadline := time.NewTimer(timeout)
	defer pongDeadline.Stop()
	for {
		select {
		case frame, ok := <-session.Frames():
			if !ok {
				return 0, fmt.Errorf("fabric registry live probe session closed")
			}
			if frame.Type == fabricproto.FramePong && frame.Sequence == sequence {
				return time.Since(startedAt), nil
			}
		case err, ok := <-session.Errors():
			if !ok {
				return 0, fmt.Errorf("fabric registry live probe error channel closed")
			}
			if err != nil {
				return 0, err
			}
		case <-pongDeadline.C:
			return 0, fmt.Errorf("fabric registry live probe timed out after %s waiting for pong: %w", timeout, context.DeadlineExceeded)
		case <-ctx.Done():
			return 0, ctx.Err()
		}
	}
}
// fabricRegistryKey builds the NUL-separated lookup key for a registry entry.
// All four parts are whitespace-trimmed; service and scope are additionally
// lowercased.
func fabricRegistryKey(clusterID, service, scope, organizationID string) string {
	parts := [...]string{
		strings.TrimSpace(clusterID),
		strings.ToLower(strings.TrimSpace(service)),
		strings.ToLower(strings.TrimSpace(scope)),
		strings.TrimSpace(organizationID),
	}
	return strings.Join(parts[:], "\x00")
}
// fabricRegistryRecordNewer reports whether next should supersede current.
//
// An expired current record (ExpiresAt not after now) is always superseded.
// Otherwise the records are compared by Epoch, then by IssuedAt, and finally
// by trimmed Generation string, with the greater value winning at the first
// field that differs.
func fabricRegistryRecordNewer(next, current FabricRegistryGossipRecord, now time.Time) bool {
	switch {
	case !current.ExpiresAt.After(now):
		return true
	case next.Epoch != current.Epoch:
		return next.Epoch > current.Epoch
	case !next.IssuedAt.Equal(current.IssuedAt):
		return next.IssuedAt.After(current.IssuedAt)
	default:
		return strings.TrimSpace(next.Generation) > strings.TrimSpace(current.Generation)
	}
}
// registryNow returns now converted to UTC, or the current UTC wall-clock
// time when now is the zero value.
func registryNow(now time.Time) time.Time {
	if !now.IsZero() {
		return now.UTC()
	}
	return time.Now().UTC()
}
// stringInSlice reports whether value equals any element of values after
// trimming surrounding whitespace from both sides of the comparison. The
// match is case-sensitive.
func stringInSlice(value string, values []string) bool {
	needle := strings.TrimSpace(value)
	for i := range values {
		if needle == strings.TrimSpace(values[i]) {
			return true
		}
	}
	return false
}
@@ -0,0 +1,280 @@
package mesh
import (
"context"
"crypto/ed25519"
"testing"
"time"
)
// TestFabricRegistryGossipRecordRequiresTrustedSignature signs a record with a
// freshly generated Ed25519 key, checks that it verifies under a policy that
// trusts that issuer, and then checks that changing an endpoint address after
// signing makes verification fail.
func TestFabricRegistryGossipRecordRequiresTrustedSignature(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
record := testFabricRegistryGossipRecord(now, 10)
// The issuer is restricted to the cluster scope and control API service,
// which are the scope and service of the test record.
issuer := FabricRegistryTrustedIssuer{
IssuerID: "authority-1",
Role: FabricRegistryAuthorityControl,
PublicKey: publicKey,
Scopes: []string{FabricRegistryScopeCluster},
Services: []string{FabricRegistryServiceControlAPI},
}
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
if err != nil {
t.Fatalf("sign record: %v", err)
}
if _, err := VerifyFabricRegistryGossipRecord(signed, FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}); err != nil {
t.Fatalf("verify signed record: %v", err)
}
// The struct copy shares the Endpoints backing array with signed; signed is
// not used again after this point.
tampered := signed
tampered.Endpoints[0].Address = "quic://10.10.10.10:19443"
if _, err := VerifyFabricRegistryGossipRecord(tampered, FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}); err == nil {
t.Fatal("tampered record verified")
}
}
// TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord checks that
// verification fails for a correctly signed record whose endpoint address
// uses an https:// URL, and for a correctly signed record whose ExpiresAt
// lies before the policy clock.
func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
record := testFabricRegistryGossipRecord(now, 10)
record.Endpoints[0].Address = "https://control.example.test/api/v1"
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
if err != nil {
t.Fatalf("sign record: %v", err)
}
if _, err := VerifyFabricRegistryGossipRecord(signed, FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{
{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey},
},
Now: now,
}); err == nil {
t.Fatal("legacy HTTP endpoint was accepted")
}
// Issued two hours ago and expired one minute before the policy clock.
expired := testFabricRegistryGossipRecord(now.Add(-2*time.Hour), 11)
expired.ExpiresAt = now.Add(-time.Minute)
expiredSigned, err := SignFabricRegistryGossipRecord(expired, issuer, privateKey)
if err != nil {
t.Fatalf("sign expired record: %v", err)
}
if _, err := VerifyFabricRegistryGossipRecord(expiredSigned, FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{
{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey},
},
Now: now,
}); err == nil {
t.Fatal("expired record was accepted")
}
}
// TestFabricRegistryKeepsActiveRecordUntilNewerVerified applies three signed
// records for the same key in sequence and checks that: an epoch-10 record
// applied as live-verified becomes active; an epoch-9 record does not replace
// it; and an epoch-11 record applied without live verification is held as a
// candidate while Active keeps returning the epoch-10 endpoint until
// MarkLiveVerified is called.
func TestFabricRegistryKeepsActiveRecordUntilNewerVerified(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
policy := FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}
registry := NewFabricRegistry()
active, err := SignFabricRegistryGossipRecord(testFabricRegistryGossipRecord(now, 10), issuer, privateKey)
if err != nil {
t.Fatalf("sign active: %v", err)
}
entry, changed, err := registry.ApplyGossipRecord(active, policy, true)
if err != nil || !changed || entry.State != FabricRegistryActive {
t.Fatalf("apply active entry changed=%t entry=%+v err=%v", changed, entry, err)
}
// Lower epoch (9) but a later IssuedAt than the active record; it is expected
// to be ignored without an error.
old := testFabricRegistryGossipRecord(now.Add(time.Minute), 9)
old.Endpoints[0].Address = "quic://192.0.2.9:19443"
oldSigned, err := SignFabricRegistryGossipRecord(old, issuer, privateKey)
if err != nil {
t.Fatalf("sign old: %v", err)
}
entry, changed, err = registry.ApplyGossipRecord(oldSigned, policy, true)
if err != nil {
t.Fatalf("apply old: %v", err)
}
if changed || entry.Record.Epoch != 10 || entry.Record.Endpoints[0].Address != "quic://192.0.2.10:19443" {
t.Fatalf("older record replaced active entry: changed=%t entry=%+v", changed, entry)
}
newer := testFabricRegistryGossipRecord(now.Add(2*time.Minute), 11)
newer.Endpoints[0].Address = "quic://192.0.2.11:19443"
newerSigned, err := SignFabricRegistryGossipRecord(newer, issuer, privateKey)
if err != nil {
t.Fatalf("sign newer: %v", err)
}
// Advance the policy clock to the newer record's IssuedAt before applying it.
policy.Now = now.Add(2 * time.Minute)
entry, changed, err = registry.ApplyGossipRecord(newerSigned, policy, false)
if err != nil || !changed || entry.State != FabricRegistryCandidate {
t.Fatalf("apply newer candidate changed=%t entry=%+v err=%v", changed, entry, err)
}
activeRecord, ok := registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now)
if !ok || activeRecord.Endpoints[0].Address != "quic://192.0.2.10:19443" {
t.Fatalf("unverified newer candidate displaced active fallback: ok=%t record=%+v", ok, activeRecord)
}
if !registry.MarkLiveVerified("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now.Add(time.Second)) {
t.Fatal("mark live verified failed")
}
activeRecord, ok = registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now.Add(time.Second))
if !ok || activeRecord.Endpoints[0].Address != "quic://192.0.2.11:19443" {
t.Fatalf("newer verified record not active: ok=%t record=%+v", ok, activeRecord)
}
}
// TestFabricRegistryResolveServicePrefersVerifiedScopedRegionalEndpoint
// installs a live-verified cluster-scope record with an "eu" and a "us"
// endpoint plus an unverified organization-scope record, then checks that an
// organization-scope resolve falls back to the cluster record with the
// preferred-region endpoint first, and switches to the organization record
// once it has been marked live-verified.
func TestFabricRegistryResolveServicePrefersVerifiedScopedRegionalEndpoint(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
policy := FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}
registry := NewFabricRegistry()
clusterRecord := testFabricRegistryGossipRecord(now, 10)
// Both endpoints share Priority 10; they differ in Region and Weight.
clusterRecord.Endpoints = []FabricRegistryEndpoint{
{EndpointID: "control-eu", Address: "quic://eu.example.test:19443", Transport: "direct_quic", Region: "eu", Priority: 10, Weight: 1},
{EndpointID: "control-us", Address: "quic://us.example.test:19443", Transport: "direct_quic", Region: "us", Priority: 10, Weight: 10},
}
signedCluster, err := SignFabricRegistryGossipRecord(clusterRecord, issuer, privateKey)
if err != nil {
t.Fatalf("sign cluster record: %v", err)
}
if _, _, err := registry.ApplyGossipRecord(signedCluster, policy, true); err != nil {
t.Fatalf("apply cluster record: %v", err)
}
orgRecord := testFabricRegistryGossipRecord(now.Add(time.Minute), 11)
orgRecord.Scope = FabricRegistryScopeOrganization
orgRecord.OrganizationID = "org-1"
orgRecord.Endpoints = []FabricRegistryEndpoint{
{EndpointID: "control-org", Address: "quic://org.example.test:19443", Transport: "direct_quic", Region: "eu", Priority: 1, Weight: 1},
}
signedOrg, err := SignFabricRegistryGossipRecord(orgRecord, issuer, privateKey)
if err != nil {
t.Fatalf("sign org record: %v", err)
}
policy.Now = now.Add(time.Minute)
// Applied with the live-verified flag false.
if _, _, err := registry.ApplyGossipRecord(signedOrg, policy, false); err != nil {
t.Fatalf("apply org candidate: %v", err)
}
resolved := registry.ResolveService(FabricRegistryResolveRequest{
ClusterID: "cluster-1",
Service: FabricRegistryServiceControlAPI,
Scope: FabricRegistryScopeOrganization,
OrganizationID: "org-1",
PreferredRegion: "us",
Now: now.Add(time.Minute),
})
if !resolved.Found || resolved.Scope != FabricRegistryScopeCluster || resolved.Endpoints[0].EndpointID != "control-us" {
t.Fatalf("expected cluster fallback with preferred region endpoint, got %+v", resolved)
}
if !registry.MarkLiveVerified("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeOrganization, "org-1", now.Add(2*time.Minute)) {
t.Fatal("mark org live verified failed")
}
resolved = registry.ResolveService(FabricRegistryResolveRequest{
ClusterID: "cluster-1",
Service: FabricRegistryServiceControlAPI,
Scope: FabricRegistryScopeOrganization,
OrganizationID: "org-1",
Now: now.Add(2 * time.Minute),
})
if !resolved.Found || resolved.Scope != FabricRegistryScopeOrganization || resolved.Endpoints[0].EndpointID != "control-org" {
t.Fatalf("expected verified organization record, got %+v", resolved)
}
// Both the cluster and the organization record are expected to be active now.
snapshot := registry.Snapshot(now.Add(2 * time.Minute))
if snapshot.Active != 2 || snapshot.Candidate != 0 {
t.Fatalf("unexpected snapshot: %+v", snapshot)
}
}
// TestFabricRegistryVerifyCandidatesPromotesAfterQUICPong starts a local QUIC
// fabric echo server, registers a signed candidate record that points at it
// (including the server certificate's SHA-256 pin), runs VerifyCandidates, and
// checks that the candidate is reported reachable and promoted to active.
func TestFabricRegistryVerifyCandidatesPromotesAfterQUICPong(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
tlsConfig := testQUICTLSConfig(t)
listener := startQUICFabricEchoServerWithTLS(t, tlsConfig)
defer listener.Close()
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
policy := FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}
record := testFabricRegistryGossipRecord(now, 12)
// Point the record's only endpoint at the local listener.
record.Endpoints[0].Address = "quic://" + listener.Addr().String()
record.Endpoints[0].PeerCertSHA256 = testQUICCertSHA256(t, tlsConfig)
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
if err != nil {
t.Fatalf("sign record: %v", err)
}
registry := NewFabricRegistry()
if entry, changed, err := registry.ApplyGossipRecord(signed, policy, false); err != nil || !changed || entry.State != FabricRegistryCandidate {
t.Fatalf("apply candidate changed=%t entry=%+v err=%v", changed, entry, err)
}
// NOTE(review): context.Background() carries no deadline, so this call is
// bounded only by whatever the probe does with the 3s Timeout — confirm.
results := registry.VerifyCandidates(context.Background(), NewQUICFabricTransport(nil), FabricRegistryLiveProbeRequest{
ClusterID: "cluster-1",
Timeout: 3 * time.Second,
Now: now.Add(time.Second),
MaxCandidates: 1,
})
if len(results) != 1 || results[0].Status != "reachable" || !results[0].Promoted {
t.Fatalf("unexpected live probe results: %+v", results)
}
if _, ok := registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", now.Add(time.Second)); !ok {
t.Fatal("candidate was not promoted to active")
}
}
// testFabricRegistryGossipRecord builds an unsigned cluster-scope control API
// record for "cluster-1" with the given epoch, issued at now and valid for
// ten minutes, advertising a single direct QUIC endpoint.
func testFabricRegistryGossipRecord(now time.Time, epoch int64) FabricRegistryGossipRecord {
	endpoint := FabricRegistryEndpoint{
		EndpointID:       "control-a",
		Address:          "quic://192.0.2.10:19443",
		Transport:        "direct_quic",
		Reachability:     "public",
		ConnectivityMode: "direct",
		Priority:         1,
	}
	record := FabricRegistryGossipRecord{
		SchemaVersion: FabricRegistryGossipRecordSchema,
		ClusterID:     "cluster-1",
		Service:       FabricRegistryServiceControlAPI,
		Scope:         FabricRegistryScopeCluster,
		Epoch:         epoch,
		Generation:    "gen",
		IssuedAt:      now,
		ExpiresAt:     now.Add(10 * time.Minute),
		IssuerNodeID:  "authority-1",
		IssuerRole:    FabricRegistryAuthorityControl,
		Endpoints:     []FabricRegistryEndpoint{endpoint},
	}
	return record
}
+59 -728
View File
@@ -20,7 +20,6 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority" "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto" "github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/gorilla/websocket"
) )
type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error
@@ -55,6 +54,22 @@ type RemoteWorkspaceFrameSinkSessionMailboxConsumerResume interface {
type RemoteWorkspaceFrameSinkSessionMailboxPreflight interface { type RemoteWorkspaceFrameSinkSessionMailboxPreflight interface {
PreflightAdapterSessionMailboxConsumerResume(adapterSessionID string, consumerID string, resumeFrom string, limit int, now time.Time) (RemoteWorkspaceAdapterMailboxPreflightSnapshot, error) PreflightAdapterSessionMailboxConsumerResume(adapterSessionID string, consumerID string, resumeFrom string, limit int, now time.Time) (RemoteWorkspaceAdapterMailboxPreflightSnapshot, error)
} }
// FabricSessionEventLogEntry is the JSON-serializable record of one fabric
// session event. Event and ObservedAt are always encoded; every other field
// is omitted from the JSON output when it holds its zero value.
type FabricSessionEventLogEntry struct {
Event string `json:"event"`
ClusterID string `json:"cluster_id,omitempty"`
NodeID string `json:"node_id,omitempty"`
PeerID string `json:"peer_id,omitempty"`
AcceptedBy string `json:"accepted_by,omitempty"`
SessionID string `json:"session_id,omitempty"`
SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
StreamID uint64 `json:"stream_id,omitempty"`
Sequence uint64 `json:"sequence,omitempty"`
TrafficClass fabricproto.TrafficClass `json:"traffic_class,omitempty"`
RemoteAddr string `json:"remote_addr,omitempty"`
Reason string `json:"reason,omitempty"`
ObservedAt time.Time `json:"observed_at"`
}
type VPNPacketIngress interface { type VPNPacketIngress interface {
SendClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, packets [][]byte) error SendClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, packets [][]byte) error
ReceiveClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration) ([][]byte, error) ReceiveClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration) ([][]byte, error)
@@ -84,9 +99,6 @@ type Server struct {
BackendProxyBaseURL string BackendProxyBaseURL string
ClusterAuthorityPublicKey string ClusterAuthorityPublicKey string
ServiceChannelIntrospection bool ServiceChannelIntrospection bool
FabricSessionEnabled bool
FabricSessionWebSocketEnabled bool
FabricSessionLogger FabricSessionEventLogger
} }
func (s Server) Handler() http.Handler { func (s Server) Handler() http.Handler {
@@ -94,9 +106,6 @@ func (s Server) Handler() http.Handler {
mux.HandleFunc("/mesh/v1/health", s.handleHealth) mux.HandleFunc("/mesh/v1/health", s.handleHealth)
mux.HandleFunc("/mesh/v1/forward", s.handleForward) mux.HandleFunc("/mesh/v1/forward", s.handleForward)
mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe) mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe)
if s.FabricSessionEnabled && s.FabricSessionWebSocketEnabled {
mux.HandleFunc("/mesh/v1/fabric/session/ws", s.handleFabricSessionWebSocket)
}
if s.RemoteWorkspaceFrameSink != nil { if s.RemoteWorkspaceFrameSink != nil {
mux.HandleFunc("/mesh/v1/remote-workspace/adapter-sessions/", s.handleRemoteWorkspaceAdapterSessionControl) mux.HandleFunc("/mesh/v1/remote-workspace/adapter-sessions/", s.handleRemoteWorkspaceAdapterSessionControl)
} }
@@ -196,185 +205,6 @@ func (s Server) handleRemoteWorkspaceAdapterSessionSnapshot(w http.ResponseWrite
_ = json.NewEncoder(w).Encode(snapshotter.SnapshotAdapterSessions(includeTerminal, limit, time.Now().UTC())) _ = json.NewEncoder(w).Encode(snapshotter.SnapshotAdapterSessions(includeTerminal, limit, time.Now().UTC()))
} }
// FabricSessionEventLogEntry is the JSON-serializable record of one fabric
// session event. Event and ObservedAt are always encoded; every other field
// is omitted from the JSON output when it holds its zero value.
type FabricSessionEventLogEntry struct {
Event string `json:"event"`
ClusterID string `json:"cluster_id,omitempty"`
NodeID string `json:"node_id,omitempty"`
PeerID string `json:"peer_id,omitempty"`
AcceptedBy string `json:"accepted_by,omitempty"`
SessionID string `json:"session_id,omitempty"`
SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
StreamID uint64 `json:"stream_id,omitempty"`
Sequence uint64 `json:"sequence,omitempty"`
TrafficClass fabricproto.TrafficClass `json:"traffic_class,omitempty"`
RemoteAddr string `json:"remote_addr,omitempty"`
Reason string `json:"reason,omitempty"`
ObservedAt time.Time `json:"observed_at"`
}
// fabricSessionAuthorityPayload is the JSON shape of the signed fabric session
// authority payload. verifyFabricSessionAuthority decodes it from a request
// header and checks its schema version, cluster ID, token hash, session ID,
// optional entry node, and expiry.
type fabricSessionAuthorityPayload struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
SessionID string `json:"session_id"`
SourceNodeID string `json:"source_node_id,omitempty"`
SelectedEntryNodeID string `json:"selected_entry_node_id,omitempty"`
TokenHash string `json:"token_hash"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
}
// fabricSessionAuthDecision is the outcome of validateFabricSessionRequest.
// AcceptedBy is set there to "signed" or "legacy_unsigned"; SessionID is
// filled only for signed requests.
type fabricSessionAuthDecision struct {
AcceptedBy string
SessionID string
}
// handleFabricSessionWebSocket serves the fabric session WebSocket endpoint.
//
// Only GET is allowed. After the request passes validateFabricSessionRequest
// the connection is upgraded (the origin check always accepts) and a fabric
// transport loop runs on it until it returns. An "opened" entry, one entry per
// session event, and a "closed" entry are sent to the session logger; the
// closed entry carries a Reason when the loop ended with an error other than
// context cancellation.
func (s Server) handleFabricSessionWebSocket(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	decision, ok := s.validateFabricSessionRequest(w, r)
	if !ok {
		return
	}
	upgrader := websocket.Upgrader{
		CheckOrigin: func(*http.Request) bool { return true },
	}
	conn, err := upgrader.Upgrade(w, r, nil)
	if err != nil {
		return
	}
	defer conn.Close()

	// newEntry fills in the fields shared by every log entry of this
	// connection and stamps it with the current UTC time.
	newEntry := func(event string) FabricSessionEventLogEntry {
		return FabricSessionEventLogEntry{
			Event:      event,
			ClusterID:  s.Local.ClusterID,
			NodeID:     s.Local.NodeID,
			AcceptedBy: decision.AcceptedBy,
			SessionID:  decision.SessionID,
			RemoteAddr: r.RemoteAddr,
			ObservedAt: time.Now().UTC(),
		}
	}

	s.logFabricSession(newEntry("fabric_session_websocket_opened"))

	loop := fabricproto.TransportLoop{
		Session: fabricproto.NewSession(fabricproto.SessionConfig{}),
		OnEvent: func(event fabricproto.SessionEvent) ([]fabricproto.Frame, error) {
			entry := newEntry("fabric_session_event")
			entry.SessionEvent = event.Type
			entry.StreamID = event.StreamID
			entry.Sequence = event.Sequence
			entry.TrafficClass = event.TrafficClass
			s.logFabricSession(entry)
			return nil, nil
		},
	}
	runErr := loop.RunWebSocket(r.Context(), conn, fabricproto.WebSocketTransportConfig{})

	closed := newEntry("fabric_session_websocket_closed")
	if runErr != nil && !errors.Is(runErr, context.Canceled) {
		closed.Reason = runErr.Error()
	}
	s.logFabricSession(closed)
}
// validateFabricSessionRequest authenticates a fabric session request.
//
// The bearer token must start with "rap_fsn_", otherwise a 401 is written. A
// failure from verifyFabricSessionAuthority is written as a 403 with the
// error text. On success the decision records whether a signed authority
// payload was presented ("signed", with its trimmed session ID) or not
// ("legacy_unsigned"). The boolean is false whenever a response has already
// been written.
func (s Server) validateFabricSessionRequest(w http.ResponseWriter, r *http.Request) (fabricSessionAuthDecision, bool) {
	token := fabricSessionBearerToken(r)
	if !strings.HasPrefix(token, "rap_fsn_") {
		http.Error(w, "fabric session token is required", http.StatusUnauthorized)
		return fabricSessionAuthDecision{}, false
	}
	payload, err := s.verifyFabricSessionAuthority(r, token)
	if err != nil {
		http.Error(w, err.Error(), http.StatusForbidden)
		return fabricSessionAuthDecision{}, false
	}
	if payload == nil {
		return fabricSessionAuthDecision{AcceptedBy: "legacy_unsigned"}, true
	}
	return fabricSessionAuthDecision{
		AcceptedBy: "signed",
		SessionID:  strings.TrimSpace(payload.SessionID),
	}, true
}
// verifyFabricSessionAuthority checks the optional signed authority headers
// that accompany a fabric session token.
//
// It returns (nil, nil) when neither header is present and the server has no
// cluster authority public key configured. When a public key is configured
// the headers are mandatory. A presented payload must carry a signature that
// verifies against the public key, the expected schema version, the local
// cluster ID, the hash of token, and a non-blank session ID; it must also
// match the local node when it names an entry node, and must not be expired
// when it sets an expiry. All failures wrap or return ErrUnauthorizedChannel.
func (s Server) verifyFabricSessionAuthority(r *http.Request, token string) (*fabricSessionAuthorityPayload, error) {
publicKey := strings.TrimSpace(s.ClusterAuthorityPublicKey)
payloadHeader := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Authority-Payload"))
signatureHeader := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Authority-Signature"))
// No authority headers at all: allowed only when no public key is configured.
if payloadHeader == "" && signatureHeader == "" {
if publicKey != "" {
return nil, fmt.Errorf("%w: signed fabric session authority is required", ErrUnauthorizedChannel)
}
return nil, nil
}
// Headers were sent but there is no key to verify them against.
if publicKey == "" {
return nil, ErrUnauthorizedChannel
}
if payloadHeader == "" || signatureHeader == "" {
return nil, fmt.Errorf("%w: fabric session authority payload and signature are required together", ErrUnauthorizedChannel)
}
payloadRaw, err := decodeHeaderJSON(payloadHeader)
if err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority payload", ErrUnauthorizedChannel)
}
signatureRaw, err := decodeHeaderJSON(signatureHeader)
if err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority signature", ErrUnauthorizedChannel)
}
var signature authority.Signature
if err := json.Unmarshal(signatureRaw, &signature); err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority signature", ErrUnauthorizedChannel)
}
// The signature is verified over the raw payload bytes before the payload is
// decoded into a struct.
if err := authority.VerifyRaw(publicKey, payloadRaw, signature); err != nil {
return nil, fmt.Errorf("%w: fabric session authority signature rejected", ErrUnauthorizedChannel)
}
var payload fabricSessionAuthorityPayload
if err := json.Unmarshal(payloadRaw, &payload); err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority payload", ErrUnauthorizedChannel)
}
if payload.SchemaVersion != "rap.fabric_session_authority.v1" ||
payload.ClusterID != s.Local.ClusterID ||
payload.TokenHash != fabricSessionTokenHash(token) ||
strings.TrimSpace(payload.SessionID) == "" {
return nil, fmt.Errorf("%w: fabric session authority payload mismatch", ErrUnauthorizedChannel)
}
// The entry node is compared only when both the payload and the local node
// identify one.
if payload.SelectedEntryNodeID != "" && s.Local.NodeID != "" && payload.SelectedEntryNodeID != s.Local.NodeID {
return nil, fmt.Errorf("%w: fabric session entry node mismatch", ErrUnauthorizedChannel)
}
// A zero ExpiresAt skips the expiry check entirely.
if !payload.ExpiresAt.IsZero() && !payload.ExpiresAt.After(time.Now().UTC()) {
return nil, fmt.Errorf("%w: fabric session lease expired", ErrUnauthorizedChannel)
}
return &payload, nil
}
// logFabricSession forwards entry to the server's fabric session logger and
// does nothing when no logger is configured.
func (s Server) logFabricSession(entry FabricSessionEventLogEntry) {
	if s.FabricSessionLogger == nil {
		return
	}
	s.FabricSessionLogger(entry)
}
func (s Server) handleRemoteWorkspaceAdapterSessionMailbox(w http.ResponseWriter, r *http.Request) { func (s Server) handleRemoteWorkspaceAdapterSessionMailbox(w http.ResponseWriter, r *http.Request) {
reader, ok := s.RemoteWorkspaceFrameSink.(RemoteWorkspaceFrameSinkSessionMailbox) reader, ok := s.RemoteWorkspaceFrameSink.(RemoteWorkspaceFrameSinkSessionMailbox)
if !ok { if !ok {
@@ -711,15 +541,15 @@ func parseRemoteWorkspaceAdapterSessionControlPath(path string) (string, bool) {
} }
func (s Server) handleVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool { func (s Server) handleVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
if clusterID, vpnConnectionID, ok := parseVPNClientPacketWebSocketPath(r.URL.Path); ok { if isVPNClientPacketWebSocketPath(r.URL.Path) {
s.handleVPNPacketWebSocket(w, r, clusterID, "", vpnConnectionID, false, true, "") http.Error(w, "legacy VPN WebSocket dataplane is removed; use QUIC fabric route", http.StatusGone)
return true return true
} }
clusterID, vpnConnectionID, ok := parseVPNClientPacketPath(r.URL.Path) if _, _, ok := parseVPNClientPacketPath(r.URL.Path); !ok {
if !ok {
return false return false
} }
return s.handleVPNPacketHTTP(w, r, clusterID, "", vpnConnectionID, "", false, true, "") http.Error(w, "legacy VPN HTTP dataplane is removed; use QUIC fabric route", http.StatusGone)
return true
} }
func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.ResponseWriter, r *http.Request) bool { func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.ResponseWriter, r *http.Request) bool {
@@ -728,7 +558,7 @@ func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.Response
return false return false
} }
if webSocket { if webSocket {
http.Error(w, "remote workspace service-channel websocket forwarding is not implemented", http.StatusNotImplemented) http.Error(w, "remote workspace service-channel websocket ingress is removed; use QUIC fabric route", http.StatusGone)
return true return true
} }
decision, valid := s.validateFabricServiceChannelRequest(w, r, clusterID, channelID, resourceID, FabricServiceClassRemoteWorkspace, channelClass) decision, valid := s.validateFabricServiceChannelRequest(w, r, clusterID, channelID, resourceID, FabricServiceClassRemoteWorkspace, channelClass)
@@ -809,7 +639,7 @@ func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.Response
"channel_id": channelID, "channel_id": channelID,
"resource_id": resourceID, "resource_id": resourceID,
"data_plane": "validated", "data_plane": "validated",
"payload_flow": "not_implemented", "payload_flow": "validated_only",
}) })
return true return true
} }
@@ -898,7 +728,7 @@ func validateRemoteWorkspaceFrameBatchProbe(payload []byte, requiredChannelClass
return decoded, fmt.Errorf("unsupported remote workspace frame batch schema") return decoded, fmt.Errorf("unsupported remote workspace frame batch schema")
} }
if !decoded.ProbeOnly { if !decoded.ProbeOnly {
return decoded, fmt.Errorf("remote workspace payload forwarding is not implemented") return decoded, fmt.Errorf("remote workspace production payload forwarding is disabled; probe_only required")
} }
if strings.TrimSpace(strings.ToLower(decoded.ServiceClass)) != FabricServiceClassRemoteWorkspace { if strings.TrimSpace(strings.ToLower(decoded.ServiceClass)) != FabricServiceClassRemoteWorkspace {
return decoded, fmt.Errorf("remote workspace frame batch service class mismatch") return decoded, fmt.Errorf("remote workspace frame batch service class mismatch")
@@ -952,438 +782,6 @@ func isAllowedRemoteWorkspaceAdapterFrameDirection(channel string, direction str
} }
} }
// handleFabricServiceChannelVPNPacketIngress serves the service-channel VPN
// packet routes. It reports false when the request path is neither the
// WebSocket nor the HTTP packet route, and true once a response (including a
// validation rejection) has been produced.
func (s Server) handleFabricServiceChannelVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
	// The WebSocket route is probed first; the plain HTTP route is the fallback.
	clusterID, channelID, vpnConnectionID, webSocket := parseFabricServiceChannelVPNPacketWebSocketPath(r.URL.Path)
	if !webSocket {
		var matched bool
		clusterID, channelID, vpnConnectionID, matched = parseFabricServiceChannelVPNPacketPath(r.URL.Path)
		if !matched {
			return false
		}
	}

	decision, valid := s.validateFabricServiceChannelVPNRequest(w, r, clusterID, channelID, vpnConnectionID)
	if !valid {
		// The validator received w and the request counts as handled.
		return true
	}

	if webSocket {
		s.logFabricServiceChannelAccess(r, clusterID, channelID, vpnConnectionID, decision)
		s.preferVPNPacketIngressRoute(decision.PreferredRouteID)
		s.handleVPNPacketWebSocket(w, r, clusterID, channelID, vpnConnectionID, decision.ForceBackendFallback, decision.BackendFallbackAllowed(), decision.BackendRelayPolicy)
		return true
	}

	// Only the HTTP route advertises which authority accepted the request.
	w.Header().Set("X-RAP-Service-Channel-Accepted-By", decision.AcceptedBy)
	s.logFabricServiceChannelAccess(r, clusterID, channelID, vpnConnectionID, decision)
	s.preferVPNPacketIngressRoute(decision.PreferredRouteID)
	fallbackPath := "/api/v1/clusters/" + clusterID + "/vpn-connections/" + vpnConnectionID + "/tunnel/client/packets"
	return s.handleVPNPacketHTTP(w, r, clusterID, channelID, vpnConnectionID, fallbackPath, decision.ForceBackendFallback, decision.BackendFallbackAllowed(), decision.BackendRelayPolicy)
}
// preferVPNPacketIngressRoute forwards a non-blank route ID to the configured
// VPN packet ingress when that ingress implements
// VPNPacketIngressRoutePreference. Blank IDs, a nil ingress, and ingresses
// without the optional interface are ignored.
func (s Server) preferVPNPacketIngressRoute(routeID string) {
	trimmed := strings.TrimSpace(routeID)
	if trimmed == "" || s.VPNPacketIngress == nil {
		return
	}
	preference, supported := s.VPNPacketIngress.(VPNPacketIngressRoutePreference)
	if !supported {
		return
	}
	preference.PreferClientRoute(trimmed)
}
// handleVPNPacketHTTP serves the HTTP VPN packet dataplane for one VPN
// connection and always returns true (the request is considered handled).
//
// POST submits one raw packet, or a packet batch when the query contains
// batch=true. GET receives pending packets, encoded as a batch when
// batch=true and as the first packet otherwise. Any other method yields 405.
//
// forceBackendFallback bypasses the fabric ingress entirely; the backend proxy
// is only attempted when backendFallbackAllowed is true, otherwise a violation
// is logged. backendFallbackPath, when non-empty, overrides the path used for
// the proxied request.
//
// A nil s.VPNPacketIngress is treated as ErrForwardRuntimeUnavailable from the
// send/receive call instead of dereferencing the nil interface, so it follows
// the same fallback and status-code policy as any other ingress failure.
func (s Server) handleVPNPacketHTTP(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string, backendFallbackPath string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) bool {
	batchMode := r.URL.Query().Get("batch") == "true"
	switch r.Method {
	case http.MethodPost:
		body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, MaxProductionVPNPacketPayloadBytes))
		if err != nil {
			http.Error(w, "invalid vpn packet payload", http.StatusBadRequest)
			return true
		}
		if !batchMode && len(body) == 0 {
			http.Error(w, "empty vpn packet payload", http.StatusBadRequest)
			return true
		}
		packets := [][]byte{body}
		if batchMode {
			packets, err = decodeVPNIngressPacketBatch(body)
			if err != nil {
				http.Error(w, "invalid vpn packet batch", http.StatusBadRequest)
				return true
			}
		}
		packets = cleanVPNIngressPacketBatch(packets)
		if len(packets) == 0 {
			http.Error(w, "empty vpn packet batch", http.StatusBadRequest)
			return true
		}
		if forceBackendFallback {
			if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, body, backendFallbackPath) {
				return true
			}
			s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
			http.Error(w, ErrRouteNotFound.Error(), vpnIngressStatusCode(ErrRouteNotFound))
			return true
		}
		trafficClass := inferVPNPacketTrafficClass(r.Header.Get("X-RAP-Traffic-Class"), packets)
		var sendErr error
		if s.VPNPacketIngress == nil {
			// No fabric runtime configured: report it as a send failure so the
			// backend fallback policy below still applies.
			sendErr = ErrForwardRuntimeUnavailable
		} else if classIngress, ok := s.VPNPacketIngress.(VPNPacketIngressTrafficClass); ok {
			sendErr = classIngress.SendClientPacketBatchWithTrafficClass(r.Context(), clusterID, vpnConnectionID, trafficClass, packets)
		} else {
			sendErr = s.VPNPacketIngress.SendClientPacketBatch(r.Context(), clusterID, vpnConnectionID, packets)
		}
		if sendErr != nil {
			if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, body, backendFallbackPath) {
				return true
			}
			s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_send_failed_backend_fallback_blocked", sendErr.Error())
			http.Error(w, sendErr.Error(), vpnIngressStatusCode(sendErr))
			return true
		}
		w.WriteHeader(http.StatusAccepted)
		return true
	case http.MethodGet:
		if forceBackendFallback {
			if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, nil, backendFallbackPath) {
				return true
			}
			s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
			w.WriteHeader(http.StatusNoContent)
			return true
		}
		if s.VPNPacketIngress == nil {
			// Same outcome as a receive error from the ingress.
			http.Error(w, ErrForwardRuntimeUnavailable.Error(), vpnIngressStatusCode(ErrForwardRuntimeUnavailable))
			return true
		}
		timeout := vpnIngressTimeout(r)
		packets, err := s.VPNPacketIngress.ReceiveClientPacketBatch(r.Context(), clusterID, vpnConnectionID, timeout)
		if err != nil {
			http.Error(w, err.Error(), vpnIngressStatusCode(err))
			return true
		}
		packets = cleanVPNIngressPacketBatch(packets)
		if len(packets) == 0 {
			// Nothing from the fabric: give the backend a chance before 204.
			if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, nil, backendFallbackPath) {
				return true
			}
			w.WriteHeader(http.StatusNoContent)
			return true
		}
		if batchMode {
			w.Header().Set("Content-Type", "application/vnd.rap.vpn-packet-batch.v1")
			_, _ = w.Write(encodeVPNIngressPacketBatch(packets))
			return true
		}
		// Non-batch GET returns only the first packet of the received batch.
		w.Header().Set("Content-Type", "application/octet-stream")
		_, _ = w.Write(packets[0])
		return true
	default:
		w.WriteHeader(http.StatusMethodNotAllowed)
		return true
	}
}
// handleVPNPacketWebSocket upgrades a GET request to a WebSocket and runs one
// reader goroutine and one writer goroutine for the VPN connection. It returns
// when the request context is done or when either goroutine reports a result;
// the deferred cancel and conn.Close then tear the other side down.
//
// Non-GET requests get 405; a nil VPNPacketIngress gets 503 before upgrading.
func (s Server) handleVPNPacketWebSocket(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) {
if r.Method != http.MethodGet {
w.WriteHeader(http.StatusMethodNotAllowed)
return
}
if s.VPNPacketIngress == nil {
http.Error(w, ErrForwardRuntimeUnavailable.Error(), http.StatusServiceUnavailable)
return
}
// NOTE(review): CheckOrigin accepts every origin; confirm that callers are
// authenticated before this handler is reached.
upgrader := websocket.Upgrader{
CheckOrigin: func(_ *http.Request) bool { return true },
}
conn, err := upgrader.Upgrade(w, r, nil)
if err != nil {
// No response is written here; presumably Upgrade already replied — TODO confirm.
return
}
defer conn.Close()
conn.SetReadLimit(MaxProductionVPNPacketPayloadBytes)
ctx, cancel := context.WithCancel(r.Context())
defer cancel()
trafficClass := r.Header.Get("X-RAP-Traffic-Class")
// Buffered for both goroutines so neither blocks on send after this
// function has returned.
errCh := make(chan error, 2)
go func() {
errCh <- s.readVPNPacketWebSocket(ctx, conn, clusterID, channelID, vpnConnectionID, trafficClass, forceBackendFallback, backendFallbackAllowed, backendRelayPolicy)
}()
go func() {
errCh <- s.writeVPNPacketWebSocket(ctx, conn, clusterID, channelID, vpnConnectionID, forceBackendFallback, backendFallbackAllowed, backendRelayPolicy)
}()
// The first result from either direction ends the session; the value itself
// is discarded.
select {
case <-ctx.Done():
case <-errCh:
cancel()
}
}
// readVPNPacketWebSocket reads binary WebSocket messages, decodes each as a VPN
// packet batch, and forwards it either to the fabric ingress or to the backend,
// depending on the fallback flags. It loops until a read, decode, or
// non-recoverable forwarding error occurs and returns that error.
//
// Non-binary messages and batches that are empty after cleaning are skipped.
func (s Server) readVPNPacketWebSocket(ctx context.Context, conn *websocket.Conn, clusterID string, channelID string, vpnConnectionID string, trafficClass string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) error {
for {
messageType, payload, err := conn.ReadMessage()
if err != nil {
return err
}
if messageType != websocket.BinaryMessage {
continue
}
packets, err := decodeVPNIngressPacketBatch(payload)
if err != nil {
return err
}
packets = cleanVPNIngressPacketBatch(packets)
if len(packets) == 0 {
continue
}
if forceBackendFallback {
// Forced fallback never touches the fabric ingress.
if !backendFallbackAllowed {
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
return ErrRouteNotFound
}
// The raw batch payload is relayed, not the cleaned packets.
if proxyErr := s.backendVPNPacketPost(ctx, clusterID, vpnConnectionID, payload); proxyErr != nil {
return proxyErr
}
continue
}
// Route errors are retried inside the send helper only when the backend
// fallback is not available (last argument).
sendErr := s.sendVPNPacketWebSocketBatch(ctx, clusterID, vpnConnectionID, inferVPNPacketTrafficClass(trafficClass, packets), packets, !backendFallbackAllowed)
if sendErr != nil {
if !backendFallbackAllowed {
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_send_failed_backend_fallback_blocked", sendErr.Error())
// A retryable error drops this batch but keeps the socket open.
if isRetryableVPNPacketIngressError(sendErr) {
continue
}
return sendErr
}
// When the backend relay also fails, the original fabric error is
// the one reported, not proxyErr.
if proxyErr := s.backendVPNPacketPost(ctx, clusterID, vpnConnectionID, payload); proxyErr != nil {
return sendErr
}
}
}
}
// sendVPNPacketWebSocketBatch sends packets to the fabric ingress, using the
// traffic-class aware method when the ingress implements
// VPNPacketIngressTrafficClass.
//
// When retryRouteErrors is true and the failure is retryable (see
// isRetryableVPNPacketIngressError), the send is attempted up to maxAttempts
// times with a linearly growing backoff (75ms, 125ms, ...). Any other error is
// returned immediately. Context cancellation is honoured before each attempt
// and during the backoff. The last send error is returned once attempts are
// exhausted; no backoff is spent after the final attempt.
func (s Server) sendVPNPacketWebSocketBatch(ctx context.Context, clusterID string, vpnConnectionID string, trafficClass string, packets [][]byte, retryRouteErrors bool) error {
	const maxAttempts = 6
	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if err := ctx.Err(); err != nil {
			return err
		}
		var sendErr error
		if classIngress, ok := s.VPNPacketIngress.(VPNPacketIngressTrafficClass); ok {
			sendErr = classIngress.SendClientPacketBatchWithTrafficClass(ctx, clusterID, vpnConnectionID, trafficClass, packets)
		} else {
			sendErr = s.VPNPacketIngress.SendClientPacketBatch(ctx, clusterID, vpnConnectionID, packets)
		}
		if sendErr == nil {
			return nil
		}
		lastErr = sendErr
		if !retryRouteErrors || !isRetryableVPNPacketIngressError(sendErr) {
			return sendErr
		}
		if attempt == maxAttempts-1 {
			// Nothing follows the final attempt, so waiting would only delay
			// the error.
			break
		}
		timer := time.NewTimer(time.Duration(75+attempt*50) * time.Millisecond)
		select {
		case <-ctx.Done():
			timer.Stop()
			return ctx.Err()
		case <-timer.C:
		}
	}
	return lastErr
}
// isRetryableVPNPacketIngressError reports whether err matches (via errors.Is)
// one of the route/peer availability sentinels that the WebSocket send and
// receive helpers retry on.
func isRetryableVPNPacketIngressError(err error) bool {
	retryable := []error{
		ErrRouteNotFound,
		ErrForwardRuntimeUnavailable,
		ErrForwardPeerUnavailable,
		ErrSyntheticPeerUnavailable,
	}
	for _, sentinel := range retryable {
		if errors.Is(err, sentinel) {
			return true
		}
	}
	return false
}
// receiveVPNPacketWebSocketBatch polls the fabric ingress for client-bound
// packets, waiting up to timeout per attempt.
//
// When retryRouteErrors is true and the failure is retryable, the receive is
// attempted up to maxAttempts times with a growing backoff (75ms, 125ms, ...).
// If every attempt fails with a retryable error the result is (nil, nil), i.e.
// "no packets" rather than an error. Non-retryable errors and context
// cancellation are returned immediately.
func (s Server) receiveVPNPacketWebSocketBatch(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration, retryRouteErrors bool) ([][]byte, error) {
const maxAttempts = 4
var lastErr error
for attempt := 0; attempt < maxAttempts; attempt++ {
if err := ctx.Err(); err != nil {
return nil, err
}
packets, err := s.VPNPacketIngress.ReceiveClientPacketBatch(ctx, clusterID, vpnConnectionID, timeout)
if err == nil {
return packets, nil
}
lastErr = err
if !retryRouteErrors || !isRetryableVPNPacketIngressError(err) {
return nil, err
}
// NOTE(review): the backoff also runs after the final attempt; it may be
// acting as pacing for the caller's polling loop — confirm before removing.
timer := time.NewTimer(time.Duration(75+attempt*50) * time.Millisecond)
select {
case <-ctx.Done():
timer.Stop()
return nil, ctx.Err()
case <-timer.C:
}
}
// Exhausted retries on a retryable error are reported as an empty result.
if retryRouteErrors && isRetryableVPNPacketIngressError(lastErr) {
return nil, nil
}
return nil, lastErr
}
// writeVPNPacketWebSocket loops until ctx is done or an error occurs, pulling
// client-bound packets from the fabric ingress (and, when permitted, from the
// backend) and writing them to the WebSocket as one binary batch message.
// When no packets are available it sends a ping at most every 15 seconds.
func (s Server) writeVPNPacketWebSocket(ctx context.Context, conn *websocket.Conn, clusterID string, channelID string, vpnConnectionID string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) error {
lastPing := time.Now()
for {
// Non-blocking cancellation check at the top of every iteration.
select {
case <-ctx.Done():
return ctx.Err()
default:
}
var packets [][]byte
var err error
if !forceBackendFallback {
// Route errors are retried in the helper only when the backend
// fallback is not available (last argument).
packets, err = s.receiveVPNPacketWebSocketBatch(ctx, clusterID, vpnConnectionID, 50*time.Millisecond, !backendFallbackAllowed)
}
if forceBackendFallback && !backendFallbackAllowed {
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
return ErrRouteNotFound
}
if err != nil && !backendFallbackAllowed {
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_receive_failed_backend_fallback_blocked", err.Error())
return err
}
// The backend is consulted when it is forced, when the fabric receive
// failed, or when the fabric had nothing to deliver.
if backendFallbackAllowed && (forceBackendFallback || err != nil || len(packets) == 0) {
backendPackets, proxyErr := s.backendVPNPacketGet(ctx, clusterID, vpnConnectionID, 50*time.Millisecond)
// Only fail when both the fabric and the backend failed; the fabric
// error is the one returned.
if proxyErr != nil && err != nil {
return err
}
if len(backendPackets) > 0 {
packets = backendPackets
}
}
if len(packets) > 0 {
if err := conn.SetWriteDeadline(time.Now().Add(5 * time.Second)); err != nil {
return err
}
if err := conn.WriteMessage(websocket.BinaryMessage, encodeVPNIngressPacketBatch(packets)); err != nil {
return err
}
continue
}
// Idle path: keep the connection alive with a periodic ping.
if time.Since(lastPing) >= 15*time.Second {
if err := conn.SetWriteDeadline(time.Now().Add(5 * time.Second)); err != nil {
return err
}
if err := conn.WriteMessage(websocket.PingMessage, []byte("rap-vpn")); err != nil {
return err
}
lastPing = time.Now()
}
}
}
// backendVPNPacketPost relays an encoded packet batch to the backend's
// client-packet endpoint for the given VPN connection.
//
// It returns ErrRouteNotFound when no backend base URL is configured, the
// transport error when the request fails, and a formatted error for any
// non-2xx status.
//
// clusterID and vpnConnectionID are path-escaped before being placed in the
// URL: they originate from the decoded request path, so characters such as
// '?' or '#' would otherwise alter the backend target.
func (s Server) backendVPNPacketPost(ctx context.Context, clusterID string, vpnConnectionID string, batchPayload []byte) error {
	target := strings.TrimRight(strings.TrimSpace(s.BackendProxyBaseURL), "/")
	if target == "" {
		return ErrRouteNotFound
	}
	endpoint := target + "/clusters/" + url.PathEscape(clusterID) + "/vpn-connections/" + url.PathEscape(vpnConnectionID) + "/tunnel/client/packets?batch=true"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(batchPayload))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/octet-stream")
	// Identify the entry node/cluster this relay originates from.
	req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
	req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("backend vpn packet post failed: status=%d", resp.StatusCode)
	}
	return nil
}
// backendVPNPacketGet polls the backend's client-packet endpoint for a packet
// batch destined for the given VPN connection.
//
// A non-positive timeout defaults to 50ms and is passed to the backend as
// timeout_ms. It returns (nil, nil) on 204 or an empty body, ErrRouteNotFound
// when no backend base URL is configured, and a formatted error for any other
// non-2xx status. The body is read through a limit of
// MaxProductionVPNPacketPayloadBytes before decoding.
//
// clusterID and vpnConnectionID are path-escaped before being placed in the
// URL: they originate from the decoded request path, so characters such as
// '?' or '#' would otherwise alter the backend target.
func (s Server) backendVPNPacketGet(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration) ([][]byte, error) {
	target := strings.TrimRight(strings.TrimSpace(s.BackendProxyBaseURL), "/")
	if target == "" {
		return nil, ErrRouteNotFound
	}
	if timeout <= 0 {
		timeout = 50 * time.Millisecond
	}
	endpoint := target + "/clusters/" + url.PathEscape(clusterID) + "/vpn-connections/" + url.PathEscape(vpnConnectionID) + "/tunnel/client/packets?batch=true&timeout_ms=" + strconv.FormatInt(timeout.Milliseconds(), 10)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Accept", "application/vnd.rap.vpn-packet-batch.v1")
	// Identify the entry node/cluster this relay originates from.
	req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
	req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNoContent {
		return nil, nil
	}
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("backend vpn packet get failed: status=%d", resp.StatusCode)
	}
	body, err := io.ReadAll(io.LimitReader(resp.Body, MaxProductionVPNPacketPayloadBytes))
	if err != nil {
		return nil, err
	}
	if len(body) == 0 {
		return nil, nil
	}
	return decodeVPNIngressPacketBatch(body)
}
// proxyVPNPacketIngressToBackend proxies the request to the backend using the
// request's own URI (an empty backend path means "no override"). It returns
// whatever proxyVPNPacketIngressToBackendPath returns.
func (s Server) proxyVPNPacketIngressToBackend(w http.ResponseWriter, r *http.Request, body []byte) bool {
return s.proxyVPNPacketIngressToBackendPath(w, r, body, "")
}
// proxyVPNPacketIngressToBackendPath replays the incoming request against the
// configured backend and copies the backend's status, Content-Type, and body
// to w. It returns false, without writing anything, when no usable backend
// URL is configured, when the backend host equals the request host, or when
// the backend request cannot be built or sent; it returns true once a backend
// response has been relayed.
//
// A non-empty backendPath replaces the request path; the original raw query is
// still appended.
func (s Server) proxyVPNPacketIngressToBackendPath(w http.ResponseWriter, r *http.Request, body []byte, backendPath string) bool {
	if strings.TrimSpace(s.BackendProxyBaseURL) == "" {
		return false
	}
	backend, parseErr := url.Parse(s.BackendProxyBaseURL)
	if parseErr != nil {
		return false
	}
	if backend.Scheme == "" || backend.Host == "" {
		return false
	}
	// Refuse to proxy to the same host the request arrived on.
	if strings.EqualFold(backend.Host, r.Host) {
		return false
	}

	// Leave the reader as a nil interface when there is no body to send.
	var payload io.Reader
	if body != nil {
		payload = bytes.NewReader(body)
	}

	var uri string
	if backendPath == "" {
		uri = r.URL.RequestURI()
	} else {
		uri = backendPath
		if rawQuery := r.URL.RawQuery; rawQuery != "" {
			uri = uri + "?" + rawQuery
		}
	}

	outbound, buildErr := http.NewRequestWithContext(r.Context(), r.Method, backend.Scheme+"://"+backend.Host+uri, payload)
	if buildErr != nil {
		return false
	}
	if accept := r.Header.Get("Accept"); accept != "" {
		outbound.Header.Set("Accept", accept)
	}
	if contentType := r.Header.Get("Content-Type"); contentType != "" {
		outbound.Header.Set("Content-Type", contentType)
	}
	outbound.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
	outbound.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)

	upstream, doErr := http.DefaultClient.Do(outbound)
	if doErr != nil {
		return false
	}
	defer upstream.Body.Close()

	// Only Content-Type is propagated from the backend response headers.
	if contentType := upstream.Header.Get("Content-Type"); contentType != "" {
		w.Header().Set("Content-Type", contentType)
	}
	w.WriteHeader(upstream.StatusCode)
	_, _ = io.Copy(w, upstream.Body)
	return true
}
type fabricServiceChannelLeaseAuthorityPayload struct { type fabricServiceChannelLeaseAuthorityPayload struct {
SchemaVersion string `json:"schema_version"` SchemaVersion string `json:"schema_version"`
ChannelID string `json:"channel_id"` ChannelID string `json:"channel_id"`
@@ -1443,10 +841,6 @@ func (d fabricServiceChannelRequestDecision) BackendFallbackAllowed() bool {
return strings.TrimSpace(d.BackendRelayPolicy) != "disabled" return strings.TrimSpace(d.BackendRelayPolicy) != "disabled"
} }
// validateFabricServiceChannelVPNRequest validates a service-channel request
// for a VPN connection by delegating to validateFabricServiceChannelRequest
// with the VPN packet service class and the production VPN packet channel
// class, using vpnConnectionID as the resource ID.
func (s Server) validateFabricServiceChannelVPNRequest(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string) (fabricServiceChannelRequestDecision, bool) {
return s.validateFabricServiceChannelRequest(w, r, clusterID, channelID, vpnConnectionID, FabricServiceClassVPNPackets, ProductionChannelVPNPacket)
}
func (s Server) validateFabricServiceChannelRequest(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, resourceID string, expectedServiceClass string, defaultChannelClass string) (fabricServiceChannelRequestDecision, bool) { func (s Server) validateFabricServiceChannelRequest(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, resourceID string, expectedServiceClass string, defaultChannelClass string) (fabricServiceChannelRequestDecision, bool) {
var decision fabricServiceChannelRequestDecision var decision fabricServiceChannelRequestDecision
expectedServiceClass = strings.TrimSpace(strings.ToLower(expectedServiceClass)) expectedServiceClass = strings.TrimSpace(strings.ToLower(expectedServiceClass))
@@ -1485,7 +879,7 @@ func (s Server) validateFabricServiceChannelRequest(w http.ResponseWriter, r *ht
http.Error(w, err.Error(), http.StatusForbidden) http.Error(w, err.Error(), http.StatusForbidden)
return decision, false return decision, false
} }
decision.AcceptedBy = "legacy_unsigned" decision.AcceptedBy = "token_authorized"
decision.ServiceClass = serviceClass decision.ServiceClass = serviceClass
decision.ChannelClass = channelClass decision.ChannelClass = channelClass
if payload != nil && (payload.Status == "degraded_fallback" || payload.PrimaryRoute.Status == "missing_route_intent") { if payload != nil && (payload.Status == "degraded_fallback" || payload.PrimaryRoute.Status == "missing_route_intent") {
@@ -1571,30 +965,6 @@ func (s Server) logFabricServiceChannelAccess(r *http.Request, clusterID string,
s.FabricServiceChannelLogger(entry) s.FabricServiceChannelLogger(entry)
} }
// logFabricServiceChannelViolation emits a
// "fabric_service_channel_data_plane_violation" entry through the configured
// FabricServiceChannelLogger. Nothing is logged when no logger is set or when
// channelID is blank. r may be nil (as on WebSocket paths); the method and
// path fields are then left empty.
func (s Server) logFabricServiceChannelViolation(r *http.Request, clusterID string, channelID string, resourceID string, backendRelayPolicy string, status string, reason string) {
	if s.FabricServiceChannelLogger == nil {
		return
	}
	if strings.TrimSpace(channelID) == "" {
		return
	}

	var requestMethod, requestPath string
	if r != nil {
		requestMethod = r.Method
		if r.URL != nil {
			requestPath = r.URL.Path
		}
	}

	s.FabricServiceChannelLogger(FabricServiceChannelAccessLogEntry{
		Event:              "fabric_service_channel_data_plane_violation",
		ClusterID:          clusterID,
		ChannelID:          channelID,
		ResourceID:         resourceID,
		LocalNodeID:        s.Local.NodeID,
		BackendRelayPolicy: strings.TrimSpace(backendRelayPolicy),
		ViolationStatus:    strings.TrimSpace(status),
		ViolationReason:    strings.TrimSpace(reason),
		OccurredAt:         time.Now().UTC(),
		Method:             requestMethod,
		Path:               requestPath,
	})
}
func (s Server) verifyFabricServiceChannelLeaseAuthority(r *http.Request, clusterID string, channelID string, resourceID string, serviceClass string, channelClass string, token string) (*fabricServiceChannelLeaseAuthorityPayload, error) { func (s Server) verifyFabricServiceChannelLeaseAuthority(r *http.Request, clusterID string, channelID string, resourceID string, serviceClass string, channelClass string, token string) (*fabricServiceChannelLeaseAuthorityPayload, error) {
publicKey := strings.TrimSpace(s.ClusterAuthorityPublicKey) publicKey := strings.TrimSpace(s.ClusterAuthorityPublicKey)
payloadHeader := strings.TrimSpace(r.Header.Get("X-RAP-Service-Channel-Authority-Payload")) payloadHeader := strings.TrimSpace(r.Header.Get("X-RAP-Service-Channel-Authority-Payload"))
@@ -1657,15 +1027,15 @@ func validateFabricServiceChannelDataPlaneContract(contract fabricServiceChannel
} }
requiredFlowClass = strings.TrimSpace(strings.ToLower(requiredFlowClass)) requiredFlowClass = strings.TrimSpace(strings.ToLower(requiredFlowClass))
if contract.SchemaVersion != "rap.fabric_service_channel_data_plane.v1" || if contract.SchemaVersion != "rap.fabric_service_channel_data_plane.v1" ||
contract.WorkingDataTransport != "fabric_service_channel" || contract.WorkingDataTransport != "fabric_quic_route" ||
contract.SteadyStateTransport != "fabric_route" || contract.SteadyStateTransport != "fabric_route" ||
(contract.BackendRelayPolicy != "degraded_fallback_only" && contract.BackendRelayPolicy != "disabled") || contract.BackendRelayPolicy != "disabled" ||
!contract.ServiceNeutral || !contract.ServiceNeutral ||
!contract.ProtocolAgnostic || !contract.ProtocolAgnostic ||
contract.LogicalFlowMode != "multi_flow_isolated" { contract.LogicalFlowMode != "multi_flow_isolated" {
return fmt.Errorf("%w: unsupported service channel data-plane contract", ErrUnauthorizedChannel) return fmt.Errorf("%w: unsupported service channel data-plane contract", ErrUnauthorizedChannel)
} }
if contract.Mode != "" && contract.Mode != "fabric_primary" && contract.Mode != "degraded_backend_fallback" { if contract.Mode != "" && contract.Mode != "fabric_primary" && contract.Mode != "fabric_quic_only" {
return fmt.Errorf("%w: unsupported service channel data-plane mode", ErrUnauthorizedChannel) return fmt.Errorf("%w: unsupported service channel data-plane mode", ErrUnauthorizedChannel)
} }
if requiredFlowClass != "" && len(contract.RequiredFlowIsolationClasses) > 0 && !containsString(contract.RequiredFlowIsolationClasses, requiredFlowClass) { if requiredFlowClass != "" && len(contract.RequiredFlowIsolationClasses) > 0 && !containsString(contract.RequiredFlowIsolationClasses, requiredFlowClass) {
@@ -1796,29 +1166,6 @@ func fabricServiceChannelBearerToken(r *http.Request) string {
return strings.TrimSpace(r.URL.Query().Get("service_channel_token")) return strings.TrimSpace(r.URL.Query().Get("service_channel_token"))
} }
// fabricSessionTokenHash returns the lowercase hex SHA-256 digest of token
// after surrounding whitespace has been trimmed.
func fabricSessionTokenHash(token string) string {
	hasher := sha256.New()
	hasher.Write([]byte(strings.TrimSpace(token)))
	return hex.EncodeToString(hasher.Sum(nil))
}
// fabricSessionBearerToken extracts the fabric session token from a request.
// Sources are consulted in order: the X-RAP-Fabric-Session-Token header, an
// Authorization header with a case-insensitive "Bearer " prefix, and finally
// the fabric_session_token query parameter. All values are whitespace-trimmed;
// a nil request yields "".
func fabricSessionBearerToken(r *http.Request) string {
	if r == nil {
		return ""
	}
	const bearerPrefix = "Bearer "

	fromHeader := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Token"))
	if fromHeader != "" {
		return fromHeader
	}

	authorization := strings.TrimSpace(r.Header.Get("Authorization"))
	// The length check guarantees at least one character after the prefix.
	hasBearer := len(authorization) > len(bearerPrefix) &&
		strings.EqualFold(authorization[:len(bearerPrefix)], bearerPrefix)
	if hasBearer {
		return strings.TrimSpace(authorization[len(bearerPrefix):])
	}

	return strings.TrimSpace(r.URL.Query().Get("fabric_session_token"))
}
// isAllowedFabricServiceVPNChannel reports whether channel is permitted for
// the VPN packet service class, as decided by
// isAllowedFabricServiceChannelForClass.
func isAllowedFabricServiceVPNChannel(channel string) bool {
return isAllowedFabricServiceChannelForClass(FabricServiceClassVPNPackets, channel)
}
func isAllowedFabricServiceChannelForClass(serviceClass string, channel string) bool { func isAllowedFabricServiceChannelForClass(serviceClass string, channel string) bool {
serviceClass = strings.TrimSpace(strings.ToLower(serviceClass)) serviceClass = strings.TrimSpace(strings.ToLower(serviceClass))
channel = strings.TrimSpace(strings.ToLower(channel)) channel = strings.TrimSpace(strings.ToLower(channel))
@@ -1846,25 +1193,6 @@ func containsString(values []string, target string) bool {
return false return false
} }
// parseFabricServiceChannelVPNPacketWebSocketPath matches paths of the form
//
//	/api/v1/clusters/{cluster}/fabric/service-channels/{channel}/vpn-connections/{connection}/packets/ws
//
// and returns the cluster, channel, and VPN connection identifiers. The final
// result is false (with empty identifiers) when the path has a different shape
// or any identifier segment is empty.
func parseFabricServiceChannelVPNPacketWebSocketPath(path string) (string, string, string, bool) {
	segments := strings.Split(strings.Trim(path, "/"), "/")
	if len(segments) != 11 {
		return "", "", "", false
	}
	// Fixed (literal) segments, keyed by position.
	literals := [...]struct {
		index int
		value string
	}{
		{0, "api"},
		{1, "v1"},
		{2, "clusters"},
		{4, "fabric"},
		{5, "service-channels"},
		{7, "vpn-connections"},
		{9, "packets"},
		{10, "ws"},
	}
	for _, literal := range literals {
		if segments[literal.index] != literal.value {
			return "", "", "", false
		}
	}
	cluster, channel, connection := segments[3], segments[6], segments[8]
	if cluster == "" || channel == "" || connection == "" {
		return "", "", "", false
	}
	return cluster, channel, connection, true
}
func parseFabricServiceChannelRemoteWorkspacePath(path string) (string, string, string, string, bool, bool) { func parseFabricServiceChannelRemoteWorkspacePath(path string) (string, string, string, string, bool, bool) {
parts := strings.Split(strings.Trim(path, "/"), "/") parts := strings.Split(strings.Trim(path, "/"), "/")
if len(parts) == 11 && if len(parts) == 11 &&
@@ -1897,6 +1225,34 @@ func parseFabricServiceChannelRemoteWorkspacePath(path string) (string, string,
return parts[3], parts[6], parts[8], strings.TrimSpace(strings.ToLower(parts[10])), false, true return parts[3], parts[6], parts[8], strings.TrimSpace(strings.ToLower(parts[10])), false, true
} }
// handleFabricServiceChannelVPNPacketIngress answers the removed
// service-channel VPN packet routes with 410 Gone. It returns false when the
// path is neither the WebSocket nor the HTTP packet route, so other handlers
// can process the request.
func (s Server) handleFabricServiceChannelVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
	requestPath := r.URL.Path
	var message string
	switch {
	case isFabricServiceChannelVPNPacketWebSocketPath(requestPath):
		message = "fabric service-channel WebSocket dataplane is removed; use QUIC fabric route"
	default:
		if _, _, _, matched := parseFabricServiceChannelVPNPacketPath(requestPath); !matched {
			return false
		}
		message = "fabric service-channel HTTP dataplane is removed; use QUIC fabric route"
	}
	http.Error(w, message, http.StatusGone)
	return true
}
// isFabricServiceChannelVPNPacketWebSocketPath reports whether path has the
// form
//
//	/api/v1/clusters/{cluster}/fabric/service-channels/{channel}/vpn-connections/{connection}/packets/ws
//
// with non-empty cluster, channel, and connection segments.
func isFabricServiceChannelVPNPacketWebSocketPath(path string) bool {
	// An empty template entry marks an identifier slot that must be non-empty.
	template := [...]string{
		"api", "v1", "clusters", "",
		"fabric", "service-channels", "",
		"vpn-connections", "",
		"packets", "ws",
	}
	segments := strings.Split(strings.Trim(path, "/"), "/")
	if len(segments) != len(template) {
		return false
	}
	for i, want := range template {
		got := segments[i]
		if want == "" {
			if got == "" {
				return false
			}
			continue
		}
		if got != want {
			return false
		}
	}
	return true
}
func parseFabricServiceChannelVPNPacketPath(path string) (string, string, string, bool) { func parseFabricServiceChannelVPNPacketPath(path string) (string, string, string, bool) {
parts := strings.Split(strings.Trim(path, "/"), "/") parts := strings.Split(strings.Trim(path, "/"), "/")
if len(parts) != 10 || if len(parts) != 10 ||
@@ -1915,7 +1271,7 @@ func parseFabricServiceChannelVPNPacketPath(path string) (string, string, string
return parts[3], parts[6], parts[8], true return parts[3], parts[6], parts[8], true
} }
func parseVPNClientPacketWebSocketPath(path string) (string, string, bool) { func isVPNClientPacketWebSocketPath(path string) bool {
parts := strings.Split(strings.Trim(path, "/"), "/") parts := strings.Split(strings.Trim(path, "/"), "/")
if len(parts) != 10 || if len(parts) != 10 ||
parts[0] != "api" || parts[0] != "api" ||
@@ -1926,12 +1282,9 @@ func parseVPNClientPacketWebSocketPath(path string) (string, string, bool) {
parts[7] != "client" || parts[7] != "client" ||
parts[8] != "packets" || parts[8] != "packets" ||
parts[9] != "ws" { parts[9] != "ws" {
return "", "", false return false
} }
if parts[3] == "" || parts[5] == "" { return parts[3] != "" && parts[5] != ""
return "", "", false
}
return parts[3], parts[5], true
} }
func parseVPNClientPacketPath(path string) (string, string, bool) { func parseVPNClientPacketPath(path string) (string, string, bool) {
@@ -1952,28 +1305,6 @@ func parseVPNClientPacketPath(path string) (string, string, bool) {
return parts[3], parts[5], true return parts[3], parts[5], true
} }
// vpnIngressTimeout derives the receive timeout from the timeout_ms query
// parameter. Missing, unparsable, zero, or negative values fall back to
// 25000ms, and values above 30000ms are capped at 30000ms.
func vpnIngressTimeout(r *http.Request) time.Duration {
	const (
		defaultMillis = 25000
		maxMillis     = 30000
	)
	// The parse error is deliberately ignored: the returned value alone drives
	// the default/cap decision.
	millis, _ := strconv.Atoi(r.URL.Query().Get("timeout_ms"))
	switch {
	case millis <= 0:
		millis = defaultMillis
	case millis > maxMillis:
		millis = maxMillis
	}
	return time.Duration(millis) * time.Millisecond
}
// vpnIngressStatusCode maps an ingress error to an HTTP status code:
// runtime/route/peer availability errors become 503, authorization and
// cluster/node mismatch errors become 403, and anything else becomes 502.
//
// Matching uses errors.Is so that sentinels wrapped with %w are classified the
// same way as the bare sentinel values.
func vpnIngressStatusCode(err error) int {
	switch {
	case errors.Is(err, ErrForwardRuntimeUnavailable),
		errors.Is(err, ErrRouteNotFound),
		errors.Is(err, ErrForwardPeerUnavailable):
		return http.StatusServiceUnavailable
	case errors.Is(err, ErrUnauthorizedChannel),
		errors.Is(err, ErrClusterMismatch),
		errors.Is(err, ErrNodeMismatch):
		return http.StatusForbidden
	default:
		return http.StatusBadGateway
	}
}
func encodeVPNIngressPacketBatch(packets [][]byte) []byte { func encodeVPNIngressPacketBatch(packets [][]byte) []byte {
packets = cleanVPNIngressPacketBatch(packets) packets = cleanVPNIngressPacketBatch(packets)
total := 0 total := 0
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,49 @@
package fabricvpn
import (
"encoding/json"
"os"
"strings"
"testing"
)
// TestLiveFabricControlRequest issues one control request through a started
// Manager. It is skipped unless RAP_LIVE_FABRIC_CONTROL_CONFIG is set.
// RAP_LIVE_FABRIC_CONTROL_PATH, _METHOD, and _BODY optionally override the
// request; the body, when given, must be valid JSON. The test passes when the
// response text contains "status_code".
func TestLiveFabricControlRequest(t *testing.T) {
	// env returns the trimmed value of an environment variable.
	env := func(key string) string {
		return strings.TrimSpace(os.Getenv(key))
	}

	cfg := env("RAP_LIVE_FABRIC_CONTROL_CONFIG")
	if cfg == "" {
		t.Skip("set RAP_LIVE_FABRIC_CONTROL_CONFIG to run live fabric control test")
	}
	path := env("RAP_LIVE_FABRIC_CONTROL_PATH")
	if path == "" {
		path = "/organizations/?user_id=3fded8a8-f19b-4974-919f-44d34ac5f63d"
	}
	method := env("RAP_LIVE_FABRIC_CONTROL_METHOD")
	if method == "" {
		method = "GET"
	}
	body := env("RAP_LIVE_FABRIC_CONTROL_BODY")

	manager := NewManager()
	if startErr := manager.Start(cfg); startErr != nil {
		t.Fatalf("start manager: %v", startErr)
	}
	defer manager.Stop()

	request := map[string]any{
		"method": method,
		"path":   path,
	}
	if body != "" {
		// Validate the body as JSON and embed it verbatim.
		var raw json.RawMessage
		if decodeErr := json.Unmarshal([]byte(body), &raw); decodeErr != nil {
			t.Fatalf("invalid request body: %v", decodeErr)
		}
		request["body"] = raw
	}

	encoded, marshalErr := json.Marshal(request)
	if marshalErr != nil {
		t.Fatal(marshalErr)
	}
	response, requestErr := manager.ControlRequest(string(encoded))
	if requestErr != nil {
		t.Fatalf("control request failed: %v", requestErr)
	}
	if !strings.Contains(response, "status_code") {
		t.Fatalf("unexpected control response: %s", response)
	}
	t.Log(response)
}
@@ -243,7 +243,7 @@ func (m *Manager) connect(ctx context.Context, cfg runtimeConfig, cancel context
if lastErr == nil { if lastErr == nil {
lastErr = fmt.Errorf("no QUIC exit endpoints available") lastErr = fmt.Errorf("no QUIC exit endpoints available")
} }
return lastErr return fmt.Errorf("fabric bootstrap failed after %d endpoint candidates: %w", len(cfg.Endpoints), lastErr)
} }
func (m *Manager) protectedQUICDialer() func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error) { func (m *Manager) protectedQUICDialer() func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error) {
@@ -447,11 +447,17 @@ func (m *Manager) ControlRequest(payloadJSON string) (string, error) {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return "", ctx.Err() return "", ctx.Err()
case err := <-session.Errors(): case err, ok := <-session.Errors():
if !ok {
return "", fmt.Errorf("fabric control error stream closed")
}
if err != nil { if err != nil {
return "", err return "", err
} }
case frame := <-session.Frames(): case frame, ok := <-session.Frames():
if !ok {
return "", fmt.Errorf("fabric control stream closed")
}
if frame.Type != fabricproto.FrameData || frame.StreamID != mesh.FabricControlForwardQUICStreamID { if frame.Type != fabricproto.FrameData || frame.StreamID != mesh.FabricControlForwardQUICStreamID {
continue continue
} }
@@ -460,7 +466,7 @@ func (m *Manager) ControlRequest(payloadJSON string) (string, error) {
return "", err return "", err
} }
if response.Error != "" { if response.Error != "" {
return "", fmt.Errorf(response.Error) return "", fmt.Errorf("%s", response.Error)
} }
return string(response.Payload), nil return string(response.Payload), nil
} }
+109
View File
@@ -166,6 +166,7 @@ type DockerInstallProfile struct {
BackendURL string `json:"backend_url"` BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"` ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"`
ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"` ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records,omitempty"`
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact,omitempty"` DockerImageArtifact *DockerArtifact `json:"docker_image_artifact,omitempty"`
JoinToken string `json:"join_token"` JoinToken string `json:"join_token"`
NodeName string `json:"node_name"` NodeName string `json:"node_name"`
@@ -203,6 +204,7 @@ type WindowsInstallProfile struct {
BackendURL string `json:"backend_url"` BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"` ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"`
ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"` ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records,omitempty"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact,omitempty"` NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact,omitempty"`
JoinToken string `json:"join_token"` JoinToken string `json:"join_token"`
NodeName string `json:"node_name"` NodeName string `json:"node_name"`
@@ -235,6 +237,7 @@ type LinuxInstallProfile struct {
BackendURL string `json:"backend_url"` BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"` ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"`
ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"` ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records,omitempty"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact,omitempty"` NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact,omitempty"`
JoinToken string `json:"join_token"` JoinToken string `json:"join_token"`
NodeName string `json:"node_name"` NodeName string `json:"node_name"`
@@ -372,6 +375,28 @@ type NodeUpdatePlan struct {
ProductionForwarding bool `json:"production_forwarding"` ProductionForwarding bool `json:"production_forwarding"`
} }
// NodeBridgeReplayProductPlan is the per-product element of
// NodeBridgeReplayPlan.Products. It carries one product's recovery-bridge
// fields together with that product's NodeUpdatePlan.
//
// Fields tagged omitempty are left out of the JSON when they hold the zero
// value; the remaining fields are always serialized.
type NodeBridgeReplayProductPlan struct {
Product string `json:"product"`
RecoveryBridgeMode string `json:"recovery_bridge_mode,omitempty"`
RecoveryBridgeReplayReady bool `json:"recovery_bridge_replay_ready"`
LastStatusReason string `json:"last_status_reason,omitempty"`
// UpdatePlan is embedded by value, so it is always present in the JSON.
UpdatePlan NodeUpdatePlan `json:"update_plan"`
}
// NodeBridgeReplayPlan is the JSON document describing the bridge-replay
// state of a single node (identified by ClusterID and NodeID) and of each of
// its products.
//
// The boolean flags are always serialized; the string and slice fields tagged
// omitempty are dropped when empty.
type NodeBridgeReplayPlan struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
NodeName string `json:"node_name,omitempty"`
HealthStatus string `json:"health_status,omitempty"`
HeartbeatStale bool `json:"heartbeat_stale"`
BridgeHoldRequired bool `json:"bridge_hold_required"`
RecoveryBridgeReplayReady bool `json:"recovery_bridge_replay_ready"`
BridgeHoldReasons []string `json:"bridge_hold_reasons,omitempty"`
BridgeActions []string `json:"bridge_actions,omitempty"`
// Products holds one NodeBridgeReplayProductPlan per product.
Products []NodeBridgeReplayProductPlan `json:"products,omitempty"`
}
type NodeUpdateStatus struct { type NodeUpdateStatus struct {
ID string `json:"id"` ID string `json:"id"`
ClusterID string `json:"cluster_id"` ClusterID string `json:"cluster_id"`
@@ -388,6 +413,77 @@ type NodeUpdateStatus struct {
ObservedAt time.Time `json:"observed_at"` ObservedAt time.Time `json:"observed_at"`
} }
// StaleNodeRiskReport is the cluster-wide stale-node risk document. It
// combines cluster-level flags and lists with a per-node breakdown (Nodes)
// and aggregate counters (Summary).
type StaleNodeRiskReport struct {
ClusterID string `json:"cluster_id"`
GeneratedAt time.Time `json:"generated_at"`
// HeartbeatStaleAfterSeconds is expressed in seconds, per the field name.
HeartbeatStaleAfterSeconds int `json:"heartbeat_stale_after_seconds"`
LegacyRemovalAllowed bool `json:"legacy_removal_allowed"`
BridgeHoldRequired bool `json:"bridge_hold_required"`
BridgeHoldNodeIDs []string `json:"bridge_hold_node_ids,omitempty"`
BridgeHoldReasons []string `json:"bridge_hold_reasons,omitempty"`
BlockedOperations []string `json:"blocked_operations,omitempty"`
// Nodes has no omitempty tag, so a nil slice is serialized as JSON null.
Nodes []StaleNodeRiskNode `json:"nodes"`
Summary StaleNodeRiskSummary `json:"summary"`
}
// StaleNodeRiskSummary holds the integer counters exposed as
// StaleNodeRiskReport.Summary. None of the fields use omitempty, so every
// counter is serialized even when it is zero.
type StaleNodeRiskSummary struct {
TotalNodes int `json:"total_nodes"`
StaleNodes int `json:"stale_nodes"`
BlockedNodes int `json:"blocked_nodes"`
DirectPeerAlertNodes int `json:"direct_peer_alert_nodes"`
ArtifactGapNodes int `json:"artifact_gap_nodes"`
UnknownProfileNodes int `json:"unknown_profile_nodes"`
WaitingUpdateStatusNodes int `json:"waiting_update_status_nodes"`
UnknownVersionNodes int `json:"unknown_version_nodes"`
LegacyRecoveryContractNodes int `json:"legacy_recovery_contract_nodes"`
RecoveryBridgeRequiredNodes int `json:"recovery_bridge_required_nodes"`
RecoveryBridgeReplayReadyNodes int `json:"recovery_bridge_replay_ready_nodes"`
WaitingRecoveryHeartbeatNodes int `json:"waiting_recovery_heartbeat_nodes"`
}
// StaleNodeRiskNode is the per-node element of StaleNodeRiskReport.Nodes.
//
// Boolean flags are always serialized. Pointer fields (ReportedVersion,
// LastSeenAt), the direct-peer counters and the slice fields are tagged
// omitempty and are therefore dropped when nil, zero or empty.
type StaleNodeRiskNode struct {
NodeID string `json:"node_id"`
Name string `json:"name"`
RegistrationStatus string `json:"registration_status"`
HealthStatus string `json:"health_status"`
ReportedVersion *string `json:"reported_version,omitempty"`
LastSeenAt *time.Time `json:"last_seen_at,omitempty"`
HeartbeatStale bool `json:"heartbeat_stale"`
Blocked bool `json:"blocked"`
DirectPeerAlert bool `json:"direct_peer_alert"`
DirectPeerReadyCount int `json:"direct_peer_ready_count,omitempty"`
DirectPeerTargetCount int `json:"direct_peer_target_count,omitempty"`
DirectPeerDeficit int `json:"direct_peer_deficit,omitempty"`
Alerts []string `json:"alerts,omitempty"`
RecoveryBridgeRequired bool `json:"recovery_bridge_required"`
RecoveryBridgeReplayReady bool `json:"recovery_bridge_replay_ready"`
RecoveryBridgeActions []string `json:"recovery_bridge_actions,omitempty"`
Risks []string `json:"risks,omitempty"`
// Products holds one StaleNodeRiskProduct per product on this node.
Products []StaleNodeRiskProduct `json:"products,omitempty"`
}
// StaleNodeRiskProduct is the per-product element of
// StaleNodeRiskNode.Products.
//
// Product and the boolean flags are always serialized; every other field is
// tagged omitempty and is dropped when empty or nil.
type StaleNodeRiskProduct struct {
Product string `json:"product"`
CurrentVersion string `json:"current_version,omitempty"`
// TargetVersion is a pointer, so "not set" (nil) is distinguishable from "".
TargetVersion *string `json:"target_version,omitempty"`
Channel string `json:"channel,omitempty"`
Strategy string `json:"strategy,omitempty"`
Enabled bool `json:"enabled"`
DetectedOS string `json:"detected_os,omitempty"`
DetectedArch string `json:"detected_arch,omitempty"`
DetectedInstallType string `json:"detected_install_type,omitempty"`
CompatibleArtifactFound bool `json:"compatible_artifact_found"`
MatchingReleaseVersion string `json:"matching_release_version,omitempty"`
LastStatusObservedAt *time.Time `json:"last_status_observed_at,omitempty"`
LastStatusPhase string `json:"last_status_phase,omitempty"`
LastStatusValue string `json:"last_status_value,omitempty"`
LastStatusReason string `json:"last_status_reason,omitempty"`
RecoveryBridgeRequired bool `json:"recovery_bridge_required"`
RecoveryBridgeReplayReady bool `json:"recovery_bridge_replay_ready"`
RecoveryBridgeMode string `json:"recovery_bridge_mode,omitempty"`
Risks []string `json:"risks,omitempty"`
}
type NodeBootstrap struct { type NodeBootstrap struct {
NodeID string `json:"node_id"` NodeID string `json:"node_id"`
ClusterID string `json:"cluster_id"` ClusterID string `json:"cluster_id"`
@@ -769,6 +865,8 @@ type NodeMeshListenerConfig struct {
AutoPortStart int `json:"auto_port_start,omitempty"` AutoPortStart int `json:"auto_port_start,omitempty"`
AutoPortEnd int `json:"auto_port_end,omitempty"` AutoPortEnd int `json:"auto_port_end,omitempty"`
AdvertiseEndpoint string `json:"advertise_endpoint,omitempty"` AdvertiseEndpoint string `json:"advertise_endpoint,omitempty"`
AdvertiseEndpoints []string `json:"advertise_endpoints,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
AdvertiseTransport string `json:"advertise_transport,omitempty"` AdvertiseTransport string `json:"advertise_transport,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"` ConnectivityMode string `json:"connectivity_mode,omitempty"`
NATType string `json:"nat_type,omitempty"` NATType string `json:"nat_type,omitempty"`
@@ -2027,6 +2125,17 @@ type GetNodeUpdatePlanInput struct {
ArtifactOrigin string ArtifactOrigin string
} }
// GetStaleNodeRiskReportInput is the argument of the service call
// GetStaleNodeRiskReport. The HTTP handler getStaleNodeRiskReport fills
// ActorUserID from the "actor_user_id" query parameter and ClusterID from the
// "clusterID" URL parameter.
type GetStaleNodeRiskReportInput struct {
ActorUserID string
ClusterID string
}
// GetNodeBridgeReplayPlanInput is the argument of the service call
// GetNodeBridgeReplayPlan. The HTTP handler getNodeBridgeReplayPlan fills
// ActorUserID from the "actor_user_id" query parameter, and ClusterID and
// NodeID from the "clusterID" and "nodeID" URL parameters.
type GetNodeBridgeReplayPlanInput struct {
ActorUserID string
ClusterID string
NodeID string
}
type ReportNodeUpdateStatusInput struct { type ReportNodeUpdateStatusInput struct {
ClusterID string ClusterID string
NodeID string NodeID string
@@ -84,8 +84,10 @@ func (m *Module) RegisterRoutes(router chi.Router) {
r.Post("/{clusterID}/updates/releases", m.createReleaseVersion) r.Post("/{clusterID}/updates/releases", m.createReleaseVersion)
r.Put("/{clusterID}/nodes/{nodeID}/updates/policy", m.upsertNodeUpdatePolicy) r.Put("/{clusterID}/nodes/{nodeID}/updates/policy", m.upsertNodeUpdatePolicy)
r.Get("/{clusterID}/nodes/{nodeID}/updates/plan", m.getNodeUpdatePlan) r.Get("/{clusterID}/nodes/{nodeID}/updates/plan", m.getNodeUpdatePlan)
r.Get("/{clusterID}/nodes/{nodeID}/updates/bridge-replay-plan", m.getNodeBridgeReplayPlan)
r.Post("/{clusterID}/nodes/{nodeID}/updates/status", m.reportNodeUpdateStatus) r.Post("/{clusterID}/nodes/{nodeID}/updates/status", m.reportNodeUpdateStatus)
r.Get("/{clusterID}/nodes/{nodeID}/updates/statuses", m.listNodeUpdateStatuses) r.Get("/{clusterID}/nodes/{nodeID}/updates/statuses", m.listNodeUpdateStatuses)
r.Get("/{clusterID}/updates/stale-node-risk-report", m.getStaleNodeRiskReport)
r.Get("/{clusterID}/nodes/{nodeID}/testing-flags", m.getEffectiveNodeTestingFlags) r.Get("/{clusterID}/nodes/{nodeID}/testing-flags", m.getEffectiveNodeTestingFlags)
r.Get("/{clusterID}/nodes/{nodeID}/mesh/synthetic-config", m.getNodeSyntheticMeshConfig) r.Get("/{clusterID}/nodes/{nodeID}/mesh/synthetic-config", m.getNodeSyntheticMeshConfig)
r.Post("/{clusterID}/nodes/{nodeID}/telemetry", m.recordNodeTelemetry) r.Post("/{clusterID}/nodes/{nodeID}/telemetry", m.recordNodeTelemetry)
@@ -843,6 +845,29 @@ func (m *Module) listNodeUpdateStatuses(w http.ResponseWriter, r *http.Request)
httpx.WriteJSON(w, http.StatusOK, map[string]any{"node_update_statuses": items}) httpx.WriteJSON(w, http.StatusOK, map[string]any{"node_update_statuses": items})
} }
// getStaleNodeRiskReport serves GET /{clusterID}/updates/stale-node-risk-report.
//
// It asks the service for the cluster's stale-node risk report and writes it
// as {"stale_node_risk_report": ...} with status 200. Service errors are
// translated into an HTTP error response by writeServiceError.
//
// NOTE(review): the actor identity is read from the client-supplied
// "actor_user_id" query parameter — confirm the service layer authorizes it.
func (m *Module) getStaleNodeRiskReport(w http.ResponseWriter, r *http.Request) {
	input := GetStaleNodeRiskReportInput{
		ClusterID:   chi.URLParam(r, "clusterID"),
		ActorUserID: r.URL.Query().Get("actor_user_id"),
	}

	report, err := m.service.GetStaleNodeRiskReport(r.Context(), input)
	if handled := writeServiceError(w, err); handled {
		// The error response has already been written.
		return
	}

	body := map[string]any{"stale_node_risk_report": report}
	httpx.WriteJSON(w, http.StatusOK, body)
}
// getNodeBridgeReplayPlan serves
// GET /{clusterID}/nodes/{nodeID}/updates/bridge-replay-plan.
//
// It asks the service for the node's bridge-replay plan and writes it as
// {"node_bridge_replay_plan": ...} with status 200. Service errors are
// translated into an HTTP error response by writeServiceError.
//
// NOTE(review): the actor identity is read from the client-supplied
// "actor_user_id" query parameter — confirm the service layer authorizes it.
func (m *Module) getNodeBridgeReplayPlan(w http.ResponseWriter, r *http.Request) {
	input := GetNodeBridgeReplayPlanInput{
		ClusterID:   chi.URLParam(r, "clusterID"),
		NodeID:      chi.URLParam(r, "nodeID"),
		ActorUserID: r.URL.Query().Get("actor_user_id"),
	}

	plan, err := m.service.GetNodeBridgeReplayPlan(r.Context(), input)
	if handled := writeServiceError(w, err); handled {
		// The error response has already been written.
		return
	}

	body := map[string]any{"node_bridge_replay_plan": plan}
	httpx.WriteJSON(w, http.StatusOK, body)
}
func (m *Module) getEffectiveNodeTestingFlags(w http.ResponseWriter, r *http.Request) { func (m *Module) getEffectiveNodeTestingFlags(w http.ResponseWriter, r *http.Request) {
item, err := m.service.GetEffectiveNodeTestingFlags(r.Context(), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID")) item, err := m.service.GetEffectiveNodeTestingFlags(r.Context(), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID"))
if writeServiceError(w, err) { if writeServiceError(w, err) {
@@ -3386,6 +3411,7 @@ func writeServiceError(w http.ResponseWriter, err error) bool {
if err == nil { if err == nil {
return false return false
} }
var legacyRemovalBlocked *LegacyRemovalBlockedError
switch { switch {
case errors.Is(err, ErrAccessDenied): case errors.Is(err, ErrAccessDenied):
httpx.WriteError(w, http.StatusForbidden, err.Error()) httpx.WriteError(w, http.StatusForbidden, err.Error())
@@ -3393,6 +3419,12 @@ func writeServiceError(w http.ResponseWriter, err error) bool {
httpx.WriteError(w, http.StatusForbidden, err.Error()) httpx.WriteError(w, http.StatusForbidden, err.Error())
case errors.Is(err, ErrClusterReadOnly): case errors.Is(err, ErrClusterReadOnly):
httpx.WriteError(w, http.StatusConflict, err.Error()) httpx.WriteError(w, http.StatusConflict, err.Error())
case errors.As(err, &legacyRemovalBlocked):
httpx.WriteErrorMessage(w, http.StatusConflict, httpx.ErrorResponse{
Error: httpx.NewErrorMessage(http.StatusConflict, err.Error(), legacyRemovalBlockedErrorDetails(*legacyRemovalBlocked), ""),
})
case errors.Is(err, ErrLegacyRemovalBlocked):
httpx.WriteError(w, http.StatusConflict, err.Error())
case errors.Is(err, ErrVPNLeaseAlreadyActive): case errors.Is(err, ErrVPNLeaseAlreadyActive):
httpx.WriteError(w, http.StatusConflict, err.Error()) httpx.WriteError(w, http.StatusConflict, err.Error())
case errors.Is(err, ErrInvalidPayload), errors.Is(err, ErrInvalidJoinToken), errors.Is(err, ErrInvalidNodeRole): case errors.Is(err, ErrInvalidPayload), errors.Is(err, ErrInvalidJoinToken), errors.Is(err, ErrInvalidNodeRole):
@@ -3404,3 +3436,37 @@ func writeServiceError(w http.ResponseWriter, err error) bool {
} }
return true return true
} }
// legacyRemovalBlockedErrorDetails flattens a LegacyRemovalBlockedError into
// the "details" map attached to the HTTP error response.
//
// The map always contains the blocked operation, the report-level flags and
// lists, and the summary counters. Two keys are optional and only present when
// non-empty: "blocked_node_ids" (IDs of report nodes whose Blocked flag is
// set, in report order) and "bridge_hold_node_ids".
func legacyRemovalBlockedErrorDetails(err LegacyRemovalBlockedError) map[string]any {
	report := err.Report
	summary := report.Summary

	// Collect the blocked node IDs first; the key is added only if any exist.
	var blocked []string
	for i := range report.Nodes {
		if !report.Nodes[i].Blocked {
			continue
		}
		blocked = append(blocked, report.Nodes[i].NodeID)
	}

	out := map[string]any{
		"blocked_operation":             err.BlockedOperation,
		"legacy_removal_allowed":        report.LegacyRemovalAllowed,
		"bridge_hold_required":          report.BridgeHoldRequired,
		"bridge_hold_reasons":           report.BridgeHoldReasons,
		"blocked_operations":            report.BlockedOperations,
		"heartbeat_stale_after_seconds": report.HeartbeatStaleAfterSeconds,

		"stale_nodes":                        summary.StaleNodes,
		"blocked_nodes":                      summary.BlockedNodes,
		"artifact_gap_nodes":                 summary.ArtifactGapNodes,
		"unknown_profile_nodes":              summary.UnknownProfileNodes,
		"waiting_update_status_nodes":        summary.WaitingUpdateStatusNodes,
		"unknown_version_nodes":              summary.UnknownVersionNodes,
		"legacy_recovery_contract_nodes":     summary.LegacyRecoveryContractNodes,
		"recovery_bridge_required_nodes":     summary.RecoveryBridgeRequiredNodes,
		"recovery_bridge_replay_ready_nodes": summary.RecoveryBridgeReplayReadyNodes,
		"waiting_recovery_heartbeat_nodes":   summary.WaitingRecoveryHeartbeatNodes,
	}
	if len(blocked) != 0 {
		out["blocked_node_ids"] = blocked
	}
	if holdIDs := report.BridgeHoldNodeIDs; len(holdIDs) != 0 {
		out["bridge_hold_node_ids"] = holdIDs
	}
	return out
}
@@ -0,0 +1,68 @@
package cluster
import (
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
)
func TestWriteServiceErrorLegacyRemovalBlockedIncludesBreakdownDetails(t *testing.T) {
recorder := httptest.NewRecorder()
handled := writeServiceError(recorder, &LegacyRemovalBlockedError{
BlockedOperation: "create_breaking_release",
Report: StaleNodeRiskReport{
HeartbeatStaleAfterSeconds: 900,
LegacyRemovalAllowed: false,
BridgeHoldRequired: true,
BridgeHoldNodeIDs: []string{"node-1"},
BridgeHoldReasons: []string{"legacy_contract_overlap"},
BlockedOperations: []string{"create_breaking_release", "target_breaking_update_policy", "remove_recovery_bridge_overlap"},
Nodes: []StaleNodeRiskNode{
{NodeID: "node-1", Blocked: true, RecoveryBridgeRequired: true},
{NodeID: "node-2", Blocked: false},
},
Summary: StaleNodeRiskSummary{
StaleNodes: 1,
BlockedNodes: 1,
ArtifactGapNodes: 0,
UnknownProfileNodes: 0,
WaitingUpdateStatusNodes: 0,
UnknownVersionNodes: 0,
LegacyRecoveryContractNodes: 0,
WaitingRecoveryHeartbeatNodes: 1,
},
},
})
if !handled {
t.Fatalf("writeServiceError returned false")
}
if recorder.Code != http.StatusConflict {
t.Fatalf("status = %d, want %d", recorder.Code, http.StatusConflict)
}
var payload struct {
Error struct {
Details map[string]any `json:"details"`
} `json:"error"`
}
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
t.Fatalf("unmarshal response: %v", err)
}
if payload.Error.Details["blocked_operation"] != "create_breaking_release" {
t.Fatalf("blocked_operation = %v", payload.Error.Details["blocked_operation"])
}
if payload.Error.Details["waiting_recovery_heartbeat_nodes"] != float64(1) {
t.Fatalf("waiting_recovery_heartbeat_nodes = %v", payload.Error.Details["waiting_recovery_heartbeat_nodes"])
}
if payload.Error.Details["bridge_hold_required"] != true {
t.Fatalf("bridge_hold_required = %v", payload.Error.Details["bridge_hold_required"])
}
blockedNodeIDs, ok := payload.Error.Details["blocked_node_ids"].([]any)
if !ok || len(blockedNodeIDs) != 1 || blockedNodeIDs[0] != "node-1" {
t.Fatalf("blocked_node_ids = %#v", payload.Error.Details["blocked_node_ids"])
}
bridgeHoldNodeIDs, ok := payload.Error.Details["bridge_hold_node_ids"].([]any)
if !ok || len(bridgeHoldNodeIDs) != 1 || bridgeHoldNodeIDs[0] != "node-1" {
t.Fatalf("bridge_hold_node_ids = %#v", payload.Error.Details["bridge_hold_node_ids"])
}
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -158,6 +158,7 @@ func (m *Module) bootstrapEnrollment(w http.ResponseWriter, r *http.Request) {
func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) { func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) {
var payload struct { var payload struct {
ClusterID string `json:"cluster_id"`
NodeKey string `json:"node_key"` NodeKey string `json:"node_key"`
Name string `json:"name"` Name string `json:"name"`
OwnershipType string `json:"ownership_type"` OwnershipType string `json:"ownership_type"`
@@ -197,6 +198,19 @@ func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) {
httpx.WriteError(w, http.StatusInternalServerError, err.Error()) httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return return
} }
if payload.ClusterID != "" {
if _, err := m.db.Exec(r.Context(), `
INSERT INTO cluster_memberships (cluster_id, node_id, membership_status, joined_at, last_seen_at, metadata)
VALUES ($1::uuid, $2::uuid, 'active', $3, $3, $4::jsonb)
ON CONFLICT (cluster_id, node_id) DO UPDATE SET
membership_status = 'active',
last_seen_at = EXCLUDED.last_seen_at,
metadata = cluster_memberships.metadata || EXCLUDED.metadata
`, payload.ClusterID, nodeID, now, []byte(`{"source":"fabric_control_candidate_registration"}`)); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
}
httpx.WriteJSON(w, http.StatusOK, map[string]any{ httpx.WriteJSON(w, http.StatusOK, map[string]any{
"node_id": nodeID, "node_id": nodeID,
"status": "registered", "status": "registered",
+26 -8
View File
@@ -1,18 +1,25 @@
# RAP Android VPN # RAP Android VPN
This is the Android client for the experimental RAP VPN service. This is the Android mobile node build with the `vpn-client` service enabled.
Implemented now: Implemented now:
- login through `/auth/login`; - installation as a first-class fabric node with an embedded QUIC bootstrap
- trusted-device reconnect through `/auth/refresh` without retyping the password seed set. The seed set is not a backend selector: it contains every known
while the device session is valid; public or local entry candidate that may help the node join the fabric from
- load organization-scoped VPN client profile from `/clusters/{clusterID}/vpn/client-profile`; its current network.
- runtime launch uses a persisted `fabric_bootstrap_config`, not a backend API
URL. The Android node starts by attaching to the fabric through bootstrap
peers and then discovers/uses services through fabric rules.
- login and trusted-device refresh through the QUIC fabric control channel;
- load organization-scoped VPN client profile through the fabric control channel;
- request Android VPN permission and create a `VpnService` TUN interface; - request Android VPN permission and create a `VpnService` TUN interface;
- run as a normal fabric node with the `vpn-client` service role. The local - run as a normal fabric node with the `vpn-client` service role. The local
`VpnService` TUN is the IPv4 ingress for that node, and packet channels are `VpnService` TUN is the IPv4 ingress for that node, and packet channels are
routed by the farm to an authorized `ipv4-egress` pool. HTTP batch fallback routed by the farm to an authorized `ipv4-egress` pool. The supported
and old VPN protocols are not part of the supported test path. dataplane is the QUIC fabric runtime only. HTTP batch forwarding, WebSocket
packet relay, direct backend packet relay, and old VPN protocols are removed
from the runtime path.
- user-facing HOME-first screen: connect/disconnect is primary, while backend, - user-facing HOME-first screen: connect/disconnect is primary, while backend,
cluster, organization, login, and password are kept in the settings dialog; cluster, organization, login, and password are kept in the settings dialog;
- saved connection settings in app preferences so repeat connects do not require - saved connection settings in app preferences so repeat connects do not require
@@ -20,12 +27,23 @@ Implemented now:
- encrypted refresh-token storage through Android Keystore. If the trusted - encrypted refresh-token storage through Android Keystore. If the trusted
device session is revoked or expires, the app asks for the password once and device session is revoked or expires, the app asks for the password once and
then rotates the device keys/profile again. then rotates the device keys/profile again.
- no separate diagnostic foreground service: runtime status is reported by the
node/VPN service itself, so the Android build does not keep a parallel legacy
control process alive.
This is still a lab runtime. The required target model is Android as a farm This is still a lab runtime. The required target model is Android as a farm
node with the `vpn-client` role. The VPN service must attach to the mesh as node with the `vpn-client` role. The VPN service must attach to the mesh as
that node and route to an authorized IPv4 exit pool; there is no separate VPN that node and route to an authorized IPv4 exit pool; there is no separate VPN
entry point. Exit configuration is always pool based, including pools that entry point. Exit configuration is always pool based, including pools that
currently contain only one node. currently contain only one node. A phone installed in a closed network may join
through local seed nodes from that network; it does not need direct Internet
access if a nearby fabric node can route onward.
Current code contract:
- Android control bootstrap field: `fabric_bootstrap_config`
- Android runtime dataplane: QUIC `Fabricvpn` runtime only
- Android runtime status keys: `fabric_transport_*`
Build from this repository on Windows: Build from this repository on Windows:
+8 -4
View File
@@ -22,8 +22,12 @@ android {
return (value == null ? "" : value.toString()).replace("\\", "\\\\").replace("\"", "\\\"") return (value == null ? "" : value.toString()).replace("\\", "\\\\").replace("\"", "\\\"")
} }
def defaultBackendUrl = project.findProperty("RAP_ANDROID_DEFAULT_BACKEND_URL") ?: "http://192.168.200.61:18080/api/v1" def defaultBackendUrl = project.findProperty("RAP_ANDROID_DEFAULT_BACKEND_URL") ?: ""
def defaultFabricBootstrapPeers = project.findProperty("RAP_ANDROID_FABRIC_BOOTSTRAP_PEERS") ?: "quic://192.168.200.85:18080,quic://195.123.240.88:19131" // This is a node bootstrap seed set, not an API/backend selector. The
// Android app installs as a fabric node and tries every QUIC endpoint that
// may be reachable from its current network: public nodes, LAN nodes, or a
// closed-site neighbor that can route onward through the fabric.
def defaultFabricBootstrapPeers = project.findProperty("RAP_ANDROID_FABRIC_BOOTSTRAP_PEERS") ?: "quic://94.141.118.222:19199#sha256=49892029a27db9c394a41bc4cb917d9cceb1f86219417c351764d2ed9d6bc683,quic://94.141.118.222:19191#sha256=72e51f1631b32c3a7d1e8732fe3325e0395a897a5aa31db645888c142e4ae401,quic://192.168.200.61:19134#sha256=72e51f1631b32c3a7d1e8732fe3325e0395a897a5aa31db645888c142e4ae401,quic://192.168.200.61:19132#sha256=8d28b75144d25d29e3b8f8022b6165258ce3cb0e227a2d9d97996839abb89c2a,quic://192.168.200.61:19133#sha256=a71b07e55b810f57b01696c485b765b336983e963238163085824bf04022ecaa,quic://192.168.200.85:18080#sha256=49892029a27db9c394a41bc4cb917d9cceb1f86219417c351764d2ed9d6bc683,quic://192.168.200.85:18081#sha256=2a3be67e6345943a36cfa1197a5879c2b112c81adc019fd1ee9d7dffbf188b57,quic://192.168.200.85:18082#sha256=a318c1a756ff43595635961768dfd1677afa7e2cbf945d724c107ff82426378a"
def defaultClusterId = project.findProperty("RAP_ANDROID_DEFAULT_CLUSTER_ID") ?: "cfc0743d-d960-49fb-9de8-96e063d5e4aa" def defaultClusterId = project.findProperty("RAP_ANDROID_DEFAULT_CLUSTER_ID") ?: "cfc0743d-d960-49fb-9de8-96e063d5e4aa"
def defaultOrganizationId = project.findProperty("RAP_ANDROID_DEFAULT_ORGANIZATION_ID") ?: "125ff8b2-5ac1-4406-9bbb-ebbe18f7c7ed" def defaultOrganizationId = project.findProperty("RAP_ANDROID_DEFAULT_ORGANIZATION_ID") ?: "125ff8b2-5ac1-4406-9bbb-ebbe18f7c7ed"
@@ -31,8 +35,8 @@ android {
applicationId "su.cin.rapvpn" applicationId "su.cin.rapvpn"
minSdk 26 minSdk 26
targetSdk 35 targetSdk 35
versionCode 227 versionCode 239
versionName "0.2.227" versionName "0.2.239"
buildConfigField "String", "DEFAULT_BACKEND_URL", "\"${normalizeGradleString(defaultBackendUrl)}\"" buildConfigField "String", "DEFAULT_BACKEND_URL", "\"${normalizeGradleString(defaultBackendUrl)}\""
buildConfigField "String", "FABRIC_BOOTSTRAP_PEERS", "\"${normalizeGradleString(defaultFabricBootstrapPeers)}\"" buildConfigField "String", "FABRIC_BOOTSTRAP_PEERS", "\"${normalizeGradleString(defaultFabricBootstrapPeers)}\""
buildConfigField "String", "DEFAULT_CLUSTER_ID", "\"${normalizeGradleString(defaultClusterId)}\"" buildConfigField "String", "DEFAULT_CLUSTER_ID", "\"${normalizeGradleString(defaultClusterId)}\""
Binary file not shown.
Binary file not shown.
@@ -42,15 +42,6 @@
android:value="vpn" /> android:value="vpn" />
</service> </service>
<service
android:name=".RapDiagnosticService"
android:exported="false"
android:foregroundServiceType="specialUse">
<property
android:name="android.app.PROPERTY_SPECIAL_USE_FGS_SUBTYPE"
android:value="vpn-diagnostics" />
</service>
<receiver <receiver
android:name=".RapAutostartReceiver" android:name=".RapAutostartReceiver"
android:exported="false"> android:exported="false">
@@ -1,140 +0,0 @@
package su.cin.rapvpn;
import android.util.Base64;
import org.json.JSONObject;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import okhttp3.Request;
/**
 * Immutable description of a fabric service channel taken from a lease JSON
 * object: the channel id and token, the HTTP / WebSocket path templates, and
 * the pre-encoded authority headers.
 *
 * <p>An instance is {@code enabled} only when the lease supplied a non-empty
 * channel id, a token starting with {@code rap_fsc_}, and a non-empty HTTP
 * path template. A disabled instance yields empty paths and adds no headers.
 */
final class FabricServiceChannel {
    private static final String API_PREFIX = "/api/v1";

    final boolean enabled;
    final String channelId;
    final String token;
    final String pathTemplate;
    final String webSocketPathTemplate;
    final String authorityPayloadHeader;
    final String authoritySignatureHeader;
    final String serviceClass;
    final String channelClass;

    /** Creates a disabled channel whose string fields are all empty. */
    FabricServiceChannel() {
        this(false, "", "", "", "", "", "", "", "");
    }

    /** Stores every string trimmed, with {@code null} mapped to the empty string. */
    private FabricServiceChannel(
            boolean enabled,
            String channelId,
            String token,
            String pathTemplate,
            String webSocketPathTemplate,
            String authorityPayloadHeader,
            String authoritySignatureHeader,
            String serviceClass,
            String channelClass) {
        this.enabled = enabled;
        this.channelId = safe(channelId);
        this.token = safe(token);
        this.pathTemplate = safe(pathTemplate);
        this.webSocketPathTemplate = safe(webSocketPathTemplate);
        this.authorityPayloadHeader = safe(authorityPayloadHeader);
        this.authoritySignatureHeader = safe(authoritySignatureHeader);
        this.serviceClass = safe(serviceClass);
        this.channelClass = safe(channelClass);
    }

    /**
     * Builds a channel from a lease object.
     *
     * @param lease lease JSON, may be {@code null}
     * @return the parsed channel, or a disabled channel when {@code lease} is {@code null}
     */
    static FabricServiceChannel fromLease(JSONObject lease) {
        if (lease == null) {
            return new FabricServiceChannel();
        }
        JSONObject entryHttp = lease.optJSONObject("entry_http");

        String id = lease.optString("channel_id", "");
        String bearer = nestedString(lease.optJSONObject("token"), "token");
        String httpTemplate = nestedString(entryHttp, "path_template");
        String wsTemplate = nestedString(entryHttp, "websocket_path_template");
        String payloadHeader = encodeJson(lease.optJSONObject("authority_payload"));
        String signatureHeader = encodeJson(lease.optJSONObject("authority_signature"));

        // Usability is decided on the raw (untrimmed) lease values.
        boolean usable = !id.isEmpty() && bearer.startsWith("rap_fsc_") && !httpTemplate.isEmpty();

        return new FabricServiceChannel(
                usable,
                id,
                bearer,
                httpTemplate,
                wsTemplate,
                payloadHeader,
                signatureHeader,
                lease.optString("service_class", "vpn_packets"),
                "vpn_packet");
    }

    /** Same as {@link #packetPathForBase} with an empty base URL. */
    String packetPath(String clusterId, String vpnConnectionId, boolean webSocket) {
        return packetPathForBase("", clusterId, vpnConnectionId, webSocket);
    }

    /**
     * Expands the path template for the given cluster and VPN connection.
     *
     * <p>The WebSocket template is used when {@code webSocket} is set and that
     * template is non-empty; otherwise the HTTP template is used. When the base
     * URL's path already ends in {@code /api/v1} and the expanded path starts
     * with {@code /api/v1/}, the prefix is removed from the returned path.
     *
     * @return the expanded path starting with {@code /}, or {@code ""} when the
     *     channel is disabled or no template is available
     */
    String packetPathForBase(String baseUrl, String clusterId, String vpnConnectionId, boolean webSocket) {
        String template = pathTemplate;
        if (webSocket && !webSocketPathTemplate.isEmpty()) {
            template = webSocketPathTemplate;
        }
        if (!enabled || template.isEmpty()) {
            return "";
        }

        String cluster = safe(clusterId);
        String connection = safe(vpnConnectionId);
        // Applied strictly in this order, one placeholder after another.
        String[][] substitutions = {
            {"{cluster_id}", cluster},
            {"{clusterID}", cluster},
            {"{channel_id}", channelId},
            {"{channelID}", channelId},
            {"{resource_id}", connection},
            {"{resourceID}", connection},
            {"{vpn_connection_id}", connection},
            {"{vpnConnectionID}", connection},
        };
        String path = template;
        for (String[] substitution : substitutions) {
            path = path.replace(substitution[0], substitution[1]);
        }
        if (!path.startsWith("/")) {
            path = "/" + path;
        }

        boolean prefixDuplicated = basePathOf(baseUrl).endsWith(API_PREFIX) && path.startsWith(API_PREFIX + "/");
        return prefixDuplicated ? path.substring(API_PREFIX.length()) : path;
    }

    /**
     * Adds the service-channel headers to the request builder.
     *
     * @return the same builder; untouched when the channel is disabled or the builder is {@code null}
     */
    Request.Builder applyHeaders(Request.Builder builder) {
        if (!enabled || builder == null) {
            return builder;
        }
        // Token and channel id are always sent; the rest only when non-empty.
        builder.header("X-RAP-Service-Channel-Token", token);
        builder.header("X-RAP-Fabric-Channel-ID", channelId);
        headerIfPresent(builder, "X-RAP-Service-Class", serviceClass);
        headerIfPresent(builder, "X-RAP-Channel-Class", channelClass);
        headerIfPresent(builder, "X-RAP-Service-Channel-Authority-Payload", authorityPayloadHeader);
        headerIfPresent(builder, "X-RAP-Service-Channel-Authority-Signature", authoritySignatureHeader);
        return builder;
    }

    /** Sets the header only when the value is non-empty. */
    private static void headerIfPresent(Request.Builder builder, String name, String value) {
        if (!value.isEmpty()) {
            builder.header(name, value);
        }
    }

    /** Reads a string field from an optional nested object, defaulting to "". */
    private static String nestedString(JSONObject parent, String key) {
        return parent == null ? "" : parent.optString(key, "");
    }

    /** Encodes an optional JSON object as a header value; {@code null} maps to "". */
    private static String encodeJson(JSONObject object) {
        return object == null ? "" : encodeHeader(object.toString());
    }

    /** Returns the raw path of the base URL without trailing slashes, or "" if it cannot be parsed. */
    private static String basePathOf(String baseUrl) {
        try {
            String rawPath = URI.create(baseUrl == null ? "" : baseUrl).getRawPath();
            return rawPath == null ? "" : trimRight(rawPath);
        } catch (Exception ignored) {
            return "";
        }
    }

    /** URL-safe Base64 of the UTF-8 bytes, without line wraps or padding. */
    private static String encodeHeader(String value) {
        if (value == null || value.isEmpty()) {
            return "";
        }
        byte[] utf8 = value.getBytes(StandardCharsets.UTF_8);
        return Base64.encodeToString(utf8, Base64.URL_SAFE | Base64.NO_WRAP | Base64.NO_PADDING);
    }

    /** Trims the value, mapping {@code null} to the empty string. */
    private static String safe(String value) {
        if (value == null) {
            return "";
        }
        return value.trim();
    }

    /** Removes all trailing {@code /} characters; {@code null} maps to "". */
    private static String trimRight(String value) {
        if (value == null) {
            return "";
        }
        int end = value.length();
        while (end > 0 && value.charAt(end - 1) == '/') {
            end--;
        }
        return value.substring(0, end);
    }
}
@@ -24,14 +24,13 @@ import java.util.Locale;
public class MainActivity extends Activity { public class MainActivity extends Activity {
private static final String APP_VERSION = BuildConfig.VERSION_NAME; private static final String APP_VERSION = BuildConfig.VERSION_NAME;
private static final String DEFAULT_BACKEND_URL = BuildConfig.DEFAULT_BACKEND_URL;
private static final String FABRIC_BOOTSTRAP_PEERS = BuildConfig.FABRIC_BOOTSTRAP_PEERS; private static final String FABRIC_BOOTSTRAP_PEERS = BuildConfig.FABRIC_BOOTSTRAP_PEERS;
private static final String DEFAULT_CLUSTER_ID = BuildConfig.DEFAULT_CLUSTER_ID; private static final String DEFAULT_CLUSTER_ID = BuildConfig.DEFAULT_CLUSTER_ID;
private static final String DEFAULT_ORGANIZATION_ID = BuildConfig.DEFAULT_ORGANIZATION_ID; private static final String DEFAULT_ORGANIZATION_ID = BuildConfig.DEFAULT_ORGANIZATION_ID;
private static final String PREF_SELECTED_EXIT_NODE_ID = "selected_exit_node_id";
private static final int VPN_PREPARE_REQUEST = 42; private static final int VPN_PREPARE_REQUEST = 42;
private static final String PREFS = "rap-vpn"; private static final String PREFS = "rap-vpn";
private static final String PREF_DEVICE_FINGERPRINT = "device_fingerprint"; private static final String PREF_DEVICE_FINGERPRINT = "device_fingerprint";
private static final String PREF_FABRIC_NODE_ID = "fabric_node_id";
private static final String PREF_REFRESH_TOKEN = "refresh_token"; private static final String PREF_REFRESH_TOKEN = "refresh_token";
private static final String PREF_REFRESH_EXPIRES_AT = "refresh_expires_at"; private static final String PREF_REFRESH_EXPIRES_AT = "refresh_expires_at";
private static final String PREF_USER_ID = "user_id"; private static final String PREF_USER_ID = "user_id";
@@ -39,7 +38,6 @@ public class MainActivity extends Activity {
private static final String PREF_PROFILE_JSON = "profile_json"; private static final String PREF_PROFILE_JSON = "profile_json";
private static final String PREF_VPN_CONNECTION_ID = "vpn_connection_id"; private static final String PREF_VPN_CONNECTION_ID = "vpn_connection_id";
static final String PREF_FORCE_FULL_TUNNEL = "force_full_tunnel"; static final String PREF_FORCE_FULL_TUNNEL = "force_full_tunnel";
private EditText backendUrl;
private EditText clusterId; private EditText clusterId;
private EditText organizationId; private EditText organizationId;
private EditText email; private EditText email;
@@ -66,7 +64,6 @@ public class MainActivity extends Activity {
int pad = dp(20); int pad = dp(20);
root.setPadding(pad, pad, pad, pad); root.setPadding(pad, pad, pad, pad);
backendUrl = field("Fabric control bootstrap", preferredBackendUrl());
clusterId = field("Cluster ID", prefs.getString("cluster_id", DEFAULT_CLUSTER_ID)); clusterId = field("Cluster ID", prefs.getString("cluster_id", DEFAULT_CLUSTER_ID));
organizationId = field("Organization ID", prefs.getString("organization_id", DEFAULT_ORGANIZATION_ID)); organizationId = field("Organization ID", prefs.getString("organization_id", DEFAULT_ORGANIZATION_ID));
email = field("Email", prefs.getString("email", "m")); email = field("Email", prefs.getString("email", "m"));
@@ -102,10 +99,6 @@ public class MainActivity extends Activity {
runtimeStatus.setPadding(0, 0, 0, dp(10)); runtimeStatus.setPadding(0, 0, 0, dp(10));
runtimeStatus.setText(runtimeStatusText()); runtimeStatus.setText(runtimeStatusText());
Button load = new Button(this);
load.setText("Войти / обновить пулы");
load.setOnClickListener(v -> loadProfile(false));
Button start = new Button(this); Button start = new Button(this);
start.setText("Подключить"); start.setText("Подключить");
start.setOnClickListener(v -> prepareVpn()); start.setOnClickListener(v -> prepareVpn());
@@ -148,12 +141,11 @@ public class MainActivity extends Activity {
}); });
Button settings = new Button(this); Button settings = new Button(this);
settings.setText("Аккаунт"); settings.setText("Настройка");
settings.setOnClickListener(v -> showSettingsDialog()); settings.setOnClickListener(v -> showSettingsDialog());
root.addView(title); root.addView(title);
root.addView(profileSummary); root.addView(profileSummary);
root.addView(load);
root.addView(start); root.addView(start);
root.addView(stop); root.addView(stop);
root.addView(settings); root.addView(settings);
@@ -161,9 +153,7 @@ public class MainActivity extends Activity {
root.addView(runtimeStatus); root.addView(runtimeStatus);
setContentView(root); setContentView(root);
scheduleRuntimeStatusRefresh(); scheduleRuntimeStatusRefresh();
if (authContext != null && !authContext.deviceId.isEmpty()) { registerCandidateNodeAsync(false);
startDiagnosticChannel();
}
} }
@Override @Override
@@ -179,62 +169,38 @@ public class MainActivity extends Activity {
return input; return input;
} }
private void loadProfile() { private void prepareVpn() {
loadProfile(false); if (!hasSelectedPool()) {
status.setText("Сначала выберите выходной пул.");
showSettingsDialog();
return;
} }
status.setText("Проверяю доступ к выбранному пулу...");
private void loadProfile(boolean startAfterLoad) {
status.setText("Загрузка...");
saveSettings();
new Thread(() -> { new Thread(() -> {
try { try {
RapApiClient client = new RapApiClient(backendUrl.getText().toString(), this); refreshSavedProfileForCurrentUser();
authContext = authenticate(client); if (!hasSelectedPool()) {
String activeOrganizationId = resolveOrganizationId(client, authContext.userId); throw new IllegalStateException("Выбранный пул больше не доступен.");
profileJson = client.vpnClientProfile( }
clusterId.getText().toString(),
activeOrganizationId,
authContext.userId,
""
);
vpnConnectionId = firstConnectionId(profileJson);
saveProfileState();
runOnUiThread(() -> { runOnUiThread(() -> {
profileSummary.setText(summaryText()); profileSummary.setText(summaryText());
status.setText(startAfterLoad ? "Список пулов обновлен. Подключаю..." : "Список доступных пулов обновлен."); status.setText("Доступ подтвержден. Подключаюсь к выбранному пулу.");
startDiagnosticChannel();
if (startAfterLoad) {
requestVpnPermission(); requestVpnPermission();
}
}); });
} catch (Exception ex) { } catch (Exception ex) {
runOnUiThread(() -> { runOnUiThread(() -> {
String message = friendlyError(ex); String message = friendlyError(ex);
boolean canUseSavedProfile = startAfterLoad && !profileJson.isEmpty() && !vpnConnectionId.isEmpty(); status.setText("Нужна настройка: " + message);
if (canUseSavedProfile) {
status.setText("Список пулов сейчас не обновился: " + message + ". Подключаюсь с сохраненным рабочим профилем.");
startDiagnosticChannel();
requestVpnPermission();
return;
}
status.setText("Ошибка входа: " + message);
if (message.contains("логин") || message.contains("пароль") || message.contains("Сессия устройства")) {
clearSavedAuth(false);
showSettingsDialog(); showSettingsDialog();
}
}); });
} }
}).start(); }).start();
} }
private void prepareVpn() {
loadProfile(true);
status.setText("Обновляю сессию устройства и доступные пулы...");
}
private void requestVpnPermission() { private void requestVpnPermission() {
if (profileJson.isEmpty()) { if (!hasSelectedPool()) {
status.setText("VPN-профиль не загружен."); status.setText("Выходной пул не выбран или больше не доступен.");
showSettingsDialog();
return; return;
} }
Intent prepare = VpnService.prepare(this); Intent prepare = VpnService.prepare(this);
@@ -254,9 +220,10 @@ public class MainActivity extends Activity {
} }
private void startVpn() { private void startVpn() {
try {
Intent intent = new Intent(this, RapVpnService.class); Intent intent = new Intent(this, RapVpnService.class);
intent.putExtra(RapVpnService.EXTRA_PROFILE_JSON, profileJson); intent.putExtra(RapVpnService.EXTRA_PROFILE_JSON, profileJson);
intent.putExtra(RapVpnService.EXTRA_BACKEND_URL, backendUrl.getText().toString()); intent.putExtra(RapVpnService.EXTRA_FABRIC_BOOTSTRAP_CONFIG, fabricControlConfig());
intent.putExtra(RapVpnService.EXTRA_CLUSTER_ID, clusterId.getText().toString()); intent.putExtra(RapVpnService.EXTRA_CLUSTER_ID, clusterId.getText().toString());
intent.putExtra(RapVpnService.EXTRA_VPN_CONNECTION_ID, vpnConnectionId); intent.putExtra(RapVpnService.EXTRA_VPN_CONNECTION_ID, vpnConnectionId);
startForegroundService(intent); startForegroundService(intent);
@@ -270,7 +237,7 @@ public class MainActivity extends Activity {
status.setText("VPN runtime активен, рабочий канал поднят. Android еще обновляет системный статус."); status.setText("VPN runtime активен, рабочий канал поднят. Android еще обновляет системный статус.");
} else if ("stopped".equals(state) || "revoked".equals(state) || "error".equals(state)) { } else if ("stopped".equals(state) || "revoked".equals(state) || "error".equals(state)) {
status.setText("VPN не включился: " + runtimePrefs.getString("message", "Android остановил VPN-сервис") + "."); status.setText("VPN не включился: " + runtimePrefs.getString("message", "Android остановил VPN-сервис") + ".");
} else if ("starting".equals(state) || "tunnel".equals(state) || "relay_selected".equals(state) || "relay".equals(state) || "relay_reset".equals(state)) { } else if ("starting".equals(state) || "tunnel".equals(state) || isTransportWarmupState(state)) {
status.setText("VPN запускается. Android еще применяет туннель, ожидаю рабочий канал."); status.setText("VPN запускается. Android еще применяет туннель, ожидаю рабочий канал.");
} else { } else {
status.setText("VPN еще не активен в Android. Проверьте системный запрос разрешения VPN."); status.setText("VPN еще не активен в Android. Проверьте системный запрос разрешения VPN.");
@@ -280,6 +247,10 @@ public class MainActivity extends Activity {
} }
runtimeStatus.setText(runtimeStatusText()); runtimeStatus.setText(runtimeStatusText());
}, 2500); }, 2500);
} catch (Exception e) {
status.setText("VPN не запущен: bootstrap-конфиг фабрики недоступен.");
runtimeStatus.setText("Ошибка запуска: " + e.getMessage());
}
} }
private void scheduleRuntimeStatusRefresh() { private void scheduleRuntimeStatusRefresh() {
@@ -335,9 +306,9 @@ public class MainActivity extends Activity {
boolean osVpnActive = isSystemVpnActive(); boolean osVpnActive = isSystemVpnActive();
String routes = runtimePrefs.getString("routes", ""); String routes = runtimePrefs.getString("routes", "");
String dnsServers = runtimePrefs.getString("dns_servers", ""); String dnsServers = runtimePrefs.getString("dns_servers", "");
String profileRelayUrl = runtimePrefs.getString("packet_relay_profile_base_url", ""); String profileTransportEndpoint = runtimePrefs.getString("fabric_transport_profile_endpoint", "");
String activeRelayUrl = runtimePrefs.getString("packet_relay_active_base_url", ""); String activeTransportEndpoint = runtimePrefs.getString("fabric_transport_active_endpoint", "");
String relayCandidates = runtimePrefs.getString("packet_relay_candidate_urls", ""); String transportCandidates = runtimePrefs.getString("fabric_transport_candidate_endpoints", "");
boolean forceFullTunnelRuntime = false; boolean forceFullTunnelRuntime = false;
boolean fastPathEnabled = false; boolean fastPathEnabled = false;
try { try {
@@ -350,11 +321,14 @@ public class MainActivity extends Activity {
} }
boolean staleState = updatedAt > 0 && (System.currentTimeMillis() - updatedAt) > 12_000; boolean staleState = updatedAt > 0 && (System.currentTimeMillis() - updatedAt) > 12_000;
boolean runtimeActive = isVpnRuntimeActive(); boolean runtimeActive = isVpnRuntimeActive();
if (!osVpnActive && !runtimeActive && ("running".equals(state) || "tunnel".equals(state) || "relay".equals(state) || "relay_reset".equals(state))) { if (!osVpnActive && !runtimeActive && ("running".equals(state) || "tunnel".equals(state) || isTransportWarmupState(state))) {
state = "stale_no_os_vpn"; state = "stale_no_os_vpn";
message = "Сервис говорит об активном состоянии, но Android VPN-интерфейс не активен. Проверьте разрешения/ручной запуск."; message = "Сервис говорит об активном состоянии, но Android VPN-интерфейс не активен. Проверьте разрешения/ручной запуск.";
staleState = false; staleState = false;
} }
String transportEndpoint = activeTransportEndpoint.isEmpty() ? "-" : activeTransportEndpoint;
String transportTargets = transportCandidates.isEmpty() ? "-" : transportCandidates;
String profileTarget = profileTransportEndpoint.isEmpty() ? "-" : profileTransportEndpoint;
return "Диагностика: " + state return "Диагностика: " + state
+ "\n" + message + "\n" + message
+ "\nOS VPN: " + (osVpnActive ? "активен" : (runtimeActive ? "runtime активен" : "неактивен")) + "\nOS VPN: " + (osVpnActive ? "активен" : (runtimeActive ? "runtime активен" : "неактивен"))
@@ -369,9 +343,9 @@ public class MainActivity extends Activity {
+ " / down " + String.format(Locale.US, "%.1f", downlinkPps) + " / down " + String.format(Locale.US, "%.1f", downlinkPps)
+ "\nDNS выхода: " + (dnsServers.isEmpty() ? "-" : dnsServers) + "\nDNS выхода: " + (dnsServers.isEmpty() ? "-" : dnsServers)
+ "\nroutes: " + (routes.isEmpty() ? "-" : routes) + "\nroutes: " + (routes.isEmpty() ? "-" : routes)
+ "\nrelay active: " + (activeRelayUrl.isEmpty() ? "-" : activeRelayUrl) + "\ntransport endpoint: " + transportEndpoint
+ "\nrelay profile: " + (profileRelayUrl.isEmpty() ? "-" : profileRelayUrl) + "\nprofile target: " + profileTarget
+ "\nrelay candidates: " + (relayCandidates.isEmpty() ? "-" : relayCandidates) + "\ntransport candidates: " + transportTargets
+ "\nforced_full_tunnel: " + (forceFullTunnelRuntime ? "да" : "нет") + "\nforced_full_tunnel: " + (forceFullTunnelRuntime ? "да" : "нет")
+ "\nfast_path_mode: " + (fastPathEnabled ? "включен" : "выключен") + "\nfast_path_mode: " + (fastPathEnabled ? "включен" : "выключен")
+ "\nbytes read/sent/down: " + readBytes + "/" + sentBytes + "/" + downBytes + "\nbytes read/sent/down: " + readBytes + "/" + sentBytes + "/" + downBytes
@@ -389,13 +363,6 @@ public class MainActivity extends Activity {
+ "\nобновлено: " + age; + "\nобновлено: " + age;
} }
private void startDiagnosticChannel() {
if (authContext == null || authContext.deviceId.isEmpty()) {
return;
}
RapDiagnosticService.start(this);
}
private boolean isSystemVpnActive() { private boolean isSystemVpnActive() {
try { try {
ConnectivityManager connectivityManager = (ConnectivityManager) getSystemService(CONNECTIVITY_SERVICE); ConnectivityManager connectivityManager = (ConnectivityManager) getSystemService(CONNECTIVITY_SERVICE);
@@ -426,20 +393,31 @@ public class MainActivity extends Activity {
if (updatedAt <= 0 || (System.currentTimeMillis() - updatedAt) > 15_000) { if (updatedAt <= 0 || (System.currentTimeMillis() - updatedAt) > 15_000) {
return false; return false;
} }
String relay = runtimePrefs.getString("packet_relay_active_base_url", ""); String activeTransportEndpoint = runtimePrefs.getString("fabric_transport_active_endpoint", "");
long read = runtimePrefs.getLong("uplink_read_total", 0); long read = runtimePrefs.getLong("uplink_read_total", 0);
long sent = runtimePrefs.getLong("uplink_sent_total", 0); long sent = runtimePrefs.getLong("uplink_sent_total", 0);
long down = runtimePrefs.getLong("downlink_received_total", 0); long down = runtimePrefs.getLong("downlink_received_total", 0);
return !relay.isEmpty() && ("running".equals(state) return !activeTransportEndpoint.isEmpty() && ("running".equals(state)
|| "relay".equals(state) || "fabric_transport".equals(state)
|| "relay_reset".equals(state) || "fabric_transport_reset".equals(state)
|| "downlink".equals(state) || "downlink".equals(state)
|| "downlink_idle".equals(state) || "downlink_idle".equals(state)
|| "uplink_sent".equals(state) || "uplink_sent".equals(state)
|| read > 0 || sent > 0 || down > 0); || read > 0 || sent > 0 || down > 0);
} }
/**
 * Tells whether a runtime state string is one of the four
 * {@code fabric_transport*} states this class groups as transport warm-up.
 *
 * @param state state value to classify; may be {@code null}
 * @return {@code true} only for the listed states, {@code false} otherwise (including {@code null})
 */
private boolean isTransportWarmupState(String state) {
    if (state == null) {
        return false;
    }
    switch (state) {
        case "fabric_transport_selected":
        case "fabric_transport":
        case "fabric_transport_reset":
        case "fabric_transport_switch":
            return true;
        default:
            return false;
    }
}
private String firstConnectionId(String profile) throws Exception { private String firstConnectionId(String profile) throws Exception {
String selected = prefs == null ? "" : prefs.getString(PREF_VPN_CONNECTION_ID, "").trim();
if (!selected.isEmpty() && profileContainsConnection(profile, selected)) {
return selected;
}
JSONObject root = new JSONObject(profile); JSONObject root = new JSONObject(profile);
JSONObject vpnProfile = root.getJSONObject("vpn_client_profile"); JSONObject vpnProfile = root.getJSONObject("vpn_client_profile");
JSONArray connections = vpnProfile.getJSONArray("connections"); JSONArray connections = vpnProfile.getJSONArray("connections");
@@ -489,6 +467,36 @@ public class MainActivity extends Activity {
return connections.getJSONObject(0).getString("id"); return connections.getJSONObject(0).getString("id");
} }
/**
 * Checks that a profile is loaded, a connection id is chosen, and the
 * chosen id is present among the profile's connections.
 *
 * @return {@code true} when {@code profileJson} and {@code vpnConnectionId} are both
 *         non-blank and {@code profileContainsConnection} confirms the trimmed id
 */
private boolean hasSelectedPool() {
    if (profileJson == null || profileJson.trim().isEmpty()) {
        return false;
    }
    if (vpnConnectionId == null || vpnConnectionId.trim().isEmpty()) {
        return false;
    }
    return profileContainsConnection(profileJson, vpnConnectionId.trim());
}
/**
 * Looks for a connection with the given id inside a profile JSON document.
 *
 * The document is expected to carry {@code vpn_client_profile.connections},
 * an array of objects with an {@code id} field.
 *
 * @param profile      profile JSON text; may be {@code null}
 * @param connectionId id to look for (compared after trimming); may be {@code null}
 * @return {@code true} if some connection's {@code id} equals the trimmed id;
 *         {@code false} for blank arguments, missing sections, or unparseable JSON
 */
private boolean profileContainsConnection(String profile, String connectionId) {
    if (profile == null || profile.trim().isEmpty()) {
        return false;
    }
    if (connectionId == null || connectionId.trim().isEmpty()) {
        return false;
    }
    String wantedId = connectionId.trim();
    try {
        JSONObject vpnProfile = new JSONObject(profile).optJSONObject("vpn_client_profile");
        if (vpnProfile == null) {
            return false;
        }
        JSONArray connections = vpnProfile.optJSONArray("connections");
        if (connections == null) {
            return false;
        }
        for (int index = 0; index < connections.length(); index++) {
            JSONObject candidate = connections.optJSONObject(index);
            if (candidate == null) {
                continue;
            }
            if (wantedId.equals(candidate.optString("id", ""))) {
                return true;
            }
        }
        return false;
    } catch (Exception ignored) {
        // Any failure while reading the document is treated as "not contained".
        return false;
    }
}
private int dp(int value) { private int dp(int value) {
return (int) (value * getResources().getDisplayMetrics().density); return (int) (value * getResources().getDisplayMetrics().density);
} }
@@ -504,8 +512,8 @@ public class MainActivity extends Activity {
return "Версия: " + APP_VERSION return "Версия: " + APP_VERSION
+ "\nУзел Android: в ферме" + "\nУзел Android: в ферме"
+ "\nBootstrap фермы: " + bootstrapPeerCount() + " узл." + "\nBootstrap фермы: " + bootstrapPeerCount() + " узл."
+ "\nДоступные выходы: " + (poolText.isEmpty() ? "войдите для загрузки" : poolText) + "\nДоступные выходы: " + (poolText.isEmpty() ? "не загружены" : poolText)
+ "\nВыбранный выход: " + (selectedPoolText.isEmpty() ? "автоматически" : selectedPoolText) + "\nВыбранный выход: " + (selectedPoolText.isEmpty() ? "не выбран" : selectedPoolText)
+ "\nDNS выхода: " + (profileDNS.isEmpty() ? "будет получен из профиля" : profileDNS) + "\nDNS выхода: " + (profileDNS.isEmpty() ? "будет получен из профиля" : profileDNS)
+ "\nТрафик: " + (prefs.getBoolean(PREF_FORCE_FULL_TUNNEL, true) ? "весь через VPN" : "по профилю") + "\nТрафик: " + (prefs.getBoolean(PREF_FORCE_FULL_TUNNEL, true) ? "весь через VPN" : "по профилю")
+ "\nDevice: " + (deviceId.isEmpty() ? "нет" : deviceId) + "\nDevice: " + (deviceId.isEmpty() ? "нет" : deviceId)
@@ -647,20 +655,7 @@ public class MainActivity extends Activity {
return out.toString(); return out.toString();
} }
private String preferredBackendUrl() {
String saved = prefs.getString("backend_url", DEFAULT_BACKEND_URL);
String normalized = normalizeBackendUrl(saved);
if (!normalized.equals(saved == null ? "" : saved.trim())) {
prefs.edit().putString("backend_url", normalized).apply();
}
return normalized;
}
private void saveSettings() { private void saveSettings() {
String normalizedBackend = normalizeBackendUrl(backendUrl.getText().toString());
if (!normalizedBackend.equals(backendUrl.getText().toString().trim())) {
backendUrl.setText(normalizedBackend);
}
normalizeAndPersistDefaults(); normalizeAndPersistDefaults();
if (clusterId.getText().toString().trim().isEmpty()) { if (clusterId.getText().toString().trim().isEmpty()) {
clusterId.setText(DEFAULT_CLUSTER_ID); clusterId.setText(DEFAULT_CLUSTER_ID);
@@ -669,7 +664,6 @@ public class MainActivity extends Activity {
organizationId.setText(DEFAULT_ORGANIZATION_ID); organizationId.setText(DEFAULT_ORGANIZATION_ID);
} }
prefs.edit() prefs.edit()
.putString("backend_url", normalizedBackend)
.putString("cluster_id", clusterId.getText().toString()) .putString("cluster_id", clusterId.getText().toString())
.putString("organization_id", organizationId.getText().toString()) .putString("organization_id", organizationId.getText().toString())
.putString("email", email.getText().toString()) .putString("email", email.getText().toString())
@@ -677,10 +671,6 @@ public class MainActivity extends Activity {
} }
private void normalizeAndPersistDefaults() { private void normalizeAndPersistDefaults() {
String normalizedBackend = normalizeBackendUrl(backendUrl.getText().toString());
if (normalizedBackend.isEmpty()) {
backendUrl.setText(DEFAULT_BACKEND_URL);
}
if (clusterId.getText().toString().trim().isEmpty()) { if (clusterId.getText().toString().trim().isEmpty()) {
clusterId.setText(DEFAULT_CLUSTER_ID); clusterId.setText(DEFAULT_CLUSTER_ID);
} }
@@ -689,38 +679,48 @@ public class MainActivity extends Activity {
} }
} }
private String normalizeBackendUrl(String value) { private String fabricControlConfig() throws Exception {
String candidate = value == null ? "" : value.trim().replaceAll("/+$", ""); JSONArray endpoints = new JSONArray();
if (candidate.isEmpty()) { for (String peer : FABRIC_BOOTSTRAP_PEERS.split(",")) {
return DEFAULT_BACKEND_URL; String raw = peer == null ? "" : peer.trim();
String address = raw;
String certSHA256 = "";
int fragmentIndex = raw.indexOf('#');
if (fragmentIndex >= 0) {
address = raw.substring(0, fragmentIndex).trim();
String fragment = raw.substring(fragmentIndex + 1).trim();
if (fragment.startsWith("sha256=")) {
certSHA256 = fragment.substring("sha256=".length()).trim();
} }
String lower = candidate.toLowerCase(Locale.US);
if ("http://vpn.cin.su:19191/api/v1".equals(lower)
|| "http://vpn.cin.su/api/v1".equals(lower)
|| "https://vpn.cin.su:443/api/v1".equals(lower)
|| "http://94.141.118.222:19191/api/v1".equals(lower)
|| "http://195.123.240.88:19131/api/v1".equals(lower)) {
return DEFAULT_BACKEND_URL;
} }
return candidate; if (address.isEmpty()) {
continue;
} }
JSONObject endpoint = new JSONObject();
private String selectedExitNodeId() { endpoint.put("endpoint_id", address);
return ""; endpoint.put("address", address);
endpoint.put("transport", "direct_quic");
if (certSHA256.matches("^[0-9a-fA-F]{64}$")) {
endpoint.put("peer_cert_sha256", certSHA256.toLowerCase(Locale.US));
} }
endpoints.put(endpoint);
private String normalizeSelectedExitNodeId(String value) {
String candidate = value == null ? "" : value.trim();
if (candidate.isEmpty()) {
return "";
} }
if (candidate.matches("^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")) { if (endpoints.length() == 0) {
return candidate; throw new IllegalStateException("В клиенте нет bootstrap-узлов фермы.");
} }
if (candidate.matches("^[A-Za-z0-9][A-Za-z0-9._-]{2,63}$")) { JSONObject service = new JSONObject();
return candidate; service.put("schema_version", "rap.fabric_service_channel_request.v1");
} service.put("channel_id", "android-control");
return ""; service.put("service_class", "identity_runtime");
service.put("source_role", "vpn-client");
JSONObject cfg = new JSONObject();
cfg.put("cluster_id", DEFAULT_CLUSTER_ID);
cfg.put("local_node_id", fabricNodeId());
cfg.put("vpn_connection_id", "fabric-control");
cfg.put("stream_shards", 1);
cfg.put("service_channel_request", service);
cfg.put("endpoints", endpoints);
return cfg.toString();
} }
private RapApiClient.AuthContext authenticate(RapApiClient client) throws Exception { private RapApiClient.AuthContext authenticate(RapApiClient client) throws Exception {
@@ -743,6 +743,44 @@ public class MainActivity extends Activity {
return loggedIn; return loggedIn;
} }
/**
 * Logs in with an explicit email/password pair and stores the resulting auth context.
 *
 * @param client        API client used for the login call
 * @param emailValue    account email; trimmed before use
 * @param passwordValue account password; trimmed before use
 * @return the auth context returned by {@code client.login}, after passing it to {@code saveAuthContext}
 * @throws IllegalStateException if the password is {@code null} or blank
 * @throws Exception             whatever the login call or persistence raises
 */
private RapApiClient.AuthContext authenticateWithPassword(RapApiClient client, String emailValue, String passwordValue) throws Exception {
    boolean passwordMissing = passwordValue == null || passwordValue.trim().isEmpty();
    if (passwordMissing) {
        throw new IllegalStateException("Введите пароль для идентификации устройства и выбора пула.");
    }
    String login = emailValue.trim();
    String secret = passwordValue.trim();
    RapApiClient.AuthContext session = client.login(login, secret, deviceFingerprint());
    saveAuthContext(session);
    return session;
}
/**
 * Re-downloads the VPN client profile for the stored user and checks that the
 * currently selected connection is still part of it.
 *
 * If a refresh token is saved, it is exchanged first and the resulting auth
 * context is stored; its user id then replaces the one read from preferences.
 * The downloaded profile always replaces {@code profileJson} and is persisted.
 * When the selected connection is absent from it, {@code vpnConnectionId} is
 * cleared before persisting and an exception is raised.
 *
 * @throws IllegalStateException if no user id is stored, or the selected connection
 *                               is no longer in the refreshed profile
 * @throws Exception             whatever the API client or persistence raises
 */
private void refreshSavedProfileForCurrentUser() throws Exception {
    String boundUserId = prefs.getString(PREF_USER_ID, "");
    boolean unbound = boundUserId == null || boundUserId.trim().isEmpty();
    if (unbound) {
        throw new IllegalStateException("Устройство еще не привязано к пользователю.");
    }

    RapApiClient client = new RapApiClient(fabricControlConfig(), this);
    String token = savedRefreshToken();
    if (!token.isEmpty()) {
        authContext = client.refresh(token);
        saveAuthContext(authContext);
        boundUserId = authContext.userId;
    }

    String activeOrganizationId = resolveOrganizationId(client, boundUserId);
    String latestProfile = client.vpnClientProfile(
            clusterId.getText().toString(),
            activeOrganizationId,
            boundUserId,
            ""
    );

    // Decide first, then store: the new profile is persisted in both outcomes.
    boolean stillGranted = profileContainsConnection(latestProfile, vpnConnectionId);
    profileJson = latestProfile;
    if (!stillGranted) {
        vpnConnectionId = "";
    }
    saveProfileState();
    if (!stillGranted) {
        throw new IllegalStateException("Администратор закрыл доступ к выбранному пулу или пул удален.");
    }
}
private String resolveOrganizationId(RapApiClient client, String userId) throws Exception { private String resolveOrganizationId(RapApiClient client, String userId) throws Exception {
JSONObject payload = client.organizations(userId); JSONObject payload = client.organizations(userId);
JSONArray organizations = payload.optJSONArray("organizations"); JSONArray organizations = payload.optJSONArray("organizations");
@@ -850,6 +888,89 @@ public class MainActivity extends Activity {
return generated; return generated;
} }
/**
 * Returns this installation's fabric node id, creating one on first use.
 *
 * A stored value is reused only if it has the 8-4-4-4-12 hexadecimal UUID
 * layout; it is returned lower-cased. Otherwise a random UUID is generated,
 * written to preferences under {@code PREF_FABRIC_NODE_ID}, and returned.
 *
 * @return the node id string
 */
private String fabricNodeId() {
    String stored = prefs.getString(PREF_FABRIC_NODE_ID, "");
    boolean looksLikeUuid = stored != null
            && stored.matches("^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$");
    if (!looksLikeUuid) {
        String fresh = java.util.UUID.randomUUID().toString();
        prefs.edit().putString(PREF_FABRIC_NODE_ID, fresh).apply();
        return fresh;
    }
    return stored.toLowerCase(Locale.US);
}
/**
 * Registers this device as a candidate node and sends one heartbeat, on a
 * dedicated thread named {@code rap-fabric-candidate-register}.
 *
 * @param showStatus when {@code true}, the outcome (success or failure) is written to
 *                   the status view on the UI thread; when {@code false}, failures
 *                   are dropped silently
 */
private void registerCandidateNodeAsync(boolean showStatus) {
    Runnable registration = () -> {
        try {
            RapApiClient client = new RapApiClient(fabricControlConfig(), this);
            String nodeId = registerCandidateNode(client);
            sendCandidateHeartbeat(client, nodeId);
            if (!showStatus) {
                return;
            }
            runOnUiThread(() -> status.setText("Узел телефона виден ферме как кандидат: " + nodeId));
        } catch (Exception ex) {
            if (!showStatus) {
                return;
            }
            runOnUiThread(() -> status.setText("Узел телефона пока не зарегистрирован в ферме: " + friendlyError(ex)));
        }
    };
    new Thread(registration, "rap-fabric-candidate-register").start();
}
/**
 * Sends a node registration request describing this device and records the
 * node id reported back.
 *
 * The device fingerprint and the cluster/organization ids are read once and
 * reused, so {@code node_key}, {@code name} and {@code metadata.device_fingerprint}
 * are always derived from the same value. Blank cluster or organization fields
 * fall back to the build defaults.
 *
 * @param client API client used for the registration call
 * @return the trimmed {@code node_id} from the response when it is non-empty (also
 *         stored under {@code PREF_FABRIC_NODE_ID}); otherwise the local {@code fabricNodeId()}
 * @throws Exception whatever JSON construction or the API client raises
 */
private String registerCandidateNode(RapApiClient client) throws Exception {
    String nodeId = fabricNodeId();
    String fingerprint = deviceFingerprint();

    String enteredCluster = clusterId.getText().toString().trim();
    String cluster = enteredCluster.isEmpty() ? DEFAULT_CLUSTER_ID : enteredCluster;
    String enteredOrganization = organizationId.getText().toString().trim();
    String organization = enteredOrganization.isEmpty() ? DEFAULT_ORGANIZATION_ID : enteredOrganization;

    // Display name: the fingerprint without its "android-" marker, cut to at most 8 characters.
    String bareFingerprint = fingerprint.replace("android-", "");
    String shortFingerprint = bareFingerprint.substring(0, Math.min(8, bareFingerprint.length()));

    JSONObject metadata = new JSONObject();
    metadata.put("source", "android_vpn_client");
    metadata.put("candidate_access", true);
    metadata.put("fabric_transport", "quic");
    metadata.put("connectivity_mode", "outbound_only");
    metadata.put("app_version", APP_VERSION);
    metadata.put("device_fingerprint", fingerprint);

    JSONObject payload = new JSONObject();
    payload.put("cluster_id", cluster);
    payload.put("node_key", "android-vpn:" + fingerprint);
    payload.put("name", "android-vpn-" + shortFingerprint);
    payload.put("ownership_type", "customer_managed");
    payload.put("owner_organization_id", organization);
    payload.put("reported_version", APP_VERSION);
    payload.put("metadata", metadata);

    JSONObject response = client.registerFabricNode(payload);
    String registeredNodeId = response.optString("node_id", nodeId).trim();
    if (registeredNodeId.isEmpty()) {
        return nodeId;
    }
    prefs.edit().putString(PREF_FABRIC_NODE_ID, registeredNodeId).apply();
    return registeredNodeId;
}
/**
 * Builds and sends one heartbeat for the given node.
 *
 * The payload reports a fixed {@code healthy} status, the app version, a set of
 * capability flags, the {@code vpn-client} service state ({@code running} when
 * {@code isSystemVpnActive()} is true, else {@code candidate}) and metadata
 * including an endpoint report with an empty candidate list. A blank cluster
 * field falls back to {@code DEFAULT_CLUSTER_ID}.
 *
 * @param client API client used to send the heartbeat
 * @param nodeId id of the node the heartbeat is for
 * @throws Exception whatever JSON construction or the API client raises
 */
private void sendCandidateHeartbeat(RapApiClient client, String nodeId) throws Exception {
    JSONObject capabilities = new JSONObject()
            .put("fabric_quic_node", true)
            .put("android_vpn_client", true)
            .put("candidate_access", true)
            .put("vpn_client", true);

    JSONObject vpnClientState = new JSONObject();
    vpnClientState.put("state", isSystemVpnActive() ? "running" : "candidate");
    vpnClientState.put("runtime", "android_vpnservice");
    vpnClientState.put("transport", "fabric_quic_route");
    JSONObject serviceStates = new JSONObject().put("vpn-client", vpnClientState);

    JSONObject endpointReport = new JSONObject();
    endpointReport.put("schema_version", "rap.mesh_endpoint_report.v1");
    endpointReport.put("transport", "quic");
    endpointReport.put("connectivity_mode", "outbound_only");
    endpointReport.put("endpoint_candidates", new JSONArray());

    JSONObject metadata = new JSONObject()
            .put("source", "android_vpn_client")
            .put("candidate", true)
            .put("passive", true)
            .put("app_version", APP_VERSION)
            .put("mesh_endpoint_report", endpointReport);

    JSONObject payload = new JSONObject()
            .put("health_status", "healthy")
            .put("reported_version", APP_VERSION)
            .put("capabilities", capabilities)
            .put("service_states", serviceStates)
            .put("metadata", metadata);

    String enteredCluster = clusterId.getText().toString().trim();
    String cluster = enteredCluster.isEmpty() ? DEFAULT_CLUSTER_ID : enteredCluster;
    client.sendFabricNodeHeartbeat(cluster, nodeId, payload);
}
private void showSettingsDialog() { private void showSettingsDialog() {
LinearLayout form = new LinearLayout(this); LinearLayout form = new LinearLayout(this);
form.setOrientation(LinearLayout.VERTICAL); form.setOrientation(LinearLayout.VERTICAL);
@@ -877,17 +998,15 @@ public class MainActivity extends Activity {
form.addView(showPassword); form.addView(showPassword);
form.addView(forceFullTunnel); form.addView(forceFullTunnel);
new AlertDialog.Builder(this) new AlertDialog.Builder(this)
.setTitle("Аккаунт VPN") .setTitle("Настройка VPN")
.setView(form) .setView(form)
.setPositiveButton("Сохранить", (dialog, which) -> { .setPositiveButton("Войти и выбрать выход", (dialog, which) -> {
email.setText(emailDraft.getText().toString()); email.setText(emailDraft.getText().toString());
password.setText(passwordDraft.getText().toString()); String passwordValue = passwordDraft.getText().toString();
prefs.edit() password.setText("");
.remove(PREF_SELECTED_EXIT_NODE_ID)
.apply();
prefs.edit().putBoolean(PREF_FORCE_FULL_TUNNEL, forceFullTunnel.isChecked()).apply(); prefs.edit().putBoolean(PREF_FORCE_FULL_TUNNEL, forceFullTunnel.isChecked()).apply();
saveSettings(); saveSettings();
profileSummary.setText(summaryText()); loginAndChoosePool(emailDraft.getText().toString(), passwordValue);
}) })
.setNeutralButton("Забыть устройство", (dialog, which) -> { .setNeutralButton("Забыть устройство", (dialog, which) -> {
clearSavedAuth(true); clearSavedAuth(true);
@@ -897,6 +1016,72 @@ public class MainActivity extends Activity {
.show(); .show();
} }
/**
 * Logs in with the supplied credentials on a background thread, downloads the
 * user's VPN client profile, and then opens the pool selection dialog on the
 * UI thread.
 *
 * On failure the error is shown in the status view; if the friendly error text
 * contains "пароль", the saved auth is cleared via {@code clearSavedAuth(false)}.
 *
 * @param emailValue    account email entered by the user
 * @param passwordValue account password entered by the user
 */
private void loginAndChoosePool(String emailValue, String passwordValue) {
    status.setText("Идентифицирую устройство и загружаю доступные выходы...");
    Runnable login = () -> {
        try {
            RapApiClient client = new RapApiClient(fabricControlConfig(), this);
            authContext = authenticateWithPassword(client, emailValue, passwordValue);
            String activeOrganizationId = resolveOrganizationId(client, authContext.userId);
            String loadedProfile = client.vpnClientProfile(
                    clusterId.getText().toString(),
                    activeOrganizationId,
                    authContext.userId,
                    ""
            );
            runOnUiThread(() -> showPoolChoiceDialog(loadedProfile));
        } catch (Exception ex) {
            runOnUiThread(() -> {
                String message = friendlyError(ex);
                status.setText("Ошибка настройки: " + message);
                if (message.contains("пароль")) {
                    clearSavedAuth(false);
                }
            });
        }
    };
    new Thread(login).start();
}
/**
 * Shows a single-choice dialog listing the connections of a freshly loaded
 * profile and stores the user's pick.
 *
 * Each entry is labelled with {@code exit_pool_name}, falling back to
 * {@code name}, then to a numbered placeholder. The entry matching the current
 * {@code vpnConnectionId} is pre-selected; a {@code null} current id is treated
 * as "nothing selected" rather than aborting the dialog. Choosing an entry
 * replaces {@code profileJson} and {@code vpnConnectionId}, persists them,
 * refreshes the summary and status views, and closes the dialog.
 *
 * Any failure (unparseable profile, no connections) is reported in the status view.
 *
 * @param loadedProfile profile JSON text expected to contain {@code vpn_client_profile.connections}
 */
private void showPoolChoiceDialog(String loadedProfile) {
    try {
        JSONObject root = new JSONObject(loadedProfile);
        JSONObject vpnProfile = root.optJSONObject("vpn_client_profile");
        JSONArray connections = vpnProfile == null ? null : vpnProfile.optJSONArray("connections");
        if (connections == null || connections.length() == 0) {
            throw new IllegalStateException("Для пользователя нет доступных выходных пулов.");
        }
        // hasSelectedPool() treats vpnConnectionId as nullable; do the same here.
        String currentId = vpnConnectionId == null ? "" : vpnConnectionId;
        String[] labels = new String[connections.length()];
        String[] ids = new String[connections.length()];
        int selectedIndex = 0;
        for (int i = 0; i < connections.length(); i++) {
            JSONObject connection = connections.getJSONObject(i);
            ids[i] = connection.optString("id", "");
            String name = connection.optString("exit_pool_name", "").trim();
            if (name.isEmpty()) {
                name = connection.optString("name", "").trim();
            }
            labels[i] = name.isEmpty() ? "Выход " + (i + 1) : name;
            if (!currentId.isEmpty() && currentId.equals(ids[i])) {
                selectedIndex = i;
            }
        }
        // Lambdas may only capture effectively final locals.
        int initialSelection = selectedIndex;
        new AlertDialog.Builder(this)
                .setTitle("Выходной пул")
                .setSingleChoiceItems(labels, initialSelection, (dialog, which) -> {
                    profileJson = loadedProfile;
                    vpnConnectionId = ids[which];
                    saveProfileState();
                    profileSummary.setText(summaryText());
                    status.setText("Выбран выходной пул: " + labels[which]);
                    dialog.dismiss();
                })
                .setNegativeButton("Отмена", null)
                .show();
    } catch (Exception ex) {
        status.setText("Ошибка выбора пула: " + friendlyError(ex));
    }
}
private String friendlyError(Exception ex) { private String friendlyError(Exception ex) {
String message = ex.getMessage(); String message = ex.getMessage();
if (message == null || message.trim().isEmpty()) { if (message == null || message.trim().isEmpty()) {
@@ -4,7 +4,6 @@ import android.content.Context;
import android.net.ConnectivityManager; import android.net.ConnectivityManager;
import android.net.Network; import android.net.Network;
import android.net.NetworkCapabilities; import android.net.NetworkCapabilities;
import android.net.VpnService;
import okhttp3.MediaType; import okhttp3.MediaType;
import okhttp3.OkHttpClient; import okhttp3.OkHttpClient;
@@ -16,35 +15,28 @@ import okhttp3.RequestBody;
import okhttp3.Response; import okhttp3.Response;
import okhttp3.ResponseBody; import okhttp3.ResponseBody;
import org.json.JSONArray;
import org.json.JSONObject; import org.json.JSONObject;
import java.io.ByteArrayOutputStream; import su.cin.rapvpn.fabric.fabricvpn.Fabricvpn;
import java.io.IOException; import su.cin.rapvpn.fabric.fabricvpn.Manager;
import java.io.InterruptedIOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URI; import java.net.URI;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Collections;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import javax.net.SocketFactory; import javax.net.SocketFactory;
final class RapApiClient { final class RapApiClient {
private static final MediaType JSON = MediaType.get("application/json; charset=utf-8"); private static final MediaType JSON = MediaType.get("application/json; charset=utf-8");
private static final MediaType OCTET_STREAM = MediaType.get("application/octet-stream");
private static final int MAX_PACKET_BATCH_PACKETS = 512;
private static final int MAX_PACKET_BATCH_BYTES = 512 * 1024;
private static final int MAX_SINGLE_PACKET_BYTES = 65535;
private static final int MAX_BATCH_HEADER_BYTES = 4;
private final String baseUrl; private final String baseUrl;
private final OkHttpClient httpClient; private final OkHttpClient httpClient;
private final String networkMode; private final String networkMode;
private final FabricServiceChannel fabricServiceChannel; private final Manager fabricControlManager;
RapApiClient(String baseUrl) { RapApiClient(String baseUrl) {
this(baseUrl, (Context) null); this(baseUrl, (Context) null);
@@ -52,7 +44,7 @@ final class RapApiClient {
RapApiClient(String baseUrl, Context context) { RapApiClient(String baseUrl, Context context) {
this.baseUrl = trimRight(baseUrl); this.baseUrl = trimRight(baseUrl);
this.fabricServiceChannel = new FabricServiceChannel(); this.fabricControlManager = startFabricControlManager(baseUrl);
OkHttpClient.Builder builder = new OkHttpClient.Builder(); OkHttpClient.Builder builder = new OkHttpClient.Builder();
// Regular app and diagnostic requests should use Android's default // Regular app and diagnostic requests should use Android's default
// routing. Some devices reject binding app sockets to a specific // routing. Some devices reject binding app sockets to a specific
@@ -74,7 +66,7 @@ final class RapApiClient {
RapApiClient(String baseUrl, Context context, boolean preferUnderlyingNetwork) { RapApiClient(String baseUrl, Context context, boolean preferUnderlyingNetwork) {
this.baseUrl = trimRight(baseUrl); this.baseUrl = trimRight(baseUrl);
this.fabricServiceChannel = new FabricServiceChannel(); this.fabricControlManager = startFabricControlManager(baseUrl);
OkHttpClient.Builder builder = new OkHttpClient.Builder(); OkHttpClient.Builder builder = new OkHttpClient.Builder();
String mode = context == null ? "default_network" : "default_network_context"; String mode = context == null ? "default_network" : "default_network_context";
if (preferUnderlyingNetwork && context != null) { if (preferUnderlyingNetwork && context != null) {
@@ -99,74 +91,27 @@ final class RapApiClient {
this.httpClient = builder.build(); this.httpClient = builder.build();
} }
/**
 * Convenience constructor: delegates to the three-argument constructor, supplying a newly
 * created FabricServiceChannel.
 *
 * @param baseUrl    backend base URL
 * @param vpnService VPN service passed through to the delegated constructor; may be null
 */
RapApiClient(String baseUrl, VpnService vpnService) {
this(baseUrl, vpnService, new FabricServiceChannel());
}
/**
 * Creates a client whose HTTP sockets are routed through a ProtectedSocketFactory when a
 * VpnService is supplied.
 *
 * With a non-null vpnService the client uses ProtectedSocketFactory plus BackendPinnedDns
 * and reports network mode "protected_socket"; otherwise it keeps OkHttp defaults and
 * reports "default_network". Timeouts are short (3 s connect, 8 s read/write, 10 s per
 * call) and connection-failure retries are disabled.
 *
 * @param baseUrl              backend base URL (normalised with trimRight)
 * @param vpnService           VPN service used to protect sockets; may be null
 * @param fabricServiceChannel channel to use; a new one is created when null
 */
RapApiClient(String baseUrl, VpnService vpnService, FabricServiceChannel fabricServiceChannel) {
    this.baseUrl = trimRight(baseUrl);
    if (fabricServiceChannel != null) {
        this.fabricServiceChannel = fabricServiceChannel;
    } else {
        this.fabricServiceChannel = new FabricServiceChannel();
    }

    final boolean protectSockets = vpnService != null;
    this.networkMode = protectSockets ? "protected_socket" : "default_network";

    Dispatcher requestDispatcher = new Dispatcher();
    requestDispatcher.setMaxRequests(64);
    requestDispatcher.setMaxRequestsPerHost(32);

    OkHttpClient.Builder clientBuilder = new OkHttpClient.Builder()
            .connectTimeout(3, TimeUnit.SECONDS)
            .writeTimeout(8, TimeUnit.SECONDS)
            .readTimeout(8, TimeUnit.SECONDS)
            .callTimeout(10, TimeUnit.SECONDS)
            .retryOnConnectionFailure(false)
            .dispatcher(requestDispatcher)
            .connectionPool(new ConnectionPool(16, 5, TimeUnit.MINUTES));
    if (protectSockets) {
        clientBuilder.socketFactory(new ProtectedSocketFactory(vpnService));
        clientBuilder.dns(new BackendPinnedDns(baseUrl));
    }
    this.httpClient = clientBuilder.build();
}
/**
 * Creates a client bound to a specific Android Network.
 *
 * With a non-null network, sockets come from the network's socket factory and host names
 * are resolved with Network.getAllByName (an empty result is reported as
 * UnknownHostException); the network mode is "vpn_network". With a null network the client
 * uses BackendPinnedDns and reports "default_network". Timeouts are 5 s connect,
 * 12 s read/write and 15 s per call, with connection-failure retries enabled.
 *
 * @param baseUrl backend base URL (normalised with trimRight)
 * @param network network to bind to; may be null
 */
RapApiClient(String baseUrl, Network network) {
    this.baseUrl = trimRight(baseUrl);
    this.fabricServiceChannel = new FabricServiceChannel();

    OkHttpClient.Builder clientBuilder = new OkHttpClient.Builder();
    if (network == null) {
        clientBuilder.dns(new BackendPinnedDns(baseUrl));
        this.networkMode = "default_network";
    } else {
        clientBuilder.socketFactory(network.getSocketFactory());
        clientBuilder.dns(hostname -> {
            InetAddress[] resolved = network.getAllByName(hostname);
            if (resolved == null || resolved.length == 0) {
                throw new UnknownHostException(hostname);
            }
            List<InetAddress> result = new ArrayList<>();
            for (InetAddress address : resolved) {
                result.add(address);
            }
            return result;
        });
        this.networkMode = "vpn_network";
    }

    Dispatcher requestDispatcher = new Dispatcher();
    requestDispatcher.setMaxRequests(64);
    requestDispatcher.setMaxRequestsPerHost(32);

    clientBuilder
            .connectTimeout(5, TimeUnit.SECONDS)
            .writeTimeout(12, TimeUnit.SECONDS)
            .readTimeout(12, TimeUnit.SECONDS)
            .callTimeout(15, TimeUnit.SECONDS)
            .retryOnConnectionFailure(true)
            .dispatcher(requestDispatcher)
            .connectionPool(new ConnectionPool(16, 5, TimeUnit.MINUTES));
    this.httpClient = clientBuilder.build();
}
String networkMode() { String networkMode() {
return networkMode; return networkMode;
} }
/**
 * Starts a fabric control Manager when the given configuration looks like a JSON object.
 *
 * The configuration is trimmed; unless it begins with '{' (null and blank included) no
 * manager is created and null is returned, which callers use to fall back to plain HTTP.
 * Otherwise Fabricvpn.touch() is invoked, a new Manager is created and started with the
 * trimmed configuration.
 *
 * @param config fabric bootstrap configuration text, or a plain base URL
 * @return the started manager, or null when config is not a JSON object
 * @throws IllegalStateException when creating or starting the manager fails; the message
 *         carries the underlying error message, or the exception's simple class name when
 *         it has none, and the original exception is kept as the cause
 */
private Manager startFabricControlManager(String config) {
    if (config == null) {
        return null;
    }
    String trimmed = config.trim();
    if (!trimmed.startsWith("{")) {
        return null;
    }
    try {
        Fabricvpn.touch();
        Manager started = Fabricvpn.newManager();
        started.start(trimmed);
        return started;
    } catch (Exception e) {
        String reason = e.getMessage();
        if (reason == null) {
            reason = e.getClass().getSimpleName();
        }
        throw new IllegalStateException("Не удалось подключиться к ферме через QUIC bootstrap. Последняя ошибка: " + reason, e);
    }
}
static final class BackendPinnedDns implements Dns { static final class BackendPinnedDns implements Dns {
private static final String VPN_PUBLIC_HOST = "vpn.cin.su";
private static final String VPN_PUBLIC_IPV4 = "94.141.118.222";
private final String backendHost; private final String backendHost;
BackendPinnedDns(String baseUrl) { BackendPinnedDns(String baseUrl) {
@@ -180,10 +125,6 @@ final class RapApiClient {
@Override @Override
public List<InetAddress> lookup(String hostname) throws UnknownHostException { public List<InetAddress> lookup(String hostname) throws UnknownHostException {
String host = hostname == null ? "" : hostname.trim().toLowerCase();
if (!backendHost.isEmpty() && host.equals(backendHost) && VPN_PUBLIC_HOST.equals(host)) {
return Collections.singletonList(InetAddress.getByName(VPN_PUBLIC_IPV4));
}
return Dns.SYSTEM.lookup(hostname); return Dns.SYSTEM.lookup(hostname);
} }
} }
@@ -243,103 +184,26 @@ final class RapApiClient {
return get(path); return get(path);
} }
JSONObject startSession(String resourceId, String userId, String deviceId) throws Exception { JSONObject registerFabricNode(JSONObject payload) throws Exception {
JSONObject body = new JSONObject(); return post("/node-agents/register", payload);
body.put("resource_id", resourceId);
body.put("user_id", userId);
body.put("device_id", deviceId);
return post("/sessions/", body);
} }
JSONObject reportVPNDiagnosticStatus(String clusterId, String deviceId, JSONObject payload) throws Exception { JSONObject sendFabricNodeHeartbeat(String clusterId, String nodeId, JSONObject payload) throws Exception {
return post("/clusters/" + clusterId + "/vpn/client-diagnostics/" + deviceId + "/status", payload); return post("/clusters/" + clusterId + "/nodes/" + nodeId + "/heartbeats", payload);
}
/**
 * Fetches the next diagnostic command for a device.
 *
 * @param clusterId cluster the device belongs to
 * @param deviceId  device whose command queue is read
 * @param timeoutMs value sent as the timeout_ms query parameter
 * @return the command parsed from the UTF-8 response body, or null when the body is empty
 * @throws Exception when the request fails or the body is not valid JSON
 */
JSONObject nextVPNDiagnosticCommand(String clusterId, String deviceId, int timeoutMs) throws Exception {
    String commandsPath = "/clusters/" + clusterId + "/vpn/client-diagnostics/" + deviceId + "/commands?timeout_ms=" + timeoutMs;
    byte[] raw = getBytes(commandsPath);
    if (raw.length > 0) {
        return new JSONObject(new String(raw, StandardCharsets.UTF_8));
    }
    return null;
}
/**
 * Reads the tunnel statistics of a VPN connection.
 *
 * @param clusterId       cluster that owns the connection
 * @param vpnConnectionId connection whose statistics are requested
 * @return the JSON document returned by the tunnel/stats endpoint
 * @throws Exception when the request fails
 */
JSONObject vpnPacketStats(String clusterId, String vpnConnectionId) throws Exception {
    String statsPath = "/clusters/" + clusterId + "/vpn-connections/" + vpnConnectionId + "/tunnel/stats";
    return get(statsPath);
}
/**
 * Posts an empty JSON object to the tunnel/reset endpoint of a VPN connection.
 *
 * @param clusterId       cluster that owns the connection
 * @param vpnConnectionId connection whose tunnel is reset
 * @return the JSON document returned by the endpoint
 * @throws Exception when the request fails
 */
JSONObject resetVPNPacketQueues(String clusterId, String vpnConnectionId) throws Exception {
    String resetPath = "/clusters/" + clusterId + "/vpn-connections/" + vpnConnectionId + "/tunnel/reset";
    JSONObject emptyBody = new JSONObject();
    return post(resetPath, emptyBody);
}
/**
 * Uploads one packet: the first {@code length} bytes of {@code packet} are posted to the
 * client packet path of the given VPN connection.
 *
 * @param clusterId       cluster that owns the connection
 * @param vpnConnectionId connection the packet belongs to
 * @param packet          buffer holding the packet
 * @param length          number of leading bytes of the buffer to send
 * @throws Exception when the packet path cannot be resolved or the upload fails
 */
void sendClientPacket(String clusterId, String vpnConnectionId, byte[] packet, int length) throws Exception {
    String singlePacketPath = clientPacketPath(clusterId, vpnConnectionId, "");
    postBytes(singlePacketPath, packet, length);
}
/**
 * Uploads a list of packets as one or more length-prefixed batches.
 *
 * The packets are split by chunkPacketsForBatch and each chunk is encoded and posted to
 * the client packet path with the "?batch=true" suffix, in order. A null or empty list,
 * or a list that yields no chunks, sends nothing.
 *
 * @param clusterId       cluster that owns the connection
 * @param vpnConnectionId connection the packets belong to
 * @param packets         packets to send; may be null
 * @throws Exception when the packet path cannot be resolved or an upload fails
 */
void sendClientPacketBatch(String clusterId, String vpnConnectionId, List<byte[]> packets) throws Exception {
    boolean nothingToSend = packets == null || packets.isEmpty();
    if (nothingToSend) {
        return;
    }
    // An empty chunk list simply results in zero loop iterations.
    for (List<byte[]> chunk : chunkPacketsForBatch(packets)) {
        String batchPath = clientPacketPath(clusterId, vpnConnectionId, "?batch=true");
        byte[] encoded = encodePacketBatch(chunk);
        postBytes(batchPath, encoded);
    }
}
/**
 * Fetches the next packet queued for the client on the given VPN connection.
 *
 * Transient conditions are reported as "no packet" (an empty array) rather than as errors:
 * an InterruptedIOException, any IOException whose message mentions "timeout" in any
 * letter case, and an IllegalStateException whose message contains "HTTP 502",
 * "HTTP 503" or "HTTP 504". Everything else is rethrown.
 *
 * @param clusterId       cluster that owns the connection
 * @param vpnConnectionId connection to read from
 * @param timeoutMs       value sent as the timeout_ms query parameter
 * @return the response body, or an empty array for the transient conditions above
 * @throws Exception for any non-transient failure
 */
byte[] receiveClientPacket(String clusterId, String vpnConnectionId, int timeoutMs) throws Exception {
    try {
        return getBytes(clientPacketPath(clusterId, vpnConnectionId, "?timeout_ms=" + timeoutMs));
    } catch (InterruptedIOException e) {
        return new byte[0];
    } catch (IOException e) {
        String message = e.getMessage();
        // Locale.ROOT keeps the match stable on devices whose default locale has special
        // case mappings (e.g. Turkish 'I' -> dotless 'ı'), which would hide "TIMEOUT".
        if (message != null && message.toLowerCase(java.util.Locale.ROOT).contains("timeout")) {
            return new byte[0];
        }
        throw e;
    } catch (IllegalStateException e) {
        String message = e.getMessage();
        if (message != null && (message.contains("HTTP 502") || message.contains("HTTP 503") || message.contains("HTTP 504"))) {
            return new byte[0];
        }
        throw e;
    }
}
/**
 * Fetches queued client packets for a VPN connection, requesting the batch format
 * ("?batch=true&timeout_ms=...").
 *
 * An empty response yields an empty list. A response that does not pass
 * isLikelyPacketBatch is discarded and a single packet is requested instead through
 * receiveSinglePacketAsBatch. InterruptedIOException, an IOException whose message
 * mentions "timeout", and an IllegalStateException mentioning HTTP 502/503/504 are all
 * reported as an empty list; other failures are rethrown.
 */
List<byte[]> receiveClientPacketBatch(String clusterId, String vpnConnectionId, int timeoutMs) throws Exception {
byte[] payload;
try {
payload = getBytes(clientPacketPath(clusterId, vpnConnectionId, "?batch=true&timeout_ms=" + timeoutMs));
if (payload == null || payload.length == 0) {
return new ArrayList<>();
}
// The body is not a well-formed batch: fall back to a separate single-packet request.
if (!isLikelyPacketBatch(payload)) {
return receiveSinglePacketAsBatch(clusterId, vpnConnectionId, timeoutMs);
}
return decodePacketBatch(payload);
} catch (InterruptedIOException e) {
return new ArrayList<>();
} catch (IOException e) {
// NOTE(review): toLowerCase() uses the default locale here; confirm whether Locale.ROOT is wanted.
if (e.getMessage() != null && e.getMessage().toLowerCase().contains("timeout")) {
return new ArrayList<>();
}
throw e;
} catch (IllegalStateException e) {
String message = e.getMessage();
// Gateway-style HTTP failures are treated as "nothing received".
if (message != null && (message.contains("HTTP 502") || message.contains("HTTP 503") || message.contains("HTTP 504"))) {
return new ArrayList<>();
}
throw e;
}
} }
private JSONObject get(String path) throws Exception { private JSONObject get(String path) throws Exception {
if (fabricControlManager != null) {
return fabricControlJSON("GET", path, null);
}
Request request = new Request.Builder().url(baseUrl + path).get().build(); Request request = new Request.Builder().url(baseUrl + path).get().build();
return read(request); return read(request);
} }
private JSONObject post(String path, JSONObject body) throws Exception { private JSONObject post(String path, JSONObject body) throws Exception {
if (fabricControlManager != null) {
return fabricControlJSON("POST", path, body);
}
Request request = new Request.Builder() Request request = new Request.Builder()
.url(baseUrl + path) .url(baseUrl + path)
.post(RequestBody.create(body.toString().getBytes(StandardCharsets.UTF_8), JSON)) .post(RequestBody.create(body.toString().getBytes(StandardCharsets.UTF_8), JSON))
@@ -347,39 +211,60 @@ final class RapApiClient {
return read(request); return read(request);
} }
private byte[] getBytes(String path) throws Exception { private JSONObject fabricControlJSON(String method, String path, JSONObject body) throws Exception {
Request.Builder builder = new Request.Builder().url(baseUrl + path).get(); byte[] payload = fabricControlBodyBytes(method, path, body);
applyFabricHeadersIfNeeded(builder, path); if (payload.length == 0) {
Request request = builder.build(); return new JSONObject();
try (Response response = httpClient.newCall(request).execute()) {
if (response.code() == 204) {
return new byte[0];
}
if (!response.isSuccessful()) {
throw new IllegalStateException(describeHttpFailure(response));
}
ResponseBody body = response.body();
return body == null ? new byte[0] : body.bytes();
} }
return new JSONObject(new String(payload, StandardCharsets.UTF_8));
} }
private void postBytes(String path, byte[] packet, int length) throws Exception { private byte[] fabricControlBodyBytes(String method, String path, JSONObject body) throws Exception {
byte[] bodyBytes = new byte[length]; JSONObject request = new JSONObject();
System.arraycopy(packet, 0, bodyBytes, 0, length); request.put("method", method);
postBytes(path, bodyBytes); request.put("path", path);
if (body != null) {
request.put("body", body);
}
String raw;
try {
raw = fabricControlManager.controlRequest(request.toString());
} catch (Exception e) {
throw new IllegalStateException("Ферма сейчас не смогла выполнить контрольный запрос. Попробуйте еще раз.", e);
}
JSONObject wrapper = raw == null || raw.trim().isEmpty() ? new JSONObject() : new JSONObject(raw);
int statusCode = wrapper.optInt("status_code", 200);
Object bodyValue = wrapper.opt("body");
String bodyText = jsonBodyText(bodyValue);
if (statusCode < 200 || statusCode >= 300) {
if (statusCode == 401 && bodyText.contains("auth.invalid_credentials")) {
throw new IllegalStateException("Неверный логин или пароль.");
}
if (statusCode == 401 && bodyText.contains("auth.invalid_refresh_token")) {
throw new IllegalStateException("Сессия устройства истекла. Введите пароль один раз.");
}
throw new IllegalStateException("fabric control HTTP " + statusCode + ": " + compactText(bodyText, 240));
}
return bodyText.getBytes(java.nio.charset.StandardCharsets.UTF_8);
} }
private void postBytes(String path, byte[] bodyBytes) throws Exception { private String jsonBodyText(Object bodyValue) {
Request.Builder builder = new Request.Builder() if (bodyValue == null || JSONObject.NULL.equals(bodyValue)) {
.url(baseUrl + path) return "";
.post(RequestBody.create(bodyBytes, OCTET_STREAM));
applyFabricHeadersIfNeeded(builder, path);
Request request = builder.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful()) {
throw new IllegalStateException(describeHttpFailure(response));
} }
if (bodyValue instanceof JSONObject || bodyValue instanceof JSONArray) {
return bodyValue.toString();
} }
String text = String.valueOf(bodyValue);
return text == null ? "" : text;
}
private String compactText(String text, int limit) {
String value = text == null ? "" : text.replace('\n', ' ').replace('\r', ' ').trim();
if (value.length() > limit) {
return value.substring(0, limit);
}
return value;
} }
private String describeHttpFailure(Response response) { private String describeHttpFailure(Response response) {
@@ -401,45 +286,6 @@ final class RapApiClient {
return message.toString(); return message.toString();
} }
/**
 * Resolves the packet path for a VPN connection through the fabric service channel and
 * appends an optional query suffix.
 *
 * @param clusterId       cluster that owns the connection
 * @param vpnConnectionId connection the path is for
 * @param suffix          text appended to the path; null is treated as empty
 * @return the resolved path followed by the suffix
 * @throws IOException when the fabric service channel returns an empty path
 */
private String clientPacketPath(String clusterId, String vpnConnectionId, String suffix) throws IOException {
    String leasedPath = fabricServiceChannel.packetPathForBase(baseUrl, clusterId, vpnConnectionId, false);
    if (leasedPath.isEmpty()) {
        throw new IOException("fabric service channel lease required for VPN packet dataplane");
    }
    String tail = suffix;
    if (tail == null) {
        tail = "";
    }
    return leasedPath + tail;
}
/**
 * Lets the fabric service channel add its headers to a request, but only for paths that
 * contain "/fabric/service-channels/". Other paths (and a null path) are left untouched.
 *
 * @param builder request builder to decorate
 * @param path    request path used to decide whether headers apply; may be null
 */
private void applyFabricHeadersIfNeeded(Request.Builder builder, String path) {
    if (path == null) {
        return;
    }
    if (!path.contains("/fabric/service-channels/")) {
        return;
    }
    fabricServiceChannel.applyHeaders(builder);
}
/**
 * Serialises packets into one buffer of frames, each frame being a 4-byte big-endian
 * length followed by the packet bytes. Null and zero-length packets are skipped.
 *
 * @param packets packets to encode
 * @return the concatenated frames; an empty array when nothing was encodable
 */
private byte[] encodePacketBatch(List<byte[]> packets) {
    // First pass: size the output exactly.
    int encodedSize = 0;
    for (byte[] candidate : packets) {
        if (candidate == null || candidate.length == 0) {
            continue;
        }
        encodedSize += 4 + candidate.length;
    }

    // Second pass: write header + payload for each packet.
    byte[] encoded = new byte[encodedSize];
    int cursor = 0;
    for (byte[] candidate : packets) {
        if (candidate == null || candidate.length == 0) {
            continue;
        }
        int size = candidate.length;
        for (int shift = 24; shift >= 0; shift -= 8) {
            encoded[cursor++] = (byte) (size >>> shift);
        }
        System.arraycopy(candidate, 0, encoded, cursor, size);
        cursor += size;
    }
    return encoded;
}
private JSONObject read(Request request) throws Exception { private JSONObject read(Request request) throws Exception {
try (Response response = httpClient.newCall(request).execute()) { try (Response response = httpClient.newCall(request).execute()) {
ResponseBody body = response.body(); ResponseBody body = response.body();
@@ -457,93 +303,6 @@ final class RapApiClient {
} }
} }
/**
 * Parses a buffer of frames (4-byte big-endian length followed by that many bytes) into
 * individual packets.
 *
 * Parsing stops at the first frame whose length is non-positive or exceeds the bytes that
 * remain; packets decoded before that point are still returned. Trailing bytes too short
 * to hold a header are ignored.
 *
 * @param payload encoded batch; may be null
 * @return the decoded packets in order, possibly empty
 */
private List<byte[]> decodePacketBatch(byte[] payload) {
    List<byte[]> packets = new ArrayList<>();
    if (payload == null) {
        return packets;
    }
    int offset = 0;
    while (payload.length - offset >= 4) {
        int length = ((payload[offset] & 0xff) << 24)
                | ((payload[offset + 1] & 0xff) << 16)
                | ((payload[offset + 2] & 0xff) << 8)
                | (payload[offset + 3] & 0xff);
        offset += 4;
        // Compare against the remaining byte count instead of computing offset + length:
        // the sum overflows for a huge length prefix, which would let the check pass and
        // trigger a multi-gigabyte allocation from untrusted input.
        if (length <= 0 || length > payload.length - offset) {
            break;
        }
        byte[] packet = new byte[length];
        System.arraycopy(payload, offset, packet, 0, length);
        packets.add(packet);
        offset += length;
    }
    return packets;
}
/**
 * Splits packets into chunks that respect both batch limits: at most
 * MAX_PACKET_BATCH_PACKETS packets and at most MAX_PACKET_BATCH_BYTES encoded bytes
 * (payload plus MAX_BATCH_HEADER_BYTES per packet) per chunk.
 *
 * Null, empty and oversized (larger than MAX_SINGLE_PACKET_BYTES) packets are dropped.
 * A single packet that alone exceeds the byte limit still gets a chunk of its own.
 *
 * @param packets packets to split; null is treated as empty
 * @return the chunks in input order; empty when no packet was usable
 */
private List<List<byte[]>> chunkPacketsForBatch(List<byte[]> packets) {
    List<List<byte[]>> chunks = new ArrayList<>();
    if (packets == null) {
        return chunks;
    }
    List<byte[]> current = new ArrayList<>();
    int currentBytes = 0;
    for (byte[] packet : packets) {
        if (packet == null || packet.length == 0 || packet.length > MAX_SINGLE_PACKET_BYTES) {
            continue;
        }
        int framedSize = MAX_BATCH_HEADER_BYTES + packet.length;
        boolean chunkFull = current.size() >= MAX_PACKET_BATCH_PACKETS
                || currentBytes + framedSize > MAX_PACKET_BATCH_BYTES;
        if (chunkFull && !current.isEmpty()) {
            chunks.add(current);
            current = new ArrayList<>();
            currentBytes = 0;
        }
        current.add(packet);
        // Accumulate from the (possibly just reset) running total. Carrying the size of
        // the flushed chunk forward would make every later packet start a new chunk.
        currentBytes += framedSize;
    }
    if (!current.isEmpty()) {
        chunks.add(current);
    }
    return chunks;
}
/**
 * Checks whether a payload is a well-formed batch: a sequence of frames, each a 4-byte
 * big-endian length followed by that many bytes, that consumes the payload exactly.
 *
 * The payload is rejected when it is null or shorter than one header, when any frame
 * length is non-positive, larger than MAX_SINGLE_PACKET_BYTES or runs past the end, when
 * more than MAX_PACKET_BATCH_PACKETS frames are present, or when bytes are left over.
 *
 * @param payload candidate batch; may be null
 * @return true when the payload parses completely as at least one frame
 */
private boolean isLikelyPacketBatch(byte[] payload) {
    if (payload == null) {
        return false;
    }
    final int total = payload.length;
    if (total < MAX_BATCH_HEADER_BYTES) {
        return false;
    }
    int cursor = 0;
    int frames = 0;
    while (cursor + MAX_BATCH_HEADER_BYTES <= total) {
        int frameLength = 0;
        for (int i = 0; i < 4; i++) {
            frameLength = (frameLength << 8) | (payload[cursor + i] & 0xff);
        }
        cursor += MAX_BATCH_HEADER_BYTES;
        boolean badLength = frameLength <= 0 || frameLength > MAX_SINGLE_PACKET_BYTES;
        if (badLength || cursor + frameLength > total) {
            return false;
        }
        cursor += frameLength;
        frames += 1;
        if (frames > MAX_PACKET_BATCH_PACKETS) {
            return false;
        }
    }
    return frames > 0 && cursor == total;
}
/**
 * Receives one packet through receiveClientPacket and wraps it in a mutable list.
 *
 * @param clusterId       cluster that owns the connection
 * @param vpnConnectionId connection to read from
 * @param timeoutMs       timeout forwarded to receiveClientPacket
 * @return a list holding the packet, or an empty list when nothing was received
 * @throws Exception when receiveClientPacket fails
 */
private List<byte[]> receiveSinglePacketAsBatch(String clusterId, String vpnConnectionId, int timeoutMs) throws Exception {
    List<byte[]> batch = new ArrayList<>();
    byte[] single = receiveClientPacket(clusterId, vpnConnectionId, timeoutMs);
    if (single != null && single.length > 0) {
        batch.add(single);
    }
    return batch;
}
private AuthContext parseAuthContext(JSONObject response) throws Exception { private AuthContext parseAuthContext(JSONObject response) throws Exception {
JSONObject user = response.getJSONObject("user"); JSONObject user = response.getJSONObject("user");
String userId = user.optString("id", ""); String userId = user.optString("id", "");
@@ -570,65 +329,6 @@ final class RapApiClient {
return value; return value;
} }
/**
 * SocketFactory whose sockets are passed to VpnService.protect before they connect.
 *
 * Every factory method creates the socket through the default SocketFactory, binds it,
 * protects it and, for the connecting variants, connects it. If any step fails the socket
 * is closed before the exception is propagated, so no descriptor is leaked.
 */
static final class ProtectedSocketFactory extends SocketFactory {
    private final SocketFactory delegate = SocketFactory.getDefault();
    private final VpnService vpnService;

    /**
     * @param vpnService service whose protect(Socket) is applied to every created socket
     */
    ProtectedSocketFactory(VpnService vpnService) {
        this.vpnService = vpnService;
    }

    /** Creates an unconnected socket bound to an ephemeral local address and protected. */
    @Override
    public Socket createSocket() throws IOException {
        Socket socket = delegate.createSocket();
        try {
            socket.bind(null);
            protect(socket);
            return socket;
        } catch (IOException | RuntimeException e) {
            closeQuietly(socket);
            throw e;
        }
    }

    @Override
    public Socket createSocket(String host, int port) throws IOException {
        Socket socket = createSocket();
        try {
            socket.connect(new InetSocketAddress(host, port));
            return socket;
        } catch (IOException | RuntimeException e) {
            closeQuietly(socket);
            throw e;
        }
    }

    @Override
    public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException {
        Socket socket = delegate.createSocket();
        try {
            socket.bind(new InetSocketAddress(localHost, localPort));
            protect(socket);
            socket.connect(new InetSocketAddress(host, port));
            return socket;
        } catch (IOException | RuntimeException e) {
            closeQuietly(socket);
            throw e;
        }
    }

    @Override
    public Socket createSocket(InetAddress host, int port) throws IOException {
        Socket socket = createSocket();
        try {
            socket.connect(new InetSocketAddress(host, port));
            return socket;
        } catch (IOException | RuntimeException e) {
            closeQuietly(socket);
            throw e;
        }
    }

    @Override
    public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException {
        Socket socket = delegate.createSocket();
        try {
            socket.bind(new InetSocketAddress(localAddress, localPort));
            protect(socket);
            socket.connect(new InetSocketAddress(address, port));
            return socket;
        } catch (IOException | RuntimeException e) {
            closeQuietly(socket);
            throw e;
        }
    }

    /**
     * Applies VpnService.protect to the socket.
     *
     * @return the same socket, for chaining
     * @throws IOException when protect reports failure; the socket is closed first
     */
    private Socket protect(Socket socket) throws IOException {
        if (!vpnService.protect(socket)) {
            closeQuietly(socket);
            throw new IOException("protect control-plane socket failed");
        }
        return socket;
    }

    /** Closes a socket on a failure path; a close error must not mask the original one. */
    private static void closeQuietly(Socket socket) {
        try {
            socket.close();
        } catch (IOException ignored) {
        }
    }
}
static final class AuthContext { static final class AuthContext {
final String userId; final String userId;
final String deviceId; final String deviceId;
@@ -10,7 +10,6 @@ import android.os.Build;
public final class RapAutostartReceiver extends BroadcastReceiver { public final class RapAutostartReceiver extends BroadcastReceiver {
private static final String PREFS = "rap-vpn"; private static final String PREFS = "rap-vpn";
private static final String PREF_PROFILE_JSON = "profile_json"; private static final String PREF_PROFILE_JSON = "profile_json";
private static final String PREF_BACKEND_URL = "backend_url";
private static final String PREF_CLUSTER_ID = "cluster_id"; private static final String PREF_CLUSTER_ID = "cluster_id";
private static final String PREF_VPN_CONNECTION_ID = "vpn_connection_id"; private static final String PREF_VPN_CONNECTION_ID = "vpn_connection_id";
private static final String PREF_MANUAL_STOPPED = "manual_stopped"; private static final String PREF_MANUAL_STOPPED = "manual_stopped";
@@ -25,21 +24,18 @@ public final class RapAutostartReceiver extends BroadcastReceiver {
&& !Intent.ACTION_BOOT_COMPLETED.equals(action)) { && !Intent.ACTION_BOOT_COMPLETED.equals(action)) {
return; return;
} }
RapDiagnosticService.start(context);
SharedPreferences prefs = context.getSharedPreferences(PREFS, Context.MODE_PRIVATE); SharedPreferences prefs = context.getSharedPreferences(PREFS, Context.MODE_PRIVATE);
if (prefs.getBoolean(PREF_MANUAL_STOPPED, false)) { if (prefs.getBoolean(PREF_MANUAL_STOPPED, false)) {
return; return;
} }
if (Intent.ACTION_MY_PACKAGE_REPLACED.equals(action)) { if (Intent.ACTION_MY_PACKAGE_REPLACED.equals(action)) {
// Diagnostic service owns post-upgrade VPN restart. Starting both services from // After package replacement we wait for an explicit user action or runtime resume.
// MY_PACKAGE_REPLACED can race foreground-service startup and leave diagnostics stale.
return; return;
} }
String profile = prefs.getString(PREF_PROFILE_JSON, ""); String profile = prefs.getString(PREF_PROFILE_JSON, "");
String backendUrl = prefs.getString(PREF_BACKEND_URL, "");
String clusterId = prefs.getString(PREF_CLUSTER_ID, ""); String clusterId = prefs.getString(PREF_CLUSTER_ID, "");
String vpnConnectionId = prefs.getString(PREF_VPN_CONNECTION_ID, ""); String vpnConnectionId = prefs.getString(PREF_VPN_CONNECTION_ID, "");
if (profile.isEmpty() || backendUrl.isEmpty() || clusterId.isEmpty() || vpnConnectionId.isEmpty()) { if (profile.isEmpty() || clusterId.isEmpty() || vpnConnectionId.isEmpty()) {
return; return;
} }
if (VpnService.prepare(context) != null) { if (VpnService.prepare(context) != null) {
@@ -47,7 +43,6 @@ public final class RapAutostartReceiver extends BroadcastReceiver {
} }
Intent service = new Intent(context, RapVpnService.class); Intent service = new Intent(context, RapVpnService.class);
service.putExtra("profile_json", profile); service.putExtra("profile_json", profile);
service.putExtra("backend_url", backendUrl);
service.putExtra("cluster_id", clusterId); service.putExtra("cluster_id", clusterId);
service.putExtra("vpn_connection_id", vpnConnectionId); service.putExtra("vpn_connection_id", vpnConnectionId);
if (Build.VERSION.SDK_INT >= 26) { if (Build.VERSION.SDK_INT >= 26) {
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -54,7 +54,7 @@ public class TestTrafficActivity extends Activity {
setContentView(layout); setContentView(layout);
String url = getIntent().getStringExtra(EXTRA_URL); String url = getIntent().getStringExtra(EXTRA_URL);
if (url == null || url.isEmpty()) { if (url == null || url.isEmpty()) {
url = "http://192.168.200.61:18080/"; url = "http://example.com/";
} }
target = url; target = url;
assetErrorCount = 0; assetErrorCount = 0;
@@ -11,7 +11,7 @@ import java.nio.charset.StandardCharsets;
public class TestVpnActivity extends Activity { public class TestVpnActivity extends Activity {
public static final String EXTRA_PROFILE_JSON = "profile_json"; public static final String EXTRA_PROFILE_JSON = "profile_json";
public static final String EXTRA_PROFILE_BASE64 = "profile_base64"; public static final String EXTRA_PROFILE_BASE64 = "profile_base64";
public static final String EXTRA_BACKEND_URL = "backend_url"; public static final String EXTRA_FABRIC_BOOTSTRAP_CONFIG = "fabric_bootstrap_config";
public static final String EXTRA_CLUSTER_ID = "cluster_id"; public static final String EXTRA_CLUSTER_ID = "cluster_id";
public static final String EXTRA_VPN_CONNECTION_ID = "vpn_connection_id"; public static final String EXTRA_VPN_CONNECTION_ID = "vpn_connection_id";
private static final int VPN_PREPARE_REQUEST = 77; private static final int VPN_PREPARE_REQUEST = 77;
@@ -44,7 +44,10 @@ public class TestVpnActivity extends Activity {
private Intent buildServiceIntent(Intent source) { private Intent buildServiceIntent(Intent source) {
Intent intent = new Intent(this, RapVpnService.class); Intent intent = new Intent(this, RapVpnService.class);
intent.putExtra(RapVpnService.EXTRA_PROFILE_JSON, profileJson(source)); intent.putExtra(RapVpnService.EXTRA_PROFILE_JSON, profileJson(source));
intent.putExtra(RapVpnService.EXTRA_BACKEND_URL, source.getStringExtra(EXTRA_BACKEND_URL)); String fabricBootstrapConfig = source.getStringExtra(EXTRA_FABRIC_BOOTSTRAP_CONFIG);
if (fabricBootstrapConfig != null && !fabricBootstrapConfig.isEmpty()) {
intent.putExtra(RapVpnService.EXTRA_FABRIC_BOOTSTRAP_CONFIG, fabricBootstrapConfig);
}
intent.putExtra(RapVpnService.EXTRA_CLUSTER_ID, source.getStringExtra(EXTRA_CLUSTER_ID)); intent.putExtra(RapVpnService.EXTRA_CLUSTER_ID, source.getStringExtra(EXTRA_CLUSTER_ID));
intent.putExtra(RapVpnService.EXTRA_VPN_CONNECTION_ID, source.getStringExtra(EXTRA_VPN_CONNECTION_ID)); intent.putExtra(RapVpnService.EXTRA_VPN_CONNECTION_ID, source.getStringExtra(EXTRA_VPN_CONNECTION_ID));
return intent; return intent;
@@ -1,393 +0,0 @@
package su.cin.rapvpn;
import android.net.VpnService;
import android.util.Log;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import okhttp3.WebSocket;
import okhttp3.WebSocketListener;
import okio.ByteString;
final class VpnPacketWebSocketRelay {
private static final String TAG = "RapVpnWebSocketRelay";
private static final int MAX_PACKET_BATCH_PACKETS = 512;
private static final int MAX_PACKET_BATCH_BYTES = 1024 * 1024;
private static final int MAX_SINGLE_PACKET_BYTES = 65535;
private static final long CONNECTING_STALE_MS = 8000;
private static final long OPEN_WAIT_MS = 3500;
private static final int PRIORITY_GRACE_MS = 2;
private final String baseUrl;
private final VpnService vpnService;
private final OkHttpClient httpClient;
private final FabricServiceChannel fabricServiceChannel;
private final BlockingQueue<List<byte[]>> priorityIncoming = new ArrayBlockingQueue<>(512);
private final BlockingQueue<List<byte[]>> incoming = new ArrayBlockingQueue<>(2048);
private final Object lock = new Object();
private WebSocket webSocket;
private String connectedClusterId = "";
private String connectedVpnConnectionId = "";
private volatile boolean open;
private volatile boolean connecting;
private volatile long connectingSinceMs;
private volatile long reconnectAfterMs;
private volatile String lastError = "";
/**
 * Convenience constructor: delegates to the three-argument constructor, supplying a newly
 * created FabricServiceChannel.
 */
VpnPacketWebSocketRelay(String baseUrl, VpnService vpnService) {
this(baseUrl, vpnService, new FabricServiceChannel());
}
/**
 * Creates a relay and its dedicated OkHttp client.
 *
 * The client always uses RapApiClient.BackendPinnedDns and, when a VpnService is given,
 * RapApiClient.ProtectedSocketFactory. The read timeout is 0 (disabled); connect and write
 * timeouts are 5 s and 10 s, and connection-failure retries are enabled.
 *
 * @param baseUrl              backend base URL (normalised with trimRight)
 * @param vpnService           VPN service used to protect sockets; may be null
 * @param fabricServiceChannel channel to use; a new one is created when null
 */
VpnPacketWebSocketRelay(String baseUrl, VpnService vpnService, FabricServiceChannel fabricServiceChannel) {
    this.baseUrl = trimRight(baseUrl);
    this.vpnService = vpnService;
    if (fabricServiceChannel != null) {
        this.fabricServiceChannel = fabricServiceChannel;
    } else {
        this.fabricServiceChannel = new FabricServiceChannel();
    }

    Dispatcher relayDispatcher = new Dispatcher();
    relayDispatcher.setMaxRequests(16);
    relayDispatcher.setMaxRequestsPerHost(8);

    OkHttpClient.Builder clientBuilder = new OkHttpClient.Builder();
    if (vpnService != null) {
        clientBuilder.socketFactory(new RapApiClient.ProtectedSocketFactory(vpnService));
    }
    clientBuilder
            .dns(new RapApiClient.BackendPinnedDns(baseUrl))
            .connectTimeout(5, TimeUnit.SECONDS)
            .writeTimeout(10, TimeUnit.SECONDS)
            .readTimeout(0, TimeUnit.SECONDS)
            .retryOnConnectionFailure(true)
            .dispatcher(relayDispatcher)
            .connectionPool(new ConnectionPool(8, 5, TimeUnit.MINUTES));
    this.httpClient = clientBuilder.build();
}
/** Returns the base URL this relay was created with, after trimRight normalisation. */
String baseUrl() {
return baseUrl;
}
/** Returns the current value of the volatile {@code open} flag. */
boolean isOpen() {
return open;
}
/** Returns the most recently recorded status/error text, or an empty string when none is set. */
String lastError() {
return lastError == null ? "" : lastError;
}
/**
 * Starts (or keeps) a websocket connection for the given cluster / VPN connection pair.
 *
 * Returns immediately without doing anything when either id is null or empty, when the relay is
 * already open or freshly connecting for the same ids, or when the reconnect backoff has not yet
 * elapsed. Otherwise any existing socket is closed and a new one is opened asynchronously; the
 * outcome is reported through the listener callbacks.
 */
void connect(String clusterId, String vpnConnectionId) {
if (clusterId == null || clusterId.isEmpty() || vpnConnectionId == null || vpnConnectionId.isEmpty()) {
return;
}
long now = System.currentTimeMillis();
synchronized (lock) {
// Already open for exactly this pair: nothing to do.
if (open && clusterId.equals(connectedClusterId) && vpnConnectionId.equals(connectedVpnConnectionId)) {
return;
}
if (connecting && clusterId.equals(connectedClusterId) && vpnConnectionId.equals(connectedVpnConnectionId)) {
// An attempt for this pair is in flight; leave it alone unless it has exceeded the stale threshold.
if (now - connectingSinceMs < CONNECTING_STALE_MS) {
return;
}
lastError = "stale websocket connect";
closeLocked();
}
// Honour the backoff set by a previous close/failure or an invalid URL.
if (now < reconnectAfterMs) {
return;
}
// Drop whatever socket is left (different ids, or a closed/failed one) before opening a new one.
closeLocked();
String wsUrl = webSocketUrl(clusterId, vpnConnectionId);
if (wsUrl.isEmpty()) {
lastError = "invalid websocket url";
reconnectAfterMs = now + 5000;
return;
}
connectedClusterId = clusterId;
connectedVpnConnectionId = vpnConnectionId;
connecting = true;
connectingSinceMs = now;
Request.Builder requestBuilder = new Request.Builder().url(wsUrl);
this.fabricServiceChannel.applyHeaders(requestBuilder);
Request request = requestBuilder.build();
// "connecting" is later rewritten to "connecting_timeout" by awaitOpen() if the open never arrives in time.
lastError = "connecting";
webSocket = httpClient.newWebSocket(request, new Listener());
}
}
/**
 * Encodes a batch of client packets and queues it on the websocket.
 *
 * The batch is first filtered by {@code cleanPacketBatch}; an empty result counts as success.
 * The call ensures a connection attempt is running, waits up to {@code OPEN_WAIT_MS} for it to
 * open, and then hands the encoded payload to the socket.
 *
 * @return true when there was nothing to send or the socket accepted the payload into its send
 *         queue; false when the socket did not open in time, was missing, or rejected the payload
 */
boolean sendClientPacketBatch(String clusterId, String vpnConnectionId, List<byte[]> packets) {
    packets = cleanPacketBatch(packets);
    if (packets.isEmpty()) {
        return true;
    }
    connect(clusterId, vpnConnectionId);
    if (!awaitOpen(OPEN_WAIT_MS)) {
        return false;
    }
    // webSocket is a plain (non-volatile) field written under lock, so read it under lock too.
    WebSocket socket;
    synchronized (lock) {
        socket = webSocket;
    }
    if (socket == null) {
        lastError = "websocket missing after open";
        return false;
    }
    byte[] payload = encodePacketBatch(packets);
    if (payload.length == 0) {
        return true;
    }
    boolean queued = socket.send(ByteString.of(payload));
    if (!queued) {
        lastError = "websocket send queue rejected batch";
        synchronized (lock) {
            // Only tear down if no other thread has already replaced the socket we tried to use.
            if (socket == webSocket) {
                // Clear the backoff so the next connect() can reconnect immediately.
                reconnectAfterMs = 0;
                closeLocked();
            }
        }
    }
    return queued;
}
/**
 * Returns the next inbound packet batch, preferring the priority queue.
 *
 * Poll order: priority (non-blocking), priority (brief grace wait), regular (non-blocking),
 * priority (non-blocking), then a blocking wait on the regular queue for the remaining budget.
 *
 * @param timeoutMs wait budget in milliseconds; values below 1 are treated as 1
 * @return the batch, or an empty list when nothing arrived in time
 * @throws InterruptedException if interrupted while polling a queue
 */
List<byte[]> receiveClientPacketBatch(String clusterId, String vpnConnectionId, int timeoutMs) throws InterruptedException {
    connect(clusterId, vpnConnectionId);
    int budgetMs = Math.max(1, timeoutMs);
    awaitOpen(Math.min(OPEN_WAIT_MS, budgetMs));

    List<byte[]> batch = priorityIncoming.poll();
    if (batch == null) {
        batch = priorityIncoming.poll(Math.min(PRIORITY_GRACE_MS, budgetMs), TimeUnit.MILLISECONDS);
    }
    if (batch == null) {
        batch = incoming.poll();
    }
    if (batch == null) {
        batch = priorityIncoming.poll();
    }
    if (batch == null) {
        batch = incoming.poll(Math.max(1, budgetMs - PRIORITY_GRACE_MS), TimeUnit.MILLISECONDS);
    }
    return batch != null ? batch : new ArrayList<>();
}
/** Closes the current socket (if any) and resets connection state, holding the relay lock. */
void close() {
synchronized (lock) {
closeLocked();
}
}
/**
 * Resets connection flags, discards all queued inbound batches and closes the current socket.
 * Callers in this class invoke it while holding {@code lock}.
 */
private void closeLocked() {
    connectingSinceMs = 0;
    connecting = false;
    open = false;
    priorityIncoming.clear();
    incoming.clear();
    WebSocket current = webSocket;
    if (current != null) {
        try {
            current.close(1000, "relay switch");
        } catch (Exception ignored) {
            // Best-effort close: the socket is being discarded either way.
        }
    }
    webSocket = null;
}
/**
 * Blocks until the socket is open, the connect attempt ends, or the timeout elapses.
 *
 * @param timeoutMs maximum wait in milliseconds; values below 1 are treated as 1
 * @return the value of {@code open} when waiting stops; false if the thread was interrupted
 */
private boolean awaitOpen(long timeoutMs) {
long deadline = System.currentTimeMillis() + Math.max(1, timeoutMs);
synchronized (lock) {
// Only wait while an attempt is actually in flight; listener callbacks call lock.notifyAll().
while (!open && connecting) {
long waitMs = deadline - System.currentTimeMillis();
if (waitMs <= 0) {
break;
}
try {
lock.wait(waitMs);
} catch (InterruptedException e) {
// Restore the interrupt flag so the caller can observe it.
Thread.currentThread().interrupt();
lastError = "interrupted waiting for websocket open";
return false;
}
}
// Distinguish "still connecting after the wait" from errors reported by the listener.
if (!open && "connecting".equals(lastError)) {
lastError = "connecting_timeout";
}
return open;
}
}
/**
 * Builds the ws/wss URL for the packet websocket of the given cluster / VPN connection.
 *
 * The scheme is "wss" when the base URL is https, otherwise "ws". The path is the base URL's path
 * (trailing slashes removed) followed by the path supplied by the fabric service channel, or by
 * the default tunnel path when the channel returns an empty string.
 *
 * @return the URL, or an empty string when it cannot be built (the reason is stored in lastError)
 */
private String webSocketUrl(String clusterId, String vpnConnectionId) {
    try {
        URI uri = URI.create(baseUrl);
        String scheme = "https".equalsIgnoreCase(uri.getScheme()) ? "wss" : "ws";
        // Use the decoded path and user-info: the multi-argument URI constructor quotes '%', so
        // passing the raw (already percent-encoded) components would double-encode them.
        String basePath = uri.getPath();
        String path = basePath == null || basePath.isEmpty() ? "" : trimRight(basePath);
        String fabricPath = fabricServiceChannel.packetPathForBase(baseUrl, clusterId, vpnConnectionId, true);
        if (!fabricPath.isEmpty()) {
            path += fabricPath;
        } else {
            path += "/clusters/" + clusterId + "/vpn-connections/" + vpnConnectionId + "/tunnel/client/packets/ws";
        }
        URI ws = new URI(scheme, uri.getUserInfo(), uri.getHost(), uri.getPort(), path, null, null);
        return ws.toString();
    } catch (Exception e) {
        lastError = e.getClass().getSimpleName() + ": " + e.getMessage();
        return "";
    }
}
/**
 * Websocket callbacks that drive the relay's connection state.
 *
 * Every callback first checks that it belongs to the relay's current socket. Without that check a
 * late callback from a socket that was already replaced or closed would overwrite the state
 * (open/connecting flags, backoff, lastError, inbound queues) of the connection that superseded it.
 *
 * NOTE(review): onClosing is not overridden here, so a close initiated by the remote peer is not
 * acknowledged by this class — confirm whether that is intended.
 */
private final class Listener extends WebSocketListener {
    /**
     * Reports whether {@code socket} is the relay's current socket. Must be called with
     * {@code lock} held.
     *
     * The second clause covers a callback delivered while connect() is still inside
     * newWebSocket(), i.e. before the returned socket has been stored in the field; that state
     * (null field with connecting set) only exists inside connect()'s synchronized block.
     */
    private boolean isCurrentLocked(WebSocket socket) {
        WebSocket current = VpnPacketWebSocketRelay.this.webSocket;
        return socket == current || (current == null && connecting);
    }

    @Override
    public void onOpen(WebSocket webSocket, Response response) {
        synchronized (lock) {
            if (!isCurrentLocked(webSocket)) {
                return;
            }
            open = true;
            connecting = false;
            reconnectAfterMs = 0;
            lastError = "";
            lock.notifyAll();
        }
        Log.i(TAG, "vpn packet websocket opened " + baseUrl);
    }

    @Override
    public void onMessage(WebSocket webSocket, ByteString bytes) {
        List<byte[]> packets = decodePacketBatch(bytes.toByteArray());
        if (packets.isEmpty()) {
            return;
        }
        // Check and enqueue under the lock so a batch from a superseded socket cannot land in
        // queues that closeLocked() has just cleared for the new connection.
        synchronized (lock) {
            if (!isCurrentLocked(webSocket)) {
                return;
            }
            offerIncomingPacketBatch(packets);
        }
    }

    @Override
    public void onClosed(WebSocket webSocket, int code, String reason) {
        synchronized (lock) {
            if (!isCurrentLocked(webSocket)) {
                return;
            }
            open = false;
            connecting = false;
            reconnectAfterMs = System.currentTimeMillis() + 1000;
            lastError = "closed " + code + " " + reason;
            lock.notifyAll();
        }
    }

    @Override
    public void onFailure(WebSocket webSocket, Throwable t, Response response) {
        String responseStatus = "";
        if (response != null) {
            responseStatus = " status=" + response.code();
        }
        String message = (t == null ? "websocket failure" : t.getClass().getSimpleName() + ": " + t.getMessage()) + responseStatus;
        synchronized (lock) {
            if (isCurrentLocked(webSocket)) {
                open = false;
                connecting = false;
                reconnectAfterMs = System.currentTimeMillis() + 3000;
                lastError = message;
                lock.notifyAll();
            }
        }
        // Logged even for a superseded socket; only the state update is skipped in that case.
        Log.w(TAG, "vpn packet websocket failed " + baseUrl + ": " + message);
    }
}
/**
 * Filters a batch down to the packets that can be sent in one frame.
 *
 * Null, empty and oversized packets are skipped. Collection stops at the first packet that would
 * push the batch past the packet-count or encoded-byte limit (each packet costs its length plus a
 * 4-byte prefix); packets after that point are not included.
 *
 * @return a new list, empty when {@code packets} is null
 */
private static List<byte[]> cleanPacketBatch(List<byte[]> packets) {
    List<byte[]> kept = new ArrayList<>();
    if (packets == null) {
        return kept;
    }
    int encodedBytes = 0;
    for (byte[] candidate : packets) {
        boolean usable = candidate != null
                && candidate.length > 0
                && candidate.length <= MAX_SINGLE_PACKET_BYTES;
        if (!usable) {
            continue;
        }
        int nextTotal = encodedBytes + 4 + candidate.length;
        boolean batchFull = kept.size() >= MAX_PACKET_BATCH_PACKETS;
        if (batchFull || nextTotal > MAX_PACKET_BATCH_BYTES) {
            break;
        }
        kept.add(candidate);
        encodedBytes = nextTotal;
    }
    return kept;
}
/**
 * Serializes a batch as a sequence of records, each a 4-byte big-endian length followed by the
 * packet bytes. The batch is passed through {@code cleanPacketBatch} first.
 *
 * @return the encoded bytes; a zero-length array when no packet survives cleaning
 */
private static byte[] encodePacketBatch(List<byte[]> packets) {
    List<byte[]> batch = cleanPacketBatch(packets);
    int size = 0;
    for (byte[] entry : batch) {
        size += entry.length + 4;
    }
    byte[] encoded = new byte[size];
    int cursor = 0;
    for (byte[] entry : batch) {
        int len = entry.length;
        // Narrowing to byte keeps the low 8 bits, which is the same as masking with 0xff.
        encoded[cursor++] = (byte) (len >>> 24);
        encoded[cursor++] = (byte) (len >>> 16);
        encoded[cursor++] = (byte) (len >>> 8);
        encoded[cursor++] = (byte) len;
        System.arraycopy(entry, 0, encoded, cursor, len);
        cursor += len;
    }
    return encoded;
}
/**
 * Parses a payload made of records, each a 4-byte big-endian length followed by that many bytes.
 *
 * Parsing stops at the first record whose length is non-positive, exceeds the single-packet
 * limit, or runs past the end of the payload, and also once the per-batch packet limit is reached.
 * Packets decoded before that point are returned.
 *
 * @return the decoded packets; empty when {@code payload} is null or holds no valid record
 */
private static List<byte[]> decodePacketBatch(byte[] payload) {
    List<byte[]> decoded = new ArrayList<>();
    if (payload == null) {
        return decoded;
    }
    int cursor = 0;
    while (cursor + 4 <= payload.length && decoded.size() < MAX_PACKET_BATCH_PACKETS) {
        int size = ((payload[cursor] & 0xff) << 24)
                | ((payload[cursor + 1] & 0xff) << 16)
                | ((payload[cursor + 2] & 0xff) << 8)
                | (payload[cursor + 3] & 0xff);
        cursor += 4;
        boolean valid = size > 0
                && size <= MAX_SINGLE_PACKET_BYTES
                && cursor + size <= payload.length;
        if (!valid) {
            break;
        }
        byte[] body = new byte[size];
        System.arraycopy(payload, cursor, body, 0, size);
        decoded.add(body);
        cursor += size;
    }
    return decoded;
}
/**
 * Queues an inbound batch, choosing the priority queue when the batch contains a packet flagged
 * by {@code containsTCPControlPacket}. When the chosen queue is full, the oldest entry is
 * discarded and the offer is retried once.
 */
private void offerIncomingPacketBatch(List<byte[]> packets) {
    BlockingQueue<List<byte[]>> queue;
    if (containsTCPControlPacket(packets)) {
        queue = priorityIncoming;
    } else {
        queue = incoming;
    }
    if (queue.offer(packets)) {
        return;
    }
    queue.poll();
    queue.offer(packets);
}
/** Returns true when at least one packet in the batch satisfies {@code isTCPControlPacket}. */
private static boolean containsTCPControlPacket(List<byte[]> packets) {
    if (packets != null) {
        for (byte[] candidate : packets) {
            if (isTCPControlPacket(candidate)) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Reports whether {@code packet} is an IPv4 TCP segment with any of FIN, SYN, RST or ACK set.
 *
 * Non-first IPv4 fragments (fragment offset != 0) are rejected: they do not start with a TCP
 * header, so the byte at the flags position would be arbitrary payload.
 *
 * NOTE(review): the 0x17 mask includes ACK (0x10), so nearly every segment of an established
 * connection matches — confirm that prioritizing those is intended.
 *
 * @return false for null, short, non-IPv4, non-TCP or non-first-fragment packets
 */
private static boolean isTCPControlPacket(byte[] packet) {
    if (packet == null || packet.length < 20 || ((packet[0] & 0xff) >> 4) != 4) {
        return false;
    }
    int ihl = (packet[0] & 0x0f) * 4;
    if (ihl < 20 || packet.length < ihl + 20 || packet[9] != 6) {
        return false;
    }
    // Low 13 bits of bytes 6-7 are the fragment offset; only offset 0 carries the TCP header.
    int fragmentOffset = ((packet[6] & 0x1f) << 8) | (packet[7] & 0xff);
    if (fragmentOffset != 0) {
        return false;
    }
    int flags = packet[ihl + 13] & 0xff;
    return (flags & 0x17) != 0;
}
/** Returns {@code value} without trailing '/' characters; null becomes the empty string. */
private static String trimRight(String value) {
    if (value == null) {
        return "";
    }
    int end = value.length();
    while (end > 0 && value.charAt(end - 1) == '/') {
        end--;
    }
    return value.substring(0, end);
}
}
+15 -2
View File
@@ -6,6 +6,16 @@ This file exists so architecture documents have a stable guardrails reference
inside `docs/architecture`. The operational Codex guardrails remain in
`docs/codex/ARCHITECTURE_GUARDRAILS.md`.
Transport clarification: references in this document to direct worker WSS and
backend gateway fallback belong to the preserved historical RDP service
baseline. They are not the active source of truth for inter-node transport.
Current fabric node-to-node transport is QUIC-only and is defined by
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Node survivability, recovery overlap, and no-manual-access repair rules are
defined by `docs/architecture/FABRIC_NODE_SURVIVAL_AND_RECOVERY_POLICY.md`.
## 1. Preserve the Proven RDP Baseline ## 1. Preserve the Proven RDP Baseline
The following are already proven and must remain stable: The following are already proven and must remain stable:
@@ -16,8 +26,8 @@ The following are already proven and must remain stable:
- detach without killing the remote session - detach without killing the remote session
- reattach without recreating the remote session - reattach without recreating the remote session
- takeover without recreating the remote session - takeover without recreating the remote session
- direct worker WSS data plane - historical direct worker WSS RDP path
- backend gateway fallback - historical backend gateway fallback for the RDP baseline
- C++ RDP Adapter as the active RDP runtime - C++ RDP Adapter as the active RDP runtime
Architecture clarification must not silently weaken this behavior. Architecture clarification must not silently weaken this behavior.
@@ -191,6 +201,9 @@ Updates must support:
- local update cache where approved - local update cache where approved
- OS / architecture specific artifacts under signed release manifests - OS / architecture specific artifacts under signed release manifests
- explicit migration bundles when data structures change - explicit migration bundles when data structures change
- legacy recovery compatibility until the fleet is converged or explicitly
retired
- multi-source artifact retrieval for stranded or NAT-only nodes
Version Storage stores immutable release manifests, artifacts, hashes,
signatures, compatibility metadata, provenance, and approved migration bundles.
@@ -1059,7 +1059,8 @@ accepts a signed/introspected `remote_workspace` service-channel lease on
`remote-workspaces/{resource_id}/streams/{channel_class}`, validates service `remote-workspaces/{resource_id}/streams/{channel_class}`, validates service
class, channel class, selected entry node, and data-plane flow isolation, and class, channel class, selected entry node, and data-plane flow isolation, and
reports access telemetry. It intentionally returns a probe contract with reports access telemetry. It intentionally returns a probe contract with
`payload_flow=not_implemented` for non-empty RDP payloads; this stage proves `payload_flow=validated_only` for empty control probes; non-empty RDP payloads are
rejected with `probe_only required`. This stage proves
the Fabric ingress contract without forwarding desktop frames yet. The live the Fabric ingress contract without forwarding desktop frames yet. The live
smoke is `scripts/fabric/c19d-remote-workspace-entry-ingress-smoke.ps1`. smoke is `scripts/fabric/c19d-remote-workspace-entry-ingress-smoke.ps1`.
+7
View File
@@ -1,5 +1,12 @@
# Data Plane v1 for RDP
Archived status: this document is a historical RDP/WebSocket stage record, not
the current runtime source of truth for transport architecture. The active
fabric transport model is QUIC-only between nodes; see
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Status: DP-3A grayscale full-frame binary render foundation is implemented and smoke-proven on the test Docker environment as of 2026-04-25. DP-3B adaptive quality policy/selection is intentionally paused. The accepted C++ RDP Adapter baseline is the ordered-region path. RDP-Perf-6 makes direct dirty-region binary render explicit with `render.frame.full` / `render.frame.region` RAP2 message types and is build/probe/live-smoke-proven on the test Docker environment as of 2026-04-26. The current test Docker deployment for the RDP Adapter performance path is `rap-rdp-worker:rdp-perf6-dirty-region`. The Stage 5.2 core download data path remains runtime-proven for direct worker WSS and backend gateway fallback. Data-plane and RDP work are paused; the next active focus is Stage C10 Fabric Core / cluster foundation, not another data-plane feature. Status: DP-3A grayscale full-frame binary render foundation is implemented and smoke-proven on the test Docker environment as of 2026-04-25. DP-3B adaptive quality policy/selection is intentionally paused. The accepted C++ RDP Adapter baseline is the ordered-region path. RDP-Perf-6 makes direct dirty-region binary render explicit with `render.frame.full` / `render.frame.region` RAP2 message types and is build/probe/live-smoke-proven on the test Docker environment as of 2026-04-26. The current test Docker deployment for the RDP Adapter performance path is `rap-rdp-worker:rdp-perf6-dirty-region`. The Stage 5.2 core download data path remains runtime-proven for direct worker WSS and backend gateway fallback. Data-plane and RDP work are paused; the next active focus is Stage C10 Fabric Core / cluster foundation, not another data-plane feature.
This document defines the first staged data-plane evolution for the RDP MVP. It does not implement direct worker WebSocket runtime, mesh routing, VPN, QUIC, UDP, WebRTC, relay nodes, or multi-cluster behavior. This document defines the first staged data-plane evolution for the RDP MVP. It does not implement direct worker WebSocket runtime, mesh routing, VPN, QUIC, UDP, WebRTC, relay nodes, or multi-cluster behavior.
@@ -1,5 +1,12 @@
# Direct Worker WSS TLS / PKI
Archived status: this document captures a direct-worker WSS trust design track
and is no longer the primary reference for node-to-node transport. The active
fabric transport model is QUIC-only between nodes; see
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Status: P3.4 trust-model design/prep complete.
This document defines the production trust model for direct worker WSS. It does
@@ -24,6 +24,21 @@ policy allows, host limited control/storage roles when approved, and report
mobile-specific capacity signals such as battery, network type, NAT behavior, mobile-specific capacity signals such as battery, network type, NAT behavior,
foreground/background state, and metered network policy. foreground/background state, and metered network policy.
Node survival and recovery across endpoint moves, NAT-only reachability, legacy
contract overlap, and unavailable manual host access are governed by
`docs/architecture/FABRIC_NODE_SURVIVAL_AND_RECOVERY_POLICY.md`. In
particular, nodes like `ifcm-rufms-s-mo1cr` must remain recoverable through the
fabric/update/recovery plane even when direct host login is unavailable.
Android implementation contract:
- app install/build contains a QUIC bootstrap seed set;
- runtime launch carries a `fabric_bootstrap_config`, not a backend URL;
- user login/profile selection happens over the fabric control channel;
- the Android VPN dataplane is QUIC fabric runtime only; HTTP batch packet
forwarding, WebSocket packet relay, and direct backend packet relay are not
part of the supported runtime path.
## What Was Missing ## What Was Missing
The current implementation proves route leases and production VPN forwarding, The current implementation proves route leases and production VPN forwarding,
@@ -60,8 +75,9 @@ route and stream semantics.
must keep working through cached policy, peer directories, route leases, and must keep working through cached policy, peer directories, route leases, and
local health when central components are degraded. local health when central components are degraded.
7. Mobile nodes are first-class nodes with stricter capability scoring. 7. Mobile nodes are first-class nodes with stricter capability scoring.
8. HTTP forwarding remains a compatibility and emergency fallback, not the 8. QUIC is the single runtime transport between fabric nodes. HTTP/HTTPS may
primary high-speed data plane. serve human-facing download or panel pages, but it is not a node data-plane
fallback and must not carry service packets.
9. There must be no single management service that can seize the fabric. Control, 9. There must be no single management service that can seize the fabric. Control,
storage, update distribution, route authority, and certificate authority are storage, update distribution, route authority, and certificate authority are
fabric roles assigned to eligible nodes and protected by quorum signatures. fabric roles assigned to eligible nodes and protected by quorum signatures.
@@ -73,6 +89,20 @@ route and stream semantics.
the usable candidate locally by policy, reachability, latency, load, and the usable candidate locally by policy, reachability, latency, load, and
trust. trust.
## Transport vs Control API
The system must keep two layers separate in naming, design, and diagnostics:
- `Fabric Transport` means inter-node runtime delivery only. It is QUIC over UDP
and carries leased service-channel/data-plane traffic between nodes.
- `Control API` means human/operator/programmatic management surfaces such as
web-admin, release publication, policy mutation, audit queries, and status
reads. Today that surface is HTTP/JSON and may sit behind HTTPS ingress.
The HTTP Control API is not a fallback transport for node-to-node runtime
traffic. A `409 Conflict` from the backend, a panel page load, or a release
download is control-plane behavior, not fabric transport behavior.
## Distributed Control And Trust ## Distributed Control And Trust
The target fabric behaves like a distributed network, not a client/server The target fabric behaves like a distributed network, not a client/server
@@ -145,6 +175,143 @@ Endpoint state is also distributed:
- Neighbor selection is local and latency/load-aware; the state log announces - Neighbor selection is local and latency/load-aware; the state log announces
facts and policy, not a forced single next hop. facts and policy, not a forced single next hop.
### Fabric Registry Gossip
Moving a service must not break the farm.
`RAP_BACKEND_URL` or any fixed HTTP/API address is only a migration fallback for
old nodes. It is not cluster truth. After bootstrap, a node finds services by
logical role through signed fabric registry records that can be carried by any
reachable peer.
The rule is:
- any node may relay registry knowledge;
- only authorized signatures can create or replace trusted registry truth;
- a new record becomes active only after signature/authority checks and a
successful live probe through the fabric or a policy-approved direct QUIC
candidate;
- older still-valid records remain as fallback until their TTL expires.
Registry record shape:
```text
schema_version: rap.fabric.registry.gossip_record.v1
cluster_id
service: control-api | update-store | update-cache | web-admin | vpn-egress-pool | ...
scope: farm | cluster | organization
organization_id: optional
epoch: monotonic service epoch
generation: optional human/debug generation
issued_at
expires_at
issuer_node_id
issuer_role: control-authority | update-authority | storage-authority | route-authority
endpoints:
- endpoint_id
address: quic://...
transport: direct_quic | relay_quic | reverse_quic
reachability
connectivity_mode
priority / weight
peer_cert_sha256
signatures:
- key_id
issuer_id
role
alg: ed25519
value
```
Acceptance algorithm:
1. Reject records for a different cluster, expired records, future records past
allowed clock skew, unsupported schema, missing endpoints, or non-QUIC
endpoints.
2. Verify the canonical record payload, excluding `signatures`, against the
configured authority set.
3. Check the signer role is allowed for that service and scope.
4. Require quorum where policy says M-of-N; development may use one trusted
signer but must mark that signer as bootstrap/development authority.
5. Store accepted records as `candidate`.
6. Promote `candidate` to `active` only after live-probing at least one endpoint
and verifying the endpoint identity/pin.
7. Prefer higher epoch, then newer issued time, then generation. Do not replace
a live active record with an older record.
8. Keep the previous active record usable as fallback until TTL expiry when a
newer candidate is not yet live-verified.
This is the recovery path for mass moves. If every known service endpoint moves
at once, the operator or a control-authority node only has to deliver a signed
registry record to one reachable fabric node. That node validates it, probes it,
promotes it, and gossips it onward. User/mobile/candidate nodes may carry the
record, but cannot make it authoritative unless their role certificate permits
that service/scope.
Service classes that must use this registry before production hardening:
- `control-api`: heartbeat, auth/profile control projection, node registration,
policy/snapshot fetch.
- `update-store`: signed release manifests and compatibility windows.
- `update-cache`: artifact mirrors close to nodes.
- `web-admin`: management UI/API ingress replicas.
- `vpn-egress-pool`: user-visible exit pools; users see pools, not backing
nodes.
Legacy endpoint compatibility is allowed only for rolling migration:
- Old nodes may use their baked HTTP/control URL only to fetch a new version or
a signed registry bootstrap record.
- New nodes must treat fixed URLs as fallback hints, not as authority.
- Old code is removed only after every live node reports a version that supports
signed registry gossip and service discovery by role.
Listener configuration is split into bind sockets and reachability candidates:
- `listen_addr` is what the local process binds, for example
`0.0.0.0:18080` on `home-1`.
- `endpoint_candidates` is the ordered set of addresses other nodes may try.
A single node can publish LAN addresses, addresses on several network
adapters, STUN/reflexive addresses, and multiple public NAT forwards from
different providers.
- Public NAT forwards are modeled as candidates with metadata, not as a
replacement for the internal bind address. Example:
`quic://94.141.118.222:19199 reachability=public connectivity=direct
provider=isp1 maps_to=192.168.200.85:18080`.
- A candidate may be valid only from outside the NAT. Same-LAN hairpin failure
is not a proof that the public candidate is broken; verification must be
scoped to an external peer or remote probe.
- The route builder scores candidates by reachability, measured latency, loss,
load, policy, and verification freshness. If one provider or interface fails,
the node keeps the same node identity and republishes a new candidate epoch.
## Install Artifact Bootstrap Contract
Every installable artifact is a node image plus a bootstrap seed set.
This applies to Android, Docker, Linux services, and Windows services. The seed
set is baked into the artifact or delivered beside it as signed install
metadata. It is not a single backend URL and not a management server choice. It
is a bounded list of known fabric endpoint candidates that may be reachable from
different network positions:
- public QUIC candidates, for example `usa-los-1` or externally reachable
`home-1`;
- private/LAN QUIC candidates, for example Docker-test or home LAN nodes;
- closed-site candidates that have no Internet route themselves but can reach a
neighboring fabric node;
- optional pinned certificate hashes or authority descriptors for high-trust
entry candidates.
On first start the installed node tries the seed set, joins through any reachable
peer, registers as a candidate node with minimal rights, and then receives
signed peer-directory, role, update, and policy state through the fabric. If a
node is installed in an isolated network, it can still become visible and usable
when at least one nearby seed node can route onward to the rest of the fabric.
User login on Android is only identity/profile selection for the `vpn-client`
service; the underlying phone node already exists and participates in the
fabric with candidate permissions.
## Node Roles ## Node Roles
Initial role vocabulary: Initial role vocabulary:
@@ -172,7 +339,7 @@ uplink stability, foreground state, and user cost policy.
Nodes must advertise capability facts in heartbeats and peer updates: Nodes must advertise capability facts in heartbeats and peer updates:
- supported fabric protocol versions; - supported fabric protocol versions;
- supported transports: UDP/QUIC, TCP, WebSocket, HTTPS fallback; - supported transport: UDP/QUIC;
- NAT type and reachability; - NAT type and reachability;
- measured RTT/loss/jitter/bandwidth to peers and entry candidates; - measured RTT/loss/jitter/bandwidth to peers and entry candidates;
- CPU, memory, queue depth, file descriptor/socket pressure; - CPU, memory, queue depth, file descriptor/socket pressure;
@@ -184,9 +351,8 @@ Nodes must advertise capability facts in heartbeats and peer updates:
## Fabric Data Session V1 ## Fabric Data Session V1
The first practical protocol step is a persistent binary data session. It may The first practical protocol step is a persistent binary QUIC data session.
initially run over WebSocket/TCP for faster delivery, but the framing must be The framing stays service-neutral, but the runtime transport is QUIC only.
transport-neutral so the same protocol can move to QUIC/UDP.
Minimum frame set: Minimum frame set:
@@ -338,69 +504,36 @@ Deliverables:
### Stage FNP-3: WebSocket/TCP Compatibility Transport ### Stage FNP-3: WebSocket/TCP Compatibility Transport
Status: started with a transport-neutral `io.Reader`/`io.Writer` frame loop, Status: retired as a migration-only stage.
WebSocket frame adapter in `agents/rap-node-agent/internal/fabricproto`, and a
gated/authenticated mesh smoke endpoint/client at `/mesh/v1/fabric/session/ws`. This stage existed to bootstrap binary frame semantics before QUIC routing and
`rap-host-agent fabric-session-smoke` provides the first operator smoke command carrier reuse were ready. It introduced the transport-neutral frame loop,
and can pass signed fabric-session authority payload/signature headers for session-shaped packet mapper, and early smoke tooling. That work was useful as
authority-pinned nodes. scaffolding, but it is no longer the target runtime.
Node-agent exposes the endpoint only when `RAP_MESH_FABRIC_SESSION_ENABLED` /
`-mesh-fabric-session-enabled` is set, and reports the enabled endpoint in Current rule:
heartbeat metadata.
`mesh-live-smoke` includes a fabric-session `PING`/`PONG` check alongside the - WebSocket/TCP fabric-session transport is not part of the supported node
existing route and test-service probes. Mesh client code now has a reusable dataplane.
`FabricSessionClient` for multiple frame exchanges over one WebSocket session, - QUIC/UDP is the only supported runtime carrier between fabric nodes.
plus a pump mode with outbound/inbound queues for asynchronous stream traffic. - Old WebSocket/TCP smoke helpers are being removed; migration/debug tooling
Live smoke verifies two `PING`/`PONG` round trips on the same connection. must move to QUIC-native smoke and recovery paths.
`vpnruntime` has a binary VPN packet-batch mapper for `FrameData` payloads so - Any routing, heartbeat, registry, peer probe, or service dataplane logic must
packet delivery can move away from JSON production envelopes in a gated mode. reject WebSocket/TCP carriers as non-QUIC transport, not treat them as a
`FabricSessionPacketTransport` now adapts that mapper to the existing valid alternate path.
`PacketTransport` interface and can demultiplex inbound DATA frames into the
VPN packet inbox by stream id. What survives from this stage is the service-neutral frame model and the
`mesh-live-smoke` now sends a real VPN packet batch through `FabricSessionPacketTransport` mapping, which now ride on QUIC carriers instead
`FabricSessionPacketTransport` over the WebSocket fabric session and requires a of a WebSocket fallback.
stream ACK from the remote node.
Mesh has a peer session manager that reuses one pump per peer endpoint, giving
VPN transport selection a stable place to acquire long-lived fabric sessions.
Node config now carries a separate gated
`RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED` switch and heartbeat report for the
binary VPN packet transport, keeping endpoint exposure and VPN dataplane
rollout independently controllable.
When the VPN fabric-session switch is enabled, node-agent now attempts to use a
long-lived peer session for gateway packet transport and falls back to the
existing HTTP production envelope path when the peer session is unavailable.
Peer session reuse now evicts closed pumps before reuse, so failed WebSocket
sessions can be reopened on the next transport acquisition.
Heartbeat telemetry includes peer session manager counters for active sessions,
reuses, opens, closed-pump evictions, and explicit close operations.
The mesh package now exposes a service-neutral `FabricTransport` abstraction;
the current WebSocket carrier implements it as `WebSocketFabricTransport`, so
future QUIC/UDP transport can be added without changing VPN/RDP/HTTP services.
`QUICFabricTransport` now implements the same interface and carries the same
binary `fabricproto` frames over a QUIC stream, with local smoke coverage for
`PING`/`PONG` and DATA/ACK.
Carrier selection understands QUIC transport labels and `quic://host:port`
endpoints while preserving WebSocket as the default fallback.
`QUICFabricServer` provides the matching node-side QUIC listener for accepting
fabric streams and running the same session frame handler as other carriers.
Node-agent can now gate the QUIC listener with
`RAP_MESH_QUIC_FABRIC_ENABLED` / `RAP_MESH_QUIC_FABRIC_LISTEN_ADDR`, report it
in heartbeat metadata, and pass the setting through host-agent install/update
profiles.
`mesh-live-smoke` verifies the QUIC carrier by starting a temporary QUIC fabric
server and requiring a `PING`/`PONG` round trip over `QUICFabricTransport`.
Nodes now advertise enabled QUIC fabric listeners as `direct_quic` fast-path
endpoint candidates, and endpoint ranking prefers QUIC over WebSocket/HTTPS
compatibility candidates for fabric sessions.
VPN fabric-session gateway transport now consumes ranked endpoint candidates, VPN fabric-session gateway transport now consumes ranked endpoint candidates,
so dataplane sessions can select QUIC fast-path candidates and fall back to so dataplane sessions can select QUIC fast-path candidates and refuse non-QUIC
legacy peer endpoints when the control plane has not published candidates yet. peer endpoints when the control plane has not published valid candidates yet.
The temporary self-signed QUIC listener advertises its SHA-256 certificate The temporary self-signed QUIC listener advertises its SHA-256 certificate
fingerprint in endpoint metadata, and the QUIC client can pin that fingerprint fingerprint in endpoint metadata, and the QUIC client can pin that fingerprint
instead of disabling verification while the cluster CA path is being finished. instead of disabling verification while the cluster CA path is being finished.
VPN fabric-session dialing now walks all ranked endpoint candidates before VPN fabric-session dialing now walks all ranked endpoint candidates before
falling back to the legacy peer endpoint, so a failed QUIC candidate does not declaring the target unavailable, so a failed QUIC candidate does not silently
block WebSocket/HTTPS compatibility transport. re-enable WebSocket/HTTPS compatibility transport.
Successful VPN fabric-session dialing logs the selected candidate, transport, Successful VPN fabric-session dialing logs the selected candidate, transport,
certificate pin usage, and remaining fallback count for phone-side diagnostics. certificate pin usage, and remaining fallback count for phone-side diagnostics.
Heartbeat telemetry now includes VPN fabric-session dial counters for attempts, Heartbeat telemetry now includes VPN fabric-session dial counters for attempts,
@@ -416,8 +549,8 @@ Endpoint health observations are now emitted as a bounded standalone heartbeat
report (`rap.vpn_fabric_endpoint_health_report.v1`) so control plane can ingest report (`rap.vpn_fabric_endpoint_health_report.v1`) so control plane can ingest
candidate feedback without parsing the transport diagnostics blob. candidate feedback without parsing the transport diagnostics blob.
VPN fabric-session transport telemetry is carrier-neutral VPN fabric-session transport telemetry is carrier-neutral
(`fabric_session_binary_frames`) and reports QUIC/WebSocket as available (`fabric_session_binary_frames`) and reports QUIC selection plus non-QUIC
carriers instead of describing the dataplane as WebSocket-only. candidate rejection instead of describing the dataplane as WebSocket-capable.
Endpoint health observations are pruned in-memory by age and count before Endpoint health observations are pruned in-memory by age and count before
snapshot/report generation, preventing long-running nodes from accumulating snapshot/report generation, preventing long-running nodes from accumulating
unbounded candidate history. unbounded candidate history.
@@ -583,10 +716,10 @@ propagated by host-agent install profiles.
Deliverables: Deliverables:
- carry binary frames over one persistent WebSocket/TCP connection; - carry binary frames over one persistent QUIC fabric session;
- replace high-frequency `/mesh/v1/forward` packet POST usage for VPN routes in - replace high-frequency `/mesh/v1/forward` packet POST usage for VPN routes in
a gated mode; a gated mode;
- keep HTTP forwarding as fallback. - remove HTTP/WebSocket packet forwarding from the supported dataplane.
### Stage FNP-4: Android As Mobile Fabric Node ### Stage FNP-4: Android As Mobile Fabric Node
@@ -609,12 +742,12 @@ Deliverables:
### Stage FNP-6: QUIC/UDP Transport ### Stage FNP-6: QUIC/UDP Transport
Status: started with `QUICFabricTransport` in `internal/mesh`. Status: active runtime baseline in `internal/mesh`.
Deliverables: Deliverables:
- implement QUIC transport for Fabric Data Session V1; - implement QUIC transport for Fabric Data Session V1;
- preserve WebSocket/TCP as fallback; - keep QUIC/UDP as the only supported inter-node runtime transport;
- test 4G/Wi-Fi transition and NAT behavior; - test 4G/Wi-Fi transition and NAT behavior;
- benchmark throughput, latency, and recovery against current HTTP forwarding. - benchmark throughput, latency, and recovery against current HTTP forwarding.
@@ -0,0 +1,183 @@
# Fabric Area And Peer Stability Model
Status: active design correction.
This document replaces the oversimplified rule "every node must keep 3
connections" with a stability model based on failure domains ("areas"),
multi-path reachability, and live peer memory.
## 1. Why the old "3 connections" rule is not enough
A raw connection count is too weak as a resilience rule.
Three links are not equivalent when:
- all three peers are in the same private network;
- all three depend on the same NAT or relay path;
- all three depend on the same public ingress;
- all three are relay-ready but not direct-ready;
- all three are stale observations rather than recently verified paths.
Therefore the fabric must not use a single scalar count as the stability
criterion.
## 2. Area
Introduce the concept of an `area`.
An area is a failure domain with high mutual reachability and shared external
risk. Examples:
- `home` - nodes in the same home/private site
- `test` - nodes in the same test Docker/LAN site
- `usa` - a public node in a remote Internet site
- `ifcm` - a separate NAT/domain behind another administrative boundary
An area can be derived from:
- operator-declared site/area label;
- shared private address space or local interface group;
- shared public egress/NAT identity;
- shared administrative host or cluster.
The area label must be part of live node metadata and endpoint candidate
metadata.
## 3. Stability objective
Each node should maintain a working peer set with diversity, not just count.
### 3.1 Minimum stable peer objective
For an ordinary production node:
- at least `2` recently verified direct-ready peers overall;
- at least `2` distinct external areas represented in the ready set when more
than one external area exists;
- at least `1` persistent recovery-capable path outside the local area;
- at least `1` additional relay-ready or rendezvous-capable path outside the
primary recovery path.
For an area gateway or strategically important public node:
- at least `3` direct-ready peers overall;
- at least `2` distinct external areas represented in the direct-ready set;
- at least `1` extra recovery path that does not share the same public ingress
or NAT dependency.
For a node in a tiny fleet where only one external area currently exists:
- the system must report `reduced-diversity mode`, not pretend the target is
fully satisfied.
### 3.2 What counts as "ready"
`ready` means:
- recently verified;
- usable for immediate QUIC route establishment;
- not only a historical candidate;
- not blocked on stale relay replacement;
- not only a compatibility `Control API/downloads` overlap path.
`relay_ready` does not replace `direct_ready`.
## 4. What a node must remember
Every node must keep a live working set, not just a tiny current-peer list.
Minimum retained peer memory:
1. all currently healthy nodes in the fleet, when the fleet is small enough;
2. for larger fleets, a bounded full directory plus prioritized recent working
peers;
3. for every known node:
- node id
- area
- role summary
- latest verified direct candidates
- latest verified relay/rendezvous candidates
- last success timestamp
- last failure class
- NAT / ingress dependency hints
- cert pin / authority compatibility metadata
For the current fleet size, every node should retain the full directory of all
other nodes. At 6-8 nodes, fleet scale is not a valid reason to remember less.
## 5. Probe strategy
The node should not aggressively probe every possible path at full frequency.
It should maintain a layered strategy.
### 5.1 Hot set
Always keep a hot set of:
- current direct-ready peers;
- one recovery peer outside the local area;
- one alternate peer per external area.
These should be revalidated frequently.
### 5.2 Warm set
Maintain a warm set of:
- previously successful peers;
- peers from underrepresented areas;
- peers that would restore diversity if a hot peer fails.
These should be revalidated on a slower cadence and promoted when diversity or
direct-ready count drops.
### 5.3 Cold directory
Retain the full known directory and signed registry records, even if not
actively probed at the same rate.
## 6. Failure handling
When a direct-ready peer is lost:
1. do not merely replace it with the numerically cheapest peer;
2. prefer restoring:
- area diversity
- independent ingress diversity
- direct-ready count
3. only then fall back to relay-ready stabilization if direct replacement is
not currently available.
## 7. Implications for the current fleet
Current area mapping should be treated approximately as:
- `home`: `home-1`, `home-2`, `home-3`
- `test`: `test-1`, `test-2`, `test-3`
- `usa`: `usa-los-1`
- `ifcm`: `ifcm-rufms-s-mo1cr`
Under this model:
- a node in `home` should avoid satisfying its minimum peer objective using
only `home` peers plus one relay;
- `usa-los-1` and `ifcm-rufms-s-mo1cr` should both maintain direct-ready links
that span at least two external areas when possible;
- a fleet-wide alert should trigger when a node loses cross-area diversity even
if its total peer count still looks healthy.
## 8. Required implementation changes
1. Add `area` to node metadata and endpoint candidate metadata.
2. Track peer readiness by area, not only total count.
3. Separate:
- `direct_ready_count`
- `relay_ready_count`
- `external_area_ready_count`
- `independent_ingress_ready_count`
4. Alert on:
- zero recovery path outside the local area
- direct-ready deficit
- area diversity deficit
- registry resolution deficit
5. Preserve a full node directory for the current small fleet.
@@ -289,7 +289,10 @@ Production fabric-core migration boundary:
LAN/interface QUIC, STUN reflexive `ice_quic`, reverse/outbound-only, and LAN/interface QUIC, STUN reflexive `ice_quic`, reverse/outbound-only, and
`relay_quic` fallback. Candidate metadata carries `local_segment_id`, `relay_quic` fallback. Candidate metadata carries `local_segment_id`,
`nat_group_id`, `stun_server`, `ice_foundation`, `relay_node_id`, and `nat_group_id`, `stun_server`, `ice_foundation`, `relay_node_id`, and
`relay_endpoint` when configured. `relay_endpoint` when configured. When a relay endpoint is the first physical
QUIC hop, its advertised certificate fingerprint must survive route planning
so public-IP relay paths can verify the relay node by pin instead of falling
back to hostname/IP SAN matching.
- Endpoint candidate scoring is QUIC-mode only. It ranks `direct_quic`, - Endpoint candidate scoring is QUIC-mode only. It ranks `direct_quic`,
`lan_quic`, `ice_quic`, `reverse_quic`, and `relay_quic` using freshness, `lan_quic`, `ice_quic`, `reverse_quic`, and `relay_quic` using freshness,
health observations, latency, reliability, region, policy tags, and live health observations, latency, reliability, region, policy tags, and live
@@ -0,0 +1,179 @@
# Fabric Live Audit 2026-05-18
Status: live operational audit of the current fabric. This document records the
real state observed on 2026-05-18 and explicitly calls out where runtime
behavior still differs from the target architecture.
## Current confirmed state
- Inter-node transport for the live node-agent fleet is `QUIC over UDP`.
- The active node set is converged on `0.2.321-directreadytarget`:
  - `home-1`
  - `home-2`
  - `home-3`
  - `test-1`
  - `test-2`
  - `test-3`
  - `usa-los-1`
  - `ifcm-rufms-s-mo1cr`
- `ifcm-rufms-s-mo1cr` recovered through the compatibility recovery path and is
no longer stale.
## Why TCP traffic is still visible
Visible TCP traffic is not coming from the inter-node fabric transport. It is
coming from the temporary compatibility recovery overlap that is still active.
Observed live listeners:
- `docker-test`
- `19191/tcp` - compatibility `Control API/downloads` bridge
- `18080/tcp` - web-admin
- `18090/tcp` - release files
- `18121/tcp` - backend Control API
- `19132/udp`, `19133/udp`, `19134/udp` - QUIC fabric listeners
- `usa-los-1`
- `19131/udp` - QUIC fabric listener
- `19191/tcp` - external compatibility bridge currently held open so legacy
recovery contracts can still reach `Control API/downloads`
Therefore:
- `TCP` is still present by design for recovery overlap.
- `UDP/QUIC` is the current node-to-node transport.
- The statement "the fabric is fully UDP-only" is not yet true at the full
system level while `19191/tcp` compatibility recovery remains enabled.
## Why nodes were still falling away
### 1. Nodes do not yet operate from a fully active signed registry gossip plane
Observed on the live `ifcm-rufms-s-mo1cr` heartbeat:
- `fabric_registry_runtime_report.status = candidate_only`
- `resolved_service_count = 0`
- `resolved_services.control-api = no_active_record`
- `resolved_services.update-store = no_active_record`
- `resolved_services.update-cache = no_active_record`
This means the current runtime still depends on compatibility control URLs more
than the target architecture allows. The node is alive in the fabric, but not
yet operating from a fully resolved active registry view.
### 2. Legacy control/download contracts are still real dependencies
Observed on the live `ifcm-rufms-s-mo1cr` heartbeat after recovery:
- `mesh_outbound_session_report.control_plane_url = http://vpn.cin.su:19191/api/v1`
This confirms the root recovery lesson:
- a NAT node without manual host access was still anchored to the old recovery
contract;
- until that contract was temporarily restored, the node could not advance;
- the node did not disappear because QUIC failed; it disappeared because the
recovery/control overlap was removed before the node had converged.
### 3. Direct peer resilience is still below the intended threshold
Observed from live heartbeat metadata:
- `ifcm-rufms-s-mo1cr`
- `peer_connection_ready = 2`
- `peer_connection_relay_ready = 3`
- `target_ready_peers = 3`
- `usa-los-1`
- `peer_connection_ready = 1`
- `peer_connection_relay_ready = 5`
- `target_ready_peers = 3`
This means the direct-path resilience target is not satisfied yet, even though
the nodes are healthy.
The practical reason is simple:
- the cluster has only a small number of externally reachable direct QUIC
endpoints;
- some nodes still advertise only private/LAN-reachable direct candidates;
- relay-ready adjacency is masking direct peer deficit, but it does not replace
the requirement for at least three direct-ready peers.
### 4. Observability is still heterogeneous
Live heartbeat coverage is inconsistent:
- `test-*`, `ifcm-rufms-s-mo1cr`, and `usa-los-1` emit rich `c17z20` heartbeat
metadata with endpoint, peer recovery, and registry sections.
- `home-*` nodes currently do not expose the same full sections in their latest
heartbeat rows.
This means operator visibility is uneven and the documentation must not imply
uniform live introspection across every node today.
## What is true right now
1. The fleet is converged on one live node-agent version.
2. QUIC/UDP is the actual node-to-node transport.
3. Compatibility `19191/tcp` is still required for recovery overlap.
4. Signed registry gossip is not yet the sole active discovery/control source.
5. The "at least 3 direct-ready peers per node" resilience target is not yet
met for all externally significant nodes.
## Operational rule until the next audit
Do not remove the compatibility `19191/tcp` recovery overlap while any of the
following remain true:
- any live node still reports a `control_plane_url` on the `19191` contract;
- any live node has `fabric_registry_runtime_report.status != active`;
- any externally significant node has fewer than 3 direct-ready peers;
- any node can only recover through legacy `Control API/downloads` overlap.
## Required next work
### A. Finish signed registry activation
Each node must be able to resolve active records for at least:
- `control-api`
- `update-store`
- `update-cache`
without falling back to the `19191` compatibility contract.
### B. Promote full direct endpoint dissemination
All nodes with public reachability must advertise every valid public direct QUIC
endpoint, and nodes must retain enough live peer memory to reconnect without
operator intervention.
### C. Enforce the direct-ready floor as a live alert
If a node has fewer than 3 direct-ready peers, this must remain a real
operational alert even when relay-ready peers exist.
### D. Normalize heartbeat observability
Every production node must emit the same minimum audit surface:
- endpoint candidates
- peer recovery counts
- registry runtime state
- update runtime state
without mixing rich and reduced heartbeat schemas across the fleet.
### E. Replace the naive peer-count rule
The live fleet shows that a plain "3 links per node" rule is not a sufficient
resilience model.
The current corrective design is documented in
[FABRIC_AREA_AND_PEER_STABILITY_MODEL.md](FABRIC_AREA_AND_PEER_STABILITY_MODEL.md)
and introduces:
- `area` as a failure-domain label;
- direct-ready vs relay-ready separation;
- cross-area diversity requirements;
- full-directory retention for small fleets.
@@ -0,0 +1,427 @@
# Fabric Node Survival And Recovery Policy
Status: active architecture policy.
This document defines the non-negotiable survival, compatibility, and recovery
rules for Secure Access Fabric nodes. It exists because losing a node is not an
acceptable operating model once the fabric grows beyond a small manually
maintained fleet.
Reference incident:
- `ifcm-rufms-s-mo1cr` is the canonical recovery case.
- The node is behind NAT.
- There is no direct administrative access to the Windows host.
- The node must remain recoverable through the fabric/update/recovery plane
without relying on manual host login.
The latest live recovery evidence for this case is documented in
[FABRIC_LIVE_AUDIT_2026-05-18.md](FABRIC_LIVE_AUDIT_2026-05-18.md).
This policy applies to Linux, Windows, Android, containerized nodes, and future
node types.
## 1. Core Decision
The fabric must be able to lose:
- old API endpoints;
- old artifact URLs;
- previous public IP addresses;
- previous NAT mappings;
- previous relay nodes;
- previous route-authority replicas;
- previous update-cache replicas;
- old service locations;
- operator access to the host OS;
- the current physical location of a workload;
- part of the cluster.
And still keep the node recoverable.
Manual repair is allowed as an emergency tool. It must not be the default
survival strategy.
## 2. Non-Negotiable Invariants
### 2.1 Node Identity Must Survive
A recoverable node must preserve:
- `node_id`;
- node keypair or key reference;
- pinned cluster authority / quorum descriptor;
- last accepted signed registry records;
- last accepted bootstrap seed set;
- last known good update policy;
- last known good workload desired state;
- rollback metadata;
- recovery audit trail.
Reinstall or repair must prefer preserving local state. Identity reset is a
high-risk operator action, not the default repair path.
### 2.2 Compatibility Must Stay Until Recovery Is Complete
Any change to the fabric must keep older nodes recoverable until one of these
is true:
1. every node has confirmed the new contract; or
2. the missing nodes were manually retired, revoked, or explicitly accepted as
lost.
This applies to:
- update plan formats;
- signed registry schemas;
- artifact install types;
- authority signature envelopes;
- bootstrap config formats;
- recovery seed formats;
- host-agent / updater runtime contracts;
- control endpoints needed only for migration.
The rule is strict: do not delete the old recovery format while nodes that may
still need it remain unrecovered.
### 2.3 QUIC-Only Transport Does Not Mean Single Bootstrap Location
Node-to-node runtime transport remains QUIC over UDP only.
That does not permit:
- one bootstrap address;
- one update mirror;
- one registry carrier;
- one ingress node;
- one relay;
- one control replica.
QUIC is the transport. Survivability requires many signed ways to discover the
current valid QUIC endpoints.
### 2.4 No Single Service May Own Recovery
Recovery must not depend on one:
- backend URL;
- DNS name;
- HTTP ingress;
- update repository host;
- relay node;
- cluster admin node.
Any of those may disappear while the node is still healthy enough to recover.
## 3. Required Recovery Layers
### 3.1 Embedded Bootstrap Seed Set
Each installable node package must contain a bounded bootstrap seed set:
- multiple seed nodes;
- public and private candidates where appropriate;
- QUIC endpoint candidates only;
- signed bootstrap metadata;
- expiry / epoch rules;
- optional organization / cluster scope constraints.
The bootstrap seed set is only the first door, not cluster truth.
### 3.2 Signed Registry Gossip
After bootstrap, a node must learn current service locations through signed
fabric registry records that can be carried by any reachable peer.
Required properties:
- multiple records per service;
- quorum or otherwise policy-approved signatures;
- monotonic epoch/generation;
- expiry and freshness checks;
- live probe before promotion;
- ability to accept newer records from a reachable neighbor even when old
origins are gone.
### 3.3 Outbound-Only Recovery Attachment
A node behind NAT or in passive mode must be recoverable through an outbound
attachment.
Required behaviors:
- the node can maintain at least one long-lived outbound QUIC control channel;
- that channel survives IP changes by reconnecting through any remaining seed or
signed registry endpoint;
- the node may receive updated registry truth, update triggers, workload
changes, and recovery instructions over that channel;
- the fabric must not require inbound TCP/UDP reachability to repair the node.
### 3.4 Local Recovery Agent Boundary
The node must have a minimal recovery-capable local agent boundary that is
separate from ordinary service workloads.
It must be able to:
- validate signed update plans;
- download artifacts from multiple mirrors;
- stage replacement binaries;
- restart node-agent or host-agent tasks;
- rollback to previous binaries;
- swap to new signed registry/bootstrap records;
- emit recovery status when transport returns.
If node workloads fail, this local recovery boundary must still exist.
### 3.5 Multi-Source Artifact Delivery
Artifacts must be retrievable from more than one source:
- local cached file;
- cluster update-cache;
- organization-local cache if policy allows;
- public or internet-reachable mirror;
- neighbor-assisted relay transfer over the fabric.
A node must not become unrecoverable because one artifact hostname or one
download service disappeared.
### 3.6 Trigger And Subscription Plane
Polling alone is not enough for very large fleets.
Required model:
- nodes may still perform slow fallback polling;
- primary update notification uses subscription/signal delivery;
- update-cache or registry service can repeatedly signal pending updates until
acknowledged;
- signals are idempotent;
- signals do not require the old control endpoint to remain alive.
## 4. Update Safety Rules
### 4.1 Upgrade Contracts
Every release that changes recovery-critical contracts must explicitly declare:
- minimum supported old version;
- maximum tolerated skew;
- whether migration is rolling-safe;
- whether the node must first update host-agent or node-agent;
- rollback compatibility;
- whether old bootstrap/registry envelopes remain accepted.
### 4.2 Two-Key Rule For Breaking Changes
Do not simultaneously break:
- discovery of where to get the update; and
- ability to understand the update once found.
At least one of those must remain compatible until fleet convergence or
explicit retirement.
### 4.3 Old Artifact Retention
Recovery-critical artifact versions must remain available until:
- all nodes have moved past them; or
- the remaining nodes are revoked/retired and recorded as intentionally lost.
Do not garbage-collect the last working host-agent or node-agent build for an
unrecovered population.
### 4.4 Install Type Continuity
If historical nodes request different install types for the same product
(`windows_binary`, `windows_service`, `native`, `linux_binary`, etc.), recovery
planning must keep compatibility aliases until the fleet converges.
The fabric must not strand nodes on an install-type naming mismatch.
### 4.5 Legacy Recovery Contract Drift Must Be Treated As A Blocking Risk
A stale node may be in a state where:
- a compatible recovery artifact exists under the current registry; but
- the last local updater/host-agent status still says `no_matching_artifact` or
an equivalent legacy contract failure.
This means the node is not merely late with a heartbeat. It is running an older
recovery planner contract and may still depend on:
- historical install-type aliases;
- older artifact matching semantics;
- older update-plan interpretation rules;
- overlap in signed registry / bootstrap envelopes.
This condition must be classified as `legacy recovery contract drift` and must
block compatibility removal the same way an artifact gap does.
Operationally this also means:
- the node requires a `recovery bridge`;
- the cluster enters `bridge hold active` for compatibility-removal decisions;
- `bridge hold` remains active until the node reports a recovery-compatible
status on the current contract or the operator explicitly retires the node;
- when a compatible artifact and target mapping already exist, the node should
be classified as `bridge replay ready`, meaning the system can replay the
legacy-compatible update plan as soon as the node regains an outbound control
cycle;
- operator tooling should expose a canonical `bridge replay plan` per node so
recovery replay uses the same signed update-plan logic as normal updates;
- compatibility aliases / overlap must remain enabled for that node population;
- dashboards and rollout guards must show this separately from ordinary
`waiting recovery heartbeat`.
Canonical example:
- `ifcm-rufms-s-mo1cr` is stale;
- the current backend can match a Windows-compatible host-agent artifact;
- the last host-agent report still says `no_matching_artifact`;
- therefore the node must be treated as a legacy recovery-contract blocker, not
merely as a delayed heartbeat.
## 5. Service And Location Mobility Rules
Moving a service must not strand nodes that only know the old location.
Required pattern:
1. publish new signed registry records;
2. keep old records valid during overlap;
3. allow any reachable peer to relay the new records;
4. live-probe and promote the new endpoints;
5. only then retire the old location;
6. keep enough overlap for slow or partitioned nodes to catch up.
This applies to:
- control-api replicas;
- update-cache/update-store replicas;
- web/admin ingress replicas;
- relay/rendezvous nodes;
- service-channel endpoints.
## 6. Failure Classes The Fabric Must Tolerate
The design must explicitly handle all of these:
- node behind NAT with only outbound connectivity;
- several nodes behind one NAT/local segment;
- node changes public IP;
- node changes private IP;
- old DNS/URL becomes dead;
- artifact mirror disappears;
- control ingress disappears;
- relay disappears;
- update install fails halfway;
- binary staged but restart fails;
- old task/service name changes;
- local disk is nearly full;
- time skew causes signature freshness risk;
- authority rotates;
- route authority replica disappears;
- state directory survives but binary is broken;
- binary survives but state directory is partly stale;
- node reboots during update;
- only one peer still knows the new registry truth;
- node is partitioned for a long time and rejoins later;
- platform removes legacy support too early;
- operator has no shell/RDP/WinRM/SSH access to the host.
## 7. Required Local State And Journaling
The node local state store must retain at least:
- active and previous signed registry records;
- active and previous bootstrap seeds;
- last successful update plan per product;
- last applied artifact hash/version;
- last rollback candidate;
- last successful service endpoints used for update/control;
- pending trigger generation;
- recovery attempts with timestamps and reasons;
- last known good runtime command line / task/unit identity;
- last known workload desired states.
Writes must be atomic. A power loss must not leave the node with zero valid
state.
## 8. Observability And Fleet Safety Rules
The control plane must make invisible-recovery risk explicit.
It must surface:
- nodes with stale heartbeat but recent updater activity;
- nodes with no working compatible recovery artifact;
- nodes whose pinned registry/bootstrap epoch is too old;
- nodes whose only known artifact URL is dead;
- nodes whose desired state requires a contract they cannot parse;
- nodes whose local agent version is below the minimum recovery floor;
- nodes whose last successful contact depended on a single service replica.
Cluster-wide changes that would strand such nodes must be blocked or require an
explicit recovery-admin override.
## 9. Release And Migration Checklist
Before deleting old code, old formats, or old endpoints, verify all of these:
1. every active node has confirmed a compatible version; or the remaining nodes
are explicitly marked for manual retirement/recovery;
2. host-agent and node-agent recovery paths both have matching artifacts;
3. bootstrap/registry overlap exists for the migration window;
4. at least two independent artifact sources remain reachable;
5. signed registry gossip can carry the new locations without the old API
hostname;
6. rollback artifacts are still available;
7. install type aliases remain for historical agents where needed;
8. NAT/passive/outbound-only nodes were explicitly tested;
9. stale-node risk report is empty or consciously accepted by recovery-admin;
10. removal of legacy support is documented with the exact cutoff conditions.
## 10. `ifcm-rufms-s-mo1cr` Rule
`ifcm-rufms-s-mo1cr` is the standing reference case for future work.
For this node class, the platform must assume:
- the host is behind NAT;
- the node may only keep outbound channels;
- no direct Windows administrative access exists;
- old discovery endpoints may disappear;
- only the fabric/update/recovery plane can save the node.
Any future transport, update, authority, bootstrap, registry, or workload
change must be reviewed against this question:
> If `ifcm-rufms-s-mo1cr` is still on the older contract and we cannot log in to
> the host, can the fabric still recover it?
If the answer is no, the change is incomplete.
## 11. Immediate Follow-Through
The system should keep implementing these concrete items:
- separate documented recovery-plane tests for Windows NAT nodes;
- signed registry retention and overlap checks before endpoint migration;
- compatibility alias coverage for historical install types;
- artifact availability health over all mirrors;
- stale-node risk dashboard/report before legacy removal;
- node-local journaling for last good registry/update state;
- neighbor-assisted artifact relay path;
- explicit recovery simulation for outbound-only nodes with dead old endpoints.
## 12. Decision
The fabric must treat node survival as a first-class architecture contract.
A node is not considered safe merely because the happy path works. It is safe
only when it can survive protocol migration, endpoint relocation, partial
cluster loss, artifact source loss, and lack of manual host access without
being abandoned.
@@ -256,9 +256,11 @@ The first backend contract slice is implemented:
observations, and degraded backend relay usage. These incidents keep backend observations, and degraded backend relay usage. These incidents keep backend
relay visible as degraded compatibility behavior rather than hidden steady relay visible as degraded compatibility behavior rather than hidden steady
state. state.
- Node-agent access telemetry distinguishes backend relay actually used from - Node-agent access telemetry distinguishes degraded compatibility requested
backend relay blocked by signed data-plane policy. Blocked fallback reports from degraded compatibility blocked by signed data-plane policy. Blocked
include `backend_fallback_blocked` and the last violation status/reason, and compatibility reports include `degraded_compatibility_blocked` and the last
violation status/reason, while preserving the original raw violation code in
a separate field for historical correlation, and
backend projects them to access telemetry plus `data_plane_contract` backend projects them to access telemetry plus `data_plane_contract`
incidents. incidents.
- Backend correlates access-report send failures with active service-channel - Backend correlates access-report send failures with active service-channel
@@ -421,8 +423,8 @@ The first backend contract slice is implemented:
keeps failing outside manual retry cooldown creates a bounded rebuild keeps failing outside manual retry cooldown creates a bounded rebuild
request. If an unfenced alternate is available, Control Plane marks the request. If an unfenced alternate is available, Control Plane marks the
rebuild `applied` and selects that route generation; if no alternate exists, rebuild `applied` and selects that route generation; if no alternate exists,
it records `pending_degraded_fallback` and keeps backend relay as the it records `pending_degraded_route_state` and keeps the channel in explicit
explicit degraded path until a new route appears. The compatibility release degraded route state until a new route appears. The compatibility release
`0.2.175` keeps node/host-agent signed-config models aligned with these new `0.2.175` keeps node/host-agent signed-config models aligned with these new
fields. fields.
- C18U moves rebuild metadata into node-agent runtime behavior. Node-agent - C18U moves rebuild metadata into node-agent runtime behavior. Node-agent
@@ -437,10 +439,10 @@ The first backend contract slice is implemented:
- C18V adds route-manager transition telemetry and churn coverage. Node-agent - C18V adds route-manager transition telemetry and churn coverage. Node-agent
`0.2.177` reports `route_manager_transition` alongside the current manager `0.2.177` reports `route_manager_transition` alongside the current manager
snapshot, including previous/current generation, status, decision count, snapshot, including previous/current generation, status, decision count,
withdrawn route count, restored route count, pending-degraded fallback count, withdrawn route count, restored route count, pending degraded route-state count,
rebuild applied count, and any cached selected route cleared because Control rebuild applied count, and any cached selected route cleared because Control
Plane withdrew it. Coverage verifies three service-neutral lifecycle cases: Plane withdrew it. Coverage verifies three service-neutral lifecycle cases:
applied rebuild replacement, pending degraded fallback when no alternate is applied rebuild replacement, pending degraded route state when no alternate is
available, and rollback/restoration when a fresh config removes the rebuild available, and rollback/restoration when a fresh config removes the rebuild
decision. decision.
- C18W adds a live docker-test verification loop for that telemetry. The smoke - C18W adds a live docker-test verification loop for that telemetry. The smoke
@@ -973,8 +975,8 @@ The first backend contract slice is implemented:
in C18Z45; rebuild snapshot maintenance health with overdue/runtime-evidence in C18Z45; rebuild snapshot maintenance health with overdue/runtime-evidence
visibility landed in C18Z46; node-agent signed service-channel lease visibility landed in C18Z46; node-agent signed service-channel lease
enforcement when cluster authority is pinned landed in C18Z47; backend enforcement when cluster authority is pinned landed in C18Z47; backend
introspection fallback for unsigned compatibility clients landed in C18Z48; introspection fallback for token-authorized compatibility clients landed in C18Z48;
accepted-by telemetry for signed/introspection/legacy ingress landed in accepted-by telemetry for signed/introspection/token-authorized ingress landed in
C18Z49; durable lease introspection across backend restarts landed in C18Z50; C18Z49; durable lease introspection across backend restarts landed in C18Z50;
bounded durable lease cleanup and admin visibility landed in C18Z51; durable bounded durable lease cleanup and admin visibility landed in C18Z51; durable
accepted-by access telemetry aggregation with heartbeat fallback and admin accepted-by access telemetry aggregation with heartbeat fallback and admin
@@ -983,9 +985,9 @@ The first backend contract slice is implemented:
visibility landed in C18Z53; C18Z54 smoke proves the same diagnostics on a visibility landed in C18Z53; C18Z54 smoke proves the same diagnostics on a
normal non-fallback primary route with healthy rolling route-quality feedback; normal non-fallback primary route with healthy rolling route-quality feedback;
C18Z55 smoke proves degraded/fenced normal-route feedback is shown separately C18Z55 smoke proves degraded/fenced normal-route feedback is shown separately
from explicit backend fallback; C18Z56 adds active-channel remediation from explicit degraded compatibility requests; C18Z56 adds active-channel remediation
diagnostics (`none`, `rebuild_route`, `prefer_alternate_route`, diagnostics (`none`, `rebuild_route`, `prefer_alternate_route`,
`use_backend_fallback`) to make the next runtime action explicit, and its `hold_degraded_route_state`) to make the next runtime action explicit, and its
alternate-route branch is live-smoke-proven with backend fallback kept off. alternate-route branch is live-smoke-proven with backend fallback kept off.
C18Z57 adds the bounded machine-readable `remediation_command` contract to C18Z57 adds the bounded machine-readable `remediation_command` contract to
active access telemetry rows so route-manager can consume a short-lived active access telemetry rows so route-manager can consume a short-lived
@@ -1058,7 +1060,7 @@ The first backend contract slice is implemented:
`rebuild_request_recorded` or `rebuild_request_rejected` for the active `rebuild_request_recorded` or `rebuild_request_rejected` for the active
channel. C18Z76 adds node-side acknowledgement for the allowed channel. C18Z76 adds node-side acknowledgement for the allowed
`rebuild_route` branch: node-agent consumes the command as a route-manager `rebuild_route` branch: node-agent consumes the command as a route-manager
`pending_degraded_fallback` decision with source `pending_degraded_route_state` decision with source
`service_channel_remediation_command`, while guarded commands remain ignored. `service_channel_remediation_command`, while guarded commands remain ignored.
Backend access telemetry correlates that heartbeat evidence with the durable Backend access telemetry correlates that heartbeat evidence with the durable
ledger and reports `rebuild_request_recorded_node_pending`. C18Z77 resolves ledger and reports `rebuild_request_recorded_node_pending`. C18Z77 resolves
@@ -1089,7 +1091,7 @@ The first backend contract slice is implemented:
reselecting the degraded replacement or adding fallback/failure/drop deltas. reselecting the degraded replacement or adding fallback/failure/drop deltas.
C18Z82 proves the no-safe-recovery branch: if that replacement is also fenced C18Z82 proves the no-safe-recovery branch: if that replacement is also fenced
and no safe recovery route exists, synthetic config reports and no safe recovery route exists, synthetic config reports
`service_channel_feedback_no_alternate` / `pending_degraded_fallback` with `service_channel_feedback_no_alternate` / `pending_degraded_route_state` with
`no_unfenced_alternate_route` instead of silently keeping a bad route. `no_unfenced_alternate_route` instead of silently keeping a bad route.
C18Z83 projects that route-manager decision into active access telemetry and C18Z83 projects that route-manager decision into active access telemetry and
web-admin active-channel diagnostics, including decision source, route id, web-admin active-channel diagnostics, including decision source, route id,
@@ -1124,7 +1126,8 @@ The first backend contract slice is implemented:
`data_plane` is present in the lease, authority payload, introspection `data_plane` is present in the lease, authority payload, introspection
response, and lease-maintenance/admin list. It declares backend API as response, and lease-maintenance/admin list. It declares backend API as
control-plane transport, fabric service channel/fabric route as working control-plane transport, fabric service channel/fabric route as working
data/steady-state transport, backend relay as degraded fallback only, and data/steady-state transport, degraded compatibility relay as an explicit
compatibility state only, and
service-neutral protocol-agnostic isolated logical flows as the runtime service-neutral protocol-agnostic isolated logical flows as the runtime
contract for VPN, Remote Workspace, files, video, and future services. C18Z91 contract for VPN, Remote Workspace, files, video, and future services. C18Z91
makes node-agent consume the signed/introspected data-plane contract, apply makes node-agent consume the signed/introspected data-plane contract, apply
@@ -1187,12 +1190,13 @@ channel class, selected entry node, allowed flow isolation, and data-plane
contract on `remote-workspaces/{resource_id}/streams/{channel_class}`. Empty contract on `remote-workspaces/{resource_id}/streams/{channel_class}`. Empty
probe requests return `202` with a remote-workspace ingress probe contract and probe requests return `202` with a remote-workspace ingress probe contract and
access telemetry; real RDP frame forwarding remains deliberately access telemetry; real RDP frame forwarding remains deliberately
`not_implemented` until the service adapter work begins. `validated_only` for empty probes until the service adapter work begins.
C19E adds a narrow frame-batch probe on that boundary. The adapter contract C19E adds a narrow frame-batch probe on that boundary. The adapter contract
advertises `rap.remote_workspace_frame_batch.v1`, and entry-node accepts advertises `rap.remote_workspace_frame_batch.v1`, and entry-node accepts
non-empty payloads only when they are JSON probe batches with `probe_only=true`, non-empty payloads only when they are JSON probe batches with `probe_only=true`,
valid remote-workspace logical channels, valid directions, and bounded payload valid remote-workspace logical channels, valid directions, and bounded payload
metadata. Accepted probes return `payload_flow=validated_probe_only`; production metadata. Accepted frame probes return `payload_flow=validated_probe_only`, while
empty/control probes return `payload_flow=validated_only`; production
frame forwarding is still not enabled. frame forwarding is still not enabled.
C19F connects that validated probe to a node-agent local adapter sink. The C19F connects that validated probe to a node-agent local adapter sink. The
in-memory `node_agent_rdp_worker_contract_probe` sink accepts only validated in-memory `node_agent_rdp_worker_contract_probe` sink accepts only validated
@@ -3,7 +3,7 @@
Status: Stage C17 planning completed. Stage C17A synthetic mesh runtime Status: Stage C17 planning completed. Stage C17A synthetic mesh runtime
skeleton, Stage C17B route health/failover probes, Stage C17C relay semantic skeleton, Stage C17B route health/failover probes, Stage C17C relay semantic
hardening, Stage C17D non-production test-service path experiment, Stage C17E hardening, Stage C17D non-production test-service path experiment, Stage C17E
live node-to-node synthetic HTTP transport skeleton, Stage C17F scoped historical live node-to-node synthetic HTTP transport skeleton, Stage C17F scoped
synthetic route config boundary, Stage C17G Control Plane scoped synthetic synthetic route config boundary, Stage C17G Control Plane scoped synthetic
config read boundary, Stage C17H deployed multi-agent synthetic config smoke, config read boundary, Stage C17H deployed multi-agent synthetic config smoke,
Stage C17I production forwarding gate, Stage C17J production envelope Stage C17I production forwarding gate, Stage C17J production envelope
@@ -44,8 +44,9 @@ invalidation. C17C added synthetic relay validation, per-channel bounded
queues, QoS dequeue order, telemetry-only drop/backpressure, and reliable queues, QoS dequeue order, telemetry-only drop/backpressure, and reliable
fabric/control rejection behavior. C17D added one bounded `synthetic.echo` fabric/control rejection behavior. C17D added one bounded `synthetic.echo`
test-service path over direct, single-relay, and forced fallback routes. C17E test-service path over direct, single-relay, and forced fallback routes. C17E
added real HTTP peer transport and a disabled-by-default node-agent synthetic added one historical real-HTTP peer transport experiment and a
endpoint/smoke harness for direct and single-relay synthetic traffic. C17F disabled-by-default node-agent synthetic endpoint/smoke harness for direct and
single-relay synthetic traffic only. C17F
added scoped synthetic peer/route config loading and synthetic route-health added scoped synthetic peer/route config loading and synthetic route-health
link observation reporting. C17G added the Control Plane read boundary for link observation reporting. C17G added the Control Plane read boundary for
node-scoped synthetic mesh config. C17H proved that boundary in a deployed node-scoped synthetic mesh config. C17H proved that boundary in a deployed
@@ -596,10 +597,12 @@ C17H implemented a deployed multi-agent synthetic config smoke on
VPN/IP tunnel work remains a separate C18 track and must not be mixed into VPN/IP tunnel work remains a separate C18 track and must not be mixed into
C17 mesh runtime work. C17 mesh runtime work.
## 15.4 C17E Result ## 15.4 C17E Historical Result
C17E implemented live node-to-node synthetic HTTP transport while preserving C17E implemented a historical live node-to-node synthetic HTTP transport
the production forwarding kill-switch: experiment while preserving the production forwarding kill-switch. This result
is retained only as test-history context; it is not the active transport
direction for the fabric runtime. The recorded C17E result:
- `HTTPPeerTransport` maps explicit peer node IDs to synthetic HTTP endpoint - `HTTPPeerTransport` maps explicit peer node IDs to synthetic HTTP endpoint
URLs. URLs.
@@ -613,6 +616,13 @@ the production forwarding kill-switch:
- `/mesh/v1/forward` remains disabled. - `/mesh/v1/forward` remains disabled.
- no production service traffic is authorized. - no production service traffic is authorized.
Current direction:
- active fabric runtime transport is QUIC-only
- synthetic HTTP transport is historical, test-only context
- production forwarding/runtime acceptance must use QUIC route execution rather
than HTTP peer transport
Verification: Verification:
```powershell ```powershell
@@ -888,9 +898,11 @@ runtime. Stage C17A implements the first narrow runtime skeleton for synthetic
Fabric messages only. Stage C17B adds route health/failover observations using Fabric messages only. Stage C17B adds route health/failover observations using
synthetic Fabric messages only. Stage C17C adds relay semantic hardening for synthetic Fabric messages only. Stage C17C adds relay semantic hardening for
synthetic channel classes only. Stage C17D adds one bounded non-production synthetic channel classes only. Stage C17D adds one bounded non-production
`synthetic.echo` service-path experiment only. Stage C17E proves live `synthetic.echo` service-path experiment only. Stage C17E proves one
node-to-node synthetic HTTP transport using real local endpoints only. Stage historical synthetic HTTP carrier experiment using real local endpoints only;
C17F proves scoped synthetic config loading and route-health reporting only. it is test-only and not representative of the active QUIC fabric runtime.
Stage C17F proves scoped synthetic config loading and route-health reporting
only.
Stage C17G proves Control Plane scoped synthetic config read/consume only. Stage C17G proves Control Plane scoped synthetic config read/consume only.
Stage C17H proves deployed multi-agent Control Plane synthetic config Stage C17H proves deployed multi-agent Control Plane synthetic config
consumption and synthetic route-health reporting on `docker-test` only. consumption and synthetic route-health reporting on `docker-test` only.
@@ -1,5 +1,12 @@
# Production Direct Worker WSS Trust # Production Direct Worker WSS Trust
Archived status: this document describes an older direct-worker WSS trust
track. It is not the current runtime transport source of truth. For the active
fabric transport model, use
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Status: P3.4 design/prep complete. Status: P3.4 design/prep complete.
This document defines the production trust model for direct worker WSS. It is a This document defines the production trust model for direct worker WSS. It is a
+8
View File
@@ -1,5 +1,13 @@
# RDP Adapter Runtime # RDP Adapter Runtime
Paused/archival note: this document remains useful for RDP adapter internals,
but it is not the current source of truth for transport/runtime architecture.
Fabric transport is now QUIC-only between nodes. For active transport,
recovery, and routing behavior, see
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Status: active implementation plan for the new C++ RDP Adapter internals. Status: active implementation plan for the new C++ RDP Adapter internals.
Current implementation status: Current implementation status:
@@ -1,5 +1,12 @@
# RDP Stage 5.2 Design Pass - Server-To-Client File Download # RDP Stage 5.2 Design Pass - Server-To-Client File Download
Archived status: this document belongs to the earlier direct-worker/backend-gateway
RDP track and is not the current source of truth for fabric transport
architecture. The active inter-node transport model is QUIC-only; see
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Status: design-complete proposal, no runtime implementation in this step. Status: design-complete proposal, no runtime implementation in this step.
Date: 2026-04-26 Date: 2026-04-26
@@ -1,5 +1,13 @@
# RDP Service C++ Performance Target # RDP Service C++ Performance Target
Paused/archival note: this document is an RDP performance track record, not the
current source of truth for node-to-node transport. Fabric transport is now
QUIC-only between nodes; use
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md` for the active transport
model.
## Status ## Status
This is the paused RDP service performance direction. The implementation name is `RDP Adapter`: a concrete `Service Adapter` that translates Microsoft RDP into the platform session/data-plane protocol. The common adapter contract is defined in `docs/architecture/SERVICE_ADAPTER_PROTOCOL.md`; the RDP-specific runtime plan is defined in `docs/architecture/RDP_ADAPTER_RUNTIME.md`. This is the paused RDP service performance direction. The implementation name is `RDP Adapter`: a concrete `Service Adapter` that translates Microsoft RDP into the platform session/data-plane protocol. The common adapter contract is defined in `docs/architecture/SERVICE_ADAPTER_PROTOCOL.md`; the RDP-specific runtime plan is defined in `docs/architecture/RDP_ADAPTER_RUNTIME.md`.
@@ -1,5 +1,13 @@
# RDP Service C# Target Architecture # RDP Service C# Target Architecture
Archived scope note: this document is retained as historical RDP runtime
research and is not the current source of truth for node-to-node transport.
Fabric transport is now QUIC-only between nodes; use
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md` for the active transport
model.
## Status ## Status
Superseded. Superseded.
@@ -8,6 +8,12 @@ The current proven RDP lifecycle remains a preserved implementation baseline.
RDP work is currently paused by product decision. The active architecture focus RDP work is currently paused by product decision. The active architecture focus
is the lower Fabric Core / cluster / node foundation. is the lower Fabric Core / cluster / node foundation.
Transport clarification: historical references in this document to direct
worker WSS or backend gateway fallback describe the earlier RDP service proof
path and migration context. They must not be read as the current inter-node
transport contract. The active fabric node-to-node runtime transport is
QUIC-only.
## 1. Project Vision ## 1. Project Vision
The project is a Secure Access Fabric: a distributed, multi-tenant platform for secure access to private resources across sites, networks, and organizations. The project is a Secure Access Fabric: a distributed, multi-tenant platform for secure access to private resources across sites, networks, and organizations.
@@ -1702,7 +1708,7 @@ Channels must have independent priority, reliability, and backpressure behavior.
The current RDP MVP proves lifecycle and basic viewer behavior. It is not the target production performance model. The current RDP MVP proves lifecycle and basic viewer behavior. It is not the target production performance model.
Target RDP realtime model: Target RDP realtime model for the paused historical RDP service track:
- client connects to direct/relay data plane, not backend frame relay - client connects to direct/relay data plane, not backend frame relay
- input/control channels are separate from render/video - input/control channels are separate from render/video
@@ -2459,7 +2465,11 @@ This is an incremental migration plan. It must not be executed as a big-bang rew
### Current Fallback ### Current Fallback
Keep the current backend WebSocket gateway as fallback while the production data plane is introduced. Historical migration note: the older RDP MVP kept the backend WebSocket
gateway as a temporary fallback while an earlier production data-plane design
was being introduced. This is not the active fabric transport plan. Current
fabric node-to-node runtime transport is QUIC-only, and old compatibility paths
are being removed rather than extended.
Current RDP MVP remains the preserved service-adapter baseline, but it is not Current RDP MVP remains the preserved service-adapter baseline, but it is not
the active implementation focus while Fabric Core stages are underway. the active implementation focus while Fabric Core stages are underway.
@@ -2543,9 +2553,14 @@ These stages must be introduced only through explicit, narrow implementation
prompts. RDP/VNC/SSH/VPN/video/file services remain above the Fabric Core and prompts. RDP/VNC/SSH/VPN/video/file services remain above the Fabric Core and
must not define the lower fabric foundation. must not define the lower fabric foundation.
### Stage DP-1: Direct Worker WSS ### Historical Stage DP-1: Direct Worker WSS
Introduce a short-lived authorized direct WSS path from client to worker or worker-local live endpoint. This stage records an earlier RDP service migration concept. It is paused and
retained for historical context only. It must not be read as the active fabric
transport roadmap.
Introduce a short-lived authorized direct WSS path from client to worker or
worker-local live endpoint.
Goals: Goals:
@@ -2554,7 +2569,7 @@ Goals:
- keep session broker lifecycle unchanged - keep session broker lifecycle unchanged
- keep fallback gateway available - keep fallback gateway available
### Stage DP-2: Binary Frames ### Historical Stage DP-2: Binary Frames
Replace base64 JSON frame payloads with binary frame messages. Replace base64 JSON frame payloads with binary frame messages.
@@ -2565,7 +2580,7 @@ Goals:
- reduce JSON/base64 overhead - reduce JSON/base64 overhead
- preserve latest-frame-only behavior - preserve latest-frame-only behavior
### Stage DP-3: Adaptive Quality ### Historical Stage DP-3: Adaptive Quality
Implement adaptive RDP quality profiles. Implement adaptive RDP quality profiles.
@@ -2577,9 +2592,10 @@ Goals:
- bandwidth and latency feedback - bandwidth and latency feedback
- bounded frame queues - bounded frame queues
### Stage DP-4: Relay Nodes ### Historical Stage DP-4: Relay Nodes
Introduce `entry-node` and `relay-node` roles for data-plane routing. Introduce `entry-node` and `relay-node` roles for the earlier service-specific
data-plane routing model.
Goals: Goals:
+29 -19
View File
@@ -1,20 +1,28 @@
# Security And Secrets Readiness # Security And Secrets Readiness
Status: P3.3 test-stand smoke complete for encrypted resource secrets, Archived scope note: this document records an earlier RDP/direct-worker trust
assignment-time resolution, and production fallback behavior with smoke-only and secret-handling stage. It is not the current source of truth for fabric
direct worker WSS trust. transport architecture. The active inter-node transport model is QUIC-only; see
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Status: P3.3 historical test-stand smoke complete for encrypted resource
secrets, assignment-time resolution, and legacy RDP baseline behavior with
smoke-only direct-worker trust.
This document defines the next security hardening layer around the accepted RDP This document defines the next security hardening layer around the accepted RDP
MVP baseline. It does not implement mesh, VPN, server-to-client download, new MVP baseline. It does not implement mesh, VPN, server-to-client download, new
protocol adapters, or another RDP rendering mode. protocol adapters, or another RDP rendering mode.
## Current Accepted Baseline ## Current Accepted Historical RDP Baseline
- RDP worker baseline: `rap-rdp-worker:rdp-p1-region-order2` - RDP worker baseline: `rap-rdp-worker:rdp-p1-region-order2`
- Backend control plane remains source of truth. - Backend control plane remains source of truth.
- Redis remains live coordination/routing only. - Redis remains live coordination/routing only.
- Direct worker WSS is preferred for realtime RDP. - Historical direct-worker WSS was the preferred realtime RDP path in this
- Backend gateway remains fallback/debug. stage.
- Historical backend gateway remained a fallback/debug path for this stage.
- Text clipboard is policy-gated and accepted. - Text clipboard is policy-gated and accepted.
- Client-to-server file upload and restricted `RAP_Transfers` visibility are - Client-to-server file upload and restricted `RAP_Transfers` visibility are
accepted. accepted.
@@ -124,22 +132,24 @@ Already accepted:
- worker rejects wrong worker, wrong attachment, wrong organization, wrong - worker rejects wrong worker, wrong attachment, wrong organization, wrong
resource, over-broad channels, failed/terminated sessions, and jti replay resource, over-broad channels, failed/terminated sessions, and jti replay
Production still needs: Production still needed for that stage:
- deployed certificate chain for direct worker WSS on production nodes - deployed certificate chain for the historical direct-worker WSS path on
- pinned or platform-issued worker certificates in live production config production nodes
- pinned or platform-issued worker certificates in live production config for
that historical path
- no smoke-only TLS bypass in production clients - no smoke-only TLS bypass in production clients
- rotation process for data-plane signing keys - rotation process for data-plane signing keys
- audit for failed token validation/bind attempts - audit for failed token validation/bind attempts
P3.2 guard exists: P3.2 historical guard exists:
- backend distinguishes `smoke_insecure`, `public_ca`, and `platform_ca` - backend distinguished `smoke_insecure`, `public_ca`, and `platform_ca`
direct worker WSS trust modes direct-worker trust modes for the historical RDP path
- production backend omits smoke-only direct candidates - production backend omitted smoke-only direct candidates on that path
- Windows production client skips untrusted or smoke-only direct candidates - Windows production client skipped untrusted or smoke-only direct candidates
P3.3 test-stand smoke exists: P3.3 historical test-stand smoke exists:
- `resource_secrets` migration is applied on `docker-test` - `resource_secrets` migration is applied on `docker-test`
- backend runs as `APP_ENV=production` with a test-only - backend runs as `APP_ENV=production` with a test-only
@@ -149,9 +159,9 @@ P3.3 test-stand smoke exists:
- `resources.metadata`, `remote_sessions.metadata`, and `audit_events` were - `resources.metadata`, `remote_sessions.metadata`, and `audit_events` were
checked for plaintext username/password leakage checked for plaintext username/password leakage
- production backend with `DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure` - production backend with `DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure`
returns backend gateway fallback only returned the historical backend gateway debug path only
- development/smoke backend with the same trust mode advertises the explicit - development/smoke backend with the same trust mode advertises the explicit
smoke-only direct worker WSS candidate smoke-only historical direct-worker candidate
- `RAP_Transfers` smoke passed on the secret-backed resource - `RAP_Transfers` smoke passed on the secret-backed resource
## Required Regression Tests ## Required Regression Tests
@@ -202,8 +212,8 @@ P3.1 implemented audit events for:
assignment payload; a future resolver pull/token flow should reduce exposure assignment payload; a future resolver pull/token flow should reduce exposure
in Redis control queues. in Redis control queues.
- Worker still depends on plaintext assignment metadata for development smoke. - Worker still depends on plaintext assignment metadata for development smoke.
- Production direct worker WSS certificate issuance/rotation and platform CA - Production certificate issuance/rotation and platform CA distribution for the
distribution are not complete. historical direct-worker path are not complete.
- The test-stand secret key is a host-local test file, not a production KMS or - The test-stand secret key is a host-local test file, not a production KMS or
HSM-backed key. HSM-backed key.
- Automated end-to-end policy denial coverage is still thin. - Automated end-to-end policy denial coverage is still thin.
+20 -2
View File
@@ -1,7 +1,21 @@
# Service Adapter Protocol # Service Adapter Protocol
Scope note: this document remains the common adapter-model reference, but it is
not the current source of truth for transport/runtime topology between fabric
nodes. Fabric transport is now QUIC-only between nodes; for active transport,
routing, and recovery behavior see
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
Status: target contract and compile-safe foundation. This document defines the common adapter model for RDP, SSH, VNC, and future services. It does not replace the current backend control plane or current RDP runtime by itself. Status: target contract and compile-safe foundation. This document defines the common adapter model for RDP, SSH, VNC, and future services. It does not replace the current backend control plane or current RDP runtime by itself.
Transport clarification: historical references in this document to direct
worker WSS, backend gateway fallback, or DP-1 channel shape belong to the
earlier RDP service baseline. They are not the active inter-node transport
contract. Current fabric node-to-node transport is QUIC-only; service adapters
consume fabric routes rather than define transport fallback behavior.
## 1. Purpose ## 1. Purpose
The platform client must not implement third-party protocols directly. The platform client must not implement third-party protocols directly.
@@ -94,12 +108,16 @@ adapter runtime.
- Service Adapter does not know UI implementation details. - Service Adapter does not know UI implementation details.
- Control Plane remains authoritative for session lifecycle and policy. - Control Plane remains authoritative for session lifecycle and policy.
- PostgreSQL remains source of truth; Redis remains live coordination only. - PostgreSQL remains source of truth; Redis remains live coordination only.
- Direct worker WSS and backend gateway fallback remain valid transports. - Fabric transport remains QUIC-only between nodes; any historical direct
worker or backend fallback paths belong to paused service-specific baselines,
not to the active fabric transport contract.
- Adapter runtime must not create sessions outside broker/assignment control. - Adapter runtime must not create sessions outside broker/assignment control.
## 4. Logical Channels ## 4. Logical Channels
The session protocol is channel-oriented even when DP-1 uses one WSS connection. The session protocol is channel-oriented regardless of the concrete carrier. A
historical DP-1 single-WSS shape may still appear in paused RDP notes, but it
is not the current fabric transport contract.
| Channel | Direction | Reliability | Priority | Purpose | | Channel | Direction | Reliability | Priority | Purpose |
| --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- |
@@ -7,6 +7,11 @@ Secure Access Fabric. It does not implement VPN runtime, packet routing, TUN
devices, mesh traffic, service workload execution, API changes, migrations, or devices, mesh traffic, service workload execution, API changes, migrations, or
RDP behavior changes. RDP behavior changes.
Transport clarification: this document defines a service layer above Fabric
Core. It does not redefine node-to-node transport. Current fabric inter-node
transport is QUIC-only; VPN/IP tunnel runtime must request and use fabric
routes instead of introducing a separate packet transport contract.
## Purpose ## Purpose
VPN/IP tunnel is a service above the Fabric Core, not a node-local setting. VPN/IP tunnel is a service above the Fabric Core, not a node-local setting.
@@ -9,6 +9,15 @@ Secure Access Fabric.
The fabric node-to-node transport remains QUIC-only. HTTP/HTTPS is allowed only The fabric node-to-node transport remains QUIC-only. HTTP/HTTPS is allowed only
as an external client-facing service edge. as an external client-facing service edge.
Terminology rule:
- `Fabric Transport` = QUIC/UDP node-to-node runtime layer.
- `Control API` = HTTP/HTTPS management surface for UI, automation, releases,
policy, audit, and status.
The Control API may use HTTP/HTTPS, but it is not a fallback or alternate
carrier for fabric node-to-node runtime traffic.
## Purpose ## Purpose
The platform needs a clear distinction between: The platform needs a clear distinction between:
+2 -2
View File
@@ -115,9 +115,9 @@ for container in rap_test_postgres rap_test_redis rap_test_backend rap_web_admin
done done
redis_guard redis_guard
probe_http "downloads" "$BACKEND_URL/downloads/rap-android-rdp-vpn-build.json" probe_http "downloads" "$BACKEND_URL/downloads/rap-android-vpn-build.json"
probe_http "web_admin_root" "$BACKEND_URL/" probe_http "web_admin_root" "$BACKEND_URL/"
probe_http "diagnostics" "$PUBLIC_URL/api/v1/clusters/$CLUSTER_ID/vpn/client-diagnostics" probe_http "backend_healthz" "http://127.0.0.1:18121/healthz"
used_after="$(disk_used_percent)" used_after="$(disk_used_percent)"
status="ok" status="ok"
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+1 -1
View File
@@ -4,7 +4,7 @@
<meta charset="UTF-8" /> <meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Панель Secure Access Fabric</title> <title>Панель Secure Access Fabric</title>
<script type="module" crossorigin src="/assets/index-gMV--oab.js"></script> <script type="module" crossorigin src="/assets/index-CiNvRobk.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-Cur_BAkX.css"> <link rel="stylesheet" crossorigin href="/assets/index-Cur_BAkX.css">
</head> </head>
<body> <body>
+826 -54
View File
File diff suppressed because it is too large Load Diff
+133 -1
View File
@@ -36,6 +36,7 @@ import type {
NodeSyntheticMeshConfig, NodeSyntheticMeshConfig,
NodeTelemetryObservation, NodeTelemetryObservation,
NodeUpdatePlan, NodeUpdatePlan,
NodeBridgeReplayPlan,
NodeUpdatePolicy, NodeUpdatePolicy,
NodeUpdateStatus, NodeUpdateStatus,
NodeWorkloadDesiredState, NodeWorkloadDesiredState,
@@ -46,6 +47,7 @@ import type {
ReleaseVersion, ReleaseVersion,
Resource, Resource,
RoleAssignment, RoleAssignment,
StaleNodeRiskReport,
UserAccount, UserAccount,
VPNClientDiagnosticCommand, VPNClientDiagnosticCommand,
VPNClientDiagnosticStatus, VPNClientDiagnosticStatus,
@@ -66,6 +68,7 @@ type ApiErrorPayload = {
message_key?: string; message_key?: string;
fallback_message?: string; fallback_message?: string;
trace_id?: string; trace_id?: string;
details?: Record<string, unknown>;
}; };
}; };
@@ -106,6 +109,25 @@ export type UpsertNodeUpdatePolicyPayload = {
healthWindowSeconds?: number; healthWindowSeconds?: number;
}; };
/**
 * Input accepted by `AdminApiClient.createReleaseVersion`.
 *
 * Fields use camelCase here; `createReleaseVersion` renames `installType` and
 * `sizeBytes` to `install_type` and `size_bytes` when it builds the request body.
 */
export type CreateReleaseVersionPayload = {
product: string;
version: string;
// Sent as "stable" by createReleaseVersion when omitted or empty.
channel?: string;
// Sent as "active" by createReleaseVersion when omitted or empty.
status?: string;
// Sent as an empty object by createReleaseVersion when omitted.
compatibility?: Record<string, unknown>;
// Sent as an empty string by createReleaseVersion when omitted.
changelog?: string;
// One entry per artifact; required, and mapped entry-by-entry into the request body.
artifacts: Array<{
os: string;
arch: string;
installType: string;
kind: string;
url: string;
// NOTE(review): presumably the hex-encoded SHA-256 digest of the artifact; confirm against the backend.
sha256: string;
// Sent as 0 by createReleaseVersion when omitted or 0.
sizeBytes?: number;
// Sent as an empty object by createReleaseVersion when omitted.
metadata?: Record<string, unknown>;
}>;
};
export type UpdateFabricServiceChannelRecoveryPolicyPayload = { export type UpdateFabricServiceChannelRecoveryPolicyPayload = {
hysteresisPenalty?: number; hysteresisPenalty?: number;
promotionMinSamples?: number; promotionMinSamples?: number;
@@ -436,6 +458,37 @@ export class AdminApiClient {
return payload.release_versions ?? []; return payload.release_versions ?? [];
} }
/**
 * Creates a release version for a cluster.
 *
 * POSTs to `/clusters/{clusterId}/updates/releases` with the actor user id and
 * the snake_case form of `input`, then returns the `release_version` field of
 * the response. Falsy optional fields fall back to: channel "stable",
 * status "active", compatibility {}, changelog "", artifact size_bytes 0 and
 * artifact metadata {}.
 */
async createReleaseVersion(clusterId: string, input: CreateReleaseVersionPayload): Promise<ReleaseVersion> {
  // Translate each artifact from the camelCase client shape to the wire shape.
  const artifacts = input.artifacts.map(({ os, arch, installType, kind, url, sha256, sizeBytes, metadata }) => ({
    os,
    arch,
    install_type: installType,
    kind,
    url,
    sha256,
    size_bytes: sizeBytes || 0,
    metadata: metadata || {},
  }));
  const body = {
    actor_user_id: this.actorUserId,
    product: input.product,
    version: input.version,
    channel: input.channel || "stable",
    status: input.status || "active",
    compatibility: input.compatibility || {},
    changelog: input.changelog || "",
    artifacts,
  };
  const response = await this.post<{ release_version: ReleaseVersion }>(
    `/clusters/${clusterId}/updates/releases`,
    body,
  );
  return response.release_version;
}
/**
 * Fetches the stale-node risk report for a cluster.
 *
 * Issues a GET to `/clusters/{clusterId}/updates/stale-node-risk-report` with
 * `actor_user_id` as a query parameter and returns the
 * `stale_node_risk_report` field of the response.
 */
async getStaleNodeRiskReport(clusterId: string): Promise<StaleNodeRiskReport> {
  const query = new URLSearchParams({ actor_user_id: this.actorUserId }).toString();
  const response = await this.get<{ stale_node_risk_report: StaleNodeRiskReport }>(
    `/clusters/${clusterId}/updates/stale-node-risk-report?${query}`,
  );
  return response.stale_node_risk_report;
}
async getNodeUpdatePlan( async getNodeUpdatePlan(
clusterId: string, clusterId: string,
nodeId: string, nodeId: string,
@@ -453,6 +506,14 @@ export class AdminApiClient {
return payload.node_update_plan; return payload.node_update_plan;
} }
/**
 * Fetches the bridge replay plan for one node.
 *
 * Issues a GET to
 * `/clusters/{clusterId}/nodes/{nodeId}/updates/bridge-replay-plan` with
 * `actor_user_id` as a query parameter and returns the
 * `node_bridge_replay_plan` field of the response.
 */
async getNodeBridgeReplayPlan(clusterId: string, nodeId: string): Promise<NodeBridgeReplayPlan> {
  const query = new URLSearchParams({ actor_user_id: this.actorUserId }).toString();
  const response = await this.get<{ node_bridge_replay_plan: NodeBridgeReplayPlan }>(
    `/clusters/${clusterId}/nodes/${nodeId}/updates/bridge-replay-plan?${query}`,
  );
  return response.node_bridge_replay_plan;
}
async upsertNodeUpdatePolicy(clusterId: string, nodeId: string, input: UpsertNodeUpdatePolicyPayload): Promise<NodeUpdatePolicy> { async upsertNodeUpdatePolicy(clusterId: string, nodeId: string, input: UpsertNodeUpdatePolicyPayload): Promise<NodeUpdatePolicy> {
const payload = await this.put<{ node_update_policy: NodeUpdatePolicy }>(`/clusters/${clusterId}/nodes/${nodeId}/updates/policy`, { const payload = await this.put<{ node_update_policy: NodeUpdatePolicy }>(`/clusters/${clusterId}/nodes/${nodeId}/updates/policy`, {
actor_user_id: this.actorUserId, actor_user_id: this.actorUserId,
@@ -1269,7 +1330,7 @@ export class AdminApiClient {
let message = `Запрос завершился ошибкой HTTP ${response.status}`; let message = `Запрос завершился ошибкой HTTP ${response.status}`;
try { try {
const payload = (await response.json()) as ApiErrorPayload; const payload = (await response.json()) as ApiErrorPayload;
message = payload.error?.fallback_message || payload.error?.code || message; message = formatApiErrorMessage(payload, response.status) || payload.error?.fallback_message || payload.error?.code || message;
} catch { } catch {
// Keep generic HTTP message if backend did not return JSON. // Keep generic HTTP message if backend did not return JSON.
} }
@@ -1279,6 +1340,77 @@ export class AdminApiClient {
} }
} }
/**
 * Builds an expanded, human-readable message for the one API error this
 * client knows how to unpack: an HTTP 409 whose code is
 * `conflict.legacy_compatibility_removal_is_blocked_while_stale_recovery_risk_nodes_remain`.
 *
 * The message is assembled from `error.details`: the blocked operation, every
 * non-zero node counter, the blocked node ids, the recovery bridge hold
 * (reasons and nodes) and the trace id. Each part is included only when the
 * corresponding detail is present.
 *
 * @param payload Parsed JSON error body.
 * @param status HTTP status code of the response.
 * @returns The expanded message, or "" for any other error so the caller can
 *   fall back to its generic message.
 */
function formatApiErrorMessage(payload: ApiErrorPayload, status: number) {
  const error = payload.error;
  if (
    !error ||
    status !== 409 ||
    error.code !== "conflict.legacy_compatibility_removal_is_blocked_while_stale_recovery_risk_nodes_remain"
  ) {
    return "";
  }

  const details = error.details || {};
  const parts: string[] = ["Compatibility cleanup заблокирован."];

  const blockedOperation = stringDetail(details, "blocked_operation");
  if (blockedOperation) {
    parts.push(`Операция: ${blockedOperation}.`);
  }

  // Detail key -> label, in display order. Zero or missing counters are omitted.
  const counterLabels: Array<[string, string]> = [
    ["blocked_nodes", "blockers"],
    ["stale_nodes", "stale"],
    ["artifact_gap_nodes", "artifact gap"],
    ["unknown_profile_nodes", "profile unknown"],
    ["waiting_update_status_nodes", "waiting status"],
    ["unknown_version_nodes", "version unknown"],
    ["legacy_recovery_contract_nodes", "legacy contract"],
    ["recovery_bridge_required_nodes", "recovery bridge"],
    ["recovery_bridge_replay_ready_nodes", "bridge replay ready"],
    ["waiting_recovery_heartbeat_nodes", "waiting heartbeat"],
  ];
  const counters: string[] = [];
  for (const [detailKey, label] of counterLabels) {
    const count = numberDetail(details, detailKey);
    if (count) {
      counters.push(`${label} ${count}`);
    }
  }
  if (counters.length > 0) {
    parts.push(counters.join(" / ") + ".");
  }

  const nodeIds = arrayDetail(details, "blocked_node_ids");
  if (nodeIds.length > 0) {
    parts.push(`Blocked nodes: ${nodeIds.join(", ")}.`);
  }

  if (booleanDetail(details, "bridge_hold_required")) {
    const holdReasons = arrayDetail(details, "bridge_hold_reasons");
    const holdNodes = arrayDetail(details, "bridge_hold_node_ids");
    const holdSummary: string[] = [];
    if (holdReasons.length > 0) {
      holdSummary.push(`reasons ${holdReasons.join(", ")}`);
    }
    if (holdNodes.length > 0) {
      holdSummary.push(`nodes ${holdNodes.join(", ")}`);
    }
    const holdSuffix = holdSummary.length > 0 ? `: ${holdSummary.join(" / ")}` : "";
    parts.push(`Recovery bridge hold active${holdSuffix}.`);
  }

  const traceID = error.trace_id?.trim();
  if (traceID) {
    parts.push(`Trace: ${traceID}.`);
  }

  return parts.join(" ");
}
/** Reads `source[key]` as a trimmed string; any non-string value yields "". */
function stringDetail(source: Record<string, unknown>, key: string) {
  const raw = source[key];
  if (typeof raw !== "string") {
    return "";
  }
  return raw.trim();
}
/** Reads `source[key]` as a finite number; anything else (including NaN and ±Infinity) yields 0. */
function numberDetail(source: Record<string, unknown>, key: string) {
  const raw = source[key];
  if (typeof raw !== "number" || !Number.isFinite(raw)) {
    return 0;
  }
  return raw;
}
/** Returns true only when `source[key]` is exactly the boolean `true`; truthy non-boolean values do not count. */
function booleanDetail(source: Record<string, unknown>, key: string) {
  const raw = source[key];
  return raw === true;
}
/**
 * Reads `source[key]` as a list of strings.
 *
 * Returns [] when the value is not an array. Otherwise keeps, in order and
 * untrimmed, only the elements that are strings with non-whitespace content.
 */
function arrayDetail(source: Record<string, unknown>, key: string) {
  const kept: string[] = [];
  const raw = source[key];
  if (Array.isArray(raw)) {
    for (const entry of raw) {
      if (typeof entry === "string" && entry.trim().length > 0) {
        kept.push(entry);
      }
    }
  }
  return kept;
}
function browserDeviceFingerprint(): string { function browserDeviceFingerprint(): string {
const key = "rap.webAdmin.deviceFingerprint"; const key = "rap.webAdmin.deviceFingerprint";
const existing = localStorage.getItem(key); const existing = localStorage.getItem(key);
+96
View File
@@ -343,6 +343,28 @@ export type NodeUpdatePlan = {
production_forwarding: boolean; production_forwarding: boolean;
}; };
/**
 * Per-product entry of `NodeBridgeReplayPlan.products`.
 *
 * Carries the product's recovery bridge fields together with its full
 * `NodeUpdatePlan`.
 */
export type NodeBridgeReplayProductPlan = {
product: string;
recovery_bridge_mode?: string;
recovery_bridge_replay_ready: boolean;
last_status_reason?: string;
update_plan: NodeUpdatePlan;
};
/**
 * Bridge replay plan for a single node, as returned under
 * `node_bridge_replay_plan` by `AdminApiClient.getNodeBridgeReplayPlan`.
 */
export type NodeBridgeReplayPlan = {
schema_version: string;
cluster_id: string;
node_id: string;
node_name?: string;
health_status?: string;
heartbeat_stale: boolean;
bridge_hold_required: boolean;
recovery_bridge_replay_ready: boolean;
bridge_hold_reasons?: string[];
bridge_actions?: string[];
// Optional per-product breakdown; each entry embeds that product's update plan.
products?: NodeBridgeReplayProductPlan[];
};
export type NodeUpdatePolicy = { export type NodeUpdatePolicy = {
id: string; id: string;
cluster_id: string; cluster_id: string;
@@ -374,6 +396,78 @@ export type NodeUpdateStatus = {
observed_at: string; observed_at: string;
}; };
/**
 * Per-product entry of `StaleNodeRiskNode.products`.
 *
 * Most descriptive fields are optional and may also be `null`; only `product`
 * and `compatible_artifact_found` are always present.
 */
export type StaleNodeRiskProduct = {
product: string;
current_version?: string | null;
target_version?: string | null;
channel?: string | null;
strategy?: string | null;
enabled?: boolean;
detected_os?: string | null;
detected_arch?: string | null;
detected_install_type?: string | null;
compatible_artifact_found: boolean;
matching_release_version?: string | null;
last_status_observed_at?: string | null;
last_status_phase?: string | null;
last_status_value?: string | null;
last_status_reason?: string | null;
recovery_bridge_required?: boolean;
recovery_bridge_replay_ready?: boolean;
recovery_bridge_mode?: string | null;
risks?: string[];
};
/**
 * Per-node entry of `StaleNodeRiskReport.nodes`.
 *
 * Combines node identity and health fields, optional direct-peer and recovery
 * bridge fields, the node-level `risks` list and the per-product breakdown in
 * `products`.
 */
export type StaleNodeRiskNode = {
node_id: string;
name: string;
node_key?: string;
reported_version?: string | null;
health_status: string;
registration_status: string;
last_seen_at?: string | null;
heartbeat_stale: boolean;
blocked: boolean;
direct_peer_alert?: boolean;
direct_peer_ready_count?: number;
direct_peer_target_count?: number;
direct_peer_deficit?: number;
alerts?: string[];
recovery_bridge_required?: boolean;
recovery_bridge_replay_ready?: boolean;
recovery_bridge_actions?: string[];
risks: string[];
products: StaleNodeRiskProduct[];
};
/**
 * Aggregate node counters carried in `StaleNodeRiskReport.summary`.
 *
 * `total_nodes`, `stale_nodes` and `blocked_nodes` are always present; the
 * remaining counters are optional.
 */
export type StaleNodeRiskSummary = {
total_nodes: number;
stale_nodes: number;
blocked_nodes: number;
direct_peer_alert_nodes?: number;
artifact_gap_nodes?: number;
unknown_profile_nodes?: number;
waiting_update_status_nodes?: number;
unknown_version_nodes?: number;
legacy_recovery_contract_nodes?: number;
recovery_bridge_required_nodes?: number;
recovery_bridge_replay_ready_nodes?: number;
waiting_recovery_heartbeat_nodes?: number;
};
/**
 * Cluster-wide stale-node risk report, as returned under
 * `stale_node_risk_report` by `AdminApiClient.getStaleNodeRiskReport`.
 */
export type StaleNodeRiskReport = {
cluster_id: string;
// NOTE(review): presumably an ISO-8601 timestamp string; confirm against the backend.
generated_at: string;
heartbeat_stale_after_seconds?: number;
legacy_removal_allowed: boolean;
bridge_hold_required?: boolean;
bridge_hold_node_ids?: string[];
bridge_hold_reasons?: string[];
blocked_operations?: string[];
// Aggregate counters for the cluster.
summary: StaleNodeRiskSummary;
// One entry per node covered by the report.
nodes: StaleNodeRiskNode[];
};
export type MeshLink = { export type MeshLink = {
id: string; id: string;
cluster_id: string; cluster_id: string;
@@ -1196,6 +1290,8 @@ export type NodeSyntheticMeshConfig = {
auto_port_start?: number; auto_port_start?: number;
auto_port_end?: number; auto_port_end?: number;
advertise_endpoint?: string; advertise_endpoint?: string;
advertise_endpoints?: string[];
endpoint_candidates?: PeerEndpointCandidate[];
advertise_transport?: string; advertise_transport?: string;
connectivity_mode?: string; connectivity_mode?: string;
nat_type?: string; nat_type?: string;