3
This commit is contained in:
@@ -140,15 +140,12 @@ func run(ctx context.Context) (smokeReport, error) {
|
||||
return smokeReport{}, fmt.Errorf("test service: %w", err)
|
||||
}
|
||||
fabricSessionStartedAt := time.Now()
|
||||
fabricSession, _, err := mesh.NewClient(nodeB.URL).OpenFabricSession(ctx, mesh.FabricSessionDialOptions{
|
||||
Token: "rap_fsn_mesh_live_smoke",
|
||||
Timeout: 3 * time.Second,
|
||||
})
|
||||
fabricSession, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
|
||||
if err != nil {
|
||||
return smokeReport{}, fmt.Errorf("fabric session open: %w", err)
|
||||
return smokeReport{}, fmt.Errorf("fabric quic session open: %w", err)
|
||||
}
|
||||
defer fabricSession.Close()
|
||||
firstFabricSessionResponse, err := fabricSession.RoundTrip(ctx, fabricproto.Frame{
|
||||
firstFabricSessionResponse, err := smokeFabricSessionRoundTrip(ctx, fabricSession, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: uint64(fabricSessionStartedAt.UnixNano()),
|
||||
Payload: []byte("mesh-live-smoke-fabric-session"),
|
||||
@@ -156,7 +153,7 @@ func run(ctx context.Context) (smokeReport, error) {
|
||||
if err != nil {
|
||||
return smokeReport{}, fmt.Errorf("fabric session first round trip: %w", err)
|
||||
}
|
||||
secondFabricSessionResponse, err := fabricSession.RoundTrip(ctx, fabricproto.Frame{
|
||||
secondFabricSessionResponse, err := smokeFabricSessionRoundTrip(ctx, fabricSession, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: uint64(fabricSessionStartedAt.UnixNano()) + 1,
|
||||
Payload: []byte("mesh-live-smoke-fabric-session-2"),
|
||||
@@ -175,13 +172,9 @@ func run(ctx context.Context) (smokeReport, error) {
|
||||
}
|
||||
fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow, fabricVPNPressureLevel, fabricVPNPressureScore, fabricVPNPressureReasons, fabricVPNPressureAction := smokeVPNFlowSchedulerBulkPressure()
|
||||
fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS, fabricVPNRecoveryReason := smokeVPNFlowSchedulerRouteRecovery()
|
||||
fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
|
||||
if err != nil {
|
||||
return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err)
|
||||
}
|
||||
|
||||
return smokeReport{
|
||||
Stage: "C17F scoped synthetic config plus live HTTP transport",
|
||||
Stage: "C17F scoped synthetic config plus live QUIC fabric transport",
|
||||
ProductionForwarding: false,
|
||||
ScopedConfigLoaded: nodeAConfig.ConfigVersion == "smoke-config-v1",
|
||||
DirectProbeAccepted: directAck.MessageType == mesh.SyntheticMessageProbeAck,
|
||||
@@ -210,11 +203,11 @@ func run(ctx context.Context) (smokeReport, error) {
|
||||
FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS,
|
||||
FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS,
|
||||
FabricVPNRecoveryReason: fabricVPNRecoveryReason,
|
||||
FabricQUICAccepted: fabricQUICAccepted,
|
||||
FabricQUICAccepted: fabricSessionAccepted,
|
||||
FabricQUICEndpoint: fabricQUICEndpoint,
|
||||
FabricQUICPressure: fabricQUICPressure,
|
||||
FabricSessionLatencyMS: fabricSessionLatency.Milliseconds(),
|
||||
FabricSessionEndpoint: nodeB.URL + "/mesh/v1/fabric/session/ws",
|
||||
FabricSessionEndpoint: "quic://" + fabricQUICEndpoint,
|
||||
PeerEndpoints: map[string]any{
|
||||
"node-a": nodeA.URL,
|
||||
"node-r": nodeR.URL,
|
||||
@@ -269,18 +262,16 @@ func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64, st
|
||||
stat.LastRouteSwitchReason
|
||||
}
|
||||
|
||||
func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
|
||||
func smokeQUICFabricSession(ctx context.Context) (mesh.FabricTransportSession, string, int, error) {
|
||||
server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: smokeQUICTLSConfig(),
|
||||
})
|
||||
if err != nil {
|
||||
return false, "", 0, err
|
||||
return nil, "", 0, err
|
||||
}
|
||||
defer server.Close()
|
||||
endpoint := server.Addr().String()
|
||||
transport := mesh.NewQUICFabricTransport(nil)
|
||||
defer transport.Close()
|
||||
session, err := transport.Connect(ctx, mesh.FabricTransportTarget{
|
||||
PeerID: "node-b",
|
||||
Endpoint: endpoint,
|
||||
@@ -293,31 +284,12 @@ func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
|
||||
ErrorBuffer: 4,
|
||||
})
|
||||
if err != nil {
|
||||
return false, endpoint, 0, err
|
||||
}
|
||||
defer session.Close()
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: uint64(time.Now().UnixNano()),
|
||||
Payload: []byte("mesh-live-smoke-quic"),
|
||||
}); err != nil {
|
||||
return false, endpoint, 0, err
|
||||
}
|
||||
timer := time.NewTimer(3 * time.Second)
|
||||
defer timer.Stop()
|
||||
for {
|
||||
select {
|
||||
case frame := <-session.Frames():
|
||||
snapshot := transport.Snapshot()
|
||||
return frame.Type == fabricproto.FramePong && string(frame.Payload) == "mesh-live-smoke-quic", endpoint, snapshot.CapacityPressurePercent, nil
|
||||
case err := <-session.Errors():
|
||||
return false, endpoint, 0, err
|
||||
case <-timer.C:
|
||||
return false, endpoint, 0, fmt.Errorf("timed out waiting for quic pong")
|
||||
case <-ctx.Done():
|
||||
return false, endpoint, 0, ctx.Err()
|
||||
}
|
||||
_ = transport.Close()
|
||||
_ = server.Close()
|
||||
return nil, endpoint, 0, err
|
||||
}
|
||||
snapshot := transport.Snapshot()
|
||||
return &smokeManagedFabricSession{session: session, transport: transport, server: server}, endpoint, snapshot.CapacityPressurePercent, nil
|
||||
}
|
||||
|
||||
func smokeQUICTLSConfig() *tls.Config {
|
||||
@@ -341,25 +313,20 @@ func smokeQUICTLSConfig() *tls.Config {
|
||||
}
|
||||
}
|
||||
|
||||
func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.FabricSessionClient) (bool, bool, int, error) {
|
||||
func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession mesh.FabricTransportSession) (bool, bool, int, error) {
|
||||
const interactiveStreamID uint64 = 4400
|
||||
const bulkStreamID uint64 = 4401
|
||||
pump := fabricSession.StartPump(ctx, mesh.FabricSessionPumpOptions{
|
||||
OutboundBuffer: 4,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
})
|
||||
defer pump.Close()
|
||||
for _, frame := range []fabricproto.Frame{
|
||||
{Type: fabricproto.FrameOpenStream, StreamID: interactiveStreamID, TrafficClass: fabricproto.TrafficClassInteractive},
|
||||
{Type: fabricproto.FrameOpenStream, StreamID: bulkStreamID, TrafficClass: fabricproto.TrafficClassBulk},
|
||||
} {
|
||||
if err := pump.Send(ctx, frame); err != nil {
|
||||
if err := fabricSession.Send(ctx, frame); err != nil {
|
||||
return false, false, 0, err
|
||||
}
|
||||
}
|
||||
transport := &vpnruntime.FabricSessionPacketTransport{
|
||||
Sender: pump,
|
||||
Sender: fabricSession,
|
||||
Receiver: fabricSession,
|
||||
StreamID: interactiveStreamID,
|
||||
VPNConnectionID: "vpn-smoke",
|
||||
SendDirection: vpnruntime.FabricDirectionGatewayToClient,
|
||||
@@ -378,7 +345,7 @@ func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.Fa
|
||||
acked := map[uint64]bool{}
|
||||
for {
|
||||
select {
|
||||
case frame := <-pump.Frames():
|
||||
case frame := <-fabricSession.Frames():
|
||||
if frame.Type == fabricproto.FrameAck && frame.Sequence == 1 {
|
||||
acked[frame.StreamID] = true
|
||||
if acked[interactiveStreamID] && acked[bulkStreamID] {
|
||||
@@ -393,7 +360,7 @@ func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.Fa
|
||||
return true, sharded, int(fanout), nil
|
||||
}
|
||||
}
|
||||
case err := <-pump.Errors():
|
||||
case err := <-fabricSession.Errors():
|
||||
return false, false, 0, err
|
||||
case <-timer.C:
|
||||
return false, false, 0, fmt.Errorf("timed out waiting for fabric vpn packet ack")
|
||||
@@ -403,6 +370,68 @@ func smokeFabricVPNPacketOverSession(ctx context.Context, fabricSession *mesh.Fa
|
||||
}
|
||||
}
|
||||
|
||||
type smokeManagedFabricSession struct {
|
||||
session mesh.FabricTransportSession
|
||||
transport *mesh.QUICFabricTransport
|
||||
server *mesh.QUICFabricServer
|
||||
}
|
||||
|
||||
func (s *smokeManagedFabricSession) Send(ctx context.Context, frame fabricproto.Frame) error {
|
||||
return s.session.Send(ctx, frame)
|
||||
}
|
||||
|
||||
func (s *smokeManagedFabricSession) Frames() <-chan fabricproto.Frame {
|
||||
return s.session.Frames()
|
||||
}
|
||||
|
||||
func (s *smokeManagedFabricSession) Errors() <-chan error {
|
||||
return s.session.Errors()
|
||||
}
|
||||
|
||||
func (s *smokeManagedFabricSession) Closed() bool {
|
||||
return s.session.Closed()
|
||||
}
|
||||
|
||||
func (s *smokeManagedFabricSession) Close() error {
|
||||
var firstErr error
|
||||
if s.session != nil {
|
||||
firstErr = s.session.Close()
|
||||
}
|
||||
if s.transport != nil {
|
||||
if err := s.transport.Close(); firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
if s.server != nil {
|
||||
if err := s.server.Close(); firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
return firstErr
|
||||
}
|
||||
|
||||
func smokeFabricSessionRoundTrip(ctx context.Context, session mesh.FabricTransportSession, frame fabricproto.Frame) (fabricproto.Frame, error) {
|
||||
if err := session.Send(ctx, frame); err != nil {
|
||||
return fabricproto.Frame{}, err
|
||||
}
|
||||
timer := time.NewTimer(3 * time.Second)
|
||||
defer timer.Stop()
|
||||
for {
|
||||
select {
|
||||
case response := <-session.Frames():
|
||||
if response.Sequence == frame.Sequence {
|
||||
return response, nil
|
||||
}
|
||||
case err := <-session.Errors():
|
||||
return fabricproto.Frame{}, err
|
||||
case <-timer.C:
|
||||
return fabricproto.Frame{}, fmt.Errorf("timed out waiting for fabric session response")
|
||||
case <-ctx.Done():
|
||||
return fabricproto.Frame{}, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func smokeIPv4TCPPacket(src [4]byte, dst [4]byte, srcPort uint16, dstPort uint16, flags byte) []byte {
|
||||
packet := make([]byte, 40)
|
||||
packet[0] = 0x45
|
||||
@@ -445,7 +474,7 @@ func writeSmokeScopedConfig(local mesh.PeerIdentity, peers map[string]string, ro
|
||||
func newSmokeNode(local mesh.PeerIdentity) *smokeNode {
|
||||
node := &smokeNode{Local: local}
|
||||
node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime, FabricSessionEnabled: true, FabricSessionWebSocketEnabled: true}.Handler().ServeHTTP(w, r)
|
||||
mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r)
|
||||
}))
|
||||
node.URL = node.server.URL
|
||||
return node
|
||||
|
||||
@@ -6,7 +6,6 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
@@ -15,9 +14,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
|
||||
)
|
||||
|
||||
type installCommandConfig struct {
|
||||
@@ -82,10 +79,6 @@ func main() {
|
||||
if err := runUpdateHostAgentLoop(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-host-agent-loop failed: %v", err)
|
||||
}
|
||||
case "fabric-session-smoke":
|
||||
if err := runFabricSessionSmoke(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("fabric-session-smoke failed: %v", err)
|
||||
}
|
||||
default:
|
||||
usage()
|
||||
os.Exit(2)
|
||||
@@ -117,78 +110,6 @@ func applyStagedSelfUpdate() {
|
||||
_ = os.Remove(backup)
|
||||
}
|
||||
|
||||
func runFabricSessionSmoke(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("fabric-session-smoke", flag.ContinueOnError)
|
||||
var meshURL string
|
||||
var token string
|
||||
var timeoutSeconds int
|
||||
var payload string
|
||||
var authorityPayload string
|
||||
var authoritySignature string
|
||||
fs.StringVar(&meshURL, "mesh-url", getenv("RAP_MESH_SMOKE_URL", ""), "Mesh base URL, for example http://node:19131.")
|
||||
fs.StringVar(&token, "token", getenv("RAP_FABRIC_SESSION_TOKEN", ""), "Fabric session token starting with rap_fsn_.")
|
||||
fs.IntVar(&timeoutSeconds, "timeout-seconds", getenvInt("RAP_FABRIC_SESSION_SMOKE_TIMEOUT_SECONDS", 5), "Smoke timeout in seconds.")
|
||||
fs.StringVar(&payload, "payload", getenv("RAP_FABRIC_SESSION_SMOKE_PAYLOAD", "rap-fabric-session-smoke"), "Ping payload.")
|
||||
fs.StringVar(&authorityPayload, "authority-payload", getenv("RAP_FABRIC_SESSION_AUTHORITY_PAYLOAD", ""), "Base64 or JSON fabric session authority payload header.")
|
||||
fs.StringVar(&authoritySignature, "authority-signature", getenv("RAP_FABRIC_SESSION_AUTHORITY_SIGNATURE", ""), "Base64 or JSON fabric session authority signature header.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(meshURL) == "" {
|
||||
return fmt.Errorf("mesh-url is required")
|
||||
}
|
||||
if strings.TrimSpace(token) == "" {
|
||||
return fmt.Errorf("token is required")
|
||||
}
|
||||
if timeoutSeconds <= 0 {
|
||||
timeoutSeconds = 5
|
||||
}
|
||||
smokeCtx, cancel := context.WithTimeout(ctx, time.Duration(timeoutSeconds)*time.Second)
|
||||
defer cancel()
|
||||
header := make(http.Header)
|
||||
if strings.TrimSpace(authorityPayload) != "" {
|
||||
header.Set("X-RAP-Fabric-Session-Authority-Payload", strings.TrimSpace(authorityPayload))
|
||||
}
|
||||
if strings.TrimSpace(authoritySignature) != "" {
|
||||
header.Set("X-RAP-Fabric-Session-Authority-Signature", strings.TrimSpace(authoritySignature))
|
||||
}
|
||||
startedAt := time.Now()
|
||||
response, err := mesh.NewClient(meshURL).SendFabricSessionFrame(smokeCtx, mesh.FabricSessionDialOptions{
|
||||
Token: token,
|
||||
Header: header,
|
||||
Timeout: time.Duration(timeoutSeconds) * time.Second,
|
||||
}, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: uint64(startedAt.UnixNano()),
|
||||
Payload: []byte(payload),
|
||||
})
|
||||
duration := time.Since(startedAt)
|
||||
result := map[string]any{
|
||||
"schema_version": "rap.fabric_session_smoke_result.v1",
|
||||
"mesh_url": strings.TrimSpace(meshURL),
|
||||
"ok": err == nil && response.Type == fabricproto.FramePong && string(response.Payload) == payload,
|
||||
"latency_ms": duration.Milliseconds(),
|
||||
"response_type": response.Type,
|
||||
"sequence": response.Sequence,
|
||||
"authority": strings.TrimSpace(authorityPayload) != "" || strings.TrimSpace(authoritySignature) != "",
|
||||
}
|
||||
if err != nil {
|
||||
result["error"] = err.Error()
|
||||
}
|
||||
encoded, marshalErr := json.MarshalIndent(result, "", " ")
|
||||
if marshalErr != nil {
|
||||
return marshalErr
|
||||
}
|
||||
fmt.Println(string(encoded))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if response.Type != fabricproto.FramePong || string(response.Payload) != payload {
|
||||
return fmt.Errorf("fabric session smoke returned unexpected response type=%d payload=%q", response.Type, string(response.Payload))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func runInstallLinux(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("install-linux", flag.ContinueOnError)
|
||||
cfg := hostagent.LinuxInstallConfig{}
|
||||
@@ -215,16 +136,15 @@ func runInstallLinux(ctx context.Context, args []string) error {
|
||||
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
|
||||
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
@@ -303,16 +223,15 @@ func runInstallWindows(ctx context.Context, args []string) error {
|
||||
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
|
||||
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
@@ -513,16 +432,19 @@ func runUpdateLoop(ctx context.Context, args []string) error {
|
||||
}
|
||||
cfg.HostAgentUpdateEnabled = hostAgentStatusEnabled
|
||||
cfg.HostAgentUpdateRequest = hostagent.HostAgentUpdateRequest{
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
CurrentVersion: hostAgentVersion,
|
||||
Channel: req.Channel,
|
||||
OS: firstNonEmptyLocal(req.OS, runtime.GOOS),
|
||||
Arch: firstNonEmptyLocal(req.Arch, runtime.GOARCH),
|
||||
InstallType: hostagent.BinaryUpdateInstallType,
|
||||
BinaryPath: hostAgentBinaryPath,
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
ClusterAuthorityPublicKey: req.ClusterAuthorityPublicKey,
|
||||
FabricRegistryRecordsJSON: req.FabricRegistryRecordsJSON,
|
||||
MeshRegion: req.MeshRegion,
|
||||
CurrentVersion: hostAgentVersion,
|
||||
Channel: req.Channel,
|
||||
OS: firstNonEmptyLocal(req.OS, runtime.GOOS),
|
||||
Arch: firstNonEmptyLocal(req.Arch, runtime.GOARCH),
|
||||
InstallType: hostagent.BinaryUpdateInstallType,
|
||||
BinaryPath: hostAgentBinaryPath,
|
||||
}
|
||||
if req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows" {
|
||||
cfg.HostAgentUpdateRequest.InstallType = "windows_binary"
|
||||
@@ -569,6 +491,9 @@ func parseMonitor(args []string) (hostagent.MonitorConfig, error) {
|
||||
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
|
||||
fs.StringVar(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
|
||||
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used to reach update/control services.")
|
||||
fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint for fabric registry endpoint selection.")
|
||||
fs.StringVar(&cfg.Product, "product", getenv("RAP_MONITOR_PRODUCT", hostagent.DefaultMonitorProduct), "Status product name.")
|
||||
fs.StringVar(&cfg.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version.")
|
||||
fs.StringVar(&cfg.DockerBinary, "docker-binary", getenv("RAP_DOCKER_BINARY", "docker"), "Docker CLI binary.")
|
||||
@@ -716,6 +641,9 @@ func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int,
|
||||
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json.")
|
||||
fs.StringVar(&req.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
|
||||
fs.StringVar(&req.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used to reach update/control services.")
|
||||
fs.StringVar(&req.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint for fabric registry endpoint selection.")
|
||||
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Currently installed rap-host-agent version.")
|
||||
fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
|
||||
fs.StringVar(&req.OS, "os", getenv("RAP_HOST_AGENT_UPDATE_OS", runtime.GOOS), "Host-agent artifact OS selector.")
|
||||
@@ -739,6 +667,9 @@ func registerUpdateFlags(fs *flag.FlagSet, req *hostagent.UpdateRequest, healthT
|
||||
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet.")
|
||||
fs.StringVar(&req.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
|
||||
fs.StringVar(&req.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used to reach update/control services.")
|
||||
fs.StringVar(&req.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint for fabric registry endpoint selection.")
|
||||
fs.StringVar(&req.Product, "product", getenv("RAP_UPDATE_PRODUCT", hostagent.DefaultUpdateProduct), "Update product name.")
|
||||
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Currently running product version.")
|
||||
fs.StringVar(&req.OS, "os", getenv("RAP_UPDATE_OS", runtime.GOOS), "Artifact OS selector.")
|
||||
@@ -797,16 +728,15 @@ func parseInstall(args []string) (installCommandConfig, error) {
|
||||
fs.IntVar(&autoUpdate.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.")
|
||||
fs.BoolVar(&autoUpdate.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.")
|
||||
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
|
||||
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
|
||||
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
|
||||
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
|
||||
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
|
||||
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
|
||||
fs.IntVar(&cfg.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Synthetic mesh HTTP listen address inside container.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address inside container.")
|
||||
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
|
||||
@@ -941,13 +871,12 @@ func usage() {
|
||||
rap-host-agent install -backend-url URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options]
|
||||
rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options]
|
||||
rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options]
|
||||
rap-host-agent install-updater -backend-url URL -cluster-id ID -state-dir DIR -container-name NAME
|
||||
rap-host-agent update-host-agent -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent update-host-agent-loop -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent monitor-loop -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME
|
||||
rap-host-agent monitor-once -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME
|
||||
rap-host-agent fabric-session-smoke -mesh-url URL -token rap_fsn_TOKEN [-authority-payload VALUE -authority-signature VALUE]
|
||||
rap-host-agent update -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent update-loop -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent install-updater (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR -container-name NAME
|
||||
rap-host-agent update-host-agent (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR
|
||||
rap-host-agent update-host-agent-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR
|
||||
rap-host-agent monitor-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR --watch-container NAME
|
||||
rap-host-agent monitor-once (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR --watch-container NAME
|
||||
rap-host-agent update (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent update-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent status [-container-name NAME]`)
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -21,6 +21,7 @@ import (
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -204,7 +205,7 @@ func TestRouteManagerDecisionsFromControlPlaneConsumesRebuildRouteCommand(t *tes
|
||||
}
|
||||
decision := decisions[0]
|
||||
if decision.RouteID != "route-primary" ||
|
||||
decision.RebuildStatus != "pending_degraded_fallback" ||
|
||||
decision.RebuildStatus != "pending_degraded_route_state" ||
|
||||
decision.DecisionSource != "service_channel_remediation_command" ||
|
||||
decision.RebuildRequestID != "cmd-rebuild" {
|
||||
t.Fatalf("unexpected rebuild remediation decision: %+v", decision)
|
||||
@@ -279,7 +280,6 @@ func TestGatewayTransportForAssignmentUsesFabricSessionWhenEnabled(t *testing.T)
|
||||
&syntheticMeshState{
|
||||
ProductionForwardTransport: noopProductionForwardTransport{},
|
||||
VPNFabricInbox: inbox,
|
||||
VPNFabricSessionPeers: mesh.NewFabricSessionPeerManager(),
|
||||
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{
|
||||
"entry-1": {{
|
||||
EndpointID: "entry-1-quic",
|
||||
@@ -322,7 +322,6 @@ func TestGatewayTransportForAssignmentFallsBackWhenFabricSessionUnavailable(t *t
|
||||
&syntheticMeshState{
|
||||
ProductionForwardTransport: noopProductionForwardTransport{},
|
||||
VPNFabricInbox: inbox,
|
||||
VPNFabricSessionPeers: mesh.NewFabricSessionPeerManager(),
|
||||
PeerEndpoints: map[string]string{},
|
||||
Routes: []mesh.SyntheticRoute{{
|
||||
RouteID: "route-exit-entry",
|
||||
@@ -424,6 +423,496 @@ func testMainQUICCertSHA256(t *testing.T, config *tls.Config) string {
|
||||
return hex.EncodeToString(sum[:])
|
||||
}
|
||||
|
||||
func TestFabricControlForwardHandlerUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
var req client.RawControlRequest
|
||||
if err := json.Unmarshal(payload, &req); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if req.Path != "/auth/login" {
|
||||
return nil, fmt.Errorf("unexpected path %s", req.Path)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 200, Body: json.RawMessage(`{"via":"fabric"}`)})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
now := time.Now().UTC()
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
|
||||
if err != nil {
|
||||
t.Fatalf("generate key: %v", err)
|
||||
}
|
||||
issuer := mesh.FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: mesh.FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
record := mesh.FabricRegistryGossipRecord{
|
||||
SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
|
||||
ClusterID: "cluster-1",
|
||||
Service: mesh.FabricRegistryServiceControlAPI,
|
||||
Scope: mesh.FabricRegistryScopeCluster,
|
||||
Epoch: 1,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Hour),
|
||||
IssuerNodeID: "authority-1",
|
||||
IssuerRole: mesh.FabricRegistryAuthorityControl,
|
||||
Endpoints: []mesh.FabricRegistryEndpoint{{
|
||||
EndpointID: "control-a",
|
||||
Address: "quic://" + server.Addr().String(),
|
||||
Transport: "direct_quic",
|
||||
PeerCertSHA256: testMainQUICCertSHA256(t, tlsConfig),
|
||||
}},
|
||||
}
|
||||
signed, err := mesh.SignFabricRegistryGossipRecord(record, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign registry record: %v", err)
|
||||
}
|
||||
registry := mesh.NewFabricRegistry()
|
||||
if _, _, err := registry.ApplyGossipRecord(signed, mesh.FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []mesh.FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}, true); err != nil {
|
||||
t.Fatalf("apply registry record: %v", err)
|
||||
}
|
||||
transport := mesh.NewQUICFabricTransport(nil)
|
||||
transport.SetLocalPeerID("node-a")
|
||||
handler := fabricControlForwardHandlerFromMeshState(nil, state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: transport,
|
||||
ListenerRuntimeConfig: config.Config{MeshRegion: "test"},
|
||||
})
|
||||
payload, err := handler(context.Background(), []byte(`{"method":"POST","path":"/auth/login","body":{"user":"a"}}`))
|
||||
if err != nil {
|
||||
t.Fatalf("fabric control handler: %v", err)
|
||||
}
|
||||
var response client.RawControlResponse
|
||||
if err := json.Unmarshal(payload, &response); err != nil {
|
||||
t.Fatalf("decode raw control response: %v", err)
|
||||
}
|
||||
if response.StatusCode != 200 || string(response.Body) != `{"via":"fabric"}` {
|
||||
t.Fatalf("response = %+v", response)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatViaFabricControlUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
var req client.RawControlRequest
|
||||
if err := json.Unmarshal(payload, &req); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if req.Method != http.MethodPost || req.Path != "/clusters/cluster-1/nodes/node-a/heartbeats" {
|
||||
return nil, fmt.Errorf("unexpected request: %+v", req)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{
|
||||
StatusCode: 202,
|
||||
Body: json.RawMessage(`{
|
||||
"heartbeat":{"id":"hb-1"},
|
||||
"testing_flags":{"enabled":true,"synthetic_links_enabled":true,"applied_scopes":["cluster"]},
|
||||
"update_hint":{"schema_version":"rap.node_update_hint.v1","check_now":true,"generation":"gen-1"}
|
||||
}`),
|
||||
})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
now := time.Now().UTC()
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
|
||||
if err != nil {
|
||||
t.Fatalf("generate key: %v", err)
|
||||
}
|
||||
issuer := mesh.FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: mesh.FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
record := mesh.FabricRegistryGossipRecord{
|
||||
SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
|
||||
ClusterID: "cluster-1",
|
||||
Service: mesh.FabricRegistryServiceControlAPI,
|
||||
Scope: mesh.FabricRegistryScopeCluster,
|
||||
Epoch: 1,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Hour),
|
||||
IssuerNodeID: "authority-1",
|
||||
IssuerRole: mesh.FabricRegistryAuthorityControl,
|
||||
Endpoints: []mesh.FabricRegistryEndpoint{{
|
||||
EndpointID: "control-a",
|
||||
Address: "quic://" + server.Addr().String(),
|
||||
Transport: "direct_quic",
|
||||
PeerCertSHA256: testMainQUICCertSHA256(t, tlsConfig),
|
||||
}},
|
||||
}
|
||||
signed, err := mesh.SignFabricRegistryGossipRecord(record, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign registry record: %v", err)
|
||||
}
|
||||
registry := mesh.NewFabricRegistry()
|
||||
if _, _, err := registry.ApplyGossipRecord(signed, mesh.FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []mesh.FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}, true); err != nil {
|
||||
t.Fatalf("apply registry record: %v", err)
|
||||
}
|
||||
response, viaFabric, err := heartbeatViaFabricControl(context.Background(), state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
|
||||
}, client.HeartbeatRequest{HealthStatus: "healthy"})
|
||||
if err != nil {
|
||||
t.Fatalf("heartbeat via fabric: %v", err)
|
||||
}
|
||||
if !viaFabric || !response.TestingFlags.Enabled || response.UpdateHint == nil || response.UpdateHint.Generation != "gen-1" {
|
||||
t.Fatalf("unexpected heartbeat response viaFabric=%t response=%+v", viaFabric, response)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticMeshConfigRefreshUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
var req client.RawControlRequest
|
||||
if err := json.Unmarshal(payload, &req); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if req.Method != http.MethodGet || req.Path != "/clusters/cluster-1/nodes/node-a/mesh/synthetic-config" {
|
||||
return nil, fmt.Errorf("unexpected request: %+v", req)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{
|
||||
StatusCode: 200,
|
||||
Body: json.RawMessage(`{
|
||||
"synthetic_mesh_config":{
|
||||
"enabled":true,
|
||||
"config_version":"fabric-gen-1",
|
||||
"peer_directory_version":"pd-1",
|
||||
"policy_version":"pol-1",
|
||||
"peer_endpoints":{},
|
||||
"routes":[]
|
||||
}
|
||||
}`),
|
||||
})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))
|
||||
loaded, err := loadSyntheticMeshConfigRuntime(context.Background(), config.Config{}, state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, nil, &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("load synthetic mesh config via fabric: %v", err)
|
||||
}
|
||||
if loaded.Source != "control_plane" || loaded.ConfigVersion != "fabric-gen-1" {
|
||||
t.Fatalf("loaded = %+v", loaded)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReportMeshLinkUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
var received client.RawControlRequest
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
if err := json.Unmarshal(payload, &received); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if received.Method != http.MethodPost || received.Path != "/clusters/cluster-1/mesh/links" {
|
||||
return nil, fmt.Errorf("unexpected request: %+v", received)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 202, Body: json.RawMessage(`{"ok":true}`)})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))
|
||||
err = reportMeshLink(context.Background(), nil, state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
|
||||
}, client.MeshLinkObservationRequest{
|
||||
SourceNodeID: "node-a",
|
||||
TargetNodeID: "node-b",
|
||||
LinkStatus: "reachable",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("report mesh link via fabric: %v", err)
|
||||
}
|
||||
if len(received.Body) == 0 || !strings.Contains(string(received.Body), `"target_node_id":"node-b"`) {
|
||||
t.Fatalf("unexpected received body: %s", string(received.Body))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReportTelemetryUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
var received client.RawControlRequest
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
if err := json.Unmarshal(payload, &received); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if received.Method != http.MethodPost || received.Path != "/clusters/cluster-1/nodes/node-a/telemetry" {
|
||||
return nil, fmt.Errorf("unexpected request: %+v", received)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 202, Body: json.RawMessage(`{"ok":true}`)})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))
|
||||
err = reportTelemetry(context.Background(), nil, state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
|
||||
}, client.TelemetryRequest{Payload: map[string]any{"fabric": "quic"}})
|
||||
if err != nil {
|
||||
t.Fatalf("report telemetry via fabric: %v", err)
|
||||
}
|
||||
if len(received.Body) == 0 || !strings.Contains(string(received.Body), `"fabric":"quic"`) {
|
||||
t.Fatalf("unexpected received body: %s", string(received.Body))
|
||||
}
|
||||
}
|
||||
|
||||
func TestWorkloadControlUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
var paths []string
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
var req client.RawControlRequest
|
||||
if err := json.Unmarshal(payload, &req); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
paths = append(paths, req.Method+" "+req.Path)
|
||||
switch req.Path {
|
||||
case "/clusters/cluster-1/nodes/node-a/workloads/desired":
|
||||
return json.Marshal(client.RawControlResponse{
|
||||
StatusCode: 200,
|
||||
Body: json.RawMessage(`{"desired_workloads":[{"service_type":"vpn-egress","desired_state":"enabled","runtime_mode":"node"}]}`),
|
||||
})
|
||||
case "/clusters/cluster-1/nodes/node-a/workloads/vpn-egress/status":
|
||||
if len(req.Body) == 0 || !strings.Contains(string(req.Body), `"reported_state":"running"`) {
|
||||
return nil, fmt.Errorf("unexpected status body: %s", string(req.Body))
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)})
|
||||
default:
|
||||
return nil, fmt.Errorf("unexpected request: %+v", req)
|
||||
}
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))
|
||||
meshState := &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
|
||||
}
|
||||
identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
desired, err := desiredWorkloads(context.Background(), nil, identity, meshState)
|
||||
if err != nil {
|
||||
t.Fatalf("desired workloads via fabric: %v", err)
|
||||
}
|
||||
if len(desired) != 1 || desired[0].ServiceType != "vpn-egress" {
|
||||
t.Fatalf("desired = %+v", desired)
|
||||
}
|
||||
if err := reportSingleWorkloadStatus(context.Background(), nil, identity, meshState, "vpn-egress", client.WorkloadStatusRequest{ReportedState: "running"}); err != nil {
|
||||
t.Fatalf("report workload status via fabric: %v", err)
|
||||
}
|
||||
want := []string{
|
||||
"GET /clusters/cluster-1/nodes/node-a/workloads/desired",
|
||||
"POST /clusters/cluster-1/nodes/node-a/workloads/vpn-egress/status",
|
||||
}
|
||||
if !reflect.DeepEqual(paths, want) {
|
||||
t.Fatalf("paths = %+v, want %+v", paths, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAdminRuntimeProjectionUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
var received client.RawControlRequest
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
if err := json.Unmarshal(payload, &received); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if received.Method != http.MethodPost || received.Path != "/clusters/cluster-1/nodes/node-a/admin-runtime/projection" {
|
||||
return nil, fmt.Errorf("unexpected request: %+v", received)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{
|
||||
StatusCode: 200,
|
||||
Body: json.RawMessage(`{"schema_version":"rap.admin_runtime_projection.v1","status":"ok","status_code":200,"body":{"page":"cluster"}}`),
|
||||
})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))
|
||||
projection, err := controlAPIProjectionClient{
|
||||
Identity: state.Identity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
MeshState: &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
|
||||
},
|
||||
}.Project(context.Background(), webingress.ControlAPIProjectionRequest{
|
||||
SchemaVersion: "rap.web_ingress_projection.v1",
|
||||
Method: http.MethodGet,
|
||||
Path: "/cluster-admin",
|
||||
Scope: "cluster",
|
||||
ServiceClass: "cluster_admin",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("admin projection via fabric: %v", err)
|
||||
}
|
||||
if projection.StatusCode != 200 || string(projection.Body) != `{"page":"cluster"}` {
|
||||
t.Fatalf("projection = %+v", projection)
|
||||
}
|
||||
if len(received.Body) == 0 || !strings.Contains(string(received.Body), `"service_class":"cluster_admin"`) {
|
||||
t.Fatalf("unexpected received body: %s", string(received.Body))
|
||||
}
|
||||
}
|
||||
|
||||
func TestVPNAssignmentControlUsesRegistryQUICControlAPI(t *testing.T) {
|
||||
tlsConfig := testMainQUICTLSConfig(t)
|
||||
var paths []string
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
var req client.RawControlRequest
|
||||
if err := json.Unmarshal(payload, &req); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
paths = append(paths, req.Method+" "+req.Path)
|
||||
switch req.Path {
|
||||
case "/clusters/cluster-1/nodes/node-a/vpn/assignments":
|
||||
return json.Marshal(client.RawControlResponse{
|
||||
StatusCode: 200,
|
||||
Body: json.RawMessage(`{"vpn_assignments":[{"vpn_connection_id":"vpn-1","desired_state":"enabled","assignment_reason":"eligible_candidate"}]}`),
|
||||
})
|
||||
case "/clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/acquire":
|
||||
return json.Marshal(client.RawControlResponse{
|
||||
StatusCode: 201,
|
||||
Body: json.RawMessage(`{"lease":{"lease_id":"lease-1","owner_node_id":"node-a","lease_generation":1,"status":"active"}}`),
|
||||
})
|
||||
case "/clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/lease-1/renew":
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)})
|
||||
case "/clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/status":
|
||||
if len(req.Body) == 0 || !strings.Contains(string(req.Body), `"observed_status":"assigned"`) {
|
||||
return nil, fmt.Errorf("unexpected status body: %s", string(req.Body))
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)})
|
||||
default:
|
||||
return nil, fmt.Errorf("unexpected request: %+v", req)
|
||||
}
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
registry := signedTestControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testMainQUICCertSHA256(t, tlsConfig))
|
||||
meshState := &syntheticMeshState{
|
||||
FabricRegistry: registry,
|
||||
VPNFabricQUICTransport: mesh.NewQUICFabricTransport(nil),
|
||||
}
|
||||
identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
assignments, err := nodeVPNAssignments(context.Background(), nil, identity, meshState)
|
||||
if err != nil {
|
||||
t.Fatalf("vpn assignments via fabric: %v", err)
|
||||
}
|
||||
if len(assignments) != 1 || assignments[0].VPNConnectionID != "vpn-1" {
|
||||
t.Fatalf("assignments = %+v", assignments)
|
||||
}
|
||||
lease, err := acquireNodeVPNAssignmentLease(context.Background(), nil, identity, meshState, "vpn-1", client.NodeVPNAssignmentLeaseAcquireRequest{TTLSeconds: 300})
|
||||
if err != nil {
|
||||
t.Fatalf("acquire lease via fabric: %v", err)
|
||||
}
|
||||
if lease == nil || lease.LeaseID != "lease-1" {
|
||||
t.Fatalf("lease = %+v", lease)
|
||||
}
|
||||
if err := renewNodeVPNAssignmentLease(context.Background(), nil, identity, meshState, "vpn-1", "lease-1", client.NodeVPNAssignmentLeaseRenewRequest{TTLSeconds: 300}); err != nil {
|
||||
t.Fatalf("renew lease via fabric: %v", err)
|
||||
}
|
||||
if err := reportNodeVPNAssignmentStatus(context.Background(), nil, identity, meshState, "vpn-1", client.NodeVPNAssignmentStatusRequest{ObservedStatus: "assigned"}); err != nil {
|
||||
t.Fatalf("report status via fabric: %v", err)
|
||||
}
|
||||
want := []string{
|
||||
"GET /clusters/cluster-1/nodes/node-a/vpn/assignments",
|
||||
"POST /clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/acquire",
|
||||
"POST /clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/lease/lease-1/renew",
|
||||
"POST /clusters/cluster-1/nodes/node-a/vpn/assignments/vpn-1/status",
|
||||
}
|
||||
if !reflect.DeepEqual(paths, want) {
|
||||
t.Fatalf("paths = %+v, want %+v", paths, want)
|
||||
}
|
||||
}
|
||||
|
||||
func signedTestControlRegistry(t *testing.T, clusterID string, endpoint string, certSHA256 string) *mesh.FabricRegistry {
|
||||
t.Helper()
|
||||
now := time.Now().UTC()
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
|
||||
if err != nil {
|
||||
t.Fatalf("generate key: %v", err)
|
||||
}
|
||||
issuer := mesh.FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: mesh.FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
record := mesh.FabricRegistryGossipRecord{
|
||||
SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
|
||||
ClusterID: clusterID,
|
||||
Service: mesh.FabricRegistryServiceControlAPI,
|
||||
Scope: mesh.FabricRegistryScopeCluster,
|
||||
Epoch: 1,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Hour),
|
||||
IssuerNodeID: "authority-1",
|
||||
IssuerRole: mesh.FabricRegistryAuthorityControl,
|
||||
Endpoints: []mesh.FabricRegistryEndpoint{{
|
||||
EndpointID: "control-a",
|
||||
Address: endpoint,
|
||||
Transport: "direct_quic",
|
||||
PeerCertSHA256: certSHA256,
|
||||
}},
|
||||
}
|
||||
signed, err := mesh.SignFabricRegistryGossipRecord(record, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign registry record: %v", err)
|
||||
}
|
||||
registry := mesh.NewFabricRegistry()
|
||||
if _, _, err := registry.ApplyGossipRecord(signed, mesh.FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: clusterID,
|
||||
TrustedIssuers: []mesh.FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}, true); err != nil {
|
||||
t.Fatalf("apply registry record: %v", err)
|
||||
}
|
||||
return registry
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneKeepsExplicitRemediationCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{{
|
||||
@@ -493,9 +982,10 @@ func TestFabricServiceChannelAccessStatsReportsDataPlaneViolations(t *testing.T)
|
||||
OccurredAt: time.Unix(10, 0).UTC(),
|
||||
})
|
||||
report := stats.Report(time.Unix(20, 0).UTC())
|
||||
if report["backend_fallback_blocked"] != int64(1) ||
|
||||
if report["degraded_compatibility_blocked"] != int64(1) ||
|
||||
report["fabric_route_send_failure"] != int64(1) ||
|
||||
report["last_data_plane_violation_status"] != "fabric_route_send_failed_backend_fallback_blocked" ||
|
||||
report["last_data_plane_violation_status"] != "degraded_compatibility_blocked" ||
|
||||
report["last_data_plane_violation_status_raw"] != "fabric_route_send_failed_backend_fallback_blocked" ||
|
||||
report["last_data_plane_violation_reason"] != "mesh synthetic route not found" {
|
||||
t.Fatalf("unexpected violation report: %+v", report)
|
||||
}
|
||||
@@ -790,7 +1280,56 @@ func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t *testing.T) {
|
||||
func TestLoadFabricRegistryBootstrapAcceptsSignedCandidate(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader)
|
||||
if err != nil {
|
||||
t.Fatalf("GenerateKey: %v", err)
|
||||
}
|
||||
record := mesh.FabricRegistryGossipRecord{
|
||||
SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
|
||||
ClusterID: "cluster-1",
|
||||
Service: mesh.FabricRegistryServiceControlAPI,
|
||||
Scope: mesh.FabricRegistryScopeCluster,
|
||||
Epoch: 1,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Hour),
|
||||
IssuerNodeID: "authority-node",
|
||||
IssuerRole: mesh.FabricRegistryAuthorityControl,
|
||||
Endpoints: []mesh.FabricRegistryEndpoint{
|
||||
{EndpointID: "control-a", Address: "quic://control.example.test:19443", Transport: "direct_quic"},
|
||||
},
|
||||
}
|
||||
signed, err := mesh.SignFabricRegistryGossipRecord(record, mesh.FabricRegistryTrustedIssuer{
|
||||
IssuerID: "cluster-authority",
|
||||
Role: mesh.FabricRegistryAuthorityControl,
|
||||
}, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign registry record: %v", err)
|
||||
}
|
||||
raw, err := json.Marshal([]mesh.FabricRegistryGossipRecord{signed})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal registry records: %v", err)
|
||||
}
|
||||
registry, report := loadFabricRegistryBootstrap(config.Config{
|
||||
ClusterAuthorityPublicKey: base64.StdEncoding.EncodeToString(publicKey),
|
||||
FabricRegistryRecordsJSON: string(raw),
|
||||
}, state.Identity{ClusterID: "cluster-1"})
|
||||
if registry == nil || report.Total != 1 || report.Candidate != 1 || report.Rejected != 0 {
|
||||
t.Fatalf("unexpected registry bootstrap report: %+v registry=%v", report, registry)
|
||||
}
|
||||
if _, ok := registry.Active("cluster-1", mesh.FabricRegistryServiceControlAPI, mesh.FabricRegistryScopeCluster, "", now); ok {
|
||||
t.Fatal("bootstrap record should remain candidate until live verification")
|
||||
}
|
||||
if !registry.MarkLiveVerified("cluster-1", mesh.FabricRegistryServiceControlAPI, mesh.FabricRegistryScopeCluster, "", now) {
|
||||
t.Fatal("MarkLiveVerified = false")
|
||||
}
|
||||
if _, ok := registry.Active("cluster-1", mesh.FabricRegistryServiceControlAPI, mesh.FabricRegistryScopeCluster, "", now); !ok {
|
||||
t.Fatal("expected active record after live verification")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeLoadedSyntheticMeshConfigMigratesNonQUICControlPlaneSurfaces(t *testing.T) {
|
||||
loaded := loadedSyntheticMeshConfig{
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-a": "https://node-a.example.test:443",
|
||||
@@ -798,7 +1337,7 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
|
||||
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-legacy",
|
||||
EndpointID: "node-b-http-migration",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: "https://node-b.example.test:443",
|
||||
@@ -816,7 +1355,7 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
|
||||
},
|
||||
RendezvousLeases: []mesh.PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-legacy",
|
||||
LeaseID: "lease-http-migration",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://node-r.example.test:19001",
|
||||
@@ -824,7 +1363,7 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
|
||||
},
|
||||
},
|
||||
RoutePathDecisions: &client.RoutePathDecisionReport{
|
||||
Decisions: []client.RoutePathDecision{{DecisionID: "decision-legacy", SelectedRelayEndpoint: "http://node-r.example.test:19001"}},
|
||||
Decisions: []client.RoutePathDecision{{DecisionID: "decision-http-migration", SelectedRelayEndpoint: "http://node-r.example.test:19001"}},
|
||||
},
|
||||
}
|
||||
normalizeLoadedSyntheticMeshConfigQUICOnly(&loaded)
|
||||
@@ -849,14 +1388,14 @@ func TestNormalizeLoadedSyntheticMeshConfigMigratesLegacyControlPlaneSurfaces(t
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateLoadedSyntheticMeshConfigRejectsUnnormalizedLegacyControlPlaneSurfaces(t *testing.T) {
|
||||
func TestValidateLoadedSyntheticMeshConfigRejectsUnnormalizedNonQUICControlPlaneSurfaces(t *testing.T) {
|
||||
err := validateLoadedSyntheticMeshConfigQUICOnly(loadedSyntheticMeshConfig{
|
||||
RoutePathDecisions: &client.RoutePathDecisionReport{
|
||||
Decisions: []client.RoutePathDecision{{DecisionID: "decision-legacy", SelectedRelayEndpoint: "http://node-r.example.test:19001"}},
|
||||
Decisions: []client.RoutePathDecision{{DecisionID: "decision-http-migration", SelectedRelayEndpoint: "http://node-r.example.test:19001"}},
|
||||
},
|
||||
})
|
||||
if err == nil || !strings.Contains(err.Error(), "QUIC selected relay endpoint") {
|
||||
t.Fatalf("expected legacy selected relay endpoint rejection, got %v", err)
|
||||
t.Fatalf("expected non-QUIC selected relay endpoint rejection, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -942,7 +1481,6 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
|
||||
MeshRegion: "eu",
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshProductionForwardingEnabled: true,
|
||||
MeshFabricSessionEnabled: true,
|
||||
VPNFabricSessionTransportEnabled: true,
|
||||
VPNFabricSessionStreamShards: 6,
|
||||
VPNFabricQUICMaxStreamsPerConn: 24,
|
||||
@@ -952,7 +1490,6 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, &syntheticMeshState{
|
||||
VPNFabricSessionPeers: mesh.NewFabricSessionPeerManager(),
|
||||
VPNFabricQUICTransport: func() *mesh.QUICFabricTransport {
|
||||
transport := mesh.NewQUICFabricTransport(nil)
|
||||
transport.MaxStreamsPerConn = 24
|
||||
@@ -1010,8 +1547,7 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
|
||||
if report, ok := payload.Metadata["vpn_fabric_session_transport_report"].(map[string]any); !ok ||
|
||||
report["packet_payload"] != "rap.vpn_packet_batch.fabric.v1" ||
|
||||
report["transport"] != "fabric_session_binary_frames" ||
|
||||
report["stream_shards_per_class"] != 6 ||
|
||||
report["peer_sessions"] == nil {
|
||||
report["stream_shards_per_class"] != 6 {
|
||||
t.Fatalf("vpn fabric session report missing: %+v", payload.Metadata)
|
||||
} else if report["quic_sessions"] == nil || report["quic_max_streams_per_conn"] != 24 {
|
||||
t.Fatalf("vpn fabric quic session report missing: %+v", report)
|
||||
@@ -1242,14 +1778,14 @@ func TestVPNFabricSessionTargetPrefersRankedQUICCandidate(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestVPNFabricSessionTargetFallsBackToLegacyPeerEndpoint(t *testing.T) {
|
||||
func TestVPNFabricSessionTargetRejectsNonQUICPeerEndpoint(t *testing.T) {
|
||||
_, ok := vpnFabricSessionTarget(&syntheticMeshState{
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-b": "https://node-b.example.test:443/",
|
||||
},
|
||||
}, "node-b")
|
||||
if ok {
|
||||
t.Fatal("legacy peer endpoint unexpectedly produced a QUIC target")
|
||||
t.Fatal("non-QUIC peer endpoint unexpectedly produced a QUIC target")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1257,7 +1793,7 @@ func TestVPNFabricSessionTargetsIncludeRankedQUICCandidatesWithoutLegacyFallback
|
||||
now := time.Now().UTC()
|
||||
targets := vpnFabricSessionTargets(&syntheticMeshState{
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-b": "https://node-b-legacy.example.test:443/",
|
||||
"node-b": "https://node-b-http-migration.example.test:443/",
|
||||
},
|
||||
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
@@ -2731,7 +3267,7 @@ func TestWebIngressForwardHandlerFromConfigVerifiesSignedEnvelope(t *testing.T)
|
||||
keyID := "web-key-1"
|
||||
handler := webIngressForwardHandlerFromConfig(config.Config{
|
||||
WebIngressTrustedKeysJSON: webingress.TrustedKeysJSONForPublicKey(keyID, publicKey),
|
||||
}, state.Identity{ClusterID: "cluster-1", NodeID: "node-1"}, nil)
|
||||
}, state.Identity{ClusterID: "cluster-1", NodeID: "node-1"}, nil, nil)
|
||||
if handler == nil {
|
||||
t.Fatal("handler is nil")
|
||||
}
|
||||
@@ -2780,10 +3316,10 @@ func TestWebIngressForwardHandlerFromConfigVerifiesSignedEnvelope(t *testing.T)
|
||||
}
|
||||
|
||||
func TestWebIngressForwardHandlerFromConfigDisabledWithoutTrustedKeys(t *testing.T) {
|
||||
if handler := webIngressForwardHandlerFromConfig(config.Config{}, state.Identity{}, nil); handler != nil {
|
||||
if handler := webIngressForwardHandlerFromConfig(config.Config{}, state.Identity{}, nil, nil); handler != nil {
|
||||
t.Fatal("handler should be nil without trusted keys")
|
||||
}
|
||||
if handler := webIngressForwardHandlerFromConfig(config.Config{WebIngressTrustedKeysJSON: `{"bad":"key"}`}, state.Identity{}, nil); handler != nil {
|
||||
if handler := webIngressForwardHandlerFromConfig(config.Config{WebIngressTrustedKeysJSON: `{"bad":"key"}`}, state.Identity{}, nil, nil); handler != nil {
|
||||
t.Fatal("handler should be nil with invalid trusted keys")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
const Version = "0.2.309-latencyaware"
|
||||
const Version = "0.2.321-directreadytarget"
|
||||
|
||||
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
|
||||
return client.EnrollRequest{
|
||||
|
||||
@@ -828,9 +828,6 @@ func (c *Client) RawControl(ctx context.Context, request RawControlRequest) (Raw
|
||||
if err != nil {
|
||||
return RawControlResponse{}, err
|
||||
}
|
||||
if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
|
||||
return RawControlResponse{}, fmt.Errorf("backend returned status %d: %s", httpResp.StatusCode, string(payload))
|
||||
}
|
||||
return RawControlResponse{StatusCode: httpResp.StatusCode, Body: json.RawMessage(payload)}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"os"
|
||||
@@ -31,7 +32,6 @@ type Config struct {
|
||||
EnrollmentPollTimeout time.Duration
|
||||
MeshSyntheticRuntimeEnabled bool
|
||||
MeshProductionForwardingEnabled bool
|
||||
MeshFabricSessionEnabled bool
|
||||
VPNFabricSessionTransportEnabled bool
|
||||
MeshQUICFabricEnabled bool
|
||||
MeshQUICFabricListenAddr string
|
||||
@@ -45,6 +45,7 @@ type Config struct {
|
||||
MeshListenAutoPortEnd int
|
||||
MeshAdvertiseEndpoint string
|
||||
MeshAdvertiseEndpointsJSON string
|
||||
FabricRegistryRecordsJSON string
|
||||
MeshAdvertiseTransport string
|
||||
MeshConnectivityMode string
|
||||
MeshNATType string
|
||||
@@ -86,7 +87,6 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
fs.StringVar(&cfg.WebIngressRuntimeServiceClasses, "web-ingress-runtime-service-classes", getEnv(env, "RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES", ""), "Optional comma-separated allow-list of web ingress runtime service classes accepted by this node.")
|
||||
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
|
||||
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getEnvBool(env, "RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint. Disabled by default.")
|
||||
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getEnvBool(env, "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric session when explicitly enabled. Disabled by default.")
|
||||
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getEnvBool(env, "RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener. Disabled by default.")
|
||||
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getEnv(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "Listen address for QUIC/UDP fabric endpoint, for example :19443.")
|
||||
@@ -94,12 +94,13 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getEnvInt(env, "RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
|
||||
fs.DurationVar(&cfg.VPNFabricQUICIdleTTL, "vpn-fabric-quic-idle-ttl", time.Duration(getEnvInt(env, "RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300))*time.Second, "Idle TTL for cached VPN QUIC carrier connections.")
|
||||
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default historical synthetic mesh HTTP endpoint.")
|
||||
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
|
||||
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getEnv(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry gossip records used as bootstrap discovery seeds.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Transport label for the advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.")
|
||||
@@ -150,6 +151,7 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
}
|
||||
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
|
||||
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
|
||||
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
|
||||
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
|
||||
if cfg.MeshAdvertiseTransport == "" {
|
||||
cfg.MeshAdvertiseTransport = "quic"
|
||||
@@ -199,6 +201,9 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
|
||||
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
|
||||
}
|
||||
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
|
||||
return Config{}, errors.New("fabric registry records must be a JSON array")
|
||||
}
|
||||
switch cfg.MeshListenPortMode {
|
||||
case "", "manual", "auto", "disabled":
|
||||
if cfg.MeshListenPortMode == "" {
|
||||
@@ -269,6 +274,11 @@ func hasLegacyEndpointScheme(endpoint string) bool {
|
||||
strings.HasPrefix(endpoint, "wss://")
|
||||
}
|
||||
|
||||
func isJSONArray(value string) bool {
|
||||
var items []json.RawMessage
|
||||
return json.Unmarshal([]byte(strings.TrimSpace(value)), &items) == nil
|
||||
}
|
||||
|
||||
func readEnv() map[string]string {
|
||||
out := map[string]string{}
|
||||
for _, pair := range os.Environ() {
|
||||
|
||||
@@ -25,7 +25,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
|
||||
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
|
||||
"RAP_MESH_FABRIC_SESSION_ENABLED": "true",
|
||||
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED": "true",
|
||||
"RAP_MESH_QUIC_FABRIC_ENABLED": "true",
|
||||
"RAP_MESH_QUIC_FABRIC_LISTEN_ADDR": ":19443",
|
||||
@@ -39,6 +38,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443/",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
|
||||
"RAP_FABRIC_REGISTRY_RECORDS_JSON": ` [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}] `,
|
||||
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
|
||||
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
|
||||
"RAP_MESH_NAT_TYPE": "symmetric",
|
||||
@@ -93,9 +93,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
if !cfg.MeshProductionForwardingEnabled {
|
||||
t.Fatal("MeshProductionForwardingEnabled = false, want true")
|
||||
}
|
||||
if !cfg.MeshFabricSessionEnabled {
|
||||
t.Fatal("MeshFabricSessionEnabled = false, want true")
|
||||
}
|
||||
if !cfg.VPNFabricSessionTransportEnabled {
|
||||
t.Fatal("VPNFabricSessionTransportEnabled = false, want true")
|
||||
}
|
||||
@@ -122,6 +119,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:19443" ||
|
||||
cfg.MeshAdvertiseEndpointsJSON == "" ||
|
||||
cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]` ||
|
||||
cfg.MeshAdvertiseTransport != "direct_quic" ||
|
||||
cfg.MeshConnectivityMode != "outbound_only" ||
|
||||
cfg.MeshNATType != "symmetric" ||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
@@ -29,7 +30,6 @@ type RuntimeConfig struct {
|
||||
WorkloadSupervisionEnabled bool
|
||||
MeshSyntheticRuntimeEnabled bool
|
||||
MeshProductionForwardingEnabled bool
|
||||
MeshFabricSessionEnabled bool
|
||||
VPNFabricSessionTransportEnabled bool
|
||||
MeshQUICFabricEnabled bool
|
||||
MeshQUICFabricListenAddr string
|
||||
@@ -42,6 +42,7 @@ type RuntimeConfig struct {
|
||||
MeshListenAutoPortEnd int
|
||||
MeshAdvertiseEndpoint string
|
||||
MeshAdvertiseEndpointsJSON string
|
||||
FabricRegistryRecordsJSON string
|
||||
MeshAdvertiseTransport string
|
||||
MeshConnectivityMode string
|
||||
MeshNATType string
|
||||
@@ -84,6 +85,7 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
|
||||
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
|
||||
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
|
||||
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
|
||||
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
|
||||
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
|
||||
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
|
||||
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
|
||||
@@ -145,6 +147,9 @@ func (cfg RuntimeConfig) ValidateInstall() error {
|
||||
if cfg.ProductionObservationSinkCap < 0 {
|
||||
return errors.New("production observation sink capacity must not be negative")
|
||||
}
|
||||
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
|
||||
return errors.New("fabric registry records must be a JSON array")
|
||||
}
|
||||
for _, item := range cfg.ExtraEnv {
|
||||
if !strings.Contains(item, "=") {
|
||||
return fmt.Errorf("extra env %q must be KEY=VALUE", item)
|
||||
@@ -176,3 +181,8 @@ func hasLegacyEndpointScheme(endpoint string) bool {
|
||||
strings.HasPrefix(endpoint, "ws://") ||
|
||||
strings.HasPrefix(endpoint, "wss://")
|
||||
}
|
||||
|
||||
func isJSONArray(value string) bool {
|
||||
var items []json.RawMessage
|
||||
return json.Unmarshal([]byte(strings.TrimSpace(value)), &items) == nil
|
||||
}
|
||||
|
||||
@@ -264,7 +264,6 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
|
||||
"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
|
||||
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
|
||||
"RAP_MESH_FABRIC_SESSION_ENABLED=" + boolString(cfg.MeshFabricSessionEnabled),
|
||||
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=" + boolString(cfg.VPNFabricSessionTransportEnabled),
|
||||
"RAP_MESH_QUIC_FABRIC_ENABLED=" + boolString(cfg.MeshQUICFabricEnabled),
|
||||
"RAP_VPN_FABRIC_SESSION_STREAM_SHARDS=" + strconv.Itoa(cfg.VPNFabricSessionStreamShards),
|
||||
@@ -295,6 +294,9 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
|
||||
if cfg.MeshAdvertiseEndpointsJSON != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON="+cfg.MeshAdvertiseEndpointsJSON)
|
||||
}
|
||||
if cfg.FabricRegistryRecordsJSON != "" {
|
||||
env = append(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON="+cfg.FabricRegistryRecordsJSON)
|
||||
}
|
||||
if cfg.MeshAdvertiseTransport != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_TRANSPORT="+cfg.MeshAdvertiseTransport)
|
||||
}
|
||||
|
||||
@@ -74,6 +74,7 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
|
||||
VPNFabricQUICIdleTTLSeconds: 120,
|
||||
MeshListenAddr: ":19131",
|
||||
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443/",
|
||||
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
|
||||
MeshAdvertiseTransport: "direct_quic",
|
||||
MeshConnectivityMode: "private_lan",
|
||||
})
|
||||
@@ -96,6 +97,7 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
|
||||
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=120",
|
||||
"RAP_MESH_LISTEN_ADDR=:19131",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINT=quic://10.0.0.11:19443",
|
||||
`RAP_FABRIC_REGISTRY_RECORDS_JSON=[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
|
||||
"RAP_MESH_ADVERTISE_TRANSPORT=direct_quic",
|
||||
"RAP_MESH_CONNECTIVITY_MODE=private_lan",
|
||||
"rap-node-agent:test",
|
||||
@@ -164,6 +166,11 @@ func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
|
||||
"node_name": "node-a",
|
||||
"image": "rap-node-agent:test",
|
||||
"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
|
||||
"fabric_registry_records": []map[string]any{{
|
||||
"schema": "rap.fabric.registry.gossip_record.v1",
|
||||
"service_class": "control-api",
|
||||
"service_id": "control-a",
|
||||
}},
|
||||
"docker_image_artifact": map[string]any{
|
||||
"kind": "docker_image_tar",
|
||||
"image": "rap-node-agent:test",
|
||||
@@ -207,6 +214,7 @@ func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
|
||||
!cfg.MeshQUICFabricEnabled ||
|
||||
cfg.MeshQUICFabricListenAddr != ":19443" ||
|
||||
cfg.VPNFabricSessionStreamShards != 6 ||
|
||||
cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api","service_id":"control-a"}]` ||
|
||||
cfg.MeshConnectivityMode != "outbound_only" {
|
||||
t.Fatalf("unexpected cfg: %+v", cfg)
|
||||
}
|
||||
|
||||
@@ -72,7 +72,6 @@ func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConf
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
|
||||
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
|
||||
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
|
||||
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
|
||||
@@ -287,7 +286,6 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
|
||||
args := []string{
|
||||
result.HostAgentPath,
|
||||
"update-loop",
|
||||
"--backend-url", cfg.RuntimeConfig.BackendURL,
|
||||
"--cluster-id", cfg.RuntimeConfig.ClusterID,
|
||||
"--state-dir", result.StateDir,
|
||||
"--current-version", cfg.AutoUpdateCurrentVersion,
|
||||
@@ -303,6 +301,10 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
|
||||
"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
|
||||
"--host-agent-binary-path", result.HostAgentPath,
|
||||
}
|
||||
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
|
||||
args = append(args, "--backend-url", strings.TrimSpace(cfg.RuntimeConfig.BackendURL))
|
||||
}
|
||||
args = appendFabricUpdateArgs(args, cfg.RuntimeConfig)
|
||||
if strings.TrimSpace(cfg.NodeID) != "" {
|
||||
args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID))
|
||||
}
|
||||
@@ -363,48 +365,48 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
|
||||
}
|
||||
status.Payload["systemd_unit"] = req.SystemdUnitName
|
||||
status.Payload["binary_path"] = req.BinaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
runner := m.runner()
|
||||
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
|
||||
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
result.Replaced = true
|
||||
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
|
||||
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
@@ -31,31 +31,34 @@ const (
|
||||
)
|
||||
|
||||
type MonitorConfig struct {
|
||||
BackendURL string
|
||||
ClusterID string
|
||||
NodeID string
|
||||
StateDir string
|
||||
Product string
|
||||
CurrentVersion string
|
||||
Interval time.Duration
|
||||
InitialDelay time.Duration
|
||||
MaxRuns int
|
||||
DockerBinary string
|
||||
WatchContainers []string
|
||||
RestartContainers bool
|
||||
RestartCooldown time.Duration
|
||||
StaleRestartingAfter time.Duration
|
||||
DiskPath string
|
||||
TmpDir string
|
||||
DiskWarnPercent int
|
||||
DiskCleanupPercent int
|
||||
DiskCriticalPercent int
|
||||
TmpMinAge time.Duration
|
||||
CleanupDocker bool
|
||||
StatusFile string
|
||||
Runner CommandRunner
|
||||
Logf func(format string, args ...any)
|
||||
restartHistory map[string]time.Time
|
||||
BackendURL string
|
||||
ClusterID string
|
||||
NodeID string
|
||||
StateDir string
|
||||
ClusterAuthorityPublicKey string
|
||||
FabricRegistryRecordsJSON string
|
||||
MeshRegion string
|
||||
Product string
|
||||
CurrentVersion string
|
||||
Interval time.Duration
|
||||
InitialDelay time.Duration
|
||||
MaxRuns int
|
||||
DockerBinary string
|
||||
WatchContainers []string
|
||||
RestartContainers bool
|
||||
RestartCooldown time.Duration
|
||||
StaleRestartingAfter time.Duration
|
||||
DiskPath string
|
||||
TmpDir string
|
||||
DiskWarnPercent int
|
||||
DiskCleanupPercent int
|
||||
DiskCriticalPercent int
|
||||
TmpMinAge time.Duration
|
||||
CleanupDocker bool
|
||||
StatusFile string
|
||||
Runner CommandRunner
|
||||
Logf func(format string, args ...any)
|
||||
restartHistory map[string]time.Time
|
||||
}
|
||||
|
||||
type DiskUsage struct {
|
||||
@@ -421,7 +424,18 @@ func reportMonitorStatus(ctx context.Context, cfg MonitorConfig, result MonitorR
|
||||
if errText != "" {
|
||||
req.ErrorMessage = &errText
|
||||
}
|
||||
return ReportNodeUpdateStatus(ctx, cfg.BackendURL, clusterID, nodeID, req)
|
||||
return ReportNodeUpdateStatusForRequest(ctx, UpdateRequest{
|
||||
BackendURL: cfg.BackendURL,
|
||||
ClusterID: clusterID,
|
||||
NodeID: nodeID,
|
||||
StateDir: cfg.StateDir,
|
||||
ClusterAuthorityPublicKey: cfg.ClusterAuthorityPublicKey,
|
||||
FabricRegistryRecordsJSON: cfg.FabricRegistryRecordsJSON,
|
||||
MeshRegion: cfg.MeshRegion,
|
||||
Product: cfg.Product,
|
||||
CurrentVersion: cfg.CurrentVersion,
|
||||
InstallType: DefaultUpdateInstallType,
|
||||
}, req)
|
||||
}
|
||||
|
||||
func resolveMonitorIdentity(cfg MonitorConfig) (string, string, error) {
|
||||
|
||||
@@ -16,6 +16,7 @@ type DockerInstallProfile struct {
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
|
||||
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
@@ -30,7 +31,6 @@ type DockerInstallProfile struct {
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
|
||||
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
|
||||
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
|
||||
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
|
||||
@@ -70,6 +70,7 @@ type WindowsInstallProfile struct {
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
@@ -79,7 +80,6 @@ type WindowsInstallProfile struct {
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
|
||||
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
|
||||
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
|
||||
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
|
||||
@@ -109,6 +109,7 @@ type LinuxInstallProfile struct {
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
@@ -118,7 +119,6 @@ type LinuxInstallProfile struct {
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
|
||||
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
|
||||
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
|
||||
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
|
||||
@@ -302,7 +302,6 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
|
||||
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
|
||||
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
|
||||
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
|
||||
@@ -315,6 +314,7 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
|
||||
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
|
||||
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
|
||||
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
|
||||
FabricRegistryRecordsJSON: string(profile.FabricRegistryRecords),
|
||||
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
|
||||
MeshConnectivityMode: profile.MeshConnectivityMode,
|
||||
MeshNATType: profile.MeshNATType,
|
||||
|
||||
@@ -10,19 +10,22 @@ import (
|
||||
)
|
||||
|
||||
// HostAgentUpdateRequest carries the inputs for one host-agent update attempt.
//
// The control API is addressed either through BackendURL or through the
// fabric registry records in FabricRegistryRecordsJSON; MeshRegion is passed
// along as the preferred region for fabric lookups.
type HostAgentUpdateRequest struct {
	BackendURL                string
	ClusterID                 string
	NodeID                    string
	StateDir                  string
	ClusterAuthorityPublicKey string
	FabricRegistryRecordsJSON string
	MeshRegion                string
	CurrentVersion            string
	Channel                   string
	OS                        string
	Arch                      string
	InstallType               string
	BinaryPath                string
	DryRun                    bool
	RestartService            string
	RestartAfterApply         bool
}
|
||||
|
||||
type HostAgentUpdateLoopConfig struct {
|
||||
@@ -37,18 +40,21 @@ type HostAgentUpdateLoopConfig struct {
|
||||
|
||||
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
|
||||
return UpdateRequest{
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
OS: firstNonEmpty(req.OS, "linux"),
|
||||
Arch: req.Arch,
|
||||
InstallType: firstNonEmpty(req.InstallType, BinaryUpdateInstallType),
|
||||
Channel: req.Channel,
|
||||
ContainerName: "host-agent-service",
|
||||
DryRun: req.DryRun,
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
ClusterAuthorityPublicKey: req.ClusterAuthorityPublicKey,
|
||||
FabricRegistryRecordsJSON: req.FabricRegistryRecordsJSON,
|
||||
MeshRegion: req.MeshRegion,
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
OS: firstNonEmpty(req.OS, "linux"),
|
||||
Arch: req.Arch,
|
||||
InstallType: firstNonEmpty(req.InstallType, BinaryUpdateInstallType),
|
||||
Channel: req.Channel,
|
||||
ContainerName: "host-agent-service",
|
||||
DryRun: req.DryRun,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,25 +85,25 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
|
||||
status.Payload = map[string]any{}
|
||||
}
|
||||
status.Payload["binary_path"] = binaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, status)
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("host-agent update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if !isBinaryInstallType(plan.Artifact.InstallType) {
|
||||
err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -109,7 +115,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
|
||||
})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "download", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
@@ -125,7 +131,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
|
||||
Image: binaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -137,7 +143,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
|
||||
return result, err
|
||||
}
|
||||
result.Loaded = true
|
||||
@@ -151,7 +157,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
|
||||
Image: binaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
|
||||
@@ -173,8 +173,8 @@ func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServi
|
||||
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
|
||||
runtimeCfg := cfg.RuntimeConfig.Normalize()
|
||||
var missing []string
|
||||
if runtimeCfg.BackendURL == "" {
|
||||
missing = append(missing, "backend-url")
|
||||
if runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "" {
|
||||
missing = append(missing, "backend-url-or-fabric-registry-records-json")
|
||||
}
|
||||
if runtimeCfg.ClusterID == "" {
|
||||
missing = append(missing, "cluster-id")
|
||||
@@ -191,7 +191,6 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
|
||||
args := []string{
|
||||
cfg.BinaryInstallPath,
|
||||
"update-loop",
|
||||
"--backend-url", runtimeCfg.BackendURL,
|
||||
"--cluster-id", runtimeCfg.ClusterID,
|
||||
"--state-dir", runtimeCfg.StateDir,
|
||||
"--container-name", runtimeCfg.ContainerName,
|
||||
@@ -202,9 +201,13 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
|
||||
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
|
||||
"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
|
||||
}
|
||||
if runtimeCfg.BackendURL != "" {
|
||||
args = append(args, "--backend-url", runtimeCfg.BackendURL)
|
||||
}
|
||||
if strings.TrimSpace(cfg.Channel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
|
||||
}
|
||||
args = appendFabricUpdateArgs(args, runtimeCfg)
|
||||
execStart := systemdJoin(args)
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent updater for %s
|
||||
@@ -225,8 +228,8 @@ WantedBy=multi-user.target
|
||||
|
||||
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
|
||||
runtimeCfg := cfg.RuntimeConfig.Normalize()
|
||||
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
|
||||
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host-agent self updater")
|
||||
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
|
||||
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host-agent self updater")
|
||||
}
|
||||
unitName := "rap-host-agent-self-updater.service"
|
||||
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
|
||||
@@ -234,7 +237,6 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
|
||||
args := []string{
|
||||
cfg.BinaryInstallPath,
|
||||
"update-host-agent-loop",
|
||||
"--backend-url", runtimeCfg.BackendURL,
|
||||
"--cluster-id", runtimeCfg.ClusterID,
|
||||
"--state-dir", runtimeCfg.StateDir,
|
||||
"--binary-path", firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath),
|
||||
@@ -243,9 +245,13 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
|
||||
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
|
||||
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
|
||||
}
|
||||
if runtimeCfg.BackendURL != "" {
|
||||
args = append(args, "--backend-url", runtimeCfg.BackendURL)
|
||||
}
|
||||
if strings.TrimSpace(cfg.Channel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
|
||||
}
|
||||
args = appendFabricUpdateArgs(args, runtimeCfg)
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent self updater
|
||||
After=network-online.target docker.service
|
||||
@@ -265,8 +271,8 @@ WantedBy=multi-user.target
|
||||
|
||||
func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string, error) {
|
||||
runtimeCfg := cfg.RuntimeConfig.Normalize()
|
||||
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
|
||||
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host monitor")
|
||||
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
|
||||
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host monitor")
|
||||
}
|
||||
containers := uniqueTrimmed(append([]string{runtimeCfg.ContainerName}, cfg.MonitorContainers...))
|
||||
if len(containers) == 0 {
|
||||
@@ -277,7 +283,6 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
|
||||
args := []string{
|
||||
cfg.BinaryInstallPath,
|
||||
"monitor-loop",
|
||||
"--backend-url", runtimeCfg.BackendURL,
|
||||
"--cluster-id", runtimeCfg.ClusterID,
|
||||
"--state-dir", runtimeCfg.StateDir,
|
||||
"--current-version", firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion),
|
||||
@@ -286,6 +291,9 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
|
||||
"--disk-cleanup-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCleanup, DefaultMonitorDiskCleanupPercent)),
|
||||
"--disk-critical-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCritical, DefaultMonitorDiskCriticalPercent)),
|
||||
}
|
||||
if runtimeCfg.BackendURL != "" {
|
||||
args = append(args, "--backend-url", runtimeCfg.BackendURL)
|
||||
}
|
||||
if cfg.MonitorCleanupDocker {
|
||||
args = append(args, "--cleanup-docker")
|
||||
}
|
||||
@@ -295,6 +303,7 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
|
||||
for _, container := range containers {
|
||||
args = append(args, "--watch-container", container)
|
||||
}
|
||||
args = appendFabricUpdateArgs(args, runtimeCfg)
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent monitor for %s
|
||||
After=network-online.target docker.service
|
||||
@@ -312,6 +321,16 @@ WantedBy=multi-user.target
|
||||
`, runtimeCfg.ContainerName, systemdJoin(args)), unitName, unitPath, nil
|
||||
}
|
||||
|
||||
func appendFabricUpdateArgs(args []string, runtimeCfg RuntimeConfig) []string {
|
||||
if strings.TrimSpace(runtimeCfg.FabricRegistryRecordsJSON) != "" {
|
||||
args = append(args, "--fabric-registry-records-json", strings.TrimSpace(runtimeCfg.FabricRegistryRecordsJSON))
|
||||
}
|
||||
if strings.TrimSpace(runtimeCfg.MeshRegion) != "" {
|
||||
args = append(args, "--mesh-region", strings.TrimSpace(runtimeCfg.MeshRegion))
|
||||
}
|
||||
return args
|
||||
}
|
||||
|
||||
func firstNonZero(values ...int) int {
|
||||
for _, value := range values {
|
||||
if value != 0 {
|
||||
|
||||
@@ -119,7 +119,7 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
|
||||
for _, want := range []string{
|
||||
":loop",
|
||||
"rap-host-agent.exe.next",
|
||||
"update-loop --backend-url",
|
||||
"update-loop --cluster-id",
|
||||
"--backend-url \"http://control/api/v1\"",
|
||||
"--cluster-id \"cluster-1\"",
|
||||
"--node-id \"node-1\"",
|
||||
@@ -139,6 +139,35 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
|
||||
cfg := WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
ClusterID: "cluster-1",
|
||||
FabricRegistryRecordsJSON: `[{"record_id":"r1"}]`,
|
||||
MeshRegion: "ru-msk",
|
||||
},
|
||||
AutoUpdateCurrentVersion: "0.1.2",
|
||||
}
|
||||
result := WindowsInstallResult{
|
||||
NodeName: "win-a",
|
||||
StateDir: `C:\ProgramData\RAP\nodes\win-a`,
|
||||
NodeAgentPath: `C:\Program Files\RAP\win-a\rap-node-agent.exe`,
|
||||
TaskName: "RAP Node Agent win-a",
|
||||
}
|
||||
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
|
||||
if strings.Contains(script, "--backend-url") {
|
||||
t.Fatalf("script must not include backend-url when it is empty:\n%s", script)
|
||||
}
|
||||
for _, want := range []string{
|
||||
`--fabric-registry-records-json [{"record_id":"r1"}]`,
|
||||
"--mesh-region ru-msk",
|
||||
} {
|
||||
if !strings.Contains(script, want) {
|
||||
t.Fatalf("script missing %q:\n%s", want, script)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
|
||||
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
|
||||
@@ -3,6 +3,8 @@ package hostagent
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -17,6 +19,8 @@ import (
|
||||
"time"
|
||||
|
||||
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
@@ -33,23 +37,26 @@ const (
|
||||
var ErrNodeIdentityNotReady = errors.New("node identity is not approved yet")
|
||||
|
||||
// UpdateRequest carries the inputs for planning and applying a node update.
//
// The control API is addressed either through BackendURL or through the
// fabric registry records in FabricRegistryRecordsJSON; at least one of the
// two must be set for the request to validate. ClusterAuthorityPublicKey and
// MeshRegion accompany the fabric path.
type UpdateRequest struct {
	BackendURL                string
	ClusterID                 string
	NodeID                    string
	StateDir                  string
	ClusterAuthorityPublicKey string
	FabricRegistryRecordsJSON string
	MeshRegion                string
	Product                   string
	CurrentVersion            string
	OS                        string
	Arch                      string
	InstallType               string
	Channel                   string
	ContainerName             string
	BinaryPath                string
	WindowsTaskName           string
	SystemdUnitName           string
	HealthTimeout             time.Duration
	DryRun                    bool
	AllowProductionMesh       bool
}
|
||||
|
||||
type UpdateResult struct {
|
||||
@@ -204,6 +211,9 @@ func (req UpdateRequest) Normalize() UpdateRequest {
|
||||
req.ClusterID = strings.TrimSpace(req.ClusterID)
|
||||
req.NodeID = strings.TrimSpace(req.NodeID)
|
||||
req.StateDir = strings.TrimSpace(req.StateDir)
|
||||
req.ClusterAuthorityPublicKey = strings.TrimSpace(req.ClusterAuthorityPublicKey)
|
||||
req.FabricRegistryRecordsJSON = strings.TrimSpace(req.FabricRegistryRecordsJSON)
|
||||
req.MeshRegion = strings.TrimSpace(req.MeshRegion)
|
||||
req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct)
|
||||
req.OS = firstNonEmpty(req.OS, runtime.GOOS)
|
||||
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
|
||||
@@ -222,8 +232,8 @@ func (req UpdateRequest) Normalize() UpdateRequest {
|
||||
func (req UpdateRequest) Validate() error {
|
||||
req = req.Normalize()
|
||||
var missing []string
|
||||
if req.BackendURL == "" {
|
||||
missing = append(missing, "backend-url")
|
||||
if req.BackendURL == "" && req.FabricRegistryRecordsJSON == "" {
|
||||
missing = append(missing, "backend-url-or-fabric-registry-records-json")
|
||||
}
|
||||
if req.ClusterID == "" {
|
||||
missing = append(missing, "cluster-id")
|
||||
@@ -285,30 +295,30 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
|
||||
}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromNoopPlan(req, plan))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromNoopPlan(req, plan))
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
result.NewImage = artifactImage(*plan.Artifact, "")
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -321,7 +331,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
|
||||
|
||||
current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "inspect", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "inspect", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
result.PreviousImageID = current.Image
|
||||
@@ -339,7 +349,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
|
||||
cfg.JoinToken = ""
|
||||
result.NewImage = cfg.Image
|
||||
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -351,7 +361,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
|
||||
})
|
||||
installed, err := m.Install(ctx, cfg)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
|
||||
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
|
||||
if rollbackErr == nil && plan.RollbackAllowed {
|
||||
result.RolledBack = true
|
||||
@@ -363,14 +373,14 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
|
||||
result.ContainerID = installed.ContainerID
|
||||
|
||||
if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "health_check", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "health_check", "failed", err))
|
||||
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
|
||||
if rollbackErr == nil && plan.RollbackAllowed {
|
||||
result.RolledBack = true
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -515,7 +525,27 @@ func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan
|
||||
if req.Channel != "" {
|
||||
values.Set("channel", req.Channel)
|
||||
}
|
||||
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/plan?%s", req.BackendURL, url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
|
||||
path := fmt.Sprintf("/clusters/%s/nodes/%s/updates/plan?%s", url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
|
||||
if raw, viaFabric, err := updateControlRawViaFabric(ctx, req, client.RawControlRequest{Method: http.MethodGet, Path: path}); viaFabric {
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
if raw.StatusCode < 200 || raw.StatusCode >= 300 {
|
||||
return NodeUpdatePlan{}, fmt.Errorf("fetch update plan via fabric: status %d", raw.StatusCode)
|
||||
}
|
||||
var out NodeUpdatePlanResponse
|
||||
if err := json.Unmarshal(raw.Body, &out); err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
if err := verifyNodeUpdatePlanAuthority(req, out.Plan); err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
return out.Plan, nil
|
||||
}
|
||||
endpoint := req.BackendURL + path
|
||||
if req.BackendURL == "" {
|
||||
return NodeUpdatePlan{}, errors.New("update plan control API is unavailable: no active fabric route and backend-url is empty")
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
@@ -538,6 +568,110 @@ func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan
|
||||
return out.Plan, nil
|
||||
}
|
||||
|
||||
func updateControlRawViaFabric(ctx context.Context, req UpdateRequest, rawReq client.RawControlRequest) (client.RawControlResponse, bool, error) {
|
||||
if strings.TrimSpace(req.FabricRegistryRecordsJSON) == "" {
|
||||
return client.RawControlResponse{}, false, nil
|
||||
}
|
||||
publicKey, err := decodeUpdateFabricRegistryPublicKey(req)
|
||||
if err != nil {
|
||||
return client.RawControlResponse{}, false, err
|
||||
}
|
||||
registry, _, err := mesh.LoadFabricRegistryBootstrapRecords(req.FabricRegistryRecordsJSON, mesh.FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: req.ClusterID,
|
||||
TrustedIssuers: []mesh.FabricRegistryTrustedIssuer{{
|
||||
IssuerID: "cluster-authority",
|
||||
Role: mesh.FabricRegistryAuthorityControl,
|
||||
PublicKey: publicKey,
|
||||
Scopes: []string{mesh.FabricRegistryScopeFarm, mesh.FabricRegistryScopeCluster, mesh.FabricRegistryScopeOrganization},
|
||||
Services: []string{mesh.FabricRegistryServiceControlAPI},
|
||||
}},
|
||||
RequiredSignatures: 1,
|
||||
MaxClockSkew: 2 * time.Minute,
|
||||
Now: time.Now().UTC(),
|
||||
}, false)
|
||||
if err != nil {
|
||||
return client.RawControlResponse{}, false, err
|
||||
}
|
||||
transport := mesh.NewQUICFabricTransport(nil)
|
||||
if req.NodeID != "" {
|
||||
transport.SetLocalPeerID(req.NodeID)
|
||||
}
|
||||
registry.VerifyCandidates(ctx, transport, mesh.FabricRegistryLiveProbeRequest{
|
||||
ClusterID: req.ClusterID,
|
||||
PreferredRegion: req.MeshRegion,
|
||||
Timeout: 2 * time.Second,
|
||||
MaxCandidates: 8,
|
||||
Now: time.Now().UTC(),
|
||||
})
|
||||
resolved := registry.ResolveService(mesh.FabricRegistryResolveRequest{
|
||||
ClusterID: req.ClusterID,
|
||||
Service: mesh.FabricRegistryServiceControlAPI,
|
||||
Scope: mesh.FabricRegistryScopeCluster,
|
||||
PreferredRegion: req.MeshRegion,
|
||||
Now: time.Now().UTC(),
|
||||
})
|
||||
if !resolved.Found || len(resolved.Endpoints) == 0 {
|
||||
return client.RawControlResponse{}, false, nil
|
||||
}
|
||||
payload, err := json.Marshal(rawReq)
|
||||
if err != nil {
|
||||
return client.RawControlResponse{}, false, err
|
||||
}
|
||||
var lastErr error
|
||||
for _, endpoint := range resolved.Endpoints {
|
||||
result, err := mesh.SendFabricControlForward(ctx, transport, endpoint, payload, 5*time.Second)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
var envelope struct {
|
||||
Payload json.RawMessage `json:"payload,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
if err := json.Unmarshal(result.Payload, &envelope); err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
if strings.TrimSpace(envelope.Error) != "" {
|
||||
lastErr = errors.New(envelope.Error)
|
||||
continue
|
||||
}
|
||||
var raw client.RawControlResponse
|
||||
if err := json.Unmarshal(envelope.Payload, &raw); err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
return raw, true, nil
|
||||
}
|
||||
if lastErr == nil {
|
||||
lastErr = errors.New("fabric control registry endpoints unavailable")
|
||||
}
|
||||
return client.RawControlResponse{}, false, lastErr
|
||||
}
|
||||
|
||||
func decodeUpdateFabricRegistryPublicKey(req UpdateRequest) (ed25519.PublicKey, error) {
|
||||
value := strings.TrimSpace(req.ClusterAuthorityPublicKey)
|
||||
if value == "" && strings.TrimSpace(req.StateDir) != "" {
|
||||
if identity, err := state.Load(filepath.Join(req.StateDir, state.FileName)); err == nil {
|
||||
value = strings.TrimSpace(identity.ClusterAuthorityPublicKey)
|
||||
}
|
||||
}
|
||||
if value == "" {
|
||||
return nil, errors.New("cluster authority public key is required for fabric registry records")
|
||||
}
|
||||
decoded, err := base64.StdEncoding.DecodeString(value)
|
||||
if err != nil {
|
||||
decoded, err = base64.RawStdEncoding.DecodeString(value)
|
||||
}
|
||||
if err != nil {
|
||||
decoded, err = base64.RawURLEncoding.DecodeString(value)
|
||||
}
|
||||
if err != nil || len(decoded) != ed25519.PublicKeySize {
|
||||
return nil, errors.New("cluster authority public key must be base64 Ed25519 public key")
|
||||
}
|
||||
return ed25519.PublicKey(decoded), nil
|
||||
}
|
||||
|
||||
func verifyNodeUpdatePlanAuthority(req UpdateRequest, plan NodeUpdatePlan) error {
|
||||
identity, ok := pinnedUpdatePlanAuthority(req)
|
||||
if !ok {
|
||||
@@ -642,6 +776,9 @@ func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
|
||||
|
||||
func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
|
||||
backendURL = strings.TrimRight(strings.TrimSpace(backendURL), "/")
|
||||
if backendURL == "" {
|
||||
return errors.New("update status control API is unavailable: backend-url is empty")
|
||||
}
|
||||
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/status", backendURL, url.PathEscape(clusterID), url.PathEscape(nodeID))
|
||||
body, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
@@ -663,6 +800,33 @@ func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID s
|
||||
return nil
|
||||
}
|
||||
|
||||
func ReportNodeUpdateStatusForRequest(ctx context.Context, req UpdateRequest, request NodeUpdateStatusRequest) error {
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
raw, viaFabric, err := updateControlRawViaFabric(ctx, req, client.RawControlRequest{
|
||||
Method: http.MethodPost,
|
||||
Path: fmt.Sprintf("/clusters/%s/nodes/%s/updates/status", url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID)),
|
||||
Body: body,
|
||||
})
|
||||
if viaFabric {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if raw.StatusCode < 200 || raw.StatusCode >= 300 {
|
||||
return fmt.Errorf("report update status via fabric: status %d", raw.StatusCode)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, request)
|
||||
}
|
||||
|
||||
func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) {
|
||||
out, err := runner.Run(ctx, docker, "inspect", containerName)
|
||||
if err != nil {
|
||||
@@ -686,9 +850,8 @@ func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner Co
|
||||
Network: firstNonEmpty(inspected[0].HostConfig.NetworkMode, DefaultNetwork),
|
||||
RestartPolicy: firstNonEmpty(inspected[0].HostConfig.RestartPolicy.Name, "unless-stopped"),
|
||||
WorkloadSupervisionEnabled: parseBool(env["RAP_WORKLOAD_SUPERVISION_ENABLED"]),
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshSyntheticRuntimeEnabled: parseBool(env["RAP_MESH_SYNTHETIC_RUNTIME_ENABLED"]),
|
||||
MeshProductionForwardingEnabled: parseBool(env["RAP_MESH_PRODUCTION_FORWARDING_ENABLED"]),
|
||||
MeshFabricSessionEnabled: parseBool(env["RAP_MESH_FABRIC_SESSION_ENABLED"]),
|
||||
VPNFabricSessionTransportEnabled: parseBool(env["RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED"]),
|
||||
MeshQUICFabricEnabled: parseBool(env["RAP_MESH_QUIC_FABRIC_ENABLED"]),
|
||||
MeshQUICFabricListenAddr: env["RAP_MESH_QUIC_FABRIC_LISTEN_ADDR"],
|
||||
|
||||
@@ -4,9 +4,17 @@ import (
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
cryptorand "crypto/rand"
|
||||
"crypto/rsa"
|
||||
"crypto/sha256"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/big"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
@@ -16,6 +24,8 @@ import (
|
||||
"time"
|
||||
|
||||
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
@@ -120,6 +130,81 @@ func signHostAgentPayload(t *testing.T, payload json.RawMessage, privateKey ed25
|
||||
}
|
||||
}
|
||||
|
||||
// testHostAgentQUICTLSConfig builds a server TLS configuration for tests.
//
// It generates a fresh 2048-bit RSA key and a self-signed certificate for
// 127.0.0.1 that is valid from one hour ago until one hour from now, and
// advertises the "rap-fabric-data-session-v1" ALPN protocol. Any failure
// aborts the test through t.Fatalf.
func testHostAgentQUICTLSConfig(t *testing.T) *tls.Config {
	t.Helper()
	privateKey, err := rsa.GenerateKey(cryptorand.Reader, 2048)
	if err != nil {
		t.Fatalf("generate rsa key: %v", err)
	}
	now := time.Now()
	// The same template is both subject and issuer, i.e. the cert is self-signed.
	selfSigned := &x509.Certificate{
		SerialNumber: big.NewInt(1),
		Subject:      pkix.Name{CommonName: "127.0.0.1"},
		NotBefore:    now.Add(-time.Hour),
		NotAfter:     now.Add(time.Hour),
		KeyUsage:     x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		ExtKeyUsage:  []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		IPAddresses:  []net.IP{net.ParseIP("127.0.0.1")},
	}
	certDER, err := x509.CreateCertificate(cryptorand.Reader, selfSigned, selfSigned, &privateKey.PublicKey, privateKey)
	if err != nil {
		t.Fatalf("create cert: %v", err)
	}
	serverCert := tls.Certificate{
		Certificate: [][]byte{certDER},
		PrivateKey:  privateKey,
	}
	return &tls.Config{
		Certificates: []tls.Certificate{serverCert},
		NextProtos:   []string{"rap-fabric-data-session-v1"},
	}
}
|
||||
|
||||
// testHostAgentQUICCertSHA256 returns the lowercase hex SHA-256 digest of the
// first DER certificate in cfg's first certificate chain.
//
// The test is failed with t.Fatal when cfg is nil or carries no certificate
// bytes, so a misconfigured caller gets a readable failure rather than a
// nil-pointer panic.
func testHostAgentQUICCertSHA256(t *testing.T, cfg *tls.Config) string {
	t.Helper()
	if cfg == nil || len(cfg.Certificates) == 0 || len(cfg.Certificates[0].Certificate) == 0 {
		t.Fatal("missing test certificate")
	}
	digest := sha256.Sum256(cfg.Certificates[0].Certificate[0])
	return hex.EncodeToString(digest[:])
}
|
||||
|
||||
func signedUpdateControlRegistry(t *testing.T, clusterID, endpoint, certSHA256 string, publicKey ed25519.PublicKey, privateKey ed25519.PrivateKey) string {
|
||||
t.Helper()
|
||||
now := time.Now().UTC()
|
||||
issuer := mesh.FabricRegistryTrustedIssuer{IssuerID: "cluster-authority", Role: mesh.FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
record := mesh.FabricRegistryGossipRecord{
|
||||
SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
|
||||
ClusterID: clusterID,
|
||||
Service: mesh.FabricRegistryServiceControlAPI,
|
||||
Scope: mesh.FabricRegistryScopeCluster,
|
||||
Epoch: 1,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Hour),
|
||||
IssuerNodeID: "cluster-authority",
|
||||
IssuerRole: mesh.FabricRegistryAuthorityControl,
|
||||
Endpoints: []mesh.FabricRegistryEndpoint{{
|
||||
EndpointID: "control-a",
|
||||
Address: endpoint,
|
||||
Transport: "direct_quic",
|
||||
PeerCertSHA256: certSHA256,
|
||||
}},
|
||||
}
|
||||
signed, err := mesh.SignFabricRegistryGossipRecord(record, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign registry record: %v", err)
|
||||
}
|
||||
raw, err := json.Marshal([]mesh.FabricRegistryGossipRecord{signed})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal registry record: %v", err)
|
||||
}
|
||||
return string(raw)
|
||||
}
|
||||
|
||||
// mustJSONRaw marshals value to JSON and returns the encoded bytes as a
// json.RawMessage, failing the test with t.Fatalf if marshalling fails.
func mustJSONRaw(t *testing.T, value any) json.RawMessage {
	t.Helper()
	encoded, err := json.Marshal(value)
	if err == nil {
		return encoded
	}
	t.Fatalf("marshal json: %v", err)
	return encoded
}
|
||||
|
||||
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
|
||||
urls := artifactURLsForBackend(ReleaseArtifact{
|
||||
URL: "/downloads/rap-node-agent-0.2.92.tar",
|
||||
@@ -223,6 +308,111 @@ func TestFetchNodeUpdatePlanAcceptsSignedPlanWithPinnedAuthority(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchNodeUpdatePlanUsesFabricRegistryQUICControlAPI(t *testing.T) {
|
||||
stateDir, publicKey, privateKey := writePinnedAuthorityIdentity(t)
|
||||
plan := map[string]any{
|
||||
"schema_version": "rap.node_update_plan.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.0",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
"production_forwarding": false,
|
||||
}
|
||||
payload := map[string]any{
|
||||
"schema_version": "rap.node_update_plan_authority.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.0",
|
||||
"action": "none",
|
||||
"target_version": "",
|
||||
"artifact_sha256": "",
|
||||
"control_plane_only": true,
|
||||
"production_forwarding": false,
|
||||
}
|
||||
rawPayload, signature := signedAuthorityPayload(t, publicKey, privateKey, payload)
|
||||
plan["authority_payload"] = json.RawMessage(rawPayload)
|
||||
plan["authority_signature"] = signature
|
||||
tlsConfig := testHostAgentQUICTLSConfig(t)
|
||||
var received client.RawControlRequest
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
if err := json.Unmarshal(payload, &received); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if received.Method != http.MethodGet || !strings.HasPrefix(received.Path, "/clusters/cluster-1/nodes/node-1/updates/plan?") {
|
||||
return nil, fmt.Errorf("unexpected request: %+v", received)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 200, Body: mustJSONRaw(t, map[string]any{"node_update_plan": plan})})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
got, err := FetchNodeUpdatePlan(context.Background(), UpdateRequest{
|
||||
BackendURL: "http://127.0.0.1:1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
StateDir: stateDir,
|
||||
FabricRegistryRecordsJSON: signedUpdateControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testHostAgentQUICCertSHA256(t, tlsConfig), publicKey, privateKey),
|
||||
CurrentVersion: "0.1.0",
|
||||
OS: "linux",
|
||||
Arch: "amd64",
|
||||
InstallType: "docker",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("fetch plan via fabric: %v", err)
|
||||
}
|
||||
if got.Action != "none" || got.Reason != "already_current" {
|
||||
t.Fatalf("plan = %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReportNodeUpdateStatusUsesFabricRegistryQUICControlAPI(t *testing.T) {
|
||||
stateDir, publicKey, privateKey := writePinnedAuthorityIdentity(t)
|
||||
tlsConfig := testHostAgentQUICTLSConfig(t)
|
||||
var received client.RawControlRequest
|
||||
server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
if err := json.Unmarshal(payload, &received); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if received.Method != http.MethodPost || received.Path != "/clusters/cluster-1/nodes/node-1/updates/status" {
|
||||
return nil, fmt.Errorf("unexpected request: %+v", received)
|
||||
}
|
||||
return json.Marshal(client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)})
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
err = ReportNodeUpdateStatusForRequest(context.Background(), UpdateRequest{
|
||||
BackendURL: "http://127.0.0.1:1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
StateDir: stateDir,
|
||||
FabricRegistryRecordsJSON: signedUpdateControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testHostAgentQUICCertSHA256(t, tlsConfig), publicKey, privateKey),
|
||||
CurrentVersion: "0.1.0",
|
||||
OS: "linux",
|
||||
Arch: "amd64",
|
||||
InstallType: "docker",
|
||||
}, NodeUpdateStatusRequest{Product: "rap-node-agent", Phase: "download", Status: "started"})
|
||||
if err != nil {
|
||||
t.Fatalf("report status via fabric: %v", err)
|
||||
}
|
||||
if len(received.Body) == 0 || !strings.Contains(string(received.Body), `"phase":"download"`) {
|
||||
t.Fatalf("unexpected status body: %s", string(received.Body))
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchNodeUpdatePlanAcceptsQuorumSignedPlan(t *testing.T) {
|
||||
stateDir, descriptor, privateKeys := writePinnedQuorumIdentity(t)
|
||||
plan := map[string]any{
|
||||
|
||||
@@ -66,7 +66,6 @@ func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInsta
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
|
||||
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
|
||||
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
|
||||
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
|
||||
|
||||
@@ -48,29 +48,29 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
|
||||
}
|
||||
status.Payload["task"] = req.WindowsTaskName
|
||||
status.Payload["binary_path"] = req.BinaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -81,7 +81,7 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
|
||||
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
|
||||
})
|
||||
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -93,7 +93,7 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
|
||||
})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
@@ -101,16 +101,16 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
|
||||
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
|
||||
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
|
||||
if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
result.Replaced = true
|
||||
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
@@ -290,7 +290,6 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
|
||||
updateLoopArgs := []string{
|
||||
`"` + hostAgentPath + `"`,
|
||||
"update-loop",
|
||||
"--backend-url", `"` + cfg.RuntimeConfig.BackendURL + `"`,
|
||||
"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
|
||||
"--state-dir", `"` + result.StateDir + `"`,
|
||||
"--current-version", currentVersion,
|
||||
@@ -306,6 +305,10 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
|
||||
"--host-agent-current-version", currentVersion,
|
||||
"--host-agent-binary-path", `"` + hostAgentPath + `"`,
|
||||
}
|
||||
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
|
||||
updateLoopArgs = append(updateLoopArgs, "--backend-url", `"`+strings.TrimSpace(cfg.RuntimeConfig.BackendURL)+`"`)
|
||||
}
|
||||
updateLoopArgs = appendFabricUpdateArgs(updateLoopArgs, cfg.RuntimeConfig)
|
||||
if strings.TrimSpace(cfg.NodeID) != "" {
|
||||
updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
|
||||
}
|
||||
|
||||
@@ -6,13 +6,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -20,38 +14,6 @@ type Client struct {
|
||||
HTTPClient *http.Client
|
||||
}
|
||||
|
||||
type FabricSessionDialOptions struct {
|
||||
Token string
|
||||
Header http.Header
|
||||
Dialer *websocket.Dialer
|
||||
Timeout time.Duration
|
||||
MaxPayload int
|
||||
}
|
||||
|
||||
type FabricSessionClient struct {
|
||||
conn *websocket.Conn
|
||||
timeout time.Duration
|
||||
maxPayload int
|
||||
readMu sync.Mutex
|
||||
writeMu sync.Mutex
|
||||
}
|
||||
|
||||
// FabricSessionPumpOptions sets the channel capacities of a FabricSessionPump.
// StartPump replaces non-positive values with 64 (outbound and inbound) and
// 8 (errors).
type FabricSessionPumpOptions struct {
	OutboundBuffer, InboundBuffer, ErrorBuffer int
}
|
||||
|
||||
type FabricSessionPump struct {
|
||||
session *FabricSessionClient
|
||||
outbound chan fabricproto.Frame
|
||||
inbound chan fabricproto.Frame
|
||||
errors chan error
|
||||
done chan struct{}
|
||||
cancel context.CancelFunc
|
||||
closeMu sync.Once
|
||||
}
|
||||
|
||||
func NewClient(baseURL string) Client {
|
||||
return Client{
|
||||
BaseURL: baseURL,
|
||||
@@ -147,270 +109,3 @@ func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (c Client) DialFabricSession(ctx context.Context, opts FabricSessionDialOptions) (*websocket.Conn, *http.Response, error) {
|
||||
target, err := c.fabricSessionWebSocketURL()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
header := cloneHeader(opts.Header)
|
||||
if strings.TrimSpace(opts.Token) != "" {
|
||||
header.Set("X-RAP-Fabric-Session-Token", strings.TrimSpace(opts.Token))
|
||||
}
|
||||
dialer := opts.Dialer
|
||||
if dialer == nil {
|
||||
base := *websocket.DefaultDialer
|
||||
if opts.Timeout > 0 {
|
||||
base.HandshakeTimeout = opts.Timeout
|
||||
}
|
||||
dialer = &base
|
||||
}
|
||||
return dialer.DialContext(ctx, target, header)
|
||||
}
|
||||
|
||||
func (c Client) OpenFabricSession(ctx context.Context, opts FabricSessionDialOptions) (*FabricSessionClient, *http.Response, error) {
|
||||
conn, resp, err := c.DialFabricSession(ctx, opts)
|
||||
if err != nil {
|
||||
if resp != nil {
|
||||
return nil, resp, fmt.Errorf("fabric session websocket rejected with status %d: %w", resp.StatusCode, err)
|
||||
}
|
||||
return nil, resp, err
|
||||
}
|
||||
maxPayload := opts.MaxPayload
|
||||
if maxPayload <= 0 {
|
||||
maxPayload = fabricproto.DefaultMaxPayload
|
||||
}
|
||||
return &FabricSessionClient{
|
||||
conn: conn,
|
||||
timeout: opts.Timeout,
|
||||
maxPayload: maxPayload,
|
||||
}, resp, nil
|
||||
}
|
||||
|
||||
func (c Client) SendFabricSessionFrame(ctx context.Context, opts FabricSessionDialOptions, frame fabricproto.Frame) (fabricproto.Frame, error) {
|
||||
session, _, err := c.OpenFabricSession(ctx, opts)
|
||||
if err != nil {
|
||||
return fabricproto.Frame{}, err
|
||||
}
|
||||
defer session.Close()
|
||||
return session.RoundTrip(ctx, frame)
|
||||
}
|
||||
|
||||
func (c *FabricSessionClient) Close() error {
|
||||
if c == nil || c.conn == nil {
|
||||
return nil
|
||||
}
|
||||
return c.conn.Close()
|
||||
}
|
||||
|
||||
func (c *FabricSessionClient) WriteFrame(ctx context.Context, frame fabricproto.Frame) error {
|
||||
if c == nil || c.conn == nil {
|
||||
return fmt.Errorf("fabric session client is closed")
|
||||
}
|
||||
payload, err := fabricproto.MarshalFrame(frame)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.writeMu.Lock()
|
||||
defer c.writeMu.Unlock()
|
||||
c.applyWriteDeadline(ctx)
|
||||
return c.conn.WriteMessage(websocket.BinaryMessage, payload)
|
||||
}
|
||||
|
||||
func (c *FabricSessionClient) ReadFrame(ctx context.Context) (fabricproto.Frame, error) {
|
||||
if c == nil || c.conn == nil {
|
||||
return fabricproto.Frame{}, fmt.Errorf("fabric session client is closed")
|
||||
}
|
||||
c.readMu.Lock()
|
||||
defer c.readMu.Unlock()
|
||||
c.applyReadDeadline(ctx)
|
||||
messageType, responsePayload, err := c.conn.ReadMessage()
|
||||
if err != nil {
|
||||
return fabricproto.Frame{}, err
|
||||
}
|
||||
if messageType != websocket.BinaryMessage {
|
||||
return fabricproto.Frame{}, fmt.Errorf("fabric session websocket returned non-binary message type %d", messageType)
|
||||
}
|
||||
return fabricproto.UnmarshalFrame(responsePayload, c.maxPayload)
|
||||
}
|
||||
|
||||
func (c *FabricSessionClient) RoundTrip(ctx context.Context, frame fabricproto.Frame) (fabricproto.Frame, error) {
|
||||
if err := c.WriteFrame(ctx, frame); err != nil {
|
||||
return fabricproto.Frame{}, err
|
||||
}
|
||||
return c.ReadFrame(ctx)
|
||||
}
|
||||
|
||||
func (c *FabricSessionClient) StartPump(ctx context.Context, opts FabricSessionPumpOptions) *FabricSessionPump {
|
||||
if opts.OutboundBuffer <= 0 {
|
||||
opts.OutboundBuffer = 64
|
||||
}
|
||||
if opts.InboundBuffer <= 0 {
|
||||
opts.InboundBuffer = 64
|
||||
}
|
||||
if opts.ErrorBuffer <= 0 {
|
||||
opts.ErrorBuffer = 8
|
||||
}
|
||||
pumpCtx, cancel := context.WithCancel(ctx)
|
||||
pump := &FabricSessionPump{
|
||||
session: c,
|
||||
outbound: make(chan fabricproto.Frame, opts.OutboundBuffer),
|
||||
inbound: make(chan fabricproto.Frame, opts.InboundBuffer),
|
||||
errors: make(chan error, opts.ErrorBuffer),
|
||||
done: make(chan struct{}),
|
||||
cancel: cancel,
|
||||
}
|
||||
go pump.writeLoop(pumpCtx)
|
||||
go pump.readLoop(pumpCtx)
|
||||
return pump
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) Send(ctx context.Context, frame fabricproto.Frame) error {
|
||||
if p == nil {
|
||||
return fmt.Errorf("fabric session pump is nil")
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-p.done:
|
||||
return fmt.Errorf("fabric session pump is closed")
|
||||
case p.outbound <- frame:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) Frames() <-chan fabricproto.Frame {
|
||||
if p == nil {
|
||||
return nil
|
||||
}
|
||||
return p.inbound
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) Errors() <-chan error {
|
||||
if p == nil {
|
||||
return nil
|
||||
}
|
||||
return p.errors
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) Closed() bool {
|
||||
if p == nil {
|
||||
return true
|
||||
}
|
||||
select {
|
||||
case <-p.done:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) Close() error {
|
||||
if p == nil {
|
||||
return nil
|
||||
}
|
||||
var err error
|
||||
p.closeMu.Do(func() {
|
||||
close(p.done)
|
||||
p.cancel()
|
||||
err = p.session.Close()
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) writeLoop(ctx context.Context) {
|
||||
defer p.Close()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
p.reportError(ctx.Err())
|
||||
return
|
||||
case <-p.done:
|
||||
return
|
||||
case frame := <-p.outbound:
|
||||
if err := p.session.WriteFrame(ctx, frame); err != nil {
|
||||
p.reportError(err)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) readLoop(ctx context.Context) {
|
||||
defer p.Close()
|
||||
for {
|
||||
frame, err := p.session.ReadFrame(ctx)
|
||||
if err != nil {
|
||||
p.reportError(err)
|
||||
return
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
p.reportError(ctx.Err())
|
||||
return
|
||||
case <-p.done:
|
||||
return
|
||||
case p.inbound <- frame:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (p *FabricSessionPump) reportError(err error) {
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
select {
|
||||
case p.errors <- err:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
func (c *FabricSessionClient) applyReadDeadline(ctx context.Context) {
|
||||
if deadline, ok := ctx.Deadline(); ok {
|
||||
_ = c.conn.SetReadDeadline(deadline)
|
||||
} else if c.timeout > 0 {
|
||||
_ = c.conn.SetReadDeadline(time.Now().Add(c.timeout))
|
||||
}
|
||||
}
|
||||
|
||||
func (c *FabricSessionClient) applyWriteDeadline(ctx context.Context) {
|
||||
if deadline, ok := ctx.Deadline(); ok {
|
||||
_ = c.conn.SetWriteDeadline(deadline)
|
||||
} else if c.timeout > 0 {
|
||||
_ = c.conn.SetWriteDeadline(time.Now().Add(c.timeout))
|
||||
}
|
||||
}
|
||||
|
||||
func (c Client) fabricSessionWebSocketURL() (string, error) {
|
||||
base := strings.TrimSpace(c.BaseURL)
|
||||
if base == "" {
|
||||
return "", fmt.Errorf("mesh base url is required")
|
||||
}
|
||||
parsed, err := url.Parse(base)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
switch parsed.Scheme {
|
||||
case "http":
|
||||
parsed.Scheme = "ws"
|
||||
case "https":
|
||||
parsed.Scheme = "wss"
|
||||
case "ws", "wss":
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported mesh base url scheme %q", parsed.Scheme)
|
||||
}
|
||||
parsed.Path = strings.TrimRight(parsed.Path, "/") + "/mesh/v1/fabric/session/ws"
|
||||
parsed.RawQuery = ""
|
||||
parsed.Fragment = ""
|
||||
return parsed.String(), nil
|
||||
}
|
||||
|
||||
// cloneHeader returns a new, never-nil http.Header holding every value of
// header. Values are inserted with Header.Add, so keys in the copy are in
// canonical MIME form, and the copy shares no slices with the input.
func cloneHeader(header http.Header) http.Header {
	copied := http.Header{}
	for name := range header {
		for i := range header[name] {
			copied.Add(name, header[name][i])
		}
	}
	return copied
}
|
||||
|
||||
@@ -1,243 +0,0 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
response, err := client.SendFabricSessionFrame(ctx, FabricSessionDialOptions{
|
||||
Token: "rap_fsn_clienttest",
|
||||
Timeout: time.Second,
|
||||
}, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: 12,
|
||||
Payload: []byte("probe"),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("send fabric session frame: %v", err)
|
||||
}
|
||||
if response.Type != fabricproto.FramePong || response.Sequence != 12 || string(response.Payload) != "probe" {
|
||||
t.Fatalf("response = %+v, want pong seq 12", response)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
|
||||
Token: "rap_fsn_persistent",
|
||||
Timeout: time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("open fabric session: %v", err)
|
||||
}
|
||||
defer session.Close()
|
||||
|
||||
first, err := session.RoundTrip(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: 1,
|
||||
Payload: []byte("first"),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("first round trip: %v", err)
|
||||
}
|
||||
second, err := session.RoundTrip(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: 2,
|
||||
Payload: []byte("second"),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("second round trip: %v", err)
|
||||
}
|
||||
if first.Type != fabricproto.FramePong || first.Sequence != 1 || string(first.Payload) != "first" {
|
||||
t.Fatalf("first response = %+v, want pong seq 1", first)
|
||||
}
|
||||
if second.Type != fabricproto.FramePong || second.Sequence != 2 || string(second.Payload) != "second" {
|
||||
t.Fatalf("second response = %+v, want pong seq 2", second)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
|
||||
Token: "rap_fsn_dataacks",
|
||||
Timeout: time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("open fabric session: %v", err)
|
||||
}
|
||||
defer session.Close()
|
||||
|
||||
if err := session.WriteFrame(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameOpenStream,
|
||||
StreamID: 77,
|
||||
TrafficClass: fabricproto.TrafficClassInteractive,
|
||||
}); err != nil {
|
||||
t.Fatalf("open stream frame: %v", err)
|
||||
}
|
||||
|
||||
first, err := session.RoundTrip(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
StreamID: 77,
|
||||
Sequence: 10,
|
||||
TrafficClass: fabricproto.TrafficClassInteractive,
|
||||
Payload: []byte("first payload"),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("first data round trip: %v", err)
|
||||
}
|
||||
second, err := session.RoundTrip(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
StreamID: 77,
|
||||
Sequence: 11,
|
||||
TrafficClass: fabricproto.TrafficClassInteractive,
|
||||
Payload: []byte("second payload"),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("second data round trip: %v", err)
|
||||
}
|
||||
if first.Type != fabricproto.FrameAck || first.StreamID != 77 || first.Sequence != 10 {
|
||||
t.Fatalf("first ack = %+v, want stream 77 seq 10", first)
|
||||
}
|
||||
if second.Type != fabricproto.FrameAck || second.StreamID != 77 || second.Sequence != 11 {
|
||||
t.Fatalf("second ack = %+v, want stream 77 seq 11", second)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
|
||||
Token: "rap_fsn_pump",
|
||||
Timeout: time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("open fabric session: %v", err)
|
||||
}
|
||||
pump := session.StartPump(ctx, FabricSessionPumpOptions{
|
||||
OutboundBuffer: 4,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
})
|
||||
defer pump.Close()
|
||||
|
||||
if err := pump.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameOpenStream,
|
||||
StreamID: 900,
|
||||
TrafficClass: fabricproto.TrafficClassBulk,
|
||||
}); err != nil {
|
||||
t.Fatalf("send open bulk stream: %v", err)
|
||||
}
|
||||
if err := pump.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
StreamID: 900,
|
||||
Sequence: 31,
|
||||
TrafficClass: fabricproto.TrafficClassBulk,
|
||||
Payload: []byte("bulk payload"),
|
||||
}); err != nil {
|
||||
t.Fatalf("send bulk data: %v", err)
|
||||
}
|
||||
if err := pump.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: 32,
|
||||
Payload: []byte("control ping"),
|
||||
}); err != nil {
|
||||
t.Fatalf("send ping: %v", err)
|
||||
}
|
||||
|
||||
gotAck := false
|
||||
gotPong := false
|
||||
for !gotAck || !gotPong {
|
||||
select {
|
||||
case frame := <-pump.Frames():
|
||||
switch {
|
||||
case frame.Type == fabricproto.FrameAck && frame.StreamID == 900 && frame.Sequence == 31:
|
||||
gotAck = true
|
||||
case frame.Type == fabricproto.FramePong && frame.Sequence == 32 && string(frame.Payload) == "control ping":
|
||||
gotPong = true
|
||||
}
|
||||
case err := <-pump.Errors():
|
||||
t.Fatalf("pump error: %v", err)
|
||||
case <-ctx.Done():
|
||||
t.Fatalf("timed out waiting for pump frames: ack=%v pong=%v", gotAck, gotPong)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestClientFabricSessionReportsRejectedStatus(t *testing.T) {
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
_, err := client.SendFabricSessionFrame(ctx, FabricSessionDialOptions{}, fabricproto.Frame{Type: fabricproto.FramePing})
|
||||
if err == nil {
|
||||
t.Fatal("send fabric session without token unexpectedly succeeded")
|
||||
}
|
||||
}
|
||||
|
||||
func TestClientFabricSessionWebSocketURL(t *testing.T) {
|
||||
cases := []struct {
|
||||
base string
|
||||
want string
|
||||
}{
|
||||
{base: "http://node.example", want: "ws://node.example/mesh/v1/fabric/session/ws"},
|
||||
{base: "https://node.example/base/", want: "wss://node.example/base/mesh/v1/fabric/session/ws"},
|
||||
{base: "ws://node.example", want: "ws://node.example/mesh/v1/fabric/session/ws"},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
client := NewClient(tc.base)
|
||||
got, err := client.fabricSessionWebSocketURL()
|
||||
if err != nil {
|
||||
t.Fatalf("fabricSessionWebSocketURL(%q): %v", tc.base, err)
|
||||
}
|
||||
if got != tc.want {
|
||||
t.Fatalf("fabricSessionWebSocketURL(%q) = %q, want %q", tc.base, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
// fabricControlForwardSequence is a process-wide counter; SendFabricControlForward
// increments it once per call and uses the new value as the frame sequence that
// the reply frame must echo.
var fabricControlForwardSequence atomic.Uint64
|
||||
|
||||
// FabricControlForwardResult is what SendFabricControlForward returns for a
// completed control forward.
type FabricControlForwardResult struct {
	// Payload is a copy of the matching reply frame's payload bytes.
	Payload json.RawMessage `json:"payload,omitempty"`
	// LatencyMs is the time in milliseconds between finishing the send and
	// receiving the matching reply.
	LatencyMs int64 `json:"latency_ms"`
	// Endpoint is the registry endpoint address the request was sent to.
	Endpoint string `json:"endpoint,omitempty"`
}
|
||||
|
||||
func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint) FabricTransportTarget {
|
||||
return FabricTransportTarget{
|
||||
EndpointID: strings.TrimSpace(endpoint.EndpointID),
|
||||
PeerID: strings.TrimSpace(endpoint.EndpointID),
|
||||
Endpoint: strings.TrimSpace(endpoint.Address),
|
||||
Transport: strings.TrimSpace(endpoint.Transport),
|
||||
PeerCertSHA256: strings.TrimSpace(endpoint.PeerCertSHA256),
|
||||
Timeout: 5 * time.Second,
|
||||
InboundBuffer: 4,
|
||||
ErrorBuffer: 4,
|
||||
}
|
||||
}
|
||||
|
||||
func SendFabricControlForward(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, payload []byte, timeout time.Duration) (FabricControlForwardResult, error) {
|
||||
if transport == nil {
|
||||
return FabricControlForwardResult{}, fmt.Errorf("fabric control transport is unavailable")
|
||||
}
|
||||
if len(payload) == 0 {
|
||||
return FabricControlForwardResult{}, fmt.Errorf("fabric control payload is empty")
|
||||
}
|
||||
if timeout <= 0 {
|
||||
timeout = 5 * time.Second
|
||||
}
|
||||
target := FabricTransportTargetFromRegistryEndpoint(endpoint)
|
||||
target.Timeout = timeout
|
||||
session, err := transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
return FabricControlForwardResult{}, err
|
||||
}
|
||||
defer session.Close()
|
||||
sequence := fabricControlForwardSequence.Add(1)
|
||||
if err := session.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FrameData,
|
||||
TrafficClass: fabricproto.TrafficClassReliable,
|
||||
StreamID: FabricControlForwardQUICStreamID,
|
||||
Sequence: sequence,
|
||||
Payload: append([]byte(nil), payload...),
|
||||
}); err != nil {
|
||||
return FabricControlForwardResult{}, err
|
||||
}
|
||||
waitCtx := ctx
|
||||
var cancel context.CancelFunc
|
||||
if timeout > 0 {
|
||||
waitCtx, cancel = context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
}
|
||||
startedAt := time.Now()
|
||||
for {
|
||||
select {
|
||||
case <-waitCtx.Done():
|
||||
return FabricControlForwardResult{}, waitCtx.Err()
|
||||
case err, ok := <-session.Errors():
|
||||
if !ok {
|
||||
return FabricControlForwardResult{}, fmt.Errorf("fabric control session closed")
|
||||
}
|
||||
if err != nil {
|
||||
return FabricControlForwardResult{}, err
|
||||
}
|
||||
case frame, ok := <-session.Frames():
|
||||
if !ok {
|
||||
return FabricControlForwardResult{}, fmt.Errorf("fabric control session closed")
|
||||
}
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID || frame.Sequence != sequence {
|
||||
continue
|
||||
}
|
||||
return FabricControlForwardResult{
|
||||
Payload: append(json.RawMessage(nil), frame.Payload...),
|
||||
LatencyMs: time.Since(startedAt).Milliseconds(),
|
||||
Endpoint: endpoint.Address,
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -565,6 +565,43 @@ func TestQUICFabricServerHandlesWebIngressForwardFrames(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestSendFabricControlForwardUsesQUICStream(t *testing.T) {
|
||||
tlsConfig := testQUICTLSConfig(t)
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
|
||||
if string(payload) != `{"method":"GET","path":"/auth/login"}` {
|
||||
return nil, ErrForwardRuntimeUnavailable
|
||||
}
|
||||
return []byte(`{"status_code":200,"body":{"ok":true}}`), nil
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
defer cancel()
|
||||
result, err := SendFabricControlForward(ctx, NewQUICFabricTransport(nil), FabricRegistryEndpoint{
|
||||
EndpointID: "control-a",
|
||||
Address: "quic://" + server.Addr().String(),
|
||||
Transport: "direct_quic",
|
||||
PeerCertSHA256: testQUICCertSHA256(t, tlsConfig),
|
||||
}, []byte(`{"method":"GET","path":"/auth/login"}`), time.Second)
|
||||
if err != nil {
|
||||
t.Fatalf("send fabric control forward: %v", err)
|
||||
}
|
||||
var response quicFabricControlForwardResponse
|
||||
if err := json.Unmarshal(result.Payload, &response); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if response.Error != "" || string(response.Payload) != `{"status_code":200,"body":{"ok":true}}` {
|
||||
t.Fatalf("response = %+v", response)
|
||||
}
|
||||
}
|
||||
|
||||
func startQUICFabricEchoServer(t *testing.T) *quic.Listener {
|
||||
t.Helper()
|
||||
return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t))
|
||||
|
||||
@@ -164,6 +164,7 @@ func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata Fabri
|
||||
case FabricRouteRelay:
|
||||
relayNodeID := firstNonEmpty(strings.TrimSpace(metadata.RelayNodeID), strings.TrimSpace(metadata.ViaNodeID))
|
||||
relayEndpoint := firstNonEmpty(strings.TrimRight(strings.TrimSpace(metadata.RelayEndpoint), "/"), endpoint)
|
||||
relayPeerCertSHA256 := candidatePeerCertSHA256(candidate)
|
||||
hops := []FabricRouteHop{}
|
||||
if localNodeID != "" {
|
||||
hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: FabricRouteDirect})
|
||||
@@ -173,7 +174,7 @@ func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata Fabri
|
||||
return hops
|
||||
}
|
||||
hops = append(hops,
|
||||
FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint},
|
||||
FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint, PeerCertSHA256: relayPeerCertSHA256},
|
||||
FabricRouteHop{NodeID: targetNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)},
|
||||
)
|
||||
return hops
|
||||
|
||||
@@ -44,7 +44,13 @@ func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T) {
|
||||
metadata, _ := json.Marshal(FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"})
|
||||
metadata, _ := json.Marshal(struct {
|
||||
FabricCandidateMetadata
|
||||
TLSCertSHA256 string `json:"tls_cert_sha256,omitempty"`
|
||||
}{
|
||||
FabricCandidateMetadata: FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"},
|
||||
TLSCertSHA256: "relay-cert",
|
||||
})
|
||||
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
@@ -69,6 +75,9 @@ func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T
|
||||
if got := routeSet.Primary.Hops[1].NodeID; got != "node-r" {
|
||||
t.Fatalf("relay hop = %q, want node-r", got)
|
||||
}
|
||||
if got := routeSet.Primary.Hops[1].PeerCertSHA256; got != "relay-cert" {
|
||||
t.Fatalf("relay hop peer cert = %q, want relay-cert", got)
|
||||
}
|
||||
if routeSet.Primary.Capacity != 50 {
|
||||
t.Fatalf("capacity = %d, want 50", routeSet.Primary.Capacity)
|
||||
}
|
||||
|
||||
@@ -1,156 +0,0 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type FabricSessionPeerManager struct {
|
||||
mu sync.Mutex
|
||||
sessions map[string]*FabricSessionPump
|
||||
stats FabricSessionPeerManagerStats
|
||||
}
|
||||
|
||||
type FabricSessionPeerTarget struct {
|
||||
PeerID string
|
||||
BaseURL string
|
||||
Options FabricSessionDialOptions
|
||||
Pump FabricSessionPumpOptions
|
||||
}
|
||||
|
||||
// FabricSessionPeerManagerStats holds the manager's cumulative counters.
type FabricSessionPeerManagerStats struct {
	// Opens counts new pumps stored in the cache.
	Opens uint64 `json:"opens"`
	// Reuses counts Get calls answered with an already cached live pump.
	Reuses uint64 `json:"reuses"`
	// ClosedEvicted counts cached pumps dropped because they were closed.
	ClosedEvicted uint64 `json:"closed_evicted"`
	// ClosePeerCalls counts ClosePeer invocations that reached the cache.
	ClosePeerCalls uint64 `json:"close_peer_calls"`
	// CloseAllCalls counts Close invocations.
	CloseAllCalls uint64 `json:"close_all_calls"`
}
|
||||
|
||||
type FabricSessionPeerManagerSnapshot struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ActiveCount int `json:"active_count"`
|
||||
ClosedCount int `json:"closed_count"`
|
||||
Stats FabricSessionPeerManagerStats `json:"stats"`
|
||||
}
|
||||
|
||||
func NewFabricSessionPeerManager() *FabricSessionPeerManager {
|
||||
return &FabricSessionPeerManager{
|
||||
sessions: map[string]*FabricSessionPump{},
|
||||
}
|
||||
}
|
||||
|
||||
func (m *FabricSessionPeerManager) Get(ctx context.Context, target FabricSessionPeerTarget) (*FabricSessionPump, error) {
|
||||
if m == nil {
|
||||
return nil, fmt.Errorf("fabric session peer manager is nil")
|
||||
}
|
||||
key, err := fabricSessionPeerKey(target)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
m.mu.Lock()
|
||||
if pump := m.sessions[key]; pump != nil {
|
||||
if pump.Closed() {
|
||||
delete(m.sessions, key)
|
||||
m.stats.ClosedEvicted++
|
||||
} else {
|
||||
m.stats.Reuses++
|
||||
m.mu.Unlock()
|
||||
return pump, nil
|
||||
}
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
session, _, err := NewClient(target.BaseURL).OpenFabricSession(ctx, target.Options)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pump := session.StartPump(context.Background(), target.Pump)
|
||||
|
||||
m.mu.Lock()
|
||||
if existing := m.sessions[key]; existing != nil {
|
||||
if existing.Closed() {
|
||||
delete(m.sessions, key)
|
||||
m.stats.ClosedEvicted++
|
||||
} else {
|
||||
m.stats.Reuses++
|
||||
m.mu.Unlock()
|
||||
_ = pump.Close()
|
||||
return existing, nil
|
||||
}
|
||||
}
|
||||
if m.sessions == nil {
|
||||
m.sessions = map[string]*FabricSessionPump{}
|
||||
}
|
||||
m.sessions[key] = pump
|
||||
m.stats.Opens++
|
||||
m.mu.Unlock()
|
||||
return pump, nil
|
||||
}
|
||||
|
||||
func (m *FabricSessionPeerManager) ClosePeer(target FabricSessionPeerTarget) error {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
key, err := fabricSessionPeerKey(target)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
m.mu.Lock()
|
||||
m.stats.ClosePeerCalls++
|
||||
pump := m.sessions[key]
|
||||
delete(m.sessions, key)
|
||||
m.mu.Unlock()
|
||||
if pump == nil {
|
||||
return nil
|
||||
}
|
||||
return pump.Close()
|
||||
}
|
||||
|
||||
func (m *FabricSessionPeerManager) Close() error {
|
||||
if m == nil {
|
||||
return nil
|
||||
}
|
||||
m.mu.Lock()
|
||||
m.stats.CloseAllCalls++
|
||||
sessions := m.sessions
|
||||
m.sessions = map[string]*FabricSessionPump{}
|
||||
m.mu.Unlock()
|
||||
var firstErr error
|
||||
for _, pump := range sessions {
|
||||
if err := pump.Close(); err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
return firstErr
|
||||
}
|
||||
|
||||
func (m *FabricSessionPeerManager) Snapshot() FabricSessionPeerManagerSnapshot {
|
||||
if m == nil {
|
||||
return FabricSessionPeerManagerSnapshot{SchemaVersion: "rap.fabric_session_peer_manager.v1"}
|
||||
}
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
snapshot := FabricSessionPeerManagerSnapshot{
|
||||
SchemaVersion: "rap.fabric_session_peer_manager.v1",
|
||||
Stats: m.stats,
|
||||
}
|
||||
for _, pump := range m.sessions {
|
||||
if pump == nil || pump.Closed() {
|
||||
snapshot.ClosedCount++
|
||||
continue
|
||||
}
|
||||
snapshot.ActiveCount++
|
||||
}
|
||||
return snapshot
|
||||
}
|
||||
|
||||
func fabricSessionPeerKey(target FabricSessionPeerTarget) (string, error) {
|
||||
peerID := strings.TrimSpace(target.PeerID)
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(target.BaseURL), "/")
|
||||
if peerID == "" || baseURL == "" {
|
||||
return "", fmt.Errorf("fabric session peer id and base url are required")
|
||||
}
|
||||
return peerID + "\x00" + baseURL, nil
|
||||
}
|
||||
@@ -1,194 +0,0 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
}
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
manager := NewFabricSessionPeerManager()
|
||||
defer manager.Close()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
target := FabricSessionPeerTarget{
|
||||
PeerID: "node-a",
|
||||
BaseURL: server.URL,
|
||||
Options: FabricSessionDialOptions{
|
||||
Token: "rap_fsn_manager",
|
||||
Timeout: time.Second,
|
||||
},
|
||||
Pump: FabricSessionPumpOptions{
|
||||
OutboundBuffer: 4,
|
||||
InboundBuffer: 4,
|
||||
},
|
||||
}
|
||||
|
||||
first, err := manager.Get(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("first get: %v", err)
|
||||
}
|
||||
second, err := manager.Get(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("second get: %v", err)
|
||||
}
|
||||
if first != second {
|
||||
t.Fatal("manager did not reuse peer pump")
|
||||
}
|
||||
if opened != 1 {
|
||||
t.Fatalf("opened sessions = %d, want 1", opened)
|
||||
}
|
||||
snapshot := manager.Snapshot()
|
||||
if snapshot.SchemaVersion != "rap.fabric_session_peer_manager.v1" ||
|
||||
snapshot.ActiveCount != 1 ||
|
||||
snapshot.ClosedCount != 0 ||
|
||||
snapshot.Stats.Opens != 1 ||
|
||||
snapshot.Stats.Reuses != 1 {
|
||||
t.Fatalf("snapshot = %+v", snapshot)
|
||||
}
|
||||
if err := first.Send(ctx, fabricproto.Frame{
|
||||
Type: fabricproto.FramePing,
|
||||
Sequence: 1,
|
||||
Payload: []byte("manager"),
|
||||
}); err != nil {
|
||||
t.Fatalf("send ping: %v", err)
|
||||
}
|
||||
select {
|
||||
case frame := <-first.Frames():
|
||||
if frame.Type != fabricproto.FramePong || frame.Sequence != 1 || string(frame.Payload) != "manager" {
|
||||
t.Fatalf("frame = %+v", frame)
|
||||
}
|
||||
case err := <-first.Errors():
|
||||
t.Fatalf("pump error: %v", err)
|
||||
case <-ctx.Done():
|
||||
t.Fatal(ctx.Err())
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
}
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
manager := NewFabricSessionPeerManager()
|
||||
defer manager.Close()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
target := FabricSessionPeerTarget{
|
||||
PeerID: "node-a",
|
||||
BaseURL: server.URL,
|
||||
Options: FabricSessionDialOptions{
|
||||
Token: "rap_fsn_manager_reopen",
|
||||
Timeout: time.Second,
|
||||
},
|
||||
}
|
||||
|
||||
first, err := manager.Get(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("first get: %v", err)
|
||||
}
|
||||
if err := manager.ClosePeer(target); err != nil {
|
||||
t.Fatalf("close peer: %v", err)
|
||||
}
|
||||
second, err := manager.Get(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("second get: %v", err)
|
||||
}
|
||||
if first == second {
|
||||
t.Fatal("manager reused pump after close peer")
|
||||
}
|
||||
if opened != 2 {
|
||||
t.Fatalf("opened sessions = %d, want 2", opened)
|
||||
}
|
||||
if snapshot := manager.Snapshot(); snapshot.Stats.ClosePeerCalls != 1 || snapshot.Stats.Opens != 2 {
|
||||
t.Fatalf("snapshot = %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricSessionPeerManagerReopensClosedPump(t *testing.T) {
|
||||
var opened int
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
FabricSessionEnabled: true,
|
||||
FabricSessionWebSocketEnabled: true,
|
||||
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
|
||||
if entry.Event == "fabric_session_websocket_opened" {
|
||||
opened++
|
||||
}
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
manager := NewFabricSessionPeerManager()
|
||||
defer manager.Close()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
target := FabricSessionPeerTarget{
|
||||
PeerID: "node-a",
|
||||
BaseURL: server.URL,
|
||||
Options: FabricSessionDialOptions{
|
||||
Token: "rap_fsn_manager_closed",
|
||||
Timeout: time.Second,
|
||||
},
|
||||
}
|
||||
|
||||
first, err := manager.Get(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("first get: %v", err)
|
||||
}
|
||||
if err := first.Close(); err != nil {
|
||||
t.Fatalf("close first pump: %v", err)
|
||||
}
|
||||
if !first.Closed() {
|
||||
t.Fatal("first pump should report closed")
|
||||
}
|
||||
second, err := manager.Get(ctx, target)
|
||||
if err != nil {
|
||||
t.Fatalf("second get: %v", err)
|
||||
}
|
||||
if first == second {
|
||||
t.Fatal("manager reused closed pump")
|
||||
}
|
||||
if opened != 2 {
|
||||
t.Fatalf("opened sessions = %d, want 2", opened)
|
||||
}
|
||||
snapshot := manager.Snapshot()
|
||||
if snapshot.ActiveCount != 1 ||
|
||||
snapshot.Stats.Opens != 2 ||
|
||||
snapshot.Stats.ClosedEvicted != 1 {
|
||||
t.Fatalf("snapshot = %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricSessionPeerManagerRejectsIncompleteTarget(t *testing.T) {
|
||||
manager := NewFabricSessionPeerManager()
|
||||
_, err := manager.Get(context.Background(), FabricSessionPeerTarget{PeerID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("incomplete target unexpectedly succeeded")
|
||||
}
|
||||
}
|
||||
@@ -308,7 +308,7 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
Transport: intent.Transport,
|
||||
PeerCertSHA256: intent.BestPeerCertSHA256,
|
||||
}}
|
||||
if intent.DirectCandidate {
|
||||
if intent.DirectCandidate || peerConnectionShouldProbeDirectUpgrade(intent, cacheEntry) {
|
||||
targets = peerConnectionProbeTargets(intent, cacheEntry)
|
||||
}
|
||||
var lastFailure string
|
||||
@@ -354,7 +354,9 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
result.SelectedCandidateID = probePeer.BestCandidateID
|
||||
result.SelectedEndpoint = probePeer.Endpoint
|
||||
result.LatencyMs = latency
|
||||
if intent.RelayCandidate {
|
||||
if probeTargetUsesDirectQUIC(probeTarget) {
|
||||
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
|
||||
} else if intent.RelayCandidate {
|
||||
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
|
||||
} else {
|
||||
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
|
||||
@@ -410,6 +412,10 @@ func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer P
|
||||
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
|
||||
seen := map[string]struct{}{}
|
||||
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
|
||||
fallbackPeerCertSHA256 := firstNonEmpty(
|
||||
strings.TrimSpace(cacheEntry.BestPeerCertSHA256),
|
||||
strings.TrimSpace(intent.BestPeerCertSHA256),
|
||||
)
|
||||
add := func(candidateID, endpoint, transport, peerCertSHA256 string) {
|
||||
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
|
||||
if endpoint == "" {
|
||||
@@ -423,6 +429,9 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
|
||||
return
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
if strings.TrimSpace(peerCertSHA256) == "" {
|
||||
peerCertSHA256 = fallbackPeerCertSHA256
|
||||
}
|
||||
out = append(out, peerConnectionProbeTarget{
|
||||
CandidateID: strings.TrimSpace(candidateID),
|
||||
Endpoint: endpoint,
|
||||
@@ -440,6 +449,31 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
|
||||
return out
|
||||
}
|
||||
|
||||
func peerConnectionShouldProbeDirectUpgrade(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) bool {
|
||||
if intent.DirectCandidate {
|
||||
return true
|
||||
}
|
||||
if strings.TrimSpace(intent.ConnectionState) != PeerConnectionRelayReady &&
|
||||
!intent.RelayCandidate &&
|
||||
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayControl {
|
||||
return false
|
||||
}
|
||||
for _, candidate := range cacheEntry.EndpointCandidates {
|
||||
if candidateUsableForDirectProbe(candidate) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func probeTargetUsesDirectQUIC(target peerConnectionProbeTarget) bool {
|
||||
transport := strings.ToLower(strings.TrimSpace(target.Transport))
|
||||
if strings.Contains(transport, "relay") || strings.Contains(transport, "reverse") || strings.Contains(transport, "outbound") {
|
||||
return false
|
||||
}
|
||||
return peerConnectionTargetIsQUIC(target.Transport, target.Endpoint)
|
||||
}
|
||||
|
||||
func peerConnectionTargetIsQUIC(transport string, endpoint string) bool {
|
||||
return isQUICOnlyCandidateTransport(transport) || strings.HasPrefix(strings.ToLower(strings.TrimSpace(endpoint)), "quic://")
|
||||
}
|
||||
|
||||
@@ -221,6 +221,125 @@ func TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC(t *testing.T
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
|
||||
intent := PeerConnectionIntent{
|
||||
NodeID: "node-b",
|
||||
BestPeerCertSHA256: "intent-cert",
|
||||
}
|
||||
cacheEntry := PeerCacheEntry{
|
||||
NodeID: "node-b",
|
||||
BestPeerCertSHA256: "cache-cert",
|
||||
BestCandidateID: "node-b-best",
|
||||
BestTransport: "direct_quic",
|
||||
Endpoint: "quic://94.141.118.222:19199",
|
||||
EndpointCandidates: []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://94.141.118.222:19199",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
targets := peerConnectionProbeTargets(intent, cacheEntry)
|
||||
if len(targets) != 1 {
|
||||
t.Fatalf("target count = %d, want 1", len(targets))
|
||||
}
|
||||
for _, target := range targets {
|
||||
if target.Endpoint != "quic://94.141.118.222:19199" {
|
||||
continue
|
||||
}
|
||||
if target.PeerCertSHA256 != "cache-cert" {
|
||||
t.Fatalf("peer cert = %q, want cache-cert", target.PeerCertSHA256)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC(t *testing.T) {
|
||||
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
tlsConfig := testQUICTLSConfig(t)
|
||||
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
|
||||
ListenAddr: "127.0.0.1:0",
|
||||
TLSConfig: tlsConfig,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start quic fabric server: %v", err)
|
||||
}
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
certSHA256 := testQUICCertSHA256(t, tlsConfig)
|
||||
leases := []PeerRendezvousLease{{
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "quic://127.0.0.1:1",
|
||||
Transport: "relay_quic",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 10,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-direct",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_quic",
|
||||
Address: "quic://" + server.Addr().String(),
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
Metadata: peerConnectionProbeMetadata(t, certSHA256),
|
||||
},
|
||||
},
|
||||
},
|
||||
RendezvousLeases: leases,
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
RendezvousLeases: leases,
|
||||
QUICTransport: NewQUICFabricTransport(nil),
|
||||
ProbeTimeout: time.Second,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
},
|
||||
})
|
||||
|
||||
cycle := manager.ProbeOnce(context.Background())
|
||||
if cycle.Attempted != 1 || cycle.Succeeded != 1 || len(cycle.Results) != 1 {
|
||||
t.Fatalf("unexpected cycle: %+v", cycle)
|
||||
}
|
||||
result := cycle.Results[0]
|
||||
if result.SelectedCandidateID != "node-b-direct" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
|
||||
t.Fatalf("relay-ready peer did not upgrade to direct candidate: %+v", result)
|
||||
}
|
||||
if result.ConnectionState.State != PeerConnectionReady {
|
||||
t.Fatalf("connection state = %q, want ready", result.ConnectionState.State)
|
||||
}
|
||||
if len(result.CandidateResults) == 0 || result.CandidateResults[0].Transport != "direct_quic" || result.CandidateResults[0].LinkStatus != PeerConnectionProbeReachable {
|
||||
t.Fatalf("candidate trail missing direct probe success: %+v", result.CandidateResults)
|
||||
}
|
||||
snapshot := tracker.Snapshot()
|
||||
if snapshot.Ready != 1 || snapshot.RelayReady != 0 {
|
||||
t.Fatalf("unexpected tracker snapshot after direct upgrade: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
|
||||
@@ -102,8 +102,11 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
|
||||
continue
|
||||
}
|
||||
switch connection.State {
|
||||
case PeerConnectionReady, PeerConnectionRelayReady:
|
||||
case PeerConnectionReady:
|
||||
ready++
|
||||
case PeerConnectionRelayReady:
|
||||
// Relay-ready peers remain valuable for control-plane reachability,
|
||||
// but they do not satisfy the target for direct-ready transport paths.
|
||||
case PeerConnectionDegraded:
|
||||
degraded++
|
||||
case PeerConnectionBackoff:
|
||||
|
||||
@@ -69,7 +69,7 @@ func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
|
||||
func TestPeerRecoveryPlanTreatsRelayReadyPeersAsRecoveryGap(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{
|
||||
@@ -92,12 +92,15 @@ func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
|
||||
t.Fatalf("unexpected steady plan: %+v", plan)
|
||||
if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy {
|
||||
t.Fatalf("unexpected relay-ready recovery plan: %+v", plan)
|
||||
}
|
||||
if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") {
|
||||
t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates)
|
||||
}
|
||||
if plan.ReadyPeerCount != 0 || plan.Deficit != 1 {
|
||||
t.Fatalf("relay-ready peer should not satisfy direct-ready target: %+v", plan)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
|
||||
|
||||
@@ -0,0 +1,713 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
)
|
||||
|
||||
// FabricRegistryGossipRecordSchema is the only schema_version value accepted
// by record validation.
const FabricRegistryGossipRecordSchema = "rap.fabric.registry.gossip_record.v1"

// Scopes a gossip record may be published under.
const (
	FabricRegistryScopeFarm         = "farm"
	FabricRegistryScopeCluster      = "cluster"
	FabricRegistryScopeOrganization = "organization"
)

// Service names carried in the Service field of a gossip record.
const (
	FabricRegistryServiceControlAPI  = "control-api"
	FabricRegistryServiceUpdateStore = "update-store"
	FabricRegistryServiceUpdateCache = "update-cache"
	FabricRegistryServiceWebAdmin    = "web-admin"
	FabricRegistryServiceVPNExitPool = "vpn-egress-pool"
)

// Issuer roles used for record issuers and trusted-issuer matching.
const (
	FabricRegistryAuthorityControl = "control-authority"
	FabricRegistryAuthorityUpdate  = "update-authority"
	FabricRegistryAuthorityStorage = "storage-authority"
	FabricRegistryAuthorityRoute   = "route-authority"
)
|
||||
|
||||
type FabricRegistryEndpoint struct {
|
||||
EndpointID string `json:"endpoint_id"`
|
||||
Address string `json:"address"`
|
||||
Transport string `json:"transport"`
|
||||
Reachability string `json:"reachability,omitempty"`
|
||||
ConnectivityMode string `json:"connectivity_mode,omitempty"`
|
||||
Region string `json:"region,omitempty"`
|
||||
Priority int `json:"priority,omitempty"`
|
||||
Weight int `json:"weight,omitempty"`
|
||||
PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
|
||||
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
|
||||
Metadata json.RawMessage `json:"metadata,omitempty"`
|
||||
}
|
||||
|
||||
// FabricRegistrySignature is one detached signature attached to a gossip
// record. Signatures are excluded from the canonical payload they sign.
type FabricRegistrySignature struct {
	KeyID string `json:"key_id"`
	// IssuerID and Role select the trusted issuer used for verification.
	IssuerID string `json:"issuer_id"`
	Role     string `json:"role"`
	// Alg must be "ed25519" (case-insensitive) for the signature to be considered.
	Alg string `json:"alg"`
	// Value is the hex-encoded signature bytes.
	Value string `json:"value"`
}
|
||||
|
||||
type FabricRegistryGossipRecord struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
Service string `json:"service"`
|
||||
Scope string `json:"scope"`
|
||||
OrganizationID string `json:"organization_id,omitempty"`
|
||||
Epoch int64 `json:"epoch"`
|
||||
Generation string `json:"generation,omitempty"`
|
||||
IssuedAt time.Time `json:"issued_at"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
IssuerNodeID string `json:"issuer_node_id"`
|
||||
IssuerRole string `json:"issuer_role"`
|
||||
Endpoints []FabricRegistryEndpoint `json:"endpoints"`
|
||||
Metadata json.RawMessage `json:"metadata,omitempty"`
|
||||
Signatures []FabricRegistrySignature `json:"signatures,omitempty"`
|
||||
}
|
||||
|
||||
// FabricRegistryTrustedIssuer describes a signer whose signatures count
// toward the verification quorum.
type FabricRegistryTrustedIssuer struct {
	IssuerID string
	// Role, when non-empty, must equal the record's IssuerRole.
	Role      string
	PublicKey ed25519.PublicKey
	// Scopes and Services restrict what the issuer may sign for; an empty
	// list places no restriction.
	Scopes   []string
	Services []string
}
|
||||
|
||||
type FabricRegistryVerificationPolicy struct {
|
||||
LocalClusterID string
|
||||
TrustedIssuers []FabricRegistryTrustedIssuer
|
||||
RequiredSignatures int
|
||||
MaxClockSkew time.Duration
|
||||
Now time.Time
|
||||
}
|
||||
|
||||
// FabricRegistryVerificationResult summarizes a signature verification pass.
type FabricRegistryVerificationResult struct {
	// AcceptedSignatureCount is the number of distinct accepted issuers.
	AcceptedSignatureCount int `json:"accepted_signature_count"`
	// AcceptedIssuers is sorted lexicographically.
	AcceptedIssuers []string `json:"accepted_issuers,omitempty"`
	// RecordHash is the hex SHA-256 of the canonical record payload.
	RecordHash string `json:"record_hash"`
}
|
||||
|
||||
// FabricRegistryEntryState is the lifecycle state of a registry entry.
type FabricRegistryEntryState string

// Known entry states. Newly verified records start as candidates and become
// active once they are live-verified.
const (
	FabricRegistryCandidate = FabricRegistryEntryState("candidate")
	FabricRegistryActive    = FabricRegistryEntryState("active")
	FabricRegistryExpired   = FabricRegistryEntryState("expired")
	FabricRegistryRejected  = FabricRegistryEntryState("rejected")
)
|
||||
|
||||
type FabricRegistryEntry struct {
|
||||
Record FabricRegistryGossipRecord `json:"record"`
|
||||
State FabricRegistryEntryState `json:"state"`
|
||||
AcceptedAt time.Time `json:"accepted_at"`
|
||||
PromotedAt *time.Time `json:"promoted_at,omitempty"`
|
||||
VerifyResult FabricRegistryVerificationResult `json:"verify_result"`
|
||||
}
|
||||
|
||||
// FabricRegistryBootstrapReport tallies the outcome of loading bootstrap
// records.
type FabricRegistryBootstrapReport struct {
	// Total is the number of decoded records, including rejected and
	// unchanged ones.
	Total     int `json:"total"`
	Active    int `json:"active"`
	Candidate int `json:"candidate"`
	Rejected  int `json:"rejected"`
	// Rejects holds one error message per rejected record.
	Rejects []string `json:"rejects,omitempty"`
	// RecordKeys lists the registry keys of records that changed the registry.
	RecordKeys []string `json:"record_keys,omitempty"`
}
|
||||
|
||||
// FabricRegistryResolveRequest selects which service record to resolve.
type FabricRegistryResolveRequest struct {
	ClusterID string
	// Service is required; it is trimmed and lower-cased before lookup.
	Service string
	// Scope chooses the starting scope; broader scopes are tried as fallback.
	Scope          string
	OrganizationID string
	// PreferredRegion moves endpoints from that region to the front.
	PreferredRegion string
	// Now is the evaluation time; the zero value means the current UTC time.
	Now time.Time
}
|
||||
|
||||
type FabricRegistryResolvedService struct {
|
||||
Found bool `json:"found"`
|
||||
Service string `json:"service"`
|
||||
Scope string `json:"scope,omitempty"`
|
||||
OrganizationID string `json:"organization_id,omitempty"`
|
||||
RecordEpoch int64 `json:"record_epoch,omitempty"`
|
||||
RecordHash string `json:"record_hash,omitempty"`
|
||||
Endpoints []FabricRegistryEndpoint `json:"endpoints,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
}
|
||||
|
||||
// FabricRegistryLiveProbeRequest parameterizes a candidate verification pass.
type FabricRegistryLiveProbeRequest struct {
	// ClusterID, when non-empty, restricts probing to that cluster's candidates.
	ClusterID       string
	PreferredRegion string
	// Timeout is the per-endpoint probe budget; values <= 0 default to 2s.
	Timeout time.Duration
	// Now is the evaluation time; the zero value means the current UTC time.
	Now time.Time
	// MaxCandidates caps how many candidates are probed; values <= 0 default to 16.
	MaxCandidates int
}
|
||||
|
||||
// FabricRegistryLiveProbeResult reports the probe outcome for one candidate
// record.
type FabricRegistryLiveProbeResult struct {
	Service        string `json:"service"`
	Scope          string `json:"scope"`
	OrganizationID string `json:"organization_id,omitempty"`
	// EndpointID and Address identify the last endpoint that was probed.
	EndpointID string `json:"endpoint_id,omitempty"`
	Address    string `json:"address,omitempty"`
	// Status is "reachable" or "unreachable".
	Status    string `json:"status"`
	LatencyMs int64  `json:"latency_ms,omitempty"`
	// Promoted reports whether the candidate was moved to the active set.
	Promoted bool   `json:"promoted"`
	Error    string `json:"error,omitempty"`
}
|
||||
|
||||
// FabricRegistrySnapshot counts the unexpired active and candidate entries
// and lists their keys in sorted order.
type FabricRegistrySnapshot struct {
	Active        int      `json:"active"`
	Candidate     int      `json:"candidate"`
	ActiveKeys    []string `json:"active_keys,omitempty"`
	CandidateKeys []string `json:"candidate_keys,omitempty"`
}
|
||||
|
||||
type FabricRegistry struct {
|
||||
entries map[string]FabricRegistryEntry
|
||||
candidates map[string]FabricRegistryEntry
|
||||
}
|
||||
|
||||
func NewFabricRegistry() *FabricRegistry {
|
||||
return &FabricRegistry{entries: map[string]FabricRegistryEntry{}, candidates: map[string]FabricRegistryEntry{}}
|
||||
}
|
||||
|
||||
func LoadFabricRegistryBootstrapRecords(recordsJSON string, policy FabricRegistryVerificationPolicy, liveVerified bool) (*FabricRegistry, FabricRegistryBootstrapReport, error) {
|
||||
registry := NewFabricRegistry()
|
||||
recordsJSON = strings.TrimSpace(recordsJSON)
|
||||
if recordsJSON == "" {
|
||||
return registry, FabricRegistryBootstrapReport{}, nil
|
||||
}
|
||||
var records []FabricRegistryGossipRecord
|
||||
if err := json.Unmarshal([]byte(recordsJSON), &records); err != nil {
|
||||
return nil, FabricRegistryBootstrapReport{}, fmt.Errorf("decode fabric registry bootstrap records: %w", err)
|
||||
}
|
||||
report := FabricRegistryBootstrapReport{Total: len(records)}
|
||||
for _, record := range records {
|
||||
entry, changed, err := registry.ApplyGossipRecord(record, policy, liveVerified)
|
||||
if err != nil {
|
||||
report.Rejected++
|
||||
report.Rejects = append(report.Rejects, err.Error())
|
||||
continue
|
||||
}
|
||||
if !changed {
|
||||
continue
|
||||
}
|
||||
report.RecordKeys = append(report.RecordKeys, fabricRegistryRecordKey(record))
|
||||
switch entry.State {
|
||||
case FabricRegistryActive:
|
||||
report.Active++
|
||||
case FabricRegistryCandidate:
|
||||
report.Candidate++
|
||||
}
|
||||
}
|
||||
return registry, report, nil
|
||||
}
|
||||
|
||||
func (r *FabricRegistry) ApplyGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy, liveVerified bool) (FabricRegistryEntry, bool, error) {
|
||||
if r == nil {
|
||||
return FabricRegistryEntry{}, false, fmt.Errorf("fabric registry is nil")
|
||||
}
|
||||
result, err := VerifyFabricRegistryGossipRecord(record, policy)
|
||||
if err != nil {
|
||||
return FabricRegistryEntry{}, false, err
|
||||
}
|
||||
now := registryNow(policy.Now)
|
||||
key := fabricRegistryRecordKey(record)
|
||||
current, exists := r.entries[key]
|
||||
if exists && !fabricRegistryRecordNewer(record, current.Record, now) {
|
||||
return current, false, nil
|
||||
}
|
||||
state := FabricRegistryCandidate
|
||||
var promotedAt *time.Time
|
||||
if liveVerified {
|
||||
state = FabricRegistryActive
|
||||
t := now
|
||||
promotedAt = &t
|
||||
}
|
||||
entry := FabricRegistryEntry{
|
||||
Record: normalizeFabricRegistryRecord(record),
|
||||
State: state,
|
||||
AcceptedAt: now,
|
||||
PromotedAt: promotedAt,
|
||||
VerifyResult: result,
|
||||
}
|
||||
if state == FabricRegistryActive {
|
||||
r.entries[key] = entry
|
||||
delete(r.candidates, key)
|
||||
return entry, true, nil
|
||||
}
|
||||
if r.candidates == nil {
|
||||
r.candidates = map[string]FabricRegistryEntry{}
|
||||
}
|
||||
r.candidates[key] = entry
|
||||
return entry, true, nil
|
||||
}
|
||||
|
||||
func (r *FabricRegistry) MarkLiveVerified(clusterID, service, scope, organizationID string, now time.Time) bool {
|
||||
if r == nil {
|
||||
return false
|
||||
}
|
||||
key := fabricRegistryKey(clusterID, service, scope, organizationID)
|
||||
entry, ok := r.candidates[key]
|
||||
if !ok || entry.State == FabricRegistryExpired || entry.State == FabricRegistryRejected {
|
||||
return false
|
||||
}
|
||||
t := registryNow(now)
|
||||
entry.State = FabricRegistryActive
|
||||
entry.PromotedAt = &t
|
||||
r.entries[key] = entry
|
||||
delete(r.candidates, key)
|
||||
return true
|
||||
}
|
||||
|
||||
func (r *FabricRegistry) Active(clusterID, service, scope, organizationID string, now time.Time) (FabricRegistryGossipRecord, bool) {
|
||||
if r == nil {
|
||||
return FabricRegistryGossipRecord{}, false
|
||||
}
|
||||
entry, ok := r.entries[fabricRegistryKey(clusterID, service, scope, organizationID)]
|
||||
if !ok || entry.State != FabricRegistryActive || !entry.Record.ExpiresAt.After(registryNow(now)) {
|
||||
return FabricRegistryGossipRecord{}, false
|
||||
}
|
||||
return entry.Record, true
|
||||
}
|
||||
|
||||
func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) FabricRegistryResolvedService {
|
||||
service := strings.ToLower(strings.TrimSpace(req.Service))
|
||||
if service == "" {
|
||||
return FabricRegistryResolvedService{Found: false, Reason: "service_required"}
|
||||
}
|
||||
scopeOrder := fabricRegistryScopeResolutionOrder(req.Scope, req.OrganizationID)
|
||||
for _, scope := range scopeOrder {
|
||||
organizationID := strings.TrimSpace(req.OrganizationID)
|
||||
if scope != FabricRegistryScopeOrganization {
|
||||
organizationID = ""
|
||||
}
|
||||
record, ok := r.Active(req.ClusterID, service, scope, organizationID, req.Now)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
endpoints := selectFabricRegistryEndpoints(record.Endpoints, req.PreferredRegion)
|
||||
if len(endpoints) == 0 {
|
||||
return FabricRegistryResolvedService{Found: false, Service: service, Scope: scope, OrganizationID: organizationID, Reason: "no_usable_endpoints"}
|
||||
}
|
||||
result, _ := canonicalFabricRegistryPayload(record)
|
||||
sum := sha256.Sum256(result)
|
||||
return FabricRegistryResolvedService{
|
||||
Found: true,
|
||||
Service: service,
|
||||
Scope: scope,
|
||||
OrganizationID: organizationID,
|
||||
RecordEpoch: record.Epoch,
|
||||
RecordHash: hex.EncodeToString(sum[:]),
|
||||
Endpoints: endpoints,
|
||||
}
|
||||
}
|
||||
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
|
||||
}
|
||||
|
||||
func (r *FabricRegistry) Snapshot(now time.Time) FabricRegistrySnapshot {
|
||||
if r == nil {
|
||||
return FabricRegistrySnapshot{}
|
||||
}
|
||||
now = registryNow(now)
|
||||
out := FabricRegistrySnapshot{}
|
||||
for key, entry := range r.entries {
|
||||
if entry.State == FabricRegistryActive && entry.Record.ExpiresAt.After(now) {
|
||||
out.Active++
|
||||
out.ActiveKeys = append(out.ActiveKeys, key)
|
||||
}
|
||||
}
|
||||
for key, entry := range r.candidates {
|
||||
if entry.State == FabricRegistryCandidate && entry.Record.ExpiresAt.After(now) {
|
||||
out.Candidate++
|
||||
out.CandidateKeys = append(out.CandidateKeys, key)
|
||||
}
|
||||
}
|
||||
sort.Strings(out.ActiveKeys)
|
||||
sort.Strings(out.CandidateKeys)
|
||||
return out
|
||||
}
|
||||
|
||||
func (r *FabricRegistry) VerifyCandidates(ctx context.Context, transport FabricTransport, req FabricRegistryLiveProbeRequest) []FabricRegistryLiveProbeResult {
|
||||
if r == nil {
|
||||
return nil
|
||||
}
|
||||
now := registryNow(req.Now)
|
||||
timeout := req.Timeout
|
||||
if timeout <= 0 {
|
||||
timeout = 2 * time.Second
|
||||
}
|
||||
maxCandidates := req.MaxCandidates
|
||||
if maxCandidates <= 0 {
|
||||
maxCandidates = 16
|
||||
}
|
||||
candidates := make([]FabricRegistryEntry, 0, len(r.candidates))
|
||||
for _, entry := range r.candidates {
|
||||
if entry.State != FabricRegistryCandidate || !entry.Record.ExpiresAt.After(now) {
|
||||
continue
|
||||
}
|
||||
if clusterID := strings.TrimSpace(req.ClusterID); clusterID != "" && entry.Record.ClusterID != clusterID {
|
||||
continue
|
||||
}
|
||||
candidates = append(candidates, entry)
|
||||
}
|
||||
sort.SliceStable(candidates, func(i, j int) bool {
|
||||
if candidates[i].Record.Service != candidates[j].Record.Service {
|
||||
return candidates[i].Record.Service < candidates[j].Record.Service
|
||||
}
|
||||
if candidates[i].Record.Scope != candidates[j].Record.Scope {
|
||||
return candidates[i].Record.Scope < candidates[j].Record.Scope
|
||||
}
|
||||
return candidates[i].Record.Epoch > candidates[j].Record.Epoch
|
||||
})
|
||||
if len(candidates) > maxCandidates {
|
||||
candidates = candidates[:maxCandidates]
|
||||
}
|
||||
results := make([]FabricRegistryLiveProbeResult, 0, len(candidates))
|
||||
for _, entry := range candidates {
|
||||
record := entry.Record
|
||||
result := FabricRegistryLiveProbeResult{
|
||||
Service: record.Service,
|
||||
Scope: record.Scope,
|
||||
OrganizationID: record.OrganizationID,
|
||||
Status: "unreachable",
|
||||
}
|
||||
endpoints := selectFabricRegistryEndpoints(record.Endpoints, req.PreferredRegion)
|
||||
if len(endpoints) == 0 {
|
||||
result.Error = "no_usable_endpoints"
|
||||
results = append(results, result)
|
||||
continue
|
||||
}
|
||||
for _, endpoint := range endpoints {
|
||||
probeCtx, cancel := context.WithTimeout(ctx, timeout)
|
||||
latency, err := probeFabricRegistryEndpoint(probeCtx, transport, endpoint, timeout)
|
||||
cancel()
|
||||
result.EndpointID = endpoint.EndpointID
|
||||
result.Address = endpoint.Address
|
||||
if err != nil {
|
||||
result.Error = err.Error()
|
||||
continue
|
||||
}
|
||||
result.Status = "reachable"
|
||||
result.LatencyMs = latency.Milliseconds()
|
||||
result.Promoted = r.MarkLiveVerified(record.ClusterID, record.Service, record.Scope, record.OrganizationID, now)
|
||||
result.Error = ""
|
||||
break
|
||||
}
|
||||
results = append(results, result)
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
func SignFabricRegistryGossipRecord(record FabricRegistryGossipRecord, issuer FabricRegistryTrustedIssuer, privateKey ed25519.PrivateKey) (FabricRegistryGossipRecord, error) {
|
||||
payload, err := canonicalFabricRegistryPayload(record)
|
||||
if err != nil {
|
||||
return record, err
|
||||
}
|
||||
sig := ed25519.Sign(privateKey, payload)
|
||||
record.Signatures = append(record.Signatures, FabricRegistrySignature{
|
||||
KeyID: firstNonEmpty(issuer.IssuerID, record.IssuerNodeID),
|
||||
IssuerID: firstNonEmpty(issuer.IssuerID, record.IssuerNodeID),
|
||||
Role: firstNonEmpty(issuer.Role, record.IssuerRole),
|
||||
Alg: "ed25519",
|
||||
Value: hex.EncodeToString(sig),
|
||||
})
|
||||
return record, nil
|
||||
}
|
||||
|
||||
func VerifyFabricRegistryGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy) (FabricRegistryVerificationResult, error) {
|
||||
record = normalizeFabricRegistryRecord(record)
|
||||
if err := validateFabricRegistryGossipRecord(record, policy); err != nil {
|
||||
return FabricRegistryVerificationResult{}, err
|
||||
}
|
||||
payload, err := canonicalFabricRegistryPayload(record)
|
||||
if err != nil {
|
||||
return FabricRegistryVerificationResult{}, err
|
||||
}
|
||||
sum := sha256.Sum256(payload)
|
||||
trusted := map[string]FabricRegistryTrustedIssuer{}
|
||||
for _, issuer := range policy.TrustedIssuers {
|
||||
if strings.TrimSpace(issuer.IssuerID) != "" {
|
||||
trusted[issuer.IssuerID] = issuer
|
||||
}
|
||||
if strings.TrimSpace(issuer.IssuerID) != "" && strings.TrimSpace(issuer.Role) != "" {
|
||||
trusted[issuer.IssuerID+"\x00"+issuer.Role] = issuer
|
||||
}
|
||||
}
|
||||
accepted := map[string]struct{}{}
|
||||
for _, signature := range record.Signatures {
|
||||
if strings.ToLower(strings.TrimSpace(signature.Alg)) != "ed25519" {
|
||||
continue
|
||||
}
|
||||
issuer, ok := trusted[strings.TrimSpace(signature.IssuerID)+"\x00"+strings.TrimSpace(signature.Role)]
|
||||
if !ok {
|
||||
issuer, ok = trusted[strings.TrimSpace(signature.IssuerID)]
|
||||
}
|
||||
if !ok || !fabricRegistryIssuerAllowed(issuer, record) {
|
||||
continue
|
||||
}
|
||||
rawSig, err := hex.DecodeString(strings.TrimSpace(signature.Value))
|
||||
if err != nil || len(rawSig) != ed25519.SignatureSize || len(issuer.PublicKey) != ed25519.PublicKeySize {
|
||||
continue
|
||||
}
|
||||
if ed25519.Verify(issuer.PublicKey, payload, rawSig) {
|
||||
accepted[signature.IssuerID] = struct{}{}
|
||||
}
|
||||
}
|
||||
required := policy.RequiredSignatures
|
||||
if required <= 0 {
|
||||
required = 1
|
||||
}
|
||||
if len(accepted) < required {
|
||||
return FabricRegistryVerificationResult{RecordHash: hex.EncodeToString(sum[:])}, fmt.Errorf("fabric registry gossip record lacks required trusted signatures")
|
||||
}
|
||||
issuers := make([]string, 0, len(accepted))
|
||||
for issuer := range accepted {
|
||||
issuers = append(issuers, issuer)
|
||||
}
|
||||
sort.Strings(issuers)
|
||||
return FabricRegistryVerificationResult{
|
||||
AcceptedSignatureCount: len(accepted),
|
||||
AcceptedIssuers: issuers,
|
||||
RecordHash: hex.EncodeToString(sum[:]),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func validateFabricRegistryGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy) error {
|
||||
if record.SchemaVersion != FabricRegistryGossipRecordSchema {
|
||||
return fmt.Errorf("fabric registry gossip record schema_version is invalid")
|
||||
}
|
||||
if strings.TrimSpace(record.ClusterID) == "" || (strings.TrimSpace(policy.LocalClusterID) != "" && record.ClusterID != policy.LocalClusterID) {
|
||||
return ErrClusterMismatch
|
||||
}
|
||||
if strings.TrimSpace(record.Service) == "" || strings.TrimSpace(record.Scope) == "" || strings.TrimSpace(record.IssuerNodeID) == "" || strings.TrimSpace(record.IssuerRole) == "" {
|
||||
return fmt.Errorf("fabric registry gossip record is missing service, scope, or issuer")
|
||||
}
|
||||
if record.Epoch <= 0 || record.IssuedAt.IsZero() || record.ExpiresAt.IsZero() || !record.ExpiresAt.After(record.IssuedAt) {
|
||||
return fmt.Errorf("fabric registry gossip record has invalid epoch or validity window")
|
||||
}
|
||||
now := registryNow(policy.Now)
|
||||
skew := policy.MaxClockSkew
|
||||
if skew <= 0 {
|
||||
skew = time.Minute
|
||||
}
|
||||
if record.IssuedAt.After(now.Add(skew)) || !record.ExpiresAt.After(now) {
|
||||
return fmt.Errorf("fabric registry gossip record is not currently valid")
|
||||
}
|
||||
if len(record.Endpoints) == 0 {
|
||||
return fmt.Errorf("fabric registry gossip record has no endpoints")
|
||||
}
|
||||
for _, endpoint := range record.Endpoints {
|
||||
if strings.TrimSpace(endpoint.EndpointID) == "" || strings.TrimSpace(endpoint.Address) == "" || strings.TrimSpace(endpoint.Transport) == "" {
|
||||
return fmt.Errorf("fabric registry gossip record contains invalid endpoint")
|
||||
}
|
||||
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
|
||||
return fmt.Errorf("fabric registry gossip endpoint must be QUIC-only")
|
||||
}
|
||||
if len(endpoint.Metadata) > 0 && !json.Valid(endpoint.Metadata) {
|
||||
return fmt.Errorf("fabric registry gossip endpoint metadata is invalid")
|
||||
}
|
||||
}
|
||||
if len(record.Metadata) > 0 && !json.Valid(record.Metadata) {
|
||||
return fmt.Errorf("fabric registry gossip metadata is invalid")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func canonicalFabricRegistryPayload(record FabricRegistryGossipRecord) ([]byte, error) {
|
||||
record = normalizeFabricRegistryRecord(record)
|
||||
record.Signatures = nil
|
||||
payload, err := json.Marshal(record)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var compact bytes.Buffer
|
||||
if err := json.Compact(&compact, payload); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return compact.Bytes(), nil
|
||||
}
|
||||
|
||||
func normalizeFabricRegistryRecord(record FabricRegistryGossipRecord) FabricRegistryGossipRecord {
|
||||
record.SchemaVersion = strings.TrimSpace(record.SchemaVersion)
|
||||
record.ClusterID = strings.TrimSpace(record.ClusterID)
|
||||
record.Service = strings.ToLower(strings.TrimSpace(record.Service))
|
||||
record.Scope = strings.ToLower(strings.TrimSpace(record.Scope))
|
||||
record.OrganizationID = strings.TrimSpace(record.OrganizationID)
|
||||
record.IssuerNodeID = strings.TrimSpace(record.IssuerNodeID)
|
||||
record.IssuerRole = strings.TrimSpace(record.IssuerRole)
|
||||
record.Generation = strings.TrimSpace(record.Generation)
|
||||
for i := range record.Endpoints {
|
||||
record.Endpoints[i].EndpointID = strings.TrimSpace(record.Endpoints[i].EndpointID)
|
||||
record.Endpoints[i].Address = strings.TrimSpace(record.Endpoints[i].Address)
|
||||
record.Endpoints[i].Transport = strings.TrimSpace(record.Endpoints[i].Transport)
|
||||
record.Endpoints[i].Reachability = strings.TrimSpace(record.Endpoints[i].Reachability)
|
||||
record.Endpoints[i].ConnectivityMode = strings.TrimSpace(record.Endpoints[i].ConnectivityMode)
|
||||
record.Endpoints[i].Region = strings.TrimSpace(record.Endpoints[i].Region)
|
||||
record.Endpoints[i].PeerCertSHA256 = normalizeCertSHA256(record.Endpoints[i].PeerCertSHA256)
|
||||
}
|
||||
sort.SliceStable(record.Endpoints, func(i, j int) bool {
|
||||
if record.Endpoints[i].Priority != record.Endpoints[j].Priority {
|
||||
return record.Endpoints[i].Priority < record.Endpoints[j].Priority
|
||||
}
|
||||
return record.Endpoints[i].EndpointID < record.Endpoints[j].EndpointID
|
||||
})
|
||||
sort.SliceStable(record.Signatures, func(i, j int) bool {
|
||||
if record.Signatures[i].IssuerID != record.Signatures[j].IssuerID {
|
||||
return record.Signatures[i].IssuerID < record.Signatures[j].IssuerID
|
||||
}
|
||||
return record.Signatures[i].KeyID < record.Signatures[j].KeyID
|
||||
})
|
||||
return record
|
||||
}
|
||||
|
||||
func fabricRegistryIssuerAllowed(issuer FabricRegistryTrustedIssuer, record FabricRegistryGossipRecord) bool {
|
||||
if strings.TrimSpace(issuer.Role) != "" && issuer.Role != record.IssuerRole {
|
||||
return false
|
||||
}
|
||||
if len(issuer.Scopes) > 0 && !stringInSlice(record.Scope, issuer.Scopes) {
|
||||
return false
|
||||
}
|
||||
if len(issuer.Services) > 0 && !stringInSlice(record.Service, issuer.Services) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func fabricRegistryRecordKey(record FabricRegistryGossipRecord) string {
|
||||
return fabricRegistryKey(record.ClusterID, record.Service, record.Scope, record.OrganizationID)
|
||||
}
|
||||
|
||||
func fabricRegistryScopeResolutionOrder(scope string, organizationID string) []string {
|
||||
scope = strings.ToLower(strings.TrimSpace(scope))
|
||||
switch scope {
|
||||
case FabricRegistryScopeOrganization:
|
||||
if strings.TrimSpace(organizationID) != "" {
|
||||
return []string{FabricRegistryScopeOrganization, FabricRegistryScopeCluster, FabricRegistryScopeFarm}
|
||||
}
|
||||
return []string{FabricRegistryScopeCluster, FabricRegistryScopeFarm}
|
||||
case FabricRegistryScopeFarm:
|
||||
return []string{FabricRegistryScopeFarm}
|
||||
case FabricRegistryScopeCluster, "":
|
||||
return []string{FabricRegistryScopeCluster, FabricRegistryScopeFarm}
|
||||
default:
|
||||
return []string{scope, FabricRegistryScopeCluster, FabricRegistryScopeFarm}
|
||||
}
|
||||
}
|
||||
|
||||
func selectFabricRegistryEndpoints(endpoints []FabricRegistryEndpoint, preferredRegion string) []FabricRegistryEndpoint {
|
||||
preferredRegion = strings.TrimSpace(preferredRegion)
|
||||
out := make([]FabricRegistryEndpoint, 0, len(endpoints))
|
||||
for _, endpoint := range endpoints {
|
||||
if strings.TrimSpace(endpoint.Address) == "" || !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
|
||||
continue
|
||||
}
|
||||
out = append(out, endpoint)
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if preferredRegion != "" {
|
||||
iMatch := strings.EqualFold(out[i].Region, preferredRegion)
|
||||
jMatch := strings.EqualFold(out[j].Region, preferredRegion)
|
||||
if iMatch != jMatch {
|
||||
return iMatch
|
||||
}
|
||||
}
|
||||
if out[i].Priority != out[j].Priority {
|
||||
return out[i].Priority < out[j].Priority
|
||||
}
|
||||
if out[i].Weight != out[j].Weight {
|
||||
return out[i].Weight > out[j].Weight
|
||||
}
|
||||
return out[i].EndpointID < out[j].EndpointID
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
func probeFabricRegistryEndpoint(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, timeout time.Duration) (time.Duration, error) {
|
||||
if transport == nil {
|
||||
return 0, fmt.Errorf("fabric registry live probe transport is unavailable")
|
||||
}
|
||||
if timeout <= 0 {
|
||||
timeout = 2 * time.Second
|
||||
}
|
||||
target := FabricTransportTarget{
|
||||
EndpointID: endpoint.EndpointID,
|
||||
PeerID: endpoint.EndpointID,
|
||||
Endpoint: endpoint.Address,
|
||||
Transport: endpoint.Transport,
|
||||
PeerCertSHA256: endpoint.PeerCertSHA256,
|
||||
Timeout: timeout,
|
||||
InboundBuffer: 2,
|
||||
ErrorBuffer: 2,
|
||||
}
|
||||
startedAt := time.Now()
|
||||
session, err := transport.Connect(ctx, target)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer session.Close()
|
||||
sequence := uint64(startedAt.UnixNano())
|
||||
if err := session.Send(ctx, fabricproto.Frame{Type: fabricproto.FramePing, TrafficClass: fabricproto.TrafficClassReliable, Sequence: sequence, Payload: []byte("fabric-registry-live-probe")}); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case frame, ok := <-session.Frames():
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("fabric registry live probe session closed")
|
||||
}
|
||||
if frame.Type == fabricproto.FramePong && frame.Sequence == sequence {
|
||||
return time.Since(startedAt), nil
|
||||
}
|
||||
case err, ok := <-session.Errors():
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("fabric registry live probe error channel closed")
|
||||
}
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
case <-ctx.Done():
|
||||
return 0, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fabricRegistryKey builds the registry map key: the trimmed cluster ID,
// lower-cased service, lower-cased scope and trimmed organization ID joined
// by NUL bytes.
func fabricRegistryKey(clusterID, service, scope, organizationID string) string {
	parts := []string{
		strings.TrimSpace(clusterID),
		strings.ToLower(strings.TrimSpace(service)),
		strings.ToLower(strings.TrimSpace(scope)),
		strings.TrimSpace(organizationID),
	}
	return strings.Join(parts, "\x00")
}
|
||||
|
||||
func fabricRegistryRecordNewer(next, current FabricRegistryGossipRecord, now time.Time) bool {
|
||||
if !current.ExpiresAt.After(now) {
|
||||
return true
|
||||
}
|
||||
if next.Epoch != current.Epoch {
|
||||
return next.Epoch > current.Epoch
|
||||
}
|
||||
if !next.IssuedAt.Equal(current.IssuedAt) {
|
||||
return next.IssuedAt.After(current.IssuedAt)
|
||||
}
|
||||
return strings.TrimSpace(next.Generation) > strings.TrimSpace(current.Generation)
|
||||
}
|
||||
|
||||
// registryNow returns now converted to UTC, substituting the current time
// when now is the zero value.
func registryNow(now time.Time) time.Time {
	effective := now
	if effective.IsZero() {
		effective = time.Now()
	}
	return effective.UTC()
}
|
||||
|
||||
// stringInSlice reports whether values contains value, comparing both sides
// after trimming surrounding whitespace. The comparison is case-sensitive.
func stringInSlice(value string, values []string) bool {
	want := strings.TrimSpace(value)
	for i := 0; i < len(values); i++ {
		if want == strings.TrimSpace(values[i]) {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,280 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestFabricRegistryGossipRecordRequiresTrustedSignature(t *testing.T) {
|
||||
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
record := testFabricRegistryGossipRecord(now, 10)
|
||||
issuer := FabricRegistryTrustedIssuer{
|
||||
IssuerID: "authority-1",
|
||||
Role: FabricRegistryAuthorityControl,
|
||||
PublicKey: publicKey,
|
||||
Scopes: []string{FabricRegistryScopeCluster},
|
||||
Services: []string{FabricRegistryServiceControlAPI},
|
||||
}
|
||||
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign record: %v", err)
|
||||
}
|
||||
if _, err := VerifyFabricRegistryGossipRecord(signed, FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}); err != nil {
|
||||
t.Fatalf("verify signed record: %v", err)
|
||||
}
|
||||
tampered := signed
|
||||
tampered.Endpoints[0].Address = "quic://10.10.10.10:19443"
|
||||
if _, err := VerifyFabricRegistryGossipRecord(tampered, FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}); err == nil {
|
||||
t.Fatal("tampered record verified")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
|
||||
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
record := testFabricRegistryGossipRecord(now, 10)
|
||||
record.Endpoints[0].Address = "https://control.example.test/api/v1"
|
||||
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign record: %v", err)
|
||||
}
|
||||
if _, err := VerifyFabricRegistryGossipRecord(signed, FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []FabricRegistryTrustedIssuer{
|
||||
{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey},
|
||||
},
|
||||
Now: now,
|
||||
}); err == nil {
|
||||
t.Fatal("legacy HTTP endpoint was accepted")
|
||||
}
|
||||
expired := testFabricRegistryGossipRecord(now.Add(-2*time.Hour), 11)
|
||||
expired.ExpiresAt = now.Add(-time.Minute)
|
||||
expiredSigned, err := SignFabricRegistryGossipRecord(expired, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign expired record: %v", err)
|
||||
}
|
||||
if _, err := VerifyFabricRegistryGossipRecord(expiredSigned, FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []FabricRegistryTrustedIssuer{
|
||||
{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey},
|
||||
},
|
||||
Now: now,
|
||||
}); err == nil {
|
||||
t.Fatal("expired record was accepted")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRegistryKeepsActiveRecordUntilNewerVerified(t *testing.T) {
|
||||
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
policy := FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}
|
||||
registry := NewFabricRegistry()
|
||||
active, err := SignFabricRegistryGossipRecord(testFabricRegistryGossipRecord(now, 10), issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign active: %v", err)
|
||||
}
|
||||
entry, changed, err := registry.ApplyGossipRecord(active, policy, true)
|
||||
if err != nil || !changed || entry.State != FabricRegistryActive {
|
||||
t.Fatalf("apply active entry changed=%t entry=%+v err=%v", changed, entry, err)
|
||||
}
|
||||
old := testFabricRegistryGossipRecord(now.Add(time.Minute), 9)
|
||||
old.Endpoints[0].Address = "quic://192.0.2.9:19443"
|
||||
oldSigned, err := SignFabricRegistryGossipRecord(old, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign old: %v", err)
|
||||
}
|
||||
entry, changed, err = registry.ApplyGossipRecord(oldSigned, policy, true)
|
||||
if err != nil {
|
||||
t.Fatalf("apply old: %v", err)
|
||||
}
|
||||
if changed || entry.Record.Epoch != 10 || entry.Record.Endpoints[0].Address != "quic://192.0.2.10:19443" {
|
||||
t.Fatalf("older record replaced active entry: changed=%t entry=%+v", changed, entry)
|
||||
}
|
||||
newer := testFabricRegistryGossipRecord(now.Add(2*time.Minute), 11)
|
||||
newer.Endpoints[0].Address = "quic://192.0.2.11:19443"
|
||||
newerSigned, err := SignFabricRegistryGossipRecord(newer, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign newer: %v", err)
|
||||
}
|
||||
policy.Now = now.Add(2 * time.Minute)
|
||||
entry, changed, err = registry.ApplyGossipRecord(newerSigned, policy, false)
|
||||
if err != nil || !changed || entry.State != FabricRegistryCandidate {
|
||||
t.Fatalf("apply newer candidate changed=%t entry=%+v err=%v", changed, entry, err)
|
||||
}
|
||||
activeRecord, ok := registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now)
|
||||
if !ok || activeRecord.Endpoints[0].Address != "quic://192.0.2.10:19443" {
|
||||
t.Fatalf("unverified newer candidate displaced active fallback: ok=%t record=%+v", ok, activeRecord)
|
||||
}
|
||||
if !registry.MarkLiveVerified("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now.Add(time.Second)) {
|
||||
t.Fatal("mark live verified failed")
|
||||
}
|
||||
activeRecord, ok = registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now.Add(time.Second))
|
||||
if !ok || activeRecord.Endpoints[0].Address != "quic://192.0.2.11:19443" {
|
||||
t.Fatalf("newer verified record not active: ok=%t record=%+v", ok, activeRecord)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRegistryResolveServicePrefersVerifiedScopedRegionalEndpoint(t *testing.T) {
|
||||
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
policy := FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}
|
||||
registry := NewFabricRegistry()
|
||||
clusterRecord := testFabricRegistryGossipRecord(now, 10)
|
||||
clusterRecord.Endpoints = []FabricRegistryEndpoint{
|
||||
{EndpointID: "control-eu", Address: "quic://eu.example.test:19443", Transport: "direct_quic", Region: "eu", Priority: 10, Weight: 1},
|
||||
{EndpointID: "control-us", Address: "quic://us.example.test:19443", Transport: "direct_quic", Region: "us", Priority: 10, Weight: 10},
|
||||
}
|
||||
signedCluster, err := SignFabricRegistryGossipRecord(clusterRecord, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign cluster record: %v", err)
|
||||
}
|
||||
if _, _, err := registry.ApplyGossipRecord(signedCluster, policy, true); err != nil {
|
||||
t.Fatalf("apply cluster record: %v", err)
|
||||
}
|
||||
orgRecord := testFabricRegistryGossipRecord(now.Add(time.Minute), 11)
|
||||
orgRecord.Scope = FabricRegistryScopeOrganization
|
||||
orgRecord.OrganizationID = "org-1"
|
||||
orgRecord.Endpoints = []FabricRegistryEndpoint{
|
||||
{EndpointID: "control-org", Address: "quic://org.example.test:19443", Transport: "direct_quic", Region: "eu", Priority: 1, Weight: 1},
|
||||
}
|
||||
signedOrg, err := SignFabricRegistryGossipRecord(orgRecord, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign org record: %v", err)
|
||||
}
|
||||
policy.Now = now.Add(time.Minute)
|
||||
if _, _, err := registry.ApplyGossipRecord(signedOrg, policy, false); err != nil {
|
||||
t.Fatalf("apply org candidate: %v", err)
|
||||
}
|
||||
resolved := registry.ResolveService(FabricRegistryResolveRequest{
|
||||
ClusterID: "cluster-1",
|
||||
Service: FabricRegistryServiceControlAPI,
|
||||
Scope: FabricRegistryScopeOrganization,
|
||||
OrganizationID: "org-1",
|
||||
PreferredRegion: "us",
|
||||
Now: now.Add(time.Minute),
|
||||
})
|
||||
if !resolved.Found || resolved.Scope != FabricRegistryScopeCluster || resolved.Endpoints[0].EndpointID != "control-us" {
|
||||
t.Fatalf("expected cluster fallback with preferred region endpoint, got %+v", resolved)
|
||||
}
|
||||
if !registry.MarkLiveVerified("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeOrganization, "org-1", now.Add(2*time.Minute)) {
|
||||
t.Fatal("mark org live verified failed")
|
||||
}
|
||||
resolved = registry.ResolveService(FabricRegistryResolveRequest{
|
||||
ClusterID: "cluster-1",
|
||||
Service: FabricRegistryServiceControlAPI,
|
||||
Scope: FabricRegistryScopeOrganization,
|
||||
OrganizationID: "org-1",
|
||||
Now: now.Add(2 * time.Minute),
|
||||
})
|
||||
if !resolved.Found || resolved.Scope != FabricRegistryScopeOrganization || resolved.Endpoints[0].EndpointID != "control-org" {
|
||||
t.Fatalf("expected verified organization record, got %+v", resolved)
|
||||
}
|
||||
snapshot := registry.Snapshot(now.Add(2 * time.Minute))
|
||||
if snapshot.Active != 2 || snapshot.Candidate != 0 {
|
||||
t.Fatalf("unexpected snapshot: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricRegistryVerifyCandidatesPromotesAfterQUICPong(t *testing.T) {
|
||||
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
|
||||
tlsConfig := testQUICTLSConfig(t)
|
||||
listener := startQUICFabricEchoServerWithTLS(t, tlsConfig)
|
||||
defer listener.Close()
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
|
||||
policy := FabricRegistryVerificationPolicy{
|
||||
LocalClusterID: "cluster-1",
|
||||
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
|
||||
RequiredSignatures: 1,
|
||||
Now: now,
|
||||
}
|
||||
record := testFabricRegistryGossipRecord(now, 12)
|
||||
record.Endpoints[0].Address = "quic://" + listener.Addr().String()
|
||||
record.Endpoints[0].PeerCertSHA256 = testQUICCertSHA256(t, tlsConfig)
|
||||
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
|
||||
if err != nil {
|
||||
t.Fatalf("sign record: %v", err)
|
||||
}
|
||||
registry := NewFabricRegistry()
|
||||
if entry, changed, err := registry.ApplyGossipRecord(signed, policy, false); err != nil || !changed || entry.State != FabricRegistryCandidate {
|
||||
t.Fatalf("apply candidate changed=%t entry=%+v err=%v", changed, entry, err)
|
||||
}
|
||||
results := registry.VerifyCandidates(context.Background(), NewQUICFabricTransport(nil), FabricRegistryLiveProbeRequest{
|
||||
ClusterID: "cluster-1",
|
||||
Timeout: 3 * time.Second,
|
||||
Now: now.Add(time.Second),
|
||||
MaxCandidates: 1,
|
||||
})
|
||||
if len(results) != 1 || results[0].Status != "reachable" || !results[0].Promoted {
|
||||
t.Fatalf("unexpected live probe results: %+v", results)
|
||||
}
|
||||
if _, ok := registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", now.Add(time.Second)); !ok {
|
||||
t.Fatal("candidate was not promoted to active")
|
||||
}
|
||||
}
|
||||
|
||||
func testFabricRegistryGossipRecord(now time.Time, epoch int64) FabricRegistryGossipRecord {
|
||||
return FabricRegistryGossipRecord{
|
||||
SchemaVersion: FabricRegistryGossipRecordSchema,
|
||||
ClusterID: "cluster-1",
|
||||
Service: FabricRegistryServiceControlAPI,
|
||||
Scope: FabricRegistryScopeCluster,
|
||||
Epoch: epoch,
|
||||
Generation: "gen",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(10 * time.Minute),
|
||||
IssuerNodeID: "authority-1",
|
||||
IssuerRole: FabricRegistryAuthorityControl,
|
||||
Endpoints: []FabricRegistryEndpoint{
|
||||
{
|
||||
EndpointID: "control-a",
|
||||
Address: "quic://192.0.2.10:19443",
|
||||
Transport: "direct_quic",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -20,7 +20,6 @@ import (
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
// ProductionEnvelopeObserver is a callback that receives a context and a
// ProductionEnvelopeObservation and may report an error.
type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error
|
||||
@@ -55,6 +54,22 @@ type RemoteWorkspaceFrameSinkSessionMailboxConsumerResume interface {
|
||||
// RemoteWorkspaceFrameSinkSessionMailboxPreflight is implemented by values
// that can produce a mailbox preflight snapshot for an adapter session.
// NOTE(review): presumably an optional capability of RemoteWorkspaceFrameSink
// discovered via type assertion, like the sibling mailbox interfaces — confirm.
type RemoteWorkspaceFrameSinkSessionMailboxPreflight interface {
	// PreflightAdapterSessionMailboxConsumerResume returns a
	// RemoteWorkspaceAdapterMailboxPreflightSnapshot for the given adapter
	// session and consumer, or an error.
	PreflightAdapterSessionMailboxConsumerResume(adapterSessionID string, consumerID string, resumeFrom string, limit int, now time.Time) (RemoteWorkspaceAdapterMailboxPreflightSnapshot, error)
}
|
||||
// FabricSessionEventLogEntry is one structured log record about a fabric
// session. Event and ObservedAt are always present in the JSON encoding; every
// other field is tagged omitempty.
type FabricSessionEventLogEntry struct {
	Event        string                       `json:"event"`
	ClusterID    string                       `json:"cluster_id,omitempty"`
	NodeID       string                       `json:"node_id,omitempty"`
	PeerID       string                       `json:"peer_id,omitempty"`
	AcceptedBy   string                       `json:"accepted_by,omitempty"`
	SessionID    string                       `json:"session_id,omitempty"`
	SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
	StreamID     uint64                       `json:"stream_id,omitempty"`
	Sequence     uint64                       `json:"sequence,omitempty"`
	TrafficClass fabricproto.TrafficClass     `json:"traffic_class,omitempty"`
	RemoteAddr   string                       `json:"remote_addr,omitempty"`
	Reason       string                       `json:"reason,omitempty"`
	ObservedAt   time.Time                    `json:"observed_at"`
}
|
||||
|
||||
type VPNPacketIngress interface {
|
||||
SendClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, packets [][]byte) error
|
||||
ReceiveClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration) ([][]byte, error)
|
||||
@@ -69,24 +84,21 @@ type VPNPacketIngressRoutePreference interface {
|
||||
}
|
||||
|
||||
type Server struct {
|
||||
Local PeerIdentity
|
||||
SyntheticRuntime *SyntheticRuntime
|
||||
ProductionForwardingEnabled bool
|
||||
ProductionEnvelopeObserver ProductionEnvelopeObserver
|
||||
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
|
||||
ProductionForwardTransport ProductionForwardTransport
|
||||
ProductionForwardLogger ProductionForwardLogger
|
||||
DisableHTTPDataPlane bool
|
||||
FabricServiceChannelLogger FabricServiceChannelAccessLogger
|
||||
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
|
||||
ProductionRoutes []SyntheticRoute
|
||||
VPNPacketIngress VPNPacketIngress
|
||||
BackendProxyBaseURL string
|
||||
ClusterAuthorityPublicKey string
|
||||
ServiceChannelIntrospection bool
|
||||
FabricSessionEnabled bool
|
||||
FabricSessionWebSocketEnabled bool
|
||||
FabricSessionLogger FabricSessionEventLogger
|
||||
Local PeerIdentity
|
||||
SyntheticRuntime *SyntheticRuntime
|
||||
ProductionForwardingEnabled bool
|
||||
ProductionEnvelopeObserver ProductionEnvelopeObserver
|
||||
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
|
||||
ProductionForwardTransport ProductionForwardTransport
|
||||
ProductionForwardLogger ProductionForwardLogger
|
||||
DisableHTTPDataPlane bool
|
||||
FabricServiceChannelLogger FabricServiceChannelAccessLogger
|
||||
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
|
||||
ProductionRoutes []SyntheticRoute
|
||||
VPNPacketIngress VPNPacketIngress
|
||||
BackendProxyBaseURL string
|
||||
ClusterAuthorityPublicKey string
|
||||
ServiceChannelIntrospection bool
|
||||
}
|
||||
|
||||
func (s Server) Handler() http.Handler {
|
||||
@@ -94,9 +106,6 @@ func (s Server) Handler() http.Handler {
|
||||
mux.HandleFunc("/mesh/v1/health", s.handleHealth)
|
||||
mux.HandleFunc("/mesh/v1/forward", s.handleForward)
|
||||
mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe)
|
||||
if s.FabricSessionEnabled && s.FabricSessionWebSocketEnabled {
|
||||
mux.HandleFunc("/mesh/v1/fabric/session/ws", s.handleFabricSessionWebSocket)
|
||||
}
|
||||
if s.RemoteWorkspaceFrameSink != nil {
|
||||
mux.HandleFunc("/mesh/v1/remote-workspace/adapter-sessions/", s.handleRemoteWorkspaceAdapterSessionControl)
|
||||
}
|
||||
@@ -196,185 +205,6 @@ func (s Server) handleRemoteWorkspaceAdapterSessionSnapshot(w http.ResponseWrite
|
||||
_ = json.NewEncoder(w).Encode(snapshotter.SnapshotAdapterSessions(includeTerminal, limit, time.Now().UTC()))
|
||||
}
|
||||
|
||||
// FabricSessionEventLogEntry is one structured log record about a fabric
// session, handed to Server.FabricSessionLogger by logFabricSession. Event and
// ObservedAt are always present in the JSON encoding; every other field is
// tagged omitempty.
type FabricSessionEventLogEntry struct {
	Event        string                       `json:"event"`
	ClusterID    string                       `json:"cluster_id,omitempty"`
	NodeID       string                       `json:"node_id,omitempty"`
	PeerID       string                       `json:"peer_id,omitempty"`
	AcceptedBy   string                       `json:"accepted_by,omitempty"`
	SessionID    string                       `json:"session_id,omitempty"`
	SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
	StreamID     uint64                       `json:"stream_id,omitempty"`
	Sequence     uint64                       `json:"sequence,omitempty"`
	TrafficClass fabricproto.TrafficClass     `json:"traffic_class,omitempty"`
	RemoteAddr   string                       `json:"remote_addr,omitempty"`
	Reason       string                       `json:"reason,omitempty"`
	ObservedAt   time.Time                    `json:"observed_at"`
}
|
||||
|
||||
// fabricSessionAuthorityPayload is the JSON document carried in the
// X-RAP-Fabric-Session-Authority-Payload request header. It is decoded by
// verifyFabricSessionAuthority after the accompanying signature has been
// verified, and ties a fabric session token (by hash) to a cluster and session.
type fabricSessionAuthorityPayload struct {
	SchemaVersion       string    `json:"schema_version"`
	ClusterID           string    `json:"cluster_id"`
	SessionID           string    `json:"session_id"`
	SourceNodeID        string    `json:"source_node_id,omitempty"`
	SelectedEntryNodeID string    `json:"selected_entry_node_id,omitempty"`
	TokenHash           string    `json:"token_hash"`
	IssuedAt            time.Time `json:"issued_at"`
	ExpiresAt           time.Time `json:"expires_at"`
}
|
||||
|
||||
// fabricSessionAuthDecision is the outcome of validateFabricSessionRequest.
// AcceptedBy is "signed" when a verified authority payload was presented and
// "legacy_unsigned" otherwise; SessionID is only filled in the signed case.
type fabricSessionAuthDecision struct {
	AcceptedBy string
	SessionID  string
}
|
||||
|
||||
func (s Server) handleFabricSessionWebSocket(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
decision, ok := s.validateFabricSessionRequest(w, r)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
upgrader := websocket.Upgrader{
|
||||
CheckOrigin: func(_ *http.Request) bool { return true },
|
||||
}
|
||||
conn, err := upgrader.Upgrade(w, r, nil)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
s.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_websocket_opened",
|
||||
ClusterID: s.Local.ClusterID,
|
||||
NodeID: s.Local.NodeID,
|
||||
AcceptedBy: decision.AcceptedBy,
|
||||
SessionID: decision.SessionID,
|
||||
RemoteAddr: r.RemoteAddr,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
})
|
||||
loop := fabricproto.TransportLoop{
|
||||
Session: fabricproto.NewSession(fabricproto.SessionConfig{}),
|
||||
OnEvent: func(event fabricproto.SessionEvent) ([]fabricproto.Frame, error) {
|
||||
s.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_event",
|
||||
ClusterID: s.Local.ClusterID,
|
||||
NodeID: s.Local.NodeID,
|
||||
AcceptedBy: decision.AcceptedBy,
|
||||
SessionID: decision.SessionID,
|
||||
SessionEvent: event.Type,
|
||||
StreamID: event.StreamID,
|
||||
Sequence: event.Sequence,
|
||||
TrafficClass: event.TrafficClass,
|
||||
RemoteAddr: r.RemoteAddr,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
})
|
||||
return nil, nil
|
||||
},
|
||||
}
|
||||
err = loop.RunWebSocket(r.Context(), conn, fabricproto.WebSocketTransportConfig{})
|
||||
if err != nil && !errors.Is(err, context.Canceled) {
|
||||
s.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_websocket_closed",
|
||||
ClusterID: s.Local.ClusterID,
|
||||
NodeID: s.Local.NodeID,
|
||||
AcceptedBy: decision.AcceptedBy,
|
||||
SessionID: decision.SessionID,
|
||||
RemoteAddr: r.RemoteAddr,
|
||||
Reason: err.Error(),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
})
|
||||
return
|
||||
}
|
||||
s.logFabricSession(FabricSessionEventLogEntry{
|
||||
Event: "fabric_session_websocket_closed",
|
||||
ClusterID: s.Local.ClusterID,
|
||||
NodeID: s.Local.NodeID,
|
||||
AcceptedBy: decision.AcceptedBy,
|
||||
SessionID: decision.SessionID,
|
||||
RemoteAddr: r.RemoteAddr,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
})
|
||||
}
|
||||
|
||||
func (s Server) validateFabricSessionRequest(w http.ResponseWriter, r *http.Request) (fabricSessionAuthDecision, bool) {
|
||||
var decision fabricSessionAuthDecision
|
||||
token := fabricSessionBearerToken(r)
|
||||
if !strings.HasPrefix(token, "rap_fsn_") {
|
||||
http.Error(w, "fabric session token is required", http.StatusUnauthorized)
|
||||
return decision, false
|
||||
}
|
||||
payload, err := s.verifyFabricSessionAuthority(r, token)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusForbidden)
|
||||
return decision, false
|
||||
}
|
||||
decision.AcceptedBy = "legacy_unsigned"
|
||||
if payload != nil {
|
||||
decision.AcceptedBy = "signed"
|
||||
decision.SessionID = strings.TrimSpace(payload.SessionID)
|
||||
}
|
||||
return decision, true
|
||||
}
|
||||
|
||||
func (s Server) verifyFabricSessionAuthority(r *http.Request, token string) (*fabricSessionAuthorityPayload, error) {
|
||||
publicKey := strings.TrimSpace(s.ClusterAuthorityPublicKey)
|
||||
payloadHeader := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Authority-Payload"))
|
||||
signatureHeader := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Authority-Signature"))
|
||||
if payloadHeader == "" && signatureHeader == "" {
|
||||
if publicKey != "" {
|
||||
return nil, fmt.Errorf("%w: signed fabric session authority is required", ErrUnauthorizedChannel)
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
if publicKey == "" {
|
||||
return nil, ErrUnauthorizedChannel
|
||||
}
|
||||
if payloadHeader == "" || signatureHeader == "" {
|
||||
return nil, fmt.Errorf("%w: fabric session authority payload and signature are required together", ErrUnauthorizedChannel)
|
||||
}
|
||||
payloadRaw, err := decodeHeaderJSON(payloadHeader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: invalid fabric session authority payload", ErrUnauthorizedChannel)
|
||||
}
|
||||
signatureRaw, err := decodeHeaderJSON(signatureHeader)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: invalid fabric session authority signature", ErrUnauthorizedChannel)
|
||||
}
|
||||
var signature authority.Signature
|
||||
if err := json.Unmarshal(signatureRaw, &signature); err != nil {
|
||||
return nil, fmt.Errorf("%w: invalid fabric session authority signature", ErrUnauthorizedChannel)
|
||||
}
|
||||
if err := authority.VerifyRaw(publicKey, payloadRaw, signature); err != nil {
|
||||
return nil, fmt.Errorf("%w: fabric session authority signature rejected", ErrUnauthorizedChannel)
|
||||
}
|
||||
var payload fabricSessionAuthorityPayload
|
||||
if err := json.Unmarshal(payloadRaw, &payload); err != nil {
|
||||
return nil, fmt.Errorf("%w: invalid fabric session authority payload", ErrUnauthorizedChannel)
|
||||
}
|
||||
if payload.SchemaVersion != "rap.fabric_session_authority.v1" ||
|
||||
payload.ClusterID != s.Local.ClusterID ||
|
||||
payload.TokenHash != fabricSessionTokenHash(token) ||
|
||||
strings.TrimSpace(payload.SessionID) == "" {
|
||||
return nil, fmt.Errorf("%w: fabric session authority payload mismatch", ErrUnauthorizedChannel)
|
||||
}
|
||||
if payload.SelectedEntryNodeID != "" && s.Local.NodeID != "" && payload.SelectedEntryNodeID != s.Local.NodeID {
|
||||
return nil, fmt.Errorf("%w: fabric session entry node mismatch", ErrUnauthorizedChannel)
|
||||
}
|
||||
if !payload.ExpiresAt.IsZero() && !payload.ExpiresAt.After(time.Now().UTC()) {
|
||||
return nil, fmt.Errorf("%w: fabric session lease expired", ErrUnauthorizedChannel)
|
||||
}
|
||||
return &payload, nil
|
||||
}
|
||||
|
||||
func (s Server) logFabricSession(entry FabricSessionEventLogEntry) {
|
||||
if s.FabricSessionLogger != nil {
|
||||
s.FabricSessionLogger(entry)
|
||||
}
|
||||
}
|
||||
|
||||
func (s Server) handleRemoteWorkspaceAdapterSessionMailbox(w http.ResponseWriter, r *http.Request) {
|
||||
reader, ok := s.RemoteWorkspaceFrameSink.(RemoteWorkspaceFrameSinkSessionMailbox)
|
||||
if !ok {
|
||||
@@ -711,15 +541,15 @@ func parseRemoteWorkspaceAdapterSessionControlPath(path string) (string, bool) {
|
||||
}
|
||||
|
||||
func (s Server) handleVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
|
||||
if clusterID, vpnConnectionID, ok := parseVPNClientPacketWebSocketPath(r.URL.Path); ok {
|
||||
s.handleVPNPacketWebSocket(w, r, clusterID, "", vpnConnectionID, false, true, "")
|
||||
if isVPNClientPacketWebSocketPath(r.URL.Path) {
|
||||
http.Error(w, "legacy VPN WebSocket dataplane is removed; use QUIC fabric route", http.StatusGone)
|
||||
return true
|
||||
}
|
||||
clusterID, vpnConnectionID, ok := parseVPNClientPacketPath(r.URL.Path)
|
||||
if !ok {
|
||||
if _, _, ok := parseVPNClientPacketPath(r.URL.Path); !ok {
|
||||
return false
|
||||
}
|
||||
return s.handleVPNPacketHTTP(w, r, clusterID, "", vpnConnectionID, "", false, true, "")
|
||||
http.Error(w, "legacy VPN HTTP dataplane is removed; use QUIC fabric route", http.StatusGone)
|
||||
return true
|
||||
}
|
||||
|
||||
func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.ResponseWriter, r *http.Request) bool {
|
||||
@@ -728,7 +558,7 @@ func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.Response
|
||||
return false
|
||||
}
|
||||
if webSocket {
|
||||
http.Error(w, "remote workspace service-channel websocket forwarding is not implemented", http.StatusNotImplemented)
|
||||
http.Error(w, "remote workspace service-channel websocket ingress is removed; use QUIC fabric route", http.StatusGone)
|
||||
return true
|
||||
}
|
||||
decision, valid := s.validateFabricServiceChannelRequest(w, r, clusterID, channelID, resourceID, FabricServiceClassRemoteWorkspace, channelClass)
|
||||
@@ -809,7 +639,7 @@ func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.Response
|
||||
"channel_id": channelID,
|
||||
"resource_id": resourceID,
|
||||
"data_plane": "validated",
|
||||
"payload_flow": "not_implemented",
|
||||
"payload_flow": "validated_only",
|
||||
})
|
||||
return true
|
||||
}
|
||||
@@ -898,7 +728,7 @@ func validateRemoteWorkspaceFrameBatchProbe(payload []byte, requiredChannelClass
|
||||
return decoded, fmt.Errorf("unsupported remote workspace frame batch schema")
|
||||
}
|
||||
if !decoded.ProbeOnly {
|
||||
return decoded, fmt.Errorf("remote workspace payload forwarding is not implemented")
|
||||
return decoded, fmt.Errorf("remote workspace production payload forwarding is disabled; probe_only required")
|
||||
}
|
||||
if strings.TrimSpace(strings.ToLower(decoded.ServiceClass)) != FabricServiceClassRemoteWorkspace {
|
||||
return decoded, fmt.Errorf("remote workspace frame batch service class mismatch")
|
||||
@@ -952,438 +782,6 @@ func isAllowedRemoteWorkspaceAdapterFrameDirection(channel string, direction str
|
||||
}
|
||||
}
|
||||
|
||||
func (s Server) handleFabricServiceChannelVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
|
||||
if clusterID, channelID, vpnConnectionID, ok := parseFabricServiceChannelVPNPacketWebSocketPath(r.URL.Path); ok {
|
||||
decision, valid := s.validateFabricServiceChannelVPNRequest(w, r, clusterID, channelID, vpnConnectionID)
|
||||
if !valid {
|
||||
return true
|
||||
}
|
||||
s.logFabricServiceChannelAccess(r, clusterID, channelID, vpnConnectionID, decision)
|
||||
s.preferVPNPacketIngressRoute(decision.PreferredRouteID)
|
||||
s.handleVPNPacketWebSocket(w, r, clusterID, channelID, vpnConnectionID, decision.ForceBackendFallback, decision.BackendFallbackAllowed(), decision.BackendRelayPolicy)
|
||||
return true
|
||||
}
|
||||
clusterID, channelID, vpnConnectionID, ok := parseFabricServiceChannelVPNPacketPath(r.URL.Path)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
decision, valid := s.validateFabricServiceChannelVPNRequest(w, r, clusterID, channelID, vpnConnectionID)
|
||||
if !valid {
|
||||
return true
|
||||
}
|
||||
w.Header().Set("X-RAP-Service-Channel-Accepted-By", decision.AcceptedBy)
|
||||
s.logFabricServiceChannelAccess(r, clusterID, channelID, vpnConnectionID, decision)
|
||||
s.preferVPNPacketIngressRoute(decision.PreferredRouteID)
|
||||
backendPath := "/api/v1/clusters/" + clusterID + "/vpn-connections/" + vpnConnectionID + "/tunnel/client/packets"
|
||||
return s.handleVPNPacketHTTP(w, r, clusterID, channelID, vpnConnectionID, backendPath, decision.ForceBackendFallback, decision.BackendFallbackAllowed(), decision.BackendRelayPolicy)
|
||||
}
|
||||
|
||||
func (s Server) preferVPNPacketIngressRoute(routeID string) {
|
||||
routeID = strings.TrimSpace(routeID)
|
||||
if routeID == "" || s.VPNPacketIngress == nil {
|
||||
return
|
||||
}
|
||||
if preferred, ok := s.VPNPacketIngress.(VPNPacketIngressRoutePreference); ok {
|
||||
preferred.PreferClientRoute(routeID)
|
||||
}
|
||||
}
|
||||
|
||||
func (s Server) handleVPNPacketHTTP(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string, backendFallbackPath string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) bool {
|
||||
switch r.Method {
|
||||
case http.MethodPost:
|
||||
body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, MaxProductionVPNPacketPayloadBytes))
|
||||
if err != nil {
|
||||
http.Error(w, "invalid vpn packet payload", http.StatusBadRequest)
|
||||
return true
|
||||
}
|
||||
if r.URL.Query().Get("batch") != "true" && len(body) == 0 {
|
||||
http.Error(w, "empty vpn packet payload", http.StatusBadRequest)
|
||||
return true
|
||||
}
|
||||
packets := [][]byte{body}
|
||||
if r.URL.Query().Get("batch") == "true" {
|
||||
packets, err = decodeVPNIngressPacketBatch(body)
|
||||
if err != nil {
|
||||
http.Error(w, "invalid vpn packet batch", http.StatusBadRequest)
|
||||
return true
|
||||
}
|
||||
}
|
||||
packets = cleanVPNIngressPacketBatch(packets)
|
||||
if len(packets) == 0 {
|
||||
http.Error(w, "empty vpn packet batch", http.StatusBadRequest)
|
||||
return true
|
||||
}
|
||||
if forceBackendFallback {
|
||||
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, body, backendFallbackPath) {
|
||||
return true
|
||||
}
|
||||
s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
|
||||
http.Error(w, ErrRouteNotFound.Error(), vpnIngressStatusCode(ErrRouteNotFound))
|
||||
return true
|
||||
}
|
||||
trafficClass := inferVPNPacketTrafficClass(r.Header.Get("X-RAP-Traffic-Class"), packets)
|
||||
var sendErr error
|
||||
if classIngress, ok := s.VPNPacketIngress.(VPNPacketIngressTrafficClass); ok {
|
||||
sendErr = classIngress.SendClientPacketBatchWithTrafficClass(r.Context(), clusterID, vpnConnectionID, trafficClass, packets)
|
||||
} else {
|
||||
sendErr = s.VPNPacketIngress.SendClientPacketBatch(r.Context(), clusterID, vpnConnectionID, packets)
|
||||
}
|
||||
if sendErr != nil {
|
||||
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, body, backendFallbackPath) {
|
||||
return true
|
||||
}
|
||||
s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_send_failed_backend_fallback_blocked", sendErr.Error())
|
||||
http.Error(w, sendErr.Error(), vpnIngressStatusCode(sendErr))
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusAccepted)
|
||||
return true
|
||||
case http.MethodGet:
|
||||
if forceBackendFallback {
|
||||
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, nil, backendFallbackPath) {
|
||||
return true
|
||||
}
|
||||
s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return true
|
||||
}
|
||||
timeout := vpnIngressTimeout(r)
|
||||
packets, err := s.VPNPacketIngress.ReceiveClientPacketBatch(r.Context(), clusterID, vpnConnectionID, timeout)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), vpnIngressStatusCode(err))
|
||||
return true
|
||||
}
|
||||
packets = cleanVPNIngressPacketBatch(packets)
|
||||
if len(packets) == 0 {
|
||||
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, nil, backendFallbackPath) {
|
||||
return true
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return true
|
||||
}
|
||||
if r.URL.Query().Get("batch") == "true" {
|
||||
w.Header().Set("Content-Type", "application/vnd.rap.vpn-packet-batch.v1")
|
||||
_, _ = w.Write(encodeVPNIngressPacketBatch(packets))
|
||||
return true
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/octet-stream")
|
||||
_, _ = w.Write(packets[0])
|
||||
return true
|
||||
default:
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
func (s Server) handleVPNPacketWebSocket(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) {
|
||||
if r.Method != http.MethodGet {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
if s.VPNPacketIngress == nil {
|
||||
http.Error(w, ErrForwardRuntimeUnavailable.Error(), http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
upgrader := websocket.Upgrader{
|
||||
CheckOrigin: func(_ *http.Request) bool { return true },
|
||||
}
|
||||
conn, err := upgrader.Upgrade(w, r, nil)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer conn.Close()
|
||||
conn.SetReadLimit(MaxProductionVPNPacketPayloadBytes)
|
||||
|
||||
ctx, cancel := context.WithCancel(r.Context())
|
||||
defer cancel()
|
||||
trafficClass := r.Header.Get("X-RAP-Traffic-Class")
|
||||
errCh := make(chan error, 2)
|
||||
go func() {
|
||||
errCh <- s.readVPNPacketWebSocket(ctx, conn, clusterID, channelID, vpnConnectionID, trafficClass, forceBackendFallback, backendFallbackAllowed, backendRelayPolicy)
|
||||
}()
|
||||
go func() {
|
||||
errCh <- s.writeVPNPacketWebSocket(ctx, conn, clusterID, channelID, vpnConnectionID, forceBackendFallback, backendFallbackAllowed, backendRelayPolicy)
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
case <-errCh:
|
||||
cancel()
|
||||
}
|
||||
}
|
||||
|
||||
func (s Server) readVPNPacketWebSocket(ctx context.Context, conn *websocket.Conn, clusterID string, channelID string, vpnConnectionID string, trafficClass string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) error {
|
||||
for {
|
||||
messageType, payload, err := conn.ReadMessage()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if messageType != websocket.BinaryMessage {
|
||||
continue
|
||||
}
|
||||
packets, err := decodeVPNIngressPacketBatch(payload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
packets = cleanVPNIngressPacketBatch(packets)
|
||||
if len(packets) == 0 {
|
||||
continue
|
||||
}
|
||||
if forceBackendFallback {
|
||||
if !backendFallbackAllowed {
|
||||
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
|
||||
return ErrRouteNotFound
|
||||
}
|
||||
if proxyErr := s.backendVPNPacketPost(ctx, clusterID, vpnConnectionID, payload); proxyErr != nil {
|
||||
return proxyErr
|
||||
}
|
||||
continue
|
||||
}
|
||||
sendErr := s.sendVPNPacketWebSocketBatch(ctx, clusterID, vpnConnectionID, inferVPNPacketTrafficClass(trafficClass, packets), packets, !backendFallbackAllowed)
|
||||
if sendErr != nil {
|
||||
if !backendFallbackAllowed {
|
||||
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_send_failed_backend_fallback_blocked", sendErr.Error())
|
||||
if isRetryableVPNPacketIngressError(sendErr) {
|
||||
continue
|
||||
}
|
||||
return sendErr
|
||||
}
|
||||
if proxyErr := s.backendVPNPacketPost(ctx, clusterID, vpnConnectionID, payload); proxyErr != nil {
|
||||
return sendErr
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s Server) sendVPNPacketWebSocketBatch(ctx context.Context, clusterID string, vpnConnectionID string, trafficClass string, packets [][]byte, retryRouteErrors bool) error {
|
||||
const maxAttempts = 6
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < maxAttempts; attempt++ {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
var sendErr error
|
||||
if classIngress, ok := s.VPNPacketIngress.(VPNPacketIngressTrafficClass); ok {
|
||||
sendErr = classIngress.SendClientPacketBatchWithTrafficClass(ctx, clusterID, vpnConnectionID, trafficClass, packets)
|
||||
} else {
|
||||
sendErr = s.VPNPacketIngress.SendClientPacketBatch(ctx, clusterID, vpnConnectionID, packets)
|
||||
}
|
||||
if sendErr == nil {
|
||||
return nil
|
||||
}
|
||||
lastErr = sendErr
|
||||
if !retryRouteErrors || !isRetryableVPNPacketIngressError(sendErr) {
|
||||
return sendErr
|
||||
}
|
||||
timer := time.NewTimer(time.Duration(75+attempt*50) * time.Millisecond)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
timer.Stop()
|
||||
return ctx.Err()
|
||||
case <-timer.C:
|
||||
}
|
||||
}
|
||||
return lastErr
|
||||
}
|
||||
|
||||
func isRetryableVPNPacketIngressError(err error) bool {
|
||||
return errors.Is(err, ErrRouteNotFound) ||
|
||||
errors.Is(err, ErrForwardRuntimeUnavailable) ||
|
||||
errors.Is(err, ErrForwardPeerUnavailable) ||
|
||||
errors.Is(err, ErrSyntheticPeerUnavailable)
|
||||
}
|
||||
|
||||
func (s Server) receiveVPNPacketWebSocketBatch(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration, retryRouteErrors bool) ([][]byte, error) {
|
||||
const maxAttempts = 4
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < maxAttempts; attempt++ {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
packets, err := s.VPNPacketIngress.ReceiveClientPacketBatch(ctx, clusterID, vpnConnectionID, timeout)
|
||||
if err == nil {
|
||||
return packets, nil
|
||||
}
|
||||
lastErr = err
|
||||
if !retryRouteErrors || !isRetryableVPNPacketIngressError(err) {
|
||||
return nil, err
|
||||
}
|
||||
timer := time.NewTimer(time.Duration(75+attempt*50) * time.Millisecond)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
timer.Stop()
|
||||
return nil, ctx.Err()
|
||||
case <-timer.C:
|
||||
}
|
||||
}
|
||||
if retryRouteErrors && isRetryableVPNPacketIngressError(lastErr) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
func (s Server) writeVPNPacketWebSocket(ctx context.Context, conn *websocket.Conn, clusterID string, channelID string, vpnConnectionID string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) error {
|
||||
lastPing := time.Now()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
var packets [][]byte
|
||||
var err error
|
||||
if !forceBackendFallback {
|
||||
packets, err = s.receiveVPNPacketWebSocketBatch(ctx, clusterID, vpnConnectionID, 50*time.Millisecond, !backendFallbackAllowed)
|
||||
}
|
||||
if forceBackendFallback && !backendFallbackAllowed {
|
||||
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
|
||||
return ErrRouteNotFound
|
||||
}
|
||||
if err != nil && !backendFallbackAllowed {
|
||||
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_receive_failed_backend_fallback_blocked", err.Error())
|
||||
return err
|
||||
}
|
||||
if backendFallbackAllowed && (forceBackendFallback || err != nil || len(packets) == 0) {
|
||||
backendPackets, proxyErr := s.backendVPNPacketGet(ctx, clusterID, vpnConnectionID, 50*time.Millisecond)
|
||||
if proxyErr != nil && err != nil {
|
||||
return err
|
||||
}
|
||||
if len(backendPackets) > 0 {
|
||||
packets = backendPackets
|
||||
}
|
||||
}
|
||||
if len(packets) > 0 {
|
||||
if err := conn.SetWriteDeadline(time.Now().Add(5 * time.Second)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := conn.WriteMessage(websocket.BinaryMessage, encodeVPNIngressPacketBatch(packets)); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
if time.Since(lastPing) >= 15*time.Second {
|
||||
if err := conn.SetWriteDeadline(time.Now().Add(5 * time.Second)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := conn.WriteMessage(websocket.PingMessage, []byte("rap-vpn")); err != nil {
|
||||
return err
|
||||
}
|
||||
lastPing = time.Now()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s Server) backendVPNPacketPost(ctx context.Context, clusterID string, vpnConnectionID string, batchPayload []byte) error {
|
||||
target := strings.TrimRight(strings.TrimSpace(s.BackendProxyBaseURL), "/")
|
||||
if target == "" {
|
||||
return ErrRouteNotFound
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, target+"/clusters/"+clusterID+"/vpn-connections/"+vpnConnectionID+"/tunnel/client/packets?batch=true", bytes.NewReader(batchPayload))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/octet-stream")
|
||||
req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
|
||||
req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return fmt.Errorf("backend vpn packet post failed: status=%d", resp.StatusCode)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s Server) backendVPNPacketGet(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration) ([][]byte, error) {
|
||||
target := strings.TrimRight(strings.TrimSpace(s.BackendProxyBaseURL), "/")
|
||||
if target == "" {
|
||||
return nil, ErrRouteNotFound
|
||||
}
|
||||
if timeout <= 0 {
|
||||
timeout = 50 * time.Millisecond
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, target+"/clusters/"+clusterID+"/vpn-connections/"+vpnConnectionID+"/tunnel/client/packets?batch=true&timeout_ms="+strconv.FormatInt(timeout.Milliseconds(), 10), nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Set("Accept", "application/vnd.rap.vpn-packet-batch.v1")
|
||||
req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
|
||||
req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode == http.StatusNoContent {
|
||||
return nil, nil
|
||||
}
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return nil, fmt.Errorf("backend vpn packet get failed: status=%d", resp.StatusCode)
|
||||
}
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, MaxProductionVPNPacketPayloadBytes))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(body) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
return decodeVPNIngressPacketBatch(body)
|
||||
}
|
||||
|
||||
func (s Server) proxyVPNPacketIngressToBackend(w http.ResponseWriter, r *http.Request, body []byte) bool {
|
||||
return s.proxyVPNPacketIngressToBackendPath(w, r, body, "")
|
||||
}
|
||||
|
||||
func (s Server) proxyVPNPacketIngressToBackendPath(w http.ResponseWriter, r *http.Request, body []byte, backendPath string) bool {
|
||||
if strings.TrimSpace(s.BackendProxyBaseURL) == "" {
|
||||
return false
|
||||
}
|
||||
target, err := url.Parse(s.BackendProxyBaseURL)
|
||||
if err != nil || target.Scheme == "" || target.Host == "" {
|
||||
return false
|
||||
}
|
||||
if strings.EqualFold(target.Host, r.Host) {
|
||||
return false
|
||||
}
|
||||
var reader io.Reader
|
||||
if body != nil {
|
||||
reader = bytes.NewReader(body)
|
||||
}
|
||||
requestURI := r.URL.RequestURI()
|
||||
if backendPath != "" {
|
||||
requestURI = backendPath
|
||||
if r.URL.RawQuery != "" {
|
||||
requestURI += "?" + r.URL.RawQuery
|
||||
}
|
||||
}
|
||||
req, err := http.NewRequestWithContext(r.Context(), r.Method, target.Scheme+"://"+target.Host+requestURI, reader)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, key := range []string{"Accept", "Content-Type"} {
|
||||
if value := r.Header.Get(key); value != "" {
|
||||
req.Header.Set(key, value)
|
||||
}
|
||||
}
|
||||
req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
|
||||
req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
for _, key := range []string{"Content-Type"} {
|
||||
if value := resp.Header.Get(key); value != "" {
|
||||
w.Header().Set(key, value)
|
||||
}
|
||||
}
|
||||
w.WriteHeader(resp.StatusCode)
|
||||
_, _ = io.Copy(w, resp.Body)
|
||||
return true
|
||||
}
|
||||
|
||||
type fabricServiceChannelLeaseAuthorityPayload struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ChannelID string `json:"channel_id"`
|
||||
@@ -1443,10 +841,6 @@ func (d fabricServiceChannelRequestDecision) BackendFallbackAllowed() bool {
|
||||
return strings.TrimSpace(d.BackendRelayPolicy) != "disabled"
|
||||
}
|
||||
|
||||
func (s Server) validateFabricServiceChannelVPNRequest(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string) (fabricServiceChannelRequestDecision, bool) {
|
||||
return s.validateFabricServiceChannelRequest(w, r, clusterID, channelID, vpnConnectionID, FabricServiceClassVPNPackets, ProductionChannelVPNPacket)
|
||||
}
|
||||
|
||||
func (s Server) validateFabricServiceChannelRequest(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, resourceID string, expectedServiceClass string, defaultChannelClass string) (fabricServiceChannelRequestDecision, bool) {
|
||||
var decision fabricServiceChannelRequestDecision
|
||||
expectedServiceClass = strings.TrimSpace(strings.ToLower(expectedServiceClass))
|
||||
@@ -1485,7 +879,7 @@ func (s Server) validateFabricServiceChannelRequest(w http.ResponseWriter, r *ht
|
||||
http.Error(w, err.Error(), http.StatusForbidden)
|
||||
return decision, false
|
||||
}
|
||||
decision.AcceptedBy = "legacy_unsigned"
|
||||
decision.AcceptedBy = "token_authorized"
|
||||
decision.ServiceClass = serviceClass
|
||||
decision.ChannelClass = channelClass
|
||||
if payload != nil && (payload.Status == "degraded_fallback" || payload.PrimaryRoute.Status == "missing_route_intent") {
|
||||
@@ -1571,30 +965,6 @@ func (s Server) logFabricServiceChannelAccess(r *http.Request, clusterID string,
|
||||
s.FabricServiceChannelLogger(entry)
|
||||
}
|
||||
|
||||
func (s Server) logFabricServiceChannelViolation(r *http.Request, clusterID string, channelID string, resourceID string, backendRelayPolicy string, status string, reason string) {
|
||||
if s.FabricServiceChannelLogger == nil || strings.TrimSpace(channelID) == "" {
|
||||
return
|
||||
}
|
||||
entry := FabricServiceChannelAccessLogEntry{
|
||||
Event: "fabric_service_channel_data_plane_violation",
|
||||
ClusterID: clusterID,
|
||||
ChannelID: channelID,
|
||||
ResourceID: resourceID,
|
||||
LocalNodeID: s.Local.NodeID,
|
||||
BackendRelayPolicy: strings.TrimSpace(backendRelayPolicy),
|
||||
ViolationStatus: strings.TrimSpace(status),
|
||||
ViolationReason: strings.TrimSpace(reason),
|
||||
OccurredAt: time.Now().UTC(),
|
||||
}
|
||||
if r != nil {
|
||||
entry.Method = r.Method
|
||||
if r.URL != nil {
|
||||
entry.Path = r.URL.Path
|
||||
}
|
||||
}
|
||||
s.FabricServiceChannelLogger(entry)
|
||||
}
|
||||
|
||||
func (s Server) verifyFabricServiceChannelLeaseAuthority(r *http.Request, clusterID string, channelID string, resourceID string, serviceClass string, channelClass string, token string) (*fabricServiceChannelLeaseAuthorityPayload, error) {
|
||||
publicKey := strings.TrimSpace(s.ClusterAuthorityPublicKey)
|
||||
payloadHeader := strings.TrimSpace(r.Header.Get("X-RAP-Service-Channel-Authority-Payload"))
|
||||
@@ -1657,15 +1027,15 @@ func validateFabricServiceChannelDataPlaneContract(contract fabricServiceChannel
|
||||
}
|
||||
requiredFlowClass = strings.TrimSpace(strings.ToLower(requiredFlowClass))
|
||||
if contract.SchemaVersion != "rap.fabric_service_channel_data_plane.v1" ||
|
||||
contract.WorkingDataTransport != "fabric_service_channel" ||
|
||||
contract.WorkingDataTransport != "fabric_quic_route" ||
|
||||
contract.SteadyStateTransport != "fabric_route" ||
|
||||
(contract.BackendRelayPolicy != "degraded_fallback_only" && contract.BackendRelayPolicy != "disabled") ||
|
||||
contract.BackendRelayPolicy != "disabled" ||
|
||||
!contract.ServiceNeutral ||
|
||||
!contract.ProtocolAgnostic ||
|
||||
contract.LogicalFlowMode != "multi_flow_isolated" {
|
||||
return fmt.Errorf("%w: unsupported service channel data-plane contract", ErrUnauthorizedChannel)
|
||||
}
|
||||
if contract.Mode != "" && contract.Mode != "fabric_primary" && contract.Mode != "degraded_backend_fallback" {
|
||||
if contract.Mode != "" && contract.Mode != "fabric_primary" && contract.Mode != "fabric_quic_only" {
|
||||
return fmt.Errorf("%w: unsupported service channel data-plane mode", ErrUnauthorizedChannel)
|
||||
}
|
||||
if requiredFlowClass != "" && len(contract.RequiredFlowIsolationClasses) > 0 && !containsString(contract.RequiredFlowIsolationClasses, requiredFlowClass) {
|
||||
@@ -1796,29 +1166,6 @@ func fabricServiceChannelBearerToken(r *http.Request) string {
|
||||
return strings.TrimSpace(r.URL.Query().Get("service_channel_token"))
|
||||
}
|
||||
|
||||
// fabricSessionTokenHash returns the lowercase hex SHA-256 digest of token
// after surrounding whitespace has been trimmed.
func fabricSessionTokenHash(token string) string {
	hasher := sha256.New()
	hasher.Write([]byte(strings.TrimSpace(token)))
	return hex.EncodeToString(hasher.Sum(nil))
}
|
||||
|
||||
// fabricSessionBearerToken extracts the fabric session token from r.
//
// Sources are tried in order and the first non-blank value wins:
//  1. the X-RAP-Fabric-Session-Token header,
//  2. an "Authorization: Bearer <token>" header (scheme matched
//     case-insensitively),
//  3. the fabric_session_token query parameter.
//
// The result is whitespace-trimmed. A nil request yields "". A request whose
// URL is nil is treated as having no query parameters instead of panicking.
func fabricSessionBearerToken(r *http.Request) string {
	if r == nil {
		return ""
	}
	if token := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Token")); token != "" {
		return token
	}

	const scheme = "Bearer "
	auth := strings.TrimSpace(r.Header.Get("Authorization"))
	if len(auth) > len(scheme) && strings.EqualFold(auth[:len(scheme)], scheme) {
		return strings.TrimSpace(auth[len(scheme):])
	}

	if r.URL == nil {
		return ""
	}
	return strings.TrimSpace(r.URL.Query().Get("fabric_session_token"))
}
|
||||
|
||||
func isAllowedFabricServiceVPNChannel(channel string) bool {
|
||||
return isAllowedFabricServiceChannelForClass(FabricServiceClassVPNPackets, channel)
|
||||
}
|
||||
|
||||
func isAllowedFabricServiceChannelForClass(serviceClass string, channel string) bool {
|
||||
serviceClass = strings.TrimSpace(strings.ToLower(serviceClass))
|
||||
channel = strings.TrimSpace(strings.ToLower(channel))
|
||||
@@ -1846,25 +1193,6 @@ func containsString(values []string, target string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// parseFabricServiceChannelVPNPacketWebSocketPath matches paths of the form
//
//	/api/v1/clusters/{cluster}/fabric/service-channels/{channel}/vpn-connections/{connection}/packets/ws
//
// and returns the cluster, channel and connection segments with ok == true.
// Leading and trailing slashes are ignored. Any other shape, or an empty
// variable segment, yields three empty strings and false.
func parseFabricServiceChannelVPNPacketWebSocketPath(path string) (string, string, string, bool) {
	// An empty entry marks a variable segment that only has to be non-empty.
	layout := [...]string{"api", "v1", "clusters", "", "fabric", "service-channels", "", "vpn-connections", "", "packets", "ws"}

	segments := strings.Split(strings.Trim(path, "/"), "/")
	if len(segments) != len(layout) {
		return "", "", "", false
	}
	for i, want := range layout {
		mismatch := segments[i] != want
		if want == "" {
			mismatch = segments[i] == ""
		}
		if mismatch {
			return "", "", "", false
		}
	}
	return segments[3], segments[6], segments[8], true
}
|
||||
|
||||
func parseFabricServiceChannelRemoteWorkspacePath(path string) (string, string, string, string, bool, bool) {
|
||||
parts := strings.Split(strings.Trim(path, "/"), "/")
|
||||
if len(parts) == 11 &&
|
||||
@@ -1897,6 +1225,34 @@ func parseFabricServiceChannelRemoteWorkspacePath(path string) (string, string,
|
||||
return parts[3], parts[6], parts[8], strings.TrimSpace(strings.ToLower(parts[10])), false, true
|
||||
}
|
||||
|
||||
func (s Server) handleFabricServiceChannelVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
|
||||
if isFabricServiceChannelVPNPacketWebSocketPath(r.URL.Path) {
|
||||
http.Error(w, "fabric service-channel WebSocket dataplane is removed; use QUIC fabric route", http.StatusGone)
|
||||
return true
|
||||
}
|
||||
if _, _, _, ok := parseFabricServiceChannelVPNPacketPath(r.URL.Path); !ok {
|
||||
return false
|
||||
}
|
||||
http.Error(w, "fabric service-channel HTTP dataplane is removed; use QUIC fabric route", http.StatusGone)
|
||||
return true
|
||||
}
|
||||
|
||||
// isFabricServiceChannelVPNPacketWebSocketPath reports whether path has the
// form
//
//	/api/v1/clusters/{cluster}/fabric/service-channels/{channel}/vpn-connections/{connection}/packets/ws
//
// with non-empty cluster, channel and connection segments. Leading and
// trailing slashes are ignored.
func isFabricServiceChannelVPNPacketWebSocketPath(path string) bool {
	// An empty entry marks a variable segment that only has to be non-empty.
	layout := [...]string{"api", "v1", "clusters", "", "fabric", "service-channels", "", "vpn-connections", "", "packets", "ws"}

	segments := strings.Split(strings.Trim(path, "/"), "/")
	if len(segments) != len(layout) {
		return false
	}
	for i, want := range layout {
		switch {
		case want == "" && segments[i] == "":
			return false
		case want != "" && segments[i] != want:
			return false
		}
	}
	return true
}
|
||||
|
||||
func parseFabricServiceChannelVPNPacketPath(path string) (string, string, string, bool) {
|
||||
parts := strings.Split(strings.Trim(path, "/"), "/")
|
||||
if len(parts) != 10 ||
|
||||
@@ -1915,7 +1271,7 @@ func parseFabricServiceChannelVPNPacketPath(path string) (string, string, string
|
||||
return parts[3], parts[6], parts[8], true
|
||||
}
|
||||
|
||||
func parseVPNClientPacketWebSocketPath(path string) (string, string, bool) {
|
||||
func isVPNClientPacketWebSocketPath(path string) bool {
|
||||
parts := strings.Split(strings.Trim(path, "/"), "/")
|
||||
if len(parts) != 10 ||
|
||||
parts[0] != "api" ||
|
||||
@@ -1926,12 +1282,9 @@ func parseVPNClientPacketWebSocketPath(path string) (string, string, bool) {
|
||||
parts[7] != "client" ||
|
||||
parts[8] != "packets" ||
|
||||
parts[9] != "ws" {
|
||||
return "", "", false
|
||||
return false
|
||||
}
|
||||
if parts[3] == "" || parts[5] == "" {
|
||||
return "", "", false
|
||||
}
|
||||
return parts[3], parts[5], true
|
||||
return parts[3] != "" && parts[5] != ""
|
||||
}
|
||||
|
||||
func parseVPNClientPacketPath(path string) (string, string, bool) {
|
||||
@@ -1952,28 +1305,6 @@ func parseVPNClientPacketPath(path string) (string, string, bool) {
|
||||
return parts[3], parts[5], true
|
||||
}
|
||||
|
||||
// vpnIngressTimeout derives the receive timeout from the request's
// timeout_ms query parameter. Missing, unparsable or non-positive values
// select the 25s default; values above 30s are capped at 30s.
func vpnIngressTimeout(r *http.Request) time.Duration {
	const (
		defaultMillis = 25000
		maxMillis     = 30000
	)

	// The parse error is deliberately ignored: the returned value is used as
	// is and anything non-positive selects the default.
	requested, _ := strconv.Atoi(r.URL.Query().Get("timeout_ms"))
	switch {
	case requested <= 0:
		requested = defaultMillis
	case requested > maxMillis:
		requested = maxMillis
	}
	return time.Duration(requested) * time.Millisecond
}
|
||||
|
||||
func vpnIngressStatusCode(err error) int {
|
||||
switch err {
|
||||
case ErrForwardRuntimeUnavailable, ErrRouteNotFound, ErrForwardPeerUnavailable:
|
||||
return http.StatusServiceUnavailable
|
||||
case ErrUnauthorizedChannel, ErrClusterMismatch, ErrNodeMismatch:
|
||||
return http.StatusForbidden
|
||||
default:
|
||||
return http.StatusBadGateway
|
||||
}
|
||||
}
|
||||
|
||||
func encodeVPNIngressPacketBatch(packets [][]byte) []byte {
|
||||
packets = cleanVPNIngressPacketBatch(packets)
|
||||
total := 0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,49 @@
|
||||
package fabricvpn
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLiveFabricControlRequest(t *testing.T) {
|
||||
cfg := strings.TrimSpace(os.Getenv("RAP_LIVE_FABRIC_CONTROL_CONFIG"))
|
||||
if cfg == "" {
|
||||
t.Skip("set RAP_LIVE_FABRIC_CONTROL_CONFIG to run live fabric control test")
|
||||
}
|
||||
path := strings.TrimSpace(os.Getenv("RAP_LIVE_FABRIC_CONTROL_PATH"))
|
||||
if path == "" {
|
||||
path = "/organizations/?user_id=3fded8a8-f19b-4974-919f-44d34ac5f63d"
|
||||
}
|
||||
method := strings.TrimSpace(os.Getenv("RAP_LIVE_FABRIC_CONTROL_METHOD"))
|
||||
if method == "" {
|
||||
method = "GET"
|
||||
}
|
||||
body := strings.TrimSpace(os.Getenv("RAP_LIVE_FABRIC_CONTROL_BODY"))
|
||||
manager := NewManager()
|
||||
if err := manager.Start(cfg); err != nil {
|
||||
t.Fatalf("start manager: %v", err)
|
||||
}
|
||||
defer manager.Stop()
|
||||
request := map[string]any{"method": method, "path": path}
|
||||
if body != "" {
|
||||
var raw json.RawMessage
|
||||
if err := json.Unmarshal([]byte(body), &raw); err != nil {
|
||||
t.Fatalf("invalid request body: %v", err)
|
||||
}
|
||||
request["body"] = raw
|
||||
}
|
||||
payload, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
response, err := manager.ControlRequest(string(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("control request failed: %v", err)
|
||||
}
|
||||
if !strings.Contains(response, "status_code") {
|
||||
t.Fatalf("unexpected control response: %s", response)
|
||||
}
|
||||
t.Log(response)
|
||||
}
|
||||
@@ -243,7 +243,7 @@ func (m *Manager) connect(ctx context.Context, cfg runtimeConfig, cancel context
|
||||
if lastErr == nil {
|
||||
lastErr = fmt.Errorf("no QUIC exit endpoints available")
|
||||
}
|
||||
return lastErr
|
||||
return fmt.Errorf("fabric bootstrap failed after %d endpoint candidates: %w", len(cfg.Endpoints), lastErr)
|
||||
}
|
||||
|
||||
func (m *Manager) protectedQUICDialer() func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error) {
|
||||
@@ -447,11 +447,17 @@ func (m *Manager) ControlRequest(payloadJSON string) (string, error) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return "", ctx.Err()
|
||||
case err := <-session.Errors():
|
||||
case err, ok := <-session.Errors():
|
||||
if !ok {
|
||||
return "", fmt.Errorf("fabric control error stream closed")
|
||||
}
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
case frame := <-session.Frames():
|
||||
case frame, ok := <-session.Frames():
|
||||
if !ok {
|
||||
return "", fmt.Errorf("fabric control stream closed")
|
||||
}
|
||||
if frame.Type != fabricproto.FrameData || frame.StreamID != mesh.FabricControlForwardQUICStreamID {
|
||||
continue
|
||||
}
|
||||
@@ -460,7 +466,7 @@ func (m *Manager) ControlRequest(payloadJSON string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
if response.Error != "" {
|
||||
return "", fmt.Errorf(response.Error)
|
||||
return "", fmt.Errorf("%s", response.Error)
|
||||
}
|
||||
return string(response.Payload), nil
|
||||
}
|
||||
|
||||
@@ -166,6 +166,7 @@ type DockerInstallProfile struct {
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"`
|
||||
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records,omitempty"`
|
||||
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact,omitempty"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
@@ -203,6 +204,7 @@ type WindowsInstallProfile struct {
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"`
|
||||
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records,omitempty"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact,omitempty"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
@@ -235,6 +237,7 @@ type LinuxInstallProfile struct {
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints,omitempty"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints,omitempty"`
|
||||
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records,omitempty"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact,omitempty"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
@@ -372,6 +375,28 @@ type NodeUpdatePlan struct {
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
}
|
||||
|
||||
type NodeBridgeReplayProductPlan struct {
|
||||
Product string `json:"product"`
|
||||
RecoveryBridgeMode string `json:"recovery_bridge_mode,omitempty"`
|
||||
RecoveryBridgeReplayReady bool `json:"recovery_bridge_replay_ready"`
|
||||
LastStatusReason string `json:"last_status_reason,omitempty"`
|
||||
UpdatePlan NodeUpdatePlan `json:"update_plan"`
|
||||
}
|
||||
|
||||
type NodeBridgeReplayPlan struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
NodeID string `json:"node_id"`
|
||||
NodeName string `json:"node_name,omitempty"`
|
||||
HealthStatus string `json:"health_status,omitempty"`
|
||||
HeartbeatStale bool `json:"heartbeat_stale"`
|
||||
BridgeHoldRequired bool `json:"bridge_hold_required"`
|
||||
RecoveryBridgeReplayReady bool `json:"recovery_bridge_replay_ready"`
|
||||
BridgeHoldReasons []string `json:"bridge_hold_reasons,omitempty"`
|
||||
BridgeActions []string `json:"bridge_actions,omitempty"`
|
||||
Products []NodeBridgeReplayProductPlan `json:"products,omitempty"`
|
||||
}
|
||||
|
||||
type NodeUpdateStatus struct {
|
||||
ID string `json:"id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
@@ -388,6 +413,77 @@ type NodeUpdateStatus struct {
|
||||
ObservedAt time.Time `json:"observed_at"`
|
||||
}
|
||||
|
||||
type StaleNodeRiskReport struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
HeartbeatStaleAfterSeconds int `json:"heartbeat_stale_after_seconds"`
|
||||
LegacyRemovalAllowed bool `json:"legacy_removal_allowed"`
|
||||
BridgeHoldRequired bool `json:"bridge_hold_required"`
|
||||
BridgeHoldNodeIDs []string `json:"bridge_hold_node_ids,omitempty"`
|
||||
BridgeHoldReasons []string `json:"bridge_hold_reasons,omitempty"`
|
||||
BlockedOperations []string `json:"blocked_operations,omitempty"`
|
||||
Nodes []StaleNodeRiskNode `json:"nodes"`
|
||||
Summary StaleNodeRiskSummary `json:"summary"`
|
||||
}
|
||||
|
||||
// StaleNodeRiskSummary holds the node counters of a StaleNodeRiskReport.
// Every counter is always present in the JSON output, including zeros.
type StaleNodeRiskSummary struct {
	// Overall node counts.
	TotalNodes   int `json:"total_nodes"`
	StaleNodes   int `json:"stale_nodes"`
	BlockedNodes int `json:"blocked_nodes"`

	// Counts per risk category.
	DirectPeerAlertNodes     int `json:"direct_peer_alert_nodes"`
	ArtifactGapNodes         int `json:"artifact_gap_nodes"`
	UnknownProfileNodes      int `json:"unknown_profile_nodes"`
	WaitingUpdateStatusNodes int `json:"waiting_update_status_nodes"`
	UnknownVersionNodes      int `json:"unknown_version_nodes"`

	// Counts related to the recovery bridge.
	LegacyRecoveryContractNodes    int `json:"legacy_recovery_contract_nodes"`
	RecoveryBridgeRequiredNodes    int `json:"recovery_bridge_required_nodes"`
	RecoveryBridgeReplayReadyNodes int `json:"recovery_bridge_replay_ready_nodes"`
	WaitingRecoveryHeartbeatNodes  int `json:"waiting_recovery_heartbeat_nodes"`
}
|
||||
|
||||
type StaleNodeRiskNode struct {
|
||||
NodeID string `json:"node_id"`
|
||||
Name string `json:"name"`
|
||||
RegistrationStatus string `json:"registration_status"`
|
||||
HealthStatus string `json:"health_status"`
|
||||
ReportedVersion *string `json:"reported_version,omitempty"`
|
||||
LastSeenAt *time.Time `json:"last_seen_at,omitempty"`
|
||||
HeartbeatStale bool `json:"heartbeat_stale"`
|
||||
Blocked bool `json:"blocked"`
|
||||
DirectPeerAlert bool `json:"direct_peer_alert"`
|
||||
DirectPeerReadyCount int `json:"direct_peer_ready_count,omitempty"`
|
||||
DirectPeerTargetCount int `json:"direct_peer_target_count,omitempty"`
|
||||
DirectPeerDeficit int `json:"direct_peer_deficit,omitempty"`
|
||||
Alerts []string `json:"alerts,omitempty"`
|
||||
RecoveryBridgeRequired bool `json:"recovery_bridge_required"`
|
||||
RecoveryBridgeReplayReady bool `json:"recovery_bridge_replay_ready"`
|
||||
RecoveryBridgeActions []string `json:"recovery_bridge_actions,omitempty"`
|
||||
Risks []string `json:"risks,omitempty"`
|
||||
Products []StaleNodeRiskProduct `json:"products,omitempty"`
|
||||
}
|
||||
|
||||
// StaleNodeRiskProduct is the per-product entry nested under a
// StaleNodeRiskNode ("products" in JSON).
//
// Product and the boolean flags are always serialized; every other field is
// tagged omitempty and disappears from the output when it holds its zero
// value (nil for TargetVersion and LastStatusObservedAt).
type StaleNodeRiskProduct struct {
	Product                   string     `json:"product"`
	CurrentVersion            string     `json:"current_version,omitempty"`
	TargetVersion             *string    `json:"target_version,omitempty"`
	Channel                   string     `json:"channel,omitempty"`
	Strategy                  string     `json:"strategy,omitempty"`
	Enabled                   bool       `json:"enabled"`
	DetectedOS                string     `json:"detected_os,omitempty"`
	DetectedArch              string     `json:"detected_arch,omitempty"`
	DetectedInstallType       string     `json:"detected_install_type,omitempty"`
	CompatibleArtifactFound   bool       `json:"compatible_artifact_found"`
	MatchingReleaseVersion    string     `json:"matching_release_version,omitempty"`
	LastStatusObservedAt      *time.Time `json:"last_status_observed_at,omitempty"`
	LastStatusPhase           string     `json:"last_status_phase,omitempty"`
	LastStatusValue           string     `json:"last_status_value,omitempty"`
	LastStatusReason          string     `json:"last_status_reason,omitempty"`
	RecoveryBridgeRequired    bool       `json:"recovery_bridge_required"`
	RecoveryBridgeReplayReady bool       `json:"recovery_bridge_replay_ready"`
	RecoveryBridgeMode        string     `json:"recovery_bridge_mode,omitempty"`
	Risks                     []string   `json:"risks,omitempty"`
}
|
||||
|
||||
type NodeBootstrap struct {
|
||||
NodeID string `json:"node_id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
@@ -761,23 +857,25 @@ type NodeSyntheticMeshConfig struct {
|
||||
}
|
||||
|
||||
type NodeMeshListenerConfig struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
Source string `json:"source"`
|
||||
DesiredState string `json:"desired_state"`
|
||||
ListenAddr string `json:"listen_addr"`
|
||||
ListenPortMode string `json:"listen_port_mode"`
|
||||
AutoPortStart int `json:"auto_port_start,omitempty"`
|
||||
AutoPortEnd int `json:"auto_port_end,omitempty"`
|
||||
AdvertiseEndpoint string `json:"advertise_endpoint,omitempty"`
|
||||
AdvertiseTransport string `json:"advertise_transport,omitempty"`
|
||||
ConnectivityMode string `json:"connectivity_mode,omitempty"`
|
||||
NATType string `json:"nat_type,omitempty"`
|
||||
Region string `json:"region,omitempty"`
|
||||
ConfigVersion string `json:"config_version,omitempty"`
|
||||
UpdatedByUserID string `json:"updated_by_user_id,omitempty"`
|
||||
UpdatedAt string `json:"updated_at,omitempty"`
|
||||
ControlPlaneOnly bool `json:"control_plane_only"`
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
Source string `json:"source"`
|
||||
DesiredState string `json:"desired_state"`
|
||||
ListenAddr string `json:"listen_addr"`
|
||||
ListenPortMode string `json:"listen_port_mode"`
|
||||
AutoPortStart int `json:"auto_port_start,omitempty"`
|
||||
AutoPortEnd int `json:"auto_port_end,omitempty"`
|
||||
AdvertiseEndpoint string `json:"advertise_endpoint,omitempty"`
|
||||
AdvertiseEndpoints []string `json:"advertise_endpoints,omitempty"`
|
||||
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
|
||||
AdvertiseTransport string `json:"advertise_transport,omitempty"`
|
||||
ConnectivityMode string `json:"connectivity_mode,omitempty"`
|
||||
NATType string `json:"nat_type,omitempty"`
|
||||
Region string `json:"region,omitempty"`
|
||||
ConfigVersion string `json:"config_version,omitempty"`
|
||||
UpdatedByUserID string `json:"updated_by_user_id,omitempty"`
|
||||
UpdatedAt string `json:"updated_at,omitempty"`
|
||||
ControlPlaneOnly bool `json:"control_plane_only"`
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
}
|
||||
|
||||
type MeshQoSPolicy struct {
|
||||
@@ -2027,6 +2125,17 @@ type GetNodeUpdatePlanInput struct {
|
||||
ArtifactOrigin string
|
||||
}
|
||||
|
||||
// GetStaleNodeRiskReportInput is the argument bundle for the service call
// GetStaleNodeRiskReport. The HTTP handler fills ActorUserID from the
// "actor_user_id" query parameter and ClusterID from the {clusterID} route
// parameter.
type GetStaleNodeRiskReportInput struct {
	ActorUserID string
	ClusterID   string
}
|
||||
|
||||
// GetNodeBridgeReplayPlanInput is the argument bundle for the service call
// GetNodeBridgeReplayPlan. The HTTP handler fills ActorUserID from the
// "actor_user_id" query parameter and ClusterID / NodeID from the
// {clusterID} and {nodeID} route parameters.
type GetNodeBridgeReplayPlanInput struct {
	ActorUserID string
	ClusterID   string
	NodeID      string
}
|
||||
|
||||
type ReportNodeUpdateStatusInput struct {
|
||||
ClusterID string
|
||||
NodeID string
|
||||
|
||||
@@ -84,8 +84,10 @@ func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
r.Post("/{clusterID}/updates/releases", m.createReleaseVersion)
|
||||
r.Put("/{clusterID}/nodes/{nodeID}/updates/policy", m.upsertNodeUpdatePolicy)
|
||||
r.Get("/{clusterID}/nodes/{nodeID}/updates/plan", m.getNodeUpdatePlan)
|
||||
r.Get("/{clusterID}/nodes/{nodeID}/updates/bridge-replay-plan", m.getNodeBridgeReplayPlan)
|
||||
r.Post("/{clusterID}/nodes/{nodeID}/updates/status", m.reportNodeUpdateStatus)
|
||||
r.Get("/{clusterID}/nodes/{nodeID}/updates/statuses", m.listNodeUpdateStatuses)
|
||||
r.Get("/{clusterID}/updates/stale-node-risk-report", m.getStaleNodeRiskReport)
|
||||
r.Get("/{clusterID}/nodes/{nodeID}/testing-flags", m.getEffectiveNodeTestingFlags)
|
||||
r.Get("/{clusterID}/nodes/{nodeID}/mesh/synthetic-config", m.getNodeSyntheticMeshConfig)
|
||||
r.Post("/{clusterID}/nodes/{nodeID}/telemetry", m.recordNodeTelemetry)
|
||||
@@ -843,6 +845,29 @@ func (m *Module) listNodeUpdateStatuses(w http.ResponseWriter, r *http.Request)
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"node_update_statuses": items})
|
||||
}
|
||||
|
||||
func (m *Module) getStaleNodeRiskReport(w http.ResponseWriter, r *http.Request) {
|
||||
item, err := m.service.GetStaleNodeRiskReport(r.Context(), GetStaleNodeRiskReportInput{
|
||||
ActorUserID: r.URL.Query().Get("actor_user_id"),
|
||||
ClusterID: chi.URLParam(r, "clusterID"),
|
||||
})
|
||||
if writeServiceError(w, err) {
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"stale_node_risk_report": item})
|
||||
}
|
||||
|
||||
func (m *Module) getNodeBridgeReplayPlan(w http.ResponseWriter, r *http.Request) {
|
||||
item, err := m.service.GetNodeBridgeReplayPlan(r.Context(), GetNodeBridgeReplayPlanInput{
|
||||
ActorUserID: r.URL.Query().Get("actor_user_id"),
|
||||
ClusterID: chi.URLParam(r, "clusterID"),
|
||||
NodeID: chi.URLParam(r, "nodeID"),
|
||||
})
|
||||
if writeServiceError(w, err) {
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"node_bridge_replay_plan": item})
|
||||
}
|
||||
|
||||
func (m *Module) getEffectiveNodeTestingFlags(w http.ResponseWriter, r *http.Request) {
|
||||
item, err := m.service.GetEffectiveNodeTestingFlags(r.Context(), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID"))
|
||||
if writeServiceError(w, err) {
|
||||
@@ -3386,6 +3411,7 @@ func writeServiceError(w http.ResponseWriter, err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
var legacyRemovalBlocked *LegacyRemovalBlockedError
|
||||
switch {
|
||||
case errors.Is(err, ErrAccessDenied):
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
@@ -3393,6 +3419,12 @@ func writeServiceError(w http.ResponseWriter, err error) bool {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
case errors.Is(err, ErrClusterReadOnly):
|
||||
httpx.WriteError(w, http.StatusConflict, err.Error())
|
||||
case errors.As(err, &legacyRemovalBlocked):
|
||||
httpx.WriteErrorMessage(w, http.StatusConflict, httpx.ErrorResponse{
|
||||
Error: httpx.NewErrorMessage(http.StatusConflict, err.Error(), legacyRemovalBlockedErrorDetails(*legacyRemovalBlocked), ""),
|
||||
})
|
||||
case errors.Is(err, ErrLegacyRemovalBlocked):
|
||||
httpx.WriteError(w, http.StatusConflict, err.Error())
|
||||
case errors.Is(err, ErrVPNLeaseAlreadyActive):
|
||||
httpx.WriteError(w, http.StatusConflict, err.Error())
|
||||
case errors.Is(err, ErrInvalidPayload), errors.Is(err, ErrInvalidJoinToken), errors.Is(err, ErrInvalidNodeRole):
|
||||
@@ -3404,3 +3436,37 @@ func writeServiceError(w http.ResponseWriter, err error) bool {
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func legacyRemovalBlockedErrorDetails(err LegacyRemovalBlockedError) map[string]any {
|
||||
details := map[string]any{
|
||||
"blocked_operation": err.BlockedOperation,
|
||||
"legacy_removal_allowed": err.Report.LegacyRemovalAllowed,
|
||||
"bridge_hold_required": err.Report.BridgeHoldRequired,
|
||||
"bridge_hold_reasons": err.Report.BridgeHoldReasons,
|
||||
"blocked_operations": err.Report.BlockedOperations,
|
||||
"heartbeat_stale_after_seconds": err.Report.HeartbeatStaleAfterSeconds,
|
||||
"stale_nodes": err.Report.Summary.StaleNodes,
|
||||
"blocked_nodes": err.Report.Summary.BlockedNodes,
|
||||
"artifact_gap_nodes": err.Report.Summary.ArtifactGapNodes,
|
||||
"unknown_profile_nodes": err.Report.Summary.UnknownProfileNodes,
|
||||
"waiting_update_status_nodes": err.Report.Summary.WaitingUpdateStatusNodes,
|
||||
"unknown_version_nodes": err.Report.Summary.UnknownVersionNodes,
|
||||
"legacy_recovery_contract_nodes": err.Report.Summary.LegacyRecoveryContractNodes,
|
||||
"recovery_bridge_required_nodes": err.Report.Summary.RecoveryBridgeRequiredNodes,
|
||||
"recovery_bridge_replay_ready_nodes": err.Report.Summary.RecoveryBridgeReplayReadyNodes,
|
||||
"waiting_recovery_heartbeat_nodes": err.Report.Summary.WaitingRecoveryHeartbeatNodes,
|
||||
}
|
||||
blockedNodeIDs := make([]string, 0, len(err.Report.Nodes))
|
||||
for _, node := range err.Report.Nodes {
|
||||
if node.Blocked {
|
||||
blockedNodeIDs = append(blockedNodeIDs, node.NodeID)
|
||||
}
|
||||
}
|
||||
if len(blockedNodeIDs) > 0 {
|
||||
details["blocked_node_ids"] = blockedNodeIDs
|
||||
}
|
||||
if len(err.Report.BridgeHoldNodeIDs) > 0 {
|
||||
details["bridge_hold_node_ids"] = err.Report.BridgeHoldNodeIDs
|
||||
}
|
||||
return details
|
||||
}
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestWriteServiceErrorLegacyRemovalBlockedIncludesBreakdownDetails(t *testing.T) {
|
||||
recorder := httptest.NewRecorder()
|
||||
handled := writeServiceError(recorder, &LegacyRemovalBlockedError{
|
||||
BlockedOperation: "create_breaking_release",
|
||||
Report: StaleNodeRiskReport{
|
||||
HeartbeatStaleAfterSeconds: 900,
|
||||
LegacyRemovalAllowed: false,
|
||||
BridgeHoldRequired: true,
|
||||
BridgeHoldNodeIDs: []string{"node-1"},
|
||||
BridgeHoldReasons: []string{"legacy_contract_overlap"},
|
||||
BlockedOperations: []string{"create_breaking_release", "target_breaking_update_policy", "remove_recovery_bridge_overlap"},
|
||||
Nodes: []StaleNodeRiskNode{
|
||||
{NodeID: "node-1", Blocked: true, RecoveryBridgeRequired: true},
|
||||
{NodeID: "node-2", Blocked: false},
|
||||
},
|
||||
Summary: StaleNodeRiskSummary{
|
||||
StaleNodes: 1,
|
||||
BlockedNodes: 1,
|
||||
ArtifactGapNodes: 0,
|
||||
UnknownProfileNodes: 0,
|
||||
WaitingUpdateStatusNodes: 0,
|
||||
UnknownVersionNodes: 0,
|
||||
LegacyRecoveryContractNodes: 0,
|
||||
WaitingRecoveryHeartbeatNodes: 1,
|
||||
},
|
||||
},
|
||||
})
|
||||
if !handled {
|
||||
t.Fatalf("writeServiceError returned false")
|
||||
}
|
||||
if recorder.Code != http.StatusConflict {
|
||||
t.Fatalf("status = %d, want %d", recorder.Code, http.StatusConflict)
|
||||
}
|
||||
var payload struct {
|
||||
Error struct {
|
||||
Details map[string]any `json:"details"`
|
||||
} `json:"error"`
|
||||
}
|
||||
if err := json.Unmarshal(recorder.Body.Bytes(), &payload); err != nil {
|
||||
t.Fatalf("unmarshal response: %v", err)
|
||||
}
|
||||
if payload.Error.Details["blocked_operation"] != "create_breaking_release" {
|
||||
t.Fatalf("blocked_operation = %v", payload.Error.Details["blocked_operation"])
|
||||
}
|
||||
if payload.Error.Details["waiting_recovery_heartbeat_nodes"] != float64(1) {
|
||||
t.Fatalf("waiting_recovery_heartbeat_nodes = %v", payload.Error.Details["waiting_recovery_heartbeat_nodes"])
|
||||
}
|
||||
if payload.Error.Details["bridge_hold_required"] != true {
|
||||
t.Fatalf("bridge_hold_required = %v", payload.Error.Details["bridge_hold_required"])
|
||||
}
|
||||
blockedNodeIDs, ok := payload.Error.Details["blocked_node_ids"].([]any)
|
||||
if !ok || len(blockedNodeIDs) != 1 || blockedNodeIDs[0] != "node-1" {
|
||||
t.Fatalf("blocked_node_ids = %#v", payload.Error.Details["blocked_node_ids"])
|
||||
}
|
||||
bridgeHoldNodeIDs, ok := payload.Error.Details["bridge_hold_node_ids"].([]any)
|
||||
if !ok || len(bridgeHoldNodeIDs) != 1 || bridgeHoldNodeIDs[0] != "node-1" {
|
||||
t.Fatalf("bridge_hold_node_ids = %#v", payload.Error.Details["bridge_hold_node_ids"])
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -158,6 +158,7 @@ func (m *Module) bootstrapEnrollment(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
NodeKey string `json:"node_key"`
|
||||
Name string `json:"name"`
|
||||
OwnershipType string `json:"ownership_type"`
|
||||
@@ -197,6 +198,19 @@ func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if payload.ClusterID != "" {
|
||||
if _, err := m.db.Exec(r.Context(), `
|
||||
INSERT INTO cluster_memberships (cluster_id, node_id, membership_status, joined_at, last_seen_at, metadata)
|
||||
VALUES ($1::uuid, $2::uuid, 'active', $3, $3, $4::jsonb)
|
||||
ON CONFLICT (cluster_id, node_id) DO UPDATE SET
|
||||
membership_status = 'active',
|
||||
last_seen_at = EXCLUDED.last_seen_at,
|
||||
metadata = cluster_memberships.metadata || EXCLUDED.metadata
|
||||
`, payload.ClusterID, nodeID, now, []byte(`{"source":"fabric_control_candidate_registration"}`)); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"node_id": nodeID,
|
||||
"status": "registered",
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,18 +1,25 @@
|
||||
# RAP Android VPN
|
||||
|
||||
This is the Android client for the experimental RAP VPN service.
|
||||
This is the Android mobile node build with the `vpn-client` service enabled.
|
||||
|
||||
Implemented now:
|
||||
|
||||
- login through `/auth/login`;
|
||||
- trusted-device reconnect through `/auth/refresh` without retyping the password
|
||||
while the device session is valid;
|
||||
- load organization-scoped VPN client profile from `/clusters/{clusterID}/vpn/client-profile`;
|
||||
- installation as a first-class fabric node with an embedded QUIC bootstrap
|
||||
seed set. The seed set is not a backend selector: it contains every known
|
||||
public or local entry candidate that may help the node join the fabric from
|
||||
its current network.
|
||||
- runtime launch uses a persisted `fabric_bootstrap_config`, not a backend API
|
||||
URL. The Android node starts by attaching to the fabric through bootstrap
|
||||
peers and then discovers/uses services through fabric rules.
|
||||
- login and trusted-device refresh through the QUIC fabric control channel;
|
||||
- load organization-scoped VPN client profile through the fabric control channel;
|
||||
- request Android VPN permission and create a `VpnService` TUN interface;
|
||||
- run as a normal fabric node with the `vpn-client` service role. The local
|
||||
`VpnService` TUN is the IPv4 ingress for that node, and packet channels are
|
||||
routed by the farm to an authorized `ipv4-egress` pool. HTTP batch fallback
|
||||
and old VPN protocols are not part of the supported test path.
|
||||
routed by the farm to an authorized `ipv4-egress` pool. The supported
|
||||
dataplane is the QUIC fabric runtime only. HTTP batch forwarding, WebSocket
|
||||
packet relay, direct backend packet relay, and old VPN protocols are removed
|
||||
from the runtime path.
|
||||
- user-facing HOME-first screen: connect/disconnect is primary, while backend,
|
||||
cluster, organization, login, and password are kept in the settings dialog;
|
||||
- saved connection settings in app preferences so repeat connects do not require
|
||||
@@ -20,12 +27,23 @@ Implemented now:
|
||||
- encrypted refresh-token storage through Android Keystore. If the trusted
|
||||
device session is revoked or expires, the app asks for the password once and
|
||||
then rotates the device keys/profile again.
|
||||
- no separate diagnostic foreground service: runtime status is reported by the
|
||||
node/VPN service itself, so the Android build does not keep a parallel legacy
|
||||
control process alive.
|
||||
|
||||
This is still a lab runtime. The required target model is Android as a farm
|
||||
node with the `vpn-client` role. The VPN service must attach to the mesh as
|
||||
that node and route to an authorized IPv4 exit pool; there is no separate VPN
|
||||
entry point. Exit configuration is always pool based, including pools that
|
||||
currently contain only one node.
|
||||
currently contain only one node. A phone installed in a closed network may join
|
||||
through local seed nodes from that network; it does not need direct Internet
|
||||
access if a nearby fabric node can route onward.
|
||||
|
||||
Current code contract:
|
||||
|
||||
- Android control bootstrap field: `fabric_bootstrap_config`
|
||||
- Android runtime dataplane: QUIC `Fabricvpn` runtime only
|
||||
- Android runtime status keys: `fabric_transport_*`
|
||||
|
||||
Build from this repository on Windows:
|
||||
|
||||
|
||||
@@ -22,8 +22,12 @@ android {
|
||||
return (value == null ? "" : value.toString()).replace("\\", "\\\\").replace("\"", "\\\"")
|
||||
}
|
||||
|
||||
def defaultBackendUrl = project.findProperty("RAP_ANDROID_DEFAULT_BACKEND_URL") ?: "http://192.168.200.61:18080/api/v1"
|
||||
def defaultFabricBootstrapPeers = project.findProperty("RAP_ANDROID_FABRIC_BOOTSTRAP_PEERS") ?: "quic://192.168.200.85:18080,quic://195.123.240.88:19131"
|
||||
def defaultBackendUrl = project.findProperty("RAP_ANDROID_DEFAULT_BACKEND_URL") ?: ""
|
||||
// This is a node bootstrap seed set, not an API/backend selector. The
|
||||
// Android app installs as a fabric node and tries every QUIC endpoint that
|
||||
// may be reachable from its current network: public nodes, LAN nodes, or a
|
||||
// closed-site neighbor that can route onward through the fabric.
|
||||
def defaultFabricBootstrapPeers = project.findProperty("RAP_ANDROID_FABRIC_BOOTSTRAP_PEERS") ?: "quic://94.141.118.222:19199#sha256=49892029a27db9c394a41bc4cb917d9cceb1f86219417c351764d2ed9d6bc683,quic://94.141.118.222:19191#sha256=72e51f1631b32c3a7d1e8732fe3325e0395a897a5aa31db645888c142e4ae401,quic://192.168.200.61:19134#sha256=72e51f1631b32c3a7d1e8732fe3325e0395a897a5aa31db645888c142e4ae401,quic://192.168.200.61:19132#sha256=8d28b75144d25d29e3b8f8022b6165258ce3cb0e227a2d9d97996839abb89c2a,quic://192.168.200.61:19133#sha256=a71b07e55b810f57b01696c485b765b336983e963238163085824bf04022ecaa,quic://192.168.200.85:18080#sha256=49892029a27db9c394a41bc4cb917d9cceb1f86219417c351764d2ed9d6bc683,quic://192.168.200.85:18081#sha256=2a3be67e6345943a36cfa1197a5879c2b112c81adc019fd1ee9d7dffbf188b57,quic://192.168.200.85:18082#sha256=a318c1a756ff43595635961768dfd1677afa7e2cbf945d724c107ff82426378a"
|
||||
def defaultClusterId = project.findProperty("RAP_ANDROID_DEFAULT_CLUSTER_ID") ?: "cfc0743d-d960-49fb-9de8-96e063d5e4aa"
|
||||
def defaultOrganizationId = project.findProperty("RAP_ANDROID_DEFAULT_ORGANIZATION_ID") ?: "125ff8b2-5ac1-4406-9bbb-ebbe18f7c7ed"
|
||||
|
||||
@@ -31,8 +35,8 @@ android {
|
||||
applicationId "su.cin.rapvpn"
|
||||
minSdk 26
|
||||
targetSdk 35
|
||||
versionCode 227
|
||||
versionName "0.2.227"
|
||||
versionCode 239
|
||||
versionName "0.2.239"
|
||||
buildConfigField "String", "DEFAULT_BACKEND_URL", "\"${normalizeGradleString(defaultBackendUrl)}\""
|
||||
buildConfigField "String", "FABRIC_BOOTSTRAP_PEERS", "\"${normalizeGradleString(defaultFabricBootstrapPeers)}\""
|
||||
buildConfigField "String", "DEFAULT_CLUSTER_ID", "\"${normalizeGradleString(defaultClusterId)}\""
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -42,15 +42,6 @@
|
||||
android:value="vpn" />
|
||||
</service>
|
||||
|
||||
<service
|
||||
android:name=".RapDiagnosticService"
|
||||
android:exported="false"
|
||||
android:foregroundServiceType="specialUse">
|
||||
<property
|
||||
android:name="android.app.PROPERTY_SPECIAL_USE_FGS_SUBTYPE"
|
||||
android:value="vpn-diagnostics" />
|
||||
</service>
|
||||
|
||||
<receiver
|
||||
android:name=".RapAutostartReceiver"
|
||||
android:exported="false">
|
||||
|
||||
@@ -1,140 +0,0 @@
|
||||
package su.cin.rapvpn;
|
||||
|
||||
import android.util.Base64;
|
||||
|
||||
import org.json.JSONObject;
|
||||
|
||||
import java.net.URI;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import okhttp3.Request;
|
||||
|
||||
final class FabricServiceChannel {
|
||||
final boolean enabled;
|
||||
final String channelId;
|
||||
final String token;
|
||||
final String pathTemplate;
|
||||
final String webSocketPathTemplate;
|
||||
final String authorityPayloadHeader;
|
||||
final String authoritySignatureHeader;
|
||||
final String serviceClass;
|
||||
final String channelClass;
|
||||
|
||||
FabricServiceChannel() {
|
||||
this(false, "", "", "", "", "", "", "", "");
|
||||
}
|
||||
|
||||
private FabricServiceChannel(
|
||||
boolean enabled,
|
||||
String channelId,
|
||||
String token,
|
||||
String pathTemplate,
|
||||
String webSocketPathTemplate,
|
||||
String authorityPayloadHeader,
|
||||
String authoritySignatureHeader,
|
||||
String serviceClass,
|
||||
String channelClass) {
|
||||
this.enabled = enabled;
|
||||
this.channelId = safe(channelId);
|
||||
this.token = safe(token);
|
||||
this.pathTemplate = safe(pathTemplate);
|
||||
this.webSocketPathTemplate = safe(webSocketPathTemplate);
|
||||
this.authorityPayloadHeader = safe(authorityPayloadHeader);
|
||||
this.authoritySignatureHeader = safe(authoritySignatureHeader);
|
||||
this.serviceClass = safe(serviceClass);
|
||||
this.channelClass = safe(channelClass);
|
||||
}
|
||||
|
||||
static FabricServiceChannel fromLease(JSONObject lease) {
|
||||
if (lease == null) {
|
||||
return new FabricServiceChannel();
|
||||
}
|
||||
JSONObject tokenObject = lease.optJSONObject("token");
|
||||
JSONObject entryHttp = lease.optJSONObject("entry_http");
|
||||
String channelId = lease.optString("channel_id", "");
|
||||
String token = tokenObject == null ? "" : tokenObject.optString("token", "");
|
||||
String pathTemplate = entryHttp == null ? "" : entryHttp.optString("path_template", "");
|
||||
String wsTemplate = entryHttp == null ? "" : entryHttp.optString("websocket_path_template", "");
|
||||
String serviceClass = lease.optString("service_class", "vpn_packets");
|
||||
String channelClass = "vpn_packet";
|
||||
JSONObject authoritySignature = lease.optJSONObject("authority_signature");
|
||||
JSONObject authorityPayload = lease.optJSONObject("authority_payload");
|
||||
String payloadHeader = authorityPayload == null ? "" : encodeHeader(authorityPayload.toString());
|
||||
String signatureHeader = authoritySignature == null ? "" : encodeHeader(authoritySignature.toString());
|
||||
boolean enabled = !channelId.isEmpty() && token.startsWith("rap_fsc_") && !pathTemplate.isEmpty();
|
||||
return new FabricServiceChannel(enabled, channelId, token, pathTemplate, wsTemplate, payloadHeader, signatureHeader, serviceClass, channelClass);
|
||||
}
|
||||
|
||||
String packetPath(String clusterId, String vpnConnectionId, boolean webSocket) {
|
||||
return packetPathForBase("", clusterId, vpnConnectionId, webSocket);
|
||||
}
|
||||
|
||||
String packetPathForBase(String baseUrl, String clusterId, String vpnConnectionId, boolean webSocket) {
|
||||
String template = webSocket && !webSocketPathTemplate.isEmpty() ? webSocketPathTemplate : pathTemplate;
|
||||
if (!enabled || template.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
String path = template
|
||||
.replace("{cluster_id}", safe(clusterId))
|
||||
.replace("{clusterID}", safe(clusterId))
|
||||
.replace("{channel_id}", channelId)
|
||||
.replace("{channelID}", channelId)
|
||||
.replace("{resource_id}", safe(vpnConnectionId))
|
||||
.replace("{resourceID}", safe(vpnConnectionId))
|
||||
.replace("{vpn_connection_id}", safe(vpnConnectionId))
|
||||
.replace("{vpnConnectionID}", safe(vpnConnectionId));
|
||||
path = path.startsWith("/") ? path : "/" + path;
|
||||
String basePath = "";
|
||||
try {
|
||||
URI uri = URI.create(baseUrl == null ? "" : baseUrl);
|
||||
basePath = uri.getRawPath() == null ? "" : trimRight(uri.getRawPath());
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
if (basePath.endsWith("/api/v1") && path.startsWith("/api/v1/")) {
|
||||
path = path.substring("/api/v1".length());
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
Request.Builder applyHeaders(Request.Builder builder) {
|
||||
if (!enabled || builder == null) {
|
||||
return builder;
|
||||
}
|
||||
builder.header("X-RAP-Service-Channel-Token", token);
|
||||
builder.header("X-RAP-Fabric-Channel-ID", channelId);
|
||||
if (!serviceClass.isEmpty()) {
|
||||
builder.header("X-RAP-Service-Class", serviceClass);
|
||||
}
|
||||
if (!channelClass.isEmpty()) {
|
||||
builder.header("X-RAP-Channel-Class", channelClass);
|
||||
}
|
||||
if (!authorityPayloadHeader.isEmpty()) {
|
||||
builder.header("X-RAP-Service-Channel-Authority-Payload", authorityPayloadHeader);
|
||||
}
|
||||
if (!authoritySignatureHeader.isEmpty()) {
|
||||
builder.header("X-RAP-Service-Channel-Authority-Signature", authoritySignatureHeader);
|
||||
}
|
||||
return builder;
|
||||
}
|
||||
|
||||
private static String encodeHeader(String value) {
|
||||
if (value == null || value.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
return Base64.encodeToString(value.getBytes(StandardCharsets.UTF_8), Base64.URL_SAFE | Base64.NO_WRAP | Base64.NO_PADDING);
|
||||
}
|
||||
|
||||
private static String safe(String value) {
|
||||
return value == null ? "" : value.trim();
|
||||
}
|
||||
|
||||
private static String trimRight(String value) {
|
||||
if (value == null) {
|
||||
return "";
|
||||
}
|
||||
while (value.endsWith("/")) {
|
||||
value = value.substring(0, value.length() - 1);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
}
|
||||
@@ -24,14 +24,13 @@ import java.util.Locale;
|
||||
|
||||
public class MainActivity extends Activity {
|
||||
private static final String APP_VERSION = BuildConfig.VERSION_NAME;
|
||||
private static final String DEFAULT_BACKEND_URL = BuildConfig.DEFAULT_BACKEND_URL;
|
||||
private static final String FABRIC_BOOTSTRAP_PEERS = BuildConfig.FABRIC_BOOTSTRAP_PEERS;
|
||||
private static final String DEFAULT_CLUSTER_ID = BuildConfig.DEFAULT_CLUSTER_ID;
|
||||
private static final String DEFAULT_ORGANIZATION_ID = BuildConfig.DEFAULT_ORGANIZATION_ID;
|
||||
private static final String PREF_SELECTED_EXIT_NODE_ID = "selected_exit_node_id";
|
||||
private static final int VPN_PREPARE_REQUEST = 42;
|
||||
private static final String PREFS = "rap-vpn";
|
||||
private static final String PREF_DEVICE_FINGERPRINT = "device_fingerprint";
|
||||
private static final String PREF_FABRIC_NODE_ID = "fabric_node_id";
|
||||
private static final String PREF_REFRESH_TOKEN = "refresh_token";
|
||||
private static final String PREF_REFRESH_EXPIRES_AT = "refresh_expires_at";
|
||||
private static final String PREF_USER_ID = "user_id";
|
||||
@@ -39,7 +38,6 @@ public class MainActivity extends Activity {
|
||||
private static final String PREF_PROFILE_JSON = "profile_json";
|
||||
private static final String PREF_VPN_CONNECTION_ID = "vpn_connection_id";
|
||||
static final String PREF_FORCE_FULL_TUNNEL = "force_full_tunnel";
|
||||
private EditText backendUrl;
|
||||
private EditText clusterId;
|
||||
private EditText organizationId;
|
||||
private EditText email;
|
||||
@@ -66,7 +64,6 @@ public class MainActivity extends Activity {
|
||||
int pad = dp(20);
|
||||
root.setPadding(pad, pad, pad, pad);
|
||||
|
||||
backendUrl = field("Fabric control bootstrap", preferredBackendUrl());
|
||||
clusterId = field("Cluster ID", prefs.getString("cluster_id", DEFAULT_CLUSTER_ID));
|
||||
organizationId = field("Organization ID", prefs.getString("organization_id", DEFAULT_ORGANIZATION_ID));
|
||||
email = field("Email", prefs.getString("email", "m"));
|
||||
@@ -102,10 +99,6 @@ public class MainActivity extends Activity {
|
||||
runtimeStatus.setPadding(0, 0, 0, dp(10));
|
||||
runtimeStatus.setText(runtimeStatusText());
|
||||
|
||||
Button load = new Button(this);
|
||||
load.setText("Войти / обновить пулы");
|
||||
load.setOnClickListener(v -> loadProfile(false));
|
||||
|
||||
Button start = new Button(this);
|
||||
start.setText("Подключить");
|
||||
start.setOnClickListener(v -> prepareVpn());
|
||||
@@ -148,12 +141,11 @@ public class MainActivity extends Activity {
|
||||
});
|
||||
|
||||
Button settings = new Button(this);
|
||||
settings.setText("Аккаунт");
|
||||
settings.setText("Настройка");
|
||||
settings.setOnClickListener(v -> showSettingsDialog());
|
||||
|
||||
root.addView(title);
|
||||
root.addView(profileSummary);
|
||||
root.addView(load);
|
||||
root.addView(start);
|
||||
root.addView(stop);
|
||||
root.addView(settings);
|
||||
@@ -161,9 +153,7 @@ public class MainActivity extends Activity {
|
||||
root.addView(runtimeStatus);
|
||||
setContentView(root);
|
||||
scheduleRuntimeStatusRefresh();
|
||||
if (authContext != null && !authContext.deviceId.isEmpty()) {
|
||||
startDiagnosticChannel();
|
||||
}
|
||||
registerCandidateNodeAsync(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -179,62 +169,38 @@ public class MainActivity extends Activity {
|
||||
return input;
|
||||
}
|
||||
|
||||
private void loadProfile() {
|
||||
loadProfile(false);
|
||||
}
|
||||
|
||||
private void loadProfile(boolean startAfterLoad) {
|
||||
status.setText("Загрузка...");
|
||||
saveSettings();
|
||||
private void prepareVpn() {
|
||||
if (!hasSelectedPool()) {
|
||||
status.setText("Сначала выберите выходной пул.");
|
||||
showSettingsDialog();
|
||||
return;
|
||||
}
|
||||
status.setText("Проверяю доступ к выбранному пулу...");
|
||||
new Thread(() -> {
|
||||
try {
|
||||
RapApiClient client = new RapApiClient(backendUrl.getText().toString(), this);
|
||||
authContext = authenticate(client);
|
||||
String activeOrganizationId = resolveOrganizationId(client, authContext.userId);
|
||||
profileJson = client.vpnClientProfile(
|
||||
clusterId.getText().toString(),
|
||||
activeOrganizationId,
|
||||
authContext.userId,
|
||||
""
|
||||
);
|
||||
vpnConnectionId = firstConnectionId(profileJson);
|
||||
saveProfileState();
|
||||
refreshSavedProfileForCurrentUser();
|
||||
if (!hasSelectedPool()) {
|
||||
throw new IllegalStateException("Выбранный пул больше не доступен.");
|
||||
}
|
||||
runOnUiThread(() -> {
|
||||
profileSummary.setText(summaryText());
|
||||
status.setText(startAfterLoad ? "Список пулов обновлен. Подключаю..." : "Список доступных пулов обновлен.");
|
||||
startDiagnosticChannel();
|
||||
if (startAfterLoad) {
|
||||
requestVpnPermission();
|
||||
}
|
||||
status.setText("Доступ подтвержден. Подключаюсь к выбранному пулу.");
|
||||
requestVpnPermission();
|
||||
});
|
||||
} catch (Exception ex) {
|
||||
runOnUiThread(() -> {
|
||||
String message = friendlyError(ex);
|
||||
boolean canUseSavedProfile = startAfterLoad && !profileJson.isEmpty() && !vpnConnectionId.isEmpty();
|
||||
if (canUseSavedProfile) {
|
||||
status.setText("Список пулов сейчас не обновился: " + message + ". Подключаюсь с сохраненным рабочим профилем.");
|
||||
startDiagnosticChannel();
|
||||
requestVpnPermission();
|
||||
return;
|
||||
}
|
||||
status.setText("Ошибка входа: " + message);
|
||||
if (message.contains("логин") || message.contains("пароль") || message.contains("Сессия устройства")) {
|
||||
clearSavedAuth(false);
|
||||
showSettingsDialog();
|
||||
}
|
||||
status.setText("Нужна настройка: " + message);
|
||||
showSettingsDialog();
|
||||
});
|
||||
}
|
||||
}).start();
|
||||
}
|
||||
|
||||
private void prepareVpn() {
|
||||
loadProfile(true);
|
||||
status.setText("Обновляю сессию устройства и доступные пулы...");
|
||||
}
|
||||
|
||||
private void requestVpnPermission() {
|
||||
if (profileJson.isEmpty()) {
|
||||
status.setText("VPN-профиль не загружен.");
|
||||
if (!hasSelectedPool()) {
|
||||
status.setText("Выходной пул не выбран или больше не доступен.");
|
||||
showSettingsDialog();
|
||||
return;
|
||||
}
|
||||
Intent prepare = VpnService.prepare(this);
|
||||
@@ -254,32 +220,37 @@ public class MainActivity extends Activity {
|
||||
}
|
||||
|
||||
private void startVpn() {
|
||||
Intent intent = new Intent(this, RapVpnService.class);
|
||||
intent.putExtra(RapVpnService.EXTRA_PROFILE_JSON, profileJson);
|
||||
intent.putExtra(RapVpnService.EXTRA_BACKEND_URL, backendUrl.getText().toString());
|
||||
intent.putExtra(RapVpnService.EXTRA_CLUSTER_ID, clusterId.getText().toString());
|
||||
intent.putExtra(RapVpnService.EXTRA_VPN_CONNECTION_ID, vpnConnectionId);
|
||||
startForegroundService(intent);
|
||||
status.setText("VPN подключается через ферму. Версия " + APP_VERSION + ". Ожидаю рабочий канал.");
|
||||
runtimeStatus.setText("Запрашиваю статус... " + runtimeStatusText());
|
||||
runtimeStatus.postDelayed(() -> {
|
||||
String state = runtimePrefs.getString("state", "");
|
||||
boolean runtimeActive = isVpnRuntimeActive();
|
||||
if (!isSystemVpnActive()) {
|
||||
if (runtimeActive) {
|
||||
status.setText("VPN runtime активен, рабочий канал поднят. Android еще обновляет системный статус.");
|
||||
} else if ("stopped".equals(state) || "revoked".equals(state) || "error".equals(state)) {
|
||||
status.setText("VPN не включился: " + runtimePrefs.getString("message", "Android остановил VPN-сервис") + ".");
|
||||
} else if ("starting".equals(state) || "tunnel".equals(state) || "relay_selected".equals(state) || "relay".equals(state) || "relay_reset".equals(state)) {
|
||||
status.setText("VPN запускается. Android еще применяет туннель, ожидаю рабочий канал.");
|
||||
try {
|
||||
Intent intent = new Intent(this, RapVpnService.class);
|
||||
intent.putExtra(RapVpnService.EXTRA_PROFILE_JSON, profileJson);
|
||||
intent.putExtra(RapVpnService.EXTRA_FABRIC_BOOTSTRAP_CONFIG, fabricControlConfig());
|
||||
intent.putExtra(RapVpnService.EXTRA_CLUSTER_ID, clusterId.getText().toString());
|
||||
intent.putExtra(RapVpnService.EXTRA_VPN_CONNECTION_ID, vpnConnectionId);
|
||||
startForegroundService(intent);
|
||||
status.setText("VPN подключается через ферму. Версия " + APP_VERSION + ". Ожидаю рабочий канал.");
|
||||
runtimeStatus.setText("Запрашиваю статус... " + runtimeStatusText());
|
||||
runtimeStatus.postDelayed(() -> {
|
||||
String state = runtimePrefs.getString("state", "");
|
||||
boolean runtimeActive = isVpnRuntimeActive();
|
||||
if (!isSystemVpnActive()) {
|
||||
if (runtimeActive) {
|
||||
status.setText("VPN runtime активен, рабочий канал поднят. Android еще обновляет системный статус.");
|
||||
} else if ("stopped".equals(state) || "revoked".equals(state) || "error".equals(state)) {
|
||||
status.setText("VPN не включился: " + runtimePrefs.getString("message", "Android остановил VPN-сервис") + ".");
|
||||
} else if ("starting".equals(state) || "tunnel".equals(state) || isTransportWarmupState(state)) {
|
||||
status.setText("VPN запускается. Android еще применяет туннель, ожидаю рабочий канал.");
|
||||
} else {
|
||||
status.setText("VPN еще не активен в Android. Проверьте системный запрос разрешения VPN.");
|
||||
}
|
||||
} else {
|
||||
status.setText("VPN еще не активен в Android. Проверьте системный запрос разрешения VPN.");
|
||||
status.setText("VPN включен Android. Версия " + APP_VERSION + ".");
|
||||
}
|
||||
} else {
|
||||
status.setText("VPN включен Android. Версия " + APP_VERSION + ".");
|
||||
}
|
||||
runtimeStatus.setText(runtimeStatusText());
|
||||
}, 2500);
|
||||
runtimeStatus.setText(runtimeStatusText());
|
||||
}, 2500);
|
||||
} catch (Exception e) {
|
||||
status.setText("VPN не запущен: bootstrap-конфиг фабрики недоступен.");
|
||||
runtimeStatus.setText("Ошибка запуска: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private void scheduleRuntimeStatusRefresh() {
|
||||
@@ -335,9 +306,9 @@ public class MainActivity extends Activity {
|
||||
boolean osVpnActive = isSystemVpnActive();
|
||||
String routes = runtimePrefs.getString("routes", "");
|
||||
String dnsServers = runtimePrefs.getString("dns_servers", "");
|
||||
String profileRelayUrl = runtimePrefs.getString("packet_relay_profile_base_url", "");
|
||||
String activeRelayUrl = runtimePrefs.getString("packet_relay_active_base_url", "");
|
||||
String relayCandidates = runtimePrefs.getString("packet_relay_candidate_urls", "");
|
||||
String profileTransportEndpoint = runtimePrefs.getString("fabric_transport_profile_endpoint", "");
|
||||
String activeTransportEndpoint = runtimePrefs.getString("fabric_transport_active_endpoint", "");
|
||||
String transportCandidates = runtimePrefs.getString("fabric_transport_candidate_endpoints", "");
|
||||
boolean forceFullTunnelRuntime = false;
|
||||
boolean fastPathEnabled = false;
|
||||
try {
|
||||
@@ -350,11 +321,14 @@ public class MainActivity extends Activity {
|
||||
}
|
||||
boolean staleState = updatedAt > 0 && (System.currentTimeMillis() - updatedAt) > 12_000;
|
||||
boolean runtimeActive = isVpnRuntimeActive();
|
||||
if (!osVpnActive && !runtimeActive && ("running".equals(state) || "tunnel".equals(state) || "relay".equals(state) || "relay_reset".equals(state))) {
|
||||
if (!osVpnActive && !runtimeActive && ("running".equals(state) || "tunnel".equals(state) || isTransportWarmupState(state))) {
|
||||
state = "stale_no_os_vpn";
|
||||
message = "Сервис говорит об активном состоянии, но Android VPN-интерфейс не активен. Проверьте разрешения/ручной запуск.";
|
||||
staleState = false;
|
||||
}
|
||||
String transportEndpoint = activeTransportEndpoint.isEmpty() ? "-" : activeTransportEndpoint;
|
||||
String transportTargets = transportCandidates.isEmpty() ? "-" : transportCandidates;
|
||||
String profileTarget = profileTransportEndpoint.isEmpty() ? "-" : profileTransportEndpoint;
|
||||
return "Диагностика: " + state
|
||||
+ "\n" + message
|
||||
+ "\nOS VPN: " + (osVpnActive ? "активен" : (runtimeActive ? "runtime активен" : "неактивен"))
|
||||
@@ -369,9 +343,9 @@ public class MainActivity extends Activity {
|
||||
+ " / down " + String.format(Locale.US, "%.1f", downlinkPps)
|
||||
+ "\nDNS выхода: " + (dnsServers.isEmpty() ? "-" : dnsServers)
|
||||
+ "\nroutes: " + (routes.isEmpty() ? "-" : routes)
|
||||
+ "\nrelay active: " + (activeRelayUrl.isEmpty() ? "-" : activeRelayUrl)
|
||||
+ "\nrelay profile: " + (profileRelayUrl.isEmpty() ? "-" : profileRelayUrl)
|
||||
+ "\nrelay candidates: " + (relayCandidates.isEmpty() ? "-" : relayCandidates)
|
||||
+ "\ntransport endpoint: " + transportEndpoint
|
||||
+ "\nprofile target: " + profileTarget
|
||||
+ "\ntransport candidates: " + transportTargets
|
||||
+ "\nforced_full_tunnel: " + (forceFullTunnelRuntime ? "да" : "нет")
|
||||
+ "\nfast_path_mode: " + (fastPathEnabled ? "включен" : "выключен")
|
||||
+ "\nbytes read/sent/down: " + readBytes + "/" + sentBytes + "/" + downBytes
|
||||
@@ -389,13 +363,6 @@ public class MainActivity extends Activity {
|
||||
+ "\nобновлено: " + age;
|
||||
}
|
||||
|
||||
private void startDiagnosticChannel() {
|
||||
if (authContext == null || authContext.deviceId.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
RapDiagnosticService.start(this);
|
||||
}
|
||||
|
||||
private boolean isSystemVpnActive() {
|
||||
try {
|
||||
ConnectivityManager connectivityManager = (ConnectivityManager) getSystemService(CONNECTIVITY_SERVICE);
|
||||
@@ -426,20 +393,31 @@ public class MainActivity extends Activity {
|
||||
if (updatedAt <= 0 || (System.currentTimeMillis() - updatedAt) > 15_000) {
|
||||
return false;
|
||||
}
|
||||
String relay = runtimePrefs.getString("packet_relay_active_base_url", "");
|
||||
String activeTransportEndpoint = runtimePrefs.getString("fabric_transport_active_endpoint", "");
|
||||
long read = runtimePrefs.getLong("uplink_read_total", 0);
|
||||
long sent = runtimePrefs.getLong("uplink_sent_total", 0);
|
||||
long down = runtimePrefs.getLong("downlink_received_total", 0);
|
||||
return !relay.isEmpty() && ("running".equals(state)
|
||||
|| "relay".equals(state)
|
||||
|| "relay_reset".equals(state)
|
||||
return !activeTransportEndpoint.isEmpty() && ("running".equals(state)
|
||||
|| "fabric_transport".equals(state)
|
||||
|| "fabric_transport_reset".equals(state)
|
||||
|| "downlink".equals(state)
|
||||
|| "downlink_idle".equals(state)
|
||||
|| "uplink_sent".equals(state)
|
||||
|| read > 0 || sent > 0 || down > 0);
|
||||
}
|
||||
|
||||
private boolean isTransportWarmupState(String state) {
|
||||
return "fabric_transport_selected".equals(state)
|
||||
|| "fabric_transport".equals(state)
|
||||
|| "fabric_transport_reset".equals(state)
|
||||
|| "fabric_transport_switch".equals(state);
|
||||
}
|
||||
|
||||
private String firstConnectionId(String profile) throws Exception {
|
||||
String selected = prefs == null ? "" : prefs.getString(PREF_VPN_CONNECTION_ID, "").trim();
|
||||
if (!selected.isEmpty() && profileContainsConnection(profile, selected)) {
|
||||
return selected;
|
||||
}
|
||||
JSONObject root = new JSONObject(profile);
|
||||
JSONObject vpnProfile = root.getJSONObject("vpn_client_profile");
|
||||
JSONArray connections = vpnProfile.getJSONArray("connections");
|
||||
@@ -489,6 +467,36 @@ public class MainActivity extends Activity {
|
||||
return connections.getJSONObject(0).getString("id");
|
||||
}
|
||||
|
||||
private boolean hasSelectedPool() {
|
||||
return profileJson != null
|
||||
&& !profileJson.trim().isEmpty()
|
||||
&& vpnConnectionId != null
|
||||
&& !vpnConnectionId.trim().isEmpty()
|
||||
&& profileContainsConnection(profileJson, vpnConnectionId.trim());
|
||||
}
|
||||
|
||||
private boolean profileContainsConnection(String profile, String connectionId) {
|
||||
if (profile == null || profile.trim().isEmpty() || connectionId == null || connectionId.trim().isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
JSONObject root = new JSONObject(profile);
|
||||
JSONObject vpnProfile = root.optJSONObject("vpn_client_profile");
|
||||
JSONArray connections = vpnProfile == null ? null : vpnProfile.optJSONArray("connections");
|
||||
if (connections == null) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < connections.length(); i++) {
|
||||
JSONObject connection = connections.optJSONObject(i);
|
||||
if (connection != null && connectionId.trim().equals(connection.optString("id", ""))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private int dp(int value) {
|
||||
return (int) (value * getResources().getDisplayMetrics().density);
|
||||
}
|
||||
@@ -504,8 +512,8 @@ public class MainActivity extends Activity {
|
||||
return "Версия: " + APP_VERSION
|
||||
+ "\nУзел Android: в ферме"
|
||||
+ "\nBootstrap фермы: " + bootstrapPeerCount() + " узл."
|
||||
+ "\nДоступные выходы: " + (poolText.isEmpty() ? "войдите для загрузки" : poolText)
|
||||
+ "\nВыбранный выход: " + (selectedPoolText.isEmpty() ? "автоматически" : selectedPoolText)
|
||||
+ "\nДоступные выходы: " + (poolText.isEmpty() ? "не загружены" : poolText)
|
||||
+ "\nВыбранный выход: " + (selectedPoolText.isEmpty() ? "не выбран" : selectedPoolText)
|
||||
+ "\nDNS выхода: " + (profileDNS.isEmpty() ? "будет получен из профиля" : profileDNS)
|
||||
+ "\nТрафик: " + (prefs.getBoolean(PREF_FORCE_FULL_TUNNEL, true) ? "весь через VPN" : "по профилю")
|
||||
+ "\nDevice: " + (deviceId.isEmpty() ? "нет" : deviceId)
|
||||
@@ -647,20 +655,7 @@ public class MainActivity extends Activity {
|
||||
return out.toString();
|
||||
}
|
||||
|
||||
private String preferredBackendUrl() {
|
||||
String saved = prefs.getString("backend_url", DEFAULT_BACKEND_URL);
|
||||
String normalized = normalizeBackendUrl(saved);
|
||||
if (!normalized.equals(saved == null ? "" : saved.trim())) {
|
||||
prefs.edit().putString("backend_url", normalized).apply();
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
private void saveSettings() {
|
||||
String normalizedBackend = normalizeBackendUrl(backendUrl.getText().toString());
|
||||
if (!normalizedBackend.equals(backendUrl.getText().toString().trim())) {
|
||||
backendUrl.setText(normalizedBackend);
|
||||
}
|
||||
normalizeAndPersistDefaults();
|
||||
if (clusterId.getText().toString().trim().isEmpty()) {
|
||||
clusterId.setText(DEFAULT_CLUSTER_ID);
|
||||
@@ -669,7 +664,6 @@ public class MainActivity extends Activity {
|
||||
organizationId.setText(DEFAULT_ORGANIZATION_ID);
|
||||
}
|
||||
prefs.edit()
|
||||
.putString("backend_url", normalizedBackend)
|
||||
.putString("cluster_id", clusterId.getText().toString())
|
||||
.putString("organization_id", organizationId.getText().toString())
|
||||
.putString("email", email.getText().toString())
|
||||
@@ -677,10 +671,6 @@ public class MainActivity extends Activity {
|
||||
}
|
||||
|
||||
private void normalizeAndPersistDefaults() {
|
||||
String normalizedBackend = normalizeBackendUrl(backendUrl.getText().toString());
|
||||
if (normalizedBackend.isEmpty()) {
|
||||
backendUrl.setText(DEFAULT_BACKEND_URL);
|
||||
}
|
||||
if (clusterId.getText().toString().trim().isEmpty()) {
|
||||
clusterId.setText(DEFAULT_CLUSTER_ID);
|
||||
}
|
||||
@@ -689,38 +679,48 @@ public class MainActivity extends Activity {
|
||||
}
|
||||
}
|
||||
|
||||
private String normalizeBackendUrl(String value) {
|
||||
String candidate = value == null ? "" : value.trim().replaceAll("/+$", "");
|
||||
if (candidate.isEmpty()) {
|
||||
return DEFAULT_BACKEND_URL;
|
||||
private String fabricControlConfig() throws Exception {
|
||||
JSONArray endpoints = new JSONArray();
|
||||
for (String peer : FABRIC_BOOTSTRAP_PEERS.split(",")) {
|
||||
String raw = peer == null ? "" : peer.trim();
|
||||
String address = raw;
|
||||
String certSHA256 = "";
|
||||
int fragmentIndex = raw.indexOf('#');
|
||||
if (fragmentIndex >= 0) {
|
||||
address = raw.substring(0, fragmentIndex).trim();
|
||||
String fragment = raw.substring(fragmentIndex + 1).trim();
|
||||
if (fragment.startsWith("sha256=")) {
|
||||
certSHA256 = fragment.substring("sha256=".length()).trim();
|
||||
}
|
||||
}
|
||||
if (address.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
JSONObject endpoint = new JSONObject();
|
||||
endpoint.put("endpoint_id", address);
|
||||
endpoint.put("address", address);
|
||||
endpoint.put("transport", "direct_quic");
|
||||
if (certSHA256.matches("^[0-9a-fA-F]{64}$")) {
|
||||
endpoint.put("peer_cert_sha256", certSHA256.toLowerCase(Locale.US));
|
||||
}
|
||||
endpoints.put(endpoint);
|
||||
}
|
||||
String lower = candidate.toLowerCase(Locale.US);
|
||||
if ("http://vpn.cin.su:19191/api/v1".equals(lower)
|
||||
|| "http://vpn.cin.su/api/v1".equals(lower)
|
||||
|| "https://vpn.cin.su:443/api/v1".equals(lower)
|
||||
|| "http://94.141.118.222:19191/api/v1".equals(lower)
|
||||
|| "http://195.123.240.88:19131/api/v1".equals(lower)) {
|
||||
return DEFAULT_BACKEND_URL;
|
||||
if (endpoints.length() == 0) {
|
||||
throw new IllegalStateException("В клиенте нет bootstrap-узлов фермы.");
|
||||
}
|
||||
return candidate;
|
||||
}
|
||||
|
||||
private String selectedExitNodeId() {
|
||||
return "";
|
||||
}
|
||||
|
||||
private String normalizeSelectedExitNodeId(String value) {
|
||||
String candidate = value == null ? "" : value.trim();
|
||||
if (candidate.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
if (candidate.matches("^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")) {
|
||||
return candidate;
|
||||
}
|
||||
if (candidate.matches("^[A-Za-z0-9][A-Za-z0-9._-]{2,63}$")) {
|
||||
return candidate;
|
||||
}
|
||||
return "";
|
||||
JSONObject service = new JSONObject();
|
||||
service.put("schema_version", "rap.fabric_service_channel_request.v1");
|
||||
service.put("channel_id", "android-control");
|
||||
service.put("service_class", "identity_runtime");
|
||||
service.put("source_role", "vpn-client");
|
||||
JSONObject cfg = new JSONObject();
|
||||
cfg.put("cluster_id", DEFAULT_CLUSTER_ID);
|
||||
cfg.put("local_node_id", fabricNodeId());
|
||||
cfg.put("vpn_connection_id", "fabric-control");
|
||||
cfg.put("stream_shards", 1);
|
||||
cfg.put("service_channel_request", service);
|
||||
cfg.put("endpoints", endpoints);
|
||||
return cfg.toString();
|
||||
}
|
||||
|
||||
private RapApiClient.AuthContext authenticate(RapApiClient client) throws Exception {
|
||||
@@ -743,6 +743,44 @@ public class MainActivity extends Activity {
|
||||
return loggedIn;
|
||||
}
|
||||
|
||||
private RapApiClient.AuthContext authenticateWithPassword(RapApiClient client, String emailValue, String passwordValue) throws Exception {
|
||||
if (passwordValue == null || passwordValue.trim().isEmpty()) {
|
||||
throw new IllegalStateException("Введите пароль для идентификации устройства и выбора пула.");
|
||||
}
|
||||
RapApiClient.AuthContext loggedIn = client.login(emailValue.trim(), passwordValue.trim(), deviceFingerprint());
|
||||
saveAuthContext(loggedIn);
|
||||
return loggedIn;
|
||||
}
|
||||
|
||||
private void refreshSavedProfileForCurrentUser() throws Exception {
|
||||
String userId = prefs.getString(PREF_USER_ID, "");
|
||||
if (userId == null || userId.trim().isEmpty()) {
|
||||
throw new IllegalStateException("Устройство еще не привязано к пользователю.");
|
||||
}
|
||||
RapApiClient client = new RapApiClient(fabricControlConfig(), this);
|
||||
String refreshToken = savedRefreshToken();
|
||||
if (!refreshToken.isEmpty()) {
|
||||
authContext = client.refresh(refreshToken);
|
||||
saveAuthContext(authContext);
|
||||
userId = authContext.userId;
|
||||
}
|
||||
String activeOrganizationId = resolveOrganizationId(client, userId);
|
||||
String refreshedProfile = client.vpnClientProfile(
|
||||
clusterId.getText().toString(),
|
||||
activeOrganizationId,
|
||||
userId,
|
||||
""
|
||||
);
|
||||
if (!profileContainsConnection(refreshedProfile, vpnConnectionId)) {
|
||||
profileJson = refreshedProfile;
|
||||
vpnConnectionId = "";
|
||||
saveProfileState();
|
||||
throw new IllegalStateException("Администратор закрыл доступ к выбранному пулу или пул удален.");
|
||||
}
|
||||
profileJson = refreshedProfile;
|
||||
saveProfileState();
|
||||
}
|
||||
|
||||
private String resolveOrganizationId(RapApiClient client, String userId) throws Exception {
|
||||
JSONObject payload = client.organizations(userId);
|
||||
JSONArray organizations = payload.optJSONArray("organizations");
|
||||
@@ -850,6 +888,89 @@ public class MainActivity extends Activity {
|
||||
return generated;
|
||||
}
|
||||
|
||||
private String fabricNodeId() {
|
||||
String existing = prefs.getString(PREF_FABRIC_NODE_ID, "");
|
||||
if (existing != null && existing.matches("^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")) {
|
||||
return existing.toLowerCase(Locale.US);
|
||||
}
|
||||
String generated = java.util.UUID.randomUUID().toString();
|
||||
prefs.edit().putString(PREF_FABRIC_NODE_ID, generated).apply();
|
||||
return generated;
|
||||
}
|
||||
|
||||
private void registerCandidateNodeAsync(boolean showStatus) {
|
||||
new Thread(() -> {
|
||||
try {
|
||||
RapApiClient client = new RapApiClient(fabricControlConfig(), this);
|
||||
String nodeId = registerCandidateNode(client);
|
||||
sendCandidateHeartbeat(client, nodeId);
|
||||
if (showStatus) {
|
||||
runOnUiThread(() -> status.setText("Узел телефона виден ферме как кандидат: " + nodeId));
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
if (showStatus) {
|
||||
runOnUiThread(() -> status.setText("Узел телефона пока не зарегистрирован в ферме: " + friendlyError(ex)));
|
||||
}
|
||||
}
|
||||
}, "rap-fabric-candidate-register").start();
|
||||
}
|
||||
|
||||
private String registerCandidateNode(RapApiClient client) throws Exception {
|
||||
String nodeId = fabricNodeId();
|
||||
JSONObject metadata = new JSONObject();
|
||||
metadata.put("source", "android_vpn_client");
|
||||
metadata.put("candidate_access", true);
|
||||
metadata.put("fabric_transport", "quic");
|
||||
metadata.put("connectivity_mode", "outbound_only");
|
||||
metadata.put("app_version", APP_VERSION);
|
||||
metadata.put("device_fingerprint", deviceFingerprint());
|
||||
JSONObject payload = new JSONObject();
|
||||
payload.put("cluster_id", clusterId.getText().toString().trim().isEmpty() ? DEFAULT_CLUSTER_ID : clusterId.getText().toString().trim());
|
||||
payload.put("node_key", "android-vpn:" + deviceFingerprint());
|
||||
payload.put("name", "android-vpn-" + deviceFingerprint().replace("android-", "").substring(0, Math.min(8, deviceFingerprint().replace("android-", "").length())));
|
||||
payload.put("ownership_type", "customer_managed");
|
||||
payload.put("owner_organization_id", organizationId.getText().toString().trim().isEmpty() ? DEFAULT_ORGANIZATION_ID : organizationId.getText().toString().trim());
|
||||
payload.put("reported_version", APP_VERSION);
|
||||
payload.put("metadata", metadata);
|
||||
JSONObject response = client.registerFabricNode(payload);
|
||||
String registeredNodeId = response.optString("node_id", nodeId).trim();
|
||||
if (!registeredNodeId.isEmpty()) {
|
||||
prefs.edit().putString(PREF_FABRIC_NODE_ID, registeredNodeId).apply();
|
||||
return registeredNodeId;
|
||||
}
|
||||
return nodeId;
|
||||
}
|
||||
|
||||
private void sendCandidateHeartbeat(RapApiClient client, String nodeId) throws Exception {
|
||||
JSONObject capabilities = new JSONObject();
|
||||
capabilities.put("fabric_quic_node", true);
|
||||
capabilities.put("android_vpn_client", true);
|
||||
capabilities.put("candidate_access", true);
|
||||
capabilities.put("vpn_client", true);
|
||||
JSONObject serviceStates = new JSONObject();
|
||||
serviceStates.put("vpn-client", new JSONObject()
|
||||
.put("state", isSystemVpnActive() ? "running" : "candidate")
|
||||
.put("runtime", "android_vpnservice")
|
||||
.put("transport", "fabric_quic_route"));
|
||||
JSONObject metadata = new JSONObject();
|
||||
metadata.put("source", "android_vpn_client");
|
||||
metadata.put("candidate", true);
|
||||
metadata.put("passive", true);
|
||||
metadata.put("app_version", APP_VERSION);
|
||||
metadata.put("mesh_endpoint_report", new JSONObject()
|
||||
.put("schema_version", "rap.mesh_endpoint_report.v1")
|
||||
.put("transport", "quic")
|
||||
.put("connectivity_mode", "outbound_only")
|
||||
.put("endpoint_candidates", new JSONArray()));
|
||||
JSONObject payload = new JSONObject();
|
||||
payload.put("health_status", "healthy");
|
||||
payload.put("reported_version", APP_VERSION);
|
||||
payload.put("capabilities", capabilities);
|
||||
payload.put("service_states", serviceStates);
|
||||
payload.put("metadata", metadata);
|
||||
client.sendFabricNodeHeartbeat(clusterId.getText().toString().trim().isEmpty() ? DEFAULT_CLUSTER_ID : clusterId.getText().toString().trim(), nodeId, payload);
|
||||
}
|
||||
|
||||
private void showSettingsDialog() {
|
||||
LinearLayout form = new LinearLayout(this);
|
||||
form.setOrientation(LinearLayout.VERTICAL);
|
||||
@@ -877,17 +998,15 @@ public class MainActivity extends Activity {
|
||||
form.addView(showPassword);
|
||||
form.addView(forceFullTunnel);
|
||||
new AlertDialog.Builder(this)
|
||||
.setTitle("Аккаунт VPN")
|
||||
.setTitle("Настройка VPN")
|
||||
.setView(form)
|
||||
.setPositiveButton("Сохранить", (dialog, which) -> {
|
||||
.setPositiveButton("Войти и выбрать выход", (dialog, which) -> {
|
||||
email.setText(emailDraft.getText().toString());
|
||||
password.setText(passwordDraft.getText().toString());
|
||||
prefs.edit()
|
||||
.remove(PREF_SELECTED_EXIT_NODE_ID)
|
||||
.apply();
|
||||
String passwordValue = passwordDraft.getText().toString();
|
||||
password.setText("");
|
||||
prefs.edit().putBoolean(PREF_FORCE_FULL_TUNNEL, forceFullTunnel.isChecked()).apply();
|
||||
saveSettings();
|
||||
profileSummary.setText(summaryText());
|
||||
loginAndChoosePool(emailDraft.getText().toString(), passwordValue);
|
||||
})
|
||||
.setNeutralButton("Забыть устройство", (dialog, which) -> {
|
||||
clearSavedAuth(true);
|
||||
@@ -897,6 +1016,72 @@ public class MainActivity extends Activity {
|
||||
.show();
|
||||
}
|
||||
|
||||
private void loginAndChoosePool(String emailValue, String passwordValue) {
|
||||
status.setText("Идентифицирую устройство и загружаю доступные выходы...");
|
||||
new Thread(() -> {
|
||||
try {
|
||||
RapApiClient client = new RapApiClient(fabricControlConfig(), this);
|
||||
authContext = authenticateWithPassword(client, emailValue, passwordValue);
|
||||
String activeOrganizationId = resolveOrganizationId(client, authContext.userId);
|
||||
String loadedProfile = client.vpnClientProfile(
|
||||
clusterId.getText().toString(),
|
||||
activeOrganizationId,
|
||||
authContext.userId,
|
||||
""
|
||||
);
|
||||
runOnUiThread(() -> showPoolChoiceDialog(loadedProfile));
|
||||
} catch (Exception ex) {
|
||||
runOnUiThread(() -> {
|
||||
status.setText("Ошибка настройки: " + friendlyError(ex));
|
||||
if (friendlyError(ex).contains("пароль")) {
|
||||
clearSavedAuth(false);
|
||||
}
|
||||
});
|
||||
}
|
||||
}).start();
|
||||
}
|
||||
|
||||
private void showPoolChoiceDialog(String loadedProfile) {
|
||||
try {
|
||||
JSONObject root = new JSONObject(loadedProfile);
|
||||
JSONObject vpnProfile = root.optJSONObject("vpn_client_profile");
|
||||
JSONArray connections = vpnProfile == null ? null : vpnProfile.optJSONArray("connections");
|
||||
if (connections == null || connections.length() == 0) {
|
||||
throw new IllegalStateException("Для пользователя нет доступных выходных пулов.");
|
||||
}
|
||||
String[] labels = new String[connections.length()];
|
||||
String[] ids = new String[connections.length()];
|
||||
int selectedIndex = 0;
|
||||
for (int i = 0; i < connections.length(); i++) {
|
||||
JSONObject connection = connections.getJSONObject(i);
|
||||
ids[i] = connection.optString("id", "");
|
||||
String name = connection.optString("exit_pool_name", "").trim();
|
||||
if (name.isEmpty()) {
|
||||
name = connection.optString("name", "").trim();
|
||||
}
|
||||
labels[i] = name.isEmpty() ? "Выход " + (i + 1) : name;
|
||||
if (!vpnConnectionId.isEmpty() && vpnConnectionId.equals(ids[i])) {
|
||||
selectedIndex = i;
|
||||
}
|
||||
}
|
||||
int initialSelection = selectedIndex;
|
||||
new AlertDialog.Builder(this)
|
||||
.setTitle("Выходной пул")
|
||||
.setSingleChoiceItems(labels, initialSelection, (dialog, which) -> {
|
||||
profileJson = loadedProfile;
|
||||
vpnConnectionId = ids[which];
|
||||
saveProfileState();
|
||||
profileSummary.setText(summaryText());
|
||||
status.setText("Выбран выходной пул: " + labels[which]);
|
||||
dialog.dismiss();
|
||||
})
|
||||
.setNegativeButton("Отмена", null)
|
||||
.show();
|
||||
} catch (Exception ex) {
|
||||
status.setText("Ошибка выбора пула: " + friendlyError(ex));
|
||||
}
|
||||
}
|
||||
|
||||
private String friendlyError(Exception ex) {
|
||||
String message = ex.getMessage();
|
||||
if (message == null || message.trim().isEmpty()) {
|
||||
|
||||
@@ -4,7 +4,6 @@ import android.content.Context;
|
||||
import android.net.ConnectivityManager;
|
||||
import android.net.Network;
|
||||
import android.net.NetworkCapabilities;
|
||||
import android.net.VpnService;
|
||||
|
||||
import okhttp3.MediaType;
|
||||
import okhttp3.OkHttpClient;
|
||||
@@ -16,35 +15,28 @@ import okhttp3.RequestBody;
|
||||
import okhttp3.Response;
|
||||
import okhttp3.ResponseBody;
|
||||
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InterruptedIOException;
|
||||
import su.cin.rapvpn.fabric.fabricvpn.Fabricvpn;
|
||||
import su.cin.rapvpn.fabric.fabricvpn.Manager;
|
||||
|
||||
import java.net.InetAddress;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.Socket;
|
||||
import java.net.URI;
|
||||
import java.net.UnknownHostException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Collections;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import javax.net.SocketFactory;
|
||||
|
||||
final class RapApiClient {
|
||||
private static final MediaType JSON = MediaType.get("application/json; charset=utf-8");
|
||||
private static final MediaType OCTET_STREAM = MediaType.get("application/octet-stream");
|
||||
private static final int MAX_PACKET_BATCH_PACKETS = 512;
|
||||
private static final int MAX_PACKET_BATCH_BYTES = 512 * 1024;
|
||||
private static final int MAX_SINGLE_PACKET_BYTES = 65535;
|
||||
private static final int MAX_BATCH_HEADER_BYTES = 4;
|
||||
private final String baseUrl;
|
||||
private final OkHttpClient httpClient;
|
||||
private final String networkMode;
|
||||
private final FabricServiceChannel fabricServiceChannel;
|
||||
private final Manager fabricControlManager;
|
||||
|
||||
RapApiClient(String baseUrl) {
|
||||
this(baseUrl, (Context) null);
|
||||
@@ -52,7 +44,7 @@ final class RapApiClient {
|
||||
|
||||
RapApiClient(String baseUrl, Context context) {
|
||||
this.baseUrl = trimRight(baseUrl);
|
||||
this.fabricServiceChannel = new FabricServiceChannel();
|
||||
this.fabricControlManager = startFabricControlManager(baseUrl);
|
||||
OkHttpClient.Builder builder = new OkHttpClient.Builder();
|
||||
// Regular app and diagnostic requests should use Android's default
|
||||
// routing. Some devices reject binding app sockets to a specific
|
||||
@@ -74,7 +66,7 @@ final class RapApiClient {
|
||||
|
||||
RapApiClient(String baseUrl, Context context, boolean preferUnderlyingNetwork) {
|
||||
this.baseUrl = trimRight(baseUrl);
|
||||
this.fabricServiceChannel = new FabricServiceChannel();
|
||||
this.fabricControlManager = startFabricControlManager(baseUrl);
|
||||
OkHttpClient.Builder builder = new OkHttpClient.Builder();
|
||||
String mode = context == null ? "default_network" : "default_network_context";
|
||||
if (preferUnderlyingNetwork && context != null) {
|
||||
@@ -99,74 +91,27 @@ final class RapApiClient {
|
||||
this.httpClient = builder.build();
|
||||
}
|
||||
|
||||
/**
 * Creates a client for the given VPN service with a new
 * {@link FabricServiceChannel}; delegates to the three-argument constructor.
 *
 * @param baseUrl    backend base URL
 * @param vpnService VPN service handed to the delegate constructor; may be null
 */
RapApiClient(String baseUrl, VpnService vpnService) {
    this(baseUrl, vpnService, new FabricServiceChannel());
}
|
||||
|
||||
/**
 * Creates a client whose HTTP sockets come from a
 * {@link ProtectedSocketFactory} built on the given VPN service.
 *
 * With a VPN service the network mode is "protected_socket" and DNS goes
 * through {@link BackendPinnedDns}. Without one the defaults are kept and
 * the mode is "default_network". Timeouts are 3 s connect, 8 s read/write
 * and 10 s per call, with no retry on connection failure.
 *
 * @param baseUrl              backend base URL; stored after trimRight
 * @param vpnService           VPN service for the socket factory; may be null
 * @param fabricServiceChannel channel to use; a new one is created when null
 */
RapApiClient(String baseUrl, VpnService vpnService, FabricServiceChannel fabricServiceChannel) {
    this.baseUrl = trimRight(baseUrl);
    if (fabricServiceChannel == null) {
        this.fabricServiceChannel = new FabricServiceChannel();
    } else {
        this.fabricServiceChannel = fabricServiceChannel;
    }

    OkHttpClient.Builder builder = new OkHttpClient.Builder();
    if (vpnService == null) {
        this.networkMode = "default_network";
    } else {
        builder.socketFactory(new ProtectedSocketFactory(vpnService));
        builder.dns(new BackendPinnedDns(baseUrl));
        this.networkMode = "protected_socket";
    }

    Dispatcher dispatcher = new Dispatcher();
    dispatcher.setMaxRequests(64);
    dispatcher.setMaxRequestsPerHost(32);

    this.httpClient = builder
            .connectTimeout(3, TimeUnit.SECONDS)
            .writeTimeout(8, TimeUnit.SECONDS)
            .readTimeout(8, TimeUnit.SECONDS)
            .callTimeout(10, TimeUnit.SECONDS)
            .retryOnConnectionFailure(false)
            .dispatcher(dispatcher)
            .connectionPool(new ConnectionPool(16, 5, TimeUnit.MINUTES))
            .build();
}
|
||||
|
||||
RapApiClient(String baseUrl, Network network) {
|
||||
this.baseUrl = trimRight(baseUrl);
|
||||
this.fabricServiceChannel = new FabricServiceChannel();
|
||||
OkHttpClient.Builder builder = new OkHttpClient.Builder();
|
||||
if (network != null) {
|
||||
builder.socketFactory(network.getSocketFactory());
|
||||
builder.dns(hostname -> {
|
||||
InetAddress[] addresses = network.getAllByName(hostname);
|
||||
if (addresses == null || addresses.length == 0) {
|
||||
throw new UnknownHostException(hostname);
|
||||
}
|
||||
List<InetAddress> out = new ArrayList<>();
|
||||
Collections.addAll(out, addresses);
|
||||
return out;
|
||||
});
|
||||
this.networkMode = "vpn_network";
|
||||
} else {
|
||||
builder.dns(new BackendPinnedDns(baseUrl));
|
||||
this.networkMode = "default_network";
|
||||
}
|
||||
builder.connectTimeout(5, TimeUnit.SECONDS);
|
||||
builder.writeTimeout(12, TimeUnit.SECONDS);
|
||||
builder.readTimeout(12, TimeUnit.SECONDS);
|
||||
builder.callTimeout(15, TimeUnit.SECONDS);
|
||||
builder.retryOnConnectionFailure(true);
|
||||
Dispatcher dispatcher = new Dispatcher();
|
||||
dispatcher.setMaxRequests(64);
|
||||
dispatcher.setMaxRequestsPerHost(32);
|
||||
builder.dispatcher(dispatcher);
|
||||
builder.connectionPool(new ConnectionPool(16, 5, TimeUnit.MINUTES));
|
||||
this.httpClient = builder.build();
|
||||
}
|
||||
|
||||
String networkMode() {
|
||||
return networkMode;
|
||||
}
|
||||
|
||||
private Manager startFabricControlManager(String config) {
|
||||
String value = config == null ? "" : config.trim();
|
||||
if (!value.startsWith("{")) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
Fabricvpn.touch();
|
||||
Manager manager = Fabricvpn.newManager();
|
||||
manager.start(value);
|
||||
return manager;
|
||||
} catch (Exception e) {
|
||||
String detail = e.getMessage() == null ? e.getClass().getSimpleName() : e.getMessage();
|
||||
throw new IllegalStateException("Не удалось подключиться к ферме через QUIC bootstrap. Последняя ошибка: " + detail, e);
|
||||
}
|
||||
}
|
||||
|
||||
static final class BackendPinnedDns implements Dns {
|
||||
private static final String VPN_PUBLIC_HOST = "vpn.cin.su";
|
||||
private static final String VPN_PUBLIC_IPV4 = "94.141.118.222";
|
||||
private final String backendHost;
|
||||
|
||||
BackendPinnedDns(String baseUrl) {
|
||||
@@ -180,10 +125,6 @@ final class RapApiClient {
|
||||
|
||||
@Override
|
||||
public List<InetAddress> lookup(String hostname) throws UnknownHostException {
|
||||
String host = hostname == null ? "" : hostname.trim().toLowerCase();
|
||||
if (!backendHost.isEmpty() && host.equals(backendHost) && VPN_PUBLIC_HOST.equals(host)) {
|
||||
return Collections.singletonList(InetAddress.getByName(VPN_PUBLIC_IPV4));
|
||||
}
|
||||
return Dns.SYSTEM.lookup(hostname);
|
||||
}
|
||||
}
|
||||
@@ -243,103 +184,26 @@ final class RapApiClient {
|
||||
return get(path);
|
||||
}
|
||||
|
||||
JSONObject startSession(String resourceId, String userId, String deviceId) throws Exception {
|
||||
JSONObject body = new JSONObject();
|
||||
body.put("resource_id", resourceId);
|
||||
body.put("user_id", userId);
|
||||
body.put("device_id", deviceId);
|
||||
return post("/sessions/", body);
|
||||
JSONObject registerFabricNode(JSONObject payload) throws Exception {
|
||||
return post("/node-agents/register", payload);
|
||||
}
|
||||
|
||||
JSONObject reportVPNDiagnosticStatus(String clusterId, String deviceId, JSONObject payload) throws Exception {
|
||||
return post("/clusters/" + clusterId + "/vpn/client-diagnostics/" + deviceId + "/status", payload);
|
||||
}
|
||||
|
||||
JSONObject nextVPNDiagnosticCommand(String clusterId, String deviceId, int timeoutMs) throws Exception {
|
||||
byte[] payload = getBytes("/clusters/" + clusterId + "/vpn/client-diagnostics/" + deviceId + "/commands?timeout_ms=" + timeoutMs);
|
||||
if (payload.length == 0) {
|
||||
return null;
|
||||
}
|
||||
return new JSONObject(new String(payload, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
JSONObject vpnPacketStats(String clusterId, String vpnConnectionId) throws Exception {
|
||||
return get("/clusters/" + clusterId + "/vpn-connections/" + vpnConnectionId + "/tunnel/stats");
|
||||
}
|
||||
|
||||
JSONObject resetVPNPacketQueues(String clusterId, String vpnConnectionId) throws Exception {
|
||||
return post("/clusters/" + clusterId + "/vpn-connections/" + vpnConnectionId + "/tunnel/reset", new JSONObject());
|
||||
}
|
||||
|
||||
void sendClientPacket(String clusterId, String vpnConnectionId, byte[] packet, int length) throws Exception {
|
||||
postBytes(clientPacketPath(clusterId, vpnConnectionId, ""), packet, length);
|
||||
}
|
||||
|
||||
void sendClientPacketBatch(String clusterId, String vpnConnectionId, List<byte[]> packets) throws Exception {
|
||||
if (packets == null || packets.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
List<List<byte[]>> chunks = chunkPacketsForBatch(packets);
|
||||
if (chunks.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
for (List<byte[]> chunk : chunks) {
|
||||
postBytes(clientPacketPath(clusterId, vpnConnectionId, "?batch=true"), encodePacketBatch(chunk));
|
||||
}
|
||||
}
|
||||
|
||||
byte[] receiveClientPacket(String clusterId, String vpnConnectionId, int timeoutMs) throws Exception {
|
||||
try {
|
||||
return getBytes(clientPacketPath(clusterId, vpnConnectionId, "?timeout_ms=" + timeoutMs));
|
||||
} catch (InterruptedIOException e) {
|
||||
return new byte[0];
|
||||
} catch (IOException e) {
|
||||
if (e.getMessage() != null && e.getMessage().toLowerCase().contains("timeout")) {
|
||||
return new byte[0];
|
||||
}
|
||||
throw e;
|
||||
} catch (IllegalStateException e) {
|
||||
String message = e.getMessage();
|
||||
if (message != null && (message.contains("HTTP 502") || message.contains("HTTP 503") || message.contains("HTTP 504"))) {
|
||||
return new byte[0];
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
List<byte[]> receiveClientPacketBatch(String clusterId, String vpnConnectionId, int timeoutMs) throws Exception {
|
||||
byte[] payload;
|
||||
try {
|
||||
payload = getBytes(clientPacketPath(clusterId, vpnConnectionId, "?batch=true&timeout_ms=" + timeoutMs));
|
||||
if (payload == null || payload.length == 0) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
if (!isLikelyPacketBatch(payload)) {
|
||||
return receiveSinglePacketAsBatch(clusterId, vpnConnectionId, timeoutMs);
|
||||
}
|
||||
return decodePacketBatch(payload);
|
||||
} catch (InterruptedIOException e) {
|
||||
return new ArrayList<>();
|
||||
} catch (IOException e) {
|
||||
if (e.getMessage() != null && e.getMessage().toLowerCase().contains("timeout")) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
throw e;
|
||||
} catch (IllegalStateException e) {
|
||||
String message = e.getMessage();
|
||||
if (message != null && (message.contains("HTTP 502") || message.contains("HTTP 503") || message.contains("HTTP 504"))) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
JSONObject sendFabricNodeHeartbeat(String clusterId, String nodeId, JSONObject payload) throws Exception {
|
||||
return post("/clusters/" + clusterId + "/nodes/" + nodeId + "/heartbeats", payload);
|
||||
}
|
||||
|
||||
private JSONObject get(String path) throws Exception {
|
||||
if (fabricControlManager != null) {
|
||||
return fabricControlJSON("GET", path, null);
|
||||
}
|
||||
Request request = new Request.Builder().url(baseUrl + path).get().build();
|
||||
return read(request);
|
||||
}
|
||||
|
||||
private JSONObject post(String path, JSONObject body) throws Exception {
|
||||
if (fabricControlManager != null) {
|
||||
return fabricControlJSON("POST", path, body);
|
||||
}
|
||||
Request request = new Request.Builder()
|
||||
.url(baseUrl + path)
|
||||
.post(RequestBody.create(body.toString().getBytes(StandardCharsets.UTF_8), JSON))
|
||||
@@ -347,39 +211,60 @@ final class RapApiClient {
|
||||
return read(request);
|
||||
}
|
||||
|
||||
private byte[] getBytes(String path) throws Exception {
|
||||
Request.Builder builder = new Request.Builder().url(baseUrl + path).get();
|
||||
applyFabricHeadersIfNeeded(builder, path);
|
||||
Request request = builder.build();
|
||||
try (Response response = httpClient.newCall(request).execute()) {
|
||||
if (response.code() == 204) {
|
||||
return new byte[0];
|
||||
}
|
||||
if (!response.isSuccessful()) {
|
||||
throw new IllegalStateException(describeHttpFailure(response));
|
||||
}
|
||||
ResponseBody body = response.body();
|
||||
return body == null ? new byte[0] : body.bytes();
|
||||
private JSONObject fabricControlJSON(String method, String path, JSONObject body) throws Exception {
|
||||
byte[] payload = fabricControlBodyBytes(method, path, body);
|
||||
if (payload.length == 0) {
|
||||
return new JSONObject();
|
||||
}
|
||||
return new JSONObject(new String(payload, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
private void postBytes(String path, byte[] packet, int length) throws Exception {
|
||||
byte[] bodyBytes = new byte[length];
|
||||
System.arraycopy(packet, 0, bodyBytes, 0, length);
|
||||
postBytes(path, bodyBytes);
|
||||
private byte[] fabricControlBodyBytes(String method, String path, JSONObject body) throws Exception {
|
||||
JSONObject request = new JSONObject();
|
||||
request.put("method", method);
|
||||
request.put("path", path);
|
||||
if (body != null) {
|
||||
request.put("body", body);
|
||||
}
|
||||
String raw;
|
||||
try {
|
||||
raw = fabricControlManager.controlRequest(request.toString());
|
||||
} catch (Exception e) {
|
||||
throw new IllegalStateException("Ферма сейчас не смогла выполнить контрольный запрос. Попробуйте еще раз.", e);
|
||||
}
|
||||
JSONObject wrapper = raw == null || raw.trim().isEmpty() ? new JSONObject() : new JSONObject(raw);
|
||||
int statusCode = wrapper.optInt("status_code", 200);
|
||||
Object bodyValue = wrapper.opt("body");
|
||||
String bodyText = jsonBodyText(bodyValue);
|
||||
if (statusCode < 200 || statusCode >= 300) {
|
||||
if (statusCode == 401 && bodyText.contains("auth.invalid_credentials")) {
|
||||
throw new IllegalStateException("Неверный логин или пароль.");
|
||||
}
|
||||
if (statusCode == 401 && bodyText.contains("auth.invalid_refresh_token")) {
|
||||
throw new IllegalStateException("Сессия устройства истекла. Введите пароль один раз.");
|
||||
}
|
||||
throw new IllegalStateException("fabric control HTTP " + statusCode + ": " + compactText(bodyText, 240));
|
||||
}
|
||||
return bodyText.getBytes(java.nio.charset.StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
private void postBytes(String path, byte[] bodyBytes) throws Exception {
|
||||
Request.Builder builder = new Request.Builder()
|
||||
.url(baseUrl + path)
|
||||
.post(RequestBody.create(bodyBytes, OCTET_STREAM));
|
||||
applyFabricHeadersIfNeeded(builder, path);
|
||||
Request request = builder.build();
|
||||
try (Response response = httpClient.newCall(request).execute()) {
|
||||
if (!response.isSuccessful()) {
|
||||
throw new IllegalStateException(describeHttpFailure(response));
|
||||
}
|
||||
private String jsonBodyText(Object bodyValue) {
|
||||
if (bodyValue == null || JSONObject.NULL.equals(bodyValue)) {
|
||||
return "";
|
||||
}
|
||||
if (bodyValue instanceof JSONObject || bodyValue instanceof JSONArray) {
|
||||
return bodyValue.toString();
|
||||
}
|
||||
String text = String.valueOf(bodyValue);
|
||||
return text == null ? "" : text;
|
||||
}
|
||||
|
||||
private String compactText(String text, int limit) {
|
||||
String value = text == null ? "" : text.replace('\n', ' ').replace('\r', ' ').trim();
|
||||
if (value.length() > limit) {
|
||||
return value.substring(0, limit);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
private String describeHttpFailure(Response response) {
|
||||
@@ -401,45 +286,6 @@ final class RapApiClient {
|
||||
return message.toString();
|
||||
}
|
||||
|
||||
private String clientPacketPath(String clusterId, String vpnConnectionId, String suffix) throws IOException {
|
||||
String path = fabricServiceChannel.packetPathForBase(baseUrl, clusterId, vpnConnectionId, false);
|
||||
if (path.isEmpty()) {
|
||||
throw new IOException("fabric service channel lease required for VPN packet dataplane");
|
||||
}
|
||||
return path + (suffix == null ? "" : suffix);
|
||||
}
|
||||
|
||||
private void applyFabricHeadersIfNeeded(Request.Builder builder, String path) {
|
||||
if (path != null && path.contains("/fabric/service-channels/")) {
|
||||
fabricServiceChannel.applyHeaders(builder);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] encodePacketBatch(List<byte[]> packets) {
|
||||
int total = 0;
|
||||
for (byte[] packet : packets) {
|
||||
if (packet != null && packet.length > 0) {
|
||||
total += 4 + packet.length;
|
||||
}
|
||||
}
|
||||
byte[] out = new byte[total];
|
||||
int offset = 0;
|
||||
for (byte[] packet : packets) {
|
||||
if (packet == null || packet.length == 0) {
|
||||
continue;
|
||||
}
|
||||
int length = packet.length;
|
||||
out[offset] = (byte) ((length >> 24) & 0xff);
|
||||
out[offset + 1] = (byte) ((length >> 16) & 0xff);
|
||||
out[offset + 2] = (byte) ((length >> 8) & 0xff);
|
||||
out[offset + 3] = (byte) (length & 0xff);
|
||||
offset += 4;
|
||||
System.arraycopy(packet, 0, out, offset, length);
|
||||
offset += length;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
private JSONObject read(Request request) throws Exception {
|
||||
try (Response response = httpClient.newCall(request).execute()) {
|
||||
ResponseBody body = response.body();
|
||||
@@ -457,93 +303,6 @@ final class RapApiClient {
|
||||
}
|
||||
}
|
||||
|
||||
private List<byte[]> decodePacketBatch(byte[] payload) {
|
||||
List<byte[]> packets = new ArrayList<>();
|
||||
int offset = 0;
|
||||
while (payload != null && offset + 4 <= payload.length) {
|
||||
int length = ((payload[offset] & 0xff) << 24)
|
||||
| ((payload[offset + 1] & 0xff) << 16)
|
||||
| ((payload[offset + 2] & 0xff) << 8)
|
||||
| (payload[offset + 3] & 0xff);
|
||||
offset += 4;
|
||||
if (length <= 0 || offset + length > payload.length) {
|
||||
break;
|
||||
}
|
||||
byte[] packet = new byte[length];
|
||||
System.arraycopy(payload, offset, packet, 0, length);
|
||||
packets.add(packet);
|
||||
offset += length;
|
||||
}
|
||||
return packets;
|
||||
}
|
||||
|
||||
private List<List<byte[]>> chunkPacketsForBatch(List<byte[]> packets) {
|
||||
List<List<byte[]>> chunks = new ArrayList<>();
|
||||
List<byte[]> current = new ArrayList<>();
|
||||
int currentBytes = 0;
|
||||
boolean hasData = false;
|
||||
for (byte[] packet : packets) {
|
||||
if (packet == null || packet.length == 0) {
|
||||
continue;
|
||||
}
|
||||
if (packet.length > MAX_SINGLE_PACKET_BYTES) {
|
||||
continue;
|
||||
}
|
||||
hasData = true;
|
||||
|
||||
int projected = currentBytes + MAX_BATCH_HEADER_BYTES + packet.length;
|
||||
if (!current.isEmpty() && (current.size() >= MAX_PACKET_BATCH_PACKETS || projected > MAX_PACKET_BATCH_BYTES)) {
|
||||
chunks.add(current);
|
||||
current = new ArrayList<>();
|
||||
currentBytes = 0;
|
||||
}
|
||||
current.add(packet);
|
||||
currentBytes = projected;
|
||||
}
|
||||
if (!hasData) {
|
||||
return chunks;
|
||||
}
|
||||
if (!current.isEmpty()) {
|
||||
chunks.add(current);
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
private boolean isLikelyPacketBatch(byte[] payload) {
|
||||
if (payload == null || payload.length < MAX_BATCH_HEADER_BYTES) {
|
||||
return false;
|
||||
}
|
||||
int offset = 0;
|
||||
int consumed = 0;
|
||||
while (offset + MAX_BATCH_HEADER_BYTES <= payload.length) {
|
||||
int length = ((payload[offset] & 0xff) << 24)
|
||||
| ((payload[offset + 1] & 0xff) << 16)
|
||||
| ((payload[offset + 2] & 0xff) << 8)
|
||||
| (payload[offset + 3] & 0xff);
|
||||
offset += MAX_BATCH_HEADER_BYTES;
|
||||
if (length <= 0 || length > MAX_SINGLE_PACKET_BYTES) {
|
||||
return false;
|
||||
}
|
||||
if (offset + length > payload.length) {
|
||||
return false;
|
||||
}
|
||||
offset += length;
|
||||
consumed++;
|
||||
if (consumed > MAX_PACKET_BATCH_PACKETS) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return offset == payload.length && consumed > 0;
|
||||
}
|
||||
|
||||
private List<byte[]> receiveSinglePacketAsBatch(String clusterId, String vpnConnectionId, int timeoutMs) throws Exception {
|
||||
byte[] payload = receiveClientPacket(clusterId, vpnConnectionId, timeoutMs);
|
||||
if (payload == null || payload.length == 0) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
return new ArrayList<>(Collections.singletonList(payload));
|
||||
}
|
||||
|
||||
private AuthContext parseAuthContext(JSONObject response) throws Exception {
|
||||
JSONObject user = response.getJSONObject("user");
|
||||
String userId = user.optString("id", "");
|
||||
@@ -570,65 +329,6 @@ final class RapApiClient {
|
||||
return value;
|
||||
}
|
||||
|
||||
static final class ProtectedSocketFactory extends SocketFactory {
|
||||
private final SocketFactory delegate = SocketFactory.getDefault();
|
||||
private final VpnService vpnService;
|
||||
|
||||
ProtectedSocketFactory(VpnService vpnService) {
|
||||
this.vpnService = vpnService;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Socket createSocket() throws IOException {
|
||||
Socket socket = delegate.createSocket();
|
||||
socket.bind(null);
|
||||
return protect(socket);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Socket createSocket(String host, int port) throws IOException {
|
||||
Socket socket = createSocket();
|
||||
socket.connect(new InetSocketAddress(host, port));
|
||||
return socket;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Socket createSocket(String host, int port, InetAddress localHost, int localPort) throws IOException {
|
||||
Socket socket = delegate.createSocket();
|
||||
socket.bind(new InetSocketAddress(localHost, localPort));
|
||||
protect(socket);
|
||||
socket.connect(new InetSocketAddress(host, port));
|
||||
return socket;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Socket createSocket(InetAddress host, int port) throws IOException {
|
||||
Socket socket = createSocket();
|
||||
socket.connect(new InetSocketAddress(host, port));
|
||||
return socket;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort) throws IOException {
|
||||
Socket socket = delegate.createSocket();
|
||||
socket.bind(new InetSocketAddress(localAddress, localPort));
|
||||
protect(socket);
|
||||
socket.connect(new InetSocketAddress(address, port));
|
||||
return socket;
|
||||
}
|
||||
|
||||
private Socket protect(Socket socket) throws IOException {
|
||||
if (!vpnService.protect(socket)) {
|
||||
try {
|
||||
socket.close();
|
||||
} catch (IOException ignored) {
|
||||
}
|
||||
throw new IOException("protect control-plane socket failed");
|
||||
}
|
||||
return socket;
|
||||
}
|
||||
}
|
||||
|
||||
static final class AuthContext {
|
||||
final String userId;
|
||||
final String deviceId;
|
||||
|
||||
@@ -10,7 +10,6 @@ import android.os.Build;
|
||||
public final class RapAutostartReceiver extends BroadcastReceiver {
|
||||
private static final String PREFS = "rap-vpn";
|
||||
private static final String PREF_PROFILE_JSON = "profile_json";
|
||||
private static final String PREF_BACKEND_URL = "backend_url";
|
||||
private static final String PREF_CLUSTER_ID = "cluster_id";
|
||||
private static final String PREF_VPN_CONNECTION_ID = "vpn_connection_id";
|
||||
private static final String PREF_MANUAL_STOPPED = "manual_stopped";
|
||||
@@ -25,21 +24,18 @@ public final class RapAutostartReceiver extends BroadcastReceiver {
|
||||
&& !Intent.ACTION_BOOT_COMPLETED.equals(action)) {
|
||||
return;
|
||||
}
|
||||
RapDiagnosticService.start(context);
|
||||
SharedPreferences prefs = context.getSharedPreferences(PREFS, Context.MODE_PRIVATE);
|
||||
if (prefs.getBoolean(PREF_MANUAL_STOPPED, false)) {
|
||||
return;
|
||||
}
|
||||
if (Intent.ACTION_MY_PACKAGE_REPLACED.equals(action)) {
|
||||
// Diagnostic service owns post-upgrade VPN restart. Starting both services from
|
||||
// MY_PACKAGE_REPLACED can race foreground-service startup and leave diagnostics stale.
|
||||
// After package replacement we wait for an explicit user action or runtime resume.
|
||||
return;
|
||||
}
|
||||
String profile = prefs.getString(PREF_PROFILE_JSON, "");
|
||||
String backendUrl = prefs.getString(PREF_BACKEND_URL, "");
|
||||
String clusterId = prefs.getString(PREF_CLUSTER_ID, "");
|
||||
String vpnConnectionId = prefs.getString(PREF_VPN_CONNECTION_ID, "");
|
||||
if (profile.isEmpty() || backendUrl.isEmpty() || clusterId.isEmpty() || vpnConnectionId.isEmpty()) {
|
||||
if (profile.isEmpty() || clusterId.isEmpty() || vpnConnectionId.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
if (VpnService.prepare(context) != null) {
|
||||
@@ -47,7 +43,6 @@ public final class RapAutostartReceiver extends BroadcastReceiver {
|
||||
}
|
||||
Intent service = new Intent(context, RapVpnService.class);
|
||||
service.putExtra("profile_json", profile);
|
||||
service.putExtra("backend_url", backendUrl);
|
||||
service.putExtra("cluster_id", clusterId);
|
||||
service.putExtra("vpn_connection_id", vpnConnectionId);
|
||||
if (Build.VERSION.SDK_INT >= 26) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -54,7 +54,7 @@ public class TestTrafficActivity extends Activity {
|
||||
setContentView(layout);
|
||||
String url = getIntent().getStringExtra(EXTRA_URL);
|
||||
if (url == null || url.isEmpty()) {
|
||||
url = "http://192.168.200.61:18080/";
|
||||
url = "http://example.com/";
|
||||
}
|
||||
target = url;
|
||||
assetErrorCount = 0;
|
||||
|
||||
@@ -11,7 +11,7 @@ import java.nio.charset.StandardCharsets;
|
||||
public class TestVpnActivity extends Activity {
|
||||
public static final String EXTRA_PROFILE_JSON = "profile_json";
|
||||
public static final String EXTRA_PROFILE_BASE64 = "profile_base64";
|
||||
public static final String EXTRA_BACKEND_URL = "backend_url";
|
||||
public static final String EXTRA_FABRIC_BOOTSTRAP_CONFIG = "fabric_bootstrap_config";
|
||||
public static final String EXTRA_CLUSTER_ID = "cluster_id";
|
||||
public static final String EXTRA_VPN_CONNECTION_ID = "vpn_connection_id";
|
||||
private static final int VPN_PREPARE_REQUEST = 77;
|
||||
@@ -44,7 +44,10 @@ public class TestVpnActivity extends Activity {
|
||||
private Intent buildServiceIntent(Intent source) {
|
||||
Intent intent = new Intent(this, RapVpnService.class);
|
||||
intent.putExtra(RapVpnService.EXTRA_PROFILE_JSON, profileJson(source));
|
||||
intent.putExtra(RapVpnService.EXTRA_BACKEND_URL, source.getStringExtra(EXTRA_BACKEND_URL));
|
||||
String fabricBootstrapConfig = source.getStringExtra(EXTRA_FABRIC_BOOTSTRAP_CONFIG);
|
||||
if (fabricBootstrapConfig != null && !fabricBootstrapConfig.isEmpty()) {
|
||||
intent.putExtra(RapVpnService.EXTRA_FABRIC_BOOTSTRAP_CONFIG, fabricBootstrapConfig);
|
||||
}
|
||||
intent.putExtra(RapVpnService.EXTRA_CLUSTER_ID, source.getStringExtra(EXTRA_CLUSTER_ID));
|
||||
intent.putExtra(RapVpnService.EXTRA_VPN_CONNECTION_ID, source.getStringExtra(EXTRA_VPN_CONNECTION_ID));
|
||||
return intent;
|
||||
|
||||
@@ -1,393 +0,0 @@
|
||||
package su.cin.rapvpn;
|
||||
|
||||
import android.net.VpnService;
|
||||
import android.util.Log;
|
||||
|
||||
import java.net.URI;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import okhttp3.ConnectionPool;
|
||||
import okhttp3.Dispatcher;
|
||||
import okhttp3.OkHttpClient;
|
||||
import okhttp3.Request;
|
||||
import okhttp3.Response;
|
||||
import okhttp3.WebSocket;
|
||||
import okhttp3.WebSocketListener;
|
||||
import okio.ByteString;
|
||||
|
||||
final class VpnPacketWebSocketRelay {
|
||||
private static final String TAG = "RapVpnWebSocketRelay";
|
||||
private static final int MAX_PACKET_BATCH_PACKETS = 512;
|
||||
private static final int MAX_PACKET_BATCH_BYTES = 1024 * 1024;
|
||||
private static final int MAX_SINGLE_PACKET_BYTES = 65535;
|
||||
private static final long CONNECTING_STALE_MS = 8000;
|
||||
private static final long OPEN_WAIT_MS = 3500;
|
||||
private static final int PRIORITY_GRACE_MS = 2;
|
||||
|
||||
private final String baseUrl;
|
||||
private final VpnService vpnService;
|
||||
private final OkHttpClient httpClient;
|
||||
private final FabricServiceChannel fabricServiceChannel;
|
||||
private final BlockingQueue<List<byte[]>> priorityIncoming = new ArrayBlockingQueue<>(512);
|
||||
private final BlockingQueue<List<byte[]>> incoming = new ArrayBlockingQueue<>(2048);
|
||||
private final Object lock = new Object();
|
||||
|
||||
private WebSocket webSocket;
|
||||
private String connectedClusterId = "";
|
||||
private String connectedVpnConnectionId = "";
|
||||
private volatile boolean open;
|
||||
private volatile boolean connecting;
|
||||
private volatile long connectingSinceMs;
|
||||
private volatile long reconnectAfterMs;
|
||||
private volatile String lastError = "";
|
||||
|
||||
/**
 * Creates a relay with a freshly constructed {@code FabricServiceChannel};
 * delegates to the three-argument constructor.
 */
VpnPacketWebSocketRelay(String baseUrl, VpnService vpnService) {
|
||||
this(baseUrl, vpnService, new FabricServiceChannel());
|
||||
}
|
||||
|
||||
/**
 * Creates a relay and its dedicated OkHttp client.
 *
 * @param baseUrl              backend base URL; stored after {@code trimRight}
 * @param vpnService           when non-null, sockets are created through a
 *                             {@code ProtectedSocketFactory} for this service
 * @param fabricServiceChannel channel used for packet paths and headers;
 *                             a new one is created when null
 */
VpnPacketWebSocketRelay(String baseUrl, VpnService vpnService, FabricServiceChannel fabricServiceChannel) {
    this.baseUrl = trimRight(baseUrl);
    this.vpnService = vpnService;
    if (fabricServiceChannel != null) {
        this.fabricServiceChannel = fabricServiceChannel;
    } else {
        this.fabricServiceChannel = new FabricServiceChannel();
    }

    OkHttpClient.Builder clientBuilder = new OkHttpClient.Builder();
    if (vpnService != null) {
        clientBuilder.socketFactory(new RapApiClient.ProtectedSocketFactory(vpnService));
    }

    Dispatcher socketDispatcher = new Dispatcher();
    socketDispatcher.setMaxRequests(16);
    socketDispatcher.setMaxRequestsPerHost(8);

    this.httpClient = clientBuilder
            .dns(new RapApiClient.BackendPinnedDns(baseUrl))
            .connectTimeout(5, TimeUnit.SECONDS)
            .writeTimeout(10, TimeUnit.SECONDS)
            // Read timeout is set to 0 for the websocket client.
            .readTimeout(0, TimeUnit.SECONDS)
            .retryOnConnectionFailure(true)
            .dispatcher(socketDispatcher)
            .connectionPool(new ConnectionPool(8, 5, TimeUnit.MINUTES))
            .build();
}
|
||||
|
||||
/** Returns the base URL stored by the constructor (after {@code trimRight}). */
String baseUrl() {
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
/** Returns the current value of the volatile {@code open} flag, without taking the lock. */
boolean isOpen() {
|
||||
return open;
|
||||
}
|
||||
|
||||
String lastError() {
|
||||
return lastError == null ? "" : lastError;
|
||||
}
|
||||
|
||||
void connect(String clusterId, String vpnConnectionId) {
|
||||
if (clusterId == null || clusterId.isEmpty() || vpnConnectionId == null || vpnConnectionId.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
long now = System.currentTimeMillis();
|
||||
synchronized (lock) {
|
||||
if (open && clusterId.equals(connectedClusterId) && vpnConnectionId.equals(connectedVpnConnectionId)) {
|
||||
return;
|
||||
}
|
||||
if (connecting && clusterId.equals(connectedClusterId) && vpnConnectionId.equals(connectedVpnConnectionId)) {
|
||||
if (now - connectingSinceMs < CONNECTING_STALE_MS) {
|
||||
return;
|
||||
}
|
||||
lastError = "stale websocket connect";
|
||||
closeLocked();
|
||||
}
|
||||
if (now < reconnectAfterMs) {
|
||||
return;
|
||||
}
|
||||
closeLocked();
|
||||
String wsUrl = webSocketUrl(clusterId, vpnConnectionId);
|
||||
if (wsUrl.isEmpty()) {
|
||||
lastError = "invalid websocket url";
|
||||
reconnectAfterMs = now + 5000;
|
||||
return;
|
||||
}
|
||||
connectedClusterId = clusterId;
|
||||
connectedVpnConnectionId = vpnConnectionId;
|
||||
connecting = true;
|
||||
connectingSinceMs = now;
|
||||
Request.Builder requestBuilder = new Request.Builder().url(wsUrl);
|
||||
this.fabricServiceChannel.applyHeaders(requestBuilder);
|
||||
Request request = requestBuilder.build();
|
||||
lastError = "connecting";
|
||||
webSocket = httpClient.newWebSocket(request, new Listener());
|
||||
}
|
||||
}
|
||||
|
||||
boolean sendClientPacketBatch(String clusterId, String vpnConnectionId, List<byte[]> packets) {
|
||||
packets = cleanPacketBatch(packets);
|
||||
if (packets.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
connect(clusterId, vpnConnectionId);
|
||||
if (!awaitOpen(OPEN_WAIT_MS)) {
|
||||
return false;
|
||||
}
|
||||
WebSocket socket = webSocket;
|
||||
if (socket == null) {
|
||||
lastError = "websocket missing after open";
|
||||
return false;
|
||||
}
|
||||
byte[] payload = encodePacketBatch(packets);
|
||||
if (payload.length == 0) {
|
||||
return true;
|
||||
}
|
||||
boolean queued = socket.send(ByteString.of(payload));
|
||||
if (!queued) {
|
||||
lastError = "websocket send queue rejected batch";
|
||||
synchronized (lock) {
|
||||
if (socket == webSocket) {
|
||||
reconnectAfterMs = 0;
|
||||
closeLocked();
|
||||
}
|
||||
}
|
||||
}
|
||||
return queued;
|
||||
}
|
||||
|
||||
List<byte[]> receiveClientPacketBatch(String clusterId, String vpnConnectionId, int timeoutMs) throws InterruptedException {
|
||||
connect(clusterId, vpnConnectionId);
|
||||
awaitOpen(Math.min(OPEN_WAIT_MS, Math.max(1, timeoutMs)));
|
||||
int waitMs = Math.max(1, timeoutMs);
|
||||
List<byte[]> packets = priorityIncoming.poll();
|
||||
if (packets != null) {
|
||||
return packets;
|
||||
}
|
||||
packets = priorityIncoming.poll(Math.min(PRIORITY_GRACE_MS, waitMs), TimeUnit.MILLISECONDS);
|
||||
if (packets != null) {
|
||||
return packets;
|
||||
}
|
||||
packets = incoming.poll();
|
||||
if (packets != null) {
|
||||
return packets;
|
||||
}
|
||||
packets = priorityIncoming.poll();
|
||||
if (packets != null) {
|
||||
return packets;
|
||||
}
|
||||
packets = incoming.poll(Math.max(1, waitMs - PRIORITY_GRACE_MS), TimeUnit.MILLISECONDS);
|
||||
return packets == null ? new ArrayList<>() : packets;
|
||||
}
|
||||
|
||||
/** Closes the relay by running {@code closeLocked()} while holding {@code lock}. */
void close() {
|
||||
synchronized (lock) {
|
||||
closeLocked();
|
||||
}
|
||||
}
|
||||
|
||||
private void closeLocked() {
|
||||
open = false;
|
||||
connecting = false;
|
||||
connectingSinceMs = 0;
|
||||
priorityIncoming.clear();
|
||||
incoming.clear();
|
||||
if (webSocket != null) {
|
||||
try {
|
||||
webSocket.close(1000, "relay switch");
|
||||
} catch (Exception ignored) {
|
||||
}
|
||||
}
|
||||
webSocket = null;
|
||||
}
|
||||
|
||||
private boolean awaitOpen(long timeoutMs) {
|
||||
long deadline = System.currentTimeMillis() + Math.max(1, timeoutMs);
|
||||
synchronized (lock) {
|
||||
while (!open && connecting) {
|
||||
long waitMs = deadline - System.currentTimeMillis();
|
||||
if (waitMs <= 0) {
|
||||
break;
|
||||
}
|
||||
try {
|
||||
lock.wait(waitMs);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
lastError = "interrupted waiting for websocket open";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!open && "connecting".equals(lastError)) {
|
||||
lastError = "connecting_timeout";
|
||||
}
|
||||
return open;
|
||||
}
|
||||
}
|
||||
|
||||
private String webSocketUrl(String clusterId, String vpnConnectionId) {
|
||||
try {
|
||||
URI uri = URI.create(baseUrl);
|
||||
String scheme = "https".equalsIgnoreCase(uri.getScheme()) ? "wss" : "ws";
|
||||
String path = uri.getRawPath() == null || uri.getRawPath().isEmpty() ? "" : trimRight(uri.getRawPath());
|
||||
String fabricPath = fabricServiceChannel.packetPathForBase(baseUrl, clusterId, vpnConnectionId, true);
|
||||
if (!fabricPath.isEmpty()) {
|
||||
path += fabricPath;
|
||||
} else {
|
||||
path += "/clusters/" + clusterId + "/vpn-connections/" + vpnConnectionId + "/tunnel/client/packets/ws";
|
||||
}
|
||||
URI ws = new URI(scheme, uri.getRawUserInfo(), uri.getHost(), uri.getPort(), path, null, null);
|
||||
return ws.toString();
|
||||
} catch (Exception e) {
|
||||
lastError = e.getClass().getSimpleName() + ": " + e.getMessage();
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private final class Listener extends WebSocketListener {
|
||||
@Override
|
||||
public void onOpen(WebSocket webSocket, Response response) {
|
||||
synchronized (lock) {
|
||||
open = true;
|
||||
connecting = false;
|
||||
reconnectAfterMs = 0;
|
||||
lastError = "";
|
||||
lock.notifyAll();
|
||||
}
|
||||
Log.i(TAG, "vpn packet websocket opened " + baseUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onMessage(WebSocket webSocket, ByteString bytes) {
|
||||
List<byte[]> packets = decodePacketBatch(bytes.toByteArray());
|
||||
if (packets.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
offerIncomingPacketBatch(packets);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onClosed(WebSocket webSocket, int code, String reason) {
|
||||
synchronized (lock) {
|
||||
open = false;
|
||||
connecting = false;
|
||||
reconnectAfterMs = System.currentTimeMillis() + 1000;
|
||||
lastError = "closed " + code + " " + reason;
|
||||
lock.notifyAll();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onFailure(WebSocket webSocket, Throwable t, Response response) {
|
||||
String responseStatus = "";
|
||||
if (response != null) {
|
||||
responseStatus = " status=" + response.code();
|
||||
}
|
||||
synchronized (lock) {
|
||||
open = false;
|
||||
connecting = false;
|
||||
reconnectAfterMs = System.currentTimeMillis() + 3000;
|
||||
lastError = (t == null ? "websocket failure" : t.getClass().getSimpleName() + ": " + t.getMessage()) + responseStatus;
|
||||
lock.notifyAll();
|
||||
}
|
||||
Log.w(TAG, "vpn packet websocket failed " + baseUrl + ": " + lastError);
|
||||
}
|
||||
}
|
||||
|
||||
private static List<byte[]> cleanPacketBatch(List<byte[]> packets) {
|
||||
List<byte[]> cleaned = new ArrayList<>();
|
||||
int bytes = 0;
|
||||
if (packets == null) {
|
||||
return cleaned;
|
||||
}
|
||||
for (byte[] packet : packets) {
|
||||
if (packet == null || packet.length <= 0 || packet.length > MAX_SINGLE_PACKET_BYTES) {
|
||||
continue;
|
||||
}
|
||||
int projected = bytes + 4 + packet.length;
|
||||
if (cleaned.size() >= MAX_PACKET_BATCH_PACKETS || projected > MAX_PACKET_BATCH_BYTES) {
|
||||
break;
|
||||
}
|
||||
cleaned.add(packet);
|
||||
bytes = projected;
|
||||
}
|
||||
return cleaned;
|
||||
}
|
||||
|
||||
private static byte[] encodePacketBatch(List<byte[]> packets) {
|
||||
packets = cleanPacketBatch(packets);
|
||||
int total = 0;
|
||||
for (byte[] packet : packets) {
|
||||
total += 4 + packet.length;
|
||||
}
|
||||
byte[] out = new byte[total];
|
||||
int offset = 0;
|
||||
for (byte[] packet : packets) {
|
||||
int length = packet.length;
|
||||
out[offset] = (byte) ((length >> 24) & 0xff);
|
||||
out[offset + 1] = (byte) ((length >> 16) & 0xff);
|
||||
out[offset + 2] = (byte) ((length >> 8) & 0xff);
|
||||
out[offset + 3] = (byte) (length & 0xff);
|
||||
offset += 4;
|
||||
System.arraycopy(packet, 0, out, offset, length);
|
||||
offset += length;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
private static List<byte[]> decodePacketBatch(byte[] payload) {
|
||||
List<byte[]> packets = new ArrayList<>();
|
||||
int offset = 0;
|
||||
while (payload != null && offset + 4 <= payload.length && packets.size() < MAX_PACKET_BATCH_PACKETS) {
|
||||
int length = ((payload[offset] & 0xff) << 24)
|
||||
| ((payload[offset + 1] & 0xff) << 16)
|
||||
| ((payload[offset + 2] & 0xff) << 8)
|
||||
| (payload[offset + 3] & 0xff);
|
||||
offset += 4;
|
||||
if (length <= 0 || length > MAX_SINGLE_PACKET_BYTES || offset + length > payload.length) {
|
||||
break;
|
||||
}
|
||||
byte[] packet = new byte[length];
|
||||
System.arraycopy(payload, offset, packet, 0, length);
|
||||
packets.add(packet);
|
||||
offset += length;
|
||||
}
|
||||
return packets;
|
||||
}
|
||||
|
||||
private void offerIncomingPacketBatch(List<byte[]> packets) {
|
||||
BlockingQueue<List<byte[]>> target = containsTCPControlPacket(packets) ? priorityIncoming : incoming;
|
||||
if (!target.offer(packets)) {
|
||||
target.poll();
|
||||
target.offer(packets);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean containsTCPControlPacket(List<byte[]> packets) {
|
||||
if (packets == null) {
|
||||
return false;
|
||||
}
|
||||
for (byte[] packet : packets) {
|
||||
if (isTCPControlPacket(packet)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean isTCPControlPacket(byte[] packet) {
|
||||
if (packet == null || packet.length < 20 || (packet[0] >> 4) != 4) {
|
||||
return false;
|
||||
}
|
||||
int ihl = (packet[0] & 0x0f) * 4;
|
||||
if (ihl < 20 || packet.length < ihl + 20 || packet[9] != 6) {
|
||||
return false;
|
||||
}
|
||||
int flags = packet[ihl + 13] & 0xff;
|
||||
return (flags & 0x17) != 0;
|
||||
}
|
||||
|
||||
private static String trimRight(String value) {
|
||||
if (value == null) {
|
||||
return "";
|
||||
}
|
||||
while (value.endsWith("/")) {
|
||||
value = value.substring(0, value.length() - 1);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,16 @@ This file exists so architecture documents have a stable guardrails reference
|
||||
inside `docs/architecture`. The operational Codex guardrails remain in
|
||||
`docs/codex/ARCHITECTURE_GUARDRAILS.md`.
|
||||
|
||||
Transport clarification: references in this document to direct worker WSS and
|
||||
backend gateway fallback belong to the preserved historical RDP service
|
||||
baseline. They are not the active source of truth for inter-node transport.
|
||||
Current fabric node-to-node transport is QUIC-only and is defined by
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
Node survivability, recovery overlap, and no-manual-access repair rules are
|
||||
defined by `docs/architecture/FABRIC_NODE_SURVIVAL_AND_RECOVERY_POLICY.md`.
|
||||
|
||||
## 1. Preserve the Proven RDP Baseline
|
||||
|
||||
The following are already proven and must remain stable:
|
||||
@@ -16,8 +26,8 @@ The following are already proven and must remain stable:
|
||||
- detach without killing the remote session
|
||||
- reattach without recreating the remote session
|
||||
- takeover without recreating the remote session
|
||||
- direct worker WSS data plane
|
||||
- backend gateway fallback
|
||||
- historical direct worker WSS RDP path
|
||||
- historical backend gateway fallback for the RDP baseline
|
||||
- C++ RDP Adapter as the active RDP runtime
|
||||
|
||||
Architecture clarification must not silently weaken this behavior.
|
||||
@@ -191,6 +201,9 @@ Updates must support:
|
||||
- local update cache where approved
|
||||
- OS / architecture specific artifacts under signed release manifests
|
||||
- explicit migration bundles when data structures change
|
||||
- legacy recovery compatibility until the fleet is converged or explicitly
|
||||
retired
|
||||
- multi-source artifact retrieval for stranded or NAT-only nodes
|
||||
|
||||
Version Storage stores immutable release manifests, artifacts, hashes,
|
||||
signatures, compatibility metadata, provenance, and approved migration bundles.
|
||||
|
||||
@@ -1059,7 +1059,8 @@ accepts a signed/introspected `remote_workspace` service-channel lease on
|
||||
`remote-workspaces/{resource_id}/streams/{channel_class}`, validates service
|
||||
class, channel class, selected entry node, and data-plane flow isolation, and
|
||||
reports access telemetry. It intentionally returns a probe contract with
|
||||
`payload_flow=not_implemented` for non-empty RDP payloads; this stage proves
|
||||
`payload_flow=validated_only` for empty control probes; non-empty RDP payloads are
|
||||
rejected with `probe_only required`. This stage proves
|
||||
the Fabric ingress contract without forwarding desktop frames yet. The live
|
||||
smoke is `scripts/fabric/c19d-remote-workspace-entry-ingress-smoke.ps1`.
|
||||
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
# Data Plane v1 for RDP
|
||||
|
||||
Archived status: this document is a historical RDP/WebSocket stage record, not
|
||||
the current runtime source of truth for transport architecture. The active
|
||||
fabric transport model is QUIC-only between nodes; see
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
|
||||
Status: DP-3A grayscale full-frame binary render foundation is implemented and smoke-proven on the test Docker environment as of 2026-04-25. DP-3B adaptive quality policy/selection is intentionally paused. The accepted C++ RDP Adapter baseline is the ordered-region path. RDP-Perf-6 makes direct dirty-region binary render explicit with `render.frame.full` / `render.frame.region` RAP2 message types and is build/probe/live-smoke-proven on the test Docker environment as of 2026-04-26. The current test Docker deployment for the RDP Adapter performance path is `rap-rdp-worker:rdp-perf6-dirty-region`. The Stage 5.2 core download data path remains runtime-proven for direct worker WSS and backend gateway fallback. Data-plane and RDP work are paused; the next active focus is Stage C10 Fabric Core / cluster foundation, not another data-plane feature.
|
||||
|
||||
This document defines the first staged data-plane evolution for the RDP MVP. It does not implement direct worker WebSocket runtime, mesh routing, VPN, QUIC, UDP, WebRTC, relay nodes, or multi-cluster behavior.
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
# Direct Worker WSS TLS / PKI
|
||||
|
||||
Archived status: this document captures a direct-worker WSS trust design track
|
||||
and is no longer the primary reference for node-to-node transport. The active
|
||||
fabric transport model is QUIC-only between nodes; see
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
|
||||
Status: P3.4 trust-model design/prep complete.
|
||||
|
||||
This document defines the production trust model for direct worker WSS. It does
|
||||
|
||||
@@ -24,6 +24,21 @@ policy allows, host limited control/storage roles when approved, and report
|
||||
mobile-specific capacity signals such as battery, network type, NAT behavior,
|
||||
foreground/background state, and metered network policy.
|
||||
|
||||
Node survival and recovery across endpoint moves, NAT-only reachability, legacy
|
||||
contract overlap, and unavailable manual host access are governed by
|
||||
`docs/architecture/FABRIC_NODE_SURVIVAL_AND_RECOVERY_POLICY.md`. In
|
||||
particular, nodes like `ifcm-rufms-s-mo1cr` must remain recoverable through the
|
||||
fabric/update/recovery plane even when direct host login is unavailable.
|
||||
|
||||
Android implementation contract:
|
||||
|
||||
- app install/build contains a QUIC bootstrap seed set;
|
||||
- runtime launch carries a `fabric_bootstrap_config`, not a backend URL;
|
||||
- user login/profile selection happens over the fabric control channel;
|
||||
- the Android VPN dataplane is QUIC fabric runtime only; HTTP batch packet
|
||||
forwarding, WebSocket packet relay, and direct backend packet relay are not
|
||||
part of the supported runtime path.
|
||||
|
||||
## What Was Missing
|
||||
|
||||
The current implementation proves route leases and production VPN forwarding,
|
||||
@@ -60,8 +75,9 @@ route and stream semantics.
|
||||
must keep working through cached policy, peer directories, route leases, and
|
||||
local health when central components are degraded.
|
||||
7. Mobile nodes are first-class nodes with stricter capability scoring.
|
||||
8. HTTP forwarding remains a compatibility and emergency fallback, not the
|
||||
primary high-speed data plane.
|
||||
8. QUIC is the single runtime transport between fabric nodes. HTTP/HTTPS may
|
||||
serve human-facing download or panel pages, but it is not a node data-plane
|
||||
fallback and must not carry service packets.
|
||||
9. There must be no single management service that can seize the fabric. Control,
|
||||
storage, update distribution, route authority, and certificate authority are
|
||||
fabric roles assigned to eligible nodes and protected by quorum signatures.
|
||||
@@ -73,6 +89,20 @@ route and stream semantics.
|
||||
the usable candidate locally by policy, reachability, latency, load, and
|
||||
trust.
|
||||
|
||||
## Transport vs Control API
|
||||
|
||||
The system must keep two layers separate in naming, design, and diagnostics:
|
||||
|
||||
- `Fabric Transport` means inter-node runtime delivery only. It is QUIC over UDP
|
||||
and carries leased service-channel/data-plane traffic between nodes.
|
||||
- `Control API` means human/operator/programmatic management surfaces such as
|
||||
web-admin, release publication, policy mutation, audit queries, and status
|
||||
reads. Today that surface is HTTP/JSON and may sit behind HTTPS ingress.
|
||||
|
||||
The HTTP Control API is not a fallback transport for node-to-node runtime
|
||||
traffic. A `409 Conflict` from the backend, a panel page load, or a release
|
||||
download is control-plane behavior, not fabric transport behavior.
|
||||
|
||||
## Distributed Control And Trust
|
||||
|
||||
The target fabric behaves like a distributed network, not a client/server
|
||||
@@ -145,6 +175,143 @@ Endpoint state is also distributed:
|
||||
- Neighbor selection is local and latency/load-aware; the state log announces
|
||||
facts and policy, not a forced single next hop.
|
||||
|
||||
### Fabric Registry Gossip
|
||||
|
||||
Moving a service must not break the farm.
|
||||
|
||||
`RAP_BACKEND_URL` or any fixed HTTP/API address is only a migration fallback for
|
||||
old nodes. It is not cluster truth. After bootstrap, a node finds services by
|
||||
logical role through signed fabric registry records that can be carried by any
|
||||
reachable peer.
|
||||
|
||||
The rule is:
|
||||
|
||||
- any node may relay registry knowledge;
|
||||
- only authorized signatures can create or replace trusted registry truth;
|
||||
- a new record becomes active only after signature/authority checks and a
|
||||
successful live probe through the fabric or a policy-approved direct QUIC
|
||||
candidate;
|
||||
- older still-valid records remain as fallback until their TTL expires.
|
||||
|
||||
Registry record shape:
|
||||
|
||||
```text
|
||||
schema_version: rap.fabric.registry.gossip_record.v1
|
||||
cluster_id
|
||||
service: control-api | update-store | update-cache | web-admin | vpn-egress-pool | ...
|
||||
scope: farm | cluster | organization
|
||||
organization_id: optional
|
||||
epoch: monotonic service epoch
|
||||
generation: optional human/debug generation
|
||||
issued_at
|
||||
expires_at
|
||||
issuer_node_id
|
||||
issuer_role: control-authority | update-authority | storage-authority | route-authority
|
||||
endpoints:
|
||||
- endpoint_id
|
||||
address: quic://...
|
||||
transport: direct_quic | relay_quic | reverse_quic
|
||||
reachability
|
||||
connectivity_mode
|
||||
priority / weight
|
||||
peer_cert_sha256
|
||||
signatures:
|
||||
- key_id
|
||||
issuer_id
|
||||
role
|
||||
alg: ed25519
|
||||
value
|
||||
```
|
||||
|
||||
Acceptance algorithm:
|
||||
|
||||
1. Reject records for a different cluster, expired records, future records past
|
||||
allowed clock skew, unsupported schema, missing endpoints, or non-QUIC
|
||||
endpoints.
|
||||
2. Verify the canonical record payload, excluding `signatures`, against the
|
||||
configured authority set.
|
||||
3. Check the signer role is allowed for that service and scope.
|
||||
4. Require quorum where policy says M-of-N; development may use one trusted
|
||||
signer but must mark that signer as bootstrap/development authority.
|
||||
5. Store accepted records as `candidate`.
|
||||
6. Promote `candidate` to `active` only after live-probing at least one endpoint
|
||||
and verifying the endpoint identity/pin.
|
||||
7. Prefer higher epoch, then newer issued time, then generation. Do not replace
|
||||
a live active record with an older record.
|
||||
8. Keep the previous active record usable as fallback until TTL expiry when a
|
||||
newer candidate is not yet live-verified.
|
||||
|
||||
This is the recovery path for mass moves. If every known service endpoint moves
|
||||
at once, the operator or a control-authority node only has to deliver a signed
|
||||
registry record to one reachable fabric node. That node validates it, probes it,
|
||||
promotes it, and gossips it onward. User/mobile/candidate nodes may carry the
|
||||
record, but cannot make it authoritative unless their role certificate permits
|
||||
that service/scope.
|
||||
|
||||
Service classes that must use this registry before production hardening:
|
||||
|
||||
- `control-api`: heartbeat, auth/profile control projection, node registration,
|
||||
policy/snapshot fetch.
|
||||
- `update-store`: signed release manifests and compatibility windows.
|
||||
- `update-cache`: artifact mirrors close to nodes.
|
||||
- `web-admin`: management UI/API ingress replicas.
|
||||
- `vpn-egress-pool`: user-visible exit pools; users see pools, not backing
|
||||
nodes.
|
||||
|
||||
Legacy endpoint compatibility is allowed only for rolling migration:
|
||||
|
||||
- Old nodes may use their baked HTTP/control URL only to fetch a new version or
|
||||
a signed registry bootstrap record.
|
||||
- New nodes must treat fixed URLs as fallback hints, not as authority.
|
||||
- Old code is removed only after every live node reports a version that supports
|
||||
signed registry gossip and service discovery by role.
|
||||
|
||||
Listener configuration is split into bind sockets and reachability candidates:
|
||||
|
||||
- `listen_addr` is what the local process binds, for example
|
||||
`0.0.0.0:18080` on `home-1`.
|
||||
- `endpoint_candidates` is the ordered set of addresses other nodes may try.
|
||||
A single node can publish LAN addresses, addresses on several network
|
||||
adapters, STUN/reflexive addresses, and multiple public NAT forwards from
|
||||
different providers.
|
||||
- Public NAT forwards are modeled as candidates with metadata, not as a
|
||||
replacement for the internal bind address. Example:
|
||||
`quic://94.141.118.222:19199 reachability=public connectivity=direct
|
||||
provider=isp1 maps_to=192.168.200.85:18080`.
|
||||
- A candidate may be valid only from outside the NAT. Same-LAN hairpin failure
|
||||
is not proof that the public candidate is broken; verification must be
|
||||
scoped to an external peer or remote probe.
|
||||
- The route builder scores candidates by reachability, measured latency, loss,
|
||||
load, policy, and verification freshness. If one provider or interface fails,
|
||||
the node keeps the same node identity and republishes a new candidate epoch.
|
||||
|
||||
## Install Artifact Bootstrap Contract
|
||||
|
||||
Every installable artifact is a node image plus a bootstrap seed set.
|
||||
|
||||
This applies to Android, Docker, Linux services, and Windows services. The seed
|
||||
set is baked into the artifact or delivered beside it as signed install
|
||||
metadata. It is not a single backend URL and not a management server choice. It
|
||||
is a bounded list of known fabric endpoint candidates that may be reachable from
|
||||
different network positions:
|
||||
|
||||
- public QUIC candidates, for example `usa-los-1` or externally reachable
|
||||
`home-1`;
|
||||
- private/LAN QUIC candidates, for example Docker-test or home LAN nodes;
|
||||
- closed-site candidates that have no Internet route themselves but can reach a
|
||||
neighboring fabric node;
|
||||
- optional pinned certificate hashes or authority descriptors for high-trust
|
||||
entry candidates.
|
||||
|
||||
On first start the installed node tries the seed set, joins through any reachable
|
||||
peer, registers as a candidate node with minimal rights, and then receives
|
||||
signed peer-directory, role, update, and policy state through the fabric. If a
|
||||
node is installed in an isolated network, it can still become visible and usable
|
||||
when at least one nearby seed node can route onward to the rest of the fabric.
|
||||
User login on Android is only identity/profile selection for the `vpn-client`
|
||||
service; the underlying phone node already exists and participates in the
|
||||
fabric with candidate permissions.
|
||||
|
||||
## Node Roles
|
||||
|
||||
Initial role vocabulary:
|
||||
@@ -172,7 +339,7 @@ uplink stability, foreground state, and user cost policy.
|
||||
Nodes must advertise capability facts in heartbeats and peer updates:
|
||||
|
||||
- supported fabric protocol versions;
|
||||
- supported transports: UDP/QUIC, TCP, WebSocket, HTTPS fallback;
|
||||
- supported transport: UDP/QUIC;
|
||||
- NAT type and reachability;
|
||||
- measured RTT/loss/jitter/bandwidth to peers and entry candidates;
|
||||
- CPU, memory, queue depth, file descriptor/socket pressure;
|
||||
@@ -184,9 +351,8 @@ Nodes must advertise capability facts in heartbeats and peer updates:
|
||||
|
||||
## Fabric Data Session V1
|
||||
|
||||
The first practical protocol step is a persistent binary data session. It may
|
||||
initially run over WebSocket/TCP for faster delivery, but the framing must be
|
||||
transport-neutral so the same protocol can move to QUIC/UDP.
|
||||
The first practical protocol step is a persistent binary QUIC data session.
|
||||
The framing stays service-neutral, but the runtime transport is QUIC only.
|
||||
|
||||
Minimum frame set:
|
||||
|
||||
@@ -338,69 +504,36 @@ Deliverables:
|
||||
|
||||
### Stage FNP-3: WebSocket/TCP Compatibility Transport
|
||||
|
||||
Status: started with a transport-neutral `io.Reader`/`io.Writer` frame loop,
|
||||
WebSocket frame adapter in `agents/rap-node-agent/internal/fabricproto`, and a
|
||||
gated/authenticated mesh smoke endpoint/client at `/mesh/v1/fabric/session/ws`.
|
||||
`rap-host-agent fabric-session-smoke` provides the first operator smoke command
|
||||
and can pass signed fabric-session authority payload/signature headers for
|
||||
authority-pinned nodes.
|
||||
Node-agent exposes the endpoint only when `RAP_MESH_FABRIC_SESSION_ENABLED` /
|
||||
`-mesh-fabric-session-enabled` is set, and reports the enabled endpoint in
|
||||
heartbeat metadata.
|
||||
`mesh-live-smoke` includes a fabric-session `PING`/`PONG` check alongside the
|
||||
existing route and test-service probes. Mesh client code now has a reusable
|
||||
`FabricSessionClient` for multiple frame exchanges over one WebSocket session,
|
||||
plus a pump mode with outbound/inbound queues for asynchronous stream traffic.
|
||||
Live smoke verifies two `PING`/`PONG` round trips on the same connection.
|
||||
`vpnruntime` has a binary VPN packet-batch mapper for `FrameData` payloads so
|
||||
packet delivery can move away from JSON production envelopes in a gated mode.
|
||||
`FabricSessionPacketTransport` now adapts that mapper to the existing
|
||||
`PacketTransport` interface and can demultiplex inbound DATA frames into the
|
||||
VPN packet inbox by stream id.
|
||||
`mesh-live-smoke` now sends a real VPN packet batch through
|
||||
`FabricSessionPacketTransport` over the WebSocket fabric session and requires a
|
||||
stream ACK from the remote node.
|
||||
Mesh has a peer session manager that reuses one pump per peer endpoint, giving
|
||||
VPN transport selection a stable place to acquire long-lived fabric sessions.
|
||||
Node config now carries a separate gated
|
||||
`RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED` switch and heartbeat report for the
|
||||
binary VPN packet transport, keeping endpoint exposure and VPN dataplane
|
||||
rollout independently controllable.
|
||||
When the VPN fabric-session switch is enabled, node-agent now attempts to use a
|
||||
long-lived peer session for gateway packet transport and falls back to the
|
||||
existing HTTP production envelope path when the peer session is unavailable.
|
||||
Peer session reuse now evicts closed pumps before reuse, so failed WebSocket
|
||||
sessions can be reopened on the next transport acquisition.
|
||||
Heartbeat telemetry includes peer session manager counters for active sessions,
|
||||
reuses, opens, closed-pump evictions, and explicit close operations.
|
||||
The mesh package now exposes a service-neutral `FabricTransport` abstraction;
|
||||
the current WebSocket carrier implements it as `WebSocketFabricTransport`, so
|
||||
future QUIC/UDP transport can be added without changing VPN/RDP/HTTP services.
|
||||
`QUICFabricTransport` now implements the same interface and carries the same
|
||||
binary `fabricproto` frames over a QUIC stream, with local smoke coverage for
|
||||
`PING`/`PONG` and DATA/ACK.
|
||||
Carrier selection understands QUIC transport labels and `quic://host:port`
|
||||
endpoints while preserving WebSocket as the default fallback.
|
||||
`QUICFabricServer` provides the matching node-side QUIC listener for accepting
|
||||
fabric streams and running the same session frame handler as other carriers.
|
||||
Node-agent can now gate the QUIC listener with
|
||||
`RAP_MESH_QUIC_FABRIC_ENABLED` / `RAP_MESH_QUIC_FABRIC_LISTEN_ADDR`, report it
|
||||
in heartbeat metadata, and pass the setting through host-agent install/update
|
||||
profiles.
|
||||
`mesh-live-smoke` verifies the QUIC carrier by starting a temporary QUIC fabric
|
||||
server and requiring a `PING`/`PONG` round trip over `QUICFabricTransport`.
|
||||
Nodes now advertise enabled QUIC fabric listeners as `direct_quic` fast-path
|
||||
endpoint candidates, and endpoint ranking prefers QUIC over WebSocket/HTTPS
|
||||
compatibility candidates for fabric sessions.
|
||||
Status: retired as a migration-only stage.
|
||||
|
||||
This stage existed to bootstrap binary frame semantics before QUIC routing and
|
||||
carrier reuse were ready. It introduced the transport-neutral frame loop,
|
||||
session-shaped packet mapper, and early smoke tooling. That work was useful as
|
||||
scaffolding, but it is no longer the target runtime.
|
||||
|
||||
Current rule:
|
||||
|
||||
- WebSocket/TCP fabric-session transport is not part of the supported node
|
||||
dataplane.
|
||||
- QUIC/UDP is the only supported runtime carrier between fabric nodes.
|
||||
- Old WebSocket/TCP smoke helpers are being removed; migration/debug tooling
|
||||
must move to QUIC-native smoke and recovery paths.
|
||||
- Any routing, heartbeat, registry, peer probe, or service dataplane logic must
|
||||
reject WebSocket/TCP carriers as non-QUIC transport, not treat them as a
|
||||
valid alternate path.
|
||||
|
||||
What survives from this stage is the service-neutral frame model and the
|
||||
`FabricSessionPacketTransport` mapping, which now ride on QUIC carriers instead
|
||||
of a WebSocket fallback.
|
||||
VPN fabric-session gateway transport now consumes ranked endpoint candidates,
|
||||
so dataplane sessions can select QUIC fast-path candidates and fall back to
|
||||
legacy peer endpoints when the control plane has not published candidates yet.
|
||||
so dataplane sessions can select QUIC fast-path candidates and refuse non-QUIC
|
||||
peer endpoints when the control plane has not published valid candidates yet.
|
||||
The temporary self-signed QUIC listener advertises its SHA-256 certificate
|
||||
fingerprint in endpoint metadata, and the QUIC client can pin that fingerprint
|
||||
instead of disabling verification while the cluster CA path is being finished.
|
||||
VPN fabric-session dialing now walks all ranked endpoint candidates before
|
||||
falling back to the legacy peer endpoint, so a failed QUIC candidate does not
|
||||
block WebSocket/HTTPS compatibility transport.
|
||||
declaring the target unavailable, so a failed QUIC candidate does not silently
|
||||
re-enable WebSocket/HTTPS compatibility transport.
|
||||
Successful VPN fabric-session dialing logs the selected candidate, transport,
|
||||
certificate pin usage, and remaining fallback count for phone-side diagnostics.
|
||||
Heartbeat telemetry now includes VPN fabric-session dial counters for attempts,
|
||||
@@ -416,8 +549,8 @@ Endpoint health observations are now emitted as a bounded standalone heartbeat
|
||||
report (`rap.vpn_fabric_endpoint_health_report.v1`) so control plane can ingest
|
||||
candidate feedback without parsing the transport diagnostics blob.
|
||||
VPN fabric-session transport telemetry is carrier-neutral
|
||||
(`fabric_session_binary_frames`) and reports QUIC/WebSocket as available
|
||||
carriers instead of describing the dataplane as WebSocket-only.
|
||||
(`fabric_session_binary_frames`) and reports QUIC selection plus non-QUIC
|
||||
candidate rejection instead of describing the dataplane as WebSocket-capable.
|
||||
Endpoint health observations are pruned in-memory by age and count before
|
||||
snapshot/report generation, preventing long-running nodes from accumulating
|
||||
unbounded candidate history.
|
||||
@@ -583,10 +716,10 @@ propagated by host-agent install profiles.
|
||||
|
||||
Deliverables:
|
||||
|
||||
- carry binary frames over one persistent WebSocket/TCP connection;
|
||||
- carry binary frames over one persistent QUIC fabric session;
|
||||
- replace high-frequency `/mesh/v1/forward` packet POST usage for VPN routes in
|
||||
a gated mode;
|
||||
- keep HTTP forwarding as fallback.
|
||||
- remove HTTP/WebSocket packet forwarding from the supported dataplane.
|
||||
|
||||
### Stage FNP-4: Android As Mobile Fabric Node
|
||||
|
||||
@@ -609,12 +742,12 @@ Deliverables:
|
||||
|
||||
### Stage FNP-6: QUIC/UDP Transport
|
||||
|
||||
Status: started with `QUICFabricTransport` in `internal/mesh`.
|
||||
Status: active runtime baseline in `internal/mesh`.
|
||||
|
||||
Deliverables:
|
||||
|
||||
- implement QUIC transport for Fabric Data Session V1;
|
||||
- preserve WebSocket/TCP as fallback;
|
||||
- keep QUIC/UDP as the only supported inter-node runtime transport;
|
||||
- test 4G/Wi-Fi transition and NAT behavior;
|
||||
- benchmark throughput, latency, and recovery against current HTTP forwarding.
|
||||
|
||||
|
||||
@@ -0,0 +1,183 @@
|
||||
# Fabric Area And Peer Stability Model
|
||||
|
||||
Status: active design correction.
|
||||
|
||||
This document replaces the oversimplified rule "every node must keep 3
|
||||
connections" with a stability model based on failure domains ("areas"),
|
||||
multi-path reachability, and live peer memory.
|
||||
|
||||
## 1. Why the old "3 connections" rule is not enough
|
||||
|
||||
A raw connection count is too weak as a resilience rule.
|
||||
|
||||
Three links are not equivalent when:
|
||||
|
||||
- all three peers are in the same private network;
|
||||
- all three depend on the same NAT or relay path;
|
||||
- all three depend on the same public ingress;
|
||||
- all three are relay-ready but not direct-ready;
|
||||
- all three are stale observations rather than recently verified paths.
|
||||
|
||||
Therefore the fabric must not use a single scalar count as the stability
|
||||
criterion.
|
||||
|
||||
## 2. Area
|
||||
|
||||
Introduce the concept of an `area`.
|
||||
|
||||
An area is a failure domain with high mutual reachability and shared external
|
||||
risk. Examples:
|
||||
|
||||
- `home` - nodes in the same home/private site
|
||||
- `test` - nodes in the same test Docker/LAN site
|
||||
- `usa` - a public node in a remote Internet site
|
||||
- `ifcm` - a separate NAT/domain behind another administrative boundary
|
||||
|
||||
An area can be derived from:
|
||||
|
||||
- operator-declared site/area label;
|
||||
- shared private address space or local interface group;
|
||||
- shared public egress/NAT identity;
|
||||
- shared administrative host or cluster.
|
||||
|
||||
The area label must be part of live node metadata and endpoint candidate
|
||||
metadata.
|
||||
|
||||
## 3. Stability objective
|
||||
|
||||
Each node should maintain a working peer set with diversity, not just count.
|
||||
|
||||
### 3.1 Minimum stable peer objective
|
||||
|
||||
For an ordinary production node:
|
||||
|
||||
- at least `2` recently verified direct-ready peers overall;
|
||||
- at least `2` distinct external areas represented in the ready set when more
|
||||
than one external area exists;
|
||||
- at least `1` persistent recovery-capable path outside the local area;
|
||||
- at least `1` additional relay-ready or rendezvous-capable path outside the
|
||||
primary recovery path.
|
||||
|
||||
For an area gateway or strategically important public node:
|
||||
|
||||
- at least `3` direct-ready peers overall;
|
||||
- at least `2` distinct external areas represented in the direct-ready set;
|
||||
- at least `1` extra recovery path that does not share the same public ingress
|
||||
or NAT dependency.
|
||||
|
||||
For a node in a tiny fleet where only one external area currently exists:
|
||||
|
||||
- the system must report `reduced-diversity mode`, not pretend the target is
|
||||
fully satisfied.
|
||||
|
||||
### 3.2 What counts as "ready"
|
||||
|
||||
`ready` means:
|
||||
|
||||
- recently verified;
|
||||
- usable for immediate QUIC route establishment;
|
||||
- not only a historical candidate;
|
||||
- not blocked on stale relay replacement;
|
||||
- not only a compatibility `Control API/downloads` overlap path.
|
||||
|
||||
`relay_ready` does not replace `direct_ready`.
|
||||
|
||||
## 4. What a node must remember
|
||||
|
||||
Every node must keep a live working set, not just a tiny current-peer list.
|
||||
|
||||
Minimum retained peer memory:
|
||||
|
||||
1. all currently healthy nodes in the fleet, when the fleet is small enough;
|
||||
2. for larger fleets, a bounded full directory plus prioritized recent working
|
||||
peers;
|
||||
3. for every known node:
|
||||
- node id
|
||||
- area
|
||||
- role summary
|
||||
- latest verified direct candidates
|
||||
- latest verified relay/rendezvous candidates
|
||||
- last success timestamp
|
||||
- last failure class
|
||||
- NAT / ingress dependency hints
|
||||
- cert pin / authority compatibility metadata
|
||||
|
||||
For the current fleet size, every node should indeed be capable of remembering
|
||||
the full directory of every other node. There is no scale excuse at 6-8 nodes.
|
||||
|
||||
## 5. Probe strategy
|
||||
|
||||
The node should not aggressively probe every possible path at full frequency.
|
||||
It should maintain a layered strategy.
|
||||
|
||||
### 5.1 Hot set
|
||||
|
||||
Always keep a hot set of:
|
||||
|
||||
- current direct-ready peers;
|
||||
- one recovery peer outside the local area;
|
||||
- one alternate peer per external area.
|
||||
|
||||
These should be revalidated frequently.
|
||||
|
||||
### 5.2 Warm set
|
||||
|
||||
Maintain a warm set of:
|
||||
|
||||
- previously successful peers;
|
||||
- peers from underrepresented areas;
|
||||
- peers that would restore diversity if a hot peer fails.
|
||||
|
||||
These should be revalidated on a slower cadence and promoted when diversity or
|
||||
direct-ready count drops.
|
||||
|
||||
### 5.3 Cold directory
|
||||
|
||||
Retain the full known directory and signed registry records, even if not
|
||||
actively probed at the same rate.
|
||||
|
||||
## 6. Failure handling
|
||||
|
||||
When a direct-ready peer is lost:
|
||||
|
||||
1. do not merely replace it with the numerically cheapest peer;
|
||||
2. prefer restoring:
|
||||
- area diversity
|
||||
- independent ingress diversity
|
||||
- direct-ready count
|
||||
3. only then fall back to relay-ready stabilization if direct replacement is
|
||||
not currently available.
|
||||
|
||||
## 7. Implications for the current fleet
|
||||
|
||||
Current area mapping should be treated approximately as:
|
||||
|
||||
- `home`: `home-1`, `home-2`, `home-3`
|
||||
- `test`: `test-1`, `test-2`, `test-3`
|
||||
- `usa`: `usa-los-1`
|
||||
- `ifcm`: `ifcm-rufms-s-mo1cr`
|
||||
|
||||
Under this model:
|
||||
|
||||
- a node in `home` should avoid satisfying its minimum peer objective using
|
||||
only `home` peers plus one relay;
|
||||
- `usa-los-1` and `ifcm-rufms-s-mo1cr` should both maintain direct-ready links
|
||||
that span at least two foreign areas when possible;
|
||||
- a fleet-wide alert should trigger when a node loses cross-area diversity even
|
||||
if its total peer count still looks healthy.
|
||||
|
||||
## 8. Required implementation changes
|
||||
|
||||
1. Add `area` to node metadata and endpoint candidate metadata.
|
||||
2. Track peer readiness by area, not only total count.
|
||||
3. Separate:
|
||||
- `direct_ready_count`
|
||||
- `relay_ready_count`
|
||||
- `external_area_ready_count`
|
||||
- `independent_ingress_ready_count`
|
||||
4. Alert on:
|
||||
- zero recovery path outside the local area
|
||||
- direct-ready deficit
|
||||
- area diversity deficit
|
||||
- registry resolution deficit
|
||||
5. Preserve a full node directory for the current small fleet.
|
||||
@@ -289,7 +289,10 @@ Production fabric-core migration boundary:
|
||||
LAN/interface QUIC, STUN reflexive `ice_quic`, reverse/outbound-only, and
|
||||
`relay_quic` fallback. Candidate metadata carries `local_segment_id`,
|
||||
`nat_group_id`, `stun_server`, `ice_foundation`, `relay_node_id`, and
|
||||
`relay_endpoint` when configured.
|
||||
`relay_endpoint` when configured. When a relay endpoint is the first physical
|
||||
QUIC hop, its advertised certificate fingerprint must survive route planning
|
||||
so public-IP relay paths can verify the relay node by pin instead of falling
|
||||
back to hostname/IP SAN matching.
|
||||
- Endpoint candidate scoring is QUIC-mode only. It ranks `direct_quic`,
|
||||
`lan_quic`, `ice_quic`, `reverse_quic`, and `relay_quic` using freshness,
|
||||
health observations, latency, reliability, region, policy tags, and live
|
||||
|
||||
@@ -0,0 +1,179 @@
|
||||
# Fabric Live Audit 2026-05-18
|
||||
|
||||
Status: live operational audit of the current fabric. This document records the
|
||||
real state observed on 2026-05-18 and explicitly calls out where runtime
|
||||
behavior still differs from the target architecture.
|
||||
|
||||
## Current confirmed state
|
||||
|
||||
- Inter-node transport for the live node-agent fleet is `QUIC over UDP`.
|
||||
- The active node set
|
||||
- `home-1`
|
||||
- `home-2`
|
||||
- `home-3`
|
||||
- `test-1`
|
||||
- `test-2`
|
||||
- `test-3`
|
||||
- `usa-los-1`
|
||||
- `ifcm-rufms-s-mo1cr`
|
||||
is converged on `0.2.321-directreadytarget`.
|
||||
- `ifcm-rufms-s-mo1cr` recovered through the compatibility recovery path and is
|
||||
no longer stale.
|
||||
|
||||
## Why TCP traffic is still visible
|
||||
|
||||
Visible TCP traffic is not coming from the inter-node fabric transport. It is
|
||||
coming from the temporary compatibility recovery overlap that is still active.
|
||||
|
||||
Observed live listeners:
|
||||
|
||||
- `docker-test`
|
||||
- `19191/tcp` - compatibility `Control API/downloads` bridge
|
||||
- `18080/tcp` - web-admin
|
||||
- `18090/tcp` - release files
|
||||
- `18121/tcp` - backend Control API
|
||||
- `19132/udp`, `19133/udp`, `19134/udp` - QUIC fabric listeners
|
||||
- `usa-los-1`
|
||||
- `19131/udp` - QUIC fabric listener
|
||||
- `19191/tcp` - external compatibility bridge currently held open so legacy
|
||||
recovery contracts can still reach `Control API/downloads`
|
||||
|
||||
Therefore:
|
||||
|
||||
- `TCP` is still present by design for recovery overlap.
|
||||
- `UDP/QUIC` is the current node-to-node transport.
|
||||
- The statement "the fabric is fully UDP-only" is not yet true at the full
|
||||
system level while `19191/tcp` compatibility recovery remains enabled.
|
||||
|
||||
## Why nodes were still falling away
|
||||
|
||||
### 1. Nodes do not yet operate from a fully active signed registry gossip plane
|
||||
|
||||
Observed on the live `ifcm-rufms-s-mo1cr` heartbeat:
|
||||
|
||||
- `fabric_registry_runtime_report.status = candidate_only`
|
||||
- `resolved_service_count = 0`
|
||||
- `resolved_services.control-api = no_active_record`
|
||||
- `resolved_services.update-store = no_active_record`
|
||||
- `resolved_services.update-cache = no_active_record`
|
||||
|
||||
This means the current runtime still depends on compatibility control URLs more
|
||||
than the target architecture allows. The node is alive in the fabric, but not
|
||||
yet operating from a fully resolved active registry view.
|
||||
|
||||
### 2. Legacy control/download contracts are still real dependencies
|
||||
|
||||
Observed on the live `ifcm-rufms-s-mo1cr` heartbeat after recovery:
|
||||
|
||||
- `mesh_outbound_session_report.control_plane_url = http://vpn.cin.su:19191/api/v1`
|
||||
|
||||
This confirms the root recovery lesson:
|
||||
|
||||
- a NAT node without manual host access was still anchored to the old recovery
|
||||
contract;
|
||||
- until that contract was temporarily restored, the node could not advance;
|
||||
- the node did not disappear because QUIC failed; it disappeared because the
|
||||
recovery/control overlap was removed before the node had converged.
|
||||
|
||||
### 3. Direct peer resilience is still below the intended threshold
|
||||
|
||||
Observed from live heartbeat metadata:
|
||||
|
||||
- `ifcm-rufms-s-mo1cr`
|
||||
- `peer_connection_ready = 2`
|
||||
- `peer_connection_relay_ready = 3`
|
||||
- `target_ready_peers = 3`
|
||||
- `usa-los-1`
|
||||
- `peer_connection_ready = 1`
|
||||
- `peer_connection_relay_ready = 5`
|
||||
- `target_ready_peers = 3`
|
||||
|
||||
This means the direct-path resilience target is not satisfied yet, even though
|
||||
the nodes are healthy.
|
||||
|
||||
The practical reason is simple:
|
||||
|
||||
- the cluster has only a small number of externally reachable direct QUIC
|
||||
endpoints;
|
||||
- some nodes still advertise only private/LAN-reachable direct candidates;
|
||||
- relay-ready adjacency is masking direct peer deficit, but it does not replace
|
||||
the requirement for at least three direct-ready peers.
|
||||
|
||||
### 4. Observability is still heterogeneous
|
||||
|
||||
Live heartbeat coverage is inconsistent:
|
||||
|
||||
- `test-*`, `ifcm-rufms-s-mo1cr`, and `usa-los-1` emit rich `c17z20` heartbeat metadata with
|
||||
endpoint, peer recovery, and registry sections.
|
||||
- `home-*` currently do not expose the same full sections in their latest
|
||||
heartbeat rows.
|
||||
|
||||
This means operator visibility is uneven and the documentation must not imply
|
||||
uniform live introspection across every node today.
|
||||
|
||||
## What is true right now
|
||||
|
||||
1. The fleet is converged on one live node-agent version.
|
||||
2. QUIC/UDP is the actual node-to-node transport.
|
||||
3. Compatibility `19191/tcp` is still required for recovery overlap.
|
||||
4. Signed registry gossip is not yet the sole active discovery/control source.
|
||||
5. The "at least 3 direct-ready peers per node" resilience target is not yet
|
||||
met for all externally significant nodes.
|
||||
|
||||
## Operational rule until the next audit
|
||||
|
||||
Do not remove the compatibility `19191/tcp` recovery overlap while any of the
|
||||
following remain true:
|
||||
|
||||
- any live node still reports a `control_plane_url` on the `19191` contract;
|
||||
- any live node has `fabric_registry_runtime_report.status != active`;
|
||||
- any externally significant node has fewer than 3 direct-ready peers;
|
||||
- any node can only recover through legacy `Control API/downloads` overlap.
|
||||
|
||||
## Required next work
|
||||
|
||||
### A. Finish signed registry activation
|
||||
|
||||
Each node must be able to resolve active records for at least:
|
||||
|
||||
- `control-api`
|
||||
- `update-store`
|
||||
- `update-cache`
|
||||
|
||||
without falling back to the `19191` compatibility contract.
|
||||
|
||||
### B. Promote full direct endpoint dissemination
|
||||
|
||||
All nodes with public reachability must advertise every valid public direct QUIC
|
||||
endpoint, and nodes must retain enough live peer memory to reconnect without
|
||||
operator intervention.
|
||||
|
||||
### C. Enforce the direct-ready floor as a live alert
|
||||
|
||||
If a node has fewer than 3 direct-ready peers, this must remain a real
|
||||
operational alert even when relay-ready peers exist.
|
||||
|
||||
### D. Normalize heartbeat observability
|
||||
|
||||
Every production node must emit the same minimum audit surface:
|
||||
|
||||
- endpoint candidates
|
||||
- peer recovery counts
|
||||
- registry runtime state
|
||||
- update runtime state
|
||||
|
||||
without mixing rich and reduced heartbeat schemas across the fleet.
|
||||
|
||||
### E. Replace the naive peer-count rule
|
||||
|
||||
The live fleet shows that a plain "3 links per node" rule is not a sufficient
|
||||
resilience model.
|
||||
|
||||
The current corrective design is documented in
|
||||
[FABRIC_AREA_AND_PEER_STABILITY_MODEL.md](\\nas\\MST\\codex\\rdp-proxy\\docs\\architecture\\FABRIC_AREA_AND_PEER_STABILITY_MODEL.md)
|
||||
and introduces:
|
||||
|
||||
- `area` as a failure-domain label;
|
||||
- direct-ready vs relay-ready separation;
|
||||
- cross-area diversity requirements;
|
||||
- full-directory retention for small fleets.
|
||||
@@ -0,0 +1,427 @@
|
||||
# Fabric Node Survival And Recovery Policy
|
||||
|
||||
Status: active architecture policy.
|
||||
|
||||
This document defines the non-negotiable survival, compatibility, and recovery
|
||||
rules for Secure Access Fabric nodes. It exists because losing a node is not an
|
||||
acceptable operating model once the fabric grows beyond a small manually
|
||||
maintained fleet.
|
||||
|
||||
Reference incident:
|
||||
|
||||
- `ifcm-rufms-s-mo1cr` is the canonical recovery case.
|
||||
- The node is behind NAT.
|
||||
- There is no direct administrative access to the Windows host.
|
||||
- The node must remain recoverable through the fabric/update/recovery plane
|
||||
without relying on manual host login.
|
||||
|
||||
The latest live recovery evidence for this case is documented in
|
||||
[FABRIC_LIVE_AUDIT_2026-05-18.md](FABRIC_LIVE_AUDIT_2026-05-18.md).
|
||||
|
||||
This policy applies to Linux, Windows, Android, containerized nodes, and future
|
||||
node types.
|
||||
|
||||
## 1. Core Decision
|
||||
|
||||
The fabric must be able to lose:
|
||||
|
||||
- old API endpoints;
|
||||
- old artifact URLs;
|
||||
- previous public IP addresses;
|
||||
- previous NAT mappings;
|
||||
- previous relay nodes;
|
||||
- previous route-authority replicas;
|
||||
- previous update-cache replicas;
|
||||
- old service locations;
|
||||
- operator access to the host OS;
|
||||
- the current physical location of a workload;
|
||||
- part of the cluster.
|
||||
|
||||
And still keep the node recoverable.
|
||||
|
||||
Manual repair is allowed as an emergency tool. It must not be the default
|
||||
survival strategy.
|
||||
|
||||
## 2. Non-Negotiable Invariants
|
||||
|
||||
### 2.1 Node Identity Must Survive
|
||||
|
||||
A recoverable node must preserve:
|
||||
|
||||
- `node_id`;
|
||||
- node keypair or key reference;
|
||||
- pinned cluster authority / quorum descriptor;
|
||||
- last accepted signed registry records;
|
||||
- last accepted bootstrap seed set;
|
||||
- last known good update policy;
|
||||
- last known good workload desired state;
|
||||
- rollback metadata;
|
||||
- recovery audit trail.
|
||||
|
||||
Reinstall or repair must prefer preserving local state. Identity reset is a
|
||||
high-risk operator action, not the default repair path.
|
||||
|
||||
### 2.2 Compatibility Must Stay Until Recovery Is Complete
|
||||
|
||||
Any change to the fabric must keep older nodes recoverable until one of these
|
||||
is true:
|
||||
|
||||
1. every node has confirmed the new contract; or
|
||||
2. the missing nodes were manually retired, revoked, or explicitly accepted as
|
||||
lost.
|
||||
|
||||
This applies to:
|
||||
|
||||
- update plan formats;
|
||||
- signed registry schemas;
|
||||
- artifact install types;
|
||||
- authority signature envelopes;
|
||||
- bootstrap config formats;
|
||||
- recovery seed formats;
|
||||
- host-agent / updater runtime contracts;
|
||||
- control endpoints needed only for migration.
|
||||
|
||||
The rule is strict: do not delete the old recovery format while nodes that may
|
||||
still need it remain unrecovered.
|
||||
|
||||
### 2.3 QUIC-Only Transport Does Not Mean Single Bootstrap Location
|
||||
|
||||
Node-to-node runtime transport remains QUIC over UDP only.
|
||||
|
||||
That does not permit:
|
||||
|
||||
- one bootstrap address;
|
||||
- one update mirror;
|
||||
- one registry carrier;
|
||||
- one ingress node;
|
||||
- one relay;
|
||||
- one control replica.
|
||||
|
||||
QUIC is the transport. Survivability requires many signed ways to discover the
|
||||
current valid QUIC endpoints.
|
||||
|
||||
### 2.4 No Single Service May Own Recovery
|
||||
|
||||
Recovery must not depend on one:
|
||||
|
||||
- backend URL;
|
||||
- DNS name;
|
||||
- HTTP ingress;
|
||||
- update repository host;
|
||||
- relay node;
|
||||
- cluster admin node.
|
||||
|
||||
Any of those may disappear while the node is still healthy enough to recover.
|
||||
|
||||
## 3. Required Recovery Layers
|
||||
|
||||
### 3.1 Embedded Bootstrap Seed Set
|
||||
|
||||
Each installable node package must contain a bounded bootstrap seed set:
|
||||
|
||||
- multiple seed nodes;
|
||||
- public and private candidates where appropriate;
|
||||
- QUIC endpoint candidates only;
|
||||
- signed bootstrap metadata;
|
||||
- expiry / epoch rules;
|
||||
- optional organization / cluster scope constraints.
|
||||
|
||||
The bootstrap seed set is only the first door, not cluster truth.
|
||||
|
||||
### 3.2 Signed Registry Gossip
|
||||
|
||||
After bootstrap, a node must learn current service locations through signed
|
||||
fabric registry records that can be carried by any reachable peer.
|
||||
|
||||
Required properties:
|
||||
|
||||
- multiple records per service;
|
||||
- quorum or otherwise policy-approved signatures;
|
||||
- monotonic epoch/generation;
|
||||
- expiry and freshness checks;
|
||||
- live probe before promotion;
|
||||
- ability to accept newer records from a reachable neighbor even when old
|
||||
origins are gone.
|
||||
|
||||
### 3.3 Outbound-Only Recovery Attachment
|
||||
|
||||
A node behind NAT or in passive mode must be recoverable through an outbound
|
||||
attachment.
|
||||
|
||||
Required behaviors:
|
||||
|
||||
- the node can maintain at least one long-lived outbound QUIC control channel;
|
||||
- that channel survives IP changes by reconnecting through any remaining seed or
|
||||
signed registry endpoint;
|
||||
- the node may receive updated registry truth, update triggers, workload
|
||||
changes, and recovery instructions over that channel;
|
||||
- the fabric must not require inbound TCP/UDP reachability to repair the node.
|
||||
|
||||
### 3.4 Local Recovery Agent Boundary
|
||||
|
||||
The node must have a minimal recovery-capable local agent boundary that is
|
||||
separate from ordinary service workloads.
|
||||
|
||||
It must be able to:
|
||||
|
||||
- validate signed update plans;
|
||||
- download artifacts from multiple mirrors;
|
||||
- stage replacement binaries;
|
||||
- restart node-agent or host-agent tasks;
|
||||
- rollback to previous binaries;
|
||||
- swap to new signed registry/bootstrap records;
|
||||
- emit recovery status when transport returns.
|
||||
|
||||
If node workloads fail, this local recovery boundary must still exist.
|
||||
|
||||
### 3.5 Multi-Source Artifact Delivery
|
||||
|
||||
Artifacts must be retrievable from more than one source:
|
||||
|
||||
- local cached file;
|
||||
- cluster update-cache;
|
||||
- organization-local cache if policy allows;
|
||||
- public or internet-reachable mirror;
|
||||
- neighbor-assisted relay transfer over the fabric.
|
||||
|
||||
A node must not become unrecoverable because one artifact hostname or one
|
||||
download service disappeared.
|
||||
|
||||
### 3.6 Trigger And Subscription Plane
|
||||
|
||||
Polling alone is not enough for very large fleets.
|
||||
|
||||
Required model:
|
||||
|
||||
- nodes may still perform slow fallback polling;
|
||||
- primary update notification uses subscription/signal delivery;
|
||||
- update-cache or registry service can repeatedly signal pending updates until
|
||||
acknowledged;
|
||||
- signals are idempotent;
|
||||
- signals do not require the old control endpoint to remain alive.
|
||||
|
||||
## 4. Update Safety Rules
|
||||
|
||||
### 4.1 Upgrade Contracts
|
||||
|
||||
Every release that changes recovery-critical contracts must explicitly declare:
|
||||
|
||||
- minimum supported old version;
|
||||
- maximum tolerated skew;
|
||||
- whether migration is rolling-safe;
|
||||
- whether the node must first update host-agent or node-agent;
|
||||
- rollback compatibility;
|
||||
- whether old bootstrap/registry envelopes remain accepted.
|
||||
|
||||
### 4.2 Two-Key Rule For Breaking Changes
|
||||
|
||||
Do not simultaneously break:
|
||||
|
||||
- discovery of where to get the update; and
|
||||
- ability to understand the update once found.
|
||||
|
||||
At least one of those must remain compatible until fleet convergence or
|
||||
explicit retirement.
|
||||
|
||||
### 4.3 Old Artifact Retention
|
||||
|
||||
Recovery-critical artifact versions must remain available until:
|
||||
|
||||
- all nodes have moved past them; or
|
||||
- the remaining nodes are revoked/retired and recorded as intentionally lost.
|
||||
|
||||
Do not garbage-collect the last working host-agent or node-agent build for an
|
||||
unrecovered population.
|
||||
|
||||
### 4.4 Install Type Continuity
|
||||
|
||||
If historical nodes request different install types for the same product
|
||||
(`windows_binary`, `windows_service`, `native`, `linux_binary`, etc.), recovery
|
||||
planning must keep compatibility aliases until the fleet converges.
|
||||
|
||||
The fabric must not strand nodes on an install-type naming mismatch.
|
||||
|
||||
### 4.5 Legacy Recovery Contract Drift Must Be Treated As A Blocking Risk
|
||||
|
||||
A stale node may present the following combination:
|
||||
|
||||
- a compatible recovery artifact exists under the current registry; but
|
||||
- the last local updater/host-agent status still says `no_matching_artifact` or
|
||||
an equivalent legacy contract failure.
|
||||
|
||||
This means the node is not merely waiting for a heartbeat. It is running an older
|
||||
recovery planner contract and may still depend on:
|
||||
|
||||
- historical install-type aliases;
|
||||
- older artifact matching semantics;
|
||||
- older update-plan interpretation rules;
|
||||
- overlap in signed registry / bootstrap envelopes.
|
||||
|
||||
This condition must be classified as `legacy recovery contract drift` and must
|
||||
block compatibility removal the same way an artifact gap does.
|
||||
|
||||
Operationally this also means:
|
||||
|
||||
- the node requires a `recovery bridge`;
|
||||
- the cluster enters `bridge hold active` for compatibility-removal decisions;
|
||||
- `bridge hold` remains active until the node reports a recovery-compatible
|
||||
status on the current contract or the operator explicitly retires the node;
|
||||
- when a compatible artifact and target mapping already exist, the node should
|
||||
be classified as `bridge replay ready`, meaning the system can replay the
|
||||
legacy-compatible update plan as soon as the node regains an outbound control
|
||||
cycle;
|
||||
- operator tooling should expose a canonical `bridge replay plan` per node so
|
||||
recovery replay uses the same signed update-plan logic as normal updates;
|
||||
- compatibility aliases / overlap must remain enabled for that node population;
|
||||
- dashboards and rollout guards must show this separately from ordinary
|
||||
`waiting recovery heartbeat`.
|
||||
|
||||
Canonical example:
|
||||
|
||||
- `ifcm-rufms-s-mo1cr` is stale;
|
||||
- the current backend can match a Windows-compatible host-agent artifact;
|
||||
- the last host-agent report still says `no_matching_artifact`;
|
||||
- therefore the node must be treated as a legacy recovery-contract blocker, not
|
||||
merely as a delayed heartbeat.
|
||||
|
||||
## 5. Service And Location Mobility Rules
|
||||
|
||||
Moving a service must not strand nodes that only know the old location.
|
||||
|
||||
Required pattern:
|
||||
|
||||
1. publish new signed registry records;
|
||||
2. keep old records valid during overlap;
|
||||
3. allow any reachable peer to relay the new records;
|
||||
4. live-probe and promote the new endpoints;
|
||||
5. only then retire the old location;
|
||||
6. keep enough overlap for slow or partitioned nodes to catch up.
|
||||
|
||||
This applies to:
|
||||
|
||||
- control-api replicas;
|
||||
- update-cache/update-store replicas;
|
||||
- web/admin ingress replicas;
|
||||
- relay/rendezvous nodes;
|
||||
- service-channel endpoints.
|
||||
|
||||
## 6. Failure Classes The Fabric Must Tolerate
|
||||
|
||||
The design must explicitly handle all of these:
|
||||
|
||||
- node behind NAT with only outbound connectivity;
|
||||
- several nodes behind one NAT/local segment;
|
||||
- node changes public IP;
|
||||
- node changes private IP;
|
||||
- old DNS/URL becomes dead;
|
||||
- artifact mirror disappears;
|
||||
- control ingress disappears;
|
||||
- relay disappears;
|
||||
- update install fails halfway;
|
||||
- binary staged but restart fails;
|
||||
- old task/service name changes;
|
||||
- local disk is nearly full;
|
||||
- time skew causes signature freshness risk;
|
||||
- authority rotates;
|
||||
- route authority replica disappears;
|
||||
- state directory survives but binary is broken;
|
||||
- binary survives but state directory is partly stale;
|
||||
- node reboots during update;
|
||||
- only one peer still knows the new registry truth;
|
||||
- node is partitioned for a long time and rejoins later;
|
||||
- platform removes legacy support too early;
|
||||
- operator has no shell/RDP/WinRM/SSH access to the host.
|
||||
|
||||
## 7. Required Local State And Journaling
|
||||
|
||||
The node local state store must retain at least:
|
||||
|
||||
- active and previous signed registry records;
|
||||
- active and previous bootstrap seeds;
|
||||
- last successful update plan per product;
|
||||
- last applied artifact hash/version;
|
||||
- last rollback candidate;
|
||||
- last successful service endpoints used for update/control;
|
||||
- pending trigger generation;
|
||||
- recovery attempts with timestamps and reasons;
|
||||
- last known good runtime command line / task/unit identity;
|
||||
- last known workload desired states.
|
||||
|
||||
Writes must be atomic. A power loss must not leave the node with zero valid
|
||||
state.
|
||||
|
||||
## 8. Observability And Fleet Safety Rules
|
||||
|
||||
The control plane must make invisible-recovery risk explicit.
|
||||
|
||||
It must surface:
|
||||
|
||||
- nodes with stale heartbeat but recent updater activity;
|
||||
- nodes with no working compatible recovery artifact;
|
||||
- nodes whose pinned registry/bootstrap epoch is too old;
|
||||
- nodes whose only known artifact URL is dead;
|
||||
- nodes whose desired state requires a contract they cannot parse;
|
||||
- nodes whose local agent version is below the minimum recovery floor;
|
||||
- nodes whose last successful contact depended on a single service replica.
|
||||
|
||||
Cluster-wide changes that would strand such nodes must be blocked or require an
|
||||
explicit recovery-admin override.
|
||||
|
||||
## 9. Release And Migration Checklist
|
||||
|
||||
Before deleting old code, old formats, or old endpoints, verify all of these:
|
||||
|
||||
1. every active node has confirmed a compatible version; or the remaining nodes
|
||||
are explicitly marked for manual retirement/recovery;
|
||||
2. host-agent and node-agent recovery paths both have matching artifacts;
|
||||
3. bootstrap/registry overlap exists for the migration window;
|
||||
4. at least two independent artifact sources remain reachable;
|
||||
5. signed registry gossip can carry the new locations without the old API
|
||||
hostname;
|
||||
6. rollback artifacts are still available;
|
||||
7. install type aliases remain for historical agents where needed;
|
||||
8. NAT/passive/outbound-only nodes were explicitly tested;
|
||||
9. stale-node risk report is empty or consciously accepted by recovery-admin;
|
||||
10. removal of legacy support is documented with the exact cutoff conditions.
|
||||
|
||||
## 10. `ifcm-rufms-s-mo1cr` Rule
|
||||
|
||||
`ifcm-rufms-s-mo1cr` is the standing reference case for future work.
|
||||
|
||||
For this node class, the platform must assume:
|
||||
|
||||
- the host is behind NAT;
|
||||
- the node may only keep outbound channels;
|
||||
- no direct Windows administrative access exists;
|
||||
- old discovery endpoints may disappear;
|
||||
- only the fabric/update/recovery plane can save the node.
|
||||
|
||||
Any future transport, update, authority, bootstrap, registry, or workload
|
||||
change must be reviewed against this question:
|
||||
|
||||
> If `ifcm-rufms-s-mo1cr` is still on the older contract and we cannot log in to
|
||||
> the host, can the fabric still recover it?
|
||||
|
||||
If the answer is no, the change is incomplete.
|
||||
|
||||
## 11. Immediate Follow-Through
|
||||
|
||||
The system should keep implementing these concrete items:
|
||||
|
||||
- separate documented recovery-plane tests for Windows NAT nodes;
|
||||
- signed registry retention and overlap checks before endpoint migration;
|
||||
- compatibility alias coverage for historical install types;
|
||||
- artifact availability health over all mirrors;
|
||||
- stale-node risk dashboard/report before legacy removal;
|
||||
- node-local journaling for last good registry/update state;
|
||||
- neighbor-assisted artifact relay path;
|
||||
- explicit recovery simulation for outbound-only nodes with dead old endpoints.
|
||||
|
||||
## 12. Decision
|
||||
|
||||
The fabric must treat node survival as a first-class architecture contract.
|
||||
|
||||
A node is not considered safe merely because the happy path works. It is safe
|
||||
only when it can survive protocol migration, endpoint relocation, partial
|
||||
cluster loss, artifact source loss, and lack of manual host access without
|
||||
being abandoned.
|
||||
@@ -256,9 +256,11 @@ The first backend contract slice is implemented:
|
||||
observations, and degraded backend relay usage. These incidents keep backend
|
||||
relay visible as degraded compatibility behavior rather than hidden steady
|
||||
state.
|
||||
- Node-agent access telemetry distinguishes backend relay actually used from
|
||||
backend relay blocked by signed data-plane policy. Blocked fallback reports
|
||||
include `backend_fallback_blocked` and the last violation status/reason, and
|
||||
- Node-agent access telemetry distinguishes degraded compatibility requested
|
||||
from degraded compatibility blocked by signed data-plane policy. Blocked
|
||||
compatibility reports include `degraded_compatibility_blocked` and the last
|
||||
violation status/reason, while preserving the original raw violation code in
|
||||
a separate field for historical correlation, and
|
||||
backend projects them to access telemetry plus `data_plane_contract`
|
||||
incidents.
|
||||
- Backend correlates access-report send failures with active service-channel
|
||||
@@ -421,8 +423,8 @@ The first backend contract slice is implemented:
|
||||
keeps failing outside manual retry cooldown creates a bounded rebuild
|
||||
request. If an unfenced alternate is available, Control Plane marks the
|
||||
rebuild `applied` and selects that route generation; if no alternate exists,
|
||||
it records `pending_degraded_fallback` and keeps backend relay as the
|
||||
explicit degraded path until a new route appears. The compatibility release
|
||||
it records `pending_degraded_route_state` and keeps the channel in explicit
|
||||
degraded route state until a new route appears. The compatibility release
|
||||
`0.2.175` keeps node/host-agent signed-config models aligned with these new
|
||||
fields.
|
||||
- C18U moves rebuild metadata into node-agent runtime behavior. Node-agent
|
||||
@@ -437,10 +439,10 @@ The first backend contract slice is implemented:
|
||||
- C18V adds route-manager transition telemetry and churn coverage. Node-agent
|
||||
`0.2.177` reports `route_manager_transition` alongside the current manager
|
||||
snapshot, including previous/current generation, status, decision count,
|
||||
withdrawn route count, restored route count, pending-degraded fallback count,
|
||||
withdrawn route count, restored route count, pending degraded route-state count,
|
||||
rebuild applied count, and any cached selected route cleared because Control
|
||||
Plane withdrew it. Coverage verifies three service-neutral lifecycle cases:
|
||||
applied rebuild replacement, pending degraded fallback when no alternate is
|
||||
applied rebuild replacement, pending degraded route state when no alternate is
|
||||
available, and rollback/restoration when a fresh config removes the rebuild
|
||||
decision.
|
||||
- C18W adds a live docker-test verification loop for that telemetry. The smoke
|
||||
@@ -973,8 +975,8 @@ The first backend contract slice is implemented:
|
||||
in C18Z45; rebuild snapshot maintenance health with overdue/runtime-evidence
|
||||
visibility landed in C18Z46; node-agent signed service-channel lease
|
||||
enforcement when cluster authority is pinned landed in C18Z47; backend
|
||||
introspection fallback for unsigned compatibility clients landed in C18Z48;
|
||||
accepted-by telemetry for signed/introspection/legacy ingress landed in
|
||||
introspection fallback for token-authorized compatibility clients landed in C18Z48;
|
||||
accepted-by telemetry for signed/introspection/token-authorized ingress landed in
|
||||
C18Z49; durable lease introspection across backend restarts landed in C18Z50;
|
||||
bounded durable lease cleanup and admin visibility landed in C18Z51; durable
|
||||
accepted-by access telemetry aggregation with heartbeat fallback and admin
|
||||
@@ -983,9 +985,9 @@ The first backend contract slice is implemented:
|
||||
visibility landed in C18Z53; C18Z54 smoke proves the same diagnostics on a
|
||||
normal non-fallback primary route with healthy rolling route-quality feedback;
|
||||
C18Z55 smoke proves degraded/fenced normal-route feedback is shown separately
|
||||
from explicit backend fallback; C18Z56 adds active-channel remediation
|
||||
from explicit degraded compatibility requests; C18Z56 adds active-channel remediation
|
||||
diagnostics (`none`, `rebuild_route`, `prefer_alternate_route`,
|
||||
`use_backend_fallback`) to make the next runtime action explicit, and its
|
||||
`hold_degraded_route_state`) to make the next runtime action explicit, and its
|
||||
alternate-route branch is live-smoke-proven with backend fallback kept off.
|
||||
C18Z57 adds the bounded machine-readable `remediation_command` contract to
|
||||
active access telemetry rows so route-manager can consume a short-lived
|
||||
@@ -1058,7 +1060,7 @@ The first backend contract slice is implemented:
|
||||
`rebuild_request_recorded` or `rebuild_request_rejected` for the active
|
||||
channel. C18Z76 adds node-side acknowledgement for the allowed
|
||||
`rebuild_route` branch: node-agent consumes the command as a route-manager
|
||||
`pending_degraded_fallback` decision with source
|
||||
`pending_degraded_route_state` decision with source
|
||||
`service_channel_remediation_command`, while guarded commands remain ignored.
|
||||
Backend access telemetry correlates that heartbeat evidence with the durable
|
||||
ledger and reports `rebuild_request_recorded_node_pending`. C18Z77 resolves
|
||||
@@ -1089,7 +1091,7 @@ The first backend contract slice is implemented:
|
||||
reselecting the degraded replacement or adding fallback/failure/drop deltas.
|
||||
C18Z82 proves the no-safe-recovery branch: if that replacement is also fenced
|
||||
and no safe recovery route exists, synthetic config reports
|
||||
`service_channel_feedback_no_alternate` / `pending_degraded_fallback` with
|
||||
`service_channel_feedback_no_alternate` / `pending_degraded_route_state` with
|
||||
`no_unfenced_alternate_route` instead of silently keeping a bad route.
|
||||
C18Z83 projects that route-manager decision into active access telemetry and
|
||||
web-admin active-channel diagnostics, including decision source, route id,
|
||||
@@ -1124,7 +1126,8 @@ The first backend contract slice is implemented:
|
||||
`data_plane` is present in the lease, authority payload, introspection
|
||||
response, and lease-maintenance/admin list. It declares backend API as
|
||||
control-plane transport, fabric service channel/fabric route as working
|
||||
data/steady-state transport, backend relay as degraded fallback only, and
|
||||
data/steady-state transport, degraded compatibility relay as an explicit
|
||||
compatibility state only, and
|
||||
service-neutral protocol-agnostic isolated logical flows as the runtime
|
||||
contract for VPN, Remote Workspace, files, video, and future services. C18Z91
|
||||
makes node-agent consume the signed/introspected data-plane contract, apply
|
||||
@@ -1187,12 +1190,13 @@ channel class, selected entry node, allowed flow isolation, and data-plane
|
||||
contract on `remote-workspaces/{resource_id}/streams/{channel_class}`. Empty
|
||||
probe requests return `202` with a remote-workspace ingress probe contract and
|
||||
access telemetry; real RDP frame forwarding remains deliberately
|
||||
`not_implemented` until the service adapter work begins.
|
||||
limited to `payload_flow=validated_only` for empty probes until the service adapter work begins.
|
||||
C19E adds a narrow frame-batch probe on that boundary. The adapter contract
|
||||
advertises `rap.remote_workspace_frame_batch.v1`, and entry-node accepts
|
||||
non-empty payloads only when they are JSON probe batches with `probe_only=true`,
|
||||
valid remote-workspace logical channels, valid directions, and bounded payload
|
||||
metadata. Accepted probes return `payload_flow=validated_probe_only`; production
|
||||
metadata. Accepted frame probes return `payload_flow=validated_probe_only`, while
|
||||
empty/control probes return `payload_flow=validated_only`; production
|
||||
frame forwarding is still not enabled.
|
||||
C19F connects that validated probe to a node-agent local adapter sink. The
|
||||
in-memory `node_agent_rdp_worker_contract_probe` sink accepts only validated
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Status: Stage C17 planning completed. Stage C17A synthetic mesh runtime
|
||||
skeleton, Stage C17B route health/failover probes, Stage C17C relay semantic
|
||||
hardening, Stage C17D non-production test-service path experiment, Stage C17E
|
||||
live node-to-node synthetic HTTP transport skeleton, Stage C17F scoped
|
||||
historical live node-to-node synthetic HTTP transport skeleton, Stage C17F scoped
|
||||
synthetic route config boundary, Stage C17G Control Plane scoped synthetic
|
||||
config read boundary, Stage C17H deployed multi-agent synthetic config smoke,
|
||||
Stage C17I production forwarding gate, Stage C17J production envelope
|
||||
@@ -44,8 +44,9 @@ invalidation. C17C added synthetic relay validation, per-channel bounded
|
||||
queues, QoS dequeue order, telemetry-only drop/backpressure, and reliable
|
||||
fabric/control rejection behavior. C17D added one bounded `synthetic.echo`
|
||||
test-service path over direct, single-relay, and forced fallback routes. C17E
|
||||
added real HTTP peer transport and a disabled-by-default node-agent synthetic
|
||||
endpoint/smoke harness for direct and single-relay synthetic traffic. C17F
|
||||
added one historical real-HTTP peer transport experiment and a
|
||||
disabled-by-default node-agent synthetic endpoint/smoke harness for direct and
|
||||
single-relay synthetic traffic only. C17F
|
||||
added scoped synthetic peer/route config loading and synthetic route-health
|
||||
link observation reporting. C17G added the Control Plane read boundary for
|
||||
node-scoped synthetic mesh config. C17H proved that boundary in a deployed
|
||||
@@ -596,10 +597,12 @@ C17H implemented a deployed multi-agent synthetic config smoke on
|
||||
VPN/IP tunnel work remains a separate C18 track and must not be mixed into
|
||||
C17 mesh runtime work.
|
||||
|
||||
## 15.4 C17E Result
|
||||
## 15.4 C17E Historical Result
|
||||
|
||||
C17E implemented live node-to-node synthetic HTTP transport while preserving
|
||||
the production forwarding kill-switch:
|
||||
C17E implemented a historical live node-to-node synthetic HTTP transport
|
||||
experiment while preserving the production forwarding kill-switch. This result
|
||||
is retained only as test-history context; it is not the active transport
|
||||
direction for the fabric runtime. The experiment delivered:
|
||||
|
||||
- `HTTPPeerTransport` maps explicit peer node IDs to synthetic HTTP endpoint
|
||||
URLs.
|
||||
@@ -613,6 +616,13 @@ the production forwarding kill-switch:
|
||||
- `/mesh/v1/forward` remains disabled.
|
||||
- no production service traffic is authorized.
|
||||
|
||||
Current direction:
|
||||
|
||||
- active fabric runtime transport is QUIC-only.
|
||||
- synthetic HTTP peer transport is historical test-only context.
|
||||
- production forwarding/runtime acceptance must use QUIC route execution rather
|
||||
than HTTP peer transport.
|
||||
|
||||
Verification:
|
||||
|
||||
```powershell
|
||||
@@ -888,9 +898,11 @@ runtime. Stage C17A implements the first narrow runtime skeleton for synthetic
|
||||
Fabric messages only. Stage C17B adds route health/failover observations using
|
||||
synthetic Fabric messages only. Stage C17C adds relay semantic hardening for
|
||||
synthetic channel classes only. Stage C17D adds one bounded non-production
|
||||
`synthetic.echo` service-path experiment only. Stage C17E proves live
|
||||
node-to-node synthetic HTTP transport using real local endpoints only. Stage
|
||||
C17F proves scoped synthetic config loading and route-health reporting only.
|
||||
`synthetic.echo` service-path experiment only. Stage C17E proves one
|
||||
historical synthetic HTTP carrier experiment using real local endpoints only;
|
||||
it is test-only and not representative of the active QUIC fabric runtime.
|
||||
Stage C17F proves scoped synthetic config loading and route-health reporting
|
||||
only.
|
||||
Stage C17G proves Control Plane scoped synthetic config read/consume only.
|
||||
Stage C17H proves deployed multi-agent Control Plane synthetic config
|
||||
consumption and synthetic route-health reporting on `docker-test` only.
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
# Production Direct Worker WSS Trust
|
||||
|
||||
Archived status: this document describes an older direct-worker WSS trust
|
||||
track. It is not the current runtime transport source of truth. For the active
|
||||
fabric transport model, use
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
|
||||
Status: P3.4 design/prep complete.
|
||||
|
||||
This document defines the production trust model for direct worker WSS. It is a
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
# RDP Adapter Runtime
|
||||
|
||||
Paused/archival note: this document remains useful for RDP adapter internals,
|
||||
but it is not the current source of truth for transport/runtime architecture.
|
||||
Fabric transport is now QUIC-only between nodes. For active transport,
|
||||
recovery, and routing behavior, see
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
|
||||
Status: active implementation plan for the new C++ RDP Adapter internals.
|
||||
|
||||
Current implementation status:
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
# RDP Stage 5.2 Design Pass - Server-To-Client File Download
|
||||
|
||||
Archived status: this document belongs to the earlier direct-worker/backend-gateway
|
||||
RDP track and is not the current source of truth for fabric transport
|
||||
architecture. The active inter-node transport model is QUIC-only; see
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
|
||||
Status: design-complete proposal, no runtime implementation in this step.
|
||||
|
||||
Date: 2026-04-26
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
# RDP Service C++ Performance Target
|
||||
|
||||
Paused/archival note: this document is an RDP performance track record, not the
|
||||
current source of truth for node-to-node transport. Fabric transport is now
|
||||
QUIC-only between nodes; use
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md` for the active transport
|
||||
model.
|
||||
|
||||
## Status
|
||||
|
||||
This is the paused RDP service performance direction. The implementation name is `RDP Adapter`: a concrete `Service Adapter` that translates Microsoft RDP into the platform session/data-plane protocol. The common adapter contract is defined in `docs/architecture/SERVICE_ADAPTER_PROTOCOL.md`; the RDP-specific runtime plan is defined in `docs/architecture/RDP_ADAPTER_RUNTIME.md`.
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
# RDP Service C# Target Architecture
|
||||
|
||||
Archived scope note: this document is retained as historical RDP runtime
|
||||
research and is not the current source of truth for node-to-node transport.
|
||||
Fabric transport is now QUIC-only between nodes; use
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md` for the active transport
|
||||
model.
|
||||
|
||||
## Status
|
||||
|
||||
Superseded.
|
||||
|
||||
@@ -8,6 +8,12 @@ The current proven RDP lifecycle remains a preserved implementation baseline.
|
||||
RDP work is currently paused by product decision. The active architecture focus
|
||||
is the lower Fabric Core / cluster / node foundation.
|
||||
|
||||
Transport clarification: historical references in this document to direct
|
||||
worker WSS or backend gateway fallback describe the earlier RDP service proof
|
||||
path and migration context. They must not be read as the current inter-node
|
||||
transport contract. The active fabric node-to-node runtime transport is
|
||||
QUIC-only.
|
||||
|
||||
## 1. Project Vision
|
||||
|
||||
The project is a Secure Access Fabric: a distributed, multi-tenant platform for secure access to private resources across sites, networks, and organizations.
|
||||
@@ -1702,7 +1708,7 @@ Channels must have independent priority, reliability, and backpressure behavior.
|
||||
|
||||
The current RDP MVP proves lifecycle and basic viewer behavior. It is not the target production performance model.
|
||||
|
||||
Target RDP realtime model:
|
||||
Target RDP realtime model for the paused historical RDP service track:
|
||||
|
||||
- client connects to direct/relay data plane, not backend frame relay
|
||||
- input/control channels are separate from render/video
|
||||
@@ -2459,7 +2465,11 @@ This is an incremental migration plan. It must not be executed as a big-bang rew
|
||||
|
||||
### Current Fallback
|
||||
|
||||
Keep the current backend WebSocket gateway as fallback while the production data plane is introduced.
|
||||
Historical migration note: the older RDP MVP kept the backend WebSocket
|
||||
gateway as a temporary fallback while an earlier production data-plane design
|
||||
was being introduced. This is not the active fabric transport plan. Current
|
||||
fabric node-to-node runtime transport is QUIC-only, and old compatibility paths
|
||||
are being removed rather than extended.
|
||||
|
||||
Current RDP MVP remains the preserved service-adapter baseline, but it is not
|
||||
the active implementation focus while Fabric Core stages are underway.
|
||||
@@ -2543,9 +2553,14 @@ These stages must be introduced only through explicit, narrow implementation
|
||||
prompts. RDP/VNC/SSH/VPN/video/file services remain above the Fabric Core and
|
||||
must not define the lower fabric foundation.
|
||||
|
||||
### Stage DP-1: Direct Worker WSS
|
||||
### Historical Stage DP-1: Direct Worker WSS
|
||||
|
||||
Introduce a short-lived authorized direct WSS path from client to worker or worker-local live endpoint.
|
||||
This stage records an earlier RDP service migration concept. It is paused and
|
||||
retained for historical context only. It must not be read as the active fabric
|
||||
transport roadmap.
|
||||
|
||||
Introduce a short-lived authorized direct WSS path from client to worker or
|
||||
worker-local live endpoint.
|
||||
|
||||
Goals:
|
||||
|
||||
@@ -2554,7 +2569,7 @@ Goals:
|
||||
- keep session broker lifecycle unchanged
|
||||
- keep fallback gateway available
|
||||
|
||||
### Stage DP-2: Binary Frames
|
||||
### Historical Stage DP-2: Binary Frames
|
||||
|
||||
Replace base64 JSON frame payloads with binary frame messages.
|
||||
|
||||
@@ -2565,7 +2580,7 @@ Goals:
|
||||
- reduce JSON/base64 overhead
|
||||
- preserve latest-frame-only behavior
|
||||
|
||||
### Stage DP-3: Adaptive Quality
|
||||
### Historical Stage DP-3: Adaptive Quality
|
||||
|
||||
Implement adaptive RDP quality profiles.
|
||||
|
||||
@@ -2577,9 +2592,10 @@ Goals:
|
||||
- bandwidth and latency feedback
|
||||
- bounded frame queues
|
||||
|
||||
### Stage DP-4: Relay Nodes
|
||||
### Historical Stage DP-4: Relay Nodes
|
||||
|
||||
Introduce `entry-node` and `relay-node` roles for data-plane routing.
|
||||
Introduce `entry-node` and `relay-node` roles for the earlier service-specific
|
||||
data-plane routing model.
|
||||
|
||||
Goals:
|
||||
|
||||
|
||||
@@ -1,20 +1,28 @@
|
||||
# Security And Secrets Readiness
|
||||
|
||||
Status: P3.3 test-stand smoke complete for encrypted resource secrets,
|
||||
assignment-time resolution, and production fallback behavior with smoke-only
|
||||
direct worker WSS trust.
|
||||
Archived scope note: this document records an earlier RDP/direct-worker trust
|
||||
and secret-handling stage. It is not the current source of truth for fabric
|
||||
transport architecture. The active inter-node transport model is QUIC-only; see
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
|
||||
Status: P3.3 historical test-stand smoke complete for encrypted resource
|
||||
secrets, assignment-time resolution, and legacy RDP baseline behavior with
|
||||
smoke-only direct-worker trust.
|
||||
|
||||
This document defines the next security hardening layer around the accepted RDP
|
||||
MVP baseline. It does not implement mesh, VPN, server-to-client download, new
|
||||
protocol adapters, or another RDP rendering mode.
|
||||
|
||||
## Current Accepted Baseline
|
||||
## Current Accepted Historical RDP Baseline
|
||||
|
||||
- RDP worker baseline: `rap-rdp-worker:rdp-p1-region-order2`
|
||||
- Backend control plane remains source of truth.
|
||||
- Redis remains live coordination/routing only.
|
||||
- Direct worker WSS is preferred for realtime RDP.
|
||||
- Backend gateway remains fallback/debug.
|
||||
- Historical direct-worker WSS was the preferred realtime RDP path in this
|
||||
stage.
|
||||
- Historical backend gateway remained a fallback/debug path for this stage.
|
||||
- Text clipboard is policy-gated and accepted.
|
||||
- Client-to-server file upload and restricted `RAP_Transfers` visibility are
|
||||
accepted.
|
||||
@@ -124,22 +132,24 @@ Already accepted:
|
||||
- worker rejects wrong worker, wrong attachment, wrong organization, wrong
|
||||
resource, over-broad channels, failed/terminated sessions, and jti replay
|
||||
|
||||
Production still needs:
|
||||
Production still needed for that stage:
|
||||
|
||||
- deployed certificate chain for direct worker WSS on production nodes
|
||||
- pinned or platform-issued worker certificates in live production config
|
||||
- deployed certificate chain for the historical direct-worker WSS path on
|
||||
production nodes
|
||||
- pinned or platform-issued worker certificates in live production config for
|
||||
that historical path
|
||||
- no smoke-only TLS bypass in production clients
|
||||
- rotation process for data-plane signing keys
|
||||
- audit for failed token validation/bind attempts
|
||||
|
||||
P3.2 guard exists:
|
||||
P3.2 historical guard exists:
|
||||
|
||||
- backend distinguishes `smoke_insecure`, `public_ca`, and `platform_ca`
|
||||
direct worker WSS trust modes
|
||||
- production backend omits smoke-only direct candidates
|
||||
- Windows production client skips untrusted or smoke-only direct candidates
|
||||
- backend distinguished `smoke_insecure`, `public_ca`, and `platform_ca`
|
||||
direct-worker trust modes for the historical RDP path
|
||||
- production backend omitted smoke-only direct candidates on that path
|
||||
- Windows production client skipped untrusted or smoke-only direct candidates
|
||||
|
||||
P3.3 test-stand smoke exists:
|
||||
P3.3 historical test-stand smoke exists:
|
||||
|
||||
- `resource_secrets` migration is applied on `docker-test`
|
||||
- backend runs as `APP_ENV=production` with a test-only
|
||||
@@ -149,9 +159,9 @@ P3.3 test-stand smoke exists:
|
||||
- `resources.metadata`, `remote_sessions.metadata`, and `audit_events` were
|
||||
checked for plaintext username/password leakage
|
||||
- production backend with `DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure`
|
||||
returns backend gateway fallback only
|
||||
returned the historical backend gateway debug path only
|
||||
- development/smoke backend with the same trust mode advertises the explicit
|
||||
smoke-only direct worker WSS candidate
|
||||
smoke-only historical direct-worker candidate
|
||||
- `RAP_Transfers` smoke passed on the secret-backed resource
|
||||
|
||||
## Required Regression Tests
|
||||
@@ -202,8 +212,8 @@ P3.1 implemented audit events for:
|
||||
assignment payload; a future resolver pull/token flow should reduce exposure
|
||||
in Redis control queues.
|
||||
- Worker still depends on plaintext assignment metadata for development smoke.
|
||||
- Production direct worker WSS certificate issuance/rotation and platform CA
|
||||
distribution are not complete.
|
||||
- Production certificate issuance/rotation and platform CA distribution for the
|
||||
historical direct-worker path are not complete.
|
||||
- The test-stand secret key is a host-local test file, not a production KMS or
|
||||
HSM-backed key.
|
||||
- Automated end-to-end policy denial coverage is still thin.
|
||||
|
||||
@@ -1,7 +1,21 @@
|
||||
# Service Adapter Protocol
|
||||
|
||||
Scope note: this document remains the common adapter-model reference, but it is
|
||||
not the current source of truth for transport/runtime topology between fabric
|
||||
nodes. Fabric transport is now QUIC-only between nodes; for active transport,
|
||||
routing, and recovery behavior see
|
||||
`docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md`,
|
||||
`docs/architecture/FABRIC_FIRST_TRANSPORT_AND_STRESS_PLAN.md`, and
|
||||
`docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`.
|
||||
|
||||
Status: target contract and compile-safe foundation. This document defines the common adapter model for RDP, SSH, VNC, and future services. It does not replace the current backend control plane or current RDP runtime by itself.
|
||||
|
||||
Transport clarification: historical references in this document to direct
|
||||
worker WSS, backend gateway fallback, or DP-1 channel shape belong to the
|
||||
earlier RDP service baseline. They are not the active inter-node transport
|
||||
contract. Current fabric node-to-node transport is QUIC-only; service adapters
|
||||
consume fabric routes rather than define transport fallback behavior.
|
||||
|
||||
## 1. Purpose
|
||||
|
||||
The platform client must not implement third-party protocols directly.
|
||||
@@ -94,12 +108,16 @@ adapter runtime.
|
||||
- Service Adapter does not know UI implementation details.
|
||||
- Control Plane remains authoritative for session lifecycle and policy.
|
||||
- PostgreSQL remains source of truth; Redis remains live coordination only.
|
||||
- Direct worker WSS and backend gateway fallback remain valid transports.
|
||||
- Fabric transport remains QUIC-only between nodes; any historical direct
|
||||
worker or backend fallback paths belong to paused service-specific baselines,
|
||||
not to the active fabric transport contract.
|
||||
- Adapter runtime must not create sessions outside broker/assignment control.
|
||||
|
||||
## 4. Logical Channels
|
||||
|
||||
The session protocol is channel-oriented even when DP-1 uses one WSS connection.
|
||||
The session protocol is channel-oriented regardless of the concrete carrier. A
|
||||
historical DP-1 single-WSS shape may still appear in paused RDP notes, but it
|
||||
is not the current fabric transport contract.
|
||||
|
||||
| Channel | Direction | Reliability | Priority | Purpose |
|
||||
| --- | --- | --- | --- | --- |
|
||||
|
||||
@@ -7,6 +7,11 @@ Secure Access Fabric. It does not implement VPN runtime, packet routing, TUN
|
||||
devices, mesh traffic, service workload execution, API changes, migrations, or
|
||||
RDP behavior changes.
|
||||
|
||||
Transport clarification: this document defines a service layer above Fabric
|
||||
Core. It does not redefine node-to-node transport. Current fabric inter-node
|
||||
transport is QUIC-only; VPN/IP tunnel runtime must request and use fabric
|
||||
routes instead of introducing a separate packet transport contract.
|
||||
|
||||
## Purpose
|
||||
|
||||
VPN/IP tunnel is a service above the Fabric Core, not a node-local setting.
|
||||
|
||||
@@ -9,6 +9,15 @@ Secure Access Fabric.
|
||||
The fabric node-to-node transport remains QUIC-only. HTTP/HTTPS is allowed only
|
||||
as an external client-facing service edge.
|
||||
|
||||
Terminology rule:
|
||||
|
||||
- `Fabric Transport` = QUIC/UDP node-to-node runtime layer.
|
||||
- `Control API` = HTTP/HTTPS management surface for UI, automation, releases,
|
||||
policy, audit, and status.
|
||||
|
||||
The Control API may use HTTP/HTTPS, but it is not a fallback or alternate
|
||||
carrier for fabric node-to-node runtime traffic.
|
||||
|
||||
## Purpose
|
||||
|
||||
The platform needs a clear distinction between:
|
||||
|
||||
@@ -115,9 +115,9 @@ for container in rap_test_postgres rap_test_redis rap_test_backend rap_web_admin
|
||||
done
|
||||
|
||||
redis_guard
|
||||
probe_http "downloads" "$BACKEND_URL/downloads/rap-android-rdp-vpn-build.json"
|
||||
probe_http "downloads" "$BACKEND_URL/downloads/rap-android-vpn-build.json"
|
||||
probe_http "web_admin_root" "$BACKEND_URL/"
|
||||
probe_http "diagnostics" "$PUBLIC_URL/api/v1/clusters/$CLUSTER_ID/vpn/client-diagnostics"
|
||||
probe_http "backend_healthz" "http://127.0.0.1:18121/healthz"
|
||||
|
||||
used_after="$(disk_used_percent)"
|
||||
status="ok"
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -4,7 +4,7 @@
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Панель Secure Access Fabric</title>
|
||||
<script type="module" crossorigin src="/assets/index-gMV--oab.js"></script>
|
||||
<script type="module" crossorigin src="/assets/index-CiNvRobk.js"></script>
|
||||
<link rel="stylesheet" crossorigin href="/assets/index-Cur_BAkX.css">
|
||||
</head>
|
||||
<body>
|
||||
|
||||
+827
-55
File diff suppressed because it is too large
Load Diff
+133
-1
@@ -36,6 +36,7 @@ import type {
|
||||
NodeSyntheticMeshConfig,
|
||||
NodeTelemetryObservation,
|
||||
NodeUpdatePlan,
|
||||
NodeBridgeReplayPlan,
|
||||
NodeUpdatePolicy,
|
||||
NodeUpdateStatus,
|
||||
NodeWorkloadDesiredState,
|
||||
@@ -46,6 +47,7 @@ import type {
|
||||
ReleaseVersion,
|
||||
Resource,
|
||||
RoleAssignment,
|
||||
StaleNodeRiskReport,
|
||||
UserAccount,
|
||||
VPNClientDiagnosticCommand,
|
||||
VPNClientDiagnosticStatus,
|
||||
@@ -66,6 +68,7 @@ type ApiErrorPayload = {
|
||||
message_key?: string;
|
||||
fallback_message?: string;
|
||||
trace_id?: string;
|
||||
details?: Record<string, unknown>;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -106,6 +109,25 @@ export type UpsertNodeUpdatePolicyPayload = {
|
||||
healthWindowSeconds?: number;
|
||||
};
|
||||
|
||||
/**
 * Caller-facing input for `AdminApiClient.createReleaseVersion`.
 *
 * Field names are camelCase here; the client converts the artifact fields to
 * the snake_case wire names when it builds the request body.
 */
export type CreateReleaseVersionPayload = {
  /** Product identifier the release belongs to. */
  product: string;
  /** Version string of the release. */
  version: string;
  /** Optional release channel. */
  channel?: string;
  /** Optional release status. */
  status?: string;
  /** Optional free-form compatibility descriptor. */
  compatibility?: Record<string, unknown>;
  /** Optional changelog text. */
  changelog?: string;
  /** Artifacts attached to the release (required, may be empty). */
  artifacts: {
    os: string;
    arch: string;
    installType: string;
    kind: string;
    url: string;
    sha256: string;
    sizeBytes?: number;
    metadata?: Record<string, unknown>;
  }[];
};
|
||||
|
||||
export type UpdateFabricServiceChannelRecoveryPolicyPayload = {
|
||||
hysteresisPenalty?: number;
|
||||
promotionMinSamples?: number;
|
||||
@@ -436,6 +458,37 @@ export class AdminApiClient {
|
||||
return payload.release_versions ?? [];
|
||||
}
|
||||
|
||||
/**
 * Registers a new release version for a cluster.
 *
 * POSTs to `/clusters/{clusterId}/updates/releases` with the acting user id and
 * the release description, then returns the `release_version` object from the
 * response.
 *
 * Falsy optional fields are replaced before sending: `channel` -> "stable",
 * `status` -> "active", `compatibility` -> {}, `changelog` -> "", and per
 * artifact `sizeBytes` -> 0, `metadata` -> {}.
 *
 * @param clusterId Cluster the release is created in (interpolated into the path as-is).
 * @param input Release description in camelCase form.
 * @returns The created release version as reported by the backend.
 */
async createReleaseVersion(clusterId: string, input: CreateReleaseVersionPayload): Promise<ReleaseVersion> {
  // Translate each artifact to the snake_case wire shape, keeping key order stable.
  const artifacts = input.artifacts.map(({ os, arch, installType, kind, url, sha256, sizeBytes, metadata }) => ({
    os,
    arch,
    install_type: installType,
    kind,
    url,
    sha256,
    size_bytes: sizeBytes || 0,
    metadata: metadata || {},
  }));

  const { release_version: created } = await this.post<{ release_version: ReleaseVersion }>(
    `/clusters/${clusterId}/updates/releases`,
    {
      actor_user_id: this.actorUserId,
      product: input.product,
      version: input.version,
      channel: input.channel || "stable",
      status: input.status || "active",
      compatibility: input.compatibility || {},
      changelog: input.changelog || "",
      artifacts,
    },
  );
  return created;
}
|
||||
|
||||
/**
 * Requests `/clusters/{clusterId}/updates/stale-node-risk-report` through
 * `this.get`, passing the acting user as the `actor_user_id` query
 * parameter.
 *
 * @param clusterId Cluster identifier interpolated into the request path.
 * @returns The `stale_node_risk_report` field of the response payload.
 */
async getStaleNodeRiskReport(clusterId: string): Promise<StaleNodeRiskReport> {
  const params = new URLSearchParams({ actor_user_id: this.actorUserId });
  const payload = await this.get<{ stale_node_risk_report: StaleNodeRiskReport }>(
    `/clusters/${clusterId}/updates/stale-node-risk-report?${params.toString()}`,
  );
  return payload.stale_node_risk_report;
}
|
||||
|
||||
async getNodeUpdatePlan(
|
||||
clusterId: string,
|
||||
nodeId: string,
|
||||
@@ -453,6 +506,14 @@ export class AdminApiClient {
|
||||
return payload.node_update_plan;
|
||||
}
|
||||
|
||||
/**
 * Requests `/clusters/{clusterId}/nodes/{nodeId}/updates/bridge-replay-plan`
 * through `this.get`, passing the acting user as the `actor_user_id` query
 * parameter.
 *
 * @param clusterId Cluster identifier interpolated into the request path.
 * @param nodeId Node identifier interpolated into the request path.
 * @returns The `node_bridge_replay_plan` field of the response payload.
 */
async getNodeBridgeReplayPlan(clusterId: string, nodeId: string): Promise<NodeBridgeReplayPlan> {
  const params = new URLSearchParams({ actor_user_id: this.actorUserId });
  const payload = await this.get<{ node_bridge_replay_plan: NodeBridgeReplayPlan }>(
    `/clusters/${clusterId}/nodes/${nodeId}/updates/bridge-replay-plan?${params.toString()}`,
  );
  return payload.node_bridge_replay_plan;
}
|
||||
|
||||
async upsertNodeUpdatePolicy(clusterId: string, nodeId: string, input: UpsertNodeUpdatePolicyPayload): Promise<NodeUpdatePolicy> {
|
||||
const payload = await this.put<{ node_update_policy: NodeUpdatePolicy }>(`/clusters/${clusterId}/nodes/${nodeId}/updates/policy`, {
|
||||
actor_user_id: this.actorUserId,
|
||||
@@ -1269,7 +1330,7 @@ export class AdminApiClient {
|
||||
let message = `Запрос завершился ошибкой HTTP ${response.status}`;
|
||||
try {
|
||||
const payload = (await response.json()) as ApiErrorPayload;
|
||||
message = payload.error?.fallback_message || payload.error?.code || message;
|
||||
message = formatApiErrorMessage(payload, response.status) || payload.error?.fallback_message || payload.error?.code || message;
|
||||
} catch {
|
||||
// Keep generic HTTP message if backend did not return JSON.
|
||||
}
|
||||
@@ -1279,6 +1340,77 @@ export class AdminApiClient {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds a detailed message for API errors that carry structured `details`.
 *
 * Only one case is recognised: HTTP 409 with the error code
 * `conflict.legacy_compatibility_removal_is_blocked_while_stale_recovery_risk_nodes_remain`.
 * Any other payload yields "" so the caller can fall back to
 * `fallback_message`, the error code, or its generic HTTP text.
 *
 * The message is assembled from, in order: a fixed headline, the blocked
 * operation, the non-zero node counters, the blocked node ids, the recovery
 * bridge hold summary, and the trace id. Missing or wrongly-typed detail
 * values are skipped rather than rendered.
 *
 * @param payload Parsed JSON error body.
 * @param status HTTP status code of the response.
 * @returns The formatted message, or "" when no special formatting applies.
 */
function formatApiErrorMessage(payload: ApiErrorPayload, status: number): string {
  const error = payload.error;
  if (!error) {
    return "";
  }
  if (status !== 409 || error.code !== "conflict.legacy_compatibility_removal_is_blocked_while_stale_recovery_risk_nodes_remain") {
    return "";
  }

  const details = error.details || {};
  const parts: string[] = ["Compatibility cleanup заблокирован."];

  const blockedOperation = stringDetail(details, "blocked_operation");
  if (blockedOperation) {
    parts.push(`Операция: ${blockedOperation}.`);
  }

  // [details key, display label] pairs, in output order. Each counter is
  // read once and shown only when it is a non-zero finite number.
  const counterLabels: Array<[string, string]> = [
    ["blocked_nodes", "blockers"],
    ["stale_nodes", "stale"],
    ["artifact_gap_nodes", "artifact gap"],
    ["unknown_profile_nodes", "profile unknown"],
    ["waiting_update_status_nodes", "waiting status"],
    ["unknown_version_nodes", "version unknown"],
    ["legacy_recovery_contract_nodes", "legacy contract"],
    ["recovery_bridge_required_nodes", "recovery bridge"],
    ["recovery_bridge_replay_ready_nodes", "bridge replay ready"],
    ["waiting_recovery_heartbeat_nodes", "waiting heartbeat"],
  ];
  const counters: string[] = [];
  for (const [key, label] of counterLabels) {
    const value = numberDetail(details, key);
    if (value !== 0) {
      counters.push(`${label} ${value}`);
    }
  }
  if (counters.length > 0) {
    parts.push(counters.join(" / ") + ".");
  }

  const nodeIds = arrayDetail(details, "blocked_node_ids");
  if (nodeIds.length > 0) {
    parts.push(`Blocked nodes: ${nodeIds.join(", ")}.`);
  }

  if (booleanDetail(details, "bridge_hold_required")) {
    const holdReasons = arrayDetail(details, "bridge_hold_reasons");
    const holdNodes = arrayDetail(details, "bridge_hold_node_ids");
    const holdSummary: string[] = [];
    if (holdReasons.length > 0) {
      holdSummary.push(`reasons ${holdReasons.join(", ")}`);
    }
    if (holdNodes.length > 0) {
      holdSummary.push(`nodes ${holdNodes.join(", ")}`);
    }
    parts.push(`Recovery bridge hold active${holdSummary.length > 0 ? `: ${holdSummary.join(" / ")}` : ""}.`);
  }

  // The payload comes straight from JSON, so trace_id may not be a string at
  // runtime. Guard the type: a throw here would be swallowed by the caller's
  // catch and discard the whole detailed message.
  const traceID = typeof error.trace_id === "string" ? error.trace_id.trim() : "";
  if (traceID) {
    parts.push(`Trace: ${traceID}.`);
  }

  return parts.join(" ");
}
|
||||
|
||||
/**
 * Reads `source[key]` as a string.
 *
 * @returns The value with surrounding whitespace trimmed, or "" when the
 *   key is missing or the value is not a string.
 */
function stringDetail(source: Record<string, unknown>, key: string): string {
  const value = source[key];
  if (typeof value !== "string") {
    return "";
  }
  return value.trim();
}
|
||||
|
||||
/**
 * Reads `source[key]` as a finite number.
 *
 * @returns The value, or 0 when the key is missing, the value is not a
 *   number, or it is NaN / ±Infinity.
 */
function numberDetail(source: Record<string, unknown>, key: string): number {
  const value = source[key];
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  return 0;
}
|
||||
|
||||
/**
 * Reports whether `source[key]` is exactly the boolean `true`.
 *
 * Truthy non-boolean values (for example "true" or 1) count as false.
 */
function booleanDetail(source: Record<string, unknown>, key: string): boolean {
  return source[key] === true;
}
|
||||
|
||||
/**
 * Reads `source[key]` as a list of strings.
 *
 * Non-string items and strings that are empty or whitespace-only are
 * dropped; the kept strings are returned as-is (not trimmed).
 *
 * @returns The filtered list, or [] when the key is missing or the value is
 *   not an array.
 */
function arrayDetail(source: Record<string, unknown>, key: string): string[] {
  const value = source[key];
  if (!Array.isArray(value)) {
    return [];
  }
  return value.filter((item): item is string => typeof item === "string" && item.trim().length > 0);
}
|
||||
|
||||
function browserDeviceFingerprint(): string {
|
||||
const key = "rap.webAdmin.deviceFingerprint";
|
||||
const existing = localStorage.getItem(key);
|
||||
|
||||
@@ -343,6 +343,28 @@ export type NodeUpdatePlan = {
|
||||
production_forwarding: boolean;
|
||||
};
|
||||
|
||||
/**
 * One per-product entry in `NodeBridgeReplayPlan.products`, pairing the
 * product's bridge replay fields with its `NodeUpdatePlan`.
 */
export type NodeBridgeReplayProductPlan = {
  product: string;
  recovery_bridge_mode?: string;
  recovery_bridge_replay_ready: boolean;
  last_status_reason?: string;
  update_plan: NodeUpdatePlan;
};
|
||||
|
||||
/**
 * Response object delivered under the `node_bridge_replay_plan` key by
 * `AdminApiClient.getNodeBridgeReplayPlan`.
 *
 * Optional members may be absent from the payload; consumers should treat a
 * missing list as empty.
 */
export type NodeBridgeReplayPlan = {
  schema_version: string;
  cluster_id: string;
  node_id: string;
  node_name?: string;
  health_status?: string;
  heartbeat_stale: boolean;
  bridge_hold_required: boolean;
  recovery_bridge_replay_ready: boolean;
  bridge_hold_reasons?: string[];
  bridge_actions?: string[];
  products?: NodeBridgeReplayProductPlan[];
};
|
||||
|
||||
export type NodeUpdatePolicy = {
|
||||
id: string;
|
||||
cluster_id: string;
|
||||
@@ -374,6 +396,78 @@ export type NodeUpdateStatus = {
|
||||
observed_at: string;
|
||||
};
|
||||
|
||||
/**
 * One per-product entry in `StaleNodeRiskNode.products`.
 *
 * Only `product` and `compatible_artifact_found` are required; most string
 * fields may be omitted or explicitly `null`.
 */
export type StaleNodeRiskProduct = {
  product: string;
  current_version?: string | null;
  target_version?: string | null;
  channel?: string | null;
  strategy?: string | null;
  enabled?: boolean;
  detected_os?: string | null;
  detected_arch?: string | null;
  detected_install_type?: string | null;
  compatible_artifact_found: boolean;
  matching_release_version?: string | null;
  last_status_observed_at?: string | null;
  last_status_phase?: string | null;
  last_status_value?: string | null;
  last_status_reason?: string | null;
  recovery_bridge_required?: boolean;
  recovery_bridge_replay_ready?: boolean;
  recovery_bridge_mode?: string | null;
  risks?: string[];
};
|
||||
|
||||
/**
 * One per-node entry in `StaleNodeRiskReport.nodes`, carrying node-level
 * flags and risk labels plus a `StaleNodeRiskProduct` list.
 */
export type StaleNodeRiskNode = {
  node_id: string;
  name: string;
  node_key?: string;
  reported_version?: string | null;
  health_status: string;
  registration_status: string;
  last_seen_at?: string | null;
  heartbeat_stale: boolean;
  blocked: boolean;
  direct_peer_alert?: boolean;
  direct_peer_ready_count?: number;
  direct_peer_target_count?: number;
  direct_peer_deficit?: number;
  alerts?: string[];
  recovery_bridge_required?: boolean;
  recovery_bridge_replay_ready?: boolean;
  recovery_bridge_actions?: string[];
  risks: string[];
  products: StaleNodeRiskProduct[];
};
|
||||
|
||||
/**
 * Node counters exposed as `StaleNodeRiskReport.summary`.
 *
 * The three totals are always present; the remaining counters are optional
 * and may be absent from the payload.
 */
export type StaleNodeRiskSummary = {
  total_nodes: number;
  stale_nodes: number;
  blocked_nodes: number;
  direct_peer_alert_nodes?: number;
  artifact_gap_nodes?: number;
  unknown_profile_nodes?: number;
  waiting_update_status_nodes?: number;
  unknown_version_nodes?: number;
  legacy_recovery_contract_nodes?: number;
  recovery_bridge_required_nodes?: number;
  recovery_bridge_replay_ready_nodes?: number;
  waiting_recovery_heartbeat_nodes?: number;
};
|
||||
|
||||
/**
 * Response object delivered under the `stale_node_risk_report` key by
 * `AdminApiClient.getStaleNodeRiskReport`.
 *
 * Combines report-level flags and lists with a `StaleNodeRiskSummary` and
 * the per-node `StaleNodeRiskNode` entries.
 */
export type StaleNodeRiskReport = {
  cluster_id: string;
  generated_at: string;
  heartbeat_stale_after_seconds?: number;
  legacy_removal_allowed: boolean;
  bridge_hold_required?: boolean;
  bridge_hold_node_ids?: string[];
  bridge_hold_reasons?: string[];
  blocked_operations?: string[];
  summary: StaleNodeRiskSummary;
  nodes: StaleNodeRiskNode[];
};
|
||||
|
||||
export type MeshLink = {
|
||||
id: string;
|
||||
cluster_id: string;
|
||||
@@ -1196,6 +1290,8 @@ export type NodeSyntheticMeshConfig = {
|
||||
auto_port_start?: number;
|
||||
auto_port_end?: number;
|
||||
advertise_endpoint?: string;
|
||||
advertise_endpoints?: string[];
|
||||
endpoint_candidates?: PeerEndpointCandidate[];
|
||||
advertise_transport?: string;
|
||||
connectivity_mode?: string;
|
||||
nat_type?: string;
|
||||
|
||||
Reference in New Issue
Block a user