This commit is contained in:
2026-05-18 21:33:39 +03:00
parent 5096155d83
commit 469fa0e860
94 changed files with 8761 additions and 8003 deletions
@@ -7,7 +7,7 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
const Version = "0.2.309-latencyaware"
const Version = "0.2.321-directreadytarget"
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
return client.EnrollRequest{
@@ -828,9 +828,6 @@ func (c *Client) RawControl(ctx context.Context, request RawControlRequest) (Raw
if err != nil {
return RawControlResponse{}, err
}
if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
return RawControlResponse{}, fmt.Errorf("backend returned status %d: %s", httpResp.StatusCode, string(payload))
}
return RawControlResponse{StatusCode: httpResp.StatusCode, Body: json.RawMessage(payload)}, nil
}
@@ -1,6 +1,7 @@
package config
import (
"encoding/json"
"errors"
"flag"
"os"
@@ -31,7 +32,6 @@ type Config struct {
EnrollmentPollTimeout time.Duration
MeshSyntheticRuntimeEnabled bool
MeshProductionForwardingEnabled bool
MeshFabricSessionEnabled bool
VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool
MeshQUICFabricListenAddr string
@@ -45,6 +45,7 @@ type Config struct {
MeshListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
@@ -86,7 +87,6 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.StringVar(&cfg.WebIngressRuntimeServiceClasses, "web-ingress-runtime-service-classes", getEnv(env, "RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES", ""), "Optional comma-separated allow-list of web ingress runtime service classes accepted by this node.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getEnvBool(env, "RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint. Disabled by default.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getEnvBool(env, "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric session when explicitly enabled. Disabled by default.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getEnvBool(env, "RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener. Disabled by default.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getEnv(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "Listen address for QUIC/UDP fabric endpoint, for example :19443.")
@@ -94,12 +94,13 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getEnvInt(env, "RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.DurationVar(&cfg.VPNFabricQUICIdleTTL, "vpn-fabric-quic-idle-ttl", time.Duration(getEnvInt(env, "RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300))*time.Second, "Idle TTL for cached VPN QUIC carrier connections.")
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default historical synthetic mesh HTTP endpoint.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getEnv(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry gossip records used as bootstrap discovery seeds.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Transport label for the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.")
@@ -150,6 +151,7 @@ func Load(args []string, env map[string]string) (Config, error) {
}
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
if cfg.MeshAdvertiseTransport == "" {
cfg.MeshAdvertiseTransport = "quic"
@@ -199,6 +201,9 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
}
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
return Config{}, errors.New("fabric registry records must be a JSON array")
}
switch cfg.MeshListenPortMode {
case "", "manual", "auto", "disabled":
if cfg.MeshListenPortMode == "" {
@@ -269,6 +274,11 @@ func hasLegacyEndpointScheme(endpoint string) bool {
strings.HasPrefix(endpoint, "wss://")
}
// isJSONArray reports whether value, after trimming surrounding whitespace,
// decodes as a JSON array. Elements are captured as raw messages, so any
// valid JSON value is allowed as an element.
//
// NOTE(review): encoding/json decodes the literal null into a nil slice
// without error, so "null" is also reported as an array here. Confirm that
// callers are fine with that.
func isJSONArray(value string) bool {
	trimmed := strings.TrimSpace(value)
	var elements []json.RawMessage
	if err := json.Unmarshal([]byte(trimmed), &elements); err != nil {
		return false
	}
	return true
}
func readEnv() map[string]string {
out := map[string]string{}
for _, pair := range os.Environ() {
@@ -25,7 +25,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
"RAP_MESH_FABRIC_SESSION_ENABLED": "true",
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED": "true",
"RAP_MESH_QUIC_FABRIC_ENABLED": "true",
"RAP_MESH_QUIC_FABRIC_LISTEN_ADDR": ":19443",
@@ -39,6 +38,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_FABRIC_REGISTRY_RECORDS_JSON": ` [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}] `,
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
"RAP_MESH_NAT_TYPE": "symmetric",
@@ -93,9 +93,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if !cfg.MeshProductionForwardingEnabled {
t.Fatal("MeshProductionForwardingEnabled = false, want true")
}
if !cfg.MeshFabricSessionEnabled {
t.Fatal("MeshFabricSessionEnabled = false, want true")
}
if !cfg.VPNFabricSessionTransportEnabled {
t.Fatal("VPNFabricSessionTransportEnabled = false, want true")
}
@@ -122,6 +119,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
}
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:19443" ||
cfg.MeshAdvertiseEndpointsJSON == "" ||
cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]` ||
cfg.MeshAdvertiseTransport != "direct_quic" ||
cfg.MeshConnectivityMode != "outbound_only" ||
cfg.MeshNATType != "symmetric" ||
@@ -1,6 +1,7 @@
package hostagent
import (
"encoding/json"
"errors"
"fmt"
"strings"
@@ -29,7 +30,6 @@ type RuntimeConfig struct {
WorkloadSupervisionEnabled bool
MeshSyntheticRuntimeEnabled bool
MeshProductionForwardingEnabled bool
MeshFabricSessionEnabled bool
VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool
MeshQUICFabricListenAddr string
@@ -42,6 +42,7 @@ type RuntimeConfig struct {
MeshListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
@@ -84,6 +85,7 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
@@ -145,6 +147,9 @@ func (cfg RuntimeConfig) ValidateInstall() error {
if cfg.ProductionObservationSinkCap < 0 {
return errors.New("production observation sink capacity must not be negative")
}
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
return errors.New("fabric registry records must be a JSON array")
}
for _, item := range cfg.ExtraEnv {
if !strings.Contains(item, "=") {
return fmt.Errorf("extra env %q must be KEY=VALUE", item)
@@ -176,3 +181,8 @@ func hasLegacyEndpointScheme(endpoint string) bool {
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
}
// isJSONArray reports whether value is a JSON array once leading and trailing
// whitespace is removed. The elements are not interpreted; they are only
// required to be valid JSON.
//
// NOTE(review): a bare JSON null also decodes into the slice without error and
// therefore passes this check. Verify that this is acceptable for install
// validation.
func isJSONArray(value string) bool {
	raw := []byte(strings.TrimSpace(value))
	var parsed []json.RawMessage
	err := json.Unmarshal(raw, &parsed)
	return err == nil
}
@@ -264,7 +264,6 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
"RAP_MESH_FABRIC_SESSION_ENABLED=" + boolString(cfg.MeshFabricSessionEnabled),
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=" + boolString(cfg.VPNFabricSessionTransportEnabled),
"RAP_MESH_QUIC_FABRIC_ENABLED=" + boolString(cfg.MeshQUICFabricEnabled),
"RAP_VPN_FABRIC_SESSION_STREAM_SHARDS=" + strconv.Itoa(cfg.VPNFabricSessionStreamShards),
@@ -295,6 +294,9 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
if cfg.MeshAdvertiseEndpointsJSON != "" {
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON="+cfg.MeshAdvertiseEndpointsJSON)
}
if cfg.FabricRegistryRecordsJSON != "" {
env = append(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON="+cfg.FabricRegistryRecordsJSON)
}
if cfg.MeshAdvertiseTransport != "" {
env = append(env, "RAP_MESH_ADVERTISE_TRANSPORT="+cfg.MeshAdvertiseTransport)
}
@@ -74,6 +74,7 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
VPNFabricQUICIdleTTLSeconds: 120,
MeshListenAddr: ":19131",
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443/",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
MeshAdvertiseTransport: "direct_quic",
MeshConnectivityMode: "private_lan",
})
@@ -96,6 +97,7 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=120",
"RAP_MESH_LISTEN_ADDR=:19131",
"RAP_MESH_ADVERTISE_ENDPOINT=quic://10.0.0.11:19443",
`RAP_FABRIC_REGISTRY_RECORDS_JSON=[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT=direct_quic",
"RAP_MESH_CONNECTIVITY_MODE=private_lan",
"rap-node-agent:test",
@@ -164,6 +166,11 @@ func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
"node_name": "node-a",
"image": "rap-node-agent:test",
"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
"fabric_registry_records": []map[string]any{{
"schema": "rap.fabric.registry.gossip_record.v1",
"service_class": "control-api",
"service_id": "control-a",
}},
"docker_image_artifact": map[string]any{
"kind": "docker_image_tar",
"image": "rap-node-agent:test",
@@ -207,6 +214,7 @@ func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
!cfg.MeshQUICFabricEnabled ||
cfg.MeshQUICFabricListenAddr != ":19443" ||
cfg.VPNFabricSessionStreamShards != 6 ||
cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api","service_id":"control-a"}]` ||
cfg.MeshConnectivityMode != "outbound_only" {
t.Fatalf("unexpected cfg: %+v", cfg)
}
@@ -72,7 +72,6 @@ func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConf
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
@@ -287,7 +286,6 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
args := []string{
result.HostAgentPath,
"update-loop",
"--backend-url", cfg.RuntimeConfig.BackendURL,
"--cluster-id", cfg.RuntimeConfig.ClusterID,
"--state-dir", result.StateDir,
"--current-version", cfg.AutoUpdateCurrentVersion,
@@ -303,6 +301,10 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
"--host-agent-binary-path", result.HostAgentPath,
}
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
args = append(args, "--backend-url", strings.TrimSpace(cfg.RuntimeConfig.BackendURL))
}
args = appendFabricUpdateArgs(args, cfg.RuntimeConfig)
if strings.TrimSpace(cfg.NodeID) != "" {
args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID))
}
@@ -363,48 +365,48 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
}
status.Payload["systemd_unit"] = req.SystemdUnitName
status.Payload["binary_path"] = req.BinaryPath
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
}
return result, nil
}
if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact == nil {
err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
runner := m.runner()
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
return result, err
}
result.Replaced = true
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
return result, nil
}
@@ -31,31 +31,34 @@ const (
)
// MonitorConfig groups the settings passed to the monitor helpers in this
// file; reportMonitorStatus and resolveMonitorIdentity both take it as a
// parameter. reportMonitorStatus copies BackendURL, StateDir,
// ClusterAuthorityPublicKey, FabricRegistryRecordsJSON, MeshRegion, Product,
// and CurrentVersion into the UpdateRequest it reports with.
//
// NOTE(review): this span is a diff hunk. The first field list is the
// pre-change struct and the second is the post-change struct; the change adds
// ClusterAuthorityPublicKey, FabricRegistryRecordsJSON, and MeshRegion.
// The remaining fields presumably configure container watching, disk
// thresholds, and cleanup — confirm against the monitor loop.
type MonitorConfig struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
Product string
CurrentVersion string
Interval time.Duration
InitialDelay time.Duration
MaxRuns int
DockerBinary string
WatchContainers []string
RestartContainers bool
RestartCooldown time.Duration
StaleRestartingAfter time.Duration
DiskPath string
TmpDir string
DiskWarnPercent int
DiskCleanupPercent int
DiskCriticalPercent int
TmpMinAge time.Duration
CleanupDocker bool
StatusFile string
Runner CommandRunner
Logf func(format string, args ...any)
restartHistory map[string]time.Time
BackendURL string
ClusterID string
NodeID string
StateDir string
ClusterAuthorityPublicKey string
FabricRegistryRecordsJSON string
MeshRegion string
Product string
CurrentVersion string
Interval time.Duration
InitialDelay time.Duration
MaxRuns int
DockerBinary string
WatchContainers []string
RestartContainers bool
RestartCooldown time.Duration
StaleRestartingAfter time.Duration
DiskPath string
TmpDir string
DiskWarnPercent int
DiskCleanupPercent int
DiskCriticalPercent int
TmpMinAge time.Duration
CleanupDocker bool
StatusFile string
Runner CommandRunner
Logf func(format string, args ...any)
restartHistory map[string]time.Time
}
type DiskUsage struct {
@@ -421,7 +424,18 @@ func reportMonitorStatus(ctx context.Context, cfg MonitorConfig, result MonitorR
if errText != "" {
req.ErrorMessage = &errText
}
return ReportNodeUpdateStatus(ctx, cfg.BackendURL, clusterID, nodeID, req)
return ReportNodeUpdateStatusForRequest(ctx, UpdateRequest{
BackendURL: cfg.BackendURL,
ClusterID: clusterID,
NodeID: nodeID,
StateDir: cfg.StateDir,
ClusterAuthorityPublicKey: cfg.ClusterAuthorityPublicKey,
FabricRegistryRecordsJSON: cfg.FabricRegistryRecordsJSON,
MeshRegion: cfg.MeshRegion,
Product: cfg.Product,
CurrentVersion: cfg.CurrentVersion,
InstallType: DefaultUpdateInstallType,
}, req)
}
func resolveMonitorIdentity(cfg MonitorConfig) (string, string, error) {
@@ -16,6 +16,7 @@ type DockerInstallProfile struct {
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
@@ -30,7 +31,6 @@ type DockerInstallProfile struct {
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
@@ -70,6 +70,7 @@ type WindowsInstallProfile struct {
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
@@ -79,7 +80,6 @@ type WindowsInstallProfile struct {
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
@@ -109,6 +109,7 @@ type LinuxInstallProfile struct {
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
@@ -118,7 +119,6 @@ type LinuxInstallProfile struct {
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshFabricSessionEnabled bool `json:"mesh_fabric_session_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
MeshQUICFabricListenAddr string `json:"mesh_quic_fabric_listen_addr"`
@@ -302,7 +302,6 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
@@ -315,6 +314,7 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
FabricRegistryRecordsJSON: string(profile.FabricRegistryRecords),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType,
@@ -10,19 +10,22 @@ import (
)
// HostAgentUpdateRequest holds the inputs for a host-agent update. Its
// updateRequest method converts it into the generic UpdateRequest.
//
// NOTE(review): this span is a diff hunk. The first field list is the
// pre-change struct and the second is the post-change struct; the change adds
// ClusterAuthorityPublicKey, FabricRegistryRecordsJSON, and MeshRegion.
type HostAgentUpdateRequest struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
CurrentVersion string
Channel string
OS string
Arch string
InstallType string
BinaryPath string
DryRun bool
RestartService string
RestartAfterApply bool
BackendURL string
ClusterID string
NodeID string
StateDir string
ClusterAuthorityPublicKey string
FabricRegistryRecordsJSON string
MeshRegion string
CurrentVersion string
Channel string
OS string
Arch string
InstallType string
BinaryPath string
DryRun bool
RestartService string
RestartAfterApply bool
}
type HostAgentUpdateLoopConfig struct {
@@ -37,18 +40,21 @@ type HostAgentUpdateLoopConfig struct {
// updateRequest converts req into the generic UpdateRequest. Product is always
// HostAgentUpdateProduct and ContainerName is always "host-agent-service".
// OS and InstallType go through firstNonEmpty with "linux" and
// BinaryUpdateInstallType as the second argument (presumably the fallback when
// the request leaves them empty — confirm against firstNonEmpty).
// BinaryPath, RestartService, and RestartAfterApply are not copied.
//
// NOTE(review): this span is a diff hunk. The first run of literal entries is
// the pre-change body and the second is the post-change body, which
// additionally forwards ClusterAuthorityPublicKey, FabricRegistryRecordsJSON,
// and MeshRegion.
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
return UpdateRequest{
BackendURL: req.BackendURL,
ClusterID: req.ClusterID,
NodeID: req.NodeID,
StateDir: req.StateDir,
Product: HostAgentUpdateProduct,
CurrentVersion: req.CurrentVersion,
OS: firstNonEmpty(req.OS, "linux"),
Arch: req.Arch,
InstallType: firstNonEmpty(req.InstallType, BinaryUpdateInstallType),
Channel: req.Channel,
ContainerName: "host-agent-service",
DryRun: req.DryRun,
BackendURL: req.BackendURL,
ClusterID: req.ClusterID,
NodeID: req.NodeID,
StateDir: req.StateDir,
ClusterAuthorityPublicKey: req.ClusterAuthorityPublicKey,
FabricRegistryRecordsJSON: req.FabricRegistryRecordsJSON,
MeshRegion: req.MeshRegion,
Product: HostAgentUpdateProduct,
CurrentVersion: req.CurrentVersion,
OS: firstNonEmpty(req.OS, "linux"),
Arch: req.Arch,
InstallType: firstNonEmpty(req.InstallType, BinaryUpdateInstallType),
Channel: req.Channel,
ContainerName: "host-agent-service",
DryRun: req.DryRun,
}
}
@@ -79,25 +85,25 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
status.Payload = map[string]any{}
}
status.Payload["binary_path"] = binaryPath
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, status)
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, status)
}
return result, nil
}
if plan.Artifact == nil {
err := errors.New("host-agent update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err
}
if !isBinaryInstallType(plan.Artifact.InstallType) {
err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err
}
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -109,7 +115,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "download", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
@@ -125,7 +131,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -137,7 +143,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
})
return result, nil
}
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
return result, err
}
result.Loaded = true
@@ -151,7 +157,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -173,8 +173,8 @@ func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServi
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
var missing []string
if runtimeCfg.BackendURL == "" {
missing = append(missing, "backend-url")
if runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "backend-url-or-fabric-registry-records-json")
}
if runtimeCfg.ClusterID == "" {
missing = append(missing, "cluster-id")
@@ -191,7 +191,6 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
args := []string{
cfg.BinaryInstallPath,
"update-loop",
"--backend-url", runtimeCfg.BackendURL,
"--cluster-id", runtimeCfg.ClusterID,
"--state-dir", runtimeCfg.StateDir,
"--container-name", runtimeCfg.ContainerName,
@@ -202,9 +201,13 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
}
args = appendFabricUpdateArgs(args, runtimeCfg)
execStart := systemdJoin(args)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent updater for %s
@@ -225,8 +228,8 @@ WantedBy=multi-user.target
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host-agent self updater")
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host-agent self updater")
}
unitName := "rap-host-agent-self-updater.service"
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
@@ -234,7 +237,6 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
args := []string{
cfg.BinaryInstallPath,
"update-host-agent-loop",
"--backend-url", runtimeCfg.BackendURL,
"--cluster-id", runtimeCfg.ClusterID,
"--state-dir", runtimeCfg.StateDir,
"--binary-path", firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath),
@@ -243,9 +245,13 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
}
args = appendFabricUpdateArgs(args, runtimeCfg)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent self updater
After=network-online.target docker.service
@@ -265,8 +271,8 @@ WantedBy=multi-user.target
func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host monitor")
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host monitor")
}
containers := uniqueTrimmed(append([]string{runtimeCfg.ContainerName}, cfg.MonitorContainers...))
if len(containers) == 0 {
@@ -277,7 +283,6 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
args := []string{
cfg.BinaryInstallPath,
"monitor-loop",
"--backend-url", runtimeCfg.BackendURL,
"--cluster-id", runtimeCfg.ClusterID,
"--state-dir", runtimeCfg.StateDir,
"--current-version", firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion),
@@ -286,6 +291,9 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
"--disk-cleanup-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCleanup, DefaultMonitorDiskCleanupPercent)),
"--disk-critical-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCritical, DefaultMonitorDiskCriticalPercent)),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if cfg.MonitorCleanupDocker {
args = append(args, "--cleanup-docker")
}
@@ -295,6 +303,7 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
for _, container := range containers {
args = append(args, "--watch-container", container)
}
args = appendFabricUpdateArgs(args, runtimeCfg)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent monitor for %s
After=network-online.target docker.service
@@ -312,6 +321,16 @@ WantedBy=multi-user.target
`, runtimeCfg.ContainerName, systemdJoin(args)), unitName, unitPath, nil
}
// appendFabricUpdateArgs extends args with the optional fabric flags taken from
// runtimeCfg and returns the resulting slice.
//
// A flag is emitted only when its value is non-empty after trimming
// whitespace; the trimmed value is appended as-is, so any quoting required by
// the target command line is left to the caller.
func appendFabricUpdateArgs(args []string, runtimeCfg RuntimeConfig) []string {
	optional := [...]struct {
		flag  string
		value string
	}{
		{flag: "--fabric-registry-records-json", value: runtimeCfg.FabricRegistryRecordsJSON},
		{flag: "--mesh-region", value: runtimeCfg.MeshRegion},
	}
	for _, opt := range optional {
		trimmed := strings.TrimSpace(opt.value)
		if trimmed == "" {
			continue
		}
		args = append(args, opt.flag, trimmed)
	}
	return args
}
func firstNonZero(values ...int) int {
for _, value := range values {
if value != 0 {
@@ -119,7 +119,7 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
for _, want := range []string{
":loop",
"rap-host-agent.exe.next",
"update-loop --backend-url",
"update-loop --cluster-id",
"--backend-url \"http://control/api/v1\"",
"--cluster-id \"cluster-1\"",
"--node-id \"node-1\"",
@@ -139,6 +139,35 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
}
}
func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
cfg := WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
ClusterID: "cluster-1",
FabricRegistryRecordsJSON: `[{"record_id":"r1"}]`,
MeshRegion: "ru-msk",
},
AutoUpdateCurrentVersion: "0.1.2",
}
result := WindowsInstallResult{
NodeName: "win-a",
StateDir: `C:\ProgramData\RAP\nodes\win-a`,
NodeAgentPath: `C:\Program Files\RAP\win-a\rap-node-agent.exe`,
TaskName: "RAP Node Agent win-a",
}
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
if strings.Contains(script, "--backend-url") {
t.Fatalf("script must not include backend-url when it is empty:\n%s", script)
}
for _, want := range []string{
`--fabric-registry-records-json [{"record_id":"r1"}]`,
"--mesh-region ru-msk",
} {
if !strings.Contains(script, want) {
t.Fatalf("script missing %q:\n%s", want, script)
}
}
}
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
@@ -3,6 +3,8 @@ package hostagent
import (
"bytes"
"context"
"crypto/ed25519"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
@@ -17,6 +19,8 @@ import (
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
@@ -33,23 +37,26 @@ const (
var ErrNodeIdentityNotReady = errors.New("node identity is not approved yet")
type UpdateRequest struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
Product string
CurrentVersion string
OS string
Arch string
InstallType string
Channel string
ContainerName string
BinaryPath string
WindowsTaskName string
SystemdUnitName string
HealthTimeout time.Duration
DryRun bool
AllowProductionMesh bool
BackendURL string
ClusterID string
NodeID string
StateDir string
ClusterAuthorityPublicKey string
FabricRegistryRecordsJSON string
MeshRegion string
Product string
CurrentVersion string
OS string
Arch string
InstallType string
Channel string
ContainerName string
BinaryPath string
WindowsTaskName string
SystemdUnitName string
HealthTimeout time.Duration
DryRun bool
AllowProductionMesh bool
}
type UpdateResult struct {
@@ -204,6 +211,9 @@ func (req UpdateRequest) Normalize() UpdateRequest {
req.ClusterID = strings.TrimSpace(req.ClusterID)
req.NodeID = strings.TrimSpace(req.NodeID)
req.StateDir = strings.TrimSpace(req.StateDir)
req.ClusterAuthorityPublicKey = strings.TrimSpace(req.ClusterAuthorityPublicKey)
req.FabricRegistryRecordsJSON = strings.TrimSpace(req.FabricRegistryRecordsJSON)
req.MeshRegion = strings.TrimSpace(req.MeshRegion)
req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct)
req.OS = firstNonEmpty(req.OS, runtime.GOOS)
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
@@ -222,8 +232,8 @@ func (req UpdateRequest) Normalize() UpdateRequest {
func (req UpdateRequest) Validate() error {
req = req.Normalize()
var missing []string
if req.BackendURL == "" {
missing = append(missing, "backend-url")
if req.BackendURL == "" && req.FabricRegistryRecordsJSON == "" {
missing = append(missing, "backend-url-or-fabric-registry-records-json")
}
if req.ClusterID == "" {
missing = append(missing, "cluster-id")
@@ -285,30 +295,30 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
}
if plan.Action != "update" {
if !req.DryRun {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromNoopPlan(req, plan))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromNoopPlan(req, plan))
}
return result, nil
}
if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact == nil {
err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if req.DryRun {
result.NewImage = artifactImage(*plan.Artifact, "")
return result, nil
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -321,7 +331,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "inspect", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "inspect", "failed", err))
return result, err
}
result.PreviousImageID = current.Image
@@ -339,7 +349,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
cfg.JoinToken = ""
result.NewImage = cfg.Image
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -351,7 +361,7 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
})
installed, err := m.Install(ctx, cfg)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true
@@ -363,14 +373,14 @@ func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upda
result.ContainerID = installed.ContainerID
if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "health_check", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "health_check", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true
}
return result, err
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -515,7 +525,27 @@ func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan
if req.Channel != "" {
values.Set("channel", req.Channel)
}
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/plan?%s", req.BackendURL, url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
path := fmt.Sprintf("/clusters/%s/nodes/%s/updates/plan?%s", url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
if raw, viaFabric, err := updateControlRawViaFabric(ctx, req, client.RawControlRequest{Method: http.MethodGet, Path: path}); viaFabric {
if err != nil {
return NodeUpdatePlan{}, err
}
if raw.StatusCode < 200 || raw.StatusCode >= 300 {
return NodeUpdatePlan{}, fmt.Errorf("fetch update plan via fabric: status %d", raw.StatusCode)
}
var out NodeUpdatePlanResponse
if err := json.Unmarshal(raw.Body, &out); err != nil {
return NodeUpdatePlan{}, err
}
if err := verifyNodeUpdatePlanAuthority(req, out.Plan); err != nil {
return NodeUpdatePlan{}, err
}
return out.Plan, nil
}
endpoint := req.BackendURL + path
if req.BackendURL == "" {
return NodeUpdatePlan{}, errors.New("update plan control API is unavailable: no active fabric route and backend-url is empty")
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return NodeUpdatePlan{}, err
@@ -538,6 +568,110 @@ func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan
return out.Plan, nil
}
// updateControlRawViaFabric tries to send rawReq to the control API through
// endpoints resolved from the signed fabric registry records in req.
//
// The boolean result is true only when an endpoint returned a response that
// decoded into a RawControlResponse; in that case the error is nil. Every
// other outcome reports false:
//   - no registry records configured, or no control-API endpoint resolved:
//     nil error;
//   - authority key, registry loading, request encoding or forwarding
//     failures: the corresponding error (for forwarding, the error of the last
//     endpoint tried).
//
// The HTTP status carried inside the response is not inspected here.
func updateControlRawViaFabric(ctx context.Context, req UpdateRequest, rawReq client.RawControlRequest) (client.RawControlResponse, bool, error) {
	var none client.RawControlResponse

	if strings.TrimSpace(req.FabricRegistryRecordsJSON) == "" {
		return none, false, nil
	}
	authorityKey, err := decodeUpdateFabricRegistryPublicKey(req)
	if err != nil {
		return none, false, err
	}

	// Registry records are accepted only when signed by the cluster authority.
	policy := mesh.FabricRegistryVerificationPolicy{
		LocalClusterID: req.ClusterID,
		TrustedIssuers: []mesh.FabricRegistryTrustedIssuer{{
			IssuerID:  "cluster-authority",
			Role:      mesh.FabricRegistryAuthorityControl,
			PublicKey: authorityKey,
			Scopes:    []string{mesh.FabricRegistryScopeFarm, mesh.FabricRegistryScopeCluster, mesh.FabricRegistryScopeOrganization},
			Services:  []string{mesh.FabricRegistryServiceControlAPI},
		}},
		RequiredSignatures: 1,
		MaxClockSkew:       2 * time.Minute,
		Now:                time.Now().UTC(),
	}
	registry, _, err := mesh.LoadFabricRegistryBootstrapRecords(req.FabricRegistryRecordsJSON, policy, false)
	if err != nil {
		return none, false, err
	}

	transport := mesh.NewQUICFabricTransport(nil)
	if req.NodeID != "" {
		transport.SetLocalPeerID(req.NodeID)
	}

	probe := mesh.FabricRegistryLiveProbeRequest{
		ClusterID:       req.ClusterID,
		PreferredRegion: req.MeshRegion,
		Timeout:         2 * time.Second,
		MaxCandidates:   8,
		Now:             time.Now().UTC(),
	}
	registry.VerifyCandidates(ctx, transport, probe)

	// The resolve timestamp is sampled after probing has finished.
	lookup := mesh.FabricRegistryResolveRequest{
		ClusterID:       req.ClusterID,
		Service:         mesh.FabricRegistryServiceControlAPI,
		Scope:           mesh.FabricRegistryScopeCluster,
		PreferredRegion: req.MeshRegion,
		Now:             time.Now().UTC(),
	}
	resolution := registry.ResolveService(lookup)
	if !resolution.Found || len(resolution.Endpoints) == 0 {
		return none, false, nil
	}

	encodedReq, err := json.Marshal(rawReq)
	if err != nil {
		return none, false, err
	}

	// forward performs one attempt against the endpoint at index and unwraps
	// the {payload, error} envelope around the control response.
	forward := func(index int) (client.RawControlResponse, error) {
		result, err := mesh.SendFabricControlForward(ctx, transport, resolution.Endpoints[index], encodedReq, 5*time.Second)
		if err != nil {
			return none, err
		}
		var envelope struct {
			Payload json.RawMessage `json:"payload,omitempty"`
			Error   string          `json:"error,omitempty"`
		}
		if err := json.Unmarshal(result.Payload, &envelope); err != nil {
			return none, err
		}
		if strings.TrimSpace(envelope.Error) != "" {
			return none, errors.New(envelope.Error)
		}
		var decoded client.RawControlResponse
		if err := json.Unmarshal(envelope.Payload, &decoded); err != nil {
			return none, err
		}
		return decoded, nil
	}

	var failure error
	for index := range resolution.Endpoints {
		response, err := forward(index)
		if err == nil {
			return response, true, nil
		}
		failure = err
	}
	if failure == nil {
		failure = errors.New("fabric control registry endpoints unavailable")
	}
	return none, false, failure
}
// decodeUpdateFabricRegistryPublicKey returns the Ed25519 cluster authority
// public key used to verify fabric registry records.
//
// The key is taken from req.ClusterAuthorityPublicKey; when that is blank and
// req.StateDir is set, it is read from the identity file in the state
// directory instead. The value must be base64 in any of the four standard
// alphabets/paddings (standard or URL-safe, padded or unpadded) and decode to
// exactly ed25519.PublicKeySize bytes.
//
// An error is returned when no key is available or when no encoding yields a
// key of the right size.
func decodeUpdateFabricRegistryPublicKey(req UpdateRequest) (ed25519.PublicKey, error) {
	value := strings.TrimSpace(req.ClusterAuthorityPublicKey)
	if value == "" && strings.TrimSpace(req.StateDir) != "" {
		// Best effort: an unreadable identity file simply leaves the key empty
		// and is reported below as a missing key.
		if identity, err := state.Load(filepath.Join(req.StateDir, state.FileName)); err == nil {
			value = strings.TrimSpace(identity.ClusterAuthorityPublicKey)
		}
	}
	if value == "" {
		return nil, errors.New("cluster authority public key is required for fabric registry records")
	}
	// Padded URL-safe base64 is included as well: a 32-byte key encodes to 44
	// characters with one '=' pad, which the raw (unpadded) decoders reject.
	encodings := []*base64.Encoding{
		base64.StdEncoding,
		base64.RawStdEncoding,
		base64.URLEncoding,
		base64.RawURLEncoding,
	}
	for _, encoding := range encodings {
		decoded, err := encoding.DecodeString(value)
		if err == nil && len(decoded) == ed25519.PublicKeySize {
			return ed25519.PublicKey(decoded), nil
		}
	}
	return nil, errors.New("cluster authority public key must be base64 Ed25519 public key")
}
func verifyNodeUpdatePlanAuthority(req UpdateRequest, plan NodeUpdatePlan) error {
identity, ok := pinnedUpdatePlanAuthority(req)
if !ok {
@@ -642,6 +776,9 @@ func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
backendURL = strings.TrimRight(strings.TrimSpace(backendURL), "/")
if backendURL == "" {
return errors.New("update status control API is unavailable: backend-url is empty")
}
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/status", backendURL, url.PathEscape(clusterID), url.PathEscape(nodeID))
body, err := json.Marshal(request)
if err != nil {
@@ -663,6 +800,33 @@ func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID s
return nil
}
// ReportNodeUpdateStatusForRequest reports request as the update status of the
// node described by req.
//
// The request is first resolved with resolveUpdateRequest. The status is then
// posted through the fabric control route; when a fabric endpoint answered,
// its status code decides the outcome and the HTTP backend is not contacted.
// When no fabric endpoint answered, the status is posted to req.BackendURL via
// ReportNodeUpdateStatus.
//
// If both routes fail, the returned error wraps the backend error and also
// carries the fabric failure, so the reason the fabric route was skipped is
// not lost.
func ReportNodeUpdateStatusForRequest(ctx context.Context, req UpdateRequest, request NodeUpdateStatusRequest) error {
	req, err := resolveUpdateRequest(req)
	if err != nil {
		return err
	}
	body, err := json.Marshal(request)
	if err != nil {
		return err
	}
	statusPath := fmt.Sprintf("/clusters/%s/nodes/%s/updates/status", url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID))
	raw, viaFabric, fabricErr := updateControlRawViaFabric(ctx, req, client.RawControlRequest{
		Method: http.MethodPost,
		Path:   statusPath,
		Body:   body,
	})
	if viaFabric {
		if fabricErr != nil {
			return fabricErr
		}
		if raw.StatusCode < 200 || raw.StatusCode >= 300 {
			return fmt.Errorf("report update status via fabric: status %d", raw.StatusCode)
		}
		return nil
	}
	// No fabric endpoint answered: fall back to the HTTP backend.
	backendErr := ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, request)
	if backendErr != nil && fabricErr != nil {
		return fmt.Errorf("%w; fabric route failed: %v", backendErr, fabricErr)
	}
	return backendErr
}
func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) {
out, err := runner.Run(ctx, docker, "inspect", containerName)
if err != nil {
@@ -686,9 +850,8 @@ func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner Co
Network: firstNonEmpty(inspected[0].HostConfig.NetworkMode, DefaultNetwork),
RestartPolicy: firstNonEmpty(inspected[0].HostConfig.RestartPolicy.Name, "unless-stopped"),
WorkloadSupervisionEnabled: parseBool(env["RAP_WORKLOAD_SUPERVISION_ENABLED"]),
MeshSyntheticRuntimeEnabled: true,
MeshSyntheticRuntimeEnabled: parseBool(env["RAP_MESH_SYNTHETIC_RUNTIME_ENABLED"]),
MeshProductionForwardingEnabled: parseBool(env["RAP_MESH_PRODUCTION_FORWARDING_ENABLED"]),
MeshFabricSessionEnabled: parseBool(env["RAP_MESH_FABRIC_SESSION_ENABLED"]),
VPNFabricSessionTransportEnabled: parseBool(env["RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED"]),
MeshQUICFabricEnabled: parseBool(env["RAP_MESH_QUIC_FABRIC_ENABLED"]),
MeshQUICFabricListenAddr: env["RAP_MESH_QUIC_FABRIC_LISTEN_ADDR"],
@@ -4,9 +4,17 @@ import (
"context"
"crypto/ed25519"
cryptorand "crypto/rand"
"crypto/rsa"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"crypto/x509/pkix"
"encoding/base64"
"encoding/hex"
"encoding/json"
"fmt"
"math/big"
"net"
"net/http"
"net/http/httptest"
"os"
@@ -16,6 +24,8 @@ import (
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
@@ -120,6 +130,81 @@ func signHostAgentPayload(t *testing.T, payload json.RawMessage, privateKey ed25
}
}
// testHostAgentQUICTLSConfig builds a throwaway server-side TLS configuration
// for QUIC fabric tests.
//
// It generates a fresh 2048-bit RSA key and a self-signed certificate for
// 127.0.0.1 that is valid from one hour ago until one hour from now, and
// advertises the "rap-fabric-data-session-v1" ALPN protocol. Any failure
// aborts the calling test.
func testHostAgentQUICTLSConfig(t *testing.T) *tls.Config {
	t.Helper()

	privateKey, err := rsa.GenerateKey(cryptorand.Reader, 2048)
	if err != nil {
		t.Fatalf("generate rsa key: %v", err)
	}

	cert := &x509.Certificate{
		SerialNumber: big.NewInt(1),
		Subject:      pkix.Name{CommonName: "127.0.0.1"},
		NotBefore:    time.Now().Add(-time.Hour),
		NotAfter:     time.Now().Add(time.Hour),
		KeyUsage:     x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment,
		ExtKeyUsage:  []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		IPAddresses:  []net.IP{net.ParseIP("127.0.0.1")},
	}
	// The template doubles as its own parent, which makes it self-signed.
	certDER, err := x509.CreateCertificate(cryptorand.Reader, cert, cert, &privateKey.PublicKey, privateKey)
	if err != nil {
		t.Fatalf("create cert: %v", err)
	}

	cfg := new(tls.Config)
	cfg.Certificates = []tls.Certificate{{
		Certificate: [][]byte{certDER},
		PrivateKey:  privateKey,
	}}
	cfg.NextProtos = []string{"rap-fabric-data-session-v1"}
	return cfg
}
// testHostAgentQUICCertSHA256 returns the lowercase hex SHA-256 digest of the
// first DER certificate in cfg's first certificate chain. The calling test is
// aborted when cfg carries no certificate.
func testHostAgentQUICCertSHA256(t *testing.T, cfg *tls.Config) string {
	t.Helper()
	chains := cfg.Certificates
	if len(chains) < 1 || len(chains[0].Certificate) < 1 {
		t.Fatal("missing test certificate")
	}
	leafDER := chains[0].Certificate[0]
	digest := sha256.Sum256(leafDER)
	return hex.EncodeToString(digest[:])
}
// signedUpdateControlRegistry returns a JSON array holding one signed fabric
// registry gossip record that advertises a single control-API endpoint.
//
// The record is cluster-scoped, issued one minute in the past, valid for one
// hour, and signed with privateKey on behalf of the "cluster-authority"
// issuer. endpoint and certSHA256 become the address and pinned peer
// certificate digest of the advertised "direct_quic" endpoint. Signing or
// encoding failures abort the calling test.
func signedUpdateControlRegistry(t *testing.T, clusterID, endpoint, certSHA256 string, publicKey ed25519.PublicKey, privateKey ed25519.PrivateKey) string {
	t.Helper()
	reference := time.Now().UTC()

	authority := mesh.FabricRegistryTrustedIssuer{
		IssuerID:  "cluster-authority",
		Role:      mesh.FabricRegistryAuthorityControl,
		PublicKey: publicKey,
	}
	controlEndpoint := mesh.FabricRegistryEndpoint{
		EndpointID:     "control-a",
		Address:        endpoint,
		Transport:      "direct_quic",
		PeerCertSHA256: certSHA256,
	}
	unsigned := mesh.FabricRegistryGossipRecord{
		SchemaVersion: mesh.FabricRegistryGossipRecordSchema,
		ClusterID:     clusterID,
		Service:       mesh.FabricRegistryServiceControlAPI,
		Scope:         mesh.FabricRegistryScopeCluster,
		Epoch:         1,
		IssuedAt:      reference.Add(-time.Minute),
		ExpiresAt:     reference.Add(time.Hour),
		IssuerNodeID:  "cluster-authority",
		IssuerRole:    mesh.FabricRegistryAuthorityControl,
		Endpoints:     []mesh.FabricRegistryEndpoint{controlEndpoint},
	}

	signedRecord, err := mesh.SignFabricRegistryGossipRecord(unsigned, authority, privateKey)
	if err != nil {
		t.Fatalf("sign registry record: %v", err)
	}
	encoded, err := json.Marshal([]mesh.FabricRegistryGossipRecord{signedRecord})
	if err != nil {
		t.Fatalf("marshal registry record: %v", err)
	}
	return string(encoded)
}
// mustJSONRaw encodes value as JSON and returns the raw bytes; the calling
// test is aborted when value cannot be marshalled.
func mustJSONRaw(t *testing.T, value any) json.RawMessage {
	t.Helper()
	encoded, marshalErr := json.Marshal(value)
	if marshalErr == nil {
		return encoded
	}
	t.Fatalf("marshal json: %v", marshalErr)
	return nil // not reached: Fatalf stops the test
}
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
urls := artifactURLsForBackend(ReleaseArtifact{
URL: "/downloads/rap-node-agent-0.2.92.tar",
@@ -223,6 +308,111 @@ func TestFetchNodeUpdatePlanAcceptsSignedPlanWithPinnedAuthority(t *testing.T) {
}
}
// TestFetchNodeUpdatePlanUsesFabricRegistryQUICControlAPI verifies that
// FetchNodeUpdatePlan retrieves a signed update plan through a QUIC fabric
// control endpoint advertised by signed registry records. BackendURL points at
// 127.0.0.1:1 and is not served by this test, so the plan can only come from
// the fabric server started here.
func TestFetchNodeUpdatePlanUsesFabricRegistryQUICControlAPI(t *testing.T) {
	stateDir, publicKey, privateKey := writePinnedAuthorityIdentity(t)
	plan := map[string]any{
		"schema_version":        "rap.node_update_plan.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"reason":                "already_current",
		"production_forwarding": false,
	}
	payload := map[string]any{
		"schema_version":        "rap.node_update_plan_authority.v1",
		"cluster_id":            "cluster-1",
		"node_id":               "node-1",
		"product":               "rap-node-agent",
		"current_version":       "0.1.0",
		"action":                "none",
		"target_version":        "",
		"artifact_sha256":       "",
		"control_plane_only":    true,
		"production_forwarding": false,
	}
	rawPayload, signature := signedAuthorityPayload(t, publicKey, privateKey, payload)
	plan["authority_payload"] = json.RawMessage(rawPayload)
	plan["authority_signature"] = signature

	// Encode the response here, on the test goroutine: mustJSONRaw may call
	// t.Fatalf, which must not be invoked from the server's handler.
	responseBody := mustJSONRaw(t, map[string]any{"node_update_plan": plan})

	tlsConfig := testHostAgentQUICTLSConfig(t)
	server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  tlsConfig,
		FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
			// The decoded request is only needed inside the handler, so it is
			// kept local instead of being shared with the test goroutine.
			var received client.RawControlRequest
			if err := json.Unmarshal(payload, &received); err != nil {
				return nil, err
			}
			if received.Method != http.MethodGet || !strings.HasPrefix(received.Path, "/clusters/cluster-1/nodes/node-1/updates/plan?") {
				return nil, fmt.Errorf("unexpected request: %+v", received)
			}
			return json.Marshal(client.RawControlResponse{StatusCode: 200, Body: responseBody})
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer server.Close()

	got, err := FetchNodeUpdatePlan(context.Background(), UpdateRequest{
		BackendURL:                "http://127.0.0.1:1",
		ClusterID:                 "cluster-1",
		NodeID:                    "node-1",
		StateDir:                  stateDir,
		FabricRegistryRecordsJSON: signedUpdateControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testHostAgentQUICCertSHA256(t, tlsConfig), publicKey, privateKey),
		CurrentVersion:            "0.1.0",
		OS:                        "linux",
		Arch:                      "amd64",
		InstallType:               "docker",
	})
	if err != nil {
		t.Fatalf("fetch plan via fabric: %v", err)
	}
	if got.Action != "none" || got.Reason != "already_current" {
		t.Fatalf("plan = %+v", got)
	}
}
// TestReportNodeUpdateStatusUsesFabricRegistryQUICControlAPI verifies that
// ReportNodeUpdateStatusForRequest posts the status through a QUIC fabric
// control endpoint advertised by signed registry records, and that the posted
// body carries the reported phase. BackendURL points at 127.0.0.1:1 and is not
// served by this test.
func TestReportNodeUpdateStatusUsesFabricRegistryQUICControlAPI(t *testing.T) {
	stateDir, publicKey, privateKey := writePinnedAuthorityIdentity(t)
	tlsConfig := testHostAgentQUICTLSConfig(t)

	// The handler hands the decoded request to the test through a buffered
	// channel, so the read below is ordered after the handler's write no
	// matter which goroutine the server runs the handler on.
	requests := make(chan client.RawControlRequest, 1)
	server, err := mesh.StartQUICFabricServer(context.Background(), mesh.QUICFabricServerConfig{
		ListenAddr: "127.0.0.1:0",
		TLSConfig:  tlsConfig,
		FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
			var received client.RawControlRequest
			if err := json.Unmarshal(payload, &received); err != nil {
				return nil, err
			}
			if received.Method != http.MethodPost || received.Path != "/clusters/cluster-1/nodes/node-1/updates/status" {
				return nil, fmt.Errorf("unexpected request: %+v", received)
			}
			select {
			case requests <- received:
			default:
				// Only the first request is inspected; never block the server.
			}
			return json.Marshal(client.RawControlResponse{StatusCode: 204, Body: json.RawMessage(`{}`)})
		},
	})
	if err != nil {
		t.Fatalf("start quic fabric server: %v", err)
	}
	defer server.Close()

	err = ReportNodeUpdateStatusForRequest(context.Background(), UpdateRequest{
		BackendURL:                "http://127.0.0.1:1",
		ClusterID:                 "cluster-1",
		NodeID:                    "node-1",
		StateDir:                  stateDir,
		FabricRegistryRecordsJSON: signedUpdateControlRegistry(t, "cluster-1", "quic://"+server.Addr().String(), testHostAgentQUICCertSHA256(t, tlsConfig), publicKey, privateKey),
		CurrentVersion:            "0.1.0",
		OS:                        "linux",
		Arch:                      "amd64",
		InstallType:               "docker",
	}, NodeUpdateStatusRequest{Product: "rap-node-agent", Phase: "download", Status: "started"})
	if err != nil {
		t.Fatalf("report status via fabric: %v", err)
	}

	select {
	case received := <-requests:
		if len(received.Body) == 0 || !strings.Contains(string(received.Body), `"phase":"download"`) {
			t.Fatalf("unexpected status body: %s", string(received.Body))
		}
	default:
		t.Fatal("fabric control handler did not receive the status request")
	}
}
func TestFetchNodeUpdatePlanAcceptsQuorumSignedPlan(t *testing.T) {
stateDir, descriptor, privateKeys := writePinnedQuorumIdentity(t)
plan := map[string]any{
@@ -66,7 +66,6 @@ func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInsta
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
MeshFabricSessionEnabled: profile.MeshFabricSessionEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
MeshQUICFabricListenAddr: profile.MeshQUICFabricListenAddr,
@@ -48,29 +48,29 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
}
status.Payload["task"] = req.WindowsTaskName
status.Payload["binary_path"] = req.BinaryPath
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
}
return result, nil
}
if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact == nil {
err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if req.DryRun {
return result, nil
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -81,7 +81,7 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
})
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -93,7 +93,7 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
@@ -101,16 +101,16 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "apply", "failed", err))
return result, err
}
}
result.Replaced = true
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
@@ -290,7 +290,6 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
updateLoopArgs := []string{
`"` + hostAgentPath + `"`,
"update-loop",
"--backend-url", `"` + cfg.RuntimeConfig.BackendURL + `"`,
"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
"--state-dir", `"` + result.StateDir + `"`,
"--current-version", currentVersion,
@@ -306,6 +305,10 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
"--host-agent-current-version", currentVersion,
"--host-agent-binary-path", `"` + hostAgentPath + `"`,
}
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
updateLoopArgs = append(updateLoopArgs, "--backend-url", `"`+strings.TrimSpace(cfg.RuntimeConfig.BackendURL)+`"`)
}
updateLoopArgs = appendFabricUpdateArgs(updateLoopArgs, cfg.RuntimeConfig)
if strings.TrimSpace(cfg.NodeID) != "" {
updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
}
@@ -6,13 +6,7 @@ import (
"encoding/json"
"fmt"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/gorilla/websocket"
)
type Client struct {
@@ -20,38 +14,6 @@ type Client struct {
HTTPClient *http.Client
}
// FabricSessionDialOptions configures how a fabric session websocket is dialed.
type FabricSessionDialOptions struct {
// Token, when non-blank after trimming, is sent in the
// X-RAP-Fabric-Session-Token handshake header.
Token string
// Header carries extra handshake headers; it is copied before use.
Header http.Header
// Dialer overrides the websocket dialer. When nil, a copy of
// websocket.DefaultDialer is used.
Dialer *websocket.Dialer
// Timeout is the handshake timeout of the default dialer and the fallback
// per-operation read/write deadline when the context has no deadline.
Timeout time.Duration
// MaxPayload bounds decoded frames; values <= 0 fall back to
// fabricproto.DefaultMaxPayload.
MaxPayload int
}
// FabricSessionClient wraps one fabric session websocket connection and
// exchanges fabricproto frames over it as binary messages.
type FabricSessionClient struct {
// conn is the underlying websocket; a nil conn is treated as closed.
conn *websocket.Conn
// timeout is the fallback read/write deadline used when the caller's
// context carries no deadline.
timeout time.Duration
// maxPayload is passed to fabricproto.UnmarshalFrame when decoding.
maxPayload int
// readMu serializes ReadFrame calls.
readMu sync.Mutex
// writeMu serializes WriteFrame calls.
writeMu sync.Mutex
}
// FabricSessionPumpOptions sizes the channels created by StartPump.
// Non-positive values are replaced by StartPump's defaults (64, 64 and 8).
type FabricSessionPumpOptions struct {
// OutboundBuffer is the capacity of the queue of frames waiting to be written.
OutboundBuffer int
// InboundBuffer is the capacity of the queue of frames read from the peer.
InboundBuffer int
// ErrorBuffer is the capacity of the error channel; errors that do not fit
// are dropped.
ErrorBuffer int
}
// FabricSessionPump runs a write loop and a read loop over a
// FabricSessionClient and exposes them through channels.
type FabricSessionPump struct {
// session is the connection both loops operate on.
session *FabricSessionClient
// outbound holds frames queued by Send for the write loop.
outbound chan fabricproto.Frame
// inbound holds frames delivered by the read loop.
inbound chan fabricproto.Frame
// errors receives loop failures on a best-effort basis.
errors chan error
// done is closed exactly once by Close to signal shutdown.
done chan struct{}
// cancel stops the context handed to both loops.
cancel context.CancelFunc
// closeMu is a sync.Once (not a mutex) that makes Close idempotent.
closeMu sync.Once
}
func NewClient(baseURL string) Client {
return Client{
BaseURL: baseURL,
@@ -147,270 +109,3 @@ func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope)
}
return result, nil
}
// DialFabricSession performs the websocket handshake against the fabric
// session endpoint derived from the client's base URL.
//
// The caller's headers are copied, and a non-blank token is attached as
// X-RAP-Fabric-Session-Token. A caller-supplied dialer is used as-is;
// otherwise a private copy of websocket.DefaultDialer is used, with
// opts.Timeout (when positive) as its handshake timeout.
func (c Client) DialFabricSession(ctx context.Context, opts FabricSessionDialOptions) (*websocket.Conn, *http.Response, error) {
	wsURL, err := c.fabricSessionWebSocketURL()
	if err != nil {
		return nil, nil, err
	}

	handshakeHeader := cloneHeader(opts.Header)
	if token := strings.TrimSpace(opts.Token); token != "" {
		handshakeHeader.Set("X-RAP-Fabric-Session-Token", token)
	}

	if opts.Dialer != nil {
		return opts.Dialer.DialContext(ctx, wsURL, handshakeHeader)
	}

	// Copy the shared default dialer so the timeout tweak stays local.
	defaultCopy := *websocket.DefaultDialer
	if opts.Timeout > 0 {
		defaultCopy.HandshakeTimeout = opts.Timeout
	}
	return defaultCopy.DialContext(ctx, wsURL, handshakeHeader)
}
// OpenFabricSession dials the fabric session websocket and wraps the
// connection in a FabricSessionClient.
//
// When the handshake fails and an HTTP response is available, the returned
// error mentions the response status and wraps the dial error. A non-positive
// opts.MaxPayload falls back to fabricproto.DefaultMaxPayload.
func (c Client) OpenFabricSession(ctx context.Context, opts FabricSessionDialOptions) (*FabricSessionClient, *http.Response, error) {
	conn, resp, err := c.DialFabricSession(ctx, opts)
	switch {
	case err != nil && resp != nil:
		return nil, resp, fmt.Errorf("fabric session websocket rejected with status %d: %w", resp.StatusCode, err)
	case err != nil:
		return nil, resp, err
	}

	session := &FabricSessionClient{
		conn:       conn,
		timeout:    opts.Timeout,
		maxPayload: opts.MaxPayload,
	}
	if session.maxPayload <= 0 {
		session.maxPayload = fabricproto.DefaultMaxPayload
	}
	return session, resp, nil
}
// SendFabricSessionFrame opens a one-shot fabric session, sends frame, waits
// for a single reply frame and closes the session again.
func (c Client) SendFabricSessionFrame(ctx context.Context, opts FabricSessionDialOptions, frame fabricproto.Frame) (fabricproto.Frame, error) {
	session, _, openErr := c.OpenFabricSession(ctx, opts)
	if openErr != nil {
		var none fabricproto.Frame
		return none, openErr
	}
	defer session.Close()

	reply, err := session.RoundTrip(ctx, frame)
	return reply, err
}
// Close closes the underlying websocket connection. It is a no-op that
// returns nil on a nil client or a client without a connection.
func (c *FabricSessionClient) Close() error {
	if c != nil && c.conn != nil {
		return c.conn.Close()
	}
	return nil
}
// WriteFrame encodes frame and writes it as one binary websocket message.
//
// Writes are serialized by writeMu. The write deadline is refreshed from ctx
// (or the client timeout) while the lock is held, immediately before writing.
func (c *FabricSessionClient) WriteFrame(ctx context.Context, frame fabricproto.Frame) error {
	if closed := c == nil || c.conn == nil; closed {
		return fmt.Errorf("fabric session client is closed")
	}

	encoded, encodeErr := fabricproto.MarshalFrame(frame)
	if encodeErr != nil {
		return encodeErr
	}

	c.writeMu.Lock()
	defer c.writeMu.Unlock()

	c.applyWriteDeadline(ctx)
	return c.conn.WriteMessage(websocket.BinaryMessage, encoded)
}
// ReadFrame reads the next websocket message and decodes it as a fabric frame.
//
// Reads are serialized by readMu. Anything other than a binary message is
// rejected, and the decoder is bounded by the client's maxPayload.
func (c *FabricSessionClient) ReadFrame(ctx context.Context) (fabricproto.Frame, error) {
	if closed := c == nil || c.conn == nil; closed {
		return fabricproto.Frame{}, fmt.Errorf("fabric session client is closed")
	}

	c.readMu.Lock()
	defer c.readMu.Unlock()

	c.applyReadDeadline(ctx)
	kind, raw, readErr := c.conn.ReadMessage()
	switch {
	case readErr != nil:
		return fabricproto.Frame{}, readErr
	case kind != websocket.BinaryMessage:
		return fabricproto.Frame{}, fmt.Errorf("fabric session websocket returned non-binary message type %d", kind)
	}
	return fabricproto.UnmarshalFrame(raw, c.maxPayload)
}
// RoundTrip writes frame and then returns the next frame read from the
// session. A write failure is returned without attempting to read.
func (c *FabricSessionClient) RoundTrip(ctx context.Context, frame fabricproto.Frame) (fabricproto.Frame, error) {
	writeErr := c.WriteFrame(ctx, frame)
	if writeErr == nil {
		return c.ReadFrame(ctx)
	}
	return fabricproto.Frame{}, writeErr
}
// StartPump starts a write loop and a read loop on this session and returns
// the pump that owns them.
//
// Non-positive buffer sizes default to 64 outbound, 64 inbound and 8 errors.
// Both loops run under a context derived from ctx that the pump's Close
// cancels.
func (c *FabricSessionClient) StartPump(ctx context.Context, opts FabricSessionPumpOptions) *FabricSessionPump {
	outboundSize, inboundSize, errorSize := opts.OutboundBuffer, opts.InboundBuffer, opts.ErrorBuffer
	if outboundSize <= 0 {
		outboundSize = 64
	}
	if inboundSize <= 0 {
		inboundSize = 64
	}
	if errorSize <= 0 {
		errorSize = 8
	}

	loopCtx, stop := context.WithCancel(ctx)
	started := &FabricSessionPump{
		session:  c,
		outbound: make(chan fabricproto.Frame, outboundSize),
		inbound:  make(chan fabricproto.Frame, inboundSize),
		errors:   make(chan error, errorSize),
		done:     make(chan struct{}),
		cancel:   stop,
	}

	go started.writeLoop(loopCtx)
	go started.readLoop(loopCtx)
	return started
}
// Send queues frame for the pump's write loop.
//
// It returns ctx.Err() when ctx is done, an error when the pump is nil or
// closed, and nil once the frame has been placed on the outbound queue. A nil
// result means the frame was queued, not that it reached the peer.
func (p *FabricSessionPump) Send(ctx context.Context, frame fabricproto.Frame) error {
	if p == nil {
		return fmt.Errorf("fabric session pump is nil")
	}

	// select chooses uniformly among ready cases, so with free space in the
	// outbound buffer a frame could be accepted on an already-closed pump or
	// cancelled context. Check the terminal conditions first so that outcome
	// is reported deterministically.
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-p.done:
		return fmt.Errorf("fabric session pump is closed")
	default:
	}

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-p.done:
		return fmt.Errorf("fabric session pump is closed")
	case p.outbound <- frame:
		return nil
	}
}
// Frames returns the receive-only channel of frames read from the peer.
// A nil pump yields a nil channel.
func (p *FabricSessionPump) Frames() <-chan fabricproto.Frame {
	if p != nil {
		return p.inbound
	}
	return nil
}
// Errors returns the receive-only channel on which the pump's loops report
// failures. A nil pump yields a nil channel.
func (p *FabricSessionPump) Errors() <-chan error {
	if p != nil {
		return p.errors
	}
	return nil
}
// Closed reports whether the pump has been shut down. A nil pump counts as
// closed. The check never blocks.
func (p *FabricSessionPump) Closed() bool {
	if p == nil {
		return true
	}
	shutDown := false
	select {
	case <-p.done:
		shutDown = true
	default:
		// done is still open: the pump is running.
	}
	return shutDown
}
// Close shuts the pump down: it closes done, cancels the loop context and
// closes the underlying session. It is safe to call more than once and on a
// nil pump; only the first call performs the teardown and can return the
// session's close error, later calls return nil.
func (p *FabricSessionPump) Close() error {
if p == nil {
return nil
}
var err error
// closeMu is a sync.Once, so close(p.done) cannot run twice and panic.
p.closeMu.Do(func() {
close(p.done)
p.cancel()
err = p.session.Close()
})
return err
}
// writeLoop drains the outbound queue and writes each frame to the session
// until ctx is done, the pump is closed, or a write fails. On exit it closes
// the whole pump.
func (p *FabricSessionPump) writeLoop(ctx context.Context) {
// Any exit path tears the pump down so the read loop and callers notice.
defer p.Close()
for {
select {
case <-ctx.Done():
p.reportError(ctx.Err())
return
case <-p.done:
return
case frame := <-p.outbound:
if err := p.session.WriteFrame(ctx, frame); err != nil {
p.reportError(err)
return
}
}
}
}
// readLoop reads frames from the session and forwards them to the inbound
// channel until a read fails, ctx is done, or the pump is closed. On exit it
// closes the whole pump.
func (p *FabricSessionPump) readLoop(ctx context.Context) {
// Any exit path tears the pump down so the write loop and callers notice.
defer p.Close()
for {
frame, err := p.session.ReadFrame(ctx)
if err != nil {
p.reportError(err)
return
}
// Delivery blocks while the inbound buffer is full, but stays
// interruptible by cancellation or Close.
select {
case <-ctx.Done():
p.reportError(ctx.Err())
return
case <-p.done:
return
case p.inbound <- frame:
}
}
}
// reportError publishes a non-nil err on the errors channel without blocking.
// When the channel's buffer is full the error is dropped.
func (p *FabricSessionPump) reportError(err error) {
	if err != nil {
		select {
		case p.errors <- err:
		default:
			// No room left: drop rather than stall the calling loop.
		}
	}
}
// applyReadDeadline sets the connection's read deadline for the next read.
//
// The deadline of ctx wins; otherwise the client timeout (when positive) is
// applied relative to now. When neither is available the deadline is cleared,
// so a deadline left behind by an earlier call cannot expire a later read
// that asked for no deadline at all.
func (c *FabricSessionClient) applyReadDeadline(ctx context.Context) {
	var deadline time.Time // zero value means "no deadline"
	if ctxDeadline, ok := ctx.Deadline(); ok {
		deadline = ctxDeadline
	} else if c.timeout > 0 {
		deadline = time.Now().Add(c.timeout)
	}
	// Best effort: a failure to set the deadline surfaces on the read itself.
	_ = c.conn.SetReadDeadline(deadline)
}
// applyWriteDeadline sets the connection's write deadline for the next write.
//
// The deadline of ctx wins; otherwise the client timeout (when positive) is
// applied relative to now. When neither is available the deadline is cleared,
// so a deadline left behind by an earlier call cannot expire a later write
// that asked for no deadline at all.
func (c *FabricSessionClient) applyWriteDeadline(ctx context.Context) {
	var deadline time.Time // zero value means "no deadline"
	if ctxDeadline, ok := ctx.Deadline(); ok {
		deadline = ctxDeadline
	} else if c.timeout > 0 {
		deadline = time.Now().Add(c.timeout)
	}
	// Best effort: a failure to set the deadline surfaces on the write itself.
	_ = c.conn.SetWriteDeadline(deadline)
}
// fabricSessionWebSocketURL derives the fabric session websocket URL from the
// client's base URL.
//
// http and https are mapped to ws and wss, ws and wss are kept, and any other
// scheme is rejected. The session path is appended to the base path (minus
// trailing slashes), and the query and fragment are dropped.
func (c Client) fabricSessionWebSocketURL() (string, error) {
	trimmed := strings.TrimSpace(c.BaseURL)
	if len(trimmed) == 0 {
		return "", fmt.Errorf("mesh base url is required")
	}

	target, err := url.Parse(trimmed)
	if err != nil {
		return "", err
	}

	if target.Scheme == "http" {
		target.Scheme = "ws"
	} else if target.Scheme == "https" {
		target.Scheme = "wss"
	} else if target.Scheme != "ws" && target.Scheme != "wss" {
		return "", fmt.Errorf("unsupported mesh base url scheme %q", target.Scheme)
	}

	target.Path = strings.TrimRight(target.Path, "/") + "/mesh/v1/fabric/session/ws"
	target.RawQuery, target.Fragment = "", ""
	return target.String(), nil
}
// cloneHeader returns an independent copy of header.
//
// The result is never nil, so callers can Set on it even when header is nil.
// Keys are canonicalized, and keys without any values are omitted.
func cloneHeader(header http.Header) http.Header {
	copied := make(http.Header, len(header))
	for name, vals := range header {
		if len(vals) == 0 {
			continue
		}
		canonical := http.CanonicalHeaderKey(name)
		copied[canonical] = append(copied[canonical], vals...)
	}
	return copied
}
@@ -1,243 +0,0 @@
package mesh
import (
"context"
"net/http/httptest"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestClientFabricSessionFrameRoundTrip checks that a single ping sent through
// SendFabricSessionFrame comes back as a pong with the same sequence and payload.
func TestClientFabricSessionFrameRoundTrip(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
response, err := client.SendFabricSessionFrame(ctx, FabricSessionDialOptions{
Token: "rap_fsn_clienttest",
Timeout: time.Second,
}, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 12,
Payload: []byte("probe"),
})
if err != nil {
t.Fatalf("send fabric session frame: %v", err)
}
if response.Type != fabricproto.FramePong || response.Sequence != 12 || string(response.Payload) != "probe" {
t.Fatalf("response = %+v, want pong seq 12", response)
}
}
// TestClientFabricSessionPersistentRoundTrips checks that one opened session
// can serve two consecutive ping/pong round trips.
func TestClientFabricSessionPersistentRoundTrips(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
Token: "rap_fsn_persistent",
Timeout: time.Second,
})
if err != nil {
t.Fatalf("open fabric session: %v", err)
}
defer session.Close()
first, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 1,
Payload: []byte("first"),
})
if err != nil {
t.Fatalf("first round trip: %v", err)
}
second, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 2,
Payload: []byte("second"),
})
if err != nil {
t.Fatalf("second round trip: %v", err)
}
if first.Type != fabricproto.FramePong || first.Sequence != 1 || string(first.Payload) != "first" {
t.Fatalf("first response = %+v, want pong seq 1", first)
}
if second.Type != fabricproto.FramePong || second.Sequence != 2 || string(second.Payload) != "second" {
t.Fatalf("second response = %+v, want pong seq 2", second)
}
}
// TestClientFabricSessionPersistentDataAcks opens stream 77 on a persistent
// session and checks that each data frame is answered by an ack carrying the
// same stream id and sequence.
func TestClientFabricSessionPersistentDataAcks(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
Token: "rap_fsn_dataacks",
Timeout: time.Second,
})
if err != nil {
t.Fatalf("open fabric session: %v", err)
}
defer session.Close()
// The open-stream frame is written without waiting for a reply.
if err := session.WriteFrame(ctx, fabricproto.Frame{
Type: fabricproto.FrameOpenStream,
StreamID: 77,
TrafficClass: fabricproto.TrafficClassInteractive,
}); err != nil {
t.Fatalf("open stream frame: %v", err)
}
first, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
StreamID: 77,
Sequence: 10,
TrafficClass: fabricproto.TrafficClassInteractive,
Payload: []byte("first payload"),
})
if err != nil {
t.Fatalf("first data round trip: %v", err)
}
second, err := session.RoundTrip(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
StreamID: 77,
Sequence: 11,
TrafficClass: fabricproto.TrafficClassInteractive,
Payload: []byte("second payload"),
})
if err != nil {
t.Fatalf("second data round trip: %v", err)
}
if first.Type != fabricproto.FrameAck || first.StreamID != 77 || first.Sequence != 10 {
t.Fatalf("first ack = %+v, want stream 77 seq 10", first)
}
if second.Type != fabricproto.FrameAck || second.StreamID != 77 || second.Sequence != 11 {
t.Fatalf("second ack = %+v, want stream 77 seq 11", second)
}
}
// TestClientFabricSessionPumpMovesIndependentFrames queues a bulk stream open,
// a bulk data frame and a ping through a pump, then waits until both the data
// ack and the pong have been received, in whatever order they arrive.
func TestClientFabricSessionPumpMovesIndependentFrames(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
session, _, err := client.OpenFabricSession(ctx, FabricSessionDialOptions{
Token: "rap_fsn_pump",
Timeout: time.Second,
})
if err != nil {
t.Fatalf("open fabric session: %v", err)
}
pump := session.StartPump(ctx, FabricSessionPumpOptions{
OutboundBuffer: 4,
InboundBuffer: 4,
ErrorBuffer: 4,
})
defer pump.Close()
if err := pump.Send(ctx, fabricproto.Frame{
Type: fabricproto.FrameOpenStream,
StreamID: 900,
TrafficClass: fabricproto.TrafficClassBulk,
}); err != nil {
t.Fatalf("send open bulk stream: %v", err)
}
if err := pump.Send(ctx, fabricproto.Frame{
Type: fabricproto.FrameData,
StreamID: 900,
Sequence: 31,
TrafficClass: fabricproto.TrafficClassBulk,
Payload: []byte("bulk payload"),
}); err != nil {
t.Fatalf("send bulk data: %v", err)
}
if err := pump.Send(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 32,
Payload: []byte("control ping"),
}); err != nil {
t.Fatalf("send ping: %v", err)
}
gotAck := false
gotPong := false
// Frames that match neither expectation are ignored; the loop ends once
// both replies were seen, or fails on a pump error or the test deadline.
for !gotAck || !gotPong {
select {
case frame := <-pump.Frames():
switch {
case frame.Type == fabricproto.FrameAck && frame.StreamID == 900 && frame.Sequence == 31:
gotAck = true
case frame.Type == fabricproto.FramePong && frame.Sequence == 32 && string(frame.Payload) == "control ping":
gotPong = true
}
case err := <-pump.Errors():
t.Fatalf("pump error: %v", err)
case <-ctx.Done():
t.Fatalf("timed out waiting for pump frames: ack=%v pong=%v", gotAck, gotPong)
}
}
}
// TestClientFabricSessionReportsRejectedStatus checks that sending a frame
// with empty dial options (no session token) fails with an error.
func TestClientFabricSessionReportsRejectedStatus(t *testing.T) {
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
}.Handler())
defer server.Close()
client := NewClient(server.URL)
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
_, err := client.SendFabricSessionFrame(ctx, FabricSessionDialOptions{}, fabricproto.Frame{Type: fabricproto.FramePing})
if err == nil {
t.Fatal("send fabric session without token unexpectedly succeeded")
}
}
// TestClientFabricSessionWebSocketURL is a table test for the base URL to
// websocket URL mapping: http becomes ws, https becomes wss, ws is kept, and a
// trailing slash on the base path is not doubled.
func TestClientFabricSessionWebSocketURL(t *testing.T) {
cases := []struct {
base string
want string
}{
{base: "http://node.example", want: "ws://node.example/mesh/v1/fabric/session/ws"},
{base: "https://node.example/base/", want: "wss://node.example/base/mesh/v1/fabric/session/ws"},
{base: "ws://node.example", want: "ws://node.example/mesh/v1/fabric/session/ws"},
}
for _, tc := range cases {
client := NewClient(tc.base)
got, err := client.fabricSessionWebSocketURL()
if err != nil {
t.Fatalf("fabricSessionWebSocketURL(%q): %v", tc.base, err)
}
if got != tc.want {
t.Fatalf("fabricSessionWebSocketURL(%q) = %q, want %q", tc.base, got, tc.want)
}
}
}
@@ -0,0 +1,94 @@
package mesh
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// fabricControlForwardSequence is a process-wide counter; each
// SendFabricControlForward call increments it and uses the new value as the
// frame sequence that the reply must echo.
var fabricControlForwardSequence atomic.Uint64
// FabricControlForwardResult is the outcome of one SendFabricControlForward call.
type FabricControlForwardResult struct {
// Payload is a copy of the payload of the matching reply frame.
Payload json.RawMessage `json:"payload,omitempty"`
// LatencyMs is the time in milliseconds between the request having been
// sent and the matching reply being received.
LatencyMs int64 `json:"latency_ms"`
// Endpoint is the address of the registry endpoint that was contacted.
Endpoint string `json:"endpoint,omitempty"`
}
// FabricTransportTargetFromRegistryEndpoint converts a registry endpoint into
// a transport target.
//
// String fields are whitespace-trimmed. The endpoint id is used both as the
// target's endpoint id and as its peer id. The target gets a 5 second timeout
// and inbound and error buffers of 4 entries each.
func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint) FabricTransportTarget {
	endpointID := strings.TrimSpace(endpoint.EndpointID)

	var target FabricTransportTarget
	target.EndpointID = endpointID
	target.PeerID = endpointID
	target.Endpoint = strings.TrimSpace(endpoint.Address)
	target.Transport = strings.TrimSpace(endpoint.Transport)
	target.PeerCertSHA256 = strings.TrimSpace(endpoint.PeerCertSHA256)
	target.Timeout = 5 * time.Second
	target.InboundBuffer = 4
	target.ErrorBuffer = 4
	return target
}
// SendFabricControlForward sends payload to endpoint as one reliable data
// frame on the fabric control stream and waits for the reply that echoes the
// same stream id and sequence number.
//
// A non-positive timeout defaults to 5 seconds. The timeout is handed to the
// transport through the target and also bounds the wait for the reply. Frames
// that do not match the request are skipped. The session is closed before
// returning.
//
// It returns an error when transport is nil, payload is empty, connecting or
// sending fails, the session reports an error or closes its channels, or the
// wait times out or ctx is cancelled.
func SendFabricControlForward(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, payload []byte, timeout time.Duration) (FabricControlForwardResult, error) {
	if transport == nil {
		return FabricControlForwardResult{}, fmt.Errorf("fabric control transport is unavailable")
	}
	if len(payload) == 0 {
		return FabricControlForwardResult{}, fmt.Errorf("fabric control payload is empty")
	}
	if timeout <= 0 {
		timeout = 5 * time.Second
	}

	target := FabricTransportTargetFromRegistryEndpoint(endpoint)
	target.Timeout = timeout
	session, err := transport.Connect(ctx, target)
	if err != nil {
		return FabricControlForwardResult{}, err
	}
	defer session.Close()

	sequence := fabricControlForwardSequence.Add(1)
	request := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricproto.TrafficClassReliable,
		StreamID:     FabricControlForwardQUICStreamID,
		Sequence:     sequence,
		// Copy so the session never aliases the caller's buffer.
		Payload: append([]byte(nil), payload...),
	}
	if err := session.Send(ctx, request); err != nil {
		return FabricControlForwardResult{}, err
	}

	// timeout is always positive here (defaulted above), so the wait is
	// unconditionally bounded.
	waitCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	startedAt := time.Now()
	for {
		select {
		case <-waitCtx.Done():
			return FabricControlForwardResult{}, waitCtx.Err()
		case err, ok := <-session.Errors():
			if !ok {
				return FabricControlForwardResult{}, fmt.Errorf("fabric control session closed")
			}
			if err != nil {
				return FabricControlForwardResult{}, err
			}
		case frame, ok := <-session.Frames():
			if !ok {
				return FabricControlForwardResult{}, fmt.Errorf("fabric control session closed")
			}
			// Ignore anything that is not the reply to this request.
			if frame.Type != fabricproto.FrameData || frame.StreamID != FabricControlForwardQUICStreamID || frame.Sequence != sequence {
				continue
			}
			return FabricControlForwardResult{
				Payload:   append(json.RawMessage(nil), frame.Payload...),
				LatencyMs: time.Since(startedAt).Milliseconds(),
				Endpoint:  endpoint.Address,
			}, nil
		}
	}
}
@@ -565,6 +565,43 @@ func TestQUICFabricServerHandlesWebIngressForwardFrames(t *testing.T) {
}
}
// TestSendFabricControlForwardUsesQUICStream starts a QUIC fabric server with
// a control handler, forwards one control payload to it with
// SendFabricControlForward, and checks that the handler's reply comes back
// inside the response envelope without an error.
func TestSendFabricControlForwardUsesQUICStream(t *testing.T) {
tlsConfig := testQUICTLSConfig(t)
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
// The handler only accepts the exact payload sent below.
FabricControlHandler: func(_ context.Context, payload []byte) ([]byte, error) {
if string(payload) != `{"method":"GET","path":"/auth/login"}` {
return nil, ErrForwardRuntimeUnavailable
}
return []byte(`{"status_code":200,"body":{"ok":true}}`), nil
},
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
result, err := SendFabricControlForward(ctx, NewQUICFabricTransport(nil), FabricRegistryEndpoint{
EndpointID: "control-a",
Address: "quic://" + server.Addr().String(),
Transport: "direct_quic",
PeerCertSHA256: testQUICCertSHA256(t, tlsConfig),
}, []byte(`{"method":"GET","path":"/auth/login"}`), time.Second)
if err != nil {
t.Fatalf("send fabric control forward: %v", err)
}
var response quicFabricControlForwardResponse
if err := json.Unmarshal(result.Payload, &response); err != nil {
t.Fatalf("decode response: %v", err)
}
if response.Error != "" || string(response.Payload) != `{"status_code":200,"body":{"ok":true}}` {
t.Fatalf("response = %+v", response)
}
}
func startQUICFabricEchoServer(t *testing.T) *quic.Listener {
t.Helper()
return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t))
@@ -164,6 +164,7 @@ func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata Fabri
case FabricRouteRelay:
relayNodeID := firstNonEmpty(strings.TrimSpace(metadata.RelayNodeID), strings.TrimSpace(metadata.ViaNodeID))
relayEndpoint := firstNonEmpty(strings.TrimRight(strings.TrimSpace(metadata.RelayEndpoint), "/"), endpoint)
relayPeerCertSHA256 := candidatePeerCertSHA256(candidate)
hops := []FabricRouteHop{}
if localNodeID != "" {
hops = append(hops, FabricRouteHop{NodeID: localNodeID, Mode: FabricRouteDirect})
@@ -173,7 +174,7 @@ func fabricRouteHopsForCandidate(candidate PeerEndpointCandidate, metadata Fabri
return hops
}
hops = append(hops,
FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint},
FabricRouteHop{NodeID: relayNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID + ":relay", Address: relayEndpoint, PeerCertSHA256: relayPeerCertSHA256},
FabricRouteHop{NodeID: targetNodeID, Mode: FabricRouteRelay, EndpointID: candidate.EndpointID, Address: endpoint, PeerCertSHA256: candidatePeerCertSHA256(candidate)},
)
return hops
@@ -44,7 +44,13 @@ func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
}
func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T) {
metadata, _ := json.Marshal(FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"})
metadata, _ := json.Marshal(struct {
FabricCandidateMetadata
TLSCertSHA256 string `json:"tls_cert_sha256,omitempty"`
}{
FabricCandidateMetadata: FabricCandidateMetadata{RelayNodeID: "node-r", RelayEndpoint: "quic://node-r:19443"},
TLSCertSHA256: "relay-cert",
})
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{{
EndpointID: "node-b-relay",
NodeID: "node-b",
@@ -69,6 +75,9 @@ func TestFabricRouteSetForPeerEndpointCandidatesBuildsRelayFallback(t *testing.T
if got := routeSet.Primary.Hops[1].NodeID; got != "node-r" {
t.Fatalf("relay hop = %q, want node-r", got)
}
if got := routeSet.Primary.Hops[1].PeerCertSHA256; got != "relay-cert" {
t.Fatalf("relay hop peer cert = %q, want relay-cert", got)
}
if routeSet.Primary.Capacity != 50 {
t.Fatalf("capacity = %d, want 50", routeSet.Primary.Capacity)
}
@@ -1,156 +0,0 @@
package mesh
import (
"context"
"fmt"
"strings"
"sync"
)
// FabricSessionPeerManager caches one running FabricSessionPump per peer
// target so repeated Get calls reuse the same session.
type FabricSessionPeerManager struct {
// mu guards sessions and stats.
mu sync.Mutex
// sessions maps the key built by fabricSessionPeerKey to the peer's pump.
sessions map[string]*FabricSessionPump
// stats accumulates counters that Snapshot reports.
stats FabricSessionPeerManagerStats
}
// FabricSessionPeerTarget identifies a peer session and how to open it.
// PeerID and BaseURL together form the cache key and must both be non-blank.
type FabricSessionPeerTarget struct {
// PeerID names the remote peer.
PeerID string
// BaseURL is the peer's mesh base URL; trailing slashes are ignored in the key.
BaseURL string
// Options are the dial options used when a new session has to be opened.
Options FabricSessionDialOptions
// Pump sizes the channels of a newly started pump.
Pump FabricSessionPumpOptions
}
// FabricSessionPeerManagerStats holds the manager's lifetime counters.
type FabricSessionPeerManagerStats struct {
// Opens counts new pumps stored in the cache.
Opens uint64 `json:"opens"`
// Reuses counts Get calls answered with an already cached live pump.
Reuses uint64 `json:"reuses"`
// ClosedEvicted counts cached pumps found closed and removed by Get.
ClosedEvicted uint64 `json:"closed_evicted"`
// ClosePeerCalls counts ClosePeer calls that had a valid target.
ClosePeerCalls uint64 `json:"close_peer_calls"`
// CloseAllCalls counts calls to Close.
CloseAllCalls uint64 `json:"close_all_calls"`
}
// FabricSessionPeerManagerSnapshot is a point-in-time view of the manager.
type FabricSessionPeerManagerSnapshot struct {
// SchemaVersion is set to "rap.fabric_session_peer_manager.v1" by Snapshot.
SchemaVersion string `json:"schema_version"`
// ActiveCount is the number of cached pumps that are still open.
ActiveCount int `json:"active_count"`
// ClosedCount is the number of cached entries that are nil or closed.
ClosedCount int `json:"closed_count"`
// Stats is a copy of the lifetime counters.
Stats FabricSessionPeerManagerStats `json:"stats"`
}
// NewFabricSessionPeerManager returns an empty manager with its session cache
// already allocated.
func NewFabricSessionPeerManager() *FabricSessionPeerManager {
	manager := new(FabricSessionPeerManager)
	manager.sessions = make(map[string]*FabricSessionPump)
	return manager
}
// Get returns the live pump cached for target, opening a new session and
// starting a pump when none is cached or the cached one is closed.
//
// It fails on a nil manager, on a target without peer id or base URL, and
// when the session cannot be opened. ctx only bounds the dial; a new pump runs
// under context.Background() so it outlives the call.
func (m *FabricSessionPeerManager) Get(ctx context.Context, target FabricSessionPeerTarget) (*FabricSessionPump, error) {
if m == nil {
return nil, fmt.Errorf("fabric session peer manager is nil")
}
key, err := fabricSessionPeerKey(target)
if err != nil {
return nil, err
}
// Fast path: reuse a cached live pump, or evict a closed one.
m.mu.Lock()
if pump := m.sessions[key]; pump != nil {
if pump.Closed() {
delete(m.sessions, key)
m.stats.ClosedEvicted++
} else {
m.stats.Reuses++
m.mu.Unlock()
return pump, nil
}
}
m.mu.Unlock()
// The dial happens without holding the lock so other peers are not blocked.
session, _, err := NewClient(target.BaseURL).OpenFabricSession(ctx, target.Options)
if err != nil {
return nil, err
}
pump := session.StartPump(context.Background(), target.Pump)
// Re-check under the lock: another caller may have stored a pump for the
// same key while this one was dialing. If so, keep theirs and discard ours.
m.mu.Lock()
if existing := m.sessions[key]; existing != nil {
if existing.Closed() {
delete(m.sessions, key)
m.stats.ClosedEvicted++
} else {
m.stats.Reuses++
m.mu.Unlock()
_ = pump.Close()
return existing, nil
}
}
// Covers a zero-value manager that was not built by the constructor.
if m.sessions == nil {
m.sessions = map[string]*FabricSessionPump{}
}
m.sessions[key] = pump
m.stats.Opens++
m.mu.Unlock()
return pump, nil
}
// ClosePeer removes the pump cached for target and closes it.
//
// A nil manager is a no-op. An incomplete target returns the key error without
// touching the counters. When nothing is cached for the target, the call is
// still counted and nil is returned.
func (m *FabricSessionPeerManager) ClosePeer(target FabricSessionPeerTarget) error {
	if m == nil {
		return nil
	}
	cacheKey, keyErr := fabricSessionPeerKey(target)
	if keyErr != nil {
		return keyErr
	}

	m.mu.Lock()
	m.stats.ClosePeerCalls++
	cached := m.sessions[cacheKey]
	delete(m.sessions, cacheKey)
	m.mu.Unlock()

	// Close outside the lock so a slow teardown does not block other callers.
	if cached != nil {
		return cached.Close()
	}
	return nil
}
// Close empties the cache and closes every pump that was in it. It returns
// the first close error encountered; map iteration order decides which one
// that is. A nil manager is a no-op.
func (m *FabricSessionPeerManager) Close() error {
	if m == nil {
		return nil
	}

	// Detach the whole cache under the lock, then close outside of it.
	m.mu.Lock()
	m.stats.CloseAllCalls++
	detached := m.sessions
	m.sessions = map[string]*FabricSessionPump{}
	m.mu.Unlock()

	var result error
	for _, pump := range detached {
		closeErr := pump.Close()
		if result == nil {
			result = closeErr
		}
	}
	return result
}
// Snapshot returns the manager's counters together with how many cached pumps
// are currently open versus nil or closed. A nil manager yields a snapshot
// that only carries the schema version.
func (m *FabricSessionPeerManager) Snapshot() FabricSessionPeerManagerSnapshot {
	snap := FabricSessionPeerManagerSnapshot{SchemaVersion: "rap.fabric_session_peer_manager.v1"}
	if m == nil {
		return snap
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	snap.Stats = m.stats
	for _, pump := range m.sessions {
		if pump != nil && !pump.Closed() {
			snap.ActiveCount++
		} else {
			snap.ClosedCount++
		}
	}
	return snap
}
// fabricSessionPeerKey builds the cache key for target: the trimmed peer id
// and the trimmed base URL without trailing slashes, joined by a NUL byte.
// It fails when either part is empty.
func fabricSessionPeerKey(target FabricSessionPeerTarget) (string, error) {
	id := strings.TrimSpace(target.PeerID)
	base := strings.TrimRight(strings.TrimSpace(target.BaseURL), "/")
	if len(id) == 0 || len(base) == 0 {
		return "", fmt.Errorf("fabric session peer id and base url are required")
	}
	return strings.Join([]string{id, base}, "\x00"), nil
}
@@ -1,194 +0,0 @@
package mesh
import (
"context"
"net/http/httptest"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// TestFabricSessionPeerManagerReusesPeerPump checks that two Get calls for the
// same target return the same pump, that only one websocket was opened, that
// the snapshot counters agree, and that the shared pump still carries traffic.
func TestFabricSessionPeerManagerReusesPeerPump(t *testing.T) {
// NOTE(review): opened is written from the logger callback and read by the
// test without synchronization; confirm which goroutine invokes
// FabricSessionLogger, as this may be flagged under -race.
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
manager := NewFabricSessionPeerManager()
defer manager.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricSessionPeerTarget{
PeerID: "node-a",
BaseURL: server.URL,
Options: FabricSessionDialOptions{
Token: "rap_fsn_manager",
Timeout: time.Second,
},
Pump: FabricSessionPumpOptions{
OutboundBuffer: 4,
InboundBuffer: 4,
},
}
first, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("first get: %v", err)
}
second, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("second get: %v", err)
}
if first != second {
t.Fatal("manager did not reuse peer pump")
}
if opened != 1 {
t.Fatalf("opened sessions = %d, want 1", opened)
}
snapshot := manager.Snapshot()
if snapshot.SchemaVersion != "rap.fabric_session_peer_manager.v1" ||
snapshot.ActiveCount != 1 ||
snapshot.ClosedCount != 0 ||
snapshot.Stats.Opens != 1 ||
snapshot.Stats.Reuses != 1 {
t.Fatalf("snapshot = %+v", snapshot)
}
// The reused pump must still be usable for a ping/pong exchange.
if err := first.Send(ctx, fabricproto.Frame{
Type: fabricproto.FramePing,
Sequence: 1,
Payload: []byte("manager"),
}); err != nil {
t.Fatalf("send ping: %v", err)
}
select {
case frame := <-first.Frames():
if frame.Type != fabricproto.FramePong || frame.Sequence != 1 || string(frame.Payload) != "manager" {
t.Fatalf("frame = %+v", frame)
}
case err := <-first.Errors():
t.Fatalf("pump error: %v", err)
case <-ctx.Done():
t.Fatal(ctx.Err())
}
}
// TestFabricSessionPeerManagerClosePeerReopens checks that after ClosePeer the
// next Get opens a fresh session instead of returning the closed pump.
func TestFabricSessionPeerManagerClosePeerReopens(t *testing.T) {
// NOTE(review): opened is updated from the logger callback without
// synchronization; confirm it cannot race with the reads below.
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
manager := NewFabricSessionPeerManager()
defer manager.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricSessionPeerTarget{
PeerID: "node-a",
BaseURL: server.URL,
Options: FabricSessionDialOptions{
Token: "rap_fsn_manager_reopen",
Timeout: time.Second,
},
}
first, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("first get: %v", err)
}
if err := manager.ClosePeer(target); err != nil {
t.Fatalf("close peer: %v", err)
}
second, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("second get: %v", err)
}
if first == second {
t.Fatal("manager reused pump after close peer")
}
if opened != 2 {
t.Fatalf("opened sessions = %d, want 2", opened)
}
if snapshot := manager.Snapshot(); snapshot.Stats.ClosePeerCalls != 1 || snapshot.Stats.Opens != 2 {
t.Fatalf("snapshot = %+v", snapshot)
}
}
// TestFabricSessionPeerManagerReopensClosedPump closes a cached pump directly
// (bypassing the manager) and checks that the next Get evicts it and opens a
// new session.
func TestFabricSessionPeerManagerReopensClosedPump(t *testing.T) {
// NOTE(review): opened is updated from the logger callback without
// synchronization; confirm it cannot race with the reads below.
var opened int
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
FabricSessionEnabled: true,
FabricSessionWebSocketEnabled: true,
FabricSessionLogger: func(entry FabricSessionEventLogEntry) {
if entry.Event == "fabric_session_websocket_opened" {
opened++
}
},
}.Handler())
defer server.Close()
manager := NewFabricSessionPeerManager()
defer manager.Close()
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
target := FabricSessionPeerTarget{
PeerID: "node-a",
BaseURL: server.URL,
Options: FabricSessionDialOptions{
Token: "rap_fsn_manager_closed",
Timeout: time.Second,
},
}
first, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("first get: %v", err)
}
if err := first.Close(); err != nil {
t.Fatalf("close first pump: %v", err)
}
if !first.Closed() {
t.Fatal("first pump should report closed")
}
second, err := manager.Get(ctx, target)
if err != nil {
t.Fatalf("second get: %v", err)
}
if first == second {
t.Fatal("manager reused closed pump")
}
if opened != 2 {
t.Fatalf("opened sessions = %d, want 2", opened)
}
snapshot := manager.Snapshot()
if snapshot.ActiveCount != 1 ||
snapshot.Stats.Opens != 2 ||
snapshot.Stats.ClosedEvicted != 1 {
t.Fatalf("snapshot = %+v", snapshot)
}
}
// TestFabricSessionPeerManagerRejectsIncompleteTarget checks that Get fails
// for a target that has a peer id but no base URL.
func TestFabricSessionPeerManagerRejectsIncompleteTarget(t *testing.T) {
manager := NewFabricSessionPeerManager()
_, err := manager.Get(context.Background(), FabricSessionPeerTarget{PeerID: "node-a"})
if err == nil {
t.Fatal("incomplete target unexpectedly succeeded")
}
}
@@ -308,7 +308,7 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
Transport: intent.Transport,
PeerCertSHA256: intent.BestPeerCertSHA256,
}}
if intent.DirectCandidate {
if intent.DirectCandidate || peerConnectionShouldProbeDirectUpgrade(intent, cacheEntry) {
targets = peerConnectionProbeTargets(intent, cacheEntry)
}
var lastFailure string
@@ -354,7 +354,9 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
result.SelectedCandidateID = probePeer.BestCandidateID
result.SelectedEndpoint = probePeer.Endpoint
result.LatencyMs = latency
if intent.RelayCandidate {
if probeTargetUsesDirectQUIC(probeTarget) {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
} else if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
@@ -410,6 +412,10 @@ func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer P
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
seen := map[string]struct{}{}
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
fallbackPeerCertSHA256 := firstNonEmpty(
strings.TrimSpace(cacheEntry.BestPeerCertSHA256),
strings.TrimSpace(intent.BestPeerCertSHA256),
)
add := func(candidateID, endpoint, transport, peerCertSHA256 string) {
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
if endpoint == "" {
@@ -423,6 +429,9 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
return
}
seen[key] = struct{}{}
if strings.TrimSpace(peerCertSHA256) == "" {
peerCertSHA256 = fallbackPeerCertSHA256
}
out = append(out, peerConnectionProbeTarget{
CandidateID: strings.TrimSpace(candidateID),
Endpoint: endpoint,
@@ -440,6 +449,31 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
return out
}
// peerConnectionShouldProbeDirectUpgrade reports whether the probe for intent
// should try the peer's cached endpoint candidates.
//
// Direct candidates always qualify. Otherwise the intent must be relay-backed
// (relay-ready state, relay candidate, or relay-control transport mode) and the
// cache entry must hold at least one candidate accepted by
// candidateUsableForDirectProbe.
func peerConnectionShouldProbeDirectUpgrade(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) bool {
	if intent.DirectCandidate {
		return true
	}
	relayBacked := strings.TrimSpace(intent.ConnectionState) == PeerConnectionRelayReady ||
		intent.RelayCandidate ||
		strings.TrimSpace(intent.TransportMode) == PeerTransportModeRelayControl
	if !relayBacked {
		return false
	}
	for i := range cacheEntry.EndpointCandidates {
		if candidateUsableForDirectProbe(cacheEntry.EndpointCandidates[i]) {
			return true
		}
	}
	return false
}
// probeTargetUsesDirectQUIC reports whether target is a QUIC target whose
// transport name does not mention relay, reverse, or outbound.
func probeTargetUsesDirectQUIC(target peerConnectionProbeTarget) bool {
	kind := strings.ToLower(strings.TrimSpace(target.Transport))
	for _, marker := range [...]string{"relay", "reverse", "outbound"} {
		if strings.Contains(kind, marker) {
			return false
		}
	}
	return peerConnectionTargetIsQUIC(target.Transport, target.Endpoint)
}
// peerConnectionTargetIsQUIC reports whether the transport is accepted by
// isQUICOnlyCandidateTransport or the endpoint uses the quic:// scheme
// (case-insensitive, surrounding whitespace ignored).
func peerConnectionTargetIsQUIC(transport string, endpoint string) bool {
	if isQUICOnlyCandidateTransport(transport) {
		return true
	}
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	return strings.HasPrefix(normalized, "quic://")
}
@@ -221,6 +221,125 @@ func TestPeerConnectionProbeTargetKeepsPeerForLocalRelayReverseQUIC(t *testing.T
}
}
// TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256 checks that a
// probe target built from an endpoint candidate without its own certificate
// pin inherits the cache entry's BestPeerCertSHA256, which takes precedence
// over the intent's value.
func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
	const endpoint = "quic://94.141.118.222:19199"
	intent := PeerConnectionIntent{
		NodeID:             "node-b",
		BestPeerCertSHA256: "intent-cert",
	}
	cacheEntry := PeerCacheEntry{
		NodeID:             "node-b",
		BestPeerCertSHA256: "cache-cert",
		BestCandidateID:    "node-b-best",
		BestTransport:      "direct_quic",
		Endpoint:           endpoint,
		EndpointCandidates: []PeerEndpointCandidate{
			{
				EndpointID:       "node-b-public",
				NodeID:           "node-b",
				Transport:        "direct_quic",
				Address:          endpoint,
				Reachability:     "public",
				ConnectivityMode: "direct",
				Priority:         1,
			},
		},
	}
	targets := peerConnectionProbeTargets(intent, cacheEntry)
	if len(targets) != 1 {
		t.Fatalf("target count = %d, want 1", len(targets))
	}
	// Assert on the single target directly. Skipping non-matching endpoints
	// would let the test pass without ever checking the certificate fallback.
	target := targets[0]
	if target.Endpoint != endpoint {
		t.Fatalf("endpoint = %q, want %q", target.Endpoint, endpoint)
	}
	if target.PeerCertSHA256 != "cache-cert" {
		t.Fatalf("peer cert = %q, want cache-cert", target.PeerCertSHA256)
	}
}
// TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC runs one
// probe cycle for a peer that has both a relay rendezvous lease and a direct
// QUIC endpoint candidate, and expects the direct candidate to be selected and
// the tracker to report the peer as ready rather than relay-ready.
func TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
current := now
tlsConfig := testQUICTLSConfig(t)
// Real local QUIC fabric server on an ephemeral port; it is the direct target.
server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0",
TLSConfig: tlsConfig,
})
if err != nil {
t.Fatalf("start quic fabric server: %v", err)
}
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
certSHA256 := testQUICCertSHA256(t, tlsConfig)
// Relay lease for node-b via node-r; the relay endpoint points at port 1,
// so a successful probe cannot have come from the relay path.
leases := []PeerRendezvousLease{{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "quic://127.0.0.1:1",
Transport: "relay_quic",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
}}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://" + server.Addr().String(),
Reachability: "public",
ConnectivityMode: "direct",
Priority: 1,
Metadata: peerConnectionProbeMetadata(t, certSHA256),
},
},
},
RendezvousLeases: leases,
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
RendezvousLeases: leases,
QUICTransport: NewQUICFabricTransport(nil),
ProbeTimeout: time.Second,
// Fake clock: each call advances 10ms from the fixed start time.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 || cycle.Succeeded != 1 || len(cycle.Results) != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.SelectedCandidateID != "node-b-direct" || result.SelectedEndpoint != "quic://"+server.Addr().String() {
t.Fatalf("relay-ready peer did not upgrade to direct candidate: %+v", result)
}
if result.ConnectionState.State != PeerConnectionReady {
t.Fatalf("connection state = %q, want ready", result.ConnectionState.State)
}
if len(result.CandidateResults) == 0 || result.CandidateResults[0].Transport != "direct_quic" || result.CandidateResults[0].LinkStatus != PeerConnectionProbeReachable {
t.Fatalf("candidate trail missing direct probe success: %+v", result.CandidateResults)
}
// The tracker must count the peer as ready, not relay-ready.
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || snapshot.RelayReady != 0 {
t.Fatalf("unexpected tracker snapshot after direct upgrade: %+v", snapshot)
}
}
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
current := now
@@ -102,8 +102,11 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
continue
}
switch connection.State {
case PeerConnectionReady, PeerConnectionRelayReady:
case PeerConnectionReady:
ready++
case PeerConnectionRelayReady:
// Relay-ready peers remain valuable for control-plane reachability,
// but they do not satisfy the target for direct-ready transport paths.
case PeerConnectionDegraded:
degraded++
case PeerConnectionBackoff:
@@ -69,7 +69,7 @@ func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) {
}
}
func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
func TestPeerRecoveryPlanTreatsRelayReadyPeersAsRecoveryGap(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
@@ -92,12 +92,15 @@ func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
Now: now,
})
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
t.Fatalf("unexpected steady plan: %+v", plan)
if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy {
t.Fatalf("unexpected relay-ready recovery plan: %+v", plan)
}
if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") {
t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates)
}
if plan.ReadyPeerCount != 0 || plan.Deficit != 1 {
t.Fatalf("relay-ready peer should not satisfy direct-ready target: %+v", plan)
}
}
func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
@@ -0,0 +1,713 @@
package mesh
import (
"bytes"
"context"
"crypto/ed25519"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"sort"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// Fabric registry string constants: the gossip record schema identifier,
// record scopes, service names, and issuer authority roles.
const (
// FabricRegistryGossipRecordSchema is the only schema_version accepted by
// validateFabricRegistryGossipRecord.
FabricRegistryGossipRecordSchema = "rap.fabric.registry.gossip_record.v1"
// Record scopes; see fabricRegistryScopeResolutionOrder for the lookup order.
FabricRegistryScopeFarm = "farm"
FabricRegistryScopeCluster = "cluster"
FabricRegistryScopeOrganization = "organization"
// Service names.
FabricRegistryServiceControlAPI = "control-api"
FabricRegistryServiceUpdateStore = "update-store"
FabricRegistryServiceUpdateCache = "update-cache"
FabricRegistryServiceWebAdmin = "web-admin"
// NOTE(review): the identifier says "ExitPool" but the wire value is "vpn-egress-pool".
FabricRegistryServiceVPNExitPool = "vpn-egress-pool"
// Issuer authority roles.
FabricRegistryAuthorityControl = "control-authority"
FabricRegistryAuthorityUpdate = "update-authority"
FabricRegistryAuthorityStorage = "storage-authority"
FabricRegistryAuthorityRoute = "route-authority"
)
// FabricRegistryEndpoint is one endpoint advertised inside a
// FabricRegistryGossipRecord.
type FabricRegistryEndpoint struct {
EndpointID string `json:"endpoint_id"`
Address string `json:"address"`
Transport string `json:"transport"`
Reachability string `json:"reachability,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
Region string `json:"region,omitempty"`
// Lower Priority sorts first; among equal priorities selectFabricRegistryEndpoints
// prefers the higher Weight.
Priority int `json:"priority,omitempty"`
Weight int `json:"weight,omitempty"`
// PeerCertSHA256 is passed to the transport as the target's certificate pin
// by probeFabricRegistryEndpoint.
PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
// Metadata must be valid JSON when present (checked during validation).
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// FabricRegistrySignature is one issuer signature attached to a gossip record.
// Value is the hex-encoded signature over the canonical record payload; only
// Alg "ed25519" is considered during verification.
type FabricRegistrySignature struct {
KeyID string `json:"key_id"`
IssuerID string `json:"issuer_id"`
Role string `json:"role"`
Alg string `json:"alg"`
Value string `json:"value"`
}
// FabricRegistryGossipRecord is a signed advertisement of the endpoints for
// one service within a cluster, scope, and optional organization. Signatures
// are excluded from the canonical payload that is signed and hashed.
type FabricRegistryGossipRecord struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
Service string `json:"service"`
Scope string `json:"scope"`
OrganizationID string `json:"organization_id,omitempty"`
// Epoch, then IssuedAt, then Generation decide which of two records is newer
// (see fabricRegistryRecordNewer).
Epoch int64 `json:"epoch"`
Generation string `json:"generation,omitempty"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
IssuerNodeID string `json:"issuer_node_id"`
IssuerRole string `json:"issuer_role"`
Endpoints []FabricRegistryEndpoint `json:"endpoints"`
Metadata json.RawMessage `json:"metadata,omitempty"`
Signatures []FabricRegistrySignature `json:"signatures,omitempty"`
}
// FabricRegistryTrustedIssuer describes an issuer whose signatures may be
// accepted. A non-empty Role must equal the record's issuer role; non-empty
// Scopes and Services restrict which records the issuer may sign.
type FabricRegistryTrustedIssuer struct {
IssuerID string
Role string
PublicKey ed25519.PublicKey
Scopes []string
Services []string
}
// FabricRegistryVerificationPolicy configures gossip record verification.
type FabricRegistryVerificationPolicy struct {
// LocalClusterID, when non-empty, must equal the record's cluster ID.
LocalClusterID string
TrustedIssuers []FabricRegistryTrustedIssuer
// RequiredSignatures values <= 0 are treated as 1.
RequiredSignatures int
// MaxClockSkew values <= 0 are treated as one minute.
MaxClockSkew time.Duration
// Now is the verification time; the zero value means the current time.
Now time.Time
}
// FabricRegistryVerificationResult summarizes a verification: how many issuers
// were accepted, their sorted IDs, and the hex SHA-256 of the canonical payload.
type FabricRegistryVerificationResult struct {
AcceptedSignatureCount int `json:"accepted_signature_count"`
AcceptedIssuers []string `json:"accepted_issuers,omitempty"`
RecordHash string `json:"record_hash"`
}
// FabricRegistryEntryState is the lifecycle state of a registry entry.
type FabricRegistryEntryState string
// Entry states. The code in this file assigns only candidate and active;
// expired and rejected are checked by MarkLiveVerified but never assigned here.
const (
FabricRegistryCandidate FabricRegistryEntryState = "candidate"
FabricRegistryActive FabricRegistryEntryState = "active"
FabricRegistryExpired FabricRegistryEntryState = "expired"
FabricRegistryRejected FabricRegistryEntryState = "rejected"
)
// FabricRegistryEntry is a verified gossip record stored in the registry
// together with its state and verification outcome. PromotedAt is set when the
// entry becomes active.
type FabricRegistryEntry struct {
Record FabricRegistryGossipRecord `json:"record"`
State FabricRegistryEntryState `json:"state"`
AcceptedAt time.Time `json:"accepted_at"`
PromotedAt *time.Time `json:"promoted_at,omitempty"`
VerifyResult FabricRegistryVerificationResult `json:"verify_result"`
}
// FabricRegistryBootstrapReport counts the outcome of loading bootstrap
// records. Rejects holds the error text of each rejected record; RecordKeys
// holds the registry key of each record that changed the registry.
type FabricRegistryBootstrapReport struct {
Total int `json:"total"`
Active int `json:"active"`
Candidate int `json:"candidate"`
Rejected int `json:"rejected"`
Rejects []string `json:"rejects,omitempty"`
RecordKeys []string `json:"record_keys,omitempty"`
}
// FabricRegistryResolveRequest is the input to FabricRegistry.ResolveService.
// OrganizationID is only used for organization-scoped lookups; PreferredRegion
// moves endpoints in that region to the front; a zero Now means the current time.
type FabricRegistryResolveRequest struct {
ClusterID string
Service string
Scope string
OrganizationID string
PreferredRegion string
Now time.Time
}
// FabricRegistryResolvedService is the result of ResolveService. When Found is
// false, Reason carries one of "service_required", "no_usable_endpoints", or
// "no_active_record".
type FabricRegistryResolvedService struct {
Found bool `json:"found"`
Service string `json:"service"`
Scope string `json:"scope,omitempty"`
OrganizationID string `json:"organization_id,omitempty"`
RecordEpoch int64 `json:"record_epoch,omitempty"`
RecordHash string `json:"record_hash,omitempty"`
Endpoints []FabricRegistryEndpoint `json:"endpoints,omitempty"`
Reason string `json:"reason,omitempty"`
}
// FabricRegistryLiveProbeRequest configures FabricRegistry.VerifyCandidates.
type FabricRegistryLiveProbeRequest struct {
// ClusterID, when non-empty, limits probing to candidates of that cluster.
ClusterID string
PreferredRegion string
// Timeout values <= 0 are treated as two seconds per endpoint probe.
Timeout time.Duration
Now time.Time
// MaxCandidates values <= 0 are treated as 16.
MaxCandidates int
}
// FabricRegistryLiveProbeResult reports the probe outcome for one candidate
// record. Status is "reachable" or "unreachable"; EndpointID and Address name
// the last endpoint that was probed.
type FabricRegistryLiveProbeResult struct {
Service string `json:"service"`
Scope string `json:"scope"`
OrganizationID string `json:"organization_id,omitempty"`
EndpointID string `json:"endpoint_id,omitempty"`
Address string `json:"address,omitempty"`
Status string `json:"status"`
LatencyMs int64 `json:"latency_ms,omitempty"`
Promoted bool `json:"promoted"`
Error string `json:"error,omitempty"`
}
// FabricRegistrySnapshot counts the unexpired active and candidate entries and
// lists their registry keys in sorted order.
type FabricRegistrySnapshot struct {
Active int `json:"active"`
Candidate int `json:"candidate"`
ActiveKeys []string `json:"active_keys,omitempty"`
CandidateKeys []string `json:"candidate_keys,omitempty"`
}
// FabricRegistry stores verified gossip records keyed by
// cluster/service/scope/organization.
// NOTE(review): no locking is visible in this type; confirm callers serialize access.
type FabricRegistry struct {
// entries holds promoted (active) records.
entries map[string]FabricRegistryEntry
// candidates holds verified records awaiting live verification.
candidates map[string]FabricRegistryEntry
}
// NewFabricRegistry returns an empty registry with both internal maps allocated.
func NewFabricRegistry() *FabricRegistry {
	return &FabricRegistry{
		entries:    make(map[string]FabricRegistryEntry),
		candidates: make(map[string]FabricRegistryEntry),
	}
}
// LoadFabricRegistryBootstrapRecords decodes recordsJSON as a JSON array of
// gossip records and applies each one to a fresh registry.
//
// Blank input yields an empty registry and a zero report. A decode failure
// returns a nil registry and an error. Records that fail verification are
// counted in the report's Rejected/Rejects fields and do not abort loading;
// records that leave the registry unchanged are skipped without being counted.
func LoadFabricRegistryBootstrapRecords(recordsJSON string, policy FabricRegistryVerificationPolicy, liveVerified bool) (*FabricRegistry, FabricRegistryBootstrapReport, error) {
	var report FabricRegistryBootstrapReport
	registry := NewFabricRegistry()
	raw := strings.TrimSpace(recordsJSON)
	if len(raw) == 0 {
		return registry, report, nil
	}
	var decoded []FabricRegistryGossipRecord
	if err := json.Unmarshal([]byte(raw), &decoded); err != nil {
		return nil, FabricRegistryBootstrapReport{}, fmt.Errorf("decode fabric registry bootstrap records: %w", err)
	}
	report.Total = len(decoded)
	for i := range decoded {
		applied, changed, applyErr := registry.ApplyGossipRecord(decoded[i], policy, liveVerified)
		if applyErr != nil {
			report.Rejected++
			report.Rejects = append(report.Rejects, applyErr.Error())
			continue
		}
		if !changed {
			continue
		}
		report.RecordKeys = append(report.RecordKeys, fabricRegistryRecordKey(decoded[i]))
		if applied.State == FabricRegistryActive {
			report.Active++
		} else if applied.State == FabricRegistryCandidate {
			report.Candidate++
		}
	}
	return registry, report, nil
}
// ApplyGossipRecord verifies record against policy and stores it.
//
// With liveVerified set, the record becomes the active entry for its key;
// otherwise it is parked as a candidate until MarkLiveVerified promotes it.
// A record that is not newer than the current active entry, or (for
// candidates) not newer than the candidate already pending for the same key,
// is ignored: the retained entry is returned with changed == false. This
// keeps out-of-order or duplicated gossip from rolling a pending candidate
// back to an older epoch.
//
// Activating a record drops the pending candidate for the key unless that
// candidate is unexpired and newer than the activated record.
//
// Returns the stored or retained entry, whether the registry changed, and
// the verification error if the record was rejected.
func (r *FabricRegistry) ApplyGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy, liveVerified bool) (FabricRegistryEntry, bool, error) {
	if r == nil {
		return FabricRegistryEntry{}, false, fmt.Errorf("fabric registry is nil")
	}
	result, err := VerifyFabricRegistryGossipRecord(record, policy)
	if err != nil {
		return FabricRegistryEntry{}, false, err
	}
	now := registryNow(policy.Now)
	key := fabricRegistryRecordKey(record)
	if current, exists := r.entries[key]; exists && !fabricRegistryRecordNewer(record, current.Record, now) {
		return current, false, nil
	}
	pending, hasPending := r.candidates[key]
	entry := FabricRegistryEntry{
		Record:       normalizeFabricRegistryRecord(record),
		State:        FabricRegistryCandidate,
		AcceptedAt:   now,
		VerifyResult: result,
	}
	if liveVerified {
		promotedAt := now
		entry.State = FabricRegistryActive
		entry.PromotedAt = &promotedAt
		// A zero-value registry has nil maps; writing to a nil map panics.
		if r.entries == nil {
			r.entries = map[string]FabricRegistryEntry{}
		}
		r.entries[key] = entry
		// Keep a pending candidate only when it is still valid and strictly
		// newer than the record that was just activated.
		keepPending := hasPending &&
			pending.Record.ExpiresAt.After(now) &&
			fabricRegistryRecordNewer(pending.Record, entry.Record, now)
		if !keepPending {
			delete(r.candidates, key)
		}
		return entry, true, nil
	}
	// Do not let a stale or duplicate record replace a newer pending candidate.
	// An expired pending candidate is always considered older.
	if hasPending && !fabricRegistryRecordNewer(record, pending.Record, now) {
		return pending, false, nil
	}
	if r.candidates == nil {
		r.candidates = map[string]FabricRegistryEntry{}
	}
	r.candidates[key] = entry
	return entry, true, nil
}
// MarkLiveVerified promotes the pending candidate for the given key to the
// active entry and removes it from the candidate set.
//
// It returns false when the registry is nil, no candidate exists for the key,
// the candidate is marked expired or rejected, or the candidate's record has
// already expired at now (a zero now means the current time). Refusing an
// expired candidate prevents it from displacing a still-valid active entry.
func (r *FabricRegistry) MarkLiveVerified(clusterID, service, scope, organizationID string, now time.Time) bool {
	if r == nil {
		return false
	}
	key := fabricRegistryKey(clusterID, service, scope, organizationID)
	entry, ok := r.candidates[key]
	if !ok || entry.State == FabricRegistryExpired || entry.State == FabricRegistryRejected {
		return false
	}
	promotedAt := registryNow(now)
	if !entry.Record.ExpiresAt.After(promotedAt) {
		return false
	}
	// ApplyGossipRecord lazily allocates candidates on a zero-value registry,
	// so entries may still be nil here; writing to a nil map would panic.
	if r.entries == nil {
		r.entries = map[string]FabricRegistryEntry{}
	}
	entry.State = FabricRegistryActive
	entry.PromotedAt = &promotedAt
	r.entries[key] = entry
	delete(r.candidates, key)
	return true
}
// Active returns the active record stored under the given key, provided it has
// not expired at now (a zero now means the current time). The boolean is false
// when the registry is nil or no such unexpired active record exists.
func (r *FabricRegistry) Active(clusterID, service, scope, organizationID string, now time.Time) (FabricRegistryGossipRecord, bool) {
	var none FabricRegistryGossipRecord
	if r == nil {
		return none, false
	}
	key := fabricRegistryKey(clusterID, service, scope, organizationID)
	entry, found := r.entries[key]
	if !found {
		return none, false
	}
	if entry.State != FabricRegistryActive {
		return none, false
	}
	if !entry.Record.ExpiresAt.After(registryNow(now)) {
		return none, false
	}
	return entry.Record, true
}
// ResolveService finds the active record for req.Service, trying scopes in the
// order given by fabricRegistryScopeResolutionOrder, and returns its usable
// endpoints ordered by selectFabricRegistryEndpoints.
//
// The first scope with an active record decides the outcome: if that record
// has no usable endpoints the result is Found=false with reason
// "no_usable_endpoints" and broader scopes are not consulted.
func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) FabricRegistryResolvedService {
service := strings.ToLower(strings.TrimSpace(req.Service))
if service == "" {
return FabricRegistryResolvedService{Found: false, Reason: "service_required"}
}
scopeOrder := fabricRegistryScopeResolutionOrder(req.Scope, req.OrganizationID)
for _, scope := range scopeOrder {
// The organization ID is part of the key only for organization-scoped lookups.
organizationID := strings.TrimSpace(req.OrganizationID)
if scope != FabricRegistryScopeOrganization {
organizationID = ""
}
record, ok := r.Active(req.ClusterID, service, scope, organizationID, req.Now)
if !ok {
continue
}
endpoints := selectFabricRegistryEndpoints(record.Endpoints, req.PreferredRegion)
if len(endpoints) == 0 {
return FabricRegistryResolvedService{Found: false, Service: service, Scope: scope, OrganizationID: organizationID, Reason: "no_usable_endpoints"}
}
// The marshal error is ignored here; on failure the hash covers an empty payload.
result, _ := canonicalFabricRegistryPayload(record)
sum := sha256.Sum256(result)
return FabricRegistryResolvedService{
Found: true,
Service: service,
Scope: scope,
OrganizationID: organizationID,
RecordEpoch: record.Epoch,
RecordHash: hex.EncodeToString(sum[:]),
Endpoints: endpoints,
}
}
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
}
// Snapshot reports the unexpired active and candidate entries at now (a zero
// now means the current time), with their keys sorted. A nil registry yields
// the zero snapshot.
func (r *FabricRegistry) Snapshot(now time.Time) FabricRegistrySnapshot {
	var snap FabricRegistrySnapshot
	if r == nil {
		return snap
	}
	at := registryNow(now)
	for key, entry := range r.entries {
		if entry.State != FabricRegistryActive || !entry.Record.ExpiresAt.After(at) {
			continue
		}
		snap.ActiveKeys = append(snap.ActiveKeys, key)
	}
	for key, entry := range r.candidates {
		if entry.State != FabricRegistryCandidate || !entry.Record.ExpiresAt.After(at) {
			continue
		}
		snap.CandidateKeys = append(snap.CandidateKeys, key)
	}
	snap.Active = len(snap.ActiveKeys)
	snap.Candidate = len(snap.CandidateKeys)
	sort.Strings(snap.ActiveKeys)
	sort.Strings(snap.CandidateKeys)
	return snap
}
// VerifyCandidates live-probes pending candidate records over transport and
// promotes each candidate whose endpoint answers the probe.
//
// Only unexpired candidates are considered, optionally limited to
// req.ClusterID, and at most req.MaxCandidates (default 16) are probed. For
// each candidate the usable endpoints are tried in selection order until one
// responds. One result is returned per probed candidate; a nil registry
// returns nil.
func (r *FabricRegistry) VerifyCandidates(ctx context.Context, transport FabricTransport, req FabricRegistryLiveProbeRequest) []FabricRegistryLiveProbeResult {
if r == nil {
return nil
}
now := registryNow(req.Now)
timeout := req.Timeout
if timeout <= 0 {
timeout = 2 * time.Second
}
maxCandidates := req.MaxCandidates
if maxCandidates <= 0 {
maxCandidates = 16
}
candidates := make([]FabricRegistryEntry, 0, len(r.candidates))
for _, entry := range r.candidates {
if entry.State != FabricRegistryCandidate || !entry.Record.ExpiresAt.After(now) {
continue
}
if clusterID := strings.TrimSpace(req.ClusterID); clusterID != "" && entry.Record.ClusterID != clusterID {
continue
}
candidates = append(candidates, entry)
}
// Order by service, then scope, then newest epoch first. Map iteration order
// is random, so candidates that tie on all three keep an arbitrary order.
sort.SliceStable(candidates, func(i, j int) bool {
if candidates[i].Record.Service != candidates[j].Record.Service {
return candidates[i].Record.Service < candidates[j].Record.Service
}
if candidates[i].Record.Scope != candidates[j].Record.Scope {
return candidates[i].Record.Scope < candidates[j].Record.Scope
}
return candidates[i].Record.Epoch > candidates[j].Record.Epoch
})
if len(candidates) > maxCandidates {
candidates = candidates[:maxCandidates]
}
results := make([]FabricRegistryLiveProbeResult, 0, len(candidates))
for _, entry := range candidates {
record := entry.Record
result := FabricRegistryLiveProbeResult{
Service: record.Service,
Scope: record.Scope,
OrganizationID: record.OrganizationID,
Status: "unreachable",
}
endpoints := selectFabricRegistryEndpoints(record.Endpoints, req.PreferredRegion)
if len(endpoints) == 0 {
result.Error = "no_usable_endpoints"
results = append(results, result)
continue
}
for _, endpoint := range endpoints {
// Each endpoint gets its own timeout; cancel right away instead of deferring
// so contexts do not pile up across loop iterations.
probeCtx, cancel := context.WithTimeout(ctx, timeout)
latency, err := probeFabricRegistryEndpoint(probeCtx, transport, endpoint, timeout)
cancel()
result.EndpointID = endpoint.EndpointID
result.Address = endpoint.Address
if err != nil {
// Remember the failure and try the next endpoint.
result.Error = err.Error()
continue
}
result.Status = "reachable"
result.LatencyMs = latency.Milliseconds()
result.Promoted = r.MarkLiveVerified(record.ClusterID, record.Service, record.Scope, record.OrganizationID, now)
result.Error = ""
break
}
results = append(results, result)
}
return results
}
// SignFabricRegistryGossipRecord signs the canonical payload of record with
// privateKey and returns a copy of the record with the new ed25519 signature
// appended.
//
// The signature's issuer ID and role come from issuer, falling back to the
// record's IssuerNodeID and IssuerRole when issuer leaves them empty. The
// returned record gets a fresh Signatures slice, so the caller's slice is
// never written to, even if it has spare capacity.
//
// An error is returned, with the record unchanged, when privateKey does not
// have ed25519.PrivateKeySize bytes or the payload cannot be encoded.
func SignFabricRegistryGossipRecord(record FabricRegistryGossipRecord, issuer FabricRegistryTrustedIssuer, privateKey ed25519.PrivateKey) (FabricRegistryGossipRecord, error) {
	// ed25519.Sign panics on a wrong-length key; report it as an error instead.
	if len(privateKey) != ed25519.PrivateKeySize {
		return record, fmt.Errorf("fabric registry signing key has invalid length %d", len(privateKey))
	}
	payload, err := canonicalFabricRegistryPayload(record)
	if err != nil {
		return record, err
	}
	sig := ed25519.Sign(privateKey, payload)
	issuerID := firstNonEmpty(issuer.IssuerID, record.IssuerNodeID)
	signatures := make([]FabricRegistrySignature, 0, len(record.Signatures)+1)
	signatures = append(signatures, record.Signatures...)
	signatures = append(signatures, FabricRegistrySignature{
		KeyID:    issuerID,
		IssuerID: issuerID,
		Role:     firstNonEmpty(issuer.Role, record.IssuerRole),
		Alg:      "ed25519",
		Value:    hex.EncodeToString(sig),
	})
	record.Signatures = signatures
	return record, nil
}
// VerifyFabricRegistryGossipRecord validates record against policy and checks
// that enough distinct trusted issuers signed its canonical payload.
//
// A signature counts only when its algorithm is ed25519, its issuer is in
// policy.TrustedIssuers (matched by issuer ID and role, or by issuer ID alone),
// that issuer is allowed to sign this record's role, scope, and service, and
// the signature verifies against the issuer's public key.
//
// Accepted signatures are de-duplicated by the trusted issuer's own ID, not
// by the ID text carried in the signature. Signatures are not part of the
// signed payload, so anyone relaying a record can duplicate a valid signature
// under a cosmetically different issuer ID (for example with trailing
// whitespace); such copies must not count as additional issuers.
//
// On success the result lists the accepted issuers (sorted) and the record
// hash. When too few signatures are accepted, an error is returned together
// with a result that carries only the record hash.
func VerifyFabricRegistryGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy) (FabricRegistryVerificationResult, error) {
	record = normalizeFabricRegistryRecord(record)
	if err := validateFabricRegistryGossipRecord(record, policy); err != nil {
		return FabricRegistryVerificationResult{}, err
	}
	payload, err := canonicalFabricRegistryPayload(record)
	if err != nil {
		return FabricRegistryVerificationResult{}, err
	}
	sum := sha256.Sum256(payload)
	recordHash := hex.EncodeToString(sum[:])

	// Index trusted issuers by "id" and by "id\x00role". Keys are trimmed the
	// same way as the lookups below.
	trusted := make(map[string]FabricRegistryTrustedIssuer, 2*len(policy.TrustedIssuers))
	for _, issuer := range policy.TrustedIssuers {
		issuerID := strings.TrimSpace(issuer.IssuerID)
		if issuerID == "" {
			continue
		}
		trusted[issuerID] = issuer
		if role := strings.TrimSpace(issuer.Role); role != "" {
			trusted[issuerID+"\x00"+role] = issuer
		}
	}

	accepted := map[string]struct{}{}
	for _, signature := range record.Signatures {
		if strings.ToLower(strings.TrimSpace(signature.Alg)) != "ed25519" {
			continue
		}
		signerID := strings.TrimSpace(signature.IssuerID)
		issuer, ok := trusted[signerID+"\x00"+strings.TrimSpace(signature.Role)]
		if !ok {
			issuer, ok = trusted[signerID]
		}
		if !ok || !fabricRegistryIssuerAllowed(issuer, record) {
			continue
		}
		rawSig, decodeErr := hex.DecodeString(strings.TrimSpace(signature.Value))
		if decodeErr != nil || len(rawSig) != ed25519.SignatureSize || len(issuer.PublicKey) != ed25519.PublicKeySize {
			continue
		}
		if ed25519.Verify(issuer.PublicKey, payload, rawSig) {
			// Key by the trusted issuer's canonical ID so one issuer counts
			// once, however its ID is spelled in the signature entry.
			accepted[strings.TrimSpace(issuer.IssuerID)] = struct{}{}
		}
	}

	required := policy.RequiredSignatures
	if required <= 0 {
		required = 1
	}
	if len(accepted) < required {
		return FabricRegistryVerificationResult{RecordHash: recordHash}, fmt.Errorf("fabric registry gossip record lacks required trusted signatures")
	}
	issuers := make([]string, 0, len(accepted))
	for issuerID := range accepted {
		issuers = append(issuers, issuerID)
	}
	sort.Strings(issuers)
	return FabricRegistryVerificationResult{
		AcceptedSignatureCount: len(accepted),
		AcceptedIssuers:        issuers,
		RecordHash:             recordHash,
	}, nil
}
// validateFabricRegistryGossipRecord checks the structural rules a gossip
// record must satisfy before its signatures are examined: schema version,
// cluster match, required identity fields, a positive epoch, a validity window
// that covers the policy time, and at least one well-formed QUIC-only endpoint.
// It returns nil when the record passes all checks.
func validateFabricRegistryGossipRecord(record FabricRegistryGossipRecord, policy FabricRegistryVerificationPolicy) error {
if record.SchemaVersion != FabricRegistryGossipRecordSchema {
return fmt.Errorf("fabric registry gossip record schema_version is invalid")
}
// An empty cluster ID is reported as a cluster mismatch as well.
if strings.TrimSpace(record.ClusterID) == "" || (strings.TrimSpace(policy.LocalClusterID) != "" && record.ClusterID != policy.LocalClusterID) {
return ErrClusterMismatch
}
if strings.TrimSpace(record.Service) == "" || strings.TrimSpace(record.Scope) == "" || strings.TrimSpace(record.IssuerNodeID) == "" || strings.TrimSpace(record.IssuerRole) == "" {
return fmt.Errorf("fabric registry gossip record is missing service, scope, or issuer")
}
if record.Epoch <= 0 || record.IssuedAt.IsZero() || record.ExpiresAt.IsZero() || !record.ExpiresAt.After(record.IssuedAt) {
return fmt.Errorf("fabric registry gossip record has invalid epoch or validity window")
}
now := registryNow(policy.Now)
skew := policy.MaxClockSkew
if skew <= 0 {
skew = time.Minute
}
// Clock skew is tolerated only for records issued slightly in the future;
// expiry is checked against now without any allowance.
if record.IssuedAt.After(now.Add(skew)) || !record.ExpiresAt.After(now) {
return fmt.Errorf("fabric registry gossip record is not currently valid")
}
if len(record.Endpoints) == 0 {
return fmt.Errorf("fabric registry gossip record has no endpoints")
}
for _, endpoint := range record.Endpoints {
if strings.TrimSpace(endpoint.EndpointID) == "" || strings.TrimSpace(endpoint.Address) == "" || strings.TrimSpace(endpoint.Transport) == "" {
return fmt.Errorf("fabric registry gossip record contains invalid endpoint")
}
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
return fmt.Errorf("fabric registry gossip endpoint must be QUIC-only")
}
if len(endpoint.Metadata) > 0 && !json.Valid(endpoint.Metadata) {
return fmt.Errorf("fabric registry gossip endpoint metadata is invalid")
}
}
if len(record.Metadata) > 0 && !json.Valid(record.Metadata) {
return fmt.Errorf("fabric registry gossip metadata is invalid")
}
return nil
}
// canonicalFabricRegistryPayload returns the bytes that are signed and hashed
// for record: the normalized record with its signatures removed, encoded as
// compact JSON.
func canonicalFabricRegistryPayload(record FabricRegistryGossipRecord) ([]byte, error) {
	unsigned := normalizeFabricRegistryRecord(record)
	unsigned.Signatures = nil
	encoded, err := json.Marshal(unsigned)
	if err != nil {
		return nil, err
	}
	compact := new(bytes.Buffer)
	if err = json.Compact(compact, encoded); err != nil {
		return nil, err
	}
	return compact.Bytes(), nil
}
// normalizeFabricRegistryRecord returns a copy of record in canonical form:
// string fields trimmed, service and scope lower-cased, endpoint certificate
// pins passed through normalizeCertSHA256, endpoints ordered by priority then
// endpoint ID, and signatures ordered by issuer ID then key ID.
//
// The endpoint and signature slices are copied before they are trimmed and
// sorted. The record is passed by value, but its slices share backing arrays
// with the caller, so editing them in place would silently rewrite and
// reorder the caller's data. Nil and empty slices are returned as they came
// in, which keeps the JSON encoding (null versus []) of the canonical payload
// stable.
func normalizeFabricRegistryRecord(record FabricRegistryGossipRecord) FabricRegistryGossipRecord {
	record.SchemaVersion = strings.TrimSpace(record.SchemaVersion)
	record.ClusterID = strings.TrimSpace(record.ClusterID)
	record.Service = strings.ToLower(strings.TrimSpace(record.Service))
	record.Scope = strings.ToLower(strings.TrimSpace(record.Scope))
	record.OrganizationID = strings.TrimSpace(record.OrganizationID)
	record.IssuerNodeID = strings.TrimSpace(record.IssuerNodeID)
	record.IssuerRole = strings.TrimSpace(record.IssuerRole)
	record.Generation = strings.TrimSpace(record.Generation)

	if len(record.Endpoints) > 0 {
		endpoints := make([]FabricRegistryEndpoint, len(record.Endpoints))
		copy(endpoints, record.Endpoints)
		record.Endpoints = endpoints
	}
	for i := range record.Endpoints {
		endpoint := &record.Endpoints[i]
		endpoint.EndpointID = strings.TrimSpace(endpoint.EndpointID)
		endpoint.Address = strings.TrimSpace(endpoint.Address)
		endpoint.Transport = strings.TrimSpace(endpoint.Transport)
		endpoint.Reachability = strings.TrimSpace(endpoint.Reachability)
		endpoint.ConnectivityMode = strings.TrimSpace(endpoint.ConnectivityMode)
		endpoint.Region = strings.TrimSpace(endpoint.Region)
		endpoint.PeerCertSHA256 = normalizeCertSHA256(endpoint.PeerCertSHA256)
	}
	sort.SliceStable(record.Endpoints, func(i, j int) bool {
		if record.Endpoints[i].Priority != record.Endpoints[j].Priority {
			return record.Endpoints[i].Priority < record.Endpoints[j].Priority
		}
		return record.Endpoints[i].EndpointID < record.Endpoints[j].EndpointID
	})

	if len(record.Signatures) > 0 {
		signatures := make([]FabricRegistrySignature, len(record.Signatures))
		copy(signatures, record.Signatures)
		record.Signatures = signatures
	}
	sort.SliceStable(record.Signatures, func(i, j int) bool {
		if record.Signatures[i].IssuerID != record.Signatures[j].IssuerID {
			return record.Signatures[i].IssuerID < record.Signatures[j].IssuerID
		}
		return record.Signatures[i].KeyID < record.Signatures[j].KeyID
	})
	return record
}
// fabricRegistryIssuerAllowed reports whether issuer may sign record. Each
// restriction applies only when the issuer sets it: a non-blank role must
// equal the record's issuer role, and non-empty scope and service lists must
// contain the record's scope and service.
func fabricRegistryIssuerAllowed(issuer FabricRegistryTrustedIssuer, record FabricRegistryGossipRecord) bool {
	roleOK := strings.TrimSpace(issuer.Role) == "" || issuer.Role == record.IssuerRole
	if !roleOK {
		return false
	}
	scopeOK := len(issuer.Scopes) == 0 || stringInSlice(record.Scope, issuer.Scopes)
	if !scopeOK {
		return false
	}
	return len(issuer.Services) == 0 || stringInSlice(record.Service, issuer.Services)
}
// fabricRegistryRecordKey returns the registry map key for record, built from
// its cluster ID, service, scope, and organization ID.
func fabricRegistryRecordKey(record FabricRegistryGossipRecord) string {
return fabricRegistryKey(record.ClusterID, record.Service, record.Scope, record.OrganizationID)
}
func fabricRegistryScopeResolutionOrder(scope string, organizationID string) []string {
scope = strings.ToLower(strings.TrimSpace(scope))
switch scope {
case FabricRegistryScopeOrganization:
if strings.TrimSpace(organizationID) != "" {
return []string{FabricRegistryScopeOrganization, FabricRegistryScopeCluster, FabricRegistryScopeFarm}
}
return []string{FabricRegistryScopeCluster, FabricRegistryScopeFarm}
case FabricRegistryScopeFarm:
return []string{FabricRegistryScopeFarm}
case FabricRegistryScopeCluster, "":
return []string{FabricRegistryScopeCluster, FabricRegistryScopeFarm}
default:
return []string{scope, FabricRegistryScopeCluster, FabricRegistryScopeFarm}
}
}
// selectFabricRegistryEndpoints filters endpoints down to those with a
// non-blank address, a QUIC-only transport, and no legacy address scheme, and
// returns them in preference order: endpoints in preferredRegion first (when
// one is given), then ascending priority, descending weight, and endpoint ID.
func selectFabricRegistryEndpoints(endpoints []FabricRegistryEndpoint, preferredRegion string) []FabricRegistryEndpoint {
	region := strings.TrimSpace(preferredRegion)
	usable := make([]FabricRegistryEndpoint, 0, len(endpoints))
	for _, candidate := range endpoints {
		ok := strings.TrimSpace(candidate.Address) != "" &&
			isQUICOnlyCandidateTransport(candidate.Transport) &&
			!hasLegacyEndpointScheme(candidate.Address)
		if ok {
			usable = append(usable, candidate)
		}
	}
	sort.SliceStable(usable, func(i, j int) bool {
		a, b := usable[i], usable[j]
		if region != "" {
			aLocal := strings.EqualFold(a.Region, region)
			if aLocal != strings.EqualFold(b.Region, region) {
				return aLocal
			}
		}
		switch {
		case a.Priority != b.Priority:
			return a.Priority < b.Priority
		case a.Weight != b.Weight:
			return a.Weight > b.Weight
		default:
			return a.EndpointID < b.EndpointID
		}
	})
	return usable
}
// probeFabricRegistryEndpoint connects to endpoint over transport, sends one
// ping frame, and waits for the matching pong. It returns the time elapsed
// from just before the connect until the pong arrived.
//
// An error is returned when transport is nil, the connect or send fails, the
// session's frame or error channel closes, the session reports an error, or
// ctx is done first. A timeout <= 0 is treated as two seconds.
func probeFabricRegistryEndpoint(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, timeout time.Duration) (time.Duration, error) {
if transport == nil {
return 0, fmt.Errorf("fabric registry live probe transport is unavailable")
}
if timeout <= 0 {
timeout = 2 * time.Second
}
target := FabricTransportTarget{
EndpointID: endpoint.EndpointID,
// The endpoint ID doubles as the peer ID for the probe target.
PeerID: endpoint.EndpointID,
Endpoint: endpoint.Address,
Transport: endpoint.Transport,
PeerCertSHA256: endpoint.PeerCertSHA256,
Timeout: timeout,
InboundBuffer: 2,
ErrorBuffer: 2,
}
startedAt := time.Now()
session, err := transport.Connect(ctx, target)
if err != nil {
return 0, err
}
defer session.Close()
// The start timestamp is used as the ping sequence so the pong can be matched.
sequence := uint64(startedAt.UnixNano())
if err := session.Send(ctx, fabricproto.Frame{Type: fabricproto.FramePing, TrafficClass: fabricproto.TrafficClassReliable, Sequence: sequence, Payload: []byte("fabric-registry-live-probe")}); err != nil {
return 0, err
}
for {
select {
case frame, ok := <-session.Frames():
if !ok {
return 0, fmt.Errorf("fabric registry live probe session closed")
}
// Frames other than the matching pong are ignored and the wait continues.
if frame.Type == fabricproto.FramePong && frame.Sequence == sequence {
return time.Since(startedAt), nil
}
case err, ok := <-session.Errors():
if !ok {
return 0, fmt.Errorf("fabric registry live probe error channel closed")
}
if err != nil {
return 0, err
}
case <-ctx.Done():
return 0, ctx.Err()
}
}
}
// fabricRegistryKey builds the registry map key from the four identifying
// fields, joined with NUL bytes. All parts are trimmed; service and scope are
// also lower-cased.
func fabricRegistryKey(clusterID, service, scope, organizationID string) string {
	parts := []string{
		strings.TrimSpace(clusterID),
		strings.ToLower(strings.TrimSpace(service)),
		strings.ToLower(strings.TrimSpace(scope)),
		strings.TrimSpace(organizationID),
	}
	return strings.Join(parts, "\x00")
}
// fabricRegistryRecordNewer reports whether next should replace current. An
// expired current record is always replaced. Otherwise the higher epoch wins,
// then the later IssuedAt, then the lexically greater trimmed Generation.
func fabricRegistryRecordNewer(next, current FabricRegistryGossipRecord, now time.Time) bool {
	switch {
	case !current.ExpiresAt.After(now):
		return true
	case next.Epoch != current.Epoch:
		return next.Epoch > current.Epoch
	case !next.IssuedAt.Equal(current.IssuedAt):
		return next.IssuedAt.After(current.IssuedAt)
	default:
		return strings.TrimSpace(next.Generation) > strings.TrimSpace(current.Generation)
	}
}
// registryNow returns now converted to UTC, or the current UTC time when now
// is the zero value.
func registryNow(now time.Time) time.Time {
	if !now.IsZero() {
		return now.UTC()
	}
	return time.Now().UTC()
}
// stringInSlice reports whether value equals any element of values, comparing
// after trimming surrounding whitespace on both sides. The comparison is
// case-sensitive.
func stringInSlice(value string, values []string) bool {
	want := strings.TrimSpace(value)
	for i := range values {
		if strings.TrimSpace(values[i]) != want {
			continue
		}
		return true
	}
	return false
}
@@ -0,0 +1,280 @@
package mesh
import (
"context"
"crypto/ed25519"
"testing"
"time"
)
// TestFabricRegistryGossipRecordRequiresTrustedSignature signs a record with a
// trusted issuer, checks that it verifies, and then checks that changing an
// endpoint address after signing makes verification fail.
func TestFabricRegistryGossipRecordRequiresTrustedSignature(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
record := testFabricRegistryGossipRecord(now, 10)
issuer := FabricRegistryTrustedIssuer{
IssuerID: "authority-1",
Role: FabricRegistryAuthorityControl,
PublicKey: publicKey,
Scopes: []string{FabricRegistryScopeCluster},
Services: []string{FabricRegistryServiceControlAPI},
}
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
if err != nil {
t.Fatalf("sign record: %v", err)
}
if _, err := VerifyFabricRegistryGossipRecord(signed, FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}); err != nil {
t.Fatalf("verify signed record: %v", err)
}
// tampered shares its Endpoints backing array with signed, so this write
// also changes signed; signed is not used again below.
tampered := signed
tampered.Endpoints[0].Address = "quic://10.10.10.10:19443"
if _, err := VerifyFabricRegistryGossipRecord(tampered, FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}); err == nil {
t.Fatal("tampered record verified")
}
}
// TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord asserts that
// verification fails both for a correctly signed record whose endpoint uses an
// https:// address and for a correctly signed record whose ExpiresAt lies
// before the policy's Now.
func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
	now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
	publicKey, privateKey, err := ed25519.GenerateKey(nil)
	if err != nil {
		t.Fatal(err)
	}
	issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
	// Both sub-cases verify under the same policy (no explicit RequiredSignatures).
	policy := FabricRegistryVerificationPolicy{
		LocalClusterID: "cluster-1",
		TrustedIssuers: []FabricRegistryTrustedIssuer{
			{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey},
		},
		Now: now,
	}

	// Case 1: legacy HTTP(S) endpoint address.
	legacy := testFabricRegistryGossipRecord(now, 10)
	legacy.Endpoints[0].Address = "https://control.example.test/api/v1"
	legacySigned, err := SignFabricRegistryGossipRecord(legacy, issuer, privateKey)
	if err != nil {
		t.Fatalf("sign record: %v", err)
	}
	if _, verifyErr := VerifyFabricRegistryGossipRecord(legacySigned, policy); verifyErr == nil {
		t.Fatal("legacy HTTP endpoint was accepted")
	}

	// Case 2: record that expired one minute before now.
	stale := testFabricRegistryGossipRecord(now.Add(-2*time.Hour), 11)
	stale.ExpiresAt = now.Add(-time.Minute)
	staleSigned, err := SignFabricRegistryGossipRecord(stale, issuer, privateKey)
	if err != nil {
		t.Fatalf("sign expired record: %v", err)
	}
	if _, verifyErr := VerifyFabricRegistryGossipRecord(staleSigned, policy); verifyErr == nil {
		t.Fatal("expired record was accepted")
	}
}
// TestFabricRegistryKeepsActiveRecordUntilNewerVerified walks one registry
// slot (cluster-1 / control API / cluster scope) through three updates:
// an initial active record (epoch 10), a lower-epoch record that must be
// ignored, and a higher-epoch record that must stay a candidate until
// MarkLiveVerified is called.
func TestFabricRegistryKeepsActiveRecordUntilNewerVerified(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
policy := FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}
registry := NewFabricRegistry()
// Step 1: epoch 10 applied with the third argument true is expected to
// land directly in the Active state.
// NOTE(review): the third argument is presumably a "live verified" flag — confirm against ApplyGossipRecord.
active, err := SignFabricRegistryGossipRecord(testFabricRegistryGossipRecord(now, 10), issuer, privateKey)
if err != nil {
t.Fatalf("sign active: %v", err)
}
entry, changed, err := registry.ApplyGossipRecord(active, policy, true)
if err != nil || !changed || entry.State != FabricRegistryActive {
t.Fatalf("apply active entry changed=%t entry=%+v err=%v", changed, entry, err)
}
// Step 2: epoch 9 with a later IssuedAt. The lower epoch must lose even
// though it was issued later, so the entry stays at epoch 10 unchanged.
old := testFabricRegistryGossipRecord(now.Add(time.Minute), 9)
old.Endpoints[0].Address = "quic://192.0.2.9:19443"
oldSigned, err := SignFabricRegistryGossipRecord(old, issuer, privateKey)
if err != nil {
t.Fatalf("sign old: %v", err)
}
entry, changed, err = registry.ApplyGossipRecord(oldSigned, policy, true)
if err != nil {
t.Fatalf("apply old: %v", err)
}
if changed || entry.Record.Epoch != 10 || entry.Record.Endpoints[0].Address != "quic://192.0.2.10:19443" {
t.Fatalf("older record replaced active entry: changed=%t entry=%+v", changed, entry)
}
// Step 3: epoch 11 applied with the third argument false becomes a
// Candidate; Active must keep returning the epoch-10 endpoint.
newer := testFabricRegistryGossipRecord(now.Add(2*time.Minute), 11)
newer.Endpoints[0].Address = "quic://192.0.2.11:19443"
newerSigned, err := SignFabricRegistryGossipRecord(newer, issuer, privateKey)
if err != nil {
t.Fatalf("sign newer: %v", err)
}
// Advance the policy clock so it matches the newer record's IssuedAt.
policy.Now = now.Add(2 * time.Minute)
entry, changed, err = registry.ApplyGossipRecord(newerSigned, policy, false)
if err != nil || !changed || entry.State != FabricRegistryCandidate {
t.Fatalf("apply newer candidate changed=%t entry=%+v err=%v", changed, entry, err)
}
activeRecord, ok := registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now)
if !ok || activeRecord.Endpoints[0].Address != "quic://192.0.2.10:19443" {
t.Fatalf("unverified newer candidate displaced active fallback: ok=%t record=%+v", ok, activeRecord)
}
// Step 4: after MarkLiveVerified the epoch-11 endpoint must be the one
// returned by Active.
if !registry.MarkLiveVerified("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now.Add(time.Second)) {
t.Fatal("mark live verified failed")
}
activeRecord, ok = registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", policy.Now.Add(time.Second))
if !ok || activeRecord.Endpoints[0].Address != "quic://192.0.2.11:19443" {
t.Fatalf("newer verified record not active: ok=%t record=%+v", ok, activeRecord)
}
}
// TestFabricRegistryResolveServicePrefersVerifiedScopedRegionalEndpoint
// exercises ResolveService with a cluster-scoped active record and an
// organization-scoped record that starts as an unverified candidate.
// While the organization record is a candidate, an organization-scoped
// request must fall back to the cluster record and put the endpoint of the
// preferred region first; once the organization record is marked live
// verified it must be returned instead.
func TestFabricRegistryResolveServicePrefersVerifiedScopedRegionalEndpoint(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
policy := FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}
registry := NewFabricRegistry()
// Cluster-scoped record with two endpoints of equal priority in
// different regions ("eu" and "us").
clusterRecord := testFabricRegistryGossipRecord(now, 10)
clusterRecord.Endpoints = []FabricRegistryEndpoint{
{EndpointID: "control-eu", Address: "quic://eu.example.test:19443", Transport: "direct_quic", Region: "eu", Priority: 10, Weight: 1},
{EndpointID: "control-us", Address: "quic://us.example.test:19443", Transport: "direct_quic", Region: "us", Priority: 10, Weight: 10},
}
signedCluster, err := SignFabricRegistryGossipRecord(clusterRecord, issuer, privateKey)
if err != nil {
t.Fatalf("sign cluster record: %v", err)
}
if _, _, err := registry.ApplyGossipRecord(signedCluster, policy, true); err != nil {
t.Fatalf("apply cluster record: %v", err)
}
// Organization-scoped record for org-1, applied with the third argument
// false so that it is not yet eligible for resolution.
orgRecord := testFabricRegistryGossipRecord(now.Add(time.Minute), 11)
orgRecord.Scope = FabricRegistryScopeOrganization
orgRecord.OrganizationID = "org-1"
orgRecord.Endpoints = []FabricRegistryEndpoint{
{EndpointID: "control-org", Address: "quic://org.example.test:19443", Transport: "direct_quic", Region: "eu", Priority: 1, Weight: 1},
}
signedOrg, err := SignFabricRegistryGossipRecord(orgRecord, issuer, privateKey)
if err != nil {
t.Fatalf("sign org record: %v", err)
}
policy.Now = now.Add(time.Minute)
if _, _, err := registry.ApplyGossipRecord(signedOrg, policy, false); err != nil {
t.Fatalf("apply org candidate: %v", err)
}
// Expect fallback to the cluster scope, with the "us" endpoint first
// because PreferredRegion is "us".
resolved := registry.ResolveService(FabricRegistryResolveRequest{
ClusterID: "cluster-1",
Service: FabricRegistryServiceControlAPI,
Scope: FabricRegistryScopeOrganization,
OrganizationID: "org-1",
PreferredRegion: "us",
Now: now.Add(time.Minute),
})
if !resolved.Found || resolved.Scope != FabricRegistryScopeCluster || resolved.Endpoints[0].EndpointID != "control-us" {
t.Fatalf("expected cluster fallback with preferred region endpoint, got %+v", resolved)
}
if !registry.MarkLiveVerified("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeOrganization, "org-1", now.Add(2*time.Minute)) {
t.Fatal("mark org live verified failed")
}
// After verification the organization record itself must be resolved.
resolved = registry.ResolveService(FabricRegistryResolveRequest{
ClusterID: "cluster-1",
Service: FabricRegistryServiceControlAPI,
Scope: FabricRegistryScopeOrganization,
OrganizationID: "org-1",
Now: now.Add(2 * time.Minute),
})
if !resolved.Found || resolved.Scope != FabricRegistryScopeOrganization || resolved.Endpoints[0].EndpointID != "control-org" {
t.Fatalf("expected verified organization record, got %+v", resolved)
}
// Both records are now active and none is left as a candidate.
snapshot := registry.Snapshot(now.Add(2 * time.Minute))
if snapshot.Active != 2 || snapshot.Candidate != 0 {
t.Fatalf("unexpected snapshot: %+v", snapshot)
}
}
// TestFabricRegistryVerifyCandidatesPromotesAfterQUICPong applies a candidate
// record whose endpoint points at a local test QUIC server, runs
// VerifyCandidates against it, and expects a single "reachable" result that
// promotes the candidate to the active record.
func TestFabricRegistryVerifyCandidatesPromotesAfterQUICPong(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
tlsConfig := testQUICTLSConfig(t)
listener := startQUICFabricEchoServerWithTLS(t, tlsConfig)
defer listener.Close()
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatal(err)
}
issuer := FabricRegistryTrustedIssuer{IssuerID: "authority-1", Role: FabricRegistryAuthorityControl, PublicKey: publicKey}
policy := FabricRegistryVerificationPolicy{
LocalClusterID: "cluster-1",
TrustedIssuers: []FabricRegistryTrustedIssuer{issuer},
RequiredSignatures: 1,
Now: now,
}
// Point the record at the test listener and pin its certificate hash
// before signing, so both values are covered by the signature.
record := testFabricRegistryGossipRecord(now, 12)
record.Endpoints[0].Address = "quic://" + listener.Addr().String()
record.Endpoints[0].PeerCertSHA256 = testQUICCertSHA256(t, tlsConfig)
signed, err := SignFabricRegistryGossipRecord(record, issuer, privateKey)
if err != nil {
t.Fatalf("sign record: %v", err)
}
registry := NewFabricRegistry()
if entry, changed, err := registry.ApplyGossipRecord(signed, policy, false); err != nil || !changed || entry.State != FabricRegistryCandidate {
t.Fatalf("apply candidate changed=%t entry=%+v err=%v", changed, entry, err)
}
results := registry.VerifyCandidates(context.Background(), NewQUICFabricTransport(nil), FabricRegistryLiveProbeRequest{
ClusterID: "cluster-1",
Timeout: 3 * time.Second,
Now: now.Add(time.Second),
MaxCandidates: 1,
})
if len(results) != 1 || results[0].Status != "reachable" || !results[0].Promoted {
t.Fatalf("unexpected live probe results: %+v", results)
}
if _, ok := registry.Active("cluster-1", FabricRegistryServiceControlAPI, FabricRegistryScopeCluster, "", now.Add(time.Second)); !ok {
t.Fatal("candidate was not promoted to active")
}
}
// testFabricRegistryGossipRecord builds an unsigned cluster-scoped control API
// record for "cluster-1" with the given epoch. The record is issued at now,
// expires ten minutes later, and carries one direct QUIC endpoint
// ("control-a" at quic://192.0.2.10:19443) that tests overwrite as needed.
func testFabricRegistryGossipRecord(now time.Time, epoch int64) FabricRegistryGossipRecord {
	endpoint := FabricRegistryEndpoint{
		EndpointID:       "control-a",
		Address:          "quic://192.0.2.10:19443",
		Transport:        "direct_quic",
		Reachability:     "public",
		ConnectivityMode: "direct",
		Priority:         1,
	}
	record := FabricRegistryGossipRecord{
		SchemaVersion: FabricRegistryGossipRecordSchema,
		ClusterID:     "cluster-1",
		Service:       FabricRegistryServiceControlAPI,
		Scope:         FabricRegistryScopeCluster,
		Epoch:         epoch,
		Generation:    "gen",
		IssuedAt:      now,
		ExpiresAt:     now.Add(10 * time.Minute),
		IssuerNodeID:  "authority-1",
		IssuerRole:    FabricRegistryAuthorityControl,
	}
	record.Endpoints = []FabricRegistryEndpoint{endpoint}
	return record
}
+74 -743
View File
@@ -20,7 +20,6 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
"github.com/gorilla/websocket"
)
type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error
@@ -55,6 +54,22 @@ type RemoteWorkspaceFrameSinkSessionMailboxConsumerResume interface {
// RemoteWorkspaceFrameSinkSessionMailboxPreflight is implemented by frame
// sinks that can produce a RemoteWorkspaceAdapterMailboxPreflightSnapshot for
// a consumer resume request on an adapter session mailbox.
// NOTE(review): presumably discovered via a type assertion on
// Server.RemoteWorkspaceFrameSink like the sibling mailbox interfaces — confirm.
type RemoteWorkspaceFrameSinkSessionMailboxPreflight interface {
PreflightAdapterSessionMailboxConsumerResume(adapterSessionID string, consumerID string, resumeFrom string, limit int, now time.Time) (RemoteWorkspaceAdapterMailboxPreflightSnapshot, error)
}
// FabricSessionEventLogEntry is the JSON-serializable record describing one
// fabric session event. Every field except Event and ObservedAt is omitted
// from the JSON output when it holds its zero value.
type FabricSessionEventLogEntry struct {
// Event is the log event name.
Event string `json:"event"`
ClusterID string `json:"cluster_id,omitempty"`
NodeID string `json:"node_id,omitempty"`
PeerID string `json:"peer_id,omitempty"`
// AcceptedBy names the authorization path that admitted the session.
AcceptedBy string `json:"accepted_by,omitempty"`
SessionID string `json:"session_id,omitempty"`
// SessionEvent, StreamID, Sequence and TrafficClass carry details of a
// fabricproto session event when the entry reports one.
SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
StreamID uint64 `json:"stream_id,omitempty"`
Sequence uint64 `json:"sequence,omitempty"`
TrafficClass fabricproto.TrafficClass `json:"traffic_class,omitempty"`
RemoteAddr string `json:"remote_addr,omitempty"`
// Reason is a free-form explanation, e.g. an error message.
Reason string `json:"reason,omitempty"`
// ObservedAt is the time the entry was created.
ObservedAt time.Time `json:"observed_at"`
}
type VPNPacketIngress interface {
SendClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, packets [][]byte) error
ReceiveClientPacketBatch(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration) ([][]byte, error)
@@ -69,24 +84,21 @@ type VPNPacketIngressRoutePreference interface {
}
type Server struct {
Local PeerIdentity
SyntheticRuntime *SyntheticRuntime
ProductionForwardingEnabled bool
ProductionEnvelopeObserver ProductionEnvelopeObserver
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
ProductionForwardTransport ProductionForwardTransport
ProductionForwardLogger ProductionForwardLogger
DisableHTTPDataPlane bool
FabricServiceChannelLogger FabricServiceChannelAccessLogger
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
ProductionRoutes []SyntheticRoute
VPNPacketIngress VPNPacketIngress
BackendProxyBaseURL string
ClusterAuthorityPublicKey string
ServiceChannelIntrospection bool
FabricSessionEnabled bool
FabricSessionWebSocketEnabled bool
FabricSessionLogger FabricSessionEventLogger
Local PeerIdentity
SyntheticRuntime *SyntheticRuntime
ProductionForwardingEnabled bool
ProductionEnvelopeObserver ProductionEnvelopeObserver
ProductionEnvelopeDelivery ProductionEnvelopeDelivery
ProductionForwardTransport ProductionForwardTransport
ProductionForwardLogger ProductionForwardLogger
DisableHTTPDataPlane bool
FabricServiceChannelLogger FabricServiceChannelAccessLogger
RemoteWorkspaceFrameSink RemoteWorkspaceFrameSink
ProductionRoutes []SyntheticRoute
VPNPacketIngress VPNPacketIngress
BackendProxyBaseURL string
ClusterAuthorityPublicKey string
ServiceChannelIntrospection bool
}
func (s Server) Handler() http.Handler {
@@ -94,9 +106,6 @@ func (s Server) Handler() http.Handler {
mux.HandleFunc("/mesh/v1/health", s.handleHealth)
mux.HandleFunc("/mesh/v1/forward", s.handleForward)
mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe)
if s.FabricSessionEnabled && s.FabricSessionWebSocketEnabled {
mux.HandleFunc("/mesh/v1/fabric/session/ws", s.handleFabricSessionWebSocket)
}
if s.RemoteWorkspaceFrameSink != nil {
mux.HandleFunc("/mesh/v1/remote-workspace/adapter-sessions/", s.handleRemoteWorkspaceAdapterSessionControl)
}
@@ -196,185 +205,6 @@ func (s Server) handleRemoteWorkspaceAdapterSessionSnapshot(w http.ResponseWrite
_ = json.NewEncoder(w).Encode(snapshotter.SnapshotAdapterSessions(includeTerminal, limit, time.Now().UTC()))
}
// FabricSessionEventLogEntry is the JSON-serializable record describing one
// fabric session event. Every field except Event and ObservedAt is omitted
// from the JSON output when it holds its zero value.
type FabricSessionEventLogEntry struct {
// Event is the log event name.
Event string `json:"event"`
ClusterID string `json:"cluster_id,omitempty"`
NodeID string `json:"node_id,omitempty"`
PeerID string `json:"peer_id,omitempty"`
// AcceptedBy names the authorization path that admitted the session.
AcceptedBy string `json:"accepted_by,omitempty"`
SessionID string `json:"session_id,omitempty"`
// SessionEvent, StreamID, Sequence and TrafficClass carry details of a
// fabricproto session event when the entry reports one.
SessionEvent fabricproto.SessionEventType `json:"session_event,omitempty"`
StreamID uint64 `json:"stream_id,omitempty"`
Sequence uint64 `json:"sequence,omitempty"`
TrafficClass fabricproto.TrafficClass `json:"traffic_class,omitempty"`
RemoteAddr string `json:"remote_addr,omitempty"`
// Reason is a free-form explanation, e.g. an error message.
Reason string `json:"reason,omitempty"`
// ObservedAt is the time the entry was created.
ObservedAt time.Time `json:"observed_at"`
}
// fabricSessionAuthorityPayload is the signed JSON document that authorizes a
// fabric session. It is decoded from the
// X-RAP-Fabric-Session-Authority-Payload header after the signature over the
// raw payload bytes has been verified.
type fabricSessionAuthorityPayload struct {
// SchemaVersion must equal "rap.fabric_session_authority.v1".
SchemaVersion string `json:"schema_version"`
// ClusterID must match the local node's cluster.
ClusterID string `json:"cluster_id"`
// SessionID must be non-blank.
SessionID string `json:"session_id"`
SourceNodeID string `json:"source_node_id,omitempty"`
// SelectedEntryNodeID, when set, must match the local node ID (if the
// local node ID is known).
SelectedEntryNodeID string `json:"selected_entry_node_id,omitempty"`
// TokenHash binds the payload to the presented bearer token.
TokenHash string `json:"token_hash"`
IssuedAt time.Time `json:"issued_at"`
// ExpiresAt is enforced only when non-zero.
ExpiresAt time.Time `json:"expires_at"`
}
// fabricSessionAuthDecision is the outcome of validating a fabric session
// request.
type fabricSessionAuthDecision struct {
// AcceptedBy is "signed" when a verified authority payload was presented
// and "legacy_unsigned" otherwise.
AcceptedBy string
// SessionID is the trimmed session ID from the signed payload; it is empty
// for legacy unsigned requests.
SessionID string
}
// handleFabricSessionWebSocket serves the fabric session WebSocket endpoint.
//
// Only GET is accepted. The request is authorized through
// validateFabricSessionRequest, upgraded to a WebSocket, and then driven by a
// fabricproto.TransportLoop until the loop returns. An "opened" entry, one
// entry per session event, and a final "closed" entry are written to the
// fabric session logger. The closed entry carries a Reason only when the loop
// ended with an error other than context cancellation.
func (s Server) handleFabricSessionWebSocket(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodGet {
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	decision, ok := s.validateFabricSessionRequest(w, r)
	if !ok {
		return
	}
	upgrader := websocket.Upgrader{
		// NOTE(review): every origin is accepted; access control relies on
		// the token/authority validation above.
		CheckOrigin: func(_ *http.Request) bool { return true },
	}
	conn, err := upgrader.Upgrade(w, r, nil)
	if err != nil {
		// Upgrade has already written the HTTP error response.
		return
	}
	defer conn.Close()

	// newEntry fills in the fields shared by every log line of this connection.
	newEntry := func(event string) FabricSessionEventLogEntry {
		return FabricSessionEventLogEntry{
			Event:      event,
			ClusterID:  s.Local.ClusterID,
			NodeID:     s.Local.NodeID,
			AcceptedBy: decision.AcceptedBy,
			SessionID:  decision.SessionID,
			RemoteAddr: r.RemoteAddr,
			ObservedAt: time.Now().UTC(),
		}
	}

	s.logFabricSession(newEntry("fabric_session_websocket_opened"))

	loop := fabricproto.TransportLoop{
		Session: fabricproto.NewSession(fabricproto.SessionConfig{}),
		OnEvent: func(event fabricproto.SessionEvent) ([]fabricproto.Frame, error) {
			entry := newEntry("fabric_session_event")
			entry.SessionEvent = event.Type
			entry.StreamID = event.StreamID
			entry.Sequence = event.Sequence
			entry.TrafficClass = event.TrafficClass
			s.logFabricSession(entry)
			// Events are only observed; no reply frames are produced.
			return nil, nil
		},
	}
	runErr := loop.RunWebSocket(r.Context(), conn, fabricproto.WebSocketTransportConfig{})

	closed := newEntry("fabric_session_websocket_closed")
	if runErr != nil && !errors.Is(runErr, context.Canceled) {
		closed.Reason = runErr.Error()
	}
	s.logFabricSession(closed)
}
// validateFabricSessionRequest authorizes a fabric session request.
//
// The bearer token must start with "rap_fsn_" (otherwise 401 is written).
// The optional signed authority headers are then checked through
// verifyFabricSessionAuthority (a failure writes 403). On success the
// decision reports whether the request was admitted with a signed payload
// ("signed", including its session ID) or without one ("legacy_unsigned").
// The boolean result is false whenever an error response has been written.
func (s Server) validateFabricSessionRequest(w http.ResponseWriter, r *http.Request) (fabricSessionAuthDecision, bool) {
	token := fabricSessionBearerToken(r)
	if !strings.HasPrefix(token, "rap_fsn_") {
		http.Error(w, "fabric session token is required", http.StatusUnauthorized)
		return fabricSessionAuthDecision{}, false
	}

	payload, err := s.verifyFabricSessionAuthority(r, token)
	if err != nil {
		http.Error(w, err.Error(), http.StatusForbidden)
		return fabricSessionAuthDecision{}, false
	}

	if payload == nil {
		// No authority headers were sent and none are required.
		return fabricSessionAuthDecision{AcceptedBy: "legacy_unsigned"}, true
	}
	return fabricSessionAuthDecision{
		AcceptedBy: "signed",
		SessionID:  strings.TrimSpace(payload.SessionID),
	}, true
}
// verifyFabricSessionAuthority checks the optional signed authority that
// accompanies a fabric session token.
//
// It returns (nil, nil) when neither authority header is present and no
// cluster authority public key is configured (legacy unsigned mode). It
// returns the decoded payload when both headers are present, the signature
// verifies against ClusterAuthorityPublicKey, and the payload matches this
// node and the presented token. Every failure path returns an error;
// all but the "headers sent but no public key configured" case wrap
// ErrUnauthorizedChannel with a reason, and that case returns the bare sentinel.
func (s Server) verifyFabricSessionAuthority(r *http.Request, token string) (*fabricSessionAuthorityPayload, error) {
publicKey := strings.TrimSpace(s.ClusterAuthorityPublicKey)
payloadHeader := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Authority-Payload"))
signatureHeader := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Authority-Signature"))
// No authority headers: allowed only when the node has no public key
// configured, i.e. signing is not enforced.
if payloadHeader == "" && signatureHeader == "" {
if publicKey != "" {
return nil, fmt.Errorf("%w: signed fabric session authority is required", ErrUnauthorizedChannel)
}
return nil, nil
}
// Headers were sent but there is no key to verify them against.
if publicKey == "" {
return nil, ErrUnauthorizedChannel
}
if payloadHeader == "" || signatureHeader == "" {
return nil, fmt.Errorf("%w: fabric session authority payload and signature are required together", ErrUnauthorizedChannel)
}
payloadRaw, err := decodeHeaderJSON(payloadHeader)
if err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority payload", ErrUnauthorizedChannel)
}
signatureRaw, err := decodeHeaderJSON(signatureHeader)
if err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority signature", ErrUnauthorizedChannel)
}
var signature authority.Signature
if err := json.Unmarshal(signatureRaw, &signature); err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority signature", ErrUnauthorizedChannel)
}
// The signature is verified over the raw payload bytes before the payload
// is parsed, so no unauthenticated field is interpreted.
if err := authority.VerifyRaw(publicKey, payloadRaw, signature); err != nil {
return nil, fmt.Errorf("%w: fabric session authority signature rejected", ErrUnauthorizedChannel)
}
var payload fabricSessionAuthorityPayload
if err := json.Unmarshal(payloadRaw, &payload); err != nil {
return nil, fmt.Errorf("%w: invalid fabric session authority payload", ErrUnauthorizedChannel)
}
// The payload must target this cluster, be bound to the presented token,
// and name a session.
if payload.SchemaVersion != "rap.fabric_session_authority.v1" ||
payload.ClusterID != s.Local.ClusterID ||
payload.TokenHash != fabricSessionTokenHash(token) ||
strings.TrimSpace(payload.SessionID) == "" {
return nil, fmt.Errorf("%w: fabric session authority payload mismatch", ErrUnauthorizedChannel)
}
// The entry-node pin is enforced only when both sides declare a node ID.
if payload.SelectedEntryNodeID != "" && s.Local.NodeID != "" && payload.SelectedEntryNodeID != s.Local.NodeID {
return nil, fmt.Errorf("%w: fabric session entry node mismatch", ErrUnauthorizedChannel)
}
// NOTE(review): a zero ExpiresAt is accepted as non-expiring and IssuedAt
// is not checked — confirm this is intended.
if !payload.ExpiresAt.IsZero() && !payload.ExpiresAt.After(time.Now().UTC()) {
return nil, fmt.Errorf("%w: fabric session lease expired", ErrUnauthorizedChannel)
}
return &payload, nil
}
// logFabricSession forwards entry to the configured fabric session logger.
// It is a no-op when no logger is set.
func (s Server) logFabricSession(entry FabricSessionEventLogEntry) {
	logger := s.FabricSessionLogger
	if logger == nil {
		return
	}
	logger(entry)
}
func (s Server) handleRemoteWorkspaceAdapterSessionMailbox(w http.ResponseWriter, r *http.Request) {
reader, ok := s.RemoteWorkspaceFrameSink.(RemoteWorkspaceFrameSinkSessionMailbox)
if !ok {
@@ -711,15 +541,15 @@ func parseRemoteWorkspaceAdapterSessionControlPath(path string) (string, bool) {
}
func (s Server) handleVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
if clusterID, vpnConnectionID, ok := parseVPNClientPacketWebSocketPath(r.URL.Path); ok {
s.handleVPNPacketWebSocket(w, r, clusterID, "", vpnConnectionID, false, true, "")
if isVPNClientPacketWebSocketPath(r.URL.Path) {
http.Error(w, "legacy VPN WebSocket dataplane is removed; use QUIC fabric route", http.StatusGone)
return true
}
clusterID, vpnConnectionID, ok := parseVPNClientPacketPath(r.URL.Path)
if !ok {
if _, _, ok := parseVPNClientPacketPath(r.URL.Path); !ok {
return false
}
return s.handleVPNPacketHTTP(w, r, clusterID, "", vpnConnectionID, "", false, true, "")
http.Error(w, "legacy VPN HTTP dataplane is removed; use QUIC fabric route", http.StatusGone)
return true
}
func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.ResponseWriter, r *http.Request) bool {
@@ -728,7 +558,7 @@ func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.Response
return false
}
if webSocket {
http.Error(w, "remote workspace service-channel websocket forwarding is not implemented", http.StatusNotImplemented)
http.Error(w, "remote workspace service-channel websocket ingress is removed; use QUIC fabric route", http.StatusGone)
return true
}
decision, valid := s.validateFabricServiceChannelRequest(w, r, clusterID, channelID, resourceID, FabricServiceClassRemoteWorkspace, channelClass)
@@ -809,7 +639,7 @@ func (s Server) handleFabricServiceChannelRemoteWorkspaceIngress(w http.Response
"channel_id": channelID,
"resource_id": resourceID,
"data_plane": "validated",
"payload_flow": "not_implemented",
"payload_flow": "validated_only",
})
return true
}
@@ -898,7 +728,7 @@ func validateRemoteWorkspaceFrameBatchProbe(payload []byte, requiredChannelClass
return decoded, fmt.Errorf("unsupported remote workspace frame batch schema")
}
if !decoded.ProbeOnly {
return decoded, fmt.Errorf("remote workspace payload forwarding is not implemented")
return decoded, fmt.Errorf("remote workspace production payload forwarding is disabled; probe_only required")
}
if strings.TrimSpace(strings.ToLower(decoded.ServiceClass)) != FabricServiceClassRemoteWorkspace {
return decoded, fmt.Errorf("remote workspace frame batch service class mismatch")
@@ -952,438 +782,6 @@ func isAllowedRemoteWorkspaceAdapterFrameDirection(channel string, direction str
}
}
// handleFabricServiceChannelVPNPacketIngress handles VPN packet requests that
// arrive on a fabric service-channel path, in either WebSocket or plain HTTP
// form.
//
// It returns false when the URL matches neither service-channel path shape,
// so the caller can try other handlers. It returns true when the request was
// consumed, including the case where validation failed (the validator is
// handed the ResponseWriter for that purpose); on the HTTP path it returns
// whatever handleVPNPacketHTTP reports.
func (s Server) handleFabricServiceChannelVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
// WebSocket variant of the service-channel path.
if clusterID, channelID, vpnConnectionID, ok := parseFabricServiceChannelVPNPacketWebSocketPath(r.URL.Path); ok {
decision, valid := s.validateFabricServiceChannelVPNRequest(w, r, clusterID, channelID, vpnConnectionID)
if !valid {
return true
}
s.logFabricServiceChannelAccess(r, clusterID, channelID, vpnConnectionID, decision)
s.preferVPNPacketIngressRoute(decision.PreferredRouteID)
s.handleVPNPacketWebSocket(w, r, clusterID, channelID, vpnConnectionID, decision.ForceBackendFallback, decision.BackendFallbackAllowed(), decision.BackendRelayPolicy)
return true
}
// Plain HTTP variant.
clusterID, channelID, vpnConnectionID, ok := parseFabricServiceChannelVPNPacketPath(r.URL.Path)
if !ok {
return false
}
decision, valid := s.validateFabricServiceChannelVPNRequest(w, r, clusterID, channelID, vpnConnectionID)
if !valid {
return true
}
// Only the HTTP variant exposes the accepting authority in a response header.
w.Header().Set("X-RAP-Service-Channel-Accepted-By", decision.AcceptedBy)
s.logFabricServiceChannelAccess(r, clusterID, channelID, vpnConnectionID, decision)
s.preferVPNPacketIngressRoute(decision.PreferredRouteID)
// Backend API path used if the request has to be relayed to the backend.
backendPath := "/api/v1/clusters/" + clusterID + "/vpn-connections/" + vpnConnectionID + "/tunnel/client/packets"
return s.handleVPNPacketHTTP(w, r, clusterID, channelID, vpnConnectionID, backendPath, decision.ForceBackendFallback, decision.BackendFallbackAllowed(), decision.BackendRelayPolicy)
}
// preferVPNPacketIngressRoute passes a preferred client route ID to the VPN
// packet ingress when the ingress implements VPNPacketIngressRoutePreference.
// Blank route IDs, a missing ingress, and ingresses without that capability
// are silently ignored.
func (s Server) preferVPNPacketIngressRoute(routeID string) {
	trimmed := strings.TrimSpace(routeID)
	if trimmed == "" {
		return
	}
	if s.VPNPacketIngress == nil {
		return
	}
	preferrer, ok := s.VPNPacketIngress.(VPNPacketIngressRoutePreference)
	if !ok {
		return
	}
	preferrer.PreferClientRoute(trimmed)
}
// handleVPNPacketHTTP serves the HTTP form of VPN packet ingress.
//
// POST carries client packets towards the fabric: the body is a single packet
// or, with ?batch=true, an encoded packet batch. GET polls for packets
// destined to the client. Any other method yields 405. Every path returns
// true, i.e. the request is always considered handled.
//
// forceBackendFallback bypasses the fabric route and relays through the
// backend when backendFallbackAllowed permits it; when it does not, a policy
// violation is logged. backendFallbackAllowed also enables relaying after a
// failed fabric send (POST) or an empty fabric receive (GET).
//
// NOTE(review): s.VPNPacketIngress is used without a nil check here, unlike
// handleVPNPacketWebSocket — confirm callers guarantee it is set.
func (s Server) handleVPNPacketHTTP(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string, backendFallbackPath string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) bool {
switch r.Method {
case http.MethodPost:
// The body size is capped by MaxProductionVPNPacketPayloadBytes.
body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, MaxProductionVPNPacketPayloadBytes))
if err != nil {
http.Error(w, "invalid vpn packet payload", http.StatusBadRequest)
return true
}
if r.URL.Query().Get("batch") != "true" && len(body) == 0 {
http.Error(w, "empty vpn packet payload", http.StatusBadRequest)
return true
}
// Without ?batch=true the whole body is treated as one packet.
packets := [][]byte{body}
if r.URL.Query().Get("batch") == "true" {
packets, err = decodeVPNIngressPacketBatch(body)
if err != nil {
http.Error(w, "invalid vpn packet batch", http.StatusBadRequest)
return true
}
}
packets = cleanVPNIngressPacketBatch(packets)
if len(packets) == 0 {
http.Error(w, "empty vpn packet batch", http.StatusBadRequest)
return true
}
if forceBackendFallback {
// The original body (not the cleaned batch) is relayed to the backend.
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, body, backendFallbackPath) {
return true
}
s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
http.Error(w, ErrRouteNotFound.Error(), vpnIngressStatusCode(ErrRouteNotFound))
return true
}
trafficClass := inferVPNPacketTrafficClass(r.Header.Get("X-RAP-Traffic-Class"), packets)
var sendErr error
// Use the traffic-class-aware send when the ingress supports it.
if classIngress, ok := s.VPNPacketIngress.(VPNPacketIngressTrafficClass); ok {
sendErr = classIngress.SendClientPacketBatchWithTrafficClass(r.Context(), clusterID, vpnConnectionID, trafficClass, packets)
} else {
sendErr = s.VPNPacketIngress.SendClientPacketBatch(r.Context(), clusterID, vpnConnectionID, packets)
}
if sendErr != nil {
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, body, backendFallbackPath) {
return true
}
s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_send_failed_backend_fallback_blocked", sendErr.Error())
http.Error(w, sendErr.Error(), vpnIngressStatusCode(sendErr))
return true
}
w.WriteHeader(http.StatusAccepted)
return true
case http.MethodGet:
if forceBackendFallback {
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, nil, backendFallbackPath) {
return true
}
// Unlike POST, a blocked forced fallback on GET answers 204 rather
// than an error status.
s.logFabricServiceChannelViolation(r, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
w.WriteHeader(http.StatusNoContent)
return true
}
timeout := vpnIngressTimeout(r)
packets, err := s.VPNPacketIngress.ReceiveClientPacketBatch(r.Context(), clusterID, vpnConnectionID, timeout)
if err != nil {
// A receive error is reported directly; no backend fallback is tried.
http.Error(w, err.Error(), vpnIngressStatusCode(err))
return true
}
packets = cleanVPNIngressPacketBatch(packets)
if len(packets) == 0 {
if backendFallbackAllowed && s.proxyVPNPacketIngressToBackendPath(w, r, nil, backendFallbackPath) {
return true
}
w.WriteHeader(http.StatusNoContent)
return true
}
if r.URL.Query().Get("batch") == "true" {
w.Header().Set("Content-Type", "application/vnd.rap.vpn-packet-batch.v1")
_, _ = w.Write(encodeVPNIngressPacketBatch(packets))
return true
}
// NOTE(review): in non-batch mode only the first packet is written; any
// further packets returned by the ingress are dropped — confirm intended.
w.Header().Set("Content-Type", "application/octet-stream")
_, _ = w.Write(packets[0])
return true
default:
w.WriteHeader(http.StatusMethodNotAllowed)
return true
}
}
// handleVPNPacketWebSocket serves the WebSocket form of VPN packet ingress.
//
// Only GET is accepted (405 otherwise) and a VPN packet ingress must be
// configured (503 otherwise). After the upgrade it starts one goroutine that
// reads client packets and one that writes packets to the client, and returns
// as soon as the request context ends or either goroutine reports a result.
func (s Server) handleVPNPacketWebSocket(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) {
if r.Method != http.MethodGet {
w.WriteHeader(http.StatusMethodNotAllowed)
return
}
if s.VPNPacketIngress == nil {
http.Error(w, ErrForwardRuntimeUnavailable.Error(), http.StatusServiceUnavailable)
return
}
upgrader := websocket.Upgrader{
// NOTE(review): every origin is accepted.
CheckOrigin: func(_ *http.Request) bool { return true },
}
conn, err := upgrader.Upgrade(w, r, nil)
if err != nil {
return
}
defer conn.Close()
// Incoming messages are capped at MaxProductionVPNPacketPayloadBytes.
conn.SetReadLimit(MaxProductionVPNPacketPayloadBytes)
ctx, cancel := context.WithCancel(r.Context())
defer cancel()
trafficClass := r.Header.Get("X-RAP-Traffic-Class")
// Buffered for both goroutines so neither blocks on send after this
// handler has stopped receiving.
errCh := make(chan error, 2)
go func() {
errCh <- s.readVPNPacketWebSocket(ctx, conn, clusterID, channelID, vpnConnectionID, trafficClass, forceBackendFallback, backendFallbackAllowed, backendRelayPolicy)
}()
go func() {
errCh <- s.writeVPNPacketWebSocket(ctx, conn, clusterID, channelID, vpnConnectionID, forceBackendFallback, backendFallbackAllowed, backendRelayPolicy)
}()
// The first result (its value is discarded) or context end tears the
// connection down: cancel stops the writer loop and the deferred Close
// runs on return.
select {
case <-ctx.Done():
case <-errCh:
cancel()
}
}
// readVPNPacketWebSocket is the client-to-fabric half of the VPN WebSocket.
//
// It loops reading binary messages, decodes each as a packet batch, and
// forwards the batch. Non-binary messages and batches that are empty after
// cleaning are skipped. The loop ends, returning the error, on a read or
// decode failure, on a policy-blocked forced fallback, or on a send failure
// that is neither retryable nor recoverable via the backend.
func (s Server) readVPNPacketWebSocket(ctx context.Context, conn *websocket.Conn, clusterID string, channelID string, vpnConnectionID string, trafficClass string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) error {
for {
messageType, payload, err := conn.ReadMessage()
if err != nil {
return err
}
if messageType != websocket.BinaryMessage {
continue
}
packets, err := decodeVPNIngressPacketBatch(payload)
if err != nil {
return err
}
packets = cleanVPNIngressPacketBatch(packets)
if len(packets) == 0 {
continue
}
if forceBackendFallback {
if !backendFallbackAllowed {
// A nil request is passed because no per-message *http.Request exists here.
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
return ErrRouteNotFound
}
// The raw message payload (not the cleaned batch) is relayed to the backend.
if proxyErr := s.backendVPNPacketPost(ctx, clusterID, vpnConnectionID, payload); proxyErr != nil {
return proxyErr
}
continue
}
// Route errors are retried inside the send helper only when there is no
// backend fallback to use instead.
sendErr := s.sendVPNPacketWebSocketBatch(ctx, clusterID, vpnConnectionID, inferVPNPacketTrafficClass(trafficClass, packets), packets, !backendFallbackAllowed)
if sendErr != nil {
if !backendFallbackAllowed {
s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_send_failed_backend_fallback_blocked", sendErr.Error())
// A still-retryable error after all retries drops this batch but
// keeps the connection open.
if isRetryableVPNPacketIngressError(sendErr) {
continue
}
return sendErr
}
// If the backend relay also fails, the original fabric error is returned.
if proxyErr := s.backendVPNPacketPost(ctx, clusterID, vpnConnectionID, payload); proxyErr != nil {
return sendErr
}
}
}
}
// sendVPNPacketWebSocketBatch forwards a packet batch to the VPN packet
// ingress, using the traffic-class-aware method when the ingress implements
// VPNPacketIngressTrafficClass.
//
// When retryRouteErrors is true, errors accepted by
// isRetryableVPNPacketIngressError are retried up to maxAttempts times with a
// linearly growing pause (75ms, 125ms, ...) between attempts. Any other error,
// or any error when retryRouteErrors is false, is returned immediately.
// The context error is returned if ctx ends before or between attempts.
// After the final attempt the last send error is returned without a further
// pause.
func (s Server) sendVPNPacketWebSocketBatch(ctx context.Context, clusterID string, vpnConnectionID string, trafficClass string, packets [][]byte, retryRouteErrors bool) error {
	const maxAttempts = 6
	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if err := ctx.Err(); err != nil {
			return err
		}
		var sendErr error
		if classIngress, ok := s.VPNPacketIngress.(VPNPacketIngressTrafficClass); ok {
			sendErr = classIngress.SendClientPacketBatchWithTrafficClass(ctx, clusterID, vpnConnectionID, trafficClass, packets)
		} else {
			sendErr = s.VPNPacketIngress.SendClientPacketBatch(ctx, clusterID, vpnConnectionID, packets)
		}
		if sendErr == nil {
			return nil
		}
		lastErr = sendErr
		if !retryRouteErrors || !isRetryableVPNPacketIngressError(sendErr) {
			return sendErr
		}
		// No attempt follows the last one, so backing off again would only
		// delay reporting the failure to the caller.
		if attempt == maxAttempts-1 {
			break
		}
		timer := time.NewTimer(time.Duration(75+attempt*50) * time.Millisecond)
		select {
		case <-ctx.Done():
			timer.Stop()
			return ctx.Err()
		case <-timer.C:
		}
	}
	return lastErr
}
// isRetryableVPNPacketIngressError reports whether err wraps one of the
// sentinel errors that the VPN WebSocket send/receive helpers retry on:
// ErrRouteNotFound, ErrForwardRuntimeUnavailable, ErrForwardPeerUnavailable
// or ErrSyntheticPeerUnavailable.
func isRetryableVPNPacketIngressError(err error) bool {
	switch {
	case errors.Is(err, ErrRouteNotFound):
		return true
	case errors.Is(err, ErrForwardRuntimeUnavailable):
		return true
	case errors.Is(err, ErrForwardPeerUnavailable):
		return true
	case errors.Is(err, ErrSyntheticPeerUnavailable):
		return true
	default:
		return false
	}
}
// receiveVPNPacketWebSocketBatch polls the VPN packet ingress for packets
// destined to the client.
//
// With retryRouteErrors set, errors accepted by
// isRetryableVPNPacketIngressError are retried up to maxAttempts times with a
// growing pause after each failed attempt; if they persist, the result is
// (nil, nil) so the caller sees "no packets" rather than a failure. Any other
// error, or any error when retryRouteErrors is false, is returned at once.
// The context error is returned if ctx ends before or between attempts.
func (s Server) receiveVPNPacketWebSocketBatch(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration, retryRouteErrors bool) ([][]byte, error) {
	const maxAttempts = 4
	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if ctxErr := ctx.Err(); ctxErr != nil {
			return nil, ctxErr
		}
		batch, recvErr := s.VPNPacketIngress.ReceiveClientPacketBatch(ctx, clusterID, vpnConnectionID, timeout)
		if recvErr == nil {
			return batch, nil
		}
		lastErr = recvErr
		retryable := retryRouteErrors && isRetryableVPNPacketIngressError(recvErr)
		if !retryable {
			return nil, recvErr
		}
		// The pause also runs after the final attempt.
		// NOTE(review): presumably this paces the caller's polling loop — confirm.
		backoff := time.NewTimer(time.Duration(75+attempt*50) * time.Millisecond)
		select {
		case <-ctx.Done():
			backoff.Stop()
			return nil, ctx.Err()
		case <-backoff.C:
		}
	}
	// Retries exhausted on a retryable route error: report an empty result.
	if retryRouteErrors && isRetryableVPNPacketIngressError(lastErr) {
		return nil, nil
	}
	return nil, lastErr
}
// writeVPNPacketWebSocket pumps packets destined for the client onto conn
// until the context ends or an unrecoverable error occurs.
//
// Each iteration:
//   - unless forceBackendFallback is set, polls the fabric ingress (50ms
//     timeout); route errors are retried inside that call only when backend
//     fallback is NOT allowed;
//   - if backend fallback is not allowed, a forced fallback or a fabric
//     receive error is logged as a data-plane violation and ends the loop;
//   - if backend fallback is allowed and the fabric produced nothing (forced,
//     failed, or empty), polls the backend instead; the fabric error is
//     returned only when the backend poll failed as well;
//   - writes any packets as one binary message with a 5s write deadline;
//   - otherwise sends a "rap-vpn" ping once 15s have passed since the last one.
//
// NOTE(review): a backend poll error is dropped whenever the fabric receive
// did not itself fail (including the forced-fallback case), and the loop then
// iterates again with no explicit delay — confirm this is intended.
func (s Server) writeVPNPacketWebSocket(ctx context.Context, conn *websocket.Conn, clusterID string, channelID string, vpnConnectionID string, forceBackendFallback bool, backendFallbackAllowed bool, backendRelayPolicy string) error {
	const (
		pollTimeout  = 50 * time.Millisecond
		writeTimeout = 5 * time.Second
		pingInterval = 15 * time.Second
	)

	lastPing := time.Now()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		var (
			batch   [][]byte
			recvErr error
		)
		if !forceBackendFallback {
			batch, recvErr = s.receiveVPNPacketWebSocketBatch(ctx, clusterID, vpnConnectionID, pollTimeout, !backendFallbackAllowed)
		}

		if !backendFallbackAllowed {
			// Policy forbids the backend relay: anything that would need it is fatal.
			if forceBackendFallback {
				s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "backend_fallback_blocked_by_policy", ErrRouteNotFound.Error())
				return ErrRouteNotFound
			}
			if recvErr != nil {
				s.logFabricServiceChannelViolation(nil, clusterID, channelID, vpnConnectionID, backendRelayPolicy, "fabric_route_receive_failed_backend_fallback_blocked", recvErr.Error())
				return recvErr
			}
		}

		if backendFallbackAllowed && (forceBackendFallback || recvErr != nil || len(batch) == 0) {
			fromBackend, backendErr := s.backendVPNPacketGet(ctx, clusterID, vpnConnectionID, pollTimeout)
			if backendErr != nil && recvErr != nil {
				// Both paths failed; surface the fabric error.
				return recvErr
			}
			if len(fromBackend) > 0 {
				batch = fromBackend
			}
		}

		if len(batch) == 0 {
			// Idle: keep the connection alive with a periodic ping.
			if time.Since(lastPing) < pingInterval {
				continue
			}
			if err := conn.SetWriteDeadline(time.Now().Add(writeTimeout)); err != nil {
				return err
			}
			if err := conn.WriteMessage(websocket.PingMessage, []byte("rap-vpn")); err != nil {
				return err
			}
			lastPing = time.Now()
			continue
		}

		if err := conn.SetWriteDeadline(time.Now().Add(writeTimeout)); err != nil {
			return err
		}
		if err := conn.WriteMessage(websocket.BinaryMessage, encodeVPNIngressPacketBatch(batch)); err != nil {
			return err
		}
	}
}
// backendVPNPacketPost relays an already-encoded client packet batch to the
// backend configured in s.BackendProxyBaseURL.
//
// It returns ErrRouteNotFound when no backend base URL is configured, the
// transport error when the request cannot be built or sent, and a formatted
// error carrying the status code for any non-2xx response.
//
// clusterID and vpnConnectionID are path-escaped before being placed in the
// URL so that characters such as '/', '?' or '#' in an identifier cannot
// change which backend path or query is requested.
func (s Server) backendVPNPacketPost(ctx context.Context, clusterID string, vpnConnectionID string, batchPayload []byte) error {
	base := strings.TrimRight(strings.TrimSpace(s.BackendProxyBaseURL), "/")
	if base == "" {
		return ErrRouteNotFound
	}
	endpoint := base +
		"/clusters/" + url.PathEscape(clusterID) +
		"/vpn-connections/" + url.PathEscape(vpnConnectionID) +
		"/tunnel/client/packets?batch=true"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(batchPayload))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/octet-stream")
	// Tell the backend which entry node and cluster relayed the batch.
	req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
	req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	// Best-effort, bounded drain so the transport can reuse the connection;
	// the body content itself is not needed.
	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 64<<10))
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("backend vpn packet post failed: status=%d", resp.StatusCode)
	}
	return nil
}
// backendVPNPacketGet polls the backend configured in s.BackendProxyBaseURL
// for packets destined for the client and decodes the returned batch.
//
// A non-positive timeout is replaced by 50ms; the value is sent to the backend
// as the timeout_ms query parameter in whole milliseconds. It returns
// ErrRouteNotFound when no backend base URL is configured, (nil, nil) for a
// 204 response or an empty body, and a formatted error carrying the status
// code for any other non-2xx response.
//
// clusterID and vpnConnectionID are path-escaped before being placed in the
// URL so that characters such as '/', '?' or '#' in an identifier cannot
// change which backend path or query is requested.
//
// NOTE(review): the body is read through a limit of
// MaxProductionVPNPacketPayloadBytes; a longer response is truncated before
// decoding rather than rejected — confirm that is acceptable for batches.
func (s Server) backendVPNPacketGet(ctx context.Context, clusterID string, vpnConnectionID string, timeout time.Duration) ([][]byte, error) {
	base := strings.TrimRight(strings.TrimSpace(s.BackendProxyBaseURL), "/")
	if base == "" {
		return nil, ErrRouteNotFound
	}
	if timeout <= 0 {
		timeout = 50 * time.Millisecond
	}
	endpoint := base +
		"/clusters/" + url.PathEscape(clusterID) +
		"/vpn-connections/" + url.PathEscape(vpnConnectionID) +
		"/tunnel/client/packets?batch=true&timeout_ms=" + strconv.FormatInt(timeout.Milliseconds(), 10)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Accept", "application/vnd.rap.vpn-packet-batch.v1")
	// Tell the backend which entry node and cluster is polling.
	req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
	req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusNoContent {
		return nil, nil
	}
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return nil, fmt.Errorf("backend vpn packet get failed: status=%d", resp.StatusCode)
	}
	body, err := io.ReadAll(io.LimitReader(resp.Body, MaxProductionVPNPacketPayloadBytes))
	if err != nil {
		return nil, err
	}
	if len(body) == 0 {
		return nil, nil
	}
	return decodeVPNIngressPacketBatch(body)
}
// proxyVPNPacketIngressToBackend forwards the request to the backend using the
// request's own URI. It delegates to proxyVPNPacketIngressToBackendPath with an
// empty backend path and returns that call's result.
func (s Server) proxyVPNPacketIngressToBackend(w http.ResponseWriter, r *http.Request, body []byte) bool {
	// An empty path tells the path-aware variant not to override the request URI.
	const keepRequestURI = ""
	proxied := s.proxyVPNPacketIngressToBackendPath(w, r, body, keepRequestURI)
	return proxied
}
// proxyVPNPacketIngressToBackendPath replays the incoming request against the
// backend configured in s.BackendProxyBaseURL and copies the backend's status,
// Content-Type and body to w.
//
// It returns false — without writing anything to w — when no usable backend
// URL is configured, when the backend host equals the request's own Host
// (which would proxy the request back to itself), or when the backend request
// cannot be built or sent. It returns true once a backend response has been
// relayed, whatever its status code.
//
// When backendPath is non-empty it replaces the request path (the original
// query string is kept); otherwise the original request URI is reused. Only
// the scheme and host of the configured base URL are used.
//
// The base URL is trimmed before parsing so that a whitespace-padded setting
// behaves the same here as in the other backend helpers, which also trim it.
func (s Server) proxyVPNPacketIngressToBackendPath(w http.ResponseWriter, r *http.Request, body []byte, backendPath string) bool {
	baseURL := strings.TrimSpace(s.BackendProxyBaseURL)
	if baseURL == "" {
		return false
	}
	target, err := url.Parse(baseURL)
	if err != nil || target.Scheme == "" || target.Host == "" {
		return false
	}
	if strings.EqualFold(target.Host, r.Host) {
		return false
	}

	// Leave reader as a nil interface when there is no body; wrapping a nil
	// slice would send an explicit empty body instead.
	var reader io.Reader
	if body != nil {
		reader = bytes.NewReader(body)
	}

	requestURI := r.URL.RequestURI()
	if backendPath != "" {
		requestURI = backendPath
		if r.URL.RawQuery != "" {
			requestURI += "?" + r.URL.RawQuery
		}
	}
	req, err := http.NewRequestWithContext(r.Context(), r.Method, target.Scheme+"://"+target.Host+requestURI, reader)
	if err != nil {
		return false
	}
	// Only content negotiation headers are forwarded from the client.
	for _, key := range []string{"Accept", "Content-Type"} {
		if value := r.Header.Get(key); value != "" {
			req.Header.Set(key, value)
		}
	}
	req.Header.Set("X-RAP-Entry-Node", s.Local.NodeID)
	req.Header.Set("X-RAP-Entry-Cluster", s.Local.ClusterID)

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	if contentType := resp.Header.Get("Content-Type"); contentType != "" {
		w.Header().Set("Content-Type", contentType)
	}
	w.WriteHeader(resp.StatusCode)
	// Best effort: the status line is already written, so a copy failure
	// cannot be reported to the client.
	_, _ = io.Copy(w, resp.Body)
	return true
}
type fabricServiceChannelLeaseAuthorityPayload struct {
SchemaVersion string `json:"schema_version"`
ChannelID string `json:"channel_id"`
@@ -1443,10 +841,6 @@ func (d fabricServiceChannelRequestDecision) BackendFallbackAllowed() bool {
return strings.TrimSpace(d.BackendRelayPolicy) != "disabled"
}
// validateFabricServiceChannelVPNRequest validates a service-channel request
// for a VPN connection. It delegates to validateFabricServiceChannelRequest
// with vpnConnectionID as the resource id, FabricServiceClassVPNPackets as the
// expected service class and ProductionChannelVPNPacket as the default channel
// class, and returns that call's results unchanged.
func (s Server) validateFabricServiceChannelVPNRequest(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, vpnConnectionID string) (fabricServiceChannelRequestDecision, bool) {
	decision, accepted := s.validateFabricServiceChannelRequest(
		w, r,
		clusterID, channelID, vpnConnectionID,
		FabricServiceClassVPNPackets,
		ProductionChannelVPNPacket,
	)
	return decision, accepted
}
func (s Server) validateFabricServiceChannelRequest(w http.ResponseWriter, r *http.Request, clusterID string, channelID string, resourceID string, expectedServiceClass string, defaultChannelClass string) (fabricServiceChannelRequestDecision, bool) {
var decision fabricServiceChannelRequestDecision
expectedServiceClass = strings.TrimSpace(strings.ToLower(expectedServiceClass))
@@ -1485,7 +879,7 @@ func (s Server) validateFabricServiceChannelRequest(w http.ResponseWriter, r *ht
http.Error(w, err.Error(), http.StatusForbidden)
return decision, false
}
decision.AcceptedBy = "legacy_unsigned"
decision.AcceptedBy = "token_authorized"
decision.ServiceClass = serviceClass
decision.ChannelClass = channelClass
if payload != nil && (payload.Status == "degraded_fallback" || payload.PrimaryRoute.Status == "missing_route_intent") {
@@ -1571,30 +965,6 @@ func (s Server) logFabricServiceChannelAccess(r *http.Request, clusterID string,
s.FabricServiceChannelLogger(entry)
}
// logFabricServiceChannelViolation emits a
// "fabric_service_channel_data_plane_violation" entry through
// s.FabricServiceChannelLogger.
//
// Nothing is logged when no logger is configured or channelID is blank.
// backendRelayPolicy, status and reason are whitespace-trimmed; the timestamp
// is the current UTC time. r may be nil; when present its method and, if the
// URL is set, its path are recorded.
func (s Server) logFabricServiceChannelViolation(r *http.Request, clusterID string, channelID string, resourceID string, backendRelayPolicy string, status string, reason string) {
	if s.FabricServiceChannelLogger == nil {
		return
	}
	if strings.TrimSpace(channelID) == "" {
		return
	}

	violation := FabricServiceChannelAccessLogEntry{
		Event:              "fabric_service_channel_data_plane_violation",
		ClusterID:          clusterID,
		ChannelID:          channelID,
		ResourceID:         resourceID,
		LocalNodeID:        s.Local.NodeID,
		BackendRelayPolicy: strings.TrimSpace(backendRelayPolicy),
		ViolationStatus:    strings.TrimSpace(status),
		ViolationReason:    strings.TrimSpace(reason),
		OccurredAt:         time.Now().UTC(),
	}
	// Request details are optional: some callers have no request to pass.
	if r != nil {
		violation.Method = r.Method
		if requestURL := r.URL; requestURL != nil {
			violation.Path = requestURL.Path
		}
	}
	s.FabricServiceChannelLogger(violation)
}
func (s Server) verifyFabricServiceChannelLeaseAuthority(r *http.Request, clusterID string, channelID string, resourceID string, serviceClass string, channelClass string, token string) (*fabricServiceChannelLeaseAuthorityPayload, error) {
publicKey := strings.TrimSpace(s.ClusterAuthorityPublicKey)
payloadHeader := strings.TrimSpace(r.Header.Get("X-RAP-Service-Channel-Authority-Payload"))
@@ -1657,15 +1027,15 @@ func validateFabricServiceChannelDataPlaneContract(contract fabricServiceChannel
}
requiredFlowClass = strings.TrimSpace(strings.ToLower(requiredFlowClass))
if contract.SchemaVersion != "rap.fabric_service_channel_data_plane.v1" ||
contract.WorkingDataTransport != "fabric_service_channel" ||
contract.WorkingDataTransport != "fabric_quic_route" ||
contract.SteadyStateTransport != "fabric_route" ||
(contract.BackendRelayPolicy != "degraded_fallback_only" && contract.BackendRelayPolicy != "disabled") ||
contract.BackendRelayPolicy != "disabled" ||
!contract.ServiceNeutral ||
!contract.ProtocolAgnostic ||
contract.LogicalFlowMode != "multi_flow_isolated" {
return fmt.Errorf("%w: unsupported service channel data-plane contract", ErrUnauthorizedChannel)
}
if contract.Mode != "" && contract.Mode != "fabric_primary" && contract.Mode != "degraded_backend_fallback" {
if contract.Mode != "" && contract.Mode != "fabric_primary" && contract.Mode != "fabric_quic_only" {
return fmt.Errorf("%w: unsupported service channel data-plane mode", ErrUnauthorizedChannel)
}
if requiredFlowClass != "" && len(contract.RequiredFlowIsolationClasses) > 0 && !containsString(contract.RequiredFlowIsolationClasses, requiredFlowClass) {
@@ -1796,29 +1166,6 @@ func fabricServiceChannelBearerToken(r *http.Request) string {
return strings.TrimSpace(r.URL.Query().Get("service_channel_token"))
}
// fabricSessionTokenHash returns the lowercase hex-encoded SHA-256 digest of
// token after surrounding whitespace has been trimmed.
func fabricSessionTokenHash(token string) string {
	hasher := sha256.New()
	// hash.Hash.Write is documented never to return an error.
	hasher.Write([]byte(strings.TrimSpace(token)))
	return hex.EncodeToString(hasher.Sum(nil))
}
// fabricSessionBearerToken extracts the fabric session token from r.
//
// Sources are consulted in order and the first non-blank value wins:
//  1. the X-RAP-Fabric-Session-Token header;
//  2. an Authorization header using the Bearer scheme (scheme matched
//     case-insensitively);
//  3. the fabric_session_token query parameter.
//
// The returned token is whitespace-trimmed. An empty string is returned when r
// is nil or no source carries a token. A request without a URL is tolerated:
// the query fallback is simply skipped instead of dereferencing a nil URL.
func fabricSessionBearerToken(r *http.Request) string {
	if r == nil {
		return ""
	}
	if token := strings.TrimSpace(r.Header.Get("X-RAP-Fabric-Session-Token")); token != "" {
		return token
	}
	const bearerPrefix = "Bearer "
	auth := strings.TrimSpace(r.Header.Get("Authorization"))
	if len(auth) > len(bearerPrefix) && strings.EqualFold(auth[:len(bearerPrefix)], bearerPrefix) {
		return strings.TrimSpace(auth[len(bearerPrefix):])
	}
	if r.URL == nil {
		return ""
	}
	return strings.TrimSpace(r.URL.Query().Get("fabric_session_token"))
}
// isAllowedFabricServiceVPNChannel reports whether channel is permitted for
// the VPN packet service class, as decided by
// isAllowedFabricServiceChannelForClass with FabricServiceClassVPNPackets.
func isAllowedFabricServiceVPNChannel(channel string) bool {
	allowed := isAllowedFabricServiceChannelForClass(FabricServiceClassVPNPackets, channel)
	return allowed
}
func isAllowedFabricServiceChannelForClass(serviceClass string, channel string) bool {
serviceClass = strings.TrimSpace(strings.ToLower(serviceClass))
channel = strings.TrimSpace(strings.ToLower(channel))
@@ -1846,25 +1193,6 @@ func containsString(values []string, target string) bool {
return false
}
// parseFabricServiceChannelVPNPacketWebSocketPath matches paths of the form
//
//	/api/v1/clusters/{cluster}/fabric/service-channels/{channel}/vpn-connections/{connection}/packets/ws
//
// and returns the cluster, channel and connection segments with ok == true.
// Leading and trailing slashes are ignored. Any other shape, or an empty
// identifier segment, yields three empty strings and false.
func parseFabricServiceChannelVPNPacketWebSocketPath(path string) (string, string, string, bool) {
	// An empty entry marks a variable segment that must be non-empty.
	layout := [...]string{
		"api", "v1", "clusters", "",
		"fabric", "service-channels", "",
		"vpn-connections", "",
		"packets", "ws",
	}

	segments := strings.Split(strings.Trim(path, "/"), "/")
	if len(segments) != len(layout) {
		return "", "", "", false
	}
	for i, want := range layout {
		if want == "" {
			if segments[i] == "" {
				return "", "", "", false
			}
			continue
		}
		if segments[i] != want {
			return "", "", "", false
		}
	}
	return segments[3], segments[6], segments[8], true
}
func parseFabricServiceChannelRemoteWorkspacePath(path string) (string, string, string, string, bool, bool) {
parts := strings.Split(strings.Trim(path, "/"), "/")
if len(parts) == 11 &&
@@ -1897,6 +1225,34 @@ func parseFabricServiceChannelRemoteWorkspacePath(path string) (string, string,
return parts[3], parts[6], parts[8], strings.TrimSpace(strings.ToLower(parts[10])), false, true
}
// handleFabricServiceChannelVPNPacketIngress answers requests aimed at the
// fabric service-channel VPN packet endpoints with 410 Gone.
//
// It returns true when the request path was one of those endpoints (WebSocket
// or HTTP) and a response has been written, and false — writing nothing —
// when the path matches neither, so the caller can continue routing.
func (s Server) handleFabricServiceChannelVPNPacketIngress(w http.ResponseWriter, r *http.Request) bool {
	requestPath := r.URL.Path

	var notice string
	switch {
	case isFabricServiceChannelVPNPacketWebSocketPath(requestPath):
		notice = "fabric service-channel WebSocket dataplane is removed; use QUIC fabric route"
	default:
		if _, _, _, matched := parseFabricServiceChannelVPNPacketPath(requestPath); !matched {
			return false
		}
		notice = "fabric service-channel HTTP dataplane is removed; use QUIC fabric route"
	}

	http.Error(w, notice, http.StatusGone)
	return true
}
// isFabricServiceChannelVPNPacketWebSocketPath reports whether path has the
// form
//
//	/api/v1/clusters/{cluster}/fabric/service-channels/{channel}/vpn-connections/{connection}/packets/ws
//
// with non-empty cluster, channel and connection segments. Leading and
// trailing slashes are ignored.
func isFabricServiceChannelVPNPacketWebSocketPath(path string) bool {
	segments := strings.Split(strings.Trim(path, "/"), "/")
	if len(segments) != 11 {
		return false
	}

	// Fixed segments that must match literally.
	literals := [...]struct {
		index int
		value string
	}{
		{0, "api"},
		{1, "v1"},
		{2, "clusters"},
		{4, "fabric"},
		{5, "service-channels"},
		{7, "vpn-connections"},
		{9, "packets"},
		{10, "ws"},
	}
	for _, lit := range literals {
		if segments[lit.index] != lit.value {
			return false
		}
	}

	// Identifier segments only need to be present.
	for _, idx := range [...]int{3, 6, 8} {
		if segments[idx] == "" {
			return false
		}
	}
	return true
}
func parseFabricServiceChannelVPNPacketPath(path string) (string, string, string, bool) {
parts := strings.Split(strings.Trim(path, "/"), "/")
if len(parts) != 10 ||
@@ -1915,7 +1271,7 @@ func parseFabricServiceChannelVPNPacketPath(path string) (string, string, string
return parts[3], parts[6], parts[8], true
}
func parseVPNClientPacketWebSocketPath(path string) (string, string, bool) {
func isVPNClientPacketWebSocketPath(path string) bool {
parts := strings.Split(strings.Trim(path, "/"), "/")
if len(parts) != 10 ||
parts[0] != "api" ||
@@ -1926,12 +1282,9 @@ func parseVPNClientPacketWebSocketPath(path string) (string, string, bool) {
parts[7] != "client" ||
parts[8] != "packets" ||
parts[9] != "ws" {
return "", "", false
return false
}
if parts[3] == "" || parts[5] == "" {
return "", "", false
}
return parts[3], parts[5], true
return parts[3] != "" && parts[5] != ""
}
func parseVPNClientPacketPath(path string) (string, string, bool) {
@@ -1952,28 +1305,6 @@ func parseVPNClientPacketPath(path string) (string, string, bool) {
return parts[3], parts[5], true
}
// vpnIngressTimeout derives the poll timeout from the request's timeout_ms
// query parameter.
//
// A missing, unparsable, zero or negative value selects the 25s default;
// values above 30s are capped at 30s.
func vpnIngressTimeout(r *http.Request) time.Duration {
	const (
		defaultTimeoutMs = 25000
		maxTimeoutMs     = 30000
	)

	// The parse error is deliberately ignored: Atoi yields 0 for malformed
	// input (default applies) and a saturated value on overflow (cap applies).
	requestedMs, _ := strconv.Atoi(r.URL.Query().Get("timeout_ms"))
	switch {
	case requestedMs <= 0:
		requestedMs = defaultTimeoutMs
	case requestedMs > maxTimeoutMs:
		requestedMs = maxTimeoutMs
	}
	return time.Duration(requestedMs) * time.Millisecond
}
// vpnIngressStatusCode maps a VPN ingress error to the HTTP status reported to
// the client:
//
//   - ErrForwardRuntimeUnavailable, ErrRouteNotFound, ErrForwardPeerUnavailable
//     → 503 Service Unavailable;
//   - ErrUnauthorizedChannel, ErrClusterMismatch, ErrNodeMismatch
//     → 403 Forbidden;
//   - anything else (including nil) → 502 Bad Gateway.
//
// Matching uses errors.Is so that sentinels wrapped with %w are classified the
// same way as the bare sentinel, rather than falling through to 502.
func vpnIngressStatusCode(err error) int {
	switch {
	case errors.Is(err, ErrForwardRuntimeUnavailable),
		errors.Is(err, ErrRouteNotFound),
		errors.Is(err, ErrForwardPeerUnavailable):
		return http.StatusServiceUnavailable
	case errors.Is(err, ErrUnauthorizedChannel),
		errors.Is(err, ErrClusterMismatch),
		errors.Is(err, ErrNodeMismatch):
		return http.StatusForbidden
	default:
		return http.StatusBadGateway
	}
}
func encodeVPNIngressPacketBatch(packets [][]byte) []byte {
packets = cleanVPNIngressPacketBatch(packets)
total := 0
File diff suppressed because it is too large Load Diff