package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"runtime"
	"strings"
	"syscall"
	"time"

	"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
	"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
	"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
	"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
)

// installCommandConfig is the parsed form of the "install" subcommand: the
// runtime configuration plus the dry-run and auto-update settings that
// runInstall reads.
type installCommandConfig struct {
	Runtime           hostagent.RuntimeConfig
	DryRun            bool
	AutoUpdateEnabled bool
	AutoUpdate        hostagent.UpdateServiceConfig
}

// main applies any staged self-update, then dispatches os.Args[1] to the
// matching subcommand handler. A missing or unknown subcommand prints usage
// and exits with status 2; a handler error is logged as "<command> failed: ..."
// and exits through log.Fatalf.
func main() {
	log.SetFlags(0)
	applyStagedSelfUpdate()
	if len(os.Args) < 2 {
		usage()
		os.Exit(2)
	}

	commands := map[string]func(context.Context, []string) error{
		"install":                runInstall,
		"install-windows":        runInstallWindows,
		"install-linux":          runInstallLinux,
		"status":                 runStatus,
		"update":                 runUpdate,
		"update-loop":            runUpdateLoop,
		"monitor-loop":           runMonitorLoop,
		"monitor-once":           runMonitorOnce,
		"install-updater":        runInstallUpdater,
		"update-host-agent":      runUpdateHostAgent,
		"update-host-agent-loop": runUpdateHostAgentLoop,
		"fabric-session-smoke":   runFabricSessionSmoke,
	}
	name := os.Args[1]
	run, ok := commands[name]
	if !ok {
		usage()
		os.Exit(2)
	}

	// The context is cancelled on Ctrl-C / SIGTERM so handlers can stop cleanly.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	err := run(ctx, os.Args[2:])
	// Call stop explicitly rather than deferring it: log.Fatalf terminates via
	// os.Exit, which does not run deferred functions.
	stop()
	if err != nil {
		log.Fatalf("%s failed: %v", name, err)
	}
}

// applyStagedSelfUpdate swaps in a staged replacement binary, if one exists.
//
// On non-Windows systems it looks for "<executable>.next". When that is a
// regular file, the current executable is moved to "<executable>.old", the
// staged file is renamed into place, and the backup is removed. Every step is
// best-effort: any failure leaves the function silently, and a failed final
// rename restores the backup.
func applyStagedSelfUpdate() {
	if runtime.GOOS == "windows" {
		return
	}
	executable, err := os.Executable()
	if err != nil {
		return
	}
	staged := executable + ".next"
	info, err := os.Stat(staged)
	if err != nil || !info.Mode().IsRegular() {
		// Nothing staged, or the staged path is not a file (for example a
		// directory) and must never replace the running binary.
		return
	}
	// Make the staged file executable before touching the current binary, so a
	// failure here cannot leave a non-executable file installed. A chmod error
	// is tolerated when the file is already executable by its owner.
	if err := os.Chmod(staged, 0o755); err != nil && info.Mode().Perm()&0o100 == 0 {
		return
	}
	backup := executable + ".old"
	_ = os.Remove(backup) // best-effort: a stale backup may or may not exist
	if err := os.Rename(executable, backup); err != nil {
		return
	}
	if err := os.Rename(staged, executable); err != nil {
		_ = os.Rename(backup, executable) // best-effort rollback
		return
	}
	_ = os.Remove(backup) // best-effort cleanup
}

// runFabricSessionSmoke sends a single ping frame over a fabric session and
// prints a JSON result document to stdout.
//
// mesh-url and token are required. The JSON result is printed even when the
// request fails; the function then returns the transport error, or an error
// describing the unexpected response when the reply is not a pong echoing the
// payload.
func runFabricSessionSmoke(ctx context.Context, args []string) error {
	fs := flag.NewFlagSet("fabric-session-smoke", flag.ContinueOnError)
	var meshURL string
	var token string
	var timeoutSeconds int
	var payload string
	var authorityPayload string
	var authoritySignature string
	fs.StringVar(&meshURL, "mesh-url", getenv("RAP_MESH_SMOKE_URL", ""), "Mesh base URL, for example http://node:19131.")
	fs.StringVar(&token, "token", getenv("RAP_FABRIC_SESSION_TOKEN", ""), "Fabric session token starting with rap_fsn_.")
	fs.IntVar(&timeoutSeconds, "timeout-seconds", getenvInt("RAP_FABRIC_SESSION_SMOKE_TIMEOUT_SECONDS", 5), "Smoke timeout in seconds.")
	fs.StringVar(&payload, "payload", getenv("RAP_FABRIC_SESSION_SMOKE_PAYLOAD", "rap-fabric-session-smoke"), "Ping payload.")
	fs.StringVar(&authorityPayload, "authority-payload", getenv("RAP_FABRIC_SESSION_AUTHORITY_PAYLOAD", ""), "Base64 or JSON fabric session authority payload header.")
	fs.StringVar(&authoritySignature, "authority-signature", getenv("RAP_FABRIC_SESSION_AUTHORITY_SIGNATURE", ""), "Base64 or JSON fabric session authority signature header.")
	if err := fs.Parse(args); err != nil {
		return err
	}

	// Trim once so validation, the request itself and the printed report all
	// use the same values (environment-provided values often carry stray
	// whitespace or a trailing newline).
	meshURL = strings.TrimSpace(meshURL)
	token = strings.TrimSpace(token)
	authorityPayload = strings.TrimSpace(authorityPayload)
	authoritySignature = strings.TrimSpace(authoritySignature)
	if meshURL == "" {
		return fmt.Errorf("mesh-url is required")
	}
	if token == "" {
		return fmt.Errorf("token is required")
	}
	if timeoutSeconds <= 0 {
		timeoutSeconds = 5
	}
	timeout := time.Duration(timeoutSeconds) * time.Second

	smokeCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	header := make(http.Header)
	if authorityPayload != "" {
		header.Set("X-RAP-Fabric-Session-Authority-Payload", authorityPayload)
	}
	if authoritySignature != "" {
		header.Set("X-RAP-Fabric-Session-Authority-Signature", authoritySignature)
	}

	startedAt := time.Now()
	response, err := mesh.NewClient(meshURL).SendFabricSessionFrame(smokeCtx, mesh.FabricSessionDialOptions{
		Token:   token,
		Header:  header,
		Timeout: timeout,
	}, fabricproto.Frame{
		Type:     fabricproto.FramePing,
		Sequence: uint64(startedAt.UnixNano()),
		Payload:  []byte(payload),
	})
	duration := time.Since(startedAt)

	// The smoke test passes only when the peer answers with a pong that echoes
	// the payload.
	pong := response.Type == fabricproto.FramePong && string(response.Payload) == payload
	result := map[string]any{
		"schema_version": "rap.fabric_session_smoke_result.v1",
		"mesh_url":       meshURL,
		"ok":             err == nil && pong,
		"latency_ms":     duration.Milliseconds(),
		"response_type":  response.Type,
		"sequence":       response.Sequence,
		"authority":      authorityPayload != "" || authoritySignature != "",
	}
	if err != nil {
		result["error"] = err.Error()
	}
	encoded, marshalErr := json.MarshalIndent(result, "", " ")
	if marshalErr != nil {
		return marshalErr
	}
	fmt.Println(string(encoded))
	if err != nil {
		return err
	}
	if !pong {
		return fmt.Errorf("fabric session smoke returned unexpected response type=%d payload=%q", response.Type, string(response.Payload))
	}
	return nil
}

// runInstallLinux parses the "install-linux" flags (defaults come from RAP_*
// environment variables) and runs hostagent.LinuxManager.Install.
//
// When either profile-url or install-token is set, the install configuration
// is fetched as a profile instead; only dry-run, the auto-update settings,
// host-agent-source-path and a non-empty startup-mode are carried over from
// the command line, and Replace is forced to true.
func runInstallLinux(ctx context.Context, args []string) error {
	fs := flag.NewFlagSet("install-linux", flag.ContinueOnError)
	cfg := hostagent.LinuxInstallConfig{}
	var profileURL string
	var installToken string
	fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
	fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
	fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
	fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/linux-install-profile URL for profile-based install.")
	fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Linux install profile.")
	fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
	fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
	fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_LINUX_INSTALL_DIR", ""), "Directory for rap-node-agent and rap-host-agent.")
	fs.StringVar(&cfg.ConfigDir, "config-dir", getenv("RAP_LINUX_CONFIG_DIR", ""), "Directory for node-agent env file.")
	fs.StringVar(&cfg.StartupMode, "startup-mode", getenv("RAP_LINUX_STARTUP_MODE", "systemd"), "Startup mode: systemd, auto, or none.")
	fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
	fs.BoolVar(&cfg.DryRun, "dry-run", false, "Print resolved placement without installing.")
	fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Linux host-agent update service.")
	fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
	fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
	fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
	fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
	fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
	fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.")
	fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
	fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
	fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
	fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
	fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
	fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
	fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
	fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
	fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
	fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
	fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "linux"), "Region/site hint.")
	fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
	fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
	fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
	if err := fs.Parse(args); err != nil {
		return err
	}

	if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
		profile, err := hostagent.FetchLinuxInstallProfile(ctx, hostagent.ProfileRequest{
			URL:          profileURL,
			ClusterID:    cfg.RuntimeConfig.ClusterID,
			InstallToken: installToken,
			NodeName:     cfg.RuntimeConfig.NodeName,
		})
		if err != nil {
			return err
		}
		// The profile replaces the flag-derived configuration; only the
		// local-only settings below are carried over from the command line.
		profileCfg := hostagent.LinuxInstallConfigFromProfile(profile)
		profileCfg.Replace = true
		profileCfg.DryRun = cfg.DryRun
		profileCfg.AutoUpdateEnabled = cfg.AutoUpdateEnabled
		profileCfg.AutoUpdateCurrentVersion = cfg.AutoUpdateCurrentVersion
		profileCfg.AutoUpdateChannel = cfg.AutoUpdateChannel
		profileCfg.AutoUpdateIntervalSeconds = cfg.AutoUpdateIntervalSeconds
		profileCfg.AutoUpdateInitialDelaySeconds = cfg.AutoUpdateInitialDelaySeconds
		profileCfg.AutoUpdateHealthTimeoutSeconds = cfg.AutoUpdateHealthTimeoutSeconds
		profileCfg.HostAgentSourcePath = cfg.HostAgentSourcePath
		if startupMode := strings.TrimSpace(cfg.StartupMode); startupMode != "" {
			profileCfg.StartupMode = startupMode
		}
		cfg = profileCfg
	}

	result, err := (hostagent.LinuxManager{}).Install(ctx, cfg)
	if err != nil {
		return err
	}
	fmt.Printf("node=%s install_dir=%s state_dir=%s node_agent=%s unit=%s downloaded=%t started=%t updater_unit=%s updater_started=%t\n", result.NodeName, result.InstallDir, result.StateDir, result.NodeAgentPath, result.UnitName, result.Downloaded, result.Started, result.UpdaterUnitName, result.UpdaterStarted)
	fmt.Println("next: approve the join request in the platform admin panel, then the Linux node-agent will finish bootstrap and start heartbeats")
	return nil
}

// runInstallWindows parses the "install-windows" flags (defaults come from
// RAP_* environment variables) and runs hostagent.WindowsManager.Install.
//
// When either profile-url or install-token is set, the install configuration
// is fetched as a profile instead; only dry-run, the auto-update settings,
// host-agent-source-path and a non-empty startup-mode are carried over from
// the command line, and Replace is forced to true.
func runInstallWindows(ctx context.Context, args []string) error {
	fs := flag.NewFlagSet("install-windows", flag.ContinueOnError)
	cfg := hostagent.WindowsInstallConfig{}
	var profileURL string
	var installToken string
	fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
	fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
	fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
	fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/windows-install-profile URL for profile-based install.")
	fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Windows install profile.")
	fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
	fs.StringVar(&cfg.RuntimeConfig.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
	fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_WINDOWS_INSTALL_DIR", ""), "Directory for rap-node-agent.exe and wrapper scripts.")
	fs.StringVar(&cfg.StartupMode, "startup-mode", getenv("RAP_WINDOWS_STARTUP_MODE", "auto"), "Startup mode: auto, system-task, user-task, or none.")
	fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
	fs.BoolVar(&cfg.DryRun, "dry-run", false, "Print resolved placement without installing.")
	fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Windows host-agent update task.")
	fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
	fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
	fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
	fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
	fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
	fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.")
	fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.")
	fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
	fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
	fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
	fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
	fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
	fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
	fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
	fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
	fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
	fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "windows"), "Region/site hint.")
	fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
	fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
	fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
	if err := fs.Parse(args); err != nil {
		return err
	}

	if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
		profile, err := hostagent.FetchWindowsInstallProfile(ctx, hostagent.ProfileRequest{
			URL:          profileURL,
			ClusterID:    cfg.RuntimeConfig.ClusterID,
			InstallToken: installToken,
			NodeName:     cfg.RuntimeConfig.NodeName,
		})
		if err != nil {
			return err
		}
		// The profile replaces the flag-derived configuration; only the
		// local-only settings below are carried over from the command line.
		profileCfg := hostagent.WindowsInstallConfigFromProfile(profile)
		profileCfg.Replace = true
		profileCfg.DryRun = cfg.DryRun
		profileCfg.AutoUpdateEnabled = cfg.AutoUpdateEnabled
		profileCfg.AutoUpdateCurrentVersion = cfg.AutoUpdateCurrentVersion
		profileCfg.AutoUpdateChannel = cfg.AutoUpdateChannel
		profileCfg.AutoUpdateIntervalSeconds = cfg.AutoUpdateIntervalSeconds
		profileCfg.AutoUpdateInitialDelaySeconds = cfg.AutoUpdateInitialDelaySeconds
		profileCfg.AutoUpdateHealthTimeoutSeconds = cfg.AutoUpdateHealthTimeoutSeconds
		profileCfg.HostAgentSourcePath = cfg.HostAgentSourcePath
		if startupMode := strings.TrimSpace(cfg.StartupMode); startupMode != "" {
			profileCfg.StartupMode = startupMode
		}
		cfg = profileCfg
	}

	result, err := (hostagent.WindowsManager{}).Install(ctx, cfg)
	if err != nil {
		return err
	}
	fmt.Printf("node=%s install_dir=%s state_dir=%s node_agent=%s startup_mode=%s task=%s downloaded=%t started=%t updater_task=%s updater_started=%t admin_fallback=%t\n", result.NodeName, result.InstallDir, result.StateDir, result.NodeAgentPath, result.StartupMode, result.TaskName, result.Downloaded, result.Started, result.UpdaterTaskName, result.UpdaterStarted, result.AdminFallback)
	fmt.Println("next: approve the join request in the platform admin panel, then the Windows node-agent will finish bootstrap and start heartbeats")
	return nil
}

// runInstall handles the Docker-based "install" subcommand.
//
// In dry-run mode it prints the redacted "docker ..." command line and, when
// auto-update is enabled, the unit text returned by a dry-run
// InstallUpdateService call. Otherwise it installs the container through
// hostagent.DockerManager and, when auto-update is enabled, installs the
// update service with ManageSystemd set.
func runInstall(ctx context.Context, args []string) error {
	installCfg, err := parseInstall(args)
	if err != nil {
		return err
	}
	cfg := installCfg.Runtime.Normalize()
	// NOTE(review): Normalize is applied a second time here; presumably it is
	// idempotent and this call is redundant — confirm before removing.
	cfg = cfg.Normalize()
	runArgs := hostagent.DockerRunArgs(cfg)

	if installCfg.DryRun {
		fmt.Printf("docker %s\n", shellJoin(hostagent.RedactedArgs(runArgs)))
		if installCfg.AutoUpdateEnabled {
			service := installCfg.AutoUpdate
			service.RuntimeConfig = cfg
			service.DryRun = true
			result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
			if err != nil {
				return err
			}
			fmt.Print(result.Unit)
			if result.MonitorUnit != "" {
				fmt.Print(result.MonitorUnit)
			}
		}
		return nil
	}

	result, err := (hostagent.DockerManager{}).Install(ctx, cfg)
	if err != nil {
		return err
	}
	fmt.Printf("container=%s image=%s id=%s pulled=%t replaced=%t\n", result.ContainerName, result.Image, result.ContainerID, result.Pulled, result.Replaced)
	if installCfg.AutoUpdateEnabled {
		service := installCfg.AutoUpdate
		service.RuntimeConfig = cfg
		service.ManageSystemd = true
		serviceResult, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
		if err != nil {
			return err
		}
		fmt.Printf("updater_service=%s unit=%s binary=%s started=%t monitor_service=%s\n", serviceResult.UnitName, serviceResult.UnitPath, serviceResult.BinaryPath, serviceResult.Started, serviceResult.MonitorUnitName)
	}
	fmt.Println("next: approve the join request in the platform admin panel, then the node-agent will finish bootstrap and start heartbeats")
	return nil
}

// runStatus prints the output of hostagent.DockerManager.Status for the
// container selected with -container-name.
func runStatus(ctx context.Context, args []string) error {
	fs := flag.NewFlagSet("status", flag.ContinueOnError)
	containerName := fs.String("container-name", hostagent.DefaultContainerName, "Docker container name.")
	if err := fs.Parse(args); err != nil {
		return err
	}
	out, err := (hostagent.DockerManager{}).Status(ctx, *containerName)
	if err != nil {
		return err
	}
	fmt.Print(out)
	return nil
}

// runUpdate performs one update run.
//
// With DryRun set on the request it only fetches and prints the update plan.
// Otherwise the update is applied by the Windows manager (Windows install type
// or when running on Windows), the Linux manager (binary install type), or the
// Docker manager (everything else), and a one-line summary is printed.
func runUpdate(ctx context.Context, args []string) error {
	fs := flag.NewFlagSet("update", flag.ContinueOnError)
	req := hostagent.UpdateRequest{}
	var healthTimeoutSeconds int
	registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
	if err := fs.Parse(args); err != nil {
		return err
	}
	req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second

	if req.DryRun {
		plan, err := hostagent.FetchNodeUpdatePlan(ctx, req)
		if err != nil {
			return err
		}
		fmt.Printf("action=%s reason=%s target=%s production_forwarding=%t\n", plan.Action, plan.Reason, plan.TargetVersion, plan.ProductionForwarding)
		if plan.Artifact != nil {
			fmt.Printf("artifact=%s sha256=%s size=%d\n", plan.Artifact.URL, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
		}
		return nil
	}

	var result hostagent.UpdateResult
	var err error
	switch {
	case req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows":
		result, err = (hostagent.WindowsManager{}).ApplyUpdate(ctx, req)
	case req.InstallType == hostagent.BinaryUpdateInstallType:
		result, err = (hostagent.LinuxManager{}).ApplyUpdate(ctx, req)
	default:
		result, err = (hostagent.DockerManager{}).ApplyUpdate(ctx, req)
	}
	if err != nil {
		return err
	}
	fmt.Printf("action=%s reason=%s target=%s container=%s image=%s id=%s loaded=%t replaced=%t rolled_back=%t\n",
		result.Action,
		result.Reason,
		result.TargetVersion,
		result.ContainerName,
		result.NewImage,
		result.ContainerID,
		result.Loaded,
		result.Replaced,
		result.RolledBack,
	)
	return nil
}

// runUpdateLoop builds a hostagent.UpdateLoopConfig from the shared update
// flags plus the loop-specific flags, and runs the update loop on the manager
// matching the install type: Windows (Windows install type or when running on
// Windows), Linux (binary install type), or Docker (everything else).
func runUpdateLoop(ctx context.Context, args []string) error {
	fs := flag.NewFlagSet("update-loop", flag.ContinueOnError)
	req := hostagent.UpdateRequest{}
	var healthTimeoutSeconds int
	var intervalSeconds int
	var initialDelaySeconds int
	var maxRuns int
	var jitter float64
	var stopOnError bool
	var hostAgentStatusEnabled bool
	var hostAgentVersion string
	var hostAgentBinaryPath string
	registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
	fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Seconds between emergency fallback update plan polls. Update-service/heartbeat hints trigger normal runs.")
	fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 0), "Seconds to wait before the first poll.")
	fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
	fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
	fs.BoolVar(&stopOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.")
	fs.BoolVar(&hostAgentStatusEnabled, "host-agent-update-status-enabled", getenvBool("RAP_HOST_AGENT_UPDATE_STATUS_ENABLED", true), "Also poll/report rap-host-agent update status from this loop.")
	fs.StringVar(&hostAgentVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version reported by the loop.")
	fs.StringVar(&hostAgentBinaryPath, "host-agent-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path used for host-agent update status.")
	if err := fs.Parse(args); err != nil {
		return err
	}
	req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second

	cfg := hostagent.UpdateLoopConfig{
		Request:      req,
		Interval:     time.Duration(intervalSeconds) * time.Second,
		InitialDelay: time.Duration(initialDelaySeconds) * time.Second,
		Jitter:       jitter,
		MaxRuns:      maxRuns,
		StopOnError:  stopOnError,
		// Loop log lines go to stdout, one per line.
		Logf: func(format string, values ...any) {
			fmt.Printf(format+"\n", values...)
		},
	}
	cfg.HostAgentUpdateEnabled = hostAgentStatusEnabled
	cfg.HostAgentUpdateRequest = hostagent.HostAgentUpdateRequest{
		BackendURL:     req.BackendURL,
		ClusterID:      req.ClusterID,
		NodeID:         req.NodeID,
		StateDir:       req.StateDir,
		CurrentVersion: hostAgentVersion,
		Channel:        req.Channel,
		OS:             firstNonEmptyLocal(req.OS, runtime.GOOS),
		Arch:           firstNonEmptyLocal(req.Arch, runtime.GOARCH),
		InstallType:    hostagent.BinaryUpdateInstallType,
		BinaryPath:     hostAgentBinaryPath,
	}

	switch {
	case req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows":
		cfg.HostAgentUpdateRequest.InstallType = "windows_binary"
		return (hostagent.WindowsManager{}).RunUpdateLoop(ctx, cfg)
	case req.InstallType == hostagent.BinaryUpdateInstallType:
		return (hostagent.LinuxManager{}).RunUpdateLoop(ctx, cfg)
	default:
		return (hostagent.DockerManager{}).RunUpdateLoop(ctx, cfg)
	}
}

// runMonitorLoop parses the monitor flags and runs hostagent.RunMonitorLoop.
func runMonitorLoop(ctx context.Context, args []string) error {
	cfg, err := parseMonitor(args)
	if err != nil {
		return err
	}
	return hostagent.RunMonitorLoop(ctx, cfg)
}

// runMonitorOnce parses the monitor flags, forces MaxRuns to 1, runs a single
// monitor pass and writes its result to stdout as one JSON document.
func runMonitorOnce(ctx context.Context, args []string) error {
	// Parse under the real subcommand name so usage and flag errors refer to
	// "monitor-once" rather than "monitor-loop".
	cfg, err := parseMonitorFor("monitor-once", args)
	if err != nil {
		return err
	}
	cfg.MaxRuns = 1
	result := hostagent.RunMonitorOnce(ctx, cfg)
	return json.NewEncoder(os.Stdout).Encode(result)
}

// parseMonitor parses monitor flags for the "monitor-loop" subcommand. See
// parseMonitorFor for details.
func parseMonitor(args []string) (hostagent.MonitorConfig, error) {
	return parseMonitorFor("monitor-loop", args)
}

// parseMonitorFor parses the monitor flags into a hostagent.MonitorConfig.
//
// command names the flag set, and therefore appears in usage and error output.
// Flag defaults come from RAP_* environment variables. Second- and
// minute-valued flags are converted to time.Duration, and Logf is set to print
// each message to stdout on its own line. On a parse error the zero
// MonitorConfig is returned together with the error.
func parseMonitorFor(command string, args []string) (hostagent.MonitorConfig, error) {
	fs := flag.NewFlagSet(command, flag.ContinueOnError)
	cfg := hostagent.MonitorConfig{}
	var intervalSeconds int
	var initialDelaySeconds int
	var maxRuns int
	var restartCooldownSeconds int
	var staleRestartingSeconds int
	var tmpMinAgeMinutes int
	watchContainers := repeatedFlag{}
	fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL used for monitor status reports.")
	fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
	fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
	fs.StringVar(&cfg.Product, "product", getenv("RAP_MONITOR_PRODUCT", hostagent.DefaultMonitorProduct), "Status product name.")
	fs.StringVar(&cfg.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version.")
	fs.StringVar(&cfg.DockerBinary, "docker-binary", getenv("RAP_DOCKER_BINARY", "docker"), "Docker CLI binary.")
	fs.StringVar(&cfg.DiskPath, "disk-path", getenv("RAP_MONITOR_DISK_PATH", "/"), "Filesystem path used for disk usage checks.")
	fs.StringVar(&cfg.TmpDir, "tmp-dir", getenv("RAP_MONITOR_TMP_DIR", "/tmp"), "Temporary directory cleaned under pressure.")
	fs.StringVar(&cfg.StatusFile, "status-file", getenv("RAP_MONITOR_STATUS_FILE", ""), "Optional JSON status file written after every run.")
	fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_MONITOR_INTERVAL_SECONDS", hostagent.DefaultMonitorIntervalSeconds), "Seconds between monitor checks.")
	fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_MONITOR_INITIAL_DELAY_SECONDS", 0), "Seconds to wait before first monitor check.")
	fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_MONITOR_MAX_RUNS", 0), "Maximum monitor iterations. Use 0 to run until stopped.")
	fs.IntVar(&cfg.DiskWarnPercent, "disk-warn-percent", getenvInt("RAP_MONITOR_DISK_WARN_PERCENT", hostagent.DefaultMonitorDiskWarnPercent), "Disk used percent that reports warning.")
	fs.IntVar(&cfg.DiskCleanupPercent, "disk-cleanup-percent", getenvInt("RAP_MONITOR_DISK_CLEANUP_PERCENT", hostagent.DefaultMonitorDiskCleanupPercent), "Disk used percent that triggers cleanup.")
	fs.IntVar(&cfg.DiskCriticalPercent, "disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.")
	fs.IntVar(&restartCooldownSeconds, "restart-cooldown-seconds", getenvInt("RAP_MONITOR_RESTART_COOLDOWN_SECONDS", hostagent.DefaultMonitorRestartCooldownSec), "Minimum seconds between repeated restarts of the same target.")
	fs.IntVar(&staleRestartingSeconds, "stale-restarting-seconds", getenvInt("RAP_MONITOR_STALE_RESTARTING_SECONDS", hostagent.DefaultMonitorStaleRestartingSec), "Seconds after which docker restarting state is considered stuck.")
	fs.IntVar(&tmpMinAgeMinutes, "tmp-min-age-minutes", getenvInt("RAP_MONITOR_TMP_MIN_AGE_MINUTES", hostagent.DefaultMonitorTmpMinAgeMinutes), "Minimum age for /tmp rap-* and go-build* cleanup.")
	fs.BoolVar(&cfg.RestartContainers, "restart-containers", getenvBool("RAP_MONITOR_RESTART_CONTAINERS", true), "Start/restart watched containers when they are stopped, unhealthy, or stuck restarting.")
	fs.BoolVar(&cfg.CleanupDocker, "cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.")
	fs.Var(&watchContainers, "watch-container", "Docker container to watch and heal; may be repeated.")
	if err := fs.Parse(args); err != nil {
		return hostagent.MonitorConfig{}, err
	}

	cfg.WatchContainers = watchContainers
	cfg.Interval = time.Duration(intervalSeconds) * time.Second
	cfg.InitialDelay = time.Duration(initialDelaySeconds) * time.Second
	cfg.MaxRuns = maxRuns
	cfg.RestartCooldown = time.Duration(restartCooldownSeconds) * time.Second
	cfg.StaleRestartingAfter = time.Duration(staleRestartingSeconds) * time.Second
	cfg.TmpMinAge = time.Duration(tmpMinAgeMinutes) * time.Minute
	cfg.Logf = func(format string, values ...any) {
		fmt.Printf(format+"\n", values...)
	}
	return cfg, nil
}

// firstNonEmptyLocal returns the first value that is not blank after trimming
// whitespace. The value is returned untrimmed; the result is "" when every
// value is blank or no values are given.
func firstNonEmptyLocal(values ...string) string {
	for _, value := range values {
		if strings.TrimSpace(value) != "" {
			return value
		}
	}
	return ""
}

func runInstallUpdater(ctx context.Context, args []string) error {
	fs := flag.NewFlagSet("install-updater", flag.ContinueOnError)
	runtimeCfg := hostagent.RuntimeConfig{}
	service := hostagent.UpdateServiceConfig{}
	var dryRun bool
	var selfUpdater bool
	var monitorEnabled bool
	monitorContainers := repeatedFlag{}
	fs.StringVar(&runtimeCfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
	fs.StringVar(&runtimeCfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	fs.StringVar(&runtimeCfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
	fs.StringVar(&runtimeCfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
	fs.StringVar(&service.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version before first successful update.")
	fs.StringVar(&service.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
	fs.IntVar(&service.IntervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. 
Update-service/heartbeat hints trigger normal runs.") fs.IntVar(&service.InitialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.") fs.Float64Var(&service.Jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.") fs.IntVar(&service.HealthTimeoutSec, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.") fs.StringVar(&service.BinaryInstallPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.") fs.BoolVar(&selfUpdater, "self-updater-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.") fs.BoolVar(&monitorEnabled, "monitor-enabled", getenvBool("RAP_HOST_AGENT_MONITOR_ENABLED", true), "Install and start the local host monitor service.") fs.IntVar(&service.MonitorIntervalSec, "monitor-interval-seconds", getenvInt("RAP_MONITOR_INTERVAL_SECONDS", hostagent.DefaultMonitorIntervalSeconds), "Seconds between monitor checks.") fs.StringVar(&service.MonitorStatusFile, "monitor-status-file", getenv("RAP_MONITOR_STATUS_FILE", ""), "Optional JSON status file written by the monitor.") fs.IntVar(&service.MonitorDiskWarn, "monitor-disk-warn-percent", getenvInt("RAP_MONITOR_DISK_WARN_PERCENT", hostagent.DefaultMonitorDiskWarnPercent), "Disk used percent that reports warning.") fs.IntVar(&service.MonitorDiskCleanup, "monitor-disk-cleanup-percent", getenvInt("RAP_MONITOR_DISK_CLEANUP_PERCENT", hostagent.DefaultMonitorDiskCleanupPercent), "Disk used percent that triggers cleanup.") fs.IntVar(&service.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.") 
fs.BoolVar(&service.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.") fs.Var(&monitorContainers, "monitor-container", "Extra Docker container watched by monitor; may be repeated.") fs.BoolVar(&dryRun, "dry-run", false, "Print the systemd unit without installing it.") if err := fs.Parse(args); err != nil { return err } service.RuntimeConfig = runtimeCfg service.ManageSystemd = !dryRun service.DryRun = dryRun service.InstallSelfUpdater = selfUpdater service.SelfUpdateVersion = agent.Version service.InstallMonitor = monitorEnabled service.MonitorContainers = monitorContainers result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service) if err != nil { return err } if dryRun { fmt.Print(result.Unit) if result.SelfUnit != "" { fmt.Print(result.SelfUnit) } if result.MonitorUnit != "" { fmt.Print(result.MonitorUnit) } return nil } fmt.Printf("updater_service=%s unit=%s binary=%s started=%t self_updater=%s monitor_service=%s\n", result.UnitName, result.UnitPath, result.BinaryPath, result.Started, result.SelfUnitName, result.MonitorUnitName) return nil } func runUpdateHostAgent(ctx context.Context, args []string) error { req, interval, initialDelay, jitter, maxRuns, stopOnError, loop, err := parseHostAgentUpdate(args) _, _, _, _, _ = interval, initialDelay, jitter, maxRuns, stopOnError if err != nil { return err } if loop { return fmt.Errorf("internal parser error: loop flag set for one-shot update") } result, err := (hostagent.DockerManager{}).ApplyHostAgentUpdate(ctx, req) if err != nil { return err } fmt.Printf("action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t\n", result.Action, result.Reason, result.TargetVersion, result.NewImage, result.Replaced, result.RestartNeeded) return nil } func runUpdateHostAgentLoop(ctx context.Context, args []string) error { req, interval, initialDelay, jitter, maxRuns, stopOnError, _, err 
:= parseHostAgentUpdate(args) if err != nil { return err } return (hostagent.DockerManager{}).RunHostAgentUpdateLoop(ctx, hostagent.HostAgentUpdateLoopConfig{ Request: req, Interval: time.Duration(interval) * time.Second, InitialDelay: time.Duration(initialDelay) * time.Second, Jitter: jitter, MaxRuns: maxRuns, StopOnError: stopOnError, Logf: func(format string, args ...any) { fmt.Printf(format+"\n", args...) }, }) } func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int, int, float64, int, bool, bool, error) { fs := flag.NewFlagSet("update-host-agent", flag.ContinueOnError) req := hostagent.HostAgentUpdateRequest{} var intervalSeconds int var initialDelaySeconds int var maxRuns int var jitter float64 var stopOnError bool fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.") fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.") fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.") fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json.") fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Currently installed rap-host-agent version.") fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.") fs.StringVar(&req.OS, "os", getenv("RAP_HOST_AGENT_UPDATE_OS", runtime.GOOS), "Host-agent artifact OS selector.") fs.StringVar(&req.Arch, "arch", getenv("RAP_HOST_AGENT_UPDATE_ARCH", runtime.GOARCH), "Host-agent artifact architecture selector.") fs.StringVar(&req.InstallType, "install-type", getenv("RAP_HOST_AGENT_UPDATE_INSTALL_TYPE", hostagent.BinaryUpdateInstallType), "Host-agent artifact install type.") fs.StringVar(&req.BinaryPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path to replace 
atomically.") fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.") fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INTERVAL_SECONDS", 900), "Seconds between host-agent update plan polls.") fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INITIAL_DELAY_SECONDS", 45), "Seconds to wait before the first poll.") fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.") fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.") fs.BoolVar(&stopOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.") if err := fs.Parse(args); err != nil { return hostagent.HostAgentUpdateRequest{}, 0, 0, 0, 0, false, false, err } return req, intervalSeconds, initialDelaySeconds, jitter, maxRuns, stopOnError, false, nil } func registerUpdateFlags(fs *flag.FlagSet, req *hostagent.UpdateRequest, healthTimeoutSeconds *int) { fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.") fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.") fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.") fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet.") fs.StringVar(&req.Product, "product", getenv("RAP_UPDATE_PRODUCT", hostagent.DefaultUpdateProduct), "Update product name.") fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Currently running product version.") fs.StringVar(&req.OS, "os", getenv("RAP_UPDATE_OS", runtime.GOOS), "Artifact OS selector.") fs.StringVar(&req.Arch, "arch", 
getenv("RAP_UPDATE_ARCH", runtime.GOARCH), "Artifact architecture selector.") fs.StringVar(&req.InstallType, "install-type", getenv("RAP_UPDATE_INSTALL_TYPE", hostagent.DefaultUpdateInstallType), "Artifact install type.") fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.") fs.StringVar(&req.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.") fs.StringVar(&req.BinaryPath, "binary-path", getenv("RAP_NODE_AGENT_BINARY_PATH", ""), "Windows node-agent binary path to replace.") fs.StringVar(&req.WindowsTaskName, "windows-task-name", getenv("RAP_WINDOWS_TASK_NAME", ""), "Windows Scheduled Task name used to restart node-agent.") fs.StringVar(&req.SystemdUnitName, "systemd-unit", getenv("RAP_SYSTEMD_UNIT", ""), "Linux systemd unit used to restart node-agent.") fs.IntVar(healthTimeoutSeconds, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Seconds to wait for the updated container to be running.") fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.") } func parseInstall(args []string) (installCommandConfig, error) { fs := flag.NewFlagSet("install", flag.ContinueOnError) cfg := hostagent.RuntimeConfig{} var dryRun bool var profileURL string var installToken string var autoUpdateEnabled bool autoUpdate := hostagent.UpdateServiceConfig{} monitorContainers := repeatedFlag{} fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.") fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.") fs.StringVar(&cfg.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.") fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/docker-install-profile URL for profile-based install.") 
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Docker install profile.") fs.StringVar(&cfg.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.") fs.StringVar(&cfg.Image, "image", getenv("RAP_NODE_AGENT_IMAGE", hostagent.DefaultImage), "Docker image for rap-node-agent.") fs.StringVar(&cfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name.") fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path mounted as node-agent state.") fs.StringVar(&cfg.Network, "network", getenv("RAP_DOCKER_NETWORK", hostagent.DefaultNetwork), "Docker network mode/name.") fs.StringVar(&cfg.RestartPolicy, "restart", getenv("RAP_DOCKER_RESTART", "unless-stopped"), "Docker restart policy.") fs.BoolVar(&cfg.PullImage, "pull", getenvBool("RAP_DOCKER_PULL", false), "Pull image before running.") fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_DOCKER_REPLACE", false), "Remove an existing container with the same name before run.") fs.BoolVar(&cfg.DockerVPNGatewayEnabled, "docker-vpn-gateway-enabled", getenvBool("RAP_DOCKER_VPN_GATEWAY_ENABLED", false), "Run Docker node-agent with NET_ADMIN and /dev/net/tun for VPN gateway mode.") fs.StringVar(&cfg.ImageArtifactSHA256, "image-artifact-sha256", getenv("RAP_NODE_AGENT_IMAGE_ARTIFACT_SHA256", ""), "Expected SHA-256 for a Docker image tar artifact.") fs.Int64Var(&cfg.ImageArtifactSizeBytes, "image-artifact-size-bytes", getenvInt64("RAP_NODE_AGENT_IMAGE_ARTIFACT_SIZE_BYTES", 0), "Expected byte size for a Docker image tar artifact (used as a best-effort check when sha256 is provided).") fs.BoolVar(&dryRun, "dry-run", false, "Print the docker command with secrets redacted.") fs.BoolVar(&autoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the local update-loop service.") 
fs.BoolVar(&autoUpdate.InstallSelfUpdater, "host-agent-self-update-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.") fs.BoolVar(&autoUpdate.InstallMonitor, "host-agent-monitor-enabled", getenvBool("RAP_HOST_AGENT_MONITOR_ENABLED", true), "Install and start the local host monitor service.") fs.StringVar(&autoUpdate.CurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.") fs.StringVar(&autoUpdate.SelfUpdateVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Initial host-agent binary version used by the self-updater.") fs.StringVar(&autoUpdate.Channel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.") fs.IntVar(&autoUpdate.IntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. 
Update-service/heartbeat hints trigger normal runs.") fs.IntVar(&autoUpdate.InitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.") fs.Float64Var(&autoUpdate.Jitter, "auto-update-jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.") fs.IntVar(&autoUpdate.HealthTimeoutSec, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.") fs.StringVar(&autoUpdate.BinaryInstallPath, "auto-update-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.") fs.IntVar(&autoUpdate.MonitorIntervalSec, "monitor-interval-seconds", getenvInt("RAP_MONITOR_INTERVAL_SECONDS", hostagent.DefaultMonitorIntervalSeconds), "Seconds between monitor checks.") fs.StringVar(&autoUpdate.MonitorStatusFile, "monitor-status-file", getenv("RAP_MONITOR_STATUS_FILE", ""), "Optional JSON status file written by the monitor.") fs.IntVar(&autoUpdate.MonitorDiskWarn, "monitor-disk-warn-percent", getenvInt("RAP_MONITOR_DISK_WARN_PERCENT", hostagent.DefaultMonitorDiskWarnPercent), "Disk used percent that reports warning.") fs.IntVar(&autoUpdate.MonitorDiskCleanup, "monitor-disk-cleanup-percent", getenvInt("RAP_MONITOR_DISK_CLEANUP_PERCENT", hostagent.DefaultMonitorDiskCleanupPercent), "Disk used percent that triggers cleanup.") fs.IntVar(&autoUpdate.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.") fs.BoolVar(&autoUpdate.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.") fs.BoolVar(&cfg.WorkloadSupervisionEnabled, 
"workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.") fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.") fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.") fs.BoolVar(&cfg.MeshFabricSessionEnabled, "mesh-fabric-session-enabled", getenvBool("RAP_MESH_FABRIC_SESSION_ENABLED", false), "Enable authenticated fabric session endpoint.") fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.") fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.") fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.") fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.") fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.") fs.IntVar(&cfg.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.") fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Synthetic mesh HTTP listen address inside container.") 
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.") fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.") fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.") fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.") fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.") fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.") fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", ""), "Connectivity mode hint.") fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", ""), "NAT type hint.") fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint.") fs.IntVar(&cfg.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.") fs.IntVar(&cfg.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.") fs.IntVar(&cfg.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. 
Use 0 to wait indefinitely.") fs.IntVar(&cfg.ProductionObservationSinkCap, "production-observation-sink-capacity", getenvInt("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Production observation sink capacity.") extraEnv := repeatedFlag{} extraRunArg := repeatedFlag{} imageArtifactURL := repeatedFlag{} fs.Var(&extraEnv, "env", "Extra KEY=VALUE env passed to node-agent container; may be repeated.") fs.Var(&extraRunArg, "docker-run-arg", "Extra raw docker run argument; may be repeated.") fs.Var(&imageArtifactURL, "image-artifact-url", "Docker image tar artifact URL to docker load before running; may be repeated.") fs.Var(&monitorContainers, "monitor-container", "Extra Docker container watched by monitor; may be repeated.") if err := fs.Parse(args); err != nil { return installCommandConfig{}, err } cfg.ExtraEnv = extraEnv cfg.AdditionalDockerRunArgs = extraRunArg cfg.ImageArtifactURLs = append(cfg.ImageArtifactURLs, imageArtifactURL...) autoUpdate.MonitorContainers = monitorContainers if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" { profile, err := hostagent.FetchDockerInstallProfile(context.Background(), hostagent.ProfileRequest{ URL: profileURL, ClusterID: cfg.ClusterID, InstallToken: installToken, NodeName: cfg.NodeName, }) if err != nil { return installCommandConfig{}, err } profileCfg := hostagent.RuntimeConfigFromProfile(profile) profileCfg.ExtraEnv = cfg.ExtraEnv profileCfg.AdditionalDockerRunArgs = cfg.AdditionalDockerRunArgs profileCfg.DockerVPNGatewayEnabled = profileCfg.DockerVPNGatewayEnabled || cfg.DockerVPNGatewayEnabled if len(imageArtifactURL) > 0 { profileCfg.ImageArtifactURLs = append([]string(nil), imageArtifactURL...) 
} if cfg.ImageArtifactSHA256 != "" { profileCfg.ImageArtifactSHA256 = cfg.ImageArtifactSHA256 } if cfg.ImageArtifactSizeBytes > 0 { profileCfg.ImageArtifactSizeBytes = cfg.ImageArtifactSizeBytes } cfg = profileCfg } if err := cfg.ValidateInstall(); err != nil { return installCommandConfig{}, err } return installCommandConfig{ Runtime: cfg, DryRun: dryRun, AutoUpdateEnabled: autoUpdateEnabled, AutoUpdate: autoUpdate, }, nil } type repeatedFlag []string func (f *repeatedFlag) String() string { return strings.Join(*f, ",") } func (f *repeatedFlag) Set(value string) error { *f = append(*f, value) return nil } func getenv(key, fallback string) string { if value := strings.TrimSpace(os.Getenv(key)); value != "" { return value } return fallback } func getenvBool(key string, fallback bool) bool { switch strings.ToLower(strings.TrimSpace(os.Getenv(key))) { case "1", "true", "yes", "y", "on": return true case "0", "false", "no", "n", "off": return false default: return fallback } } func getenvInt(key string, fallback int) int { var out int if _, err := fmt.Sscanf(strings.TrimSpace(os.Getenv(key)), "%d", &out); err == nil { return out } return fallback } func getenvInt64(key string, fallback int64) int64 { var out int64 if _, err := fmt.Sscanf(strings.TrimSpace(os.Getenv(key)), "%d", &out); err == nil { return out } return fallback } func getenvFloat(key string, fallback float64) float64 { var out float64 if _, err := fmt.Sscanf(strings.TrimSpace(os.Getenv(key)), "%f", &out); err == nil { return out } return fallback } func shellJoin(args []string) string { parts := make([]string, 0, len(args)) for _, arg := range args { if strings.ContainsAny(arg, " \t\"'") { parts = append(parts, `"`+strings.ReplaceAll(arg, `"`, `\"`)+`"`) } else { parts = append(parts, arg) } } return strings.Join(parts, " ") } func usage() { fmt.Fprintln(os.Stderr, `usage: rap-host-agent install -profile-url URL -install-token TOKEN [-node-name NAME] [docker options] rap-host-agent install -backend-url 
URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options] rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options] rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options] rap-host-agent install-updater -backend-url URL -cluster-id ID -state-dir DIR -container-name NAME rap-host-agent update-host-agent -backend-url URL -cluster-id ID -state-dir DIR rap-host-agent update-host-agent-loop -backend-url URL -cluster-id ID -state-dir DIR rap-host-agent monitor-loop -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME rap-host-agent monitor-once -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME rap-host-agent fabric-session-smoke -mesh-url URL -token rap_fsn_TOKEN [-authority-payload VALUE -authority-signature VALUE] rap-host-agent update -backend-url URL -cluster-id ID -node-id ID [-container-name NAME] rap-host-agent update-loop -backend-url URL -cluster-id ID -node-id ID [-container-name NAME] rap-host-agent status [-container-name NAME]`) }