Record project continuation changes
This commit is contained in:
@@ -0,0 +1,744 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
|
||||
)
|
||||
|
||||
// installCommandConfig groups the settings consumed by the "install"
// subcommand (runInstall). NOTE(review): presumably produced by parseInstall,
// which is defined outside this view — confirm.
type installCommandConfig struct {
	// Runtime is the node-agent runtime configuration; runInstall normalizes
	// it before building the docker run arguments and installing.
	Runtime hostagent.RuntimeConfig
	// DryRun makes runInstall print what it would run instead of installing.
	DryRun bool
	// AutoUpdateEnabled makes runInstall also install the update service.
	AutoUpdateEnabled bool
	// AutoUpdate is the update-service configuration used when
	// AutoUpdateEnabled is set.
	AutoUpdate hostagent.UpdateServiceConfig
}
|
||||
|
||||
func main() {
|
||||
log.SetFlags(0)
|
||||
applyStagedSelfUpdate()
|
||||
if len(os.Args) < 2 {
|
||||
usage()
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
defer stop()
|
||||
switch os.Args[1] {
|
||||
case "install":
|
||||
if err := runInstall(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install failed: %v", err)
|
||||
}
|
||||
case "install-windows":
|
||||
if err := runInstallWindows(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install-windows failed: %v", err)
|
||||
}
|
||||
case "install-linux":
|
||||
if err := runInstallLinux(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install-linux failed: %v", err)
|
||||
}
|
||||
case "status":
|
||||
if err := runStatus(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("status failed: %v", err)
|
||||
}
|
||||
case "update":
|
||||
if err := runUpdate(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update failed: %v", err)
|
||||
}
|
||||
case "update-loop":
|
||||
if err := runUpdateLoop(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-loop failed: %v", err)
|
||||
}
|
||||
case "install-updater":
|
||||
if err := runInstallUpdater(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install-updater failed: %v", err)
|
||||
}
|
||||
case "update-host-agent":
|
||||
if err := runUpdateHostAgent(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-host-agent failed: %v", err)
|
||||
}
|
||||
case "update-host-agent-loop":
|
||||
if err := runUpdateHostAgentLoop(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-host-agent-loop failed: %v", err)
|
||||
}
|
||||
default:
|
||||
usage()
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
|
||||
// applyStagedSelfUpdate swaps a staged replacement binary into place, if one
// exists.
//
// It looks for "<executable>.next" beside the running executable. When that
// file is present, the current executable is moved to "<executable>.old", the
// staged file is renamed to the executable path, and the backup is removed.
// If the second rename fails, the backup is moved back.
//
// The function does nothing on Windows. It is best-effort: it never aborts the
// process, and when the executable path cannot be determined or the staged
// file cannot be stat'ed it returns silently. Failures during the swap itself
// are logged so that a broken or half-applied update is visible.
func applyStagedSelfUpdate() {
	if runtime.GOOS == "windows" {
		return
	}
	executable, err := os.Executable()
	if err != nil {
		return
	}
	staged := executable + ".next"
	if _, err := os.Stat(staged); err != nil {
		// No staged update (the common case) or it is not accessible.
		return
	}

	// Set the execute bits before the swap: the mode travels with the file
	// across rename, so the executable path never refers to a file that is
	// not yet executable.
	if err := os.Chmod(staged, 0o755); err != nil {
		log.Printf("self-update: chmod %s: %v", staged, err)
	}

	backup := executable + ".old"
	// A stale backup from an earlier run may or may not exist; either is fine.
	_ = os.Remove(backup)
	if err := os.Rename(executable, backup); err != nil {
		log.Printf("self-update: back up %s: %v", executable, err)
		return
	}
	if err := os.Rename(staged, executable); err != nil {
		log.Printf("self-update: activate %s: %v", staged, err)
		if restoreErr := os.Rename(backup, executable); restoreErr != nil {
			// The executable path is now empty; make that loudly visible.
			log.Printf("self-update: restore %s from %s: %v", executable, backup, restoreErr)
		}
		return
	}
	// The backup is no longer needed; a leftover file is harmless because the
	// next run removes it before swapping.
	_ = os.Remove(backup)
}
|
||||
|
||||
func runInstallLinux(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("install-linux", flag.ContinueOnError)
|
||||
cfg := hostagent.LinuxInstallConfig{}
|
||||
var profileURL string
|
||||
var installToken string
|
||||
fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
|
||||
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/linux-install-profile URL for profile-based install.")
|
||||
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Linux install profile.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
|
||||
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
|
||||
fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_LINUX_INSTALL_DIR", ""), "Directory for rap-node-agent and rap-host-agent.")
|
||||
fs.StringVar(&cfg.ConfigDir, "config-dir", getenv("RAP_LINUX_CONFIG_DIR", ""), "Directory for node-agent env file.")
|
||||
fs.StringVar(&cfg.StartupMode, "startup-mode", getenv("RAP_LINUX_STARTUP_MODE", "systemd"), "Startup mode: systemd, auto, or none.")
|
||||
fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
|
||||
fs.BoolVar(&cfg.DryRun, "dry-run", false, "Print resolved placement without installing.")
|
||||
fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Linux host-agent update service.")
|
||||
fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
|
||||
fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
|
||||
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
|
||||
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "linux"), "Region/site hint.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
|
||||
dryRun := cfg.DryRun
|
||||
startupMode := strings.TrimSpace(cfg.StartupMode)
|
||||
autoUpdateEnabled := cfg.AutoUpdateEnabled
|
||||
autoUpdateCurrentVersion := cfg.AutoUpdateCurrentVersion
|
||||
autoUpdateChannel := cfg.AutoUpdateChannel
|
||||
autoUpdateIntervalSeconds := cfg.AutoUpdateIntervalSeconds
|
||||
autoUpdateInitialDelaySeconds := cfg.AutoUpdateInitialDelaySeconds
|
||||
autoUpdateHealthTimeoutSeconds := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
hostAgentSourcePath := cfg.HostAgentSourcePath
|
||||
profile, err := hostagent.FetchLinuxInstallProfile(ctx, hostagent.ProfileRequest{URL: profileURL, ClusterID: cfg.RuntimeConfig.ClusterID, InstallToken: installToken, NodeName: cfg.RuntimeConfig.NodeName})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cfg = hostagent.LinuxInstallConfigFromProfile(profile)
|
||||
cfg.Replace = true
|
||||
cfg.DryRun = dryRun
|
||||
cfg.AutoUpdateEnabled = autoUpdateEnabled
|
||||
cfg.AutoUpdateCurrentVersion = autoUpdateCurrentVersion
|
||||
cfg.AutoUpdateChannel = autoUpdateChannel
|
||||
cfg.AutoUpdateIntervalSeconds = autoUpdateIntervalSeconds
|
||||
cfg.AutoUpdateInitialDelaySeconds = autoUpdateInitialDelaySeconds
|
||||
cfg.AutoUpdateHealthTimeoutSeconds = autoUpdateHealthTimeoutSeconds
|
||||
cfg.HostAgentSourcePath = hostAgentSourcePath
|
||||
if startupMode != "" {
|
||||
cfg.StartupMode = startupMode
|
||||
}
|
||||
}
|
||||
result, err := (hostagent.LinuxManager{}).Install(ctx, cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("node=%s install_dir=%s state_dir=%s node_agent=%s unit=%s downloaded=%t started=%t updater_unit=%s updater_started=%t\n",
|
||||
result.NodeName, result.InstallDir, result.StateDir, result.NodeAgentPath, result.UnitName, result.Downloaded, result.Started, result.UpdaterUnitName, result.UpdaterStarted)
|
||||
fmt.Println("next: approve the join request in the platform admin panel, then the Linux node-agent will finish bootstrap and start heartbeats")
|
||||
return nil
|
||||
}
|
||||
|
||||
func runInstallWindows(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("install-windows", flag.ContinueOnError)
|
||||
cfg := hostagent.WindowsInstallConfig{}
|
||||
var profileURL string
|
||||
var installToken string
|
||||
fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
|
||||
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/windows-install-profile URL for profile-based install.")
|
||||
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Windows install profile.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
|
||||
fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_WINDOWS_INSTALL_DIR", ""), "Directory for rap-node-agent.exe and wrapper scripts.")
|
||||
fs.StringVar(&cfg.StartupMode, "startup-mode", getenv("RAP_WINDOWS_STARTUP_MODE", "auto"), "Startup mode: auto, system-task, user-task, or none.")
|
||||
fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
|
||||
fs.BoolVar(&cfg.DryRun, "dry-run", false, "Print resolved placement without installing.")
|
||||
fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Windows host-agent update task.")
|
||||
fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
|
||||
fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
|
||||
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
|
||||
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "windows"), "Region/site hint.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
|
||||
dryRun := cfg.DryRun
|
||||
startupMode := strings.TrimSpace(cfg.StartupMode)
|
||||
autoUpdateEnabled := cfg.AutoUpdateEnabled
|
||||
autoUpdateCurrentVersion := cfg.AutoUpdateCurrentVersion
|
||||
autoUpdateChannel := cfg.AutoUpdateChannel
|
||||
autoUpdateIntervalSeconds := cfg.AutoUpdateIntervalSeconds
|
||||
autoUpdateInitialDelaySeconds := cfg.AutoUpdateInitialDelaySeconds
|
||||
autoUpdateHealthTimeoutSeconds := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
hostAgentSourcePath := cfg.HostAgentSourcePath
|
||||
profile, err := hostagent.FetchWindowsInstallProfile(ctx, hostagent.ProfileRequest{
|
||||
URL: profileURL,
|
||||
ClusterID: cfg.RuntimeConfig.ClusterID,
|
||||
InstallToken: installToken,
|
||||
NodeName: cfg.RuntimeConfig.NodeName,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cfg = hostagent.WindowsInstallConfigFromProfile(profile)
|
||||
cfg.Replace = true
|
||||
cfg.DryRun = dryRun
|
||||
cfg.AutoUpdateEnabled = autoUpdateEnabled
|
||||
cfg.AutoUpdateCurrentVersion = autoUpdateCurrentVersion
|
||||
cfg.AutoUpdateChannel = autoUpdateChannel
|
||||
cfg.AutoUpdateIntervalSeconds = autoUpdateIntervalSeconds
|
||||
cfg.AutoUpdateInitialDelaySeconds = autoUpdateInitialDelaySeconds
|
||||
cfg.AutoUpdateHealthTimeoutSeconds = autoUpdateHealthTimeoutSeconds
|
||||
cfg.HostAgentSourcePath = hostAgentSourcePath
|
||||
if startupMode != "" {
|
||||
cfg.StartupMode = startupMode
|
||||
}
|
||||
}
|
||||
result, err := (hostagent.WindowsManager{}).Install(ctx, cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("node=%s install_dir=%s state_dir=%s node_agent=%s startup_mode=%s task=%s downloaded=%t started=%t updater_task=%s updater_started=%t admin_fallback=%t\n",
|
||||
result.NodeName, result.InstallDir, result.StateDir, result.NodeAgentPath, result.StartupMode, result.TaskName, result.Downloaded, result.Started, result.UpdaterTaskName, result.UpdaterStarted, result.AdminFallback)
|
||||
fmt.Println("next: approve the join request in the platform admin panel, then the Windows node-agent will finish bootstrap and start heartbeats")
|
||||
return nil
|
||||
}
|
||||
|
||||
func runInstall(ctx context.Context, args []string) error {
|
||||
installCfg, err := parseInstall(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cfg := installCfg.Runtime.Normalize()
|
||||
cfg = cfg.Normalize()
|
||||
runArgs := hostagent.DockerRunArgs(cfg)
|
||||
if installCfg.DryRun {
|
||||
fmt.Printf("docker %s\n", shellJoin(hostagent.RedactedArgs(runArgs)))
|
||||
if installCfg.AutoUpdateEnabled {
|
||||
service := installCfg.AutoUpdate
|
||||
service.RuntimeConfig = cfg
|
||||
service.DryRun = true
|
||||
result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(result.Unit)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
result, err := (hostagent.DockerManager{}).Install(ctx, cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("container=%s image=%s id=%s pulled=%t replaced=%t\n", result.ContainerName, result.Image, result.ContainerID, result.Pulled, result.Replaced)
|
||||
if installCfg.AutoUpdateEnabled {
|
||||
service := installCfg.AutoUpdate
|
||||
service.RuntimeConfig = cfg
|
||||
service.ManageSystemd = true
|
||||
serviceResult, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t\n", serviceResult.UnitName, serviceResult.UnitPath, serviceResult.BinaryPath, serviceResult.Started)
|
||||
}
|
||||
fmt.Println("next: approve the join request in the platform admin panel, then the node-agent will finish bootstrap and start heartbeats")
|
||||
return nil
|
||||
}
|
||||
|
||||
func runStatus(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("status", flag.ContinueOnError)
|
||||
containerName := fs.String("container-name", hostagent.DefaultContainerName, "Docker container name.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := (hostagent.DockerManager{}).Status(ctx, *containerName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(out)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdate(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("update", flag.ContinueOnError)
|
||||
req := hostagent.UpdateRequest{}
|
||||
var healthTimeoutSeconds int
|
||||
registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second
|
||||
if req.DryRun {
|
||||
plan, err := hostagent.FetchNodeUpdatePlan(ctx, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("action=%s reason=%s target=%s production_forwarding=%t\n", plan.Action, plan.Reason, plan.TargetVersion, plan.ProductionForwarding)
|
||||
if plan.Artifact != nil {
|
||||
fmt.Printf("artifact=%s sha256=%s size=%d\n", plan.Artifact.URL, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
var result hostagent.UpdateResult
|
||||
var err error
|
||||
if req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows" {
|
||||
result, err = (hostagent.WindowsManager{}).ApplyUpdate(ctx, req)
|
||||
} else if req.InstallType == hostagent.BinaryUpdateInstallType {
|
||||
result, err = (hostagent.LinuxManager{}).ApplyUpdate(ctx, req)
|
||||
} else {
|
||||
result, err = (hostagent.DockerManager{}).ApplyUpdate(ctx, req)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("action=%s reason=%s target=%s container=%s image=%s id=%s loaded=%t replaced=%t rolled_back=%t\n",
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.ContainerName,
|
||||
result.NewImage,
|
||||
result.ContainerID,
|
||||
result.Loaded,
|
||||
result.Replaced,
|
||||
result.RolledBack,
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdateLoop(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("update-loop", flag.ContinueOnError)
|
||||
req := hostagent.UpdateRequest{}
|
||||
var healthTimeoutSeconds int
|
||||
var intervalSeconds int
|
||||
var initialDelaySeconds int
|
||||
var maxRuns int
|
||||
var jitter float64
|
||||
var stopOnError bool
|
||||
var hostAgentStatusEnabled bool
|
||||
var hostAgentVersion string
|
||||
var hostAgentBinaryPath string
|
||||
registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
|
||||
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Seconds between emergency fallback update plan polls. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 0), "Seconds to wait before the first poll.")
|
||||
fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
|
||||
fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
|
||||
fs.BoolVar(&stopOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.")
|
||||
fs.BoolVar(&hostAgentStatusEnabled, "host-agent-update-status-enabled", getenvBool("RAP_HOST_AGENT_UPDATE_STATUS_ENABLED", true), "Also poll/report rap-host-agent update status from this loop.")
|
||||
fs.StringVar(&hostAgentVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version reported by the loop.")
|
||||
fs.StringVar(&hostAgentBinaryPath, "host-agent-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path used for host-agent update status.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second
|
||||
cfg := hostagent.UpdateLoopConfig{
|
||||
Request: req,
|
||||
Interval: time.Duration(intervalSeconds) * time.Second,
|
||||
InitialDelay: time.Duration(initialDelaySeconds) * time.Second,
|
||||
Jitter: jitter,
|
||||
MaxRuns: maxRuns,
|
||||
StopOnError: stopOnError,
|
||||
Logf: func(format string, args ...any) {
|
||||
fmt.Printf(format+"\n", args...)
|
||||
},
|
||||
}
|
||||
cfg.HostAgentUpdateEnabled = hostAgentStatusEnabled
|
||||
cfg.HostAgentUpdateRequest = hostagent.HostAgentUpdateRequest{
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
CurrentVersion: hostAgentVersion,
|
||||
Channel: req.Channel,
|
||||
OS: firstNonEmptyLocal(req.OS, runtime.GOOS),
|
||||
Arch: firstNonEmptyLocal(req.Arch, runtime.GOARCH),
|
||||
InstallType: hostagent.BinaryUpdateInstallType,
|
||||
BinaryPath: hostAgentBinaryPath,
|
||||
}
|
||||
if req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows" {
|
||||
cfg.HostAgentUpdateRequest.InstallType = "windows_binary"
|
||||
return (hostagent.WindowsManager{}).RunUpdateLoop(ctx, cfg)
|
||||
}
|
||||
if req.InstallType == hostagent.BinaryUpdateInstallType {
|
||||
return (hostagent.LinuxManager{}).RunUpdateLoop(ctx, cfg)
|
||||
}
|
||||
return (hostagent.DockerManager{}).RunUpdateLoop(ctx, cfg)
|
||||
}
|
||||
|
||||
// firstNonEmptyLocal returns the first argument that still contains something
// after surrounding whitespace is trimmed. The argument is returned exactly as
// passed (untrimmed). It returns "" when no argument qualifies or none is
// given.
func firstNonEmptyLocal(values ...string) string {
	for i := range values {
		if candidate := values[i]; len(strings.TrimSpace(candidate)) > 0 {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
func runInstallUpdater(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("install-updater", flag.ContinueOnError)
|
||||
runtimeCfg := hostagent.RuntimeConfig{}
|
||||
service := hostagent.UpdateServiceConfig{}
|
||||
var dryRun bool
|
||||
var selfUpdater bool
|
||||
fs.StringVar(&runtimeCfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&runtimeCfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&runtimeCfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
|
||||
fs.StringVar(&runtimeCfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
|
||||
fs.StringVar(&service.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version before first successful update.")
|
||||
fs.StringVar(&service.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
|
||||
fs.IntVar(&service.IntervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&service.InitialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.Float64Var(&service.Jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
|
||||
fs.IntVar(&service.HealthTimeoutSec, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
|
||||
fs.StringVar(&service.BinaryInstallPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")
|
||||
fs.BoolVar(&selfUpdater, "self-updater-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
|
||||
fs.BoolVar(&dryRun, "dry-run", false, "Print the systemd unit without installing it.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
service.RuntimeConfig = runtimeCfg
|
||||
service.ManageSystemd = !dryRun
|
||||
service.DryRun = dryRun
|
||||
service.InstallSelfUpdater = selfUpdater
|
||||
service.SelfUpdateVersion = agent.Version
|
||||
result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if dryRun {
|
||||
fmt.Print(result.Unit)
|
||||
if result.SelfUnit != "" {
|
||||
fmt.Print(result.SelfUnit)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t self_updater=%s\n", result.UnitName, result.UnitPath, result.BinaryPath, result.Started, result.SelfUnitName)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdateHostAgent(ctx context.Context, args []string) error {
|
||||
req, interval, initialDelay, jitter, maxRuns, stopOnError, loop, err := parseHostAgentUpdate(args)
|
||||
_, _, _, _, _ = interval, initialDelay, jitter, maxRuns, stopOnError
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if loop {
|
||||
return fmt.Errorf("internal parser error: loop flag set for one-shot update")
|
||||
}
|
||||
result, err := (hostagent.DockerManager{}).ApplyHostAgentUpdate(ctx, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t\n", result.Action, result.Reason, result.TargetVersion, result.NewImage, result.Replaced, result.RestartNeeded)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdateHostAgentLoop(ctx context.Context, args []string) error {
|
||||
req, interval, initialDelay, jitter, maxRuns, stopOnError, _, err := parseHostAgentUpdate(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return (hostagent.DockerManager{}).RunHostAgentUpdateLoop(ctx, hostagent.HostAgentUpdateLoopConfig{
|
||||
Request: req,
|
||||
Interval: time.Duration(interval) * time.Second,
|
||||
InitialDelay: time.Duration(initialDelay) * time.Second,
|
||||
Jitter: jitter,
|
||||
MaxRuns: maxRuns,
|
||||
StopOnError: stopOnError,
|
||||
Logf: func(format string, args ...any) {
|
||||
fmt.Printf(format+"\n", args...)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int, int, float64, int, bool, bool, error) {
|
||||
fs := flag.NewFlagSet("update-host-agent", flag.ContinueOnError)
|
||||
req := hostagent.HostAgentUpdateRequest{}
|
||||
var intervalSeconds int
|
||||
var initialDelaySeconds int
|
||||
var maxRuns int
|
||||
var jitter float64
|
||||
var stopOnError bool
|
||||
fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json.")
|
||||
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Currently installed rap-host-agent version.")
|
||||
fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
|
||||
fs.StringVar(&req.OS, "os", getenv("RAP_HOST_AGENT_UPDATE_OS", runtime.GOOS), "Host-agent artifact OS selector.")
|
||||
fs.StringVar(&req.Arch, "arch", getenv("RAP_HOST_AGENT_UPDATE_ARCH", runtime.GOARCH), "Host-agent artifact architecture selector.")
|
||||
fs.StringVar(&req.InstallType, "install-type", getenv("RAP_HOST_AGENT_UPDATE_INSTALL_TYPE", hostagent.BinaryUpdateInstallType), "Host-agent artifact install type.")
|
||||
fs.StringVar(&req.BinaryPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path to replace atomically.")
|
||||
fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.")
|
||||
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INTERVAL_SECONDS", 900), "Seconds between host-agent update plan polls.")
|
||||
fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INITIAL_DELAY_SECONDS", 45), "Seconds to wait before the first poll.")
|
||||
fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
|
||||
fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
|
||||
fs.BoolVar(&stopOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return hostagent.HostAgentUpdateRequest{}, 0, 0, 0, 0, false, false, err
|
||||
}
|
||||
return req, intervalSeconds, initialDelaySeconds, jitter, maxRuns, stopOnError, false, nil
|
||||
}
|
||||
|
||||
func registerUpdateFlags(fs *flag.FlagSet, req *hostagent.UpdateRequest, healthTimeoutSeconds *int) {
|
||||
fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet.")
|
||||
fs.StringVar(&req.Product, "product", getenv("RAP_UPDATE_PRODUCT", hostagent.DefaultUpdateProduct), "Update product name.")
|
||||
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Currently running product version.")
|
||||
fs.StringVar(&req.OS, "os", getenv("RAP_UPDATE_OS", runtime.GOOS), "Artifact OS selector.")
|
||||
fs.StringVar(&req.Arch, "arch", getenv("RAP_UPDATE_ARCH", runtime.GOARCH), "Artifact architecture selector.")
|
||||
fs.StringVar(&req.InstallType, "install-type", getenv("RAP_UPDATE_INSTALL_TYPE", hostagent.DefaultUpdateInstallType), "Artifact install type.")
|
||||
fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
|
||||
fs.StringVar(&req.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
|
||||
fs.StringVar(&req.BinaryPath, "binary-path", getenv("RAP_NODE_AGENT_BINARY_PATH", ""), "Windows node-agent binary path to replace.")
|
||||
fs.StringVar(&req.WindowsTaskName, "windows-task-name", getenv("RAP_WINDOWS_TASK_NAME", ""), "Windows Scheduled Task name used to restart node-agent.")
|
||||
fs.StringVar(&req.SystemdUnitName, "systemd-unit", getenv("RAP_SYSTEMD_UNIT", ""), "Linux systemd unit used to restart node-agent.")
|
||||
fs.IntVar(healthTimeoutSeconds, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Seconds to wait for the updated container to be running.")
|
||||
fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.")
|
||||
}
|
||||
|
||||
func parseInstall(args []string) (installCommandConfig, error) {
|
||||
fs := flag.NewFlagSet("install", flag.ContinueOnError)
|
||||
cfg := hostagent.RuntimeConfig{}
|
||||
var dryRun bool
|
||||
var profileURL string
|
||||
var installToken string
|
||||
var autoUpdateEnabled bool
|
||||
autoUpdate := hostagent.UpdateServiceConfig{}
|
||||
fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
|
||||
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/docker-install-profile URL for profile-based install.")
|
||||
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Docker install profile.")
|
||||
fs.StringVar(&cfg.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
|
||||
fs.StringVar(&cfg.Image, "image", getenv("RAP_NODE_AGENT_IMAGE", hostagent.DefaultImage), "Docker image for rap-node-agent.")
|
||||
fs.StringVar(&cfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name.")
|
||||
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path mounted as node-agent state.")
|
||||
fs.StringVar(&cfg.Network, "network", getenv("RAP_DOCKER_NETWORK", hostagent.DefaultNetwork), "Docker network mode/name.")
|
||||
fs.StringVar(&cfg.RestartPolicy, "restart", getenv("RAP_DOCKER_RESTART", "unless-stopped"), "Docker restart policy.")
|
||||
fs.BoolVar(&cfg.PullImage, "pull", getenvBool("RAP_DOCKER_PULL", false), "Pull image before running.")
|
||||
fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_DOCKER_REPLACE", false), "Remove an existing container with the same name before run.")
|
||||
fs.BoolVar(&cfg.DockerVPNGatewayEnabled, "docker-vpn-gateway-enabled", getenvBool("RAP_DOCKER_VPN_GATEWAY_ENABLED", false), "Run Docker node-agent with NET_ADMIN and /dev/net/tun for VPN gateway mode.")
|
||||
fs.StringVar(&cfg.ImageArtifactSHA256, "image-artifact-sha256", getenv("RAP_NODE_AGENT_IMAGE_ARTIFACT_SHA256", ""), "Expected SHA-256 for a Docker image tar artifact.")
|
||||
fs.Int64Var(&cfg.ImageArtifactSizeBytes, "image-artifact-size-bytes", getenvInt64("RAP_NODE_AGENT_IMAGE_ARTIFACT_SIZE_BYTES", 0), "Expected byte size for a Docker image tar artifact (used as a best-effort check when sha256 is provided).")
|
||||
fs.BoolVar(&dryRun, "dry-run", false, "Print the docker command with secrets redacted.")
|
||||
fs.BoolVar(&autoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the local update-loop service.")
|
||||
fs.BoolVar(&autoUpdate.InstallSelfUpdater, "host-agent-self-update-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
|
||||
fs.StringVar(&autoUpdate.CurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
|
||||
fs.StringVar(&autoUpdate.SelfUpdateVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Initial host-agent binary version used by the self-updater.")
|
||||
fs.StringVar(&autoUpdate.Channel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
|
||||
fs.IntVar(&autoUpdate.IntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&autoUpdate.InitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.Float64Var(&autoUpdate.Jitter, "auto-update-jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
|
||||
fs.IntVar(&autoUpdate.HealthTimeoutSec, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
|
||||
fs.StringVar(&autoUpdate.BinaryInstallPath, "auto-update-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")
|
||||
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Synthetic mesh HTTP listen address inside container.")
|
||||
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", ""), "Advertised transport.")
|
||||
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", ""), "Connectivity mode hint.")
|
||||
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", ""), "NAT type hint.")
|
||||
fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint.")
|
||||
fs.IntVar(&cfg.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
|
||||
fs.IntVar(&cfg.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
|
||||
fs.IntVar(&cfg.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
|
||||
fs.IntVar(&cfg.ProductionObservationSinkCap, "production-observation-sink-capacity", getenvInt("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Production observation sink capacity.")
|
||||
extraEnv := repeatedFlag{}
|
||||
extraRunArg := repeatedFlag{}
|
||||
imageArtifactURL := repeatedFlag{}
|
||||
fs.Var(&extraEnv, "env", "Extra KEY=VALUE env passed to node-agent container; may be repeated.")
|
||||
fs.Var(&extraRunArg, "docker-run-arg", "Extra raw docker run argument; may be repeated.")
|
||||
fs.Var(&imageArtifactURL, "image-artifact-url", "Docker image tar artifact URL to docker load before running; may be repeated.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return installCommandConfig{}, err
|
||||
}
|
||||
cfg.ExtraEnv = extraEnv
|
||||
cfg.AdditionalDockerRunArgs = extraRunArg
|
||||
cfg.ImageArtifactURLs = append(cfg.ImageArtifactURLs, imageArtifactURL...)
|
||||
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
|
||||
profile, err := hostagent.FetchDockerInstallProfile(context.Background(), hostagent.ProfileRequest{
|
||||
URL: profileURL,
|
||||
ClusterID: cfg.ClusterID,
|
||||
InstallToken: installToken,
|
||||
NodeName: cfg.NodeName,
|
||||
})
|
||||
if err != nil {
|
||||
return installCommandConfig{}, err
|
||||
}
|
||||
profileCfg := hostagent.RuntimeConfigFromProfile(profile)
|
||||
profileCfg.ExtraEnv = cfg.ExtraEnv
|
||||
profileCfg.AdditionalDockerRunArgs = cfg.AdditionalDockerRunArgs
|
||||
profileCfg.DockerVPNGatewayEnabled = profileCfg.DockerVPNGatewayEnabled || cfg.DockerVPNGatewayEnabled
|
||||
if len(imageArtifactURL) > 0 {
|
||||
profileCfg.ImageArtifactURLs = append([]string(nil), imageArtifactURL...)
|
||||
}
|
||||
if cfg.ImageArtifactSHA256 != "" {
|
||||
profileCfg.ImageArtifactSHA256 = cfg.ImageArtifactSHA256
|
||||
}
|
||||
if cfg.ImageArtifactSizeBytes > 0 {
|
||||
profileCfg.ImageArtifactSizeBytes = cfg.ImageArtifactSizeBytes
|
||||
}
|
||||
cfg = profileCfg
|
||||
}
|
||||
if err := cfg.ValidateInstall(); err != nil {
|
||||
return installCommandConfig{}, err
|
||||
}
|
||||
return installCommandConfig{
|
||||
Runtime: cfg,
|
||||
DryRun: dryRun,
|
||||
AutoUpdateEnabled: autoUpdateEnabled,
|
||||
AutoUpdate: autoUpdate,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// repeatedFlag is a flag.Value that collects every occurrence of a flag that
// may be given more than once, in the order the values were supplied.
type repeatedFlag []string

// String returns the collected values joined with commas. The flag package
// documents that String may be called on a zero-valued receiver such as a nil
// pointer, so a nil receiver yields the empty string instead of panicking.
func (f *repeatedFlag) String() string {
	if f == nil {
		return ""
	}
	return strings.Join(*f, ",")
}

// Set appends value to the collected values. It never returns an error.
func (f *repeatedFlag) Set(value string) error {
	*f = append(*f, value)
	return nil
}
|
||||
|
||||
// getenv returns the value of the environment variable key with surrounding
// whitespace removed. If the variable is unset or blank, fallback is returned.
func getenv(key, fallback string) string {
	trimmed := strings.TrimSpace(os.Getenv(key))
	if trimmed == "" {
		return fallback
	}
	return trimmed
}
|
||||
|
||||
// getenvBool interprets the environment variable key as a boolean. After
// trimming whitespace and lower-casing, "1", "true", "yes", "y" and "on" mean
// true, and "0", "false", "no", "n" and "off" mean false. Any other value,
// including an unset or empty variable, yields fallback.
func getenvBool(key string, fallback bool) bool {
	normalized := strings.ToLower(strings.TrimSpace(os.Getenv(key)))
	result := fallback
	switch normalized {
	case "1", "true", "yes", "y", "on":
		result = true
	case "0", "false", "no", "n", "off":
		result = false
	}
	return result
}
|
||||
|
||||
// getenvInt returns the environment variable key parsed as a decimal integer,
// ignoring surrounding whitespace. fallback is returned when the variable is
// unset, empty, not a number, or has anything after the number (for example
// "15s" or "1.5"), so a malformed value is never silently half-parsed.
func getenvInt(key string, fallback int) int {
	// strings.Reader supports UnreadRune, so whatever the scanner does not
	// consume is still counted by Len afterwards.
	input := strings.NewReader(strings.TrimSpace(os.Getenv(key)))
	var out int
	if _, err := fmt.Fscanf(input, "%d", &out); err != nil || input.Len() != 0 {
		return fallback
	}
	return out
}
|
||||
|
||||
// getenvInt64 returns the environment variable key parsed as a decimal 64-bit
// integer, ignoring surrounding whitespace. fallback is returned when the
// variable is unset, empty, not a number, or has anything after the number, so
// a malformed value is never silently half-parsed.
func getenvInt64(key string, fallback int64) int64 {
	// strings.Reader supports UnreadRune, so whatever the scanner does not
	// consume is still counted by Len afterwards.
	input := strings.NewReader(strings.TrimSpace(os.Getenv(key)))
	var out int64
	if _, err := fmt.Fscanf(input, "%d", &out); err != nil || input.Len() != 0 {
		return fallback
	}
	return out
}
|
||||
|
||||
// getenvFloat returns the environment variable key parsed as a floating-point
// number, ignoring surrounding whitespace. fallback is returned when the
// variable is unset, empty, not a number, or has anything after the number
// (for example "0.5abc"), so a malformed value is never silently half-parsed.
func getenvFloat(key string, fallback float64) float64 {
	// strings.Reader supports UnreadRune, so whatever the scanner does not
	// consume is still counted by Len afterwards.
	input := strings.NewReader(strings.TrimSpace(os.Getenv(key)))
	var out float64
	if _, err := fmt.Fscanf(input, "%f", &out); err != nil || input.Len() != 0 {
		return fallback
	}
	return out
}
|
||||
|
||||
// shellJoin renders args as a single space-separated command line. An
// argument that contains a space, a tab, or a single or double quote is
// wrapped in double quotes with embedded double quotes backslash-escaped. An
// empty argument is rendered as "" so that it stays visible as its own
// argument instead of vanishing from the output. No other characters are
// escaped.
func shellJoin(args []string) string {
	parts := make([]string, 0, len(args))
	for _, arg := range args {
		if arg == "" || strings.ContainsAny(arg, " \t\"'") {
			arg = `"` + strings.ReplaceAll(arg, `"`, `\"`) + `"`
		}
		parts = append(parts, arg)
	}
	return strings.Join(parts, " ")
}
|
||||
|
||||
// usage writes the rap-host-agent command synopsis to standard error: a
// "usage:" header followed by one example invocation per listed subcommand.
func usage() {
|
||||
fmt.Fprintln(os.Stderr, `usage:
|
||||
rap-host-agent install -profile-url URL -install-token TOKEN [-node-name NAME] [docker options]
|
||||
rap-host-agent install -backend-url URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options]
|
||||
rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options]
|
||||
rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options]
|
||||
rap-host-agent install-updater -backend-url URL -cluster-id ID -state-dir DIR -container-name NAME
|
||||
rap-host-agent update-host-agent -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent update-host-agent-loop -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent update -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent update-loop -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent status [-container-name NAME]`)
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -78,6 +78,202 @@ func TestLoadSyntheticMeshConfigPrefersScopedFile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticMeshConfigAuthorityHashUsesRawConfigPayload(t *testing.T) {
|
||||
raw := json.RawMessage(`{
|
||||
"enabled": true,
|
||||
"schema_version": "c18z-test.synthetic.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"local_node_id": "node-a",
|
||||
"authority_required": true,
|
||||
"cluster_authority": {"schema_version":"rap.cluster_authority.v1"},
|
||||
"authority_payload": {"ignored": true},
|
||||
"authority_signature": {"ignored": true},
|
||||
"config_version": "config-1",
|
||||
"peer_endpoints": {},
|
||||
"routes": [],
|
||||
"production_forwarding": true,
|
||||
"future_backend_field": {"must_remain_hash_visible": true}
|
||||
}`)
|
||||
var remote client.SyntheticMeshConfig
|
||||
if err := json.Unmarshal(raw, &remote); err != nil {
|
||||
t.Fatalf("unmarshal synthetic config: %v", err)
|
||||
}
|
||||
var unsigned map[string]json.RawMessage
|
||||
if err := json.Unmarshal(raw, &unsigned); err != nil {
|
||||
t.Fatalf("unmarshal unsigned map: %v", err)
|
||||
}
|
||||
delete(unsigned, "authority_payload")
|
||||
delete(unsigned, "authority_signature")
|
||||
unsignedRaw, err := json.Marshal(unsigned)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal unsigned map: %v", err)
|
||||
}
|
||||
want, err := agentauthority.HashRaw(unsignedRaw)
|
||||
if err != nil {
|
||||
t.Fatalf("hash unsigned map: %v", err)
|
||||
}
|
||||
got, err := syntheticMeshConfigAuthorityHash(remote)
|
||||
if err != nil {
|
||||
t.Fatalf("hash synthetic config: %v", err)
|
||||
}
|
||||
if got != want {
|
||||
t.Fatalf("hash = %s, want raw-preserving hash %s", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneConsumesRemediationCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
|
||||
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
|
||||
CommandID: "cmd-1",
|
||||
Action: "prefer_alternate_route",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ServiceClass: "vpn_packets",
|
||||
PrimaryRouteID: "route-primary",
|
||||
ReplacementRouteID: "route-alternate",
|
||||
Reason: "authorized_alternate_route_available",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 1 {
|
||||
t.Fatalf("decisions = %+v, want one remediation decision", decisions)
|
||||
}
|
||||
decision := decisions[0]
|
||||
if decision.RouteID != "route-primary" ||
|
||||
decision.ReplacementRouteID != "route-alternate" ||
|
||||
decision.RebuildStatus != "applied" ||
|
||||
decision.DecisionSource != "service_channel_remediation_command" ||
|
||||
decision.RebuildRequestID != "cmd-1" {
|
||||
t.Fatalf("unexpected remediation decision: %+v", decision)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneConsumesRebuildRouteCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
|
||||
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
|
||||
CommandID: "cmd-rebuild",
|
||||
Action: "rebuild_route",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ServiceClass: "vpn_packets",
|
||||
PrimaryRouteID: "route-primary",
|
||||
Reason: "route_feedback_recommends_rebuild",
|
||||
GuardStatus: "allowed",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 1 {
|
||||
t.Fatalf("decisions = %+v, want one rebuild remediation decision", decisions)
|
||||
}
|
||||
decision := decisions[0]
|
||||
if decision.RouteID != "route-primary" ||
|
||||
decision.RebuildStatus != "pending_degraded_fallback" ||
|
||||
decision.DecisionSource != "service_channel_remediation_command" ||
|
||||
decision.RebuildRequestID != "cmd-rebuild" {
|
||||
t.Fatalf("unexpected rebuild remediation decision: %+v", decision)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneRejectsGuardedRemediationCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
|
||||
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
|
||||
CommandID: "cmd-guarded",
|
||||
Action: "prefer_alternate_route",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ServiceClass: "vpn_packets",
|
||||
PrimaryRouteID: "route-primary",
|
||||
ReplacementRouteID: "route-outside-policy",
|
||||
GuardStatus: "rejected",
|
||||
GuardReason: "replacement_exit_outside_signed_pool_policy",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 0 {
|
||||
t.Fatalf("guarded remediation command must not reach route-manager: %+v", decisions)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneKeepsExplicitRemediationCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{{
|
||||
RouteID: "route-primary",
|
||||
ReplacementRouteID: "route-alternate",
|
||||
RebuildRequestID: "feedback-rebuild",
|
||||
RebuildStatus: "applied",
|
||||
RebuildReason: "service_channel_feedback_rebuild_applied_to_alternate",
|
||||
DecisionSource: "service_channel_feedback_replacement",
|
||||
Generation: "gen-1",
|
||||
}}}
|
||||
decisions := routeManagerDecisionsFromControlPlane(report, []client.FabricServiceChannelRemediationCommand{{
|
||||
CommandID: "cmd-1",
|
||||
Action: "prefer_alternate_route",
|
||||
PrimaryRouteID: "route-primary",
|
||||
ReplacementRouteID: "route-alternate",
|
||||
Reason: "authorized_alternate_route_available",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 2 {
|
||||
t.Fatalf("decisions = %+v, want feedback and explicit remediation command", decisions)
|
||||
}
|
||||
if decisions[1].DecisionSource != "service_channel_remediation_command" || decisions[1].RebuildRequestID != "cmd-1" {
|
||||
t.Fatalf("remediation command was not kept as explicit route-manager input: %+v", decisions)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneSkipsCommandAlreadyResolvedByPlanner(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{{
|
||||
RouteID: "route-primary",
|
||||
ReplacementRouteID: "route-planner",
|
||||
RebuildRequestID: "cmd-rebuild",
|
||||
RebuildStatus: "applied",
|
||||
RebuildReason: "remediation_rebuild_applied_to_alternate",
|
||||
DecisionSource: "service_channel_remediation_command",
|
||||
Generation: "config-c18z77",
|
||||
}}}
|
||||
decisions := routeManagerDecisionsFromControlPlane(report, []client.FabricServiceChannelRemediationCommand{{
|
||||
CommandID: "cmd-rebuild",
|
||||
Action: "rebuild_route",
|
||||
PrimaryRouteID: "route-primary",
|
||||
Reason: "route_feedback_recommends_rebuild",
|
||||
GuardStatus: "allowed",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 1 {
|
||||
t.Fatalf("decisions = %+v, want only planner-resolved decision", decisions)
|
||||
}
|
||||
if decisions[0].RebuildStatus != "applied" || decisions[0].ReplacementRouteID != "route-planner" {
|
||||
t.Fatalf("unexpected planner decision: %+v", decisions[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricServiceChannelAccessStatsReportsDataPlaneViolations(t *testing.T) {
|
||||
stats := newFabricServiceChannelAccessStats()
|
||||
stats.Observe(mesh.FabricServiceChannelAccessLogEntry{
|
||||
Event: "fabric_service_channel_data_plane_violation",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ResourceID: "vpn-1",
|
||||
BackendRelayPolicy: "disabled",
|
||||
ViolationStatus: "fabric_route_send_failed_backend_fallback_blocked",
|
||||
ViolationReason: "mesh synthetic route not found",
|
||||
OccurredAt: time.Unix(10, 0).UTC(),
|
||||
})
|
||||
report := stats.Report(time.Unix(20, 0).UTC())
|
||||
if report["backend_fallback_blocked"] != int64(1) ||
|
||||
report["fabric_route_send_failure"] != int64(1) ||
|
||||
report["last_data_plane_violation_status"] != "fabric_route_send_failed_backend_fallback_blocked" ||
|
||||
report["last_data_plane_violation_reason"] != "mesh synthetic route not found" {
|
||||
t.Fatalf("unexpected violation report: %+v", report)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyEnrollmentBootstrapAcceptsSignedApproval(t *testing.T) {
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
@@ -134,6 +330,134 @@ func TestVerifyEnrollmentBootstrapAcceptsSignedApproval(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyControlPlaneSyntheticMeshConfigAcceptsSignedServiceChannelFeedback(t *testing.T) {
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("generate key: %v", err)
|
||||
}
|
||||
publicKeyB64 := base64.StdEncoding.EncodeToString(publicKey)
|
||||
fingerprint := agentauthority.Fingerprint(publicKey)
|
||||
now := time.Now().UTC()
|
||||
remote := client.SyntheticMeshConfig{
|
||||
Enabled: true,
|
||||
SchemaVersion: "c17z18.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
AuthorityRequired: true,
|
||||
ClusterAuthority: &client.ClusterAuthorityDescriptor{
|
||||
SchemaVersion: agentauthority.AuthoritySchemaVersion,
|
||||
ClusterID: "cluster-1",
|
||||
AuthorityState: "authoritative",
|
||||
KeyAlgorithm: agentauthority.AlgorithmEd25519,
|
||||
PublicKey: publicKeyB64,
|
||||
PublicKeyFingerprint: fingerprint,
|
||||
},
|
||||
ConfigVersion: "config-v1",
|
||||
PeerDirectoryVersion: "config-v1",
|
||||
PolicyVersion: "config-v1",
|
||||
PeerEndpoints: map[string]string{},
|
||||
PeerEndpointCandidates: map[string][]client.PeerEndpointCandidate{},
|
||||
PeerDirectory: []client.PeerDirectoryEntry{},
|
||||
RecoverySeeds: []client.PeerRecoverySeed{},
|
||||
RendezvousLeases: []client.PeerRendezvousLease{},
|
||||
RoutePathDecisions: &client.RoutePathDecisionReport{
|
||||
SchemaVersion: "c17z18.route_path_decisions.v1",
|
||||
DecisionMode: "control_plane_effective_path_from_relay_policy_and_service_channel_feedback",
|
||||
Generation: "config-v1",
|
||||
DecisionCount: 1,
|
||||
ReplacementDecisionCount: 1,
|
||||
RebuildRequestCount: 1,
|
||||
RebuildAppliedCount: 1,
|
||||
ControlPlaneOnly: true,
|
||||
Decisions: []client.RoutePathDecision{{
|
||||
DecisionID: "route-ab-path-node-a-service-channel-feedback",
|
||||
RouteID: "route-ab",
|
||||
ReplacementRouteID: "route-ac",
|
||||
RebuildRequestID: "route-ab-node-a-config-v1-rebuild",
|
||||
RebuildStatus: "applied",
|
||||
RebuildReason: "service_channel_feedback_rebuild_applied_to_alternate",
|
||||
RebuildAttempt: 2,
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
OriginalHops: []string{"node-a", "node-b"},
|
||||
EffectiveHops: []string{"node-a", "node-c", "node-b"},
|
||||
LocalRole: "source",
|
||||
DecisionSource: "service_channel_feedback_replacement",
|
||||
Generation: "config-v1",
|
||||
PathScore: 1000,
|
||||
ScoreReasons: []string{"service_channel_rebuild_applied"},
|
||||
ControlPlaneOnly: true,
|
||||
ExpiresAt: now.Add(30 * time.Second),
|
||||
}},
|
||||
},
|
||||
ServiceChannelFeedback: &client.FabricServiceChannelFeedbackReport{
|
||||
SchemaVersion: "c18n.fabric_service_channel_route_feedback_report.v1",
|
||||
GeneratedAt: now,
|
||||
FeedbackMaxAgeSeconds: 30,
|
||||
ObservationCount: 1,
|
||||
FencedRouteCount: 1,
|
||||
Observations: []client.FabricServiceChannelFeedbackObservation{{
|
||||
ClusterID: "cluster-1",
|
||||
ReporterNodeID: "node-a",
|
||||
RouteID: "route-ab",
|
||||
ServiceClass: "vpn_packets",
|
||||
FeedbackStatus: "fenced",
|
||||
ScoreAdjustment: -1000,
|
||||
Reasons: []string{"route_rebuild_recommended"},
|
||||
ConsecutiveFailures: 2,
|
||||
Payload: json.RawMessage(`{"route_rebuild_recommended":true}`),
|
||||
ObservedAt: now,
|
||||
ExpiresAt: now.Add(30 * time.Second),
|
||||
}},
|
||||
},
|
||||
MeshListener: nil,
|
||||
Routes: []client.SyntheticMeshRouteConfig{},
|
||||
ProductionForwarding: false,
|
||||
}
|
||||
configHash, err := syntheticMeshConfigAuthorityHash(remote)
|
||||
if err != nil {
|
||||
t.Fatalf("config hash: %v", err)
|
||||
}
|
||||
payload, err := json.Marshal(controlPlaneMeshConfigAuthorityPayload{
|
||||
SchemaVersion: "rap.cluster.mesh_config_snapshot.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
ConfigVersion: "config-v1",
|
||||
ConfigSHA256: configHash,
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Hour),
|
||||
ControlPlaneOnly: true,
|
||||
ProductionForwarding: false,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal payload: %v", err)
|
||||
}
|
||||
canonical, err := agentauthority.CanonicalJSON(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("canonical json: %v", err)
|
||||
}
|
||||
remote.AuthorityPayload = payload
|
||||
remote.AuthoritySignature = &client.ClusterSignature{
|
||||
SchemaVersion: agentauthority.SignatureSchemaVersion,
|
||||
Algorithm: agentauthority.AlgorithmEd25519,
|
||||
KeyFingerprint: fingerprint,
|
||||
Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)),
|
||||
SignedAt: now,
|
||||
}
|
||||
|
||||
err = verifyControlPlaneSyntheticMeshConfig(remote, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
ClusterAuthorityPublicKey: publicKeyB64,
|
||||
ClusterAuthorityFingerprint: fingerprint,
|
||||
}, config.Config{})
|
||||
if err != nil {
|
||||
t.Fatalf("verify control-plane synthetic mesh config: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
|
||||
bootstrap := client.NodeBootstrap{
|
||||
NodeID: "node-1",
|
||||
@@ -155,6 +479,54 @@ func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureApprovedIdentityKeepsPollingWhenTimeoutDisabled(t *testing.T) {
|
||||
var bootstrapPolls int
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.URL.Path == "/node-agents/enroll":
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"status": "pending",
|
||||
"join_request": map[string]any{"id": "join-request-1"},
|
||||
})
|
||||
case r.URL.Path == "/node-agents/enrollments/join-request-1/bootstrap":
|
||||
bootstrapPolls++
|
||||
if bootstrapPolls >= 2 {
|
||||
cancel()
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"status": "pending",
|
||||
"join_request": map[string]any{"id": "join-request-1"},
|
||||
})
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
dir := t.TempDir()
|
||||
identity, err := state.LoadOrCreate(dir, "cluster-1", "node-a")
|
||||
if err != nil {
|
||||
t.Fatalf("load identity: %v", err)
|
||||
}
|
||||
_, err = ensureApprovedIdentity(ctx, config.Config{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-token",
|
||||
NodeName: "node-a",
|
||||
StateDir: dir,
|
||||
EnrollmentPollInterval: time.Millisecond,
|
||||
EnrollmentPollTimeout: 0,
|
||||
}, identity, client.New(server.URL))
|
||||
if err == nil || !strings.Contains(err.Error(), "context canceled") {
|
||||
t.Fatalf("ensureApprovedIdentity err = %v, want context canceled", err)
|
||||
}
|
||||
if bootstrapPolls < 2 {
|
||||
t.Fatalf("bootstrap polls = %d, want at least 2", bootstrapPolls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticQualityScoreIsBounded(t *testing.T) {
|
||||
cases := []struct {
|
||||
latency int
|
||||
@@ -209,6 +581,168 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadReportsMeshListenerFailureWithoutKillingHeartbeat(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC)
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshConnectivityMode: "private_lan",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, &syntheticMeshState{
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
ConfiguredListenAddr: ":19131",
|
||||
ListenPortMode: "manual",
|
||||
Status: "listen_failed",
|
||||
InboundReachability: "unavailable",
|
||||
ControlPlaneReachable: true,
|
||||
OneWayConnectivity: true,
|
||||
FailureReason: "bind_failed",
|
||||
FailureError: "listen tcp :19131: bind: address already in use",
|
||||
PortConflict: true,
|
||||
},
|
||||
}, now)
|
||||
|
||||
report, ok := payload.Metadata["mesh_listener_report"].(meshListenerReport)
|
||||
if !ok {
|
||||
t.Fatalf("mesh listener report missing: %+v", payload.Metadata)
|
||||
}
|
||||
if payload.HealthStatus != "warning" || report.Status != "listen_failed" || !report.PortConflict {
|
||||
t.Fatalf("unexpected listener health report: status=%s report=%+v", payload.HealthStatus, report)
|
||||
}
|
||||
if payload.Capabilities["mesh_listener_diagnostics"] != true || payload.Capabilities["mesh_one_way_connectivity"] != true {
|
||||
t.Fatalf("listener capabilities missing: %+v", payload.Capabilities)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAdvertisedEndpointCandidatesPreferManualEndpoints(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC)
|
||||
candidates, err := advertisedEndpointCandidates(config.Config{
|
||||
MeshAdvertiseEndpointsJSON: `[{"endpoint_id":"node-a-json","node_id":"node-a","transport":"direct_http","address":"http://10.10.10.10:19131","priority":12,"connectivity_mode":"private_lan","reachability":"private"}]`,
|
||||
MeshAdvertiseEndpoint: "http://203.0.113.10:19131",
|
||||
MeshAdvertiseTransport: "direct_http",
|
||||
MeshConnectivityMode: "direct",
|
||||
MeshNATType: "port_restricted",
|
||||
MeshRegion: "edge",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, nil, now)
|
||||
if err != nil {
|
||||
t.Fatalf("advertised endpoint candidates failed: %v", err)
|
||||
}
|
||||
if len(candidates) != 2 {
|
||||
t.Fatalf("expected two manual candidates, got %d: %+v", len(candidates), candidates)
|
||||
}
|
||||
if candidates[0].Address != "http://203.0.113.10:19131" || candidates[0].Priority != 10 {
|
||||
t.Fatalf("explicit advertise endpoint must win: %+v", candidates)
|
||||
}
|
||||
if candidates[1].Address != "http://10.10.10.10:19131" || candidates[1].Priority != 12 {
|
||||
t.Fatalf("json candidate order mismatch: %+v", candidates)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkInterfaceClassificationSkipsContainerNoise(t *testing.T) {
|
||||
tests := map[string]string{
|
||||
"ens160": "physical",
|
||||
"wg0": "vpn",
|
||||
"tailscale0": "vpn",
|
||||
"docker0": "container",
|
||||
"br-a1b2c3d4": "container",
|
||||
"vethabc123": "container",
|
||||
}
|
||||
for name, want := range tests {
|
||||
if got := classifyNetworkInterface(name); got != want {
|
||||
t.Fatalf("classifyNetworkInterface(%q)=%q, want %q", name, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadTreatsOutboundOnlyListenerFailureAsOneWayConnectivity(t *testing.T) {
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshConnectivityMode: "outbound_only",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, &syntheticMeshState{
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
ConfiguredListenAddr: ":19131",
|
||||
ListenPortMode: "manual",
|
||||
Status: "listen_failed",
|
||||
InboundReachability: "unavailable",
|
||||
ControlPlaneReachable: true,
|
||||
OneWayConnectivity: true,
|
||||
FailureReason: "bind_failed",
|
||||
},
|
||||
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
|
||||
|
||||
if payload.HealthStatus != "healthy" {
|
||||
t.Fatalf("HealthStatus = %q, want healthy for outbound-only listener failure", payload.HealthStatus)
|
||||
}
|
||||
report, ok := payload.Metadata["mesh_outbound_session_report"].(meshOutboundSessionReport)
|
||||
if !ok {
|
||||
t.Fatalf("mesh outbound session report missing: %+v", payload.Metadata)
|
||||
}
|
||||
if report.Status != "ready" || !report.UsableForInboundControl || report.ListenerStatus != "listen_failed" {
|
||||
t.Fatalf("unexpected outbound session report: %+v", report)
|
||||
}
|
||||
if payload.Capabilities["mesh_outbound_control_session"] != true ||
|
||||
payload.Capabilities["mesh_reverse_control_channel_contract"] != true {
|
||||
t.Fatalf("outbound session capabilities missing: %+v", payload.Capabilities)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadReportsMeshConfigLoadFailureWithoutDroppingPresence(t *testing.T) {
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshConnectivityMode: "private_lan",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, &syntheticMeshState{
|
||||
ConfigLoadError: "control-plane synthetic mesh config unavailable",
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
ConfiguredListenAddr: ":19131",
|
||||
ListenPortMode: "manual",
|
||||
Status: "listening",
|
||||
InboundReachability: "private",
|
||||
ControlPlaneReachable: true,
|
||||
},
|
||||
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
|
||||
|
||||
report, ok := payload.Metadata["mesh_outbound_session_report"].(meshOutboundSessionReport)
|
||||
if !ok {
|
||||
t.Fatalf("mesh outbound session report missing: %+v", payload.Metadata)
|
||||
}
|
||||
if payload.HealthStatus != "warning" || report.Status != "degraded" || report.ConfigLoadError == "" {
|
||||
t.Fatalf("unexpected config-load diagnostic heartbeat: health=%s report=%+v", payload.HealthStatus, report)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOutboundSessionReportTreatsListeningPrivateLANAsUsable(t *testing.T) {
|
||||
report := meshOutboundSessionReportFromState(config.Config{
|
||||
BackendURL: "http://control/api/v1",
|
||||
MeshConnectivityMode: "private_lan",
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
}, &syntheticMeshState{
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
Status: "listening",
|
||||
InboundReachability: reachabilityFromConnectivityMode("private_lan"),
|
||||
},
|
||||
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
|
||||
|
||||
if !report.UsableForInboundControl {
|
||||
t.Fatalf("listening private LAN listener must be usable: %+v", report)
|
||||
}
|
||||
if reachabilityFromConnectivityMode("private_lan") != "private" {
|
||||
t.Fatalf("private_lan reachability mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadReportsMultipleMeshEndpoints(t *testing.T) {
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshAdvertiseEndpointsJSON: `[{
|
||||
@@ -1050,17 +1584,36 @@ func TestProductionEnvelopeObservationSinkFromConfigCreatesBoundedSink(t *testin
|
||||
func TestProductionForwardingLogStateDistinguishesGateFromRuntime(t *testing.T) {
|
||||
gateEnabled, runtimeEnabled := productionForwardingLogState(config.Config{
|
||||
MeshProductionForwardingEnabled: true,
|
||||
})
|
||||
}, false)
|
||||
if !gateEnabled {
|
||||
t.Fatal("gateEnabled = false, want true")
|
||||
}
|
||||
if !runtimeEnabled {
|
||||
t.Fatal("runtimeEnabled = false, want true")
|
||||
}
|
||||
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{})
|
||||
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{}, false)
|
||||
if gateEnabled || runtimeEnabled {
|
||||
t.Fatalf("default log state = gate:%t runtime:%t, want false/false", gateEnabled, runtimeEnabled)
|
||||
}
|
||||
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{}, true)
|
||||
if !gateEnabled || !runtimeEnabled {
|
||||
t.Fatalf("signed control-plane log state = gate:%t runtime:%t, want true/true", gateEnabled, runtimeEnabled)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshLinkStatusFromPeerProbeMapsDeferredForLatestLinks(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
mesh.PeerConnectionProbeReachable: "reachable",
|
||||
mesh.PeerConnectionProbeUnreachable: "unreachable",
|
||||
mesh.PeerConnectionProbeDeferred: "degraded",
|
||||
mesh.PeerConnectionProbeSkipped: "unknown",
|
||||
"unexpected": "unknown",
|
||||
}
|
||||
for input, want := range cases {
|
||||
if got := meshLinkStatusFromPeerProbe(input); got != want {
|
||||
t.Fatalf("meshLinkStatusFromPeerProbe(%q) = %q, want %q", input, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogProductionObservationSinkMetricsToleratesNilState(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user