1
This commit is contained in:
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
@@ -58,6 +59,14 @@ func main() {
|
||||
if err := runUpdateLoop(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-loop failed: %v", err)
|
||||
}
|
||||
case "monitor-loop":
|
||||
if err := runMonitorLoop(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("monitor-loop failed: %v", err)
|
||||
}
|
||||
case "monitor-once":
|
||||
if err := runMonitorOnce(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("monitor-once failed: %v", err)
|
||||
}
|
||||
case "install-updater":
|
||||
if err := runInstallUpdater(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install-updater failed: %v", err)
|
||||
@@ -288,6 +297,9 @@ func runInstall(ctx context.Context, args []string) error {
|
||||
return err
|
||||
}
|
||||
fmt.Print(result.Unit)
|
||||
if result.MonitorUnit != "" {
|
||||
fmt.Print(result.MonitorUnit)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -304,7 +316,7 @@ func runInstall(ctx context.Context, args []string) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t\n", serviceResult.UnitName, serviceResult.UnitPath, serviceResult.BinaryPath, serviceResult.Started)
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t monitor_service=%s\n", serviceResult.UnitName, serviceResult.UnitPath, serviceResult.BinaryPath, serviceResult.Started, serviceResult.MonitorUnitName)
|
||||
}
|
||||
fmt.Println("next: approve the join request in the platform admin panel, then the node-agent will finish bootstrap and start heartbeats")
|
||||
return nil
|
||||
@@ -429,6 +441,75 @@ func runUpdateLoop(ctx context.Context, args []string) error {
|
||||
return (hostagent.DockerManager{}).RunUpdateLoop(ctx, cfg)
|
||||
}
|
||||
|
||||
func runMonitorLoop(ctx context.Context, args []string) error {
|
||||
cfg, err := parseMonitor(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return hostagent.RunMonitorLoop(ctx, cfg)
|
||||
}
|
||||
|
||||
func runMonitorOnce(ctx context.Context, args []string) error {
|
||||
cfg, err := parseMonitor(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cfg.MaxRuns = 1
|
||||
result := hostagent.RunMonitorOnce(ctx, cfg)
|
||||
if err := json.NewEncoder(os.Stdout).Encode(result); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseMonitor(args []string) (hostagent.MonitorConfig, error) {
|
||||
fs := flag.NewFlagSet("monitor-loop", flag.ContinueOnError)
|
||||
cfg := hostagent.MonitorConfig{}
|
||||
var intervalSeconds int
|
||||
var initialDelaySeconds int
|
||||
var maxRuns int
|
||||
var restartCooldownSeconds int
|
||||
var staleRestartingSeconds int
|
||||
var tmpMinAgeMinutes int
|
||||
watchContainers := repeatedFlag{}
|
||||
fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL used for monitor status reports.")
|
||||
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
|
||||
fs.StringVar(&cfg.Product, "product", getenv("RAP_MONITOR_PRODUCT", hostagent.DefaultMonitorProduct), "Status product name.")
|
||||
fs.StringVar(&cfg.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version.")
|
||||
fs.StringVar(&cfg.DockerBinary, "docker-binary", getenv("RAP_DOCKER_BINARY", "docker"), "Docker CLI binary.")
|
||||
fs.StringVar(&cfg.DiskPath, "disk-path", getenv("RAP_MONITOR_DISK_PATH", "/"), "Filesystem path used for disk usage checks.")
|
||||
fs.StringVar(&cfg.TmpDir, "tmp-dir", getenv("RAP_MONITOR_TMP_DIR", "/tmp"), "Temporary directory cleaned under pressure.")
|
||||
fs.StringVar(&cfg.StatusFile, "status-file", getenv("RAP_MONITOR_STATUS_FILE", ""), "Optional JSON status file written after every run.")
|
||||
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_MONITOR_INTERVAL_SECONDS", hostagent.DefaultMonitorIntervalSeconds), "Seconds between monitor checks.")
|
||||
fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_MONITOR_INITIAL_DELAY_SECONDS", 0), "Seconds to wait before first monitor check.")
|
||||
fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_MONITOR_MAX_RUNS", 0), "Maximum monitor iterations. Use 0 to run until stopped.")
|
||||
fs.IntVar(&cfg.DiskWarnPercent, "disk-warn-percent", getenvInt("RAP_MONITOR_DISK_WARN_PERCENT", hostagent.DefaultMonitorDiskWarnPercent), "Disk used percent that reports warning.")
|
||||
fs.IntVar(&cfg.DiskCleanupPercent, "disk-cleanup-percent", getenvInt("RAP_MONITOR_DISK_CLEANUP_PERCENT", hostagent.DefaultMonitorDiskCleanupPercent), "Disk used percent that triggers cleanup.")
|
||||
fs.IntVar(&cfg.DiskCriticalPercent, "disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.")
|
||||
fs.IntVar(&restartCooldownSeconds, "restart-cooldown-seconds", getenvInt("RAP_MONITOR_RESTART_COOLDOWN_SECONDS", hostagent.DefaultMonitorRestartCooldownSec), "Minimum seconds between repeated restarts of the same target.")
|
||||
fs.IntVar(&staleRestartingSeconds, "stale-restarting-seconds", getenvInt("RAP_MONITOR_STALE_RESTARTING_SECONDS", hostagent.DefaultMonitorStaleRestartingSec), "Seconds after which docker restarting state is considered stuck.")
|
||||
fs.IntVar(&tmpMinAgeMinutes, "tmp-min-age-minutes", getenvInt("RAP_MONITOR_TMP_MIN_AGE_MINUTES", hostagent.DefaultMonitorTmpMinAgeMinutes), "Minimum age for /tmp rap-* and go-build* cleanup.")
|
||||
fs.BoolVar(&cfg.RestartContainers, "restart-containers", getenvBool("RAP_MONITOR_RESTART_CONTAINERS", true), "Start/restart watched containers when they are stopped, unhealthy, or stuck restarting.")
|
||||
fs.BoolVar(&cfg.CleanupDocker, "cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.")
|
||||
fs.Var(&watchContainers, "watch-container", "Docker container to watch and heal; may be repeated.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return hostagent.MonitorConfig{}, err
|
||||
}
|
||||
cfg.WatchContainers = watchContainers
|
||||
cfg.Interval = time.Duration(intervalSeconds) * time.Second
|
||||
cfg.InitialDelay = time.Duration(initialDelaySeconds) * time.Second
|
||||
cfg.MaxRuns = maxRuns
|
||||
cfg.RestartCooldown = time.Duration(restartCooldownSeconds) * time.Second
|
||||
cfg.StaleRestartingAfter = time.Duration(staleRestartingSeconds) * time.Second
|
||||
cfg.TmpMinAge = time.Duration(tmpMinAgeMinutes) * time.Minute
|
||||
cfg.Logf = func(format string, args ...any) {
|
||||
fmt.Printf(format+"\n", args...)
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func firstNonEmptyLocal(values ...string) string {
|
||||
for _, value := range values {
|
||||
if strings.TrimSpace(value) != "" {
|
||||
@@ -444,6 +525,8 @@ func runInstallUpdater(ctx context.Context, args []string) error {
|
||||
service := hostagent.UpdateServiceConfig{}
|
||||
var dryRun bool
|
||||
var selfUpdater bool
|
||||
var monitorEnabled bool
|
||||
monitorContainers := repeatedFlag{}
|
||||
fs.StringVar(&runtimeCfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&runtimeCfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&runtimeCfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
|
||||
@@ -456,6 +539,14 @@ func runInstallUpdater(ctx context.Context, args []string) error {
|
||||
fs.IntVar(&service.HealthTimeoutSec, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
|
||||
fs.StringVar(&service.BinaryInstallPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")
|
||||
fs.BoolVar(&selfUpdater, "self-updater-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
|
||||
fs.BoolVar(&monitorEnabled, "monitor-enabled", getenvBool("RAP_HOST_AGENT_MONITOR_ENABLED", true), "Install and start the local host monitor service.")
|
||||
fs.IntVar(&service.MonitorIntervalSec, "monitor-interval-seconds", getenvInt("RAP_MONITOR_INTERVAL_SECONDS", hostagent.DefaultMonitorIntervalSeconds), "Seconds between monitor checks.")
|
||||
fs.StringVar(&service.MonitorStatusFile, "monitor-status-file", getenv("RAP_MONITOR_STATUS_FILE", ""), "Optional JSON status file written by the monitor.")
|
||||
fs.IntVar(&service.MonitorDiskWarn, "monitor-disk-warn-percent", getenvInt("RAP_MONITOR_DISK_WARN_PERCENT", hostagent.DefaultMonitorDiskWarnPercent), "Disk used percent that reports warning.")
|
||||
fs.IntVar(&service.MonitorDiskCleanup, "monitor-disk-cleanup-percent", getenvInt("RAP_MONITOR_DISK_CLEANUP_PERCENT", hostagent.DefaultMonitorDiskCleanupPercent), "Disk used percent that triggers cleanup.")
|
||||
fs.IntVar(&service.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.")
|
||||
fs.BoolVar(&service.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.")
|
||||
fs.Var(&monitorContainers, "monitor-container", "Extra Docker container watched by monitor; may be repeated.")
|
||||
fs.BoolVar(&dryRun, "dry-run", false, "Print the systemd unit without installing it.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
@@ -465,6 +556,8 @@ func runInstallUpdater(ctx context.Context, args []string) error {
|
||||
service.DryRun = dryRun
|
||||
service.InstallSelfUpdater = selfUpdater
|
||||
service.SelfUpdateVersion = agent.Version
|
||||
service.InstallMonitor = monitorEnabled
|
||||
service.MonitorContainers = monitorContainers
|
||||
result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -474,9 +567,12 @@ func runInstallUpdater(ctx context.Context, args []string) error {
|
||||
if result.SelfUnit != "" {
|
||||
fmt.Print(result.SelfUnit)
|
||||
}
|
||||
if result.MonitorUnit != "" {
|
||||
fmt.Print(result.MonitorUnit)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t self_updater=%s\n", result.UnitName, result.UnitPath, result.BinaryPath, result.Started, result.SelfUnitName)
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t self_updater=%s monitor_service=%s\n", result.UnitName, result.UnitPath, result.BinaryPath, result.Started, result.SelfUnitName, result.MonitorUnitName)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -572,6 +668,7 @@ func parseInstall(args []string) (installCommandConfig, error) {
|
||||
var installToken string
|
||||
var autoUpdateEnabled bool
|
||||
autoUpdate := hostagent.UpdateServiceConfig{}
|
||||
monitorContainers := repeatedFlag{}
|
||||
fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
|
||||
@@ -591,6 +688,7 @@ func parseInstall(args []string) (installCommandConfig, error) {
|
||||
fs.BoolVar(&dryRun, "dry-run", false, "Print the docker command with secrets redacted.")
|
||||
fs.BoolVar(&autoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the local update-loop service.")
|
||||
fs.BoolVar(&autoUpdate.InstallSelfUpdater, "host-agent-self-update-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
|
||||
fs.BoolVar(&autoUpdate.InstallMonitor, "host-agent-monitor-enabled", getenvBool("RAP_HOST_AGENT_MONITOR_ENABLED", true), "Install and start the local host monitor service.")
|
||||
fs.StringVar(&autoUpdate.CurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
|
||||
fs.StringVar(&autoUpdate.SelfUpdateVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Initial host-agent binary version used by the self-updater.")
|
||||
fs.StringVar(&autoUpdate.Channel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
|
||||
@@ -599,6 +697,12 @@ func parseInstall(args []string) (installCommandConfig, error) {
|
||||
fs.Float64Var(&autoUpdate.Jitter, "auto-update-jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
|
||||
fs.IntVar(&autoUpdate.HealthTimeoutSec, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
|
||||
fs.StringVar(&autoUpdate.BinaryInstallPath, "auto-update-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")
|
||||
fs.IntVar(&autoUpdate.MonitorIntervalSec, "monitor-interval-seconds", getenvInt("RAP_MONITOR_INTERVAL_SECONDS", hostagent.DefaultMonitorIntervalSeconds), "Seconds between monitor checks.")
|
||||
fs.StringVar(&autoUpdate.MonitorStatusFile, "monitor-status-file", getenv("RAP_MONITOR_STATUS_FILE", ""), "Optional JSON status file written by the monitor.")
|
||||
fs.IntVar(&autoUpdate.MonitorDiskWarn, "monitor-disk-warn-percent", getenvInt("RAP_MONITOR_DISK_WARN_PERCENT", hostagent.DefaultMonitorDiskWarnPercent), "Disk used percent that reports warning.")
|
||||
fs.IntVar(&autoUpdate.MonitorDiskCleanup, "monitor-disk-cleanup-percent", getenvInt("RAP_MONITOR_DISK_CLEANUP_PERCENT", hostagent.DefaultMonitorDiskCleanupPercent), "Disk used percent that triggers cleanup.")
|
||||
fs.IntVar(&autoUpdate.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.")
|
||||
fs.BoolVar(&autoUpdate.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.")
|
||||
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
@@ -622,12 +726,14 @@ func parseInstall(args []string) (installCommandConfig, error) {
|
||||
fs.Var(&extraEnv, "env", "Extra KEY=VALUE env passed to node-agent container; may be repeated.")
|
||||
fs.Var(&extraRunArg, "docker-run-arg", "Extra raw docker run argument; may be repeated.")
|
||||
fs.Var(&imageArtifactURL, "image-artifact-url", "Docker image tar artifact URL to docker load before running; may be repeated.")
|
||||
fs.Var(&monitorContainers, "monitor-container", "Extra Docker container watched by monitor; may be repeated.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return installCommandConfig{}, err
|
||||
}
|
||||
cfg.ExtraEnv = extraEnv
|
||||
cfg.AdditionalDockerRunArgs = extraRunArg
|
||||
cfg.ImageArtifactURLs = append(cfg.ImageArtifactURLs, imageArtifactURL...)
|
||||
autoUpdate.MonitorContainers = monitorContainers
|
||||
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
|
||||
profile, err := hostagent.FetchDockerInstallProfile(context.Background(), hostagent.ProfileRequest{
|
||||
URL: profileURL,
|
||||
@@ -738,6 +844,8 @@ func usage() {
|
||||
rap-host-agent install-updater -backend-url URL -cluster-id ID -state-dir DIR -container-name NAME
|
||||
rap-host-agent update-host-agent -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent update-host-agent-loop -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent monitor-loop -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME
|
||||
rap-host-agent monitor-once -backend-url URL -cluster-id ID -state-dir DIR --watch-container NAME
|
||||
rap-host-agent update -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent update-loop -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent status [-container-name NAME]`)
|
||||
|
||||
Reference in New Issue
Block a user