Record project continuation changes

This commit is contained in:
2026-05-12 21:02:29 +03:00
parent 3059d1d7a3
commit 8f69d53193
339 changed files with 101111 additions and 1769 deletions
+5 -3
View File
@@ -1,4 +1,4 @@
FROM golang:1.23-bookworm AS build
FROM golang:1.25-bookworm AS build
WORKDIR /src
COPY agents/rap-node-agent/go.mod ./
@@ -6,8 +6,10 @@ RUN go mod download
COPY agents/rap-node-agent/ ./
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/rap-node-agent ./cmd/rap-node-agent
FROM gcr.io/distroless/static-debian12:nonroot
FROM debian:bookworm-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends ca-certificates iproute2 iptables procps \
&& rm -rf /var/lib/apt/lists/*
COPY --from=build /out/rap-node-agent /usr/local/bin/rap-node-agent
USER nonroot:nonroot
ENTRYPOINT ["/usr/local/bin/rap-node-agent"]
+304 -5
View File
@@ -66,6 +66,11 @@ Implemented:
- synthetic route-health route config refresh from Control Plane path
decisions
- route-health expected/observed effective path drift reporting
- host-agent Docker update plan executor with artifact checksum/size
verification, container replacement, health check, status reporting, and
rollback attempt
- host-agent update loop for service/timer placement
- host-agent binary self-update loop for the updater service itself
- maximum capacity guard for the local production observation sink
- panic-safe fail-closed production envelope observation wrapper
- explicit `4096` byte payload boundary for validated production
@@ -98,7 +103,7 @@ Not implemented yet:
- VPN runtime
- production workload supervision
- certificate issuance/rotation
- updater runtime
- in-agent native updater runtime
- privileged host route/firewall control
## Build
@@ -107,9 +112,237 @@ Not implemented yet:
cd agents\rap-node-agent
go test ./...
go build -o bin\rap-node-agent.exe .\cmd\rap-node-agent
go build -buildvcs=false -o bin\rap-host-agent.exe .\cmd\rap-host-agent
go build -o bin\mesh-live-smoke.exe .\cmd\mesh-live-smoke
```
## Docker Host Agent Bootstrap
`rap-host-agent` is the first host-level installer/updater boundary for Docker
placement. It does not join the mesh itself. It applies the cluster's install
intent locally by running the `rap-node-agent` container with a persistent host
state directory. On Linux it also installs a systemd `update-loop` service by
default, so nodes continue to update from Control Plane policy without operator
commands on each host.
Preferred profile-based install:
```bash
rap-host-agent install \
--profile-url https://control.example.com/api/v1 \
--cluster-id <cluster_id> \
--install-token <one_time_install_token> \
--node-name docker-node-1
```
The host-agent exchanges the install token for a signed control-plane install
profile, then applies Docker image, container, state-dir, mesh listen,
advertise, NAT/connectivity, and region settings from that profile. The same
token is then used by the node-agent for first enrollment, so the operator does
not need to manually pass cluster/runtime flags.
Manual install is still supported:
```bash
rap-host-agent install \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--join-token <raw_join_token> \
--node-name docker-node-1 \
--image rap-node-agent:dev-enrollment-bootstrap-smoke \
--container-name rap-node-agent-docker-node-1 \
--state-dir /var/lib/rap/nodes/docker-node-1 \
--network host \
--replace
```
The command creates or replaces only the local Docker container. The running
node-agent submits the join request, waits for owner approval, stores its
identity in the mounted state directory, and then sends heartbeats. Re-running
with `--replace` updates the container while preserving node identity. Pass
`--auto-update-enabled=false` only for lab/debug installs where the local
systemd updater must not be registered.
Useful checks:
```bash
rap-host-agent status --container-name rap-node-agent-docker-node-1
docker logs -f rap-node-agent-docker-node-1
```
For a node that was installed before the updater existed, register only the
local updater service without recreating the node-agent container:
```bash
rap-host-agent install-updater \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--state-dir /var/lib/rap/nodes/docker-node-1 \
--container-name rap-node-agent-docker-node-1
```
## Docker Host Agent Updates
`rap-host-agent update` applies one Control Plane update plan for an already
enrolled Docker node. The host-agent fetches the plan, downloads the selected
Docker image tar, verifies size and sha256, loads the image, recreates the
node-agent container from the existing Docker runtime settings, checks that the
container is running, and reports update phases back to the Control Plane.
```bash
rap-host-agent update \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--node-id <node_id> \
--container-name rap-node-agent-docker-node-1 \
--current-version 0.1.0-c17z26
```
`rap-host-agent update-loop` is the per-node executor and health boundary. It
does not need to poll for normal releases: the node-agent receives a
`rap.node_update_hint.v1` subscription hint from Control Plane or the assigned
update-cache service during heartbeat, writes `<state-dir>/update-trigger.json`,
and the host-agent wakes immediately. The interval is an emergency fallback for
missed hints, service migration, or a dead update-cache service; keep it long
in production. The loop keeps running after transient errors by default and
advances its in-process current version after a successful update so it does
not repeatedly apply the same plan. When started without `--node-id` it reads
`<state-dir>/identity.json` and waits until the approved node identity appears,
which lets the updater service start immediately during first install. It also
persists the last applied node-agent version in
`<state-dir>/host-update-state.json` so a service restart does not reapply an
already-installed release.
```bash
rap-host-agent update-loop \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--node-id <node_id> \
--container-name rap-node-agent-docker-node-1 \
--current-version 0.1.0-c17z26 \
--interval-seconds 21600 \
--jitter 0.15
```
Update-cache nodes are ordinary cluster nodes with the `update-cache` role.
Control Plane assigns a healthy update-cache node in the heartbeat hint. If the
assigned service disappears, the next hint returns `control_plane_fallback` or a
new service assignment; the local updater stays subscribed and only uses the
long fallback timer as a last resort.
`rap-host-agent update-host-agent-loop` updates the host-agent binary itself.
Only one global systemd unit is installed per Docker host:
`rap-host-agent-self-updater.service`. It uses one approved local node identity
to ask Control Plane for product `rap-host-agent` with install type
`linux_binary`, verifies the downloaded binary size and sha256, atomically
replaces `/usr/local/bin/rap-host-agent`, and reports status. The already
running process continues until systemd restarts it, while new invocations use
the new binary.
```bash
rap-host-agent update-host-agent-loop \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--state-dir /var/lib/rap/nodes/docker-node-1 \
--binary-path /usr/local/bin/rap-host-agent
```
## Windows Host Agent Bootstrap And Updates
Windows uses the same Control Plane install profile, but the local placement is
a Scheduled Task instead of Docker. In `--startup-mode auto` the installer first
tries an elevated `ONSTART` task running as `SYSTEM`; without admin rights it
falls back to a per-user `ONLOGON` task. The `ONSTART` mode starts after reboot
without an interactive user session. The `ONLOGON` fallback can only start after
that Windows user signs in.
```cmd
powershell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -UseBasicParsing 'http://control.example.com/downloads/rap-host-agent-windows-amd64.exe' -OutFile $env:TEMP\rap-host-agent.exe"
%TEMP%\rap-host-agent.exe install-windows --profile-url "http://control.example.com/api/v1" --cluster-id "<cluster_id>" --install-token "<one_time_install_token>" --node-name "office-win-1" --startup-mode "auto"
```
`install-windows` installs two tasks:
- `RAP Node Agent <node>` runs `rap-node-agent.exe`.
- `RAP Host Agent Updater <node>` runs `rap-host-agent update-loop` for product
`rap-node-agent`, install type `windows_service`, and replaces the local
`rap-node-agent.exe` from signed release artifacts.
During first bootstrap the updater can read `<state-dir>\identity.json` and
will wait until the join request is approved. For an already-enrolled Windows
node, prefer passing `--node-id` explicitly. That makes the updater wrapper
independent from the local identity file location and is required for repair of
older Windows installs where the node is already heartbeat-healthy but the
host-agent updater has no usable identity file.
```cmd
%TEMP%\rap-host-agent.exe install-windows --backend-url "http://control.example.com/api/v1" --cluster-id "<cluster_id>" --node-id "<node_id>" --node-name "office-win-1" --replace --startup-mode "auto" --auto-update-current-version "<current_version>"
```
The admin UI node details page generates a downloadable
`rap-repair-updater-<node>.cmd` for this repair path. It performs these steps:
- prints `schtasks /Query` diagnostics for the node-agent and updater tasks;
- prints the local `rap-*.exe*` files;
- downloads the current `rap-host-agent.exe`;
- reinstalls the Windows updater wrapper with `--node-id`;
- runs a foreground one-shot `update-loop --max-runs 1`;
- applies `rap-host-agent.exe.next` if the running host-agent could not replace
itself;
- restarts `RAP Host Agent Updater <node>`;
- prints post-repair diagnostics.
Expected successful updater reports in the admin panel:
```text
rap-node-agent <target> -> <target> plan/noop
rap-host-agent <target> -> <target> plan/noop
```
If the latest host-agent report is `apply/staged`, the new host-agent binary
was downloaded as `rap-host-agent.exe.next` but the running process still held
the old executable. End the updater task and run it once more, or rerun the generated
repair command:
```cmd
schtasks /End /TN "RAP Host Agent Updater office-win-1"
schtasks /Run /TN "RAP Host Agent Updater office-win-1"
```
### Windows Reboot / Autostart Verification
After installation or repair, verify the service survives a reboot:
1. Reboot the Windows host, or at minimum restart both scheduled tasks.
2. Confirm the tasks exist:
```cmd
schtasks /Query /TN "RAP Node Agent office-win-1" /V /FO LIST
schtasks /Query /TN "RAP Host Agent Updater office-win-1" /V /FO LIST
```
3. Confirm the admin panel shows:
```text
heartbeat: fresh
rap-node-agent: plan/noop
rap-host-agent: plan/noop
node version_state: current
```
Without admin rights, `install-windows --startup-mode auto` may fall back to
`user-task`. That node can still heartbeat and update after the user logs in,
but it will not start before logon after a reboot. Use an elevated shell for
production Windows nodes that must recover unattended.
Control Plane release artifacts for Windows must use:
- `product=rap-node-agent`
- `os=windows`
- `arch=amd64`
- `install_type=windows_service`
- `kind=binary`
## First Enrollment
Create a join token from the platform control plane, then run:
@@ -185,9 +418,18 @@ bounded `synthetic.echo` test-service runtime, and live synthetic HTTP endpoint.
It must not be used for RDP, VPN, file, video, or other production service
traffic.
`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. While service runtime
supervision is still a stub, the agent does not poll desired workloads or report
workload status unless this flag is explicitly enabled.
`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. When enabled, the agent
polls node-scoped desired workloads and reports status. The current bounded
runtime reports built-in `core-mesh` and `mesh-listener` services as running
when enabled, supports the native built-in `synthetic.echo` test workload, and
keeps unsupported production workloads such as RDP workers degraded until their
supervisors are implemented.
For Remote Workspace/RDP integration work, the native `rdp-worker` desired
workload supports only an explicit `adapter_contract_probe` mode. That mode
reports the remote-workspace adapter channel contract and requires Fabric
Service Channel as the future data plane; it does not start FreeRDP, create a
remote session, or carry production RDP payloads.
`RAP_MESH_LISTEN_ADDR` starts the C17E/C17F/C17G synthetic HTTP endpoint only when
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true`. `RAP_MESH_SYNTHETIC_CONFIG` points to
@@ -423,6 +665,63 @@ observations with expected/observed hops and drift status. This probes
replacement relay effective paths for control-plane health only and does not
enable service payload forwarding.
C17Z21 defines the portable inbound listener contract for Docker, Linux
service, Windows service, and future OS-specific node packages. The node-agent
does not stop when the mesh listen port cannot be bound. It keeps the outbound
Control Plane session alive and emits `c17z21.mesh_listener_report.v1` in
heartbeat metadata with configured address, effective address, listen mode,
listener status, inbound reachability, one-way connectivity, failure reason,
and port-conflict diagnostics.
`RAP_MESH_LISTEN_PORT_MODE` controls behavior:
- `manual`: bind exactly `RAP_MESH_LISTEN_ADDR`; on conflict report
`listen_failed` and wait for an operator/config change.
- `auto`: try `RAP_MESH_LISTEN_ADDR`; on conflict scan
`RAP_MESH_LISTEN_AUTO_PORT_START..RAP_MESH_LISTEN_AUTO_PORT_END` and report
`auto_rebound` when a free port is selected.
- `disabled`: do not open an inbound listener; the node is expected to be
outbound-only, relay/rendezvous, or Control Plane only.
For `RAP_MESH_CONNECTIVITY_MODE=outbound_only`, inbound listener failure is not
treated as node death. The heartbeat remains `healthy` with
`mesh_one_way_connectivity=true` and listener diagnostics. For direct/private
LAN modes, a listener failure degrades the node so the admin panel can show
that the node is alive but cannot accept inbound mesh traffic. Service payload
forwarding is still not enabled by this contract.
C17Z22 separates outbound Control Plane presence from inbound mesh
reachability. When synthetic mesh testing is enabled, every heartbeat includes
`c17z22.mesh_outbound_session_report.v1` with node-to-control-plane direction,
keepalive transport, listener conflict state, rendezvous/relay counters, and a
flag showing whether the current outbound session can be used as a reverse
control-channel contract. This is the portable basis for Docker, Linux service,
Windows service, and future packages where a node may be behind NAT or have no
stable inbound address. It is still control-plane telemetry only and does not
carry RDP/VPN/service payload traffic.
C17Z24 separates the listener bind address from advertised mesh endpoints. The
agent never advertises loopback addresses discovered from the local listener;
`127.0.0.1`/`::1` are test-only bind details, not cluster reachability data.
When the listener is active, the agent enumerates active non-loopback host
interfaces and reports usable endpoint candidates with interface metadata,
address family, reachability, NAT/connectivity hints, and priority. Container
bridge/veth interfaces and link-local addresses are filtered by default, while
physical and VPN-style interfaces are kept so different cluster segments can
choose the address that matches their network. Operator-provided
`RAP_MESH_ADVERTISE_ENDPOINT` or endpoint-candidate JSON remains authoritative
and is ranked ahead of auto-discovered addresses.
C17Z25 adds per-peer endpoint fallback probing to the control-plane mesh
manager. A node no longer treats the top-ranked endpoint candidate as the only
possible address for a peer. For each warm direct/private/corporate peer, the
manager probes the ranked candidate list until one `/mesh/v1/health` endpoint
responds or all direct candidates fail. Heartbeat metadata includes
`c17z25.mesh_peer_connection_manager_report.v1` with `probe_results`,
`selected_candidate_id`, `selected_endpoint`, and per-candidate success/failure
details. This is still control-plane health and address selection telemetry; it
does not forward RDP/VPN/service payloads.
Scoped synthetic config shape:
```json
@@ -480,7 +779,7 @@ Expected:
- The agent never assigns roles to itself.
- The agent reports capabilities only.
- Platform policy assigns roles.
- No RDP/VPN/production service traffic is carried by the C17A-C17Z18 staged
- No RDP/VPN/production service traffic is carried by the C17A-C17Z22 staged
mesh runtime.
- Production forwarding remains disabled by default and limited to
`fabric.control` when explicitly enabled.
@@ -0,0 +1,744 @@
package main
import (
"context"
"flag"
"fmt"
"log"
"os"
"os/signal"
"runtime"
"strings"
"syscall"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
)
// installCommandConfig groups the settings resolved for an install command:
// the node-agent runtime configuration plus the dry-run and auto-update
// options.
// NOTE(review): the consumer of this type is not visible in this part of the
// file; presumably it is filled in by the "install" subcommand — confirm.
type installCommandConfig struct {
// Runtime is the node-agent runtime configuration (type declared in
// internal/hostagent).
Runtime hostagent.RuntimeConfig
// DryRun presumably mirrors a --dry-run flag (resolve placement without
// installing) — confirm against the install command.
DryRun bool
// AutoUpdateEnabled presumably mirrors --auto-update-enabled — confirm.
AutoUpdateEnabled bool
// AutoUpdate carries the updater service settings (type declared in
// internal/hostagent).
AutoUpdate hostagent.UpdateServiceConfig
}
// main is the rap-host-agent entry point. It first gives a staged self-update
// the chance to be applied, then dispatches on the first command-line
// argument to the matching run* subcommand handler. A missing or unknown
// subcommand prints usage and exits with status 2; a handler error is logged
// as "<subcommand> failed: <err>" and terminates the process via log.Fatalf.
func main() {
	// Flags 0: log lines carry no date/time prefix.
	log.SetFlags(0)
	applyStagedSelfUpdate()

	if len(os.Args) < 2 {
		usage()
		os.Exit(2)
	}

	// The context is cancelled on Ctrl-C / SIGTERM so handlers can stop early.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	// exitOnError terminates the process with the given message format when a
	// subcommand handler reported an error; a nil error is a no-op.
	exitOnError := func(format string, err error) {
		if err == nil {
			return
		}
		log.Fatalf(format, err)
	}

	subcommand, rest := os.Args[1], os.Args[2:]
	switch subcommand {
	case "install":
		exitOnError("install failed: %v", runInstall(ctx, rest))
	case "install-windows":
		exitOnError("install-windows failed: %v", runInstallWindows(ctx, rest))
	case "install-linux":
		exitOnError("install-linux failed: %v", runInstallLinux(ctx, rest))
	case "status":
		exitOnError("status failed: %v", runStatus(ctx, rest))
	case "update":
		exitOnError("update failed: %v", runUpdate(ctx, rest))
	case "update-loop":
		exitOnError("update-loop failed: %v", runUpdateLoop(ctx, rest))
	case "install-updater":
		exitOnError("install-updater failed: %v", runInstallUpdater(ctx, rest))
	case "update-host-agent":
		exitOnError("update-host-agent failed: %v", runUpdateHostAgent(ctx, rest))
	case "update-host-agent-loop":
		exitOnError("update-host-agent-loop failed: %v", runUpdateHostAgentLoop(ctx, rest))
	default:
		usage()
		os.Exit(2)
	}
}
// applyStagedSelfUpdate promotes a staged replacement binary over the
// currently running executable, if one is present.
//
// The staged file is expected at "<executable>.next". When it exists and is a
// non-empty regular file, the current executable is moved aside to
// "<executable>.old", the staged file is renamed into place, and the backup
// is removed. If the second rename fails the backup is moved back so the
// original binary stays in place.
//
// The whole operation is best effort: every failure is ignored and the
// process simply continues running the code it was started with.
func applyStagedSelfUpdate() {
	// No-op on Windows.
	// NOTE(review): presumably the ".next" swap is done by a wrapper/repair
	// script there — confirm.
	if runtime.GOOS == "windows" {
		return
	}
	executable, err := os.Executable()
	if err != nil {
		return
	}
	staged := executable + ".next"
	info, err := os.Stat(staged)
	if err != nil {
		return
	}
	// Refuse to promote anything that cannot be a usable binary: a directory
	// (os.Rename would happily move it over the executable path) or an empty
	// file left behind by an interrupted download would otherwise replace the
	// working host-agent.
	if !info.Mode().IsRegular() || info.Size() == 0 {
		return
	}
	backup := executable + ".old"
	// Clear a backup left over from an earlier run; a missing file is fine.
	_ = os.Remove(backup)
	// Set the execute bits before the swap so the executable path never
	// briefly holds a non-executable file.
	_ = os.Chmod(staged, 0o755)
	if err := os.Rename(executable, backup); err != nil {
		return
	}
	if err := os.Rename(staged, executable); err != nil {
		// Roll back so the original binary is still reachable.
		_ = os.Rename(backup, executable)
		return
	}
	// Kept as a fallback in case the pre-swap chmod did not take effect.
	_ = os.Chmod(executable, 0o755)
	_ = os.Remove(backup)
}
// runInstallLinux implements the "install-linux" subcommand.
//
// It parses args into a hostagent.LinuxInstallConfig, with flag defaults
// taken from RAP_* environment variables through the getenv, getenvBool and
// getenvInt helpers. When a profile URL or an install token is supplied, the
// install profile is fetched from the Control Plane and becomes the base
// configuration; the dry-run, auto-update, host-agent source path and
// (non-empty) startup-mode values from the command line are then layered back
// on top, and Replace is forced to true. The resulting configuration is handed
// to hostagent.LinuxManager.Install and a one-line summary is printed.
//
// Flag parsing, profile fetch and install errors are returned unchanged.
func runInstallLinux(ctx context.Context, args []string) error {
	var (
		install      hostagent.LinuxInstallConfig
		profileURL   string
		installToken string
	)
	nodeRuntime := &install.RuntimeConfig
	flags := flag.NewFlagSet("install-linux", flag.ContinueOnError)

	// Identity, enrollment and on-disk placement.
	flags.StringVar(&nodeRuntime.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
	flags.StringVar(&nodeRuntime.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	flags.StringVar(&install.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
	flags.StringVar(&nodeRuntime.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
	flags.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/linux-install-profile URL for profile-based install.")
	flags.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Linux install profile.")
	flags.StringVar(&nodeRuntime.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
	flags.StringVar(&install.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
	flags.StringVar(&install.InstallDir, "install-dir", getenv("RAP_LINUX_INSTALL_DIR", ""), "Directory for rap-node-agent and rap-host-agent.")
	flags.StringVar(&install.ConfigDir, "config-dir", getenv("RAP_LINUX_CONFIG_DIR", ""), "Directory for node-agent env file.")
	flags.StringVar(&install.StartupMode, "startup-mode", getenv("RAP_LINUX_STARTUP_MODE", "systemd"), "Startup mode: systemd, auto, or none.")
	flags.BoolVar(&install.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
	flags.BoolVar(&install.DryRun, "dry-run", false, "Print resolved placement without installing.")

	// Host-agent updater service.
	flags.BoolVar(&install.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Linux host-agent update service.")
	flags.StringVar(&install.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
	flags.StringVar(&install.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
	flags.IntVar(&install.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
	flags.IntVar(&install.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
	flags.IntVar(&install.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
	flags.StringVar(&install.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.")

	// Node-agent runtime toggles and mesh settings.
	flags.BoolVar(&nodeRuntime.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
	flags.BoolVar(&nodeRuntime.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
	flags.BoolVar(&nodeRuntime.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
	flags.StringVar(&nodeRuntime.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
	flags.StringVar(&nodeRuntime.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
	flags.IntVar(&nodeRuntime.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
	flags.IntVar(&nodeRuntime.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
	flags.StringVar(&nodeRuntime.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
	flags.StringVar(&nodeRuntime.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
	flags.StringVar(&nodeRuntime.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
	flags.StringVar(&nodeRuntime.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
	flags.StringVar(&nodeRuntime.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
	flags.StringVar(&nodeRuntime.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "linux"), "Region/site hint.")
	flags.IntVar(&nodeRuntime.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
	flags.IntVar(&nodeRuntime.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
	flags.IntVar(&nodeRuntime.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")

	if err := flags.Parse(args); err != nil {
		return err
	}

	profileRequested := strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != ""
	if profileRequested {
		// Snapshot the command-line values: the profile replaces the whole
		// config below, and selected CLI settings are copied back afterwards.
		fromCLI := install

		profile, err := hostagent.FetchLinuxInstallProfile(ctx, hostagent.ProfileRequest{
			URL:          profileURL,
			ClusterID:    fromCLI.RuntimeConfig.ClusterID,
			InstallToken: installToken,
			NodeName:     fromCLI.RuntimeConfig.NodeName,
		})
		if err != nil {
			return err
		}

		install = hostagent.LinuxInstallConfigFromProfile(profile)
		// Profile-based installs always replace, regardless of --replace.
		install.Replace = true
		install.DryRun = fromCLI.DryRun
		install.AutoUpdateEnabled = fromCLI.AutoUpdateEnabled
		install.AutoUpdateCurrentVersion = fromCLI.AutoUpdateCurrentVersion
		install.AutoUpdateChannel = fromCLI.AutoUpdateChannel
		install.AutoUpdateIntervalSeconds = fromCLI.AutoUpdateIntervalSeconds
		install.AutoUpdateInitialDelaySeconds = fromCLI.AutoUpdateInitialDelaySeconds
		install.AutoUpdateHealthTimeoutSeconds = fromCLI.AutoUpdateHealthTimeoutSeconds
		install.HostAgentSourcePath = fromCLI.HostAgentSourcePath
		// A blank startup mode keeps whatever the profile selected.
		if mode := strings.TrimSpace(fromCLI.StartupMode); mode != "" {
			install.StartupMode = mode
		}
	}

	manager := hostagent.LinuxManager{}
	result, err := manager.Install(ctx, install)
	if err != nil {
		return err
	}

	fmt.Printf(
		"node=%s install_dir=%s state_dir=%s node_agent=%s unit=%s downloaded=%t started=%t updater_unit=%s updater_started=%t\n",
		result.NodeName,
		result.InstallDir,
		result.StateDir,
		result.NodeAgentPath,
		result.UnitName,
		result.Downloaded,
		result.Started,
		result.UpdaterUnitName,
		result.UpdaterStarted,
	)
	fmt.Println("next: approve the join request in the platform admin panel, then the Linux node-agent will finish bootstrap and start heartbeats")
	return nil
}
// runInstallWindows implements the "install-windows" subcommand.
//
// Flags default to their RAP_* environment variables. When -profile-url or
// -install-token is non-blank, a Windows install profile is fetched and the
// profile-derived config replaces the flag-derived one. In that mode only the
// dry-run flag, the auto-update settings, the host-agent source path and a
// non-blank startup mode are carried over from the command line; Replace is
// forced to true and other flag values (for example -node-id) are not copied
// back. The resulting config is passed to hostagent.WindowsManager.Install and
// a one-line summary plus a next-step hint is printed on success.
func runInstallWindows(ctx context.Context, args []string) error {
	var (
		profileURL   string
		installToken string
	)
	cfg := hostagent.WindowsInstallConfig{}
	fs := flag.NewFlagSet("install-windows", flag.ContinueOnError)

	// Enrollment, profile and placement flags.
	fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
	fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
	fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
	fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/windows-install-profile URL for profile-based install.")
	fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Windows install profile.")
	fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
	fs.StringVar(&cfg.RuntimeConfig.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
	fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_WINDOWS_INSTALL_DIR", ""), "Directory for rap-node-agent.exe and wrapper scripts.")
	fs.StringVar(&cfg.StartupMode, "startup-mode", getenv("RAP_WINDOWS_STARTUP_MODE", "auto"), "Startup mode: auto, system-task, user-task, or none.")
	fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
	fs.BoolVar(&cfg.DryRun, "dry-run", false, "Print resolved placement without installing.")

	// Auto-update flags.
	fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Windows host-agent update task.")
	fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
	fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
	fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
	fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
	fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
	fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.")

	// Node-agent runtime flags.
	fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
	fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
	fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
	fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
	fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
	fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
	fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
	fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
	fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "windows"), "Region/site hint.")
	fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
	fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
	fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
	if err := fs.Parse(args); err != nil {
		return err
	}

	useProfile := strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != ""
	if useProfile {
		// Keep a copy of the flag-derived config so selected values can be
		// restored after the profile replaces cfg wholesale.
		fromFlags := cfg
		profile, err := hostagent.FetchWindowsInstallProfile(ctx, hostagent.ProfileRequest{
			URL:          profileURL,
			ClusterID:    fromFlags.RuntimeConfig.ClusterID,
			InstallToken: installToken,
			NodeName:     fromFlags.RuntimeConfig.NodeName,
		})
		if err != nil {
			return err
		}
		cfg = hostagent.WindowsInstallConfigFromProfile(profile)
		cfg.Replace = true
		cfg.DryRun = fromFlags.DryRun
		cfg.AutoUpdateEnabled = fromFlags.AutoUpdateEnabled
		cfg.AutoUpdateCurrentVersion = fromFlags.AutoUpdateCurrentVersion
		cfg.AutoUpdateChannel = fromFlags.AutoUpdateChannel
		cfg.AutoUpdateIntervalSeconds = fromFlags.AutoUpdateIntervalSeconds
		cfg.AutoUpdateInitialDelaySeconds = fromFlags.AutoUpdateInitialDelaySeconds
		cfg.AutoUpdateHealthTimeoutSeconds = fromFlags.AutoUpdateHealthTimeoutSeconds
		cfg.HostAgentSourcePath = fromFlags.HostAgentSourcePath
		// A blank startup mode leaves the profile's choice in place.
		if mode := strings.TrimSpace(fromFlags.StartupMode); mode != "" {
			cfg.StartupMode = mode
		}
	}

	result, err := (hostagent.WindowsManager{}).Install(ctx, cfg)
	if err != nil {
		return err
	}
	fmt.Printf("node=%s install_dir=%s state_dir=%s node_agent=%s startup_mode=%s task=%s downloaded=%t started=%t updater_task=%s updater_started=%t admin_fallback=%t\n",
		result.NodeName,
		result.InstallDir,
		result.StateDir,
		result.NodeAgentPath,
		result.StartupMode,
		result.TaskName,
		result.Downloaded,
		result.Started,
		result.UpdaterTaskName,
		result.UpdaterStarted,
		result.AdminFallback,
	)
	fmt.Println("next: approve the join request in the platform admin panel, then the Windows node-agent will finish bootstrap and start heartbeats")
	return nil
}
// runInstall implements the Docker "install" subcommand.
//
// It parses the install flags, normalizes the runtime config and then either
// prints the redacted docker command (dry run) or installs the container via
// hostagent.DockerManager. When auto-update is enabled the update service is
// rendered (dry run) or installed with systemd management turned on.
func runInstall(ctx context.Context, args []string) error {
	parsed, err := parseInstall(args)
	if err != nil {
		return err
	}

	// NOTE(review): the second Normalize call looks redundant; it is kept so
	// the behavior matches the existing flow exactly.
	runtimeCfg := parsed.Runtime.Normalize()
	runtimeCfg = runtimeCfg.Normalize()
	dockerArgs := hostagent.DockerRunArgs(runtimeCfg)
	docker := hostagent.DockerManager{}

	if parsed.DryRun {
		fmt.Printf("docker %s\n", shellJoin(hostagent.RedactedArgs(dockerArgs)))
		if !parsed.AutoUpdateEnabled {
			return nil
		}
		updater := parsed.AutoUpdate
		updater.RuntimeConfig = runtimeCfg
		updater.DryRun = true
		rendered, err := docker.InstallUpdateService(ctx, updater)
		if err != nil {
			return err
		}
		fmt.Print(rendered.Unit)
		return nil
	}

	installed, err := docker.Install(ctx, runtimeCfg)
	if err != nil {
		return err
	}
	fmt.Printf("container=%s image=%s id=%s pulled=%t replaced=%t\n", installed.ContainerName, installed.Image, installed.ContainerID, installed.Pulled, installed.Replaced)

	if parsed.AutoUpdateEnabled {
		updater := parsed.AutoUpdate
		updater.RuntimeConfig = runtimeCfg
		updater.ManageSystemd = true
		svc, err := docker.InstallUpdateService(ctx, updater)
		if err != nil {
			return err
		}
		fmt.Printf("updater_service=%s unit=%s binary=%s started=%t\n", svc.UnitName, svc.UnitPath, svc.BinaryPath, svc.Started)
	}
	fmt.Println("next: approve the join request in the platform admin panel, then the node-agent will finish bootstrap and start heartbeats")
	return nil
}
// runStatus implements the "status" subcommand: it asks
// hostagent.DockerManager for the status of the named container and prints
// the returned text unchanged.
func runStatus(ctx context.Context, args []string) error {
	var containerName string
	fs := flag.NewFlagSet("status", flag.ContinueOnError)
	fs.StringVar(&containerName, "container-name", hostagent.DefaultContainerName, "Docker container name.")
	if err := fs.Parse(args); err != nil {
		return err
	}

	status, err := (hostagent.DockerManager{}).Status(ctx, containerName)
	if err != nil {
		return err
	}
	fmt.Print(status)
	return nil
}
// runUpdate implements the one-shot "update" subcommand.
//
// With -dry-run it only fetches and prints the update plan. Otherwise the
// update is applied by the manager matching the target: WindowsManager when
// the install type is the Windows one or the process runs on Windows,
// LinuxManager for the binary install type, and DockerManager in every other
// case. The outcome is printed as a single key=value line.
func runUpdate(ctx context.Context, args []string) error {
	var healthTimeoutSeconds int
	req := hostagent.UpdateRequest{}
	fs := flag.NewFlagSet("update", flag.ContinueOnError)
	registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
	if err := fs.Parse(args); err != nil {
		return err
	}
	req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second

	if req.DryRun {
		plan, err := hostagent.FetchNodeUpdatePlan(ctx, req)
		if err != nil {
			return err
		}
		fmt.Printf("action=%s reason=%s target=%s production_forwarding=%t\n", plan.Action, plan.Reason, plan.TargetVersion, plan.ProductionForwarding)
		if artifact := plan.Artifact; artifact != nil {
			fmt.Printf("artifact=%s sha256=%s size=%d\n", artifact.URL, artifact.SHA256, artifact.SizeBytes)
		}
		return nil
	}

	var (
		result hostagent.UpdateResult
		err    error
	)
	switch {
	case req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows":
		result, err = (hostagent.WindowsManager{}).ApplyUpdate(ctx, req)
	case req.InstallType == hostagent.BinaryUpdateInstallType:
		result, err = (hostagent.LinuxManager{}).ApplyUpdate(ctx, req)
	default:
		result, err = (hostagent.DockerManager{}).ApplyUpdate(ctx, req)
	}
	if err != nil {
		return err
	}

	fmt.Printf("action=%s reason=%s target=%s container=%s image=%s id=%s loaded=%t replaced=%t rolled_back=%t\n",
		result.Action, result.Reason, result.TargetVersion,
		result.ContainerName, result.NewImage, result.ContainerID,
		result.Loaded, result.Replaced, result.RolledBack)
	return nil
}
// runUpdateLoop implements the long-running "update-loop" subcommand.
//
// It combines the shared update flags with loop-specific ones (interval,
// initial delay, jitter, run limit, stop-on-error, host-agent status
// settings), builds a hostagent.UpdateLoopConfig and delegates to the
// Windows, Linux or Docker manager using the same selection rule as the
// one-shot update. Loop log lines are written to stdout.
func runUpdateLoop(ctx context.Context, args []string) error {
	var (
		healthTimeoutSeconds   int
		intervalSeconds        int
		initialDelaySeconds    int
		maxRuns                int
		jitter                 float64
		stopOnError            bool
		hostAgentStatusEnabled bool
		hostAgentVersion       string
		hostAgentBinaryPath    string
	)
	req := hostagent.UpdateRequest{}
	fs := flag.NewFlagSet("update-loop", flag.ContinueOnError)
	registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
	fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Seconds between emergency fallback update plan polls. Update-service/heartbeat hints trigger normal runs.")
	fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 0), "Seconds to wait before the first poll.")
	fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
	fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
	fs.BoolVar(&stopOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.")
	fs.BoolVar(&hostAgentStatusEnabled, "host-agent-update-status-enabled", getenvBool("RAP_HOST_AGENT_UPDATE_STATUS_ENABLED", true), "Also poll/report rap-host-agent update status from this loop.")
	fs.StringVar(&hostAgentVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version reported by the loop.")
	fs.StringVar(&hostAgentBinaryPath, "host-agent-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path used for host-agent update status.")
	if err := fs.Parse(args); err != nil {
		return err
	}
	req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second

	loopCfg := hostagent.UpdateLoopConfig{
		Request:      req,
		Interval:     time.Duration(intervalSeconds) * time.Second,
		InitialDelay: time.Duration(initialDelaySeconds) * time.Second,
		Jitter:       jitter,
		MaxRuns:      maxRuns,
		StopOnError:  stopOnError,
		Logf: func(format string, values ...any) {
			fmt.Printf(format+"\n", values...)
		},
		HostAgentUpdateEnabled: hostAgentStatusEnabled,
		// The host-agent request reuses the node-agent request's identity and
		// falls back to the running platform when OS/Arch are blank.
		HostAgentUpdateRequest: hostagent.HostAgentUpdateRequest{
			BackendURL:     req.BackendURL,
			ClusterID:      req.ClusterID,
			NodeID:         req.NodeID,
			StateDir:       req.StateDir,
			CurrentVersion: hostAgentVersion,
			Channel:        req.Channel,
			OS:             firstNonEmptyLocal(req.OS, runtime.GOOS),
			Arch:           firstNonEmptyLocal(req.Arch, runtime.GOARCH),
			InstallType:    hostagent.BinaryUpdateInstallType,
			BinaryPath:     hostAgentBinaryPath,
		},
	}

	switch {
	case req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows":
		// Windows targets use a different host-agent install type.
		loopCfg.HostAgentUpdateRequest.InstallType = "windows_binary"
		return (hostagent.WindowsManager{}).RunUpdateLoop(ctx, loopCfg)
	case req.InstallType == hostagent.BinaryUpdateInstallType:
		return (hostagent.LinuxManager{}).RunUpdateLoop(ctx, loopCfg)
	default:
		return (hostagent.DockerManager{}).RunUpdateLoop(ctx, loopCfg)
	}
}
// firstNonEmptyLocal returns the first argument that is not blank after
// trimming whitespace. The value is returned as given (untrimmed). It returns
// "" when every argument is blank or no arguments are supplied.
func firstNonEmptyLocal(values ...string) string {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; len(strings.TrimSpace(candidate)) > 0 {
			return candidate
		}
	}
	return ""
}
// runInstallUpdater implements the "install-updater" subcommand.
//
// It builds a hostagent.UpdateServiceConfig from flags and passes it to
// hostagent.DockerManager.InstallUpdateService. With -dry-run the rendered
// unit (and the self-updater unit, when one is returned) is printed and
// systemd management is disabled; otherwise a one-line summary of the
// installed service is printed.
func runInstallUpdater(ctx context.Context, args []string) error {
	var (
		dryRun      bool
		selfUpdater bool
		runtimeCfg  = hostagent.RuntimeConfig{}
		service     = hostagent.UpdateServiceConfig{}
	)
	fs := flag.NewFlagSet("install-updater", flag.ContinueOnError)
	fs.StringVar(&runtimeCfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
	fs.StringVar(&runtimeCfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	fs.StringVar(&runtimeCfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
	fs.StringVar(&runtimeCfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
	fs.StringVar(&service.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version before first successful update.")
	fs.StringVar(&service.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
	fs.IntVar(&service.IntervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
	fs.IntVar(&service.InitialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
	fs.Float64Var(&service.Jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
	fs.IntVar(&service.HealthTimeoutSec, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
	fs.StringVar(&service.BinaryInstallPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")
	fs.BoolVar(&selfUpdater, "self-updater-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
	fs.BoolVar(&dryRun, "dry-run", false, "Print the systemd unit without installing it.")
	if err := fs.Parse(args); err != nil {
		return err
	}

	// A dry run only renders units; a real run lets the manager drive systemd.
	service.DryRun = dryRun
	service.ManageSystemd = !dryRun
	service.RuntimeConfig = runtimeCfg
	service.InstallSelfUpdater = selfUpdater
	service.SelfUpdateVersion = agent.Version

	result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
	if err != nil {
		return err
	}
	if !dryRun {
		fmt.Printf("updater_service=%s unit=%s binary=%s started=%t self_updater=%s\n", result.UnitName, result.UnitPath, result.BinaryPath, result.Started, result.SelfUnitName)
		return nil
	}
	fmt.Print(result.Unit)
	if result.SelfUnit != "" {
		fmt.Print(result.SelfUnit)
	}
	return nil
}
// runUpdateHostAgent implements the one-shot "update-host-agent" subcommand.
// It parses the shared host-agent update flags, ignores the loop-only values,
// applies the update through hostagent.DockerManager.ApplyHostAgentUpdate and
// prints the outcome as a key=value line.
func runUpdateHostAgent(ctx context.Context, args []string) error {
	// Interval, delay, jitter, run limit and stop-on-error only matter to the
	// loop variant, so they are discarded here.
	req, _, _, _, _, _, loop, err := parseHostAgentUpdate(args)
	if err != nil {
		return err
	}
	if loop {
		return fmt.Errorf("internal parser error: loop flag set for one-shot update")
	}

	outcome, err := (hostagent.DockerManager{}).ApplyHostAgentUpdate(ctx, req)
	if err != nil {
		return err
	}
	fmt.Printf("action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t\n",
		outcome.Action,
		outcome.Reason,
		outcome.TargetVersion,
		outcome.NewImage,
		outcome.Replaced,
		outcome.RestartNeeded,
	)
	return nil
}
// runUpdateHostAgentLoop implements the "update-host-agent-loop" subcommand.
// It parses the shared host-agent update flags, converts the second-based
// values to durations and runs hostagent.DockerManager.RunHostAgentUpdateLoop,
// logging each loop message to stdout on its own line.
func runUpdateHostAgentLoop(ctx context.Context, args []string) error {
	req, intervalSeconds, initialDelaySeconds, jitter, maxRuns, stopOnError, _, err := parseHostAgentUpdate(args)
	if err != nil {
		return err
	}

	loopCfg := hostagent.HostAgentUpdateLoopConfig{
		Request:      req,
		Interval:     time.Duration(intervalSeconds) * time.Second,
		InitialDelay: time.Duration(initialDelaySeconds) * time.Second,
		Jitter:       jitter,
		MaxRuns:      maxRuns,
		StopOnError:  stopOnError,
		Logf: func(format string, values ...any) {
			fmt.Printf(format+"\n", values...)
		},
	}
	return (hostagent.DockerManager{}).RunHostAgentUpdateLoop(ctx, loopCfg)
}
// parseHostAgentUpdate parses the flags shared by the one-shot and looping
// host-agent update subcommands.
//
// Results, in order: the update request, interval seconds, initial delay
// seconds, jitter fraction, maximum runs, stop-on-error, a loop indicator and
// the parse error. The loop indicator is always false. On a parse error all
// other results are zero values.
func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int, int, float64, int, bool, bool, error) {
	var (
		pollSeconds  int
		delaySeconds int
		runLimit     int
		jitterFrac   float64
		haltOnError  bool
	)
	req := hostagent.HostAgentUpdateRequest{}
	fs := flag.NewFlagSet("update-host-agent", flag.ContinueOnError)

	// String-valued request flags share one registration loop.
	stringFlags := []struct {
		dst      *string
		name     string
		fallback string
		usage    string
	}{
		{&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL."},
		{&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID."},
		{&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID."},
		{&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json."},
		{&req.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Currently installed rap-host-agent version."},
		{&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override."},
		{&req.OS, "os", getenv("RAP_HOST_AGENT_UPDATE_OS", runtime.GOOS), "Host-agent artifact OS selector."},
		{&req.Arch, "arch", getenv("RAP_HOST_AGENT_UPDATE_ARCH", runtime.GOARCH), "Host-agent artifact architecture selector."},
		{&req.InstallType, "install-type", getenv("RAP_HOST_AGENT_UPDATE_INSTALL_TYPE", hostagent.BinaryUpdateInstallType), "Host-agent artifact install type."},
		{&req.BinaryPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path to replace atomically."},
	}
	for _, f := range stringFlags {
		fs.StringVar(f.dst, f.name, f.fallback, f.usage)
	}

	fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.")
	fs.IntVar(&pollSeconds, "interval-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INTERVAL_SECONDS", 900), "Seconds between host-agent update plan polls.")
	fs.IntVar(&delaySeconds, "initial-delay-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INITIAL_DELAY_SECONDS", 45), "Seconds to wait before the first poll.")
	fs.Float64Var(&jitterFrac, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
	fs.IntVar(&runLimit, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
	fs.BoolVar(&haltOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.")

	if err := fs.Parse(args); err != nil {
		return hostagent.HostAgentUpdateRequest{}, 0, 0, 0, 0, false, false, err
	}
	return req, pollSeconds, delaySeconds, jitterFrac, runLimit, haltOnError, false, nil
}
// registerUpdateFlags registers the node-agent update flags shared by the
// "update" and "update-loop" subcommands on fs. String flags and -dry-run bind
// directly to fields of req; the health timeout is bound to
// healthTimeoutSeconds as a number of seconds so the caller can convert it to
// a duration after parsing. Defaults come from RAP_* environment variables.
func registerUpdateFlags(fs *flag.FlagSet, req *hostagent.UpdateRequest, healthTimeoutSeconds *int) {
	stringFlags := []struct {
		dst      *string
		name     string
		fallback string
		usage    string
	}{
		{&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL."},
		{&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID."},
		{&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID."},
		{&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet."},
		{&req.Product, "product", getenv("RAP_UPDATE_PRODUCT", hostagent.DefaultUpdateProduct), "Update product name."},
		{&req.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Currently running product version."},
		{&req.OS, "os", getenv("RAP_UPDATE_OS", runtime.GOOS), "Artifact OS selector."},
		{&req.Arch, "arch", getenv("RAP_UPDATE_ARCH", runtime.GOARCH), "Artifact architecture selector."},
		{&req.InstallType, "install-type", getenv("RAP_UPDATE_INSTALL_TYPE", hostagent.DefaultUpdateInstallType), "Artifact install type."},
		{&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override."},
		{&req.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update."},
		{&req.BinaryPath, "binary-path", getenv("RAP_NODE_AGENT_BINARY_PATH", ""), "Windows node-agent binary path to replace."},
		{&req.WindowsTaskName, "windows-task-name", getenv("RAP_WINDOWS_TASK_NAME", ""), "Windows Scheduled Task name used to restart node-agent."},
		{&req.SystemdUnitName, "systemd-unit", getenv("RAP_SYSTEMD_UNIT", ""), "Linux systemd unit used to restart node-agent."},
	}
	for _, f := range stringFlags {
		fs.StringVar(f.dst, f.name, f.fallback, f.usage)
	}

	fs.IntVar(healthTimeoutSeconds, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Seconds to wait for the updated container to be running.")
	fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.")
}
// parseInstall parses the Docker "install" flags into an installCommandConfig.
//
// When -profile-url or -install-token is non-blank a Docker install profile is
// fetched and its runtime config replaces the flag-derived one, keeping only
// the command-line extra env, extra docker run args, an enabled VPN gateway
// flag, and any explicitly supplied image artifact URLs / SHA-256 / size. The
// final runtime config is validated with ValidateInstall before returning.
//
// NOTE(review): the profile fetch uses context.Background() because this
// function receives no context, so it is not tied to the caller's
// cancellation.
func parseInstall(args []string) (installCommandConfig, error) {
	var (
		dryRun            bool
		profileURL        string
		installToken      string
		autoUpdateEnabled bool
	)
	cfg := hostagent.RuntimeConfig{}
	autoUpdate := hostagent.UpdateServiceConfig{}
	fs := flag.NewFlagSet("install", flag.ContinueOnError)

	// Enrollment, profile and Docker placement flags.
	fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
	fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
	fs.StringVar(&cfg.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
	fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/docker-install-profile URL for profile-based install.")
	fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Docker install profile.")
	fs.StringVar(&cfg.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
	fs.StringVar(&cfg.Image, "image", getenv("RAP_NODE_AGENT_IMAGE", hostagent.DefaultImage), "Docker image for rap-node-agent.")
	fs.StringVar(&cfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name.")
	fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path mounted as node-agent state.")
	fs.StringVar(&cfg.Network, "network", getenv("RAP_DOCKER_NETWORK", hostagent.DefaultNetwork), "Docker network mode/name.")
	fs.StringVar(&cfg.RestartPolicy, "restart", getenv("RAP_DOCKER_RESTART", "unless-stopped"), "Docker restart policy.")
	fs.BoolVar(&cfg.PullImage, "pull", getenvBool("RAP_DOCKER_PULL", false), "Pull image before running.")
	fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_DOCKER_REPLACE", false), "Remove an existing container with the same name before run.")
	fs.BoolVar(&cfg.DockerVPNGatewayEnabled, "docker-vpn-gateway-enabled", getenvBool("RAP_DOCKER_VPN_GATEWAY_ENABLED", false), "Run Docker node-agent with NET_ADMIN and /dev/net/tun for VPN gateway mode.")
	fs.StringVar(&cfg.ImageArtifactSHA256, "image-artifact-sha256", getenv("RAP_NODE_AGENT_IMAGE_ARTIFACT_SHA256", ""), "Expected SHA-256 for a Docker image tar artifact.")
	fs.Int64Var(&cfg.ImageArtifactSizeBytes, "image-artifact-size-bytes", getenvInt64("RAP_NODE_AGENT_IMAGE_ARTIFACT_SIZE_BYTES", 0), "Expected byte size for a Docker image tar artifact (used as a best-effort check when sha256 is provided).")
	fs.BoolVar(&dryRun, "dry-run", false, "Print the docker command with secrets redacted.")

	// Update service flags.
	fs.BoolVar(&autoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the local update-loop service.")
	fs.BoolVar(&autoUpdate.InstallSelfUpdater, "host-agent-self-update-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
	fs.StringVar(&autoUpdate.CurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
	fs.StringVar(&autoUpdate.SelfUpdateVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Initial host-agent binary version used by the self-updater.")
	fs.StringVar(&autoUpdate.Channel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
	fs.IntVar(&autoUpdate.IntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
	fs.IntVar(&autoUpdate.InitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
	fs.Float64Var(&autoUpdate.Jitter, "auto-update-jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
	fs.IntVar(&autoUpdate.HealthTimeoutSec, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
	fs.StringVar(&autoUpdate.BinaryInstallPath, "auto-update-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")

	// Node-agent runtime flags.
	fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
	fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.")
	fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
	fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Synthetic mesh HTTP listen address inside container.")
	fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.")
	fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.")
	fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
	fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
	fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
	fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", ""), "Advertised transport.")
	fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", ""), "Connectivity mode hint.")
	fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", ""), "NAT type hint.")
	fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint.")
	fs.IntVar(&cfg.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
	fs.IntVar(&cfg.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
	fs.IntVar(&cfg.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
	fs.IntVar(&cfg.ProductionObservationSinkCap, "production-observation-sink-capacity", getenvInt("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Production observation sink capacity.")

	// Repeatable flags.
	extraEnv := repeatedFlag{}
	extraRunArg := repeatedFlag{}
	imageArtifactURL := repeatedFlag{}
	fs.Var(&extraEnv, "env", "Extra KEY=VALUE env passed to node-agent container; may be repeated.")
	fs.Var(&extraRunArg, "docker-run-arg", "Extra raw docker run argument; may be repeated.")
	fs.Var(&imageArtifactURL, "image-artifact-url", "Docker image tar artifact URL to docker load before running; may be repeated.")

	if err := fs.Parse(args); err != nil {
		return installCommandConfig{}, err
	}
	cfg.ExtraEnv = extraEnv
	cfg.AdditionalDockerRunArgs = extraRunArg
	cfg.ImageArtifactURLs = append(cfg.ImageArtifactURLs, imageArtifactURL...)

	profileRequested := strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != ""
	if profileRequested {
		request := hostagent.ProfileRequest{
			URL:          profileURL,
			ClusterID:    cfg.ClusterID,
			InstallToken: installToken,
			NodeName:     cfg.NodeName,
		}
		profile, err := hostagent.FetchDockerInstallProfile(context.Background(), request)
		if err != nil {
			return installCommandConfig{}, err
		}

		// The profile wins; only explicit command-line extras are layered on.
		fromProfile := hostagent.RuntimeConfigFromProfile(profile)
		fromProfile.ExtraEnv = cfg.ExtraEnv
		fromProfile.AdditionalDockerRunArgs = cfg.AdditionalDockerRunArgs
		if cfg.DockerVPNGatewayEnabled {
			fromProfile.DockerVPNGatewayEnabled = true
		}
		if len(imageArtifactURL) > 0 {
			fromProfile.ImageArtifactURLs = append([]string(nil), imageArtifactURL...)
		}
		if cfg.ImageArtifactSHA256 != "" {
			fromProfile.ImageArtifactSHA256 = cfg.ImageArtifactSHA256
		}
		if cfg.ImageArtifactSizeBytes > 0 {
			fromProfile.ImageArtifactSizeBytes = cfg.ImageArtifactSizeBytes
		}
		cfg = fromProfile
	}

	if err := cfg.ValidateInstall(); err != nil {
		return installCommandConfig{}, err
	}
	parsed := installCommandConfig{
		Runtime:           cfg,
		DryRun:            dryRun,
		AutoUpdateEnabled: autoUpdateEnabled,
		AutoUpdate:        autoUpdate,
	}
	return parsed, nil
}
// repeatedFlag is a flag.Value that collects every occurrence of a repeatable
// command-line flag, in the order given.
type repeatedFlag []string

// String returns the collected values joined with commas. It returns "" for a
// nil receiver, because the flag package may call String on a zero-valued
// receiver such as a nil pointer.
func (f *repeatedFlag) String() string {
	if f == nil {
		return ""
	}
	return strings.Join(*f, ",")
}

// Set appends value to the collected values. It never returns an error.
func (f *repeatedFlag) Set(value string) error {
	*f = append(*f, value)
	return nil
}
// getenv returns the whitespace-trimmed value of the environment variable
// key, or fallback when the variable is unset or blank after trimming.
func getenv(key, fallback string) string {
	trimmed := strings.TrimSpace(os.Getenv(key))
	if trimmed == "" {
		return fallback
	}
	return trimmed
}
// getenvBool interprets the environment variable key as a boolean.
//
// The value is trimmed and lower-cased before matching. "1", "true", "yes",
// "y" and "on" yield true; "0", "false", "no", "n" and "off" yield false.
// Anything else, including an unset or blank variable, yields fallback.
func getenvBool(key string, fallback bool) bool {
	normalized := strings.ToLower(strings.TrimSpace(os.Getenv(key)))
	if normalized == "1" || normalized == "true" || normalized == "yes" || normalized == "y" || normalized == "on" {
		return true
	}
	if normalized == "0" || normalized == "false" || normalized == "no" || normalized == "n" || normalized == "off" {
		return false
	}
	return fallback
}
// getenvInt reads the environment variable key as a decimal int, returning
// fallback when the trimmed value cannot be scanned with the %d verb
// (including when the variable is unset or blank).
//
// NOTE(review): fmt.Sscanf does not appear to require the whole string to be
// consumed, so a value with trailing text may still be accepted — confirm
// whether stricter parsing is wanted.
func getenvInt(key string, fallback int) int {
	raw := strings.TrimSpace(os.Getenv(key))
	var parsed int
	_, err := fmt.Sscanf(raw, "%d", &parsed)
	if err != nil {
		return fallback
	}
	return parsed
}
// getenvInt64 reads the environment variable key as a decimal int64,
// returning fallback when the trimmed value cannot be scanned with the %d
// verb (including when the variable is unset or blank).
func getenvInt64(key string, fallback int64) int64 {
	raw := strings.TrimSpace(os.Getenv(key))
	var parsed int64
	_, err := fmt.Sscanf(raw, "%d", &parsed)
	if err != nil {
		return fallback
	}
	return parsed
}
// getenvFloat reads the environment variable key as a float64, returning
// fallback when the trimmed value cannot be scanned with the %f verb
// (including when the variable is unset or blank).
func getenvFloat(key string, fallback float64) float64 {
	raw := strings.TrimSpace(os.Getenv(key))
	var parsed float64
	_, err := fmt.Sscanf(raw, "%f", &parsed)
	if err != nil {
		return fallback
	}
	return parsed
}
// shellJoin renders args as a single space-separated command line.
//
// Arguments containing a space, tab, or quote character are wrapped in
// double quotes, with embedded double quotes backslash-escaped. An empty
// argument is rendered as "" so that it stays visible as its own argument
// rather than silently disappearing from the joined output. Every other
// argument is emitted verbatim.
//
// NOTE(review): this is not a complete shell escaper (backslashes, `$` and
// backticks are left untouched) — presumably it is used for display only;
// confirm before feeding the result to a shell.
func shellJoin(args []string) string {
	parts := make([]string, 0, len(args))
	for _, arg := range args {
		switch {
		case arg == "":
			// Without explicit quotes an empty argument would vanish and
			// shift the apparent position of the arguments that follow it.
			parts = append(parts, `""`)
		case strings.ContainsAny(arg, " \t\"'"):
			parts = append(parts, `"`+strings.ReplaceAll(arg, `"`, `\"`)+`"`)
		default:
			parts = append(parts, arg)
		}
	}
	return strings.Join(parts, " ")
}
// usage writes the rap-host-agent command synopsis to standard error.
func usage() {
	const synopsis = `usage:
rap-host-agent install -profile-url URL -install-token TOKEN [-node-name NAME] [docker options]
rap-host-agent install -backend-url URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options]
rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options]
rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options]
rap-host-agent install-updater -backend-url URL -cluster-id ID -state-dir DIR -container-name NAME
rap-host-agent update-host-agent -backend-url URL -cluster-id ID -state-dir DIR
rap-host-agent update-host-agent-loop -backend-url URL -cluster-id ID -state-dir DIR
rap-host-agent update -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent update-loop -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent status [-container-name NAME]`
	fmt.Fprintln(os.Stderr, synopsis)
}
File diff suppressed because it is too large Load Diff
@@ -78,6 +78,202 @@ func TestLoadSyntheticMeshConfigPrefersScopedFile(t *testing.T) {
}
}
// TestSyntheticMeshConfigAuthorityHashUsesRawConfigPayload decodes a raw
// config payload and checks that syntheticMeshConfigAuthorityHash matches
// the hash of that same payload with only authority_payload and
// authority_signature removed. Every other top-level key, including
// future_backend_field, stays in the expected hash input.
func TestSyntheticMeshConfigAuthorityHashUsesRawConfigPayload(t *testing.T) {
	payload := json.RawMessage(`{
"enabled": true,
"schema_version": "c18z-test.synthetic.v1",
"cluster_id": "cluster-1",
"local_node_id": "node-a",
"authority_required": true,
"cluster_authority": {"schema_version":"rap.cluster_authority.v1"},
"authority_payload": {"ignored": true},
"authority_signature": {"ignored": true},
"config_version": "config-1",
"peer_endpoints": {},
"routes": [],
"production_forwarding": true,
"future_backend_field": {"must_remain_hash_visible": true}
}`)

	var cfg client.SyntheticMeshConfig
	if err := json.Unmarshal(payload, &cfg); err != nil {
		t.Fatalf("unmarshal synthetic config: %v", err)
	}

	// Build the expected hash input from the raw top-level keys, dropping
	// only the two authority envelope entries.
	var fields map[string]json.RawMessage
	if err := json.Unmarshal(payload, &fields); err != nil {
		t.Fatalf("unmarshal unsigned map: %v", err)
	}
	for _, envelopeKey := range []string{"authority_payload", "authority_signature"} {
		delete(fields, envelopeKey)
	}
	stripped, err := json.Marshal(fields)
	if err != nil {
		t.Fatalf("marshal unsigned map: %v", err)
	}
	expected, err := agentauthority.HashRaw(stripped)
	if err != nil {
		t.Fatalf("hash unsigned map: %v", err)
	}

	actual, err := syntheticMeshConfigAuthorityHash(cfg)
	if err != nil {
		t.Fatalf("hash synthetic config: %v", err)
	}
	if actual != expected {
		t.Fatalf("hash = %s, want raw-preserving hash %s", actual, expected)
	}
}
func TestRouteManagerDecisionsFromControlPlaneConsumesRemediationCommand(t *testing.T) {
now := time.Now().UTC()
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
CommandID: "cmd-1",
Action: "prefer_alternate_route",
ClusterID: "cluster-1",
ChannelID: "channel-1",
ServiceClass: "vpn_packets",
PrimaryRouteID: "route-primary",
ReplacementRouteID: "route-alternate",
Reason: "authorized_alternate_route_available",
IssuedAt: now,
ExpiresAt: now.Add(time.Minute),
}})
if len(decisions) != 1 {
t.Fatalf("decisions = %+v, want one remediation decision", decisions)
}
decision := decisions[0]
if decision.RouteID != "route-primary" ||
decision.ReplacementRouteID != "route-alternate" ||
decision.RebuildStatus != "applied" ||
decision.DecisionSource != "service_channel_remediation_command" ||
decision.RebuildRequestID != "cmd-1" {
t.Fatalf("unexpected remediation decision: %+v", decision)
}
}
func TestRouteManagerDecisionsFromControlPlaneConsumesRebuildRouteCommand(t *testing.T) {
now := time.Now().UTC()
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
CommandID: "cmd-rebuild",
Action: "rebuild_route",
ClusterID: "cluster-1",
ChannelID: "channel-1",
ServiceClass: "vpn_packets",
PrimaryRouteID: "route-primary",
Reason: "route_feedback_recommends_rebuild",
GuardStatus: "allowed",
IssuedAt: now,
ExpiresAt: now.Add(time.Minute),
}})
if len(decisions) != 1 {
t.Fatalf("decisions = %+v, want one rebuild remediation decision", decisions)
}
decision := decisions[0]
if decision.RouteID != "route-primary" ||
decision.RebuildStatus != "pending_degraded_fallback" ||
decision.DecisionSource != "service_channel_remediation_command" ||
decision.RebuildRequestID != "cmd-rebuild" {
t.Fatalf("unexpected rebuild remediation decision: %+v", decision)
}
}
func TestRouteManagerDecisionsFromControlPlaneRejectsGuardedRemediationCommand(t *testing.T) {
now := time.Now().UTC()
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
CommandID: "cmd-guarded",
Action: "prefer_alternate_route",
ClusterID: "cluster-1",
ChannelID: "channel-1",
ServiceClass: "vpn_packets",
PrimaryRouteID: "route-primary",
ReplacementRouteID: "route-outside-policy",
GuardStatus: "rejected",
GuardReason: "replacement_exit_outside_signed_pool_policy",
IssuedAt: now,
ExpiresAt: now.Add(time.Minute),
}})
if len(decisions) != 0 {
t.Fatalf("guarded remediation command must not reach route-manager: %+v", decisions)
}
}
// TestRouteManagerDecisionsFromControlPlaneKeepsExplicitRemediationCommand
// checks that when the planner report already holds a feedback-driven
// replacement for the same route, an explicit remediation command is still
// emitted as a second decision attributed to the command.
func TestRouteManagerDecisionsFromControlPlaneKeepsExplicitRemediationCommand(t *testing.T) {
	issuedAt := time.Now().UTC()
	feedbackDecision := client.RoutePathDecision{
		RouteID:            "route-primary",
		ReplacementRouteID: "route-alternate",
		RebuildRequestID:   "feedback-rebuild",
		RebuildStatus:      "applied",
		RebuildReason:      "service_channel_feedback_rebuild_applied_to_alternate",
		DecisionSource:     "service_channel_feedback_replacement",
		Generation:         "gen-1",
	}
	report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{feedbackDecision}}
	command := client.FabricServiceChannelRemediationCommand{
		CommandID:          "cmd-1",
		Action:             "prefer_alternate_route",
		PrimaryRouteID:     "route-primary",
		ReplacementRouteID: "route-alternate",
		Reason:             "authorized_alternate_route_available",
		IssuedAt:           issuedAt,
		ExpiresAt:          issuedAt.Add(time.Minute),
	}

	decisions := routeManagerDecisionsFromControlPlane(report, []client.FabricServiceChannelRemediationCommand{command})
	if len(decisions) != 2 {
		t.Fatalf("decisions = %+v, want feedback and explicit remediation command", decisions)
	}

	explicit := decisions[1]
	kept := explicit.DecisionSource == "service_channel_remediation_command" && explicit.RebuildRequestID == "cmd-1"
	if !kept {
		t.Fatalf("remediation command was not kept as explicit route-manager input: %+v", decisions)
	}
}
// TestRouteManagerDecisionsFromControlPlaneSkipsCommandAlreadyResolvedByPlanner
// checks that a rebuild_route command is not emitted again when the planner
// report already carries a decision with the same rebuild request ID; only
// the planner's applied decision is returned.
func TestRouteManagerDecisionsFromControlPlaneSkipsCommandAlreadyResolvedByPlanner(t *testing.T) {
	issuedAt := time.Now().UTC()
	plannerDecision := client.RoutePathDecision{
		RouteID:            "route-primary",
		ReplacementRouteID: "route-planner",
		RebuildRequestID:   "cmd-rebuild",
		RebuildStatus:      "applied",
		RebuildReason:      "remediation_rebuild_applied_to_alternate",
		DecisionSource:     "service_channel_remediation_command",
		Generation:         "config-c18z77",
	}
	report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{plannerDecision}}
	command := client.FabricServiceChannelRemediationCommand{
		CommandID:      "cmd-rebuild",
		Action:         "rebuild_route",
		PrimaryRouteID: "route-primary",
		Reason:         "route_feedback_recommends_rebuild",
		GuardStatus:    "allowed",
		IssuedAt:       issuedAt,
		ExpiresAt:      issuedAt.Add(time.Minute),
	}

	decisions := routeManagerDecisionsFromControlPlane(report, []client.FabricServiceChannelRemediationCommand{command})
	if len(decisions) != 1 {
		t.Fatalf("decisions = %+v, want only planner-resolved decision", decisions)
	}

	resolved := decisions[0].RebuildStatus == "applied" && decisions[0].ReplacementRouteID == "route-planner"
	if !resolved {
		t.Fatalf("unexpected planner decision: %+v", decisions[0])
	}
}
func TestFabricServiceChannelAccessStatsReportsDataPlaneViolations(t *testing.T) {
stats := newFabricServiceChannelAccessStats()
stats.Observe(mesh.FabricServiceChannelAccessLogEntry{
Event: "fabric_service_channel_data_plane_violation",
ClusterID: "cluster-1",
ChannelID: "channel-1",
ResourceID: "vpn-1",
BackendRelayPolicy: "disabled",
ViolationStatus: "fabric_route_send_failed_backend_fallback_blocked",
ViolationReason: "mesh synthetic route not found",
OccurredAt: time.Unix(10, 0).UTC(),
})
report := stats.Report(time.Unix(20, 0).UTC())
if report["backend_fallback_blocked"] != int64(1) ||
report["fabric_route_send_failure"] != int64(1) ||
report["last_data_plane_violation_status"] != "fabric_route_send_failed_backend_fallback_blocked" ||
report["last_data_plane_violation_reason"] != "mesh synthetic route not found" {
t.Fatalf("unexpected violation report: %+v", report)
}
}
func TestVerifyEnrollmentBootstrapAcceptsSignedApproval(t *testing.T) {
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
@@ -134,6 +330,134 @@ func TestVerifyEnrollmentBootstrapAcceptsSignedApproval(t *testing.T) {
}
}
// TestVerifyControlPlaneSyntheticMeshConfigAcceptsSignedServiceChannelFeedback
// builds a synthetic mesh config that carries route path decisions and a
// service-channel feedback report, signs an authority payload embedding the
// config hash with a freshly generated Ed25519 key, and expects
// verifyControlPlaneSyntheticMeshConfig to accept it for an identity pinned
// to that same public key and fingerprint.
func TestVerifyControlPlaneSyntheticMeshConfigAcceptsSignedServiceChannelFeedback(t *testing.T) {
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
t.Fatalf("generate key: %v", err)
}
publicKeyB64 := base64.StdEncoding.EncodeToString(publicKey)
fingerprint := agentauthority.Fingerprint(publicKey)
now := time.Now().UTC()
// The config under test: cluster authority descriptor, one applied
// replacement decision for route-ab, and one fenced feedback observation.
remote := client.SyntheticMeshConfig{
Enabled: true,
SchemaVersion: "c17z18.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
AuthorityRequired: true,
ClusterAuthority: &client.ClusterAuthorityDescriptor{
SchemaVersion: agentauthority.AuthoritySchemaVersion,
ClusterID: "cluster-1",
AuthorityState: "authoritative",
KeyAlgorithm: agentauthority.AlgorithmEd25519,
PublicKey: publicKeyB64,
PublicKeyFingerprint: fingerprint,
},
ConfigVersion: "config-v1",
PeerDirectoryVersion: "config-v1",
PolicyVersion: "config-v1",
PeerEndpoints: map[string]string{},
PeerEndpointCandidates: map[string][]client.PeerEndpointCandidate{},
PeerDirectory: []client.PeerDirectoryEntry{},
RecoverySeeds: []client.PeerRecoverySeed{},
RendezvousLeases: []client.PeerRendezvousLease{},
RoutePathDecisions: &client.RoutePathDecisionReport{
SchemaVersion: "c17z18.route_path_decisions.v1",
DecisionMode: "control_plane_effective_path_from_relay_policy_and_service_channel_feedback",
Generation: "config-v1",
DecisionCount: 1,
ReplacementDecisionCount: 1,
RebuildRequestCount: 1,
RebuildAppliedCount: 1,
ControlPlaneOnly: true,
Decisions: []client.RoutePathDecision{{
DecisionID: "route-ab-path-node-a-service-channel-feedback",
RouteID: "route-ab",
ReplacementRouteID: "route-ac",
RebuildRequestID: "route-ab-node-a-config-v1-rebuild",
RebuildStatus: "applied",
RebuildReason: "service_channel_feedback_rebuild_applied_to_alternate",
RebuildAttempt: 2,
ClusterID: "cluster-1",
LocalNodeID: "node-a",
SourceNodeID: "node-a",
DestinationNodeID: "node-b",
OriginalHops: []string{"node-a", "node-b"},
EffectiveHops: []string{"node-a", "node-c", "node-b"},
LocalRole: "source",
DecisionSource: "service_channel_feedback_replacement",
Generation: "config-v1",
PathScore: 1000,
ScoreReasons: []string{"service_channel_rebuild_applied"},
ControlPlaneOnly: true,
ExpiresAt: now.Add(30 * time.Second),
}},
},
ServiceChannelFeedback: &client.FabricServiceChannelFeedbackReport{
SchemaVersion: "c18n.fabric_service_channel_route_feedback_report.v1",
GeneratedAt: now,
FeedbackMaxAgeSeconds: 30,
ObservationCount: 1,
FencedRouteCount: 1,
Observations: []client.FabricServiceChannelFeedbackObservation{{
ClusterID: "cluster-1",
ReporterNodeID: "node-a",
RouteID: "route-ab",
ServiceClass: "vpn_packets",
FeedbackStatus: "fenced",
ScoreAdjustment: -1000,
Reasons: []string{"route_rebuild_recommended"},
ConsecutiveFailures: 2,
Payload: json.RawMessage(`{"route_rebuild_recommended":true}`),
ObservedAt: now,
ExpiresAt: now.Add(30 * time.Second),
}},
},
MeshListener: nil,
Routes: []client.SyntheticMeshRouteConfig{},
ProductionForwarding: false,
}
// Hash the config before the authority payload and signature are attached
// to it below.
configHash, err := syntheticMeshConfigAuthorityHash(remote)
if err != nil {
t.Fatalf("config hash: %v", err)
}
payload, err := json.Marshal(controlPlaneMeshConfigAuthorityPayload{
SchemaVersion: "rap.cluster.mesh_config_snapshot.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
ConfigVersion: "config-v1",
ConfigSHA256: configHash,
IssuedAt: now,
ExpiresAt: now.Add(time.Hour),
ControlPlaneOnly: true,
ProductionForwarding: false,
})
if err != nil {
t.Fatalf("marshal payload: %v", err)
}
// The signature is computed over the canonical JSON form of the payload,
// while the config carries the payload as marshaled.
canonical, err := agentauthority.CanonicalJSON(payload)
if err != nil {
t.Fatalf("canonical json: %v", err)
}
remote.AuthorityPayload = payload
remote.AuthoritySignature = &client.ClusterSignature{
SchemaVersion: agentauthority.SignatureSchemaVersion,
Algorithm: agentauthority.AlgorithmEd25519,
KeyFingerprint: fingerprint,
Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)),
SignedAt: now,
}
// Verify against an identity pinned to the same authority key.
err = verifyControlPlaneSyntheticMeshConfig(remote, state.Identity{
ClusterID: "cluster-1",
NodeID: "node-a",
ClusterAuthorityPublicKey: publicKeyB64,
ClusterAuthorityFingerprint: fingerprint,
}, config.Config{})
if err != nil {
t.Fatalf("verify control-plane synthetic mesh config: %v", err)
}
}
func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
bootstrap := client.NodeBootstrap{
NodeID: "node-1",
@@ -155,6 +479,54 @@ func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
}
}
// TestEnsureApprovedIdentityKeepsPollingWhenTimeoutDisabled runs
// ensureApprovedIdentity against a server that always answers "pending" and
// cancels the context on the second bootstrap poll. With
// EnrollmentPollTimeout set to 0 the call is expected to keep polling until
// that cancellation and then fail with a "context canceled" error.
func TestEnsureApprovedIdentityKeepsPollingWhenTimeoutDisabled(t *testing.T) {
	var bootstrapPolls int
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Both endpoints return the same pending enrollment body.
	writePending := func(w http.ResponseWriter) {
		_ = json.NewEncoder(w).Encode(map[string]any{
			"status":       "pending",
			"join_request": map[string]any{"id": "join-request-1"},
		})
	}
	handler := func(w http.ResponseWriter, r *http.Request) {
		switch r.URL.Path {
		case "/node-agents/enroll":
			writePending(w)
		case "/node-agents/enrollments/join-request-1/bootstrap":
			bootstrapPolls++
			if bootstrapPolls >= 2 {
				// A second poll shows the loop kept running; stop it.
				cancel()
			}
			writePending(w)
		default:
			http.NotFound(w, r)
		}
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	defer server.Close()

	stateDir := t.TempDir()
	identity, err := state.LoadOrCreate(stateDir, "cluster-1", "node-a")
	if err != nil {
		t.Fatalf("load identity: %v", err)
	}

	cfg := config.Config{
		BackendURL:             server.URL,
		ClusterID:              "cluster-1",
		JoinToken:              "join-token",
		NodeName:               "node-a",
		StateDir:               stateDir,
		EnrollmentPollInterval: time.Millisecond,
		EnrollmentPollTimeout:  0,
	}
	_, err = ensureApprovedIdentity(ctx, cfg, identity, client.New(server.URL))
	if err == nil || !strings.Contains(err.Error(), "context canceled") {
		t.Fatalf("ensureApprovedIdentity err = %v, want context canceled", err)
	}
	if bootstrapPolls < 2 {
		t.Fatalf("bootstrap polls = %d, want at least 2", bootstrapPolls)
	}
}
func TestSyntheticQualityScoreIsBounded(t *testing.T) {
cases := []struct {
latency int
@@ -209,6 +581,168 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
}
}
func TestHeartbeatPayloadReportsMeshListenerFailureWithoutKillingHeartbeat(t *testing.T) {
now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC)
payload := heartbeatPayload(config.Config{
MeshConnectivityMode: "private_lan",
}, state.Identity{
ClusterID: "cluster-1",
NodeID: "node-a",
}, &syntheticMeshState{
ListenerReport: meshListenerReport{
SchemaVersion: "c17z21.mesh_listener_report.v1",
ConfiguredListenAddr: ":19131",
ListenPortMode: "manual",
Status: "listen_failed",
InboundReachability: "unavailable",
ControlPlaneReachable: true,
OneWayConnectivity: true,
FailureReason: "bind_failed",
FailureError: "listen tcp :19131: bind: address already in use",
PortConflict: true,
},
}, now)
report, ok := payload.Metadata["mesh_listener_report"].(meshListenerReport)
if !ok {
t.Fatalf("mesh listener report missing: %+v", payload.Metadata)
}
if payload.HealthStatus != "warning" || report.Status != "listen_failed" || !report.PortConflict {
t.Fatalf("unexpected listener health report: status=%s report=%+v", payload.HealthStatus, report)
}
if payload.Capabilities["mesh_listener_diagnostics"] != true || payload.Capabilities["mesh_one_way_connectivity"] != true {
t.Fatalf("listener capabilities missing: %+v", payload.Capabilities)
}
}
func TestAdvertisedEndpointCandidatesPreferManualEndpoints(t *testing.T) {
now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC)
candidates, err := advertisedEndpointCandidates(config.Config{
MeshAdvertiseEndpointsJSON: `[{"endpoint_id":"node-a-json","node_id":"node-a","transport":"direct_http","address":"http://10.10.10.10:19131","priority":12,"connectivity_mode":"private_lan","reachability":"private"}]`,
MeshAdvertiseEndpoint: "http://203.0.113.10:19131",
MeshAdvertiseTransport: "direct_http",
MeshConnectivityMode: "direct",
MeshNATType: "port_restricted",
MeshRegion: "edge",
}, state.Identity{
ClusterID: "cluster-1",
NodeID: "node-a",
}, nil, now)
if err != nil {
t.Fatalf("advertised endpoint candidates failed: %v", err)
}
if len(candidates) != 2 {
t.Fatalf("expected two manual candidates, got %d: %+v", len(candidates), candidates)
}
if candidates[0].Address != "http://203.0.113.10:19131" || candidates[0].Priority != 10 {
t.Fatalf("explicit advertise endpoint must win: %+v", candidates)
}
if candidates[1].Address != "http://10.10.10.10:19131" || candidates[1].Priority != 12 {
t.Fatalf("json candidate order mismatch: %+v", candidates)
}
}
// TestNetworkInterfaceClassificationSkipsContainerNoise checks the class
// that classifyNetworkInterface assigns to representative interface names:
// physical, vpn, and container (docker bridge, br-*, veth*).
func TestNetworkInterfaceClassificationSkipsContainerNoise(t *testing.T) {
	expectedClass := map[string]string{
		"ens160":      "physical",
		"wg0":         "vpn",
		"tailscale0":  "vpn",
		"docker0":     "container",
		"br-a1b2c3d4": "container",
		"vethabc123":  "container",
	}
	for ifaceName, want := range expectedClass {
		got := classifyNetworkInterface(ifaceName)
		if got == want {
			continue
		}
		t.Fatalf("classifyNetworkInterface(%q)=%q, want %q", ifaceName, got, want)
	}
}
func TestHeartbeatPayloadTreatsOutboundOnlyListenerFailureAsOneWayConnectivity(t *testing.T) {
payload := heartbeatPayload(config.Config{
MeshSyntheticRuntimeEnabled: true,
MeshConnectivityMode: "outbound_only",
}, state.Identity{
ClusterID: "cluster-1",
NodeID: "node-a",
}, &syntheticMeshState{
ListenerReport: meshListenerReport{
SchemaVersion: "c17z21.mesh_listener_report.v1",
ConfiguredListenAddr: ":19131",
ListenPortMode: "manual",
Status: "listen_failed",
InboundReachability: "unavailable",
ControlPlaneReachable: true,
OneWayConnectivity: true,
FailureReason: "bind_failed",
},
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
if payload.HealthStatus != "healthy" {
t.Fatalf("HealthStatus = %q, want healthy for outbound-only listener failure", payload.HealthStatus)
}
report, ok := payload.Metadata["mesh_outbound_session_report"].(meshOutboundSessionReport)
if !ok {
t.Fatalf("mesh outbound session report missing: %+v", payload.Metadata)
}
if report.Status != "ready" || !report.UsableForInboundControl || report.ListenerStatus != "listen_failed" {
t.Fatalf("unexpected outbound session report: %+v", report)
}
if payload.Capabilities["mesh_outbound_control_session"] != true ||
payload.Capabilities["mesh_reverse_control_channel_contract"] != true {
t.Fatalf("outbound session capabilities missing: %+v", payload.Capabilities)
}
}
func TestHeartbeatPayloadReportsMeshConfigLoadFailureWithoutDroppingPresence(t *testing.T) {
payload := heartbeatPayload(config.Config{
MeshSyntheticRuntimeEnabled: true,
MeshConnectivityMode: "private_lan",
}, state.Identity{
ClusterID: "cluster-1",
NodeID: "node-a",
}, &syntheticMeshState{
ConfigLoadError: "control-plane synthetic mesh config unavailable",
ListenerReport: meshListenerReport{
SchemaVersion: "c17z21.mesh_listener_report.v1",
ConfiguredListenAddr: ":19131",
ListenPortMode: "manual",
Status: "listening",
InboundReachability: "private",
ControlPlaneReachable: true,
},
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
report, ok := payload.Metadata["mesh_outbound_session_report"].(meshOutboundSessionReport)
if !ok {
t.Fatalf("mesh outbound session report missing: %+v", payload.Metadata)
}
if payload.HealthStatus != "warning" || report.Status != "degraded" || report.ConfigLoadError == "" {
t.Fatalf("unexpected config-load diagnostic heartbeat: health=%s report=%+v", payload.HealthStatus, report)
}
}
func TestOutboundSessionReportTreatsListeningPrivateLANAsUsable(t *testing.T) {
report := meshOutboundSessionReportFromState(config.Config{
BackendURL: "http://control/api/v1",
MeshConnectivityMode: "private_lan",
MeshSyntheticRuntimeEnabled: true,
}, &syntheticMeshState{
ListenerReport: meshListenerReport{
SchemaVersion: "c17z21.mesh_listener_report.v1",
Status: "listening",
InboundReachability: reachabilityFromConnectivityMode("private_lan"),
},
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
if !report.UsableForInboundControl {
t.Fatalf("listening private LAN listener must be usable: %+v", report)
}
if reachabilityFromConnectivityMode("private_lan") != "private" {
t.Fatalf("private_lan reachability mismatch")
}
}
func TestHeartbeatPayloadReportsMultipleMeshEndpoints(t *testing.T) {
payload := heartbeatPayload(config.Config{
MeshAdvertiseEndpointsJSON: `[{
@@ -1050,17 +1584,36 @@ func TestProductionEnvelopeObservationSinkFromConfigCreatesBoundedSink(t *testin
func TestProductionForwardingLogStateDistinguishesGateFromRuntime(t *testing.T) {
gateEnabled, runtimeEnabled := productionForwardingLogState(config.Config{
MeshProductionForwardingEnabled: true,
})
}, false)
if !gateEnabled {
t.Fatal("gateEnabled = false, want true")
}
if !runtimeEnabled {
t.Fatal("runtimeEnabled = false, want true")
}
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{})
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{}, false)
if gateEnabled || runtimeEnabled {
t.Fatalf("default log state = gate:%t runtime:%t, want false/false", gateEnabled, runtimeEnabled)
}
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{}, true)
if !gateEnabled || !runtimeEnabled {
t.Fatalf("signed control-plane log state = gate:%t runtime:%t, want true/true", gateEnabled, runtimeEnabled)
}
}
// TestMeshLinkStatusFromPeerProbeMapsDeferredForLatestLinks checks the link
// status that meshLinkStatusFromPeerProbe returns for each peer probe
// result: deferred maps to "degraded", while skipped and unrecognized
// inputs map to "unknown".
func TestMeshLinkStatusFromPeerProbeMapsDeferredForLatestLinks(t *testing.T) {
	expectedStatus := map[string]string{
		mesh.PeerConnectionProbeReachable:   "reachable",
		mesh.PeerConnectionProbeUnreachable: "unreachable",
		mesh.PeerConnectionProbeDeferred:    "degraded",
		mesh.PeerConnectionProbeSkipped:     "unknown",
		"unexpected":                        "unknown",
	}
	for probeResult, want := range expectedStatus {
		got := meshLinkStatusFromPeerProbe(probeResult)
		if got == want {
			continue
		}
		t.Fatalf("meshLinkStatusFromPeerProbe(%q) = %q, want %q", probeResult, got, want)
	}
}
func TestLogProductionObservationSinkMetricsToleratesNilState(t *testing.T) {
+12 -1
View File
@@ -1,3 +1,14 @@
module github.com/example/remote-access-platform/agents/rap-node-agent
go 1.23.2
go 1.25.5
require golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb
require (
github.com/gorilla/websocket v1.5.3 // indirect
golang.org/x/net v0.53.0 // indirect
golang.org/x/sys v0.43.0 // indirect
golang.org/x/time v0.15.0 // indirect
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943 // indirect
)
+16
View File
@@ -0,0 +1,16 @@
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg=
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI=
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb h1:whnFRlWMcXI9d+ZbWg+4sHnLp52d5yiIPUxMBSt4X9A=
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw=
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943 h1:YUPk0vGbex2+Jk7XXIgLIPG6oEAD9ml0x7wd6i/bmA4=
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943/go.mod h1:xQ2PWgHmWJA/Ph4i1q1jBm39BKhc3W0DXqWoDSyuBOY=
+37 -14
View File
@@ -7,7 +7,7 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
const Version = "0.1.0-c3"
const Version = "0.2.256-c18z82"
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
return client.EnrollRequest{
@@ -17,18 +17,26 @@ func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) cli
NodeFingerprint: identity.NodeFingerprint,
PublicKey: identity.PublicKey,
ReportedCapabilities: map[string]any{
"can_accept_client_ingress": false,
"can_accept_node_ingress": false,
"can_route_mesh": false,
"can_run_rdp_worker": true,
"can_run_vnc_worker": false,
"can_run_vpn_exit": false,
"can_run_vpn_connector": false,
"can_run_file_cache": false,
"can_run_update_cache": false,
"can_run_video_relay": false,
"native_node_agent_version": Version,
"service_supervision_enabled": false,
"can_accept_client_ingress": false,
"can_accept_node_ingress": false,
"can_route_mesh": false,
"can_run_rdp_worker": true,
"can_run_vnc_worker": false,
"can_run_vpn_exit": true,
"can_run_vpn_connector": true,
"can_run_file_cache": false,
"can_run_update_cache": false,
"can_run_video_relay": false,
"native_node_agent_version": Version,
"node_update_plan_contract": "rap.node_update_plan.v1",
"node_update_status_report": true,
"host_agent_update_required": true,
"service_supervision_enabled": false,
"vpn_assignment_status": true,
"vpn_packet_forwarding": true,
"vpn_fabric_packet_transport": true,
"vpn_local_gateway_shortcut": true,
"external_backend_entry_proxy": true,
},
ReportedFacts: map[string]any{
"os": runtime.GOOS,
@@ -45,13 +53,28 @@ func HeartbeatPayload() client.HeartbeatRequest {
HealthStatus: "healthy",
ReportedVersion: Version,
Capabilities: map[string]any{
"native_node_agent": true,
"native_node_agent": true,
"node_update_plan_contract": "rap.node_update_plan.v1",
"node_update_status_report": true,
"vpn_assignment_status": true,
"vpn_packet_forwarding": true,
"vpn_fabric_packet_transport": true,
"vpn_local_gateway_shortcut": true,
"external_backend_entry_proxy": true,
},
ServiceStates: map[string]any{
"workload_supervision": "not_implemented_c3",
},
Metadata: map[string]any{
"stage": "c3",
"update_runtime": map[string]any{
"product": "rap-node-agent",
"current_version": Version,
"host_agent_present": true,
"self_update_enabled": true,
"rollback_executor_ready": true,
"reason": "host-agent updater active",
},
},
}
}
@@ -260,6 +260,7 @@ type SyntheticMeshRouteConfig struct {
}
type SyntheticMeshConfig struct {
Raw json.RawMessage `json:"-"`
Enabled bool `json:"enabled"`
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
@@ -286,6 +287,17 @@ type SyntheticMeshConfig struct {
ProductionForwarding bool `json:"production_forwarding"`
}
// UnmarshalJSON decodes data into the config's tagged fields and keeps a
// private copy of the original bytes in Raw.
//
// Decoding goes through a locally defined type with the same fields but no
// methods, so json.Unmarshal does not call back into this method. Raw is
// excluded from JSON by its `json:"-"` tag, so it is populated here from a
// copy of data rather than by aliasing the caller's slice.
func (c *SyntheticMeshConfig) UnmarshalJSON(data []byte) error {
	type plainConfig SyntheticMeshConfig
	var fields plainConfig
	err := json.Unmarshal(data, &fields)
	if err != nil {
		return err
	}
	fields.Raw = append(fields.Raw[:0], data...)
	*c = SyntheticMeshConfig(fields)
	return nil
}
type FabricServiceChannelRemediationCommand struct {
SchemaVersion string `json:"schema_version"`
CommandID string `json:"command_id"`
@@ -28,6 +28,9 @@ type Config struct {
MeshProductionForwardingEnabled bool
MeshProductionObservationSinkCapacity int
MeshListenAddr string
MeshListenPortMode string
MeshListenAutoPortStart int
MeshListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
MeshAdvertiseTransport string
@@ -58,6 +61,9 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls"), "Transport label for the advertised mesh endpoint.")
@@ -70,7 +76,7 @@ func Load(args []string, env map[string]string) (Config, error) {
heartbeatSeconds := getEnvInt(env, "RAP_HEARTBEAT_INTERVAL_SECONDS", 15)
fs.DurationVar(&cfg.HeartbeatInterval, "heartbeat-interval", time.Duration(heartbeatSeconds)*time.Second, "Heartbeat interval.")
enrollmentPollIntervalSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5)
enrollmentPollTimeoutSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600)
enrollmentPollTimeoutSeconds := getEnvSignedInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0)
fs.DurationVar(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", time.Duration(enrollmentPollIntervalSeconds)*time.Second, "Enrollment approval polling interval.")
fs.DurationVar(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", time.Duration(enrollmentPollTimeoutSeconds)*time.Second, "Enrollment approval polling timeout.")
if err := fs.Parse(args); err != nil {
@@ -84,6 +90,7 @@ func Load(args []string, env map[string]string) (Config, error) {
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
@@ -117,6 +124,20 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
}
switch cfg.MeshListenPortMode {
case "", "manual", "auto", "disabled":
if cfg.MeshListenPortMode == "" {
cfg.MeshListenPortMode = "manual"
}
default:
return Config{}, errors.New("mesh listen port mode must be manual, auto, or disabled")
}
if cfg.MeshListenAutoPortStart <= 0 || cfg.MeshListenAutoPortEnd <= 0 {
return Config{}, errors.New("mesh listen auto port range must be positive")
}
if cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return Config{}, errors.New("mesh listen auto port start must be less than or equal to end")
}
return cfg, nil
}
@@ -22,6 +22,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
"RAP_MESH_LISTEN_PORT_MODE": "auto",
"RAP_MESH_LISTEN_AUTO_PORT_START": "19010",
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
@@ -65,6 +68,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if cfg.MeshListenAddr != "127.0.0.1:19001" {
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
}
if cfg.MeshListenPortMode != "auto" || cfg.MeshListenAutoPortStart != 19010 || cfg.MeshListenAutoPortEnd != 19020 {
t.Fatalf("unexpected mesh listen port config: %+v", cfg)
}
if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" ||
cfg.MeshAdvertiseEndpointsJSON == "" ||
cfg.MeshAdvertiseTransport != "wss" ||
@@ -81,6 +87,19 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
}
}
// TestLoadConfigDefaultsEnrollmentPollingToNoTimeout verifies that Load leaves
// EnrollmentPollTimeout at zero when RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS is
// not present in the environment.
func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
	env := map[string]string{
		"RAP_BACKEND_URL": "http://backend/api/v1",
		"RAP_NODE_NAME":   "node-a",
	}
	loaded, loadErr := Load(nil, env)
	if loadErr != nil {
		t.Fatalf("load config: %v", loadErr)
	}
	if got := loaded.EnrollmentPollTimeout; got != 0 {
		t.Fatalf("EnrollmentPollTimeout = %s, want no timeout", got)
	}
}
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
@@ -0,0 +1,135 @@
package hostagent
import (
"errors"
"fmt"
"strings"
)
// Fallback values applied by RuntimeConfig.Normalize when the corresponding
// field is empty.
const (
	// DefaultContainerName is the Docker container name used when none is given.
	DefaultContainerName = "rap-node-agent"
	// DefaultImage is the Docker image reference used when none is given.
	DefaultImage = "rap-node-agent:latest"
	// DefaultStateDir is the state directory used when none is given.
	DefaultStateDir = "/var/lib/rap-node-agent"
	// DefaultNetwork is the Docker network used when none is given.
	DefaultNetwork = "host"
)
// RuntimeConfig describes how a rap-node-agent instance is placed on a host
// and which settings are handed to it through RAP_* environment variables.
// Normalize trims the values and applies defaults; ValidateInstall checks the
// result before an install.
type RuntimeConfig struct {
	// Backend and identity settings, exported as RAP_BACKEND_URL,
	// RAP_CLUSTER_ID, RAP_JOIN_TOKEN (only when non-empty) and RAP_NODE_NAME.
	BackendURL string
	ClusterID  string
	JoinToken  string
	NodeName   string

	// Docker placement: image reference, --name, host side of the state
	// volume mount, --network and --restart. Empty values fall back to the
	// package defaults ("unless-stopped" for RestartPolicy) in Normalize.
	Image         string
	ContainerName string
	StateDir      string
	Network       string
	RestartPolicy string

	// PullImage makes Install run "docker pull" for Image first.
	PullImage bool
	// Replace makes Install remove an existing container with the same name
	// before running, and lets ValidateInstall accept an empty JoinToken.
	Replace bool
	// DockerVPNGatewayEnabled adds --privileged, NET_ADMIN and the
	// /dev/net/tun device to the run arguments; Install also ensures the
	// host tun device exists.
	DockerVPNGatewayEnabled bool

	// Feature toggles exported as "true"/"false" RAP_* variables.
	WorkloadSupervisionEnabled      bool
	MeshSyntheticRuntimeEnabled     bool
	MeshProductionForwardingEnabled bool

	// Mesh settings; each is exported as its RAP_MESH_* variable only when
	// non-empty (strings) or greater than zero (ports).
	MeshListenAddr             string
	MeshListenPortMode         string // must be "", "manual", "auto" or "disabled"
	MeshListenAutoPortStart    int
	MeshListenAutoPortEnd      int
	MeshAdvertiseEndpoint      string
	MeshAdvertiseEndpointsJSON string
	MeshAdvertiseTransport     string
	MeshConnectivityMode       string
	MeshNATType                string
	MeshRegion                 string

	// Timing settings in seconds. Normalize replaces a zero heartbeat
	// interval with 15 and a zero enrollment poll interval with 5; the
	// enrollment poll timeout is passed through as given (zero included).
	HeartbeatIntervalSeconds      int
	EnrollmentPollIntervalSeconds int
	EnrollmentPollTimeoutSeconds  int

	// ExtraEnv holds additional KEY=VALUE entries appended after the
	// generated environment.
	ExtraEnv []string
	// AdditionalDockerRunArgs is appended to the docker run arguments before
	// the -e entries.
	AdditionalDockerRunArgs []string
	// ProductionObservationSinkCap is exported as
	// RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY when greater than zero.
	ProductionObservationSinkCap int

	// Image artifact source used by Install when PullImage is false: the
	// artifact is downloaded from the first URL that succeeds, checked
	// against the expected size and SHA-256, and loaded with "docker load".
	ImageArtifactURLs      []string
	ImageArtifactSHA256    string
	ImageArtifactSizeBytes int64
}
// Normalize returns a copy of cfg with whitespace trimmed from its string
// settings, trailing slashes removed from the backend URL and advertised
// endpoint, the listen port mode lower-cased, and defaults filled in for the
// image, container name, state dir, network, restart policy and the
// heartbeat / enrollment poll intervals.
func (cfg RuntimeConfig) Normalize() RuntimeConfig {
	trimURL := func(raw string) string {
		return strings.TrimRight(strings.TrimSpace(raw), "/")
	}

	// URL-like values lose surrounding whitespace and any trailing slashes.
	cfg.BackendURL = trimURL(cfg.BackendURL)
	cfg.MeshAdvertiseEndpoint = trimURL(cfg.MeshAdvertiseEndpoint)

	// Plain string values are only whitespace-trimmed. cfg is a local copy
	// (value receiver), so editing through field pointers is safe.
	for _, field := range []*string{
		&cfg.ClusterID,
		&cfg.JoinToken,
		&cfg.NodeName,
		&cfg.MeshListenAddr,
		&cfg.MeshAdvertiseEndpointsJSON,
		&cfg.MeshAdvertiseTransport,
		&cfg.MeshConnectivityMode,
		&cfg.MeshNATType,
		&cfg.MeshRegion,
		&cfg.ImageArtifactSHA256,
	} {
		*field = strings.TrimSpace(*field)
	}
	cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))

	// Placement settings fall back to their defaults when blank.
	for _, placement := range []struct {
		field    *string
		fallback string
	}{
		{&cfg.Image, DefaultImage},
		{&cfg.ContainerName, DefaultContainerName},
		{&cfg.StateDir, DefaultStateDir},
		{&cfg.Network, DefaultNetwork},
		{&cfg.RestartPolicy, "unless-stopped"},
	} {
		*placement.field = firstNonEmpty(*placement.field, placement.fallback)
	}

	// Only an unset (zero) interval is defaulted; negative values are left
	// for ValidateInstall to reject.
	if cfg.HeartbeatIntervalSeconds == 0 {
		cfg.HeartbeatIntervalSeconds = 15
	}
	if cfg.EnrollmentPollIntervalSeconds == 0 {
		cfg.EnrollmentPollIntervalSeconds = 5
	}
	return cfg
}
// ValidateInstall checks that the normalized configuration is complete and
// consistent enough to install a node agent.
//
// It returns an error when backend-url, cluster-id or node-name is missing,
// when the join token is absent and Replace is not set, when an interval is
// not positive, when the enrollment poll timeout is negative, when the mesh
// listen port mode or auto port range is invalid (negative, above 65535, or
// start greater than end), when the observation sink capacity is negative,
// or when an ExtraEnv entry is not KEY=VALUE with a non-empty key.
func (cfg RuntimeConfig) ValidateInstall() error {
	cfg = cfg.Normalize()
	var missing []string
	if cfg.BackendURL == "" {
		missing = append(missing, "backend-url")
	}
	if cfg.ClusterID == "" {
		missing = append(missing, "cluster-id")
	}
	if cfg.NodeName == "" {
		missing = append(missing, "node-name")
	}
	if len(missing) > 0 {
		return fmt.Errorf("missing required install settings: %s", strings.Join(missing, ", "))
	}
	if cfg.JoinToken == "" && !cfg.Replace {
		return errors.New("join-token is required for first install; pass -replace only when updating an already enrolled local state")
	}
	if cfg.HeartbeatIntervalSeconds <= 0 {
		return errors.New("heartbeat interval must be positive")
	}
	if cfg.EnrollmentPollIntervalSeconds <= 0 {
		return errors.New("enrollment poll interval must be positive")
	}
	if cfg.EnrollmentPollTimeoutSeconds < 0 {
		return errors.New("enrollment poll timeout must not be negative")
	}
	switch cfg.MeshListenPortMode {
	case "", "manual", "auto", "disabled":
	default:
		return errors.New("mesh listen port mode must be manual, auto, or disabled")
	}
	if cfg.MeshListenAutoPortStart < 0 || cfg.MeshListenAutoPortEnd < 0 {
		return errors.New("mesh listen auto port range must not be negative")
	}
	// Port numbers are 16-bit; reject values the agent could never bind
	// instead of letting the install succeed and the agent fail later.
	const maxPort = 65535
	if cfg.MeshListenAutoPortStart > maxPort || cfg.MeshListenAutoPortEnd > maxPort {
		return errors.New("mesh listen auto port range must not exceed 65535")
	}
	// Zero means "not set", so the ordering check only applies when both
	// ends of the range were provided.
	if cfg.MeshListenAutoPortStart > 0 && cfg.MeshListenAutoPortEnd > 0 && cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
		return errors.New("mesh listen auto port start must be less than or equal to end")
	}
	if cfg.ProductionObservationSinkCap < 0 {
		return errors.New("production observation sink capacity must not be negative")
	}
	for _, item := range cfg.ExtraEnv {
		// "=value" contains "=" but has no variable name, so require a
		// non-empty key as well as the separator.
		key, _, found := strings.Cut(item, "=")
		if !found || strings.TrimSpace(key) == "" {
			return fmt.Errorf("extra env %q must be KEY=VALUE", item)
		}
	}
	return nil
}
// firstNonEmpty returns value with surrounding whitespace removed, or
// fallback (unmodified) when value is empty or whitespace-only.
func firstNonEmpty(value, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return fallback
}
@@ -0,0 +1,335 @@
package hostagent
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
// CommandRunner abstracts execution of an external command so the managers in
// this package can run against a real process executor or a substitute.
type CommandRunner interface {
	// Run executes name with args under ctx and returns the command's output
	// together with any execution error.
	Run(ctx context.Context, name string, args ...string) (string, error)
}
type ExecRunner struct{}
func (ExecRunner) Run(ctx context.Context, name string, args ...string) (string, error) {
cmd := exec.CommandContext(ctx, name, args...)
out, err := cmd.CombinedOutput()
if err != nil {
return string(out), fmt.Errorf("%s %s: %w\n%s", name, strings.Join(args, " "), err, strings.TrimSpace(string(out)))
}
return string(out), nil
}
// DockerManager installs and inspects the node-agent container through the
// Docker command line.
type DockerManager struct {
	// Runner executes the docker commands; a nil Runner falls back to ExecRunner.
	Runner CommandRunner
	// Binary is the docker executable to invoke; empty means "docker".
	Binary string
}
// statHostPath is the stat function used to probe host paths such as
// /dev/net/tun. It is a package variable rather than a direct os.Stat call,
// presumably so it can be substituted in tests (no override is visible here).
var statHostPath = os.Stat
// InstallResult reports what DockerManager.Install did.
type InstallResult struct {
	// ContainerName and Image echo the normalized configuration values.
	ContainerName string
	Image         string
	// Replaced is true once the "docker rm -f" step for Replace has run,
	// whether or not a container existed.
	Replaced bool
	// Pulled is true when the image was fetched with "docker pull".
	Pulled bool
	// Loaded is true when the image was loaded from a downloaded artifact.
	Loaded bool
	// ContainerID is the trimmed output of "docker run".
	ContainerID string
}
// Install validates cfg and starts the node-agent container.
//
// Steps, in order: prepare the state directory; ensure the host tun device
// when the VPN gateway is enabled; obtain the image (docker pull when
// PullImage is set, otherwise from the configured artifact URLs if any);
// remove an existing container when Replace is set; and finally docker run.
// A validation failure returns an empty result; later failures return the
// partially filled result alongside the error.
func (m DockerManager) Install(ctx context.Context, cfg RuntimeConfig) (InstallResult, error) {
	if err := cfg.ValidateInstall(); err != nil {
		return InstallResult{}, err
	}
	cfg = cfg.Normalize()

	var runner CommandRunner = ExecRunner{}
	if m.Runner != nil {
		runner = m.Runner
	}
	docker := firstNonEmpty(m.Binary, "docker")

	result := InstallResult{
		ContainerName: cfg.ContainerName,
		Image:         cfg.Image,
	}

	if err := PrepareStateDir(cfg.StateDir); err != nil {
		return result, err
	}
	if cfg.DockerVPNGatewayEnabled {
		if err := ensureHostTunDevice(ctx, runner); err != nil {
			return result, err
		}
	}

	// The image is made available before the old container is removed.
	switch {
	case cfg.PullImage:
		if _, err := runner.Run(ctx, docker, "pull", cfg.Image); err != nil {
			return result, err
		}
		result.Pulled = true
	case len(cfg.ImageArtifactURLs) > 0:
		loaded, err := m.ensureImageFromArtifact(ctx, runner, docker, cfg)
		if err != nil {
			return result, err
		}
		result.Loaded = loaded
	}

	if cfg.Replace {
		// A missing container is not an error when replacing.
		_, rmErr := runner.Run(ctx, docker, "rm", "-f", cfg.ContainerName)
		if rmErr != nil && !isNoSuchContainerError(rmErr) {
			return result, rmErr
		}
		result.Replaced = true
	}

	containerID, err := runner.Run(ctx, docker, DockerRunArgs(cfg)...)
	if err != nil {
		return result, err
	}
	result.ContainerID = strings.TrimSpace(containerID)
	return result, nil
}
// ensureHostTunDevice makes sure /dev/net/tun exists on the host. When the
// device is missing it runs "modprobe tun" through runner and checks again,
// returning an error if loading the module fails or the device is still absent.
func ensureHostTunDevice(ctx context.Context, runner CommandRunner) error {
	const tunPath = "/dev/net/tun"

	_, statErr := statHostPath(tunPath)
	if statErr == nil {
		return nil
	}
	if _, loadErr := runner.Run(ctx, "modprobe", "tun"); loadErr != nil {
		return fmt.Errorf("docker vpn gateway requires host /dev/net/tun; modprobe tun failed: %w", loadErr)
	}
	if _, statErr = statHostPath(tunPath); statErr != nil {
		return fmt.Errorf("docker vpn gateway requires host /dev/net/tun after modprobe tun: %w", statErr)
	}
	return nil
}
// ensureImageFromArtifact makes cfg.Image available locally from a downloaded
// artifact. It reports false without downloading when the image already
// exists and Replace is not set; otherwise it downloads the artifact, runs
// "docker load", and confirms the image can be inspected afterwards. The
// temporary artifact file is removed before returning.
func (m DockerManager) ensureImageFromArtifact(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig) (bool, error) {
	// The inspect call is always issued, even when Replace forces a reload.
	_, inspectErr := runner.Run(ctx, docker, "image", "inspect", cfg.Image)
	if imagePresent := inspectErr == nil; imagePresent && !cfg.Replace {
		return false, nil
	}

	artifactPath, err := downloadFirstArtifact(ctx, cfg.ImageArtifactURLs, cfg.ImageArtifactSHA256, cfg.ImageArtifactSizeBytes)
	if err != nil {
		return false, err
	}
	defer os.Remove(artifactPath)

	if _, loadErr := runner.Run(ctx, docker, "load", "-i", artifactPath); loadErr != nil {
		return false, loadErr
	}
	// The artifact may not contain the tag that was asked for; verify it.
	if _, verifyErr := runner.Run(ctx, docker, "image", "inspect", cfg.Image); verifyErr != nil {
		return true, fmt.Errorf("loaded artifact but image %q is not available: %w", cfg.Image, verifyErr)
	}
	return true, nil
}
// downloadFirstArtifact tries each non-blank URL in order, up to three
// attempts per URL, and returns the temporary file path of the first download
// that passes verification.
//
// When every attempt fails the last download error is returned; when urls
// holds no usable entry the error says that no artifact URLs are configured.
// Once ctx is cancelled or past its deadline no further attempts are made and
// the error of the attempt that observed it is returned.
func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
	const attemptsPerURL = 3

	var lastErr error
	for _, rawURL := range urls {
		rawURL = strings.TrimSpace(rawURL)
		if rawURL == "" {
			continue
		}
		for attempt := 1; attempt <= attemptsPerURL; attempt++ {
			path, err := downloadArtifact(ctx, rawURL, expectedSHA256, expectedSizeBytes)
			if err == nil {
				return path, nil
			}
			lastErr = err
			// Every remaining attempt would fail the same way once the
			// context is done, so stop instead of walking the whole list.
			if ctx.Err() != nil {
				return "", lastErr
			}
		}
	}
	if lastErr != nil {
		return "", lastErr
	}
	return "", fmt.Errorf("no artifact URLs configured")
}
// downloadArtifact fetches rawURL into a temporary file and verifies it.
//
// The download is rejected (and the temporary file removed) when the response
// status is not 2xx, the body is shorter or longer than the advertised
// Content-Length, the size differs from expectedSizeBytes while a checksum is
// expected, or the SHA-256 does not match expectedSHA256 (case-insensitive).
// A size mismatch without an expected checksum is only reported on stdout.
// On success the path of the temporary file is returned; the caller owns it.
func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", err
	}
	response, err := http.DefaultClient.Do(request)
	if err != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, err)
	}
	defer response.Body.Close()
	if code := response.StatusCode; code < 200 || code >= 300 {
		return "", fmt.Errorf("download artifact %s: %s", rawURL, response.Status)
	}

	tmp, err := os.CreateTemp("", "rap-docker-image-*.tar")
	if err != nil {
		return "", err
	}
	path := tmp.Name()
	// discard removes the partial download and reports why it was rejected.
	discard := func(reason error) (string, error) {
		os.Remove(path)
		return "", reason
	}

	// Hash while streaming so the body is read only once.
	digest := sha256.New()
	written, copyErr := io.Copy(io.MultiWriter(tmp, digest), response.Body)
	closeErr := tmp.Close()
	switch {
	case copyErr != nil:
		return discard(copyErr)
	case closeErr != nil:
		return discard(closeErr)
	case response.ContentLength >= 0 && written != response.ContentLength:
		return discard(fmt.Errorf("artifact download truncated for %s: got %d bytes want content-length %d", rawURL, written, response.ContentLength))
	}

	wantSHA := strings.TrimSpace(expectedSHA256)
	if expectedSizeBytes > 0 && written != expectedSizeBytes {
		if wantSHA != "" {
			return discard(fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", rawURL, written, expectedSizeBytes))
		}
		fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding without checksum for backward-compatible installs\n", rawURL, written, expectedSizeBytes)
	}

	gotSHA := hex.EncodeToString(digest.Sum(nil))
	if wantSHA != "" && !strings.EqualFold(gotSHA, wantSHA) {
		return discard(fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", rawURL, gotSHA, wantSHA))
	}
	return path, nil
}
// Status lists the container with exactly the given name (running or not) via
// "docker ps -a" and returns its name, image and status as one tab-separated
// line. An empty containerName means DefaultContainerName.
func (m DockerManager) Status(ctx context.Context, containerName string) (string, error) {
	var runner CommandRunner = ExecRunner{}
	if m.Runner != nil {
		runner = m.Runner
	}
	target := firstNonEmpty(containerName, DefaultContainerName)
	// Anchored so that only the exact container name matches.
	nameFilter := "name=^/" + target + "$"
	return runner.Run(ctx, firstNonEmpty(m.Binary, "docker"),
		"ps", "-a",
		"--filter", nameFilter,
		"--format", "{{.Names}}\t{{.Image}}\t{{.Status}}",
	)
}
// PrepareStateDir creates the host state directory and opens its permissions
// to 0o777. Values that are blank or do not look like a host path (for
// example a named volume) are ignored. A chmod failure that isAccessDenied
// recognizes is tolerated; other failures are returned.
func PrepareStateDir(stateDir string) error {
	dir := strings.TrimSpace(stateDir)
	if dir == "" || !looksLikeHostPath(dir) {
		return nil
	}
	if err := os.MkdirAll(dir, 0o777); err != nil {
		return fmt.Errorf("prepare state dir %q: %w", dir, err)
	}
	// MkdirAll is subject to the umask, so set the mode explicitly.
	chmodErr := os.Chmod(dir, 0o777)
	if chmodErr == nil || isAccessDenied(chmodErr) {
		return nil
	}
	return fmt.Errorf("chmod state dir %q: %w", dir, chmodErr)
}
// DockerRunArgs builds the argument list for "docker run" from the normalized
// cfg: detached mode, name, restart policy, network and state volume first,
// then the VPN gateway flags when enabled, any additional run arguments, one
// "-e" pair per environment entry, and the image last.
func DockerRunArgs(cfg RuntimeConfig) []string {
	cfg = cfg.Normalize()

	args := []string{"run", "-d"}
	args = append(args, "--name", cfg.ContainerName)
	args = append(args, "--restart", cfg.RestartPolicy)
	args = append(args, "--network", cfg.Network)
	args = append(args, "-v", cfg.StateDir+":/var/lib/rap-node-agent")

	if cfg.DockerVPNGatewayEnabled {
		args = append(args, "--privileged")
		args = append(args, "--cap-add", "NET_ADMIN")
		args = append(args, "--device", "/dev/net/tun:/dev/net/tun")
	}

	args = append(args, cfg.AdditionalDockerRunArgs...)
	for _, entry := range NodeAgentEnv(cfg) {
		args = append(args, "-e", entry)
	}
	return append(args, cfg.Image)
}
// NodeAgentEnv returns the agent environment for a containerized agent, whose
// state directory is always the in-container mount point.
func NodeAgentEnv(cfg RuntimeConfig) []string {
	const containerStateDir = "/var/lib/rap-node-agent"
	return NodeAgentEnvWithStateDir(cfg, containerStateDir)
}
// NodeAgentEnvWithStateDir renders the normalized cfg as KEY=VALUE entries for
// the node agent. stateDir is reported as RAP_NODE_STATE_DIR, falling back to
// cfg.StateDir when blank. Required settings always appear; optional string
// settings are emitted only when non-empty and optional numeric settings only
// when greater than zero. ExtraEnv entries are appended last.
func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
	cfg = cfg.Normalize()
	agentStateDir := firstNonEmpty(stateDir, cfg.StateDir)

	env := []string{
		"RAP_BACKEND_URL=" + cfg.BackendURL,
		"RAP_CLUSTER_ID=" + cfg.ClusterID,
		"RAP_NODE_NAME=" + cfg.NodeName,
		"RAP_NODE_STATE_DIR=" + agentStateDir,
		"RAP_HEARTBEAT_INTERVAL_SECONDS=" + strconv.Itoa(cfg.HeartbeatIntervalSeconds),
		"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollIntervalSeconds),
		"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollTimeoutSeconds),
		"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
		"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
		"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
	}

	// Optional settings are skipped when unset so the agent keeps its own
	// defaults for them.
	addText := func(prefix, value string) {
		if value != "" {
			env = append(env, prefix+value)
		}
	}
	addPositive := func(prefix string, value int) {
		if value > 0 {
			env = append(env, prefix+strconv.Itoa(value))
		}
	}

	addText("RAP_JOIN_TOKEN=", cfg.JoinToken)
	addText("RAP_MESH_LISTEN_ADDR=", cfg.MeshListenAddr)
	addText("RAP_MESH_LISTEN_PORT_MODE=", cfg.MeshListenPortMode)
	addPositive("RAP_MESH_LISTEN_AUTO_PORT_START=", cfg.MeshListenAutoPortStart)
	addPositive("RAP_MESH_LISTEN_AUTO_PORT_END=", cfg.MeshListenAutoPortEnd)
	addText("RAP_MESH_ADVERTISE_ENDPOINT=", cfg.MeshAdvertiseEndpoint)
	addText("RAP_MESH_ADVERTISE_ENDPOINTS_JSON=", cfg.MeshAdvertiseEndpointsJSON)
	addText("RAP_MESH_ADVERTISE_TRANSPORT=", cfg.MeshAdvertiseTransport)
	addText("RAP_MESH_CONNECTIVITY_MODE=", cfg.MeshConnectivityMode)
	addText("RAP_MESH_NAT_TYPE=", cfg.MeshNATType)
	addText("RAP_MESH_REGION=", cfg.MeshRegion)
	addPositive("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY=", cfg.ProductionObservationSinkCap)

	return append(env, cfg.ExtraEnv...)
}
// RedactedArgs returns a copy of args in which the value following any "-e"
// flag is replaced by "RAP_JOIN_TOKEN=***" when it sets RAP_JOIN_TOKEN. The
// input slice is not modified.
func RedactedArgs(args []string) []string {
	redacted := append([]string(nil), args...)
	for idx := 1; idx < len(redacted); idx++ {
		if redacted[idx-1] != "-e" {
			continue
		}
		if strings.HasPrefix(redacted[idx], "RAP_JOIN_TOKEN=") {
			redacted[idx] = "RAP_JOIN_TOKEN=***"
		}
	}
	return redacted
}
// isNoSuchContainerError reports whether err's message, compared
// case-insensitively, says the container or object does not exist.
// err must be non-nil.
func isNoSuchContainerError(err error) bool {
	message := strings.ToLower(err.Error())
	for _, marker := range []string{"no such container", "no such object"} {
		if strings.Contains(message, marker) {
			return true
		}
	}
	return false
}
// looksLikeHostPath reports whether value reads as a filesystem path rather
// than a bare name: it is absolute, starts with "." or "~", or contains a
// forward slash or backslash.
func looksLikeHostPath(value string) bool {
	switch {
	case filepath.IsAbs(value):
		return true
	case strings.HasPrefix(value, "."), strings.HasPrefix(value, "~"):
		return true
	case strings.Contains(value, "/"), strings.Contains(value, `\`):
		return true
	default:
		return false
	}
}
// boolString renders value as the literal "true" or "false".
func boolString(value bool) string {
	return strconv.FormatBool(value)
}
@@ -0,0 +1,366 @@
package hostagent
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
)
// recordingRunner is a CommandRunner double that records every invocation and
// always succeeds.
type recordingRunner struct {
	calls [][]string // one entry per invocation: command name followed by its args
}

// Run records the call and answers "container-1\n" for a "run" subcommand and
// an empty string for everything else.
func (r *recordingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	invocation := make([]string, 0, len(args)+1)
	invocation = append(invocation, name)
	invocation = append(invocation, args...)
	r.calls = append(r.calls, invocation)

	isRun := len(args) > 0 && args[0] == "run"
	if !isRun {
		return "", nil
	}
	return "container-1\n", nil
}
// imageMissingRunner is a CommandRunner double that behaves as if the image is
// absent on the first "image inspect" and present on every later one.
type imageMissingRunner struct {
	calls       [][]string // one entry per invocation: command name followed by its args
	inspectSeen int        // number of "image inspect" calls observed so far
}

// Run records the call. The first "image inspect <ref>" fails with a
// "No such image" error and later ones return "[]"; a "run" subcommand returns
// "container-1\n"; everything else returns an empty string.
func (r *imageMissingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))

	isInspect := len(args) >= 3 && args[0] == "image" && args[1] == "inspect"
	switch {
	case isInspect:
		r.inspectSeen++
		if r.inspectSeen == 1 {
			return "", fmt.Errorf("No such image")
		}
		return "[]", nil
	case len(args) > 0 && args[0] == "run":
		return "container-1\n", nil
	default:
		return "", nil
	}
}
// imagePresentRunner is a CommandRunner double for which every command
// succeeds, so "image inspect" always finds the image.
type imagePresentRunner struct {
	calls [][]string // one entry per invocation: command name followed by its args
}

// Run records the call and answers "container-1\n" for a "run" subcommand and
// "[]" for everything else.
func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	invocation := append([]string{name}, args...)
	r.calls = append(r.calls, invocation)

	reply := "[]"
	if len(args) > 0 && args[0] == "run" {
		reply = "container-1\n"
	}
	return reply, nil
}
func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
args := DockerRunArgs(RuntimeConfig{
BackendURL: "http://control/api/v1/",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "/srv/rap/node-a",
MeshSyntheticRuntimeEnabled: true,
MeshListenAddr: ":19131",
MeshAdvertiseEndpoint: "http://10.0.0.11:19131/",
MeshConnectivityMode: "private_lan",
})
joined := strings.Join(args, "\x00")
for _, want := range []string{
"run", "-d", "--name\x00rap-node-agent-node-a", "--network\x00host",
"-v\x00/srv/rap/node-a:/var/lib/rap-node-agent",
"RAP_BACKEND_URL=http://control/api/v1",
"RAP_CLUSTER_ID=cluster-1",
"RAP_JOIN_TOKEN=join-secret",
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
"RAP_MESH_LISTEN_ADDR=:19131",
"RAP_MESH_ADVERTISE_ENDPOINT=http://10.0.0.11:19131",
"RAP_MESH_CONNECTIVITY_MODE=private_lan",
"rap-node-agent:test",
} {
if !strings.Contains(joined, want) {
t.Fatalf("docker args missing %q in %#v", want, args)
}
}
}
func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
args := DockerRunArgs(RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
StateDir: "rap-node-state",
DockerVPNGatewayEnabled: true,
})
joined := strings.Join(args, "\x00")
for _, want := range []string{
"--privileged",
"--cap-add\x00NET_ADMIN",
"--device\x00/dev/net/tun:/dev/net/tun",
} {
if !strings.Contains(joined, want) {
t.Fatalf("docker vpn gateway args missing %q in %#v", want, args)
}
}
}
// TestPrepareStateDirCreatesWritableHostPath checks that PrepareStateDir
// creates a missing host directory and leaves it with all permission bits set.
func TestPrepareStateDirCreatesWritableHostPath(t *testing.T) {
	stateDir := filepath.Join(t.TempDir(), "node-state")

	if prepErr := PrepareStateDir(stateDir); prepErr != nil {
		t.Fatalf("prepare state dir: %v", prepErr)
	}

	stat, statErr := os.Stat(stateDir)
	if statErr != nil {
		t.Fatalf("stat state dir: %v", statErr)
	}
	if !stat.IsDir() {
		t.Fatalf("state path is not a directory")
	}
	if perm := stat.Mode().Perm(); perm&0o777 != 0o777 {
		t.Fatalf("state dir mode = %v, want writable for container nonroot user", perm)
	}
}
// TestPrepareStateDirSkipsNamedVolume checks that a bare volume name is
// accepted without error.
func TestPrepareStateDirSkipsNamedVolume(t *testing.T) {
	const namedVolume = "rap-node-state"
	err := PrepareStateDir(namedVolume)
	if err != nil {
		t.Fatalf("named volume should be ignored: %v", err)
	}
}
// TestFetchDockerInstallProfileBuildsRuntimeConfig serves a Docker install
// profile from a stub backend and checks that the fetched profile converts
// into the expected RuntimeConfig.
func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on the server's goroutine, where FailNow (and so
		// Fatalf) must not be called; record the failure and answer 404 so
		// the fetch below fails as well.
		if r.URL.Path != "/api/v1/node-agents/docker-install-profile" {
			t.Errorf("path = %s", r.URL.Path)
			http.NotFound(w, r)
			return
		}
		err := json.NewEncoder(w).Encode(map[string]any{
			"docker_install_profile": map[string]any{
				"cluster_id":         "cluster-1",
				"backend_url":        "https://control.example.test/api/v1",
				"join_token":         "rap_join_profile",
				"node_name":          "node-a",
				"image":              "rap-node-agent:test",
				"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
				"docker_image_artifact": map[string]any{
					"kind":       "docker_image_tar",
					"image":      "rap-node-agent:test",
					"file_name":  "rap-node-agent-test.tar",
					"size_bytes": 21,
				},
				"container_name":                 "rap-node-agent-node-a",
				"state_dir":                      "/var/lib/rap/nodes/node-a",
				"network":                        "host",
				"restart_policy":                 "unless-stopped",
				"replace":                        true,
				"mesh_synthetic_runtime_enabled": true,
				"mesh_connectivity_mode":         "outbound_only",
			},
		})
		if err != nil {
			t.Errorf("encode profile: %v", err)
		}
	}))
	defer server.Close()

	profile, err := FetchDockerInstallProfile(context.Background(), ProfileRequest{
		URL:          server.URL + "/api/v1",
		ClusterID:    "cluster-1",
		InstallToken: "rap_join_profile",
		NodeName:     "node-a",
	})
	if err != nil {
		t.Fatalf("fetch profile: %v", err)
	}
	cfg := RuntimeConfigFromProfile(profile).Normalize()
	if cfg.BackendURL != "https://control.example.test/api/v1" ||
		cfg.ClusterID != "cluster-1" ||
		cfg.JoinToken != "rap_join_profile" ||
		cfg.ContainerName != "rap-node-agent-node-a" ||
		len(cfg.ImageArtifactURLs) != 1 ||
		cfg.ImageArtifactSizeBytes != 21 ||
		!cfg.MeshSyntheticRuntimeEnabled ||
		cfg.MeshConnectivityMode != "outbound_only" {
		t.Fatalf("unexpected cfg: %+v", cfg)
	}
}
func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
}
if !result.Loaded || result.ContainerID != "container-1" {
t.Fatalf("result = %+v", result)
}
joined := strings.Join(flattenCalls(runner.calls), "\x00")
if !strings.Contains(joined, "load\x00-i") || !strings.Contains(joined, "run\x00-d") {
t.Fatalf("expected docker load and run calls, got %#v", runner.calls)
}
}
func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
const payload = "fake docker image tar"
const wrongSize = 999
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(payload))
}))
defer server.Close()
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
ImageArtifactSizeBytes: wrongSize,
})
if err != nil {
t.Fatalf("install: %v", err)
}
if !result.Loaded || result.ContainerID != "container-1" {
t.Fatalf("result = %+v", result)
}
}
func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
runner := &imagePresentRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
}
if !result.Loaded {
t.Fatalf("expected image artifact reload, got %+v", result)
}
joined := strings.Join(flattenCalls(runner.calls), "\x00")
if !strings.Contains(joined, "load\x00-i") {
t.Fatalf("expected docker load even when image exists during replace, got %#v", runner.calls)
}
}
// TestDockerInstallLoadsExplicitArtifactBeforeReplace checks that an install
// with an explicit artifact URL requests exactly that URL, loads the artifact
// and performs the replace.
func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on the server's goroutine, where FailNow (and so
		// Fatalf) must not be called; record the failure and answer 404 so
		// the install below fails as well.
		if r.URL.Path != "/rap-node-agent-test.tar" {
			t.Errorf("unexpected path %s", r.URL.Path)
			http.NotFound(w, r)
			return
		}
		_, _ = w.Write([]byte("fake docker image tar"))
	}))
	defer server.Close()

	runner := &imageMissingRunner{}
	result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
		BackendURL:             "http://control/api/v1",
		ClusterID:              "cluster-1",
		JoinToken:              "join-secret",
		NodeName:               "node-a",
		Image:                  "rap-node-agent:test",
		ContainerName:          "rap-node-agent-node-a",
		StateDir:               "rap-node-state",
		Replace:                true,
		ImageArtifactURLs:      []string{server.URL + "/rap-node-agent-test.tar"},
		ImageArtifactSHA256:    "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
		ImageArtifactSizeBytes: 21,
	})
	if err != nil {
		t.Fatalf("install: %v", err)
	}
	if !result.Loaded || !result.Replaced {
		t.Fatalf("expected explicit artifact load and replace, got %+v", result)
	}
	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	if !strings.Contains(joined, "load\x00-i") {
		t.Fatalf("expected docker load call, got %#v", runner.calls)
	}
}
// flattenCalls concatenates the recorded invocations into one slice, keeping
// their order. The result is never nil.
func flattenCalls(calls [][]string) []string {
	flat := make([]string, 0)
	for i := range calls {
		flat = append(flat, calls[i]...)
	}
	return flat
}
func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
runner := &recordingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
PullImage: true,
Replace: true,
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
})
if err != nil {
t.Fatalf("install: %v", err)
}
if !result.Pulled || !result.Replaced || result.ContainerID != "container-1" {
t.Fatalf("result = %+v", result)
}
if len(runner.calls) != 3 {
t.Fatalf("calls = %#v", runner.calls)
}
redacted := strings.Join(RedactedArgs(runner.calls[2][1:]), " ")
if strings.Contains(redacted, "join-secret") || !strings.Contains(redacted, "RAP_JOIN_TOKEN=***") {
t.Fatalf("redacted args leaked token: %s", redacted)
}
}
// TestValidateRequiresJoinTokenUnlessReplacingExistingState checks that
// ValidateInstall demands a join token for a first install but accepts its
// absence when Replace is set.
func TestValidateRequiresJoinTokenUnlessReplacingExistingState(t *testing.T) {
	firstInstall := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a"}
	if err := firstInstall.ValidateInstall(); err == nil || !strings.Contains(err.Error(), "join-token") {
		t.Fatalf("expected join token validation error, got %v", err)
	}

	replacing := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a", Replace: true}
	if err := replacing.ValidateInstall(); err != nil {
		t.Fatalf("replace update should allow missing join token: %v", err)
	}
}
@@ -0,0 +1,481 @@
package hostagent
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"time"
)
// Root directories under which a per-node subdirectory (named after the node's
// unit slug) is used when the install, state or config directory is not set.
const (
	// DefaultLinuxInstallRoot is the parent of the default install directory.
	DefaultLinuxInstallRoot = "/opt/rap"
	// DefaultLinuxStateRoot is the parent of the default state directory.
	DefaultLinuxStateRoot = "/var/lib/rap/nodes"
	// DefaultLinuxConfigRoot is the parent of the default config directory.
	DefaultLinuxConfigRoot = "/etc/rap"
)
// LinuxInstallConfig is the input to LinuxManager.Install: the agent runtime
// settings plus host-level placement, artifact and update options.
type LinuxInstallConfig struct {
	// RuntimeConfig carries the node-agent settings; Install normalizes and
	// validates it.
	RuntimeConfig RuntimeConfig
	// NodeID is whitespace-trimmed by Install.
	// NOTE(review): how it is used afterwards is not visible in this chunk.
	NodeID string
	// InstallDir, StateDir and ConfigDir default to a per-node directory
	// under the DefaultLinux* roots. A non-empty StateDir takes precedence
	// over RuntimeConfig.StateDir.
	InstallDir string
	StateDir   string
	ConfigDir  string
	// UnitDir is where the service unit file is placed; empty means
	// DefaultSystemdUnitDir.
	UnitDir string
	// StartupMode is lower-cased by Install; empty means "systemd".
	StartupMode string
	// Node-agent artifact sources with the expected checksum and size.
	// NOTE(review): the download step is not visible in this chunk.
	ArtifactURLs      []string
	ArtifactSHA256    string
	ArtifactSizeBytes int64
	// Replace is copied into RuntimeConfig.Replace by Install.
	Replace bool
	// DryRun makes Install return the planned result right after validation
	// and path resolution, before any directory is created.
	DryRun bool
	// Auto-update settings.
	// NOTE(review): their consumers are not visible in this chunk; the names
	// suggest they configure the host-agent update loop — confirm.
	AutoUpdateEnabled              bool
	AutoUpdateCurrentVersion       string
	AutoUpdateChannel              string
	AutoUpdateIntervalSeconds      int
	AutoUpdateInitialDelaySeconds  int
	AutoUpdateHealthTimeoutSeconds int
	// NOTE(review): presumably the path of the host-agent binary to install —
	// confirm against Install.
	HostAgentSourcePath string
}
// LinuxInstallResult reports the resolved locations and outcome of
// LinuxManager.Install.
type LinuxInstallResult struct {
	// NodeName is the normalized node name from the runtime configuration.
	NodeName string
	// InstallDir, StateDir and ConfigDir are the resolved directories.
	InstallDir string
	StateDir   string
	ConfigDir  string
	// NodeAgentPath and HostAgentPath are "rap-node-agent" and
	// "rap-host-agent" inside InstallDir.
	NodeAgentPath string
	HostAgentPath string
	// EnvPath is "rap-node-agent.env" inside ConfigDir.
	EnvPath string
	// UnitName is "rap-node-agent-<slug>.service"; UnitPath is that file
	// inside the unit directory.
	UnitName string
	UnitPath string
	// NOTE(review): the fields below are filled in by parts of Install that
	// are not visible in this chunk; meanings inferred from the names only.
	UpdaterUnitName string
	Downloaded      bool
	Started         bool
	UpdaterStarted  bool
}
// LinuxManager installs the node agent directly on a Linux host.
type LinuxManager struct {
	// Runner executes external commands.
	// NOTE(review): nil handling is not visible in this chunk.
	Runner CommandRunner
}
// LinuxInstallConfigFromProfile converts a backend-provided Linux install
// profile into a LinuxInstallConfig. Directories missing from the profile
// default to per-node paths under the DefaultLinux* roots, the startup mode
// defaults to "systemd", and Replace and AutoUpdateEnabled are always set.
func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConfig {
	stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultLinuxStateRoot, safeUnitSlug(profile.NodeName)))
	installDir := firstNonEmpty(profile.InstallDir, filepath.Join(DefaultLinuxInstallRoot, safeUnitSlug(profile.NodeName)))

	// Agent settings are copied across field by field.
	runtimeCfg := RuntimeConfig{
		BackendURL:                      profile.BackendURL,
		ClusterID:                       profile.ClusterID,
		JoinToken:                       profile.JoinToken,
		NodeName:                        profile.NodeName,
		StateDir:                        stateDir,
		WorkloadSupervisionEnabled:      profile.WorkloadSupervisionEnabled,
		MeshSyntheticRuntimeEnabled:     profile.MeshSyntheticRuntimeEnabled,
		MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
		MeshListenAddr:                  profile.MeshListenAddr,
		MeshListenPortMode:              profile.MeshListenPortMode,
		MeshListenAutoPortStart:         profile.MeshListenAutoPortStart,
		MeshListenAutoPortEnd:           profile.MeshListenAutoPortEnd,
		MeshAdvertiseEndpoint:           profile.MeshAdvertiseEndpoint,
		MeshAdvertiseEndpointsJSON:      string(profile.MeshAdvertiseEndpointsJSON),
		MeshAdvertiseTransport:          profile.MeshAdvertiseTransport,
		MeshConnectivityMode:            profile.MeshConnectivityMode,
		MeshNATType:                     profile.MeshNATType,
		MeshRegion:                      profile.MeshRegion,
		HeartbeatIntervalSeconds:        profile.HeartbeatIntervalSeconds,
		EnrollmentPollIntervalSeconds:   profile.EnrollmentPollIntervalSeconds,
		EnrollmentPollTimeoutSeconds:    profile.EnrollmentPollTimeoutSeconds,
		ProductionObservationSinkCap:    profile.ProductionObservationSinkCapacity,
	}

	installCfg := LinuxInstallConfig{
		RuntimeConfig: runtimeCfg,
		InstallDir:    installDir,
		StateDir:      stateDir,
	}
	installCfg.ConfigDir = filepath.Join(DefaultLinuxConfigRoot, safeUnitSlug(profile.NodeName))
	installCfg.StartupMode = firstNonEmpty(profile.StartupMode, "systemd")
	installCfg.ArtifactURLs = linuxArtifactURLs(profile)
	installCfg.ArtifactSHA256 = linuxArtifactSHA256(profile)
	installCfg.ArtifactSizeBytes = linuxArtifactSizeBytes(profile)
	installCfg.Replace = true
	installCfg.AutoUpdateEnabled = true
	return installCfg
}
// linuxArtifactURLs returns the download URLs for the node-agent artifact.
// Explicit URLs on the artifact win (returned as a copy); otherwise one URL
// per non-blank artifact endpoint is built by joining the endpoint and the
// artifact file name. It returns nil when the profile has no artifact or the
// artifact has neither URLs nor a file name.
func linuxArtifactURLs(profile LinuxInstallProfile) []string {
	artifact := profile.NodeAgentArtifact
	if artifact == nil {
		return nil
	}
	if len(artifact.URLs) > 0 {
		return append([]string(nil), artifact.URLs...)
	}

	fileName := strings.TrimSpace(artifact.FileName)
	if fileName == "" {
		return nil
	}
	// Strip slashes on both sides of the join to avoid "//" in the URL.
	fileName = strings.TrimLeft(fileName, "/")

	urls := []string{}
	for _, endpoint := range profile.ArtifactEndpoints {
		base := strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if base == "" {
			continue
		}
		urls = append(urls, base+"/"+fileName)
	}
	return urls
}
// linuxArtifactSHA256 returns the whitespace-trimmed SHA256 recorded on the
// profile's node-agent artifact, or "" when the profile carries no artifact.
func linuxArtifactSHA256(profile LinuxInstallProfile) string {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return strings.TrimSpace(artifact.SHA256)
	}
	return ""
}
// linuxArtifactSizeBytes returns the size recorded on the profile's node-agent
// artifact, or 0 when the profile carries no artifact.
func linuxArtifactSizeBytes(profile LinuxInstallProfile) int64 {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return artifact.SizeBytes
	}
	return 0
}
// Install lays out the node-agent on a Linux host: it creates the install,
// state and config directories, optionally downloads the node-agent binary,
// writes the environment file and the systemd unit, enables and starts the
// unit, and finally hands over to installLinuxHostAgentUpdater.
//
// Empty directory fields in cfg are defaulted from the node-name slug. With
// cfg.DryRun set, the computed LinuxInstallResult is returned without touching
// the host. On error the partially populated result is returned alongside it
// so callers can see the resolved paths.
//
// The artifact is downloaded and verified BEFORE the running service is
// stopped, so a failed download never takes a working node-agent offline.
func (m LinuxManager) Install(ctx context.Context, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
	cfg.NodeID = strings.TrimSpace(cfg.NodeID)
	cfg.RuntimeConfig.Replace = cfg.Replace
	cfg.RuntimeConfig.StateDir = firstNonEmpty(cfg.StateDir, cfg.RuntimeConfig.StateDir)
	cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
	if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
		return LinuxInstallResult{}, err
	}

	// Resolve every path up front so the dry-run result matches a real run.
	slug := safeUnitSlug(cfg.RuntimeConfig.NodeName)
	cfg.InstallDir = firstNonEmpty(cfg.InstallDir, filepath.Join(DefaultLinuxInstallRoot, slug))
	cfg.StateDir = firstNonEmpty(cfg.RuntimeConfig.StateDir, filepath.Join(DefaultLinuxStateRoot, slug))
	cfg.ConfigDir = firstNonEmpty(cfg.ConfigDir, filepath.Join(DefaultLinuxConfigRoot, slug))
	cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
	cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "systemd"))
	unitName := "rap-node-agent-" + slug + ".service"
	result := LinuxInstallResult{
		NodeName:      cfg.RuntimeConfig.NodeName,
		InstallDir:    cfg.InstallDir,
		StateDir:      cfg.StateDir,
		ConfigDir:     cfg.ConfigDir,
		NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
		HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
		EnvPath:       filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
		UnitName:      unitName,
		UnitPath:      filepath.Join(cfg.UnitDir, unitName),
	}
	if cfg.DryRun {
		return result, nil
	}
	if runtime.GOOS != "linux" {
		return result, fmt.Errorf("linux install is only supported on linux hosts")
	}

	if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
		return result, err
	}
	// The state directory is private to the installing user (0700).
	if err := os.MkdirAll(cfg.StateDir, 0o700); err != nil {
		return result, err
	}
	if err := os.MkdirAll(cfg.ConfigDir, 0o755); err != nil {
		return result, err
	}

	if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
		// Fetch first: if no artifact can be obtained the existing service
		// keeps running untouched.
		path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
		if err != nil {
			return result, err
		}
		defer os.Remove(path)
		// Stop the unit only once a verified artifact is ready to be copied in.
		m.stopService(ctx, result.UnitName)
		if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
			// Stop once more and retry the copy a single time.
			m.stopService(ctx, result.UnitName)
			if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr != nil {
				return result, fmt.Errorf("%w; retry after stopping %s failed: %v", err, result.UnitName, retryErr)
			}
		}
		result.Downloaded = true
	}
	if !fileExists(result.NodeAgentPath) {
		return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
	}

	// The env file is written 0600 because it is rendered from the runtime
	// configuration.
	if err := os.WriteFile(result.EnvPath, []byte(linuxEnvFile(cfg.RuntimeConfig, cfg.StateDir)), 0o600); err != nil {
		return result, err
	}
	if cfg.StartupMode != "none" {
		if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
			return result, err
		}
		if err := os.WriteFile(result.UnitPath, []byte(linuxNodeAgentUnit(result)), 0o644); err != nil {
			return result, err
		}
		runner := m.runner()
		if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
			return result, err
		}
		if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.UnitName); err != nil {
			return result, err
		}
		result.Started = true
	}
	return installLinuxHostAgentUpdater(ctx, m, result, cfg)
}
// stopService asks systemd to stop unitName. A blank unit name is ignored, and
// the outcome of the systemctl call is deliberately discarded: stopping is
// best-effort from the caller's point of view.
func (m LinuxManager) stopService(ctx context.Context, unitName string) {
	if strings.TrimSpace(unitName) != "" {
		_, _ = m.runner().Run(ctx, "systemctl", "stop", unitName)
	}
}
// runner returns the CommandRunner configured on the manager, falling back to
// an ExecRunner when none was set.
func (m LinuxManager) runner() CommandRunner {
	if m.Runner == nil {
		return ExecRunner{}
	}
	return m.Runner
}
// linuxEnvFile renders the systemd EnvironmentFile content for the node-agent.
// Every KEY=VALUE entry produced by NodeAgentEnvWithStateDir becomes one line
// with the value passed through systemdQuote; entries without "=" are dropped.
// The output always ends with a newline, including when there are no entries.
func linuxEnvFile(cfg RuntimeConfig, stateDir string) string {
	var content strings.Builder
	for _, entry := range NodeAgentEnvWithStateDir(cfg, stateDir) {
		name, raw, found := strings.Cut(entry, "=")
		if !found {
			continue
		}
		content.WriteString(name)
		content.WriteString("=")
		content.WriteString(systemdQuote(raw))
		content.WriteString("\n")
	}
	if content.Len() == 0 {
		// No usable entries still yields a single trailing newline.
		return "\n"
	}
	return content.String()
}
// linuxNodeAgentUnit renders the systemd unit file for the node-agent service.
// The description carries the node name; EnvironmentFile and ExecStart are the
// result's env and binary paths passed through systemdQuote.
func linuxNodeAgentUnit(result LinuxInstallResult) string {
	const unitTemplate = `[Unit]
Description=RAP node-agent %s
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
EnvironmentFile=%s
ExecStart=%s
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
`
	envFile := systemdQuote(result.EnvPath)
	execStart := systemdQuote(result.NodeAgentPath)
	return fmt.Sprintf(unitTemplate, result.NodeName, envFile, execStart)
}
// installLinuxHostAgentUpdater installs the host-agent binary and a systemd
// unit that runs its "update-loop" command for the node-agent described by
// result, then enables and starts that unit.
//
// It is a no-op (result returned unchanged) when auto-update is disabled or
// the startup mode is "none". The reported current version is forced to
// "0.0.0" when none was supplied, or when a replace was requested but no
// artifact was downloaded. Zero interval, initial-delay and health-timeout
// values fall back to 21600, 15 and 30 seconds respectively.
func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result LinuxInstallResult, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
	const updaterUnitTemplate = `[Unit]
Description=RAP host-agent updater for %s
After=network-online.target %s
Wants=network-online.target
[Service]
Type=simple
ExecStart=%s
Restart=always
RestartSec=30
[Install]
WantedBy=multi-user.target
`
	if strings.EqualFold(cfg.StartupMode, "none") || !cfg.AutoUpdateEnabled {
		return result, nil
	}

	replacedWithoutDownload := cfg.Replace && !result.Downloaded
	if replacedWithoutDownload || cfg.AutoUpdateCurrentVersion == "" {
		cfg.AutoUpdateCurrentVersion = "0.0.0"
	}
	if err := installHostAgentBinary(cfg.HostAgentSourcePath, result.HostAgentPath); err != nil {
		return result, err
	}

	// Apply the loop timing defaults for values left at zero.
	intervalSeconds := cfg.AutoUpdateIntervalSeconds
	if intervalSeconds == 0 {
		intervalSeconds = 21600
	}
	initialDelaySeconds := cfg.AutoUpdateInitialDelaySeconds
	if initialDelaySeconds == 0 {
		initialDelaySeconds = 15
	}
	healthTimeoutSeconds := cfg.AutoUpdateHealthTimeoutSeconds
	if healthTimeoutSeconds == 0 {
		healthTimeoutSeconds = 30
	}

	// Assemble the updater command line in flag groups.
	command := []string{result.HostAgentPath, "update-loop"}
	command = append(command,
		"--backend-url", cfg.RuntimeConfig.BackendURL,
		"--cluster-id", cfg.RuntimeConfig.ClusterID,
		"--state-dir", result.StateDir,
		"--current-version", cfg.AutoUpdateCurrentVersion,
	)
	command = append(command,
		"--os", "linux",
		"--arch", runtime.GOARCH,
		"--install-type", BinaryUpdateInstallType,
		"--binary-path", result.NodeAgentPath,
		"--systemd-unit", result.UnitName,
	)
	command = append(command,
		"--health-timeout-seconds", fmt.Sprintf("%d", healthTimeoutSeconds),
		"--interval-seconds", fmt.Sprintf("%d", intervalSeconds),
		"--initial-delay-seconds", fmt.Sprintf("%d", initialDelaySeconds),
	)
	command = append(command,
		"--host-agent-update-status-enabled",
		"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
		"--host-agent-binary-path", result.HostAgentPath,
	)
	if nodeID := strings.TrimSpace(cfg.NodeID); nodeID != "" {
		command = append(command, "--node-id", nodeID)
	}
	if channel := strings.TrimSpace(cfg.AutoUpdateChannel); channel != "" {
		command = append(command, "--channel", channel)
	}

	updaterUnit := "rap-host-agent-updater-" + safeUnitSlug(result.NodeName) + ".service"
	updaterPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), updaterUnit)
	content := fmt.Sprintf(updaterUnitTemplate, result.NodeName, result.UnitName, systemdJoin(command))
	if err := os.WriteFile(updaterPath, []byte(content), 0o644); err != nil {
		return result, err
	}

	systemctl := m.runner()
	if _, err := systemctl.Run(ctx, "systemctl", "daemon-reload"); err != nil {
		return result, err
	}
	if _, err := systemctl.Run(ctx, "systemctl", "enable", "--now", updaterUnit); err != nil {
		return result, err
	}
	result.UpdaterUnitName = updaterUnit
	result.UpdaterStarted = true
	return result, nil
}
// ApplyUpdate fetches the node update plan for req and, when the plan's action
// is "update", downloads the artifact, copies it over req.BinaryPath and
// restarts req.SystemdUnitName through systemctl.
//
// Empty InstallType, OS and Arch fields are defaulted to the binary install
// type, "linux" and runtime.GOARCH before the request is normalized and
// resolved. Progress and failures are reported with ReportNodeUpdateStatus;
// reporting errors are discarded, so a reporting failure never fails the
// update itself. With req.DryRun set nothing is downloaded, replaced or
// reported for a no-op plan.
//
// The returned UpdateResult reuses ContainerName for the systemd unit name and
// NewImage for the binary path.
func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
req.OS = firstNonEmpty(req.OS, "linux")
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
req = req.Normalize()
var err error
req, err = resolveUpdateRequest(req)
if err != nil {
return UpdateResult{}, err
}
plan, err := FetchNodeUpdatePlan(ctx, req)
if err != nil {
return UpdateResult{}, err
}
result := UpdateResult{Action: plan.Action, Reason: plan.Reason, TargetVersion: plan.TargetVersion, ContainerName: req.SystemdUnitName, NewImage: req.BinaryPath}
// Any action other than "update" is a no-op: report it (unless dry-run) and stop.
if plan.Action != "update" {
if !req.DryRun {
status := statusFromNoopPlan(req, plan)
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["systemd_unit"] = req.SystemdUnitName
status.Payload["binary_path"] = req.BinaryPath
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
}
return result, nil
}
// Preflight: refuse plans this host must not or cannot apply.
if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact == nil {
err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
// An empty artifact install type is accepted; any other value must match the binary install type.
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
// Dry-run stops here: the plan passed preflight but nothing is downloaded or changed.
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
return result, err
}
// The downloaded file is temporary; remove it once the copy has been attempted.
defer os.Remove(path)
runner := m.runner()
// Best-effort stop before overwriting the binary; the stop result is ignored.
// NOTE(review): if copyFile fails below, nothing in this function starts the
// unit again, so the service stays stopped until an operator or a later run
// intervenes — confirm this is intended.
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
return result, err
}
result.Replaced = true
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
// NOTE(review): "health_check succeeded" is reported as soon as the restart
// command returns; no separate health probe is visible in this function.
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
// Persist the new version; a failure to save state does not fail the update.
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
return result, nil
}
// RunUpdateLoop fills in the Linux defaults on the loop's update request
// (binary install type, "linux", runtime.GOARCH) and then runs the update loop
// until it returns.
func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
	cfg.Request.InstallType = firstNonEmpty(cfg.Request.InstallType, BinaryUpdateInstallType)
	cfg.Request.OS = firstNonEmpty(cfg.Request.OS, "linux")
	cfg.Request.Arch = firstNonEmpty(cfg.Request.Arch, runtime.GOARCH)
	return runLinuxUpdateLoop(ctx, m, cfg)
}
// runLinuxUpdateLoop repeatedly applies node-agent updates and, when enabled,
// host-agent updates, pausing between runs until the jittered interval elapses
// or an update trigger fires.
//
// A zero Interval defaults to one hour. While the node identity is not ready
// the run is logged, the host-agent step is skipped and the loop waits for the
// next run. Other node update errors are logged and end the loop only when
// StopOnError is set. Host-agent update errors are logged and never end the
// loop. The loop returns nil after MaxRuns runs (when MaxRuns > 0) and returns
// the sleep error when waiting is interrupted.
func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfig) error {
	if cfg.Interval == 0 {
		cfg.Interval = time.Hour
	}
	logf := cfg.Logf
	if logf == nil {
		logf = func(string, ...any) {}
	}
	if cfg.InitialDelay > 0 {
		if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
			return err
		}
	}

	stateDir := cfg.Request.StateDir
	triggerGeneration := currentUpdateTriggerGeneration(stateDir)
	// waitForNextRun blocks until the next interval or an explicit trigger.
	waitForNextRun := func() error {
		return sleepUntilUpdateIntervalOrTrigger(ctx, stateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &triggerGeneration)
	}
	// budgetSpent reports whether the configured run limit has been reached.
	budgetSpent := func(run int) bool {
		return cfg.MaxRuns > 0 && run >= cfg.MaxRuns
	}

	for run := 1; ; run++ {
		outcome, err := m.ApplyUpdate(ctx, cfg.Request)
		switch {
		case err == nil:
			logf("linux_update_loop run=%d action=%s reason=%s target=%s unit=%s replaced=%t", run, outcome.Action, outcome.Reason, outcome.TargetVersion, outcome.ContainerName, outcome.Replaced)
			if outcome.Action == "update" && outcome.TargetVersion != "" {
				// Subsequent plans are requested against the version just installed.
				cfg.Request.CurrentVersion = outcome.TargetVersion
			}
		case errors.Is(err, ErrNodeIdentityNotReady):
			logf("linux_update_loop run=%d status=waiting_for_node_identity state_dir=%s", run, stateDir)
			if budgetSpent(run) {
				return nil
			}
			if err := waitForNextRun(); err != nil {
				return err
			}
			continue
		default:
			logf("linux_update_loop run=%d status=failed error=%v", run, err)
			if cfg.StopOnError {
				return err
			}
		}

		if cfg.HostAgentUpdateEnabled {
			// Blank host-agent request fields inherit from the node request.
			selfReq := cfg.HostAgentUpdateRequest
			selfReq.BackendURL = firstNonEmpty(selfReq.BackendURL, cfg.Request.BackendURL)
			selfReq.ClusterID = firstNonEmpty(selfReq.ClusterID, cfg.Request.ClusterID)
			selfReq.NodeID = firstNonEmpty(selfReq.NodeID, cfg.Request.NodeID)
			selfReq.StateDir = firstNonEmpty(selfReq.StateDir, cfg.Request.StateDir)
			selfReq.Channel = firstNonEmpty(selfReq.Channel, cfg.Request.Channel)
			selfReq.OS = firstNonEmpty(selfReq.OS, "linux")
			selfReq.Arch = firstNonEmpty(selfReq.Arch, runtime.GOARCH)
			selfReq.InstallType = firstNonEmpty(selfReq.InstallType, BinaryUpdateInstallType)
			selfResult, selfErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, selfReq)
			if selfErr != nil {
				logf("linux_host_agent_update_loop run=%d status=failed error=%v", run, selfErr)
			} else {
				logf("linux_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t", run, selfResult.Action, selfResult.Reason, selfResult.TargetVersion, selfResult.NewImage, selfResult.Replaced, selfResult.RestartNeeded)
				if selfResult.Action == "update" && selfResult.TargetVersion != "" && !selfResult.RolledBack {
					cfg.HostAgentUpdateRequest.CurrentVersion = selfResult.TargetVersion
				}
			}
		}

		if budgetSpent(run) {
			return nil
		}
		if err := waitForNextRun(); err != nil {
			return err
		}
	}
}
@@ -0,0 +1,333 @@
package hostagent
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
)
// DockerInstallProfile is the install profile for a Docker-based node-agent.
// FetchDockerInstallProfile decodes it from the "docker_install_profile" field
// of the profile response, and RuntimeConfigFromProfile copies it into a
// RuntimeConfig.
type DockerInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
// BackendURL falls back to the first ControlPlaneEndpoints entry when the
// fetched profile leaves it blank (see FetchDockerInstallProfile).
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
// ArtifactEndpoints are base URLs joined with the artifact file name when the
// artifact lists no explicit URLs (see dockerArtifactURLs).
ArtifactEndpoints []string `json:"artifact_endpoints"`
// DockerImageArtifact may be nil; the artifact helpers then report no URLs,
// an empty checksum and a zero size.
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
Image string `json:"image"`
ContainerName string `json:"container_name"`
StateDir string `json:"state_dir"`
Network string `json:"network"`
RestartPolicy string `json:"restart_policy"`
PullImage bool `json:"pull_image"`
Replace bool `json:"replace"`
DockerVPNGatewayEnabled bool `json:"docker_vpn_gateway_enabled"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
// MeshAdvertiseEndpointsJSON is kept as raw JSON and converted to a string
// when the RuntimeConfig is built.
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
// NOTE(review): Roles is decoded but not copied by RuntimeConfigFromProfile.
Roles []string `json:"roles"`
}
// DockerArtifact describes a downloadable artifact referenced by an install
// profile. Despite the name it is also used for the node-agent binary in the
// Windows and Linux profiles (NodeAgentArtifact).
type DockerArtifact struct {
Kind string `json:"kind"`
Image string `json:"image"`
MediaType string `json:"media_type"`
// FileName is joined with each artifact endpoint when URLs is empty.
FileName string `json:"file_name"`
// URLs, when non-empty, are used as the download locations as-is.
URLs []string `json:"urls"`
// SHA256 and SizeBytes are passed to the downloader together with the URLs.
SHA256 string `json:"sha256"`
SizeBytes int64 `json:"size_bytes"`
}
// WindowsInstallProfile is the install profile for a native Windows
// node-agent. FetchWindowsInstallProfile decodes it from the
// "windows_install_profile" field of the profile response.
type WindowsInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
// BackendURL falls back to the first ControlPlaneEndpoints entry when the
// fetched profile leaves it blank (see FetchWindowsInstallProfile).
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
// NodeAgentArtifact describes the node-agent binary to download; may be nil.
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
StateDir string `json:"state_dir"`
InstallDir string `json:"install_dir"`
StartupMode string `json:"startup_mode"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
// MeshAdvertiseEndpointsJSON is kept as raw, undecoded JSON.
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
Roles []string `json:"roles"`
}
// LinuxInstallProfile is the install profile for a native Linux node-agent.
// FetchLinuxInstallProfile decodes it from the "linux_install_profile" field
// of the profile response.
type LinuxInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
// BackendURL falls back to the first ControlPlaneEndpoints entry when the
// fetched profile leaves it blank (see FetchLinuxInstallProfile).
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
// ArtifactEndpoints are base URLs joined with NodeAgentArtifact.FileName when
// the artifact lists no explicit URLs (see linuxArtifactURLs).
ArtifactEndpoints []string `json:"artifact_endpoints"`
// NodeAgentArtifact describes the node-agent binary to download; may be nil.
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
StateDir string `json:"state_dir"`
InstallDir string `json:"install_dir"`
// StartupMode defaults to "systemd" when empty where the install config is
// built from this profile.
StartupMode string `json:"startup_mode"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
// MeshAdvertiseEndpointsJSON is kept as raw JSON and converted to a string
// when the runtime configuration is built.
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
Roles []string `json:"roles"`
}
// ProfileRequest carries the inputs shared by the Fetch*InstallProfile
// functions.
type ProfileRequest struct {
// URL is the profile endpoint or its base URL; the platform-specific
// "/node-agents/<platform>-install-profile" suffix is appended when missing.
// Required.
URL string
// ClusterID, InstallToken and NodeName are whitespace-trimmed and sent in the
// JSON request body. InstallToken is required.
ClusterID string
InstallToken string
NodeName string
// HTTPClient is optional; a client with a 20 second timeout is used when nil.
HTTPClient *http.Client
}
// FetchDockerInstallProfile requests the Docker install profile from the
// Control Plane. req.URL and req.InstallToken are required. When the returned
// profile has a blank BackendURL, the first control-plane endpoint is used.
func FetchDockerInstallProfile(ctx context.Context, req ProfileRequest) (DockerInstallProfile, error) {
	var envelope struct {
		Profile DockerInstallProfile `json:"docker_install_profile"`
	}
	if err := fetchInstallProfile(ctx, req, "docker", &envelope); err != nil {
		return DockerInstallProfile{}, err
	}
	envelope.Profile.BackendURL = backendURLOrFirstEndpoint(envelope.Profile.BackendURL, envelope.Profile.ControlPlaneEndpoints)
	return envelope.Profile, nil
}

// FetchWindowsInstallProfile requests the Windows install profile from the
// Control Plane. req.URL and req.InstallToken are required. When the returned
// profile has a blank BackendURL, the first control-plane endpoint is used.
func FetchWindowsInstallProfile(ctx context.Context, req ProfileRequest) (WindowsInstallProfile, error) {
	var envelope struct {
		Profile WindowsInstallProfile `json:"windows_install_profile"`
	}
	if err := fetchInstallProfile(ctx, req, "windows", &envelope); err != nil {
		return WindowsInstallProfile{}, err
	}
	envelope.Profile.BackendURL = backendURLOrFirstEndpoint(envelope.Profile.BackendURL, envelope.Profile.ControlPlaneEndpoints)
	return envelope.Profile, nil
}

// FetchLinuxInstallProfile requests the Linux install profile from the Control
// Plane. req.URL and req.InstallToken are required. When the returned profile
// has a blank BackendURL, the first control-plane endpoint is used.
func FetchLinuxInstallProfile(ctx context.Context, req ProfileRequest) (LinuxInstallProfile, error) {
	var envelope struct {
		Profile LinuxInstallProfile `json:"linux_install_profile"`
	}
	if err := fetchInstallProfile(ctx, req, "linux", &envelope); err != nil {
		return LinuxInstallProfile{}, err
	}
	envelope.Profile.BackendURL = backendURLOrFirstEndpoint(envelope.Profile.BackendURL, envelope.Profile.ControlPlaneEndpoints)
	return envelope.Profile, nil
}

// fetchInstallProfile POSTs the install-profile request for platform ("docker",
// "windows" or "linux") and decodes the JSON response into envelope, which must
// be a pointer. The endpoint is req.URL with
// "/node-agents/<platform>-install-profile" appended unless already present.
// Transport and decode failures are wrapped with the platform for context; a
// non-2xx response yields an error carrying the HTTP status line.
func fetchInstallProfile(ctx context.Context, req ProfileRequest, platform string, envelope any) error {
	endpoint := strings.TrimRight(strings.TrimSpace(req.URL), "/")
	installToken := strings.TrimSpace(req.InstallToken)
	if endpoint == "" || installToken == "" {
		return fmt.Errorf("profile-url and install-token are required")
	}
	if suffix := "/node-agents/" + platform + "-install-profile"; !strings.HasSuffix(endpoint, suffix) {
		endpoint += suffix
	}
	body, err := json.Marshal(map[string]string{
		"cluster_id":    strings.TrimSpace(req.ClusterID),
		"install_token": installToken,
		"node_name":     strings.TrimSpace(req.NodeName),
	})
	if err != nil {
		return err
	}
	httpClient := req.HTTPClient
	if httpClient == nil {
		// Never fall back to a client without a timeout.
		httpClient = &http.Client{Timeout: 20 * time.Second}
	}
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		return fmt.Errorf("build %s install profile request: %w", platform, err)
	}
	httpReq.Header.Set("Content-Type", "application/json")
	resp, err := httpClient.Do(httpReq)
	if err != nil {
		return fmt.Errorf("fetch %s install profile: %w", platform, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("fetch %s install profile: %s", platform, resp.Status)
	}
	if err := json.NewDecoder(resp.Body).Decode(envelope); err != nil {
		return fmt.Errorf("decode %s install profile: %w", platform, err)
	}
	return nil
}

// backendURLOrFirstEndpoint returns backendURL unchanged unless it is blank and
// at least one control-plane endpoint exists, in which case the first endpoint
// is returned.
func backendURLOrFirstEndpoint(backendURL string, endpoints []string) string {
	if strings.TrimSpace(backendURL) == "" && len(endpoints) > 0 {
		return endpoints[0]
	}
	return backendURL
}
// RuntimeConfigFromProfile converts a fetched Docker install profile into the
// RuntimeConfig used to run the node-agent container. Fields are copied
// one-to-one; the raw advertise-endpoints JSON is converted to a string and the
// image artifact URLs, checksum and size come from the dockerArtifact* helpers.
func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
	var cfg RuntimeConfig

	// Cluster identity and enrollment.
	cfg.BackendURL = profile.BackendURL
	cfg.ClusterID = profile.ClusterID
	cfg.JoinToken = profile.JoinToken
	cfg.NodeName = profile.NodeName

	// Container placement.
	cfg.Image = profile.Image
	cfg.ContainerName = profile.ContainerName
	cfg.StateDir = profile.StateDir
	cfg.Network = profile.Network
	cfg.RestartPolicy = profile.RestartPolicy
	cfg.PullImage = profile.PullImage
	cfg.Replace = profile.Replace

	// Feature switches.
	cfg.DockerVPNGatewayEnabled = profile.DockerVPNGatewayEnabled
	cfg.WorkloadSupervisionEnabled = profile.WorkloadSupervisionEnabled
	cfg.MeshSyntheticRuntimeEnabled = profile.MeshSyntheticRuntimeEnabled
	cfg.MeshProductionForwardingEnabled = profile.MeshProductionForwardingEnabled

	// Mesh listener and advertisement.
	cfg.MeshListenAddr = profile.MeshListenAddr
	cfg.MeshListenPortMode = profile.MeshListenPortMode
	cfg.MeshListenAutoPortStart = profile.MeshListenAutoPortStart
	cfg.MeshListenAutoPortEnd = profile.MeshListenAutoPortEnd
	cfg.MeshAdvertiseEndpoint = profile.MeshAdvertiseEndpoint
	cfg.MeshAdvertiseEndpointsJSON = string(profile.MeshAdvertiseEndpointsJSON)
	cfg.MeshAdvertiseTransport = profile.MeshAdvertiseTransport
	cfg.MeshConnectivityMode = profile.MeshConnectivityMode
	cfg.MeshNATType = profile.MeshNATType
	cfg.MeshRegion = profile.MeshRegion

	// Timing and capacity.
	cfg.HeartbeatIntervalSeconds = profile.HeartbeatIntervalSeconds
	cfg.EnrollmentPollIntervalSeconds = profile.EnrollmentPollIntervalSeconds
	cfg.EnrollmentPollTimeoutSeconds = profile.EnrollmentPollTimeoutSeconds
	cfg.ProductionObservationSinkCap = profile.ProductionObservationSinkCapacity

	// Image artifact download details.
	cfg.ImageArtifactURLs = dockerArtifactURLs(profile)
	cfg.ImageArtifactSHA256 = dockerArtifactSHA256(profile)
	cfg.ImageArtifactSizeBytes = dockerArtifactSizeBytes(profile)

	return cfg
}
// dockerArtifactURLs returns the candidate download URLs for the Docker image
// artifact described by profile.
//
// Explicit URLs on the artifact win and are returned as a copy. Otherwise each
// non-blank artifact endpoint is joined with the artifact file name. The result
// is nil when the profile has no artifact or the artifact has neither URLs nor
// a file name.
func dockerArtifactURLs(profile DockerInstallProfile) []string {
	artifact := profile.DockerImageArtifact
	if artifact == nil {
		return nil
	}
	if len(artifact.URLs) > 0 {
		// Copy so callers cannot mutate the profile's slice.
		urls := make([]string, len(artifact.URLs))
		copy(urls, artifact.URLs)
		return urls
	}
	name := strings.TrimSpace(artifact.FileName)
	if name == "" {
		return nil
	}
	name = strings.TrimLeft(name, "/")
	urls := []string{}
	for _, endpoint := range profile.ArtifactEndpoints {
		base := strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if base == "" {
			continue
		}
		urls = append(urls, base+"/"+name)
	}
	return urls
}
// dockerArtifactSHA256 returns the whitespace-trimmed SHA256 recorded on the
// profile's Docker image artifact, or "" when the profile carries no artifact.
func dockerArtifactSHA256(profile DockerInstallProfile) string {
	if artifact := profile.DockerImageArtifact; artifact != nil {
		return strings.TrimSpace(artifact.SHA256)
	}
	return ""
}
// dockerArtifactSizeBytes returns the size recorded on the profile's Docker
// image artifact, or 0 when the profile carries no artifact.
func dockerArtifactSizeBytes(profile DockerInstallProfile) int64 {
	if artifact := profile.DockerImageArtifact; artifact != nil {
		return artifact.SizeBytes
	}
	return 0
}
@@ -0,0 +1,258 @@
package hostagent
import (
"context"
"errors"
"fmt"
"os"
"strings"
"time"
)
// HostAgentUpdateRequest describes one self-update check for the host-agent
// binary. It is converted to a generic UpdateRequest by updateRequest and
// consumed by DockerManager.ApplyHostAgentUpdate.
type HostAgentUpdateRequest struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
CurrentVersion string
Channel string
// OS and InstallType default to "linux" and the binary install type when
// empty (see updateRequest).
OS string
Arch string
InstallType string
// BinaryPath is the host-agent binary to replace; ApplyHostAgentUpdate falls
// back to DefaultHostAgentInstallPath when it is empty.
BinaryPath string
// DryRun fetches and validates the plan without downloading or replacing.
DryRun bool
// RestartService is restarted via systemctl after a successful replace, but
// only when RestartAfterApply is also set.
RestartService string
RestartAfterApply bool
}
// HostAgentUpdateLoopConfig configures DockerManager.RunHostAgentUpdateLoop.
type HostAgentUpdateLoopConfig struct {
// Request is the update request applied on every run.
Request HostAgentUpdateRequest
// Interval is the pause between runs; zero defaults to one hour and negative
// values are rejected.
Interval time.Duration
// InitialDelay is slept once before the first run; negative values are rejected.
InitialDelay time.Duration
// Jitter must be within [0, 1]; it is passed to jitteredDuration for every sleep.
Jitter float64
// MaxRuns stops the loop after that many runs when greater than zero.
MaxRuns int
// StopOnError ends the loop on update errors other than ErrNodeIdentityNotReady.
StopOnError bool
// Logf receives progress lines; nil disables logging.
Logf func(format string, args ...any)
}
// updateRequest converts the host-agent request into the generic UpdateRequest
// used to fetch an update plan. The product is always the host-agent product
// and the container name is the fixed "host-agent-service" label; empty OS and
// InstallType fall back to "linux" and the binary install type.
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
	var plan UpdateRequest

	// Where to ask and who is asking.
	plan.BackendURL = req.BackendURL
	plan.ClusterID = req.ClusterID
	plan.NodeID = req.NodeID
	plan.StateDir = req.StateDir

	// What is being updated.
	plan.Product = HostAgentUpdateProduct
	plan.CurrentVersion = req.CurrentVersion
	plan.Channel = req.Channel
	plan.ContainerName = "host-agent-service"

	// Platform selection.
	plan.OS = firstNonEmpty(req.OS, "linux")
	plan.Arch = req.Arch
	plan.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)

	plan.DryRun = req.DryRun
	return plan
}
// ApplyHostAgentUpdate checks for a host-agent update plan and, when the plan's
// action is "update", downloads the artifact and installs it over the
// host-agent binary (req.BinaryPath, or DefaultHostAgentInstallPath when empty).
//
// If the in-place install fails, the new binary is staged next to the current
// one instead and the result is returned with RestartNeeded set and a nil
// error. After a successful install the optional systemd service is restarted
// when both req.RestartAfterApply and req.RestartService are set. Status
// reports and state persistence are best-effort: their errors are discarded.
// With req.DryRun set, nothing is downloaded, installed or reported for a
// no-op plan.
//
// The returned UpdateResult reuses ContainerName for the fixed
// "host-agent-service" label and NewImage for the binary path.
func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUpdateRequest) (UpdateResult, error) {
binaryPath := firstNonEmpty(req.BinaryPath, DefaultHostAgentInstallPath)
planReq := req.updateRequest()
planReq.BinaryDefaults()
resolved, err := resolveUpdateRequest(planReq)
if err != nil {
return UpdateResult{}, err
}
plan, err := FetchNodeUpdatePlan(ctx, resolved)
if err != nil {
return UpdateResult{}, err
}
result := UpdateResult{
Action: plan.Action,
Reason: plan.Reason,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
NewImage: binaryPath,
}
// Any action other than "update" is a no-op: report it (unless dry-run) and stop.
if plan.Action != "update" {
if !req.DryRun {
status := statusFromNoopPlan(resolved, plan)
status.Product = HostAgentUpdateProduct
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["binary_path"] = binaryPath
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, status)
}
return result, nil
}
// Preflight: the plan must carry an artifact of a binary install type.
if plan.Artifact == nil {
err := errors.New("host-agent update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err
}
if !isBinaryInstallType(plan.Artifact.InstallType) {
err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err
}
// Dry-run stops here: the plan passed preflight but nothing is downloaded or changed.
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": binaryPath},
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "download", "failed", err))
return result, err
}
// The downloaded file is temporary; remove it once install/stage has been attempted.
defer os.Remove(path)
if err := installHostAgentBinary(path, binaryPath); err != nil {
// In-place replacement failed: fall back to staging the new binary so it
// can take effect on a later restart.
stageErr := stageHostAgentBinary(path, binaryPath)
if stageErr == nil {
result.RestartNeeded = true
// NOTE(review): state records the target version as current even though
// the binary was only staged, not replaced — confirm this is intended.
_ = saveUpdateState(resolved.StateDir, UpdateState{
Product: HostAgentUpdateProduct,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "apply",
Status: "staged",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"binary_path": binaryPath, "staged_path": binaryPath + ".next", "restart_needed": true, "replace_error": err.Error()},
})
return result, nil
}
// Both install and staging failed: report both causes, return the install error.
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
return result, err
}
result.Loaded = true
result.Replaced = true
result.RestartNeeded = true
_ = saveUpdateState(resolved.StateDir, UpdateState{
Product: HostAgentUpdateProduct,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "apply",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"binary_path": binaryPath, "restart_needed": true},
})
// Optional restart: only when explicitly requested and a service name is given.
if req.RestartAfterApply && strings.TrimSpace(req.RestartService) != "" {
runner := m.Runner
if runner == nil {
runner = ExecRunner{}
}
_, err = runner.Run(ctx, "systemctl", "restart", req.RestartService)
if err != nil {
// The binary is already replaced; RestartNeeded stays true in the result.
return result, err
}
result.RestartNeeded = false
}
return result, nil
}
// RunHostAgentUpdateLoop repeatedly calls ApplyHostAgentUpdate for cfg.Request
// until the context ends, cfg.MaxRuns attempts have been made (when MaxRuns is
// positive), or an attempt fails while cfg.StopOnError is set.
//
// A zero Interval defaults to one hour. Negative durations and a Jitter outside
// [0, 1] are rejected up front. A missing node identity is logged and retried on
// the next tick; it never stops the loop, even with StopOnError.
func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgentUpdateLoopConfig) error {
	interval := cfg.Interval
	if interval == 0 {
		interval = time.Hour
	}
	switch {
	case cfg.InitialDelay < 0, interval < 0:
		return errors.New("host-agent update loop durations must not be negative")
	case cfg.Jitter < 0, cfg.Jitter > 1:
		return errors.New("host-agent update loop jitter must be between 0 and 1")
	}

	logf := cfg.Logf
	if logf == nil {
		// Logging is optional; fall back to a no-op sink.
		logf = func(string, ...any) {}
	}

	if cfg.InitialDelay > 0 {
		if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
			return err
		}
	}

	request := cfg.Request
	for attempt := 1; ; attempt++ {
		outcome, applyErr := m.ApplyHostAgentUpdate(ctx, request)
		switch {
		case applyErr == nil:
			logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
				attempt,
				outcome.Action,
				outcome.Reason,
				outcome.TargetVersion,
				outcome.NewImage,
				outcome.Replaced,
				outcome.RestartNeeded,
			)
			// Remember the applied version so the next plan request reports it.
			if outcome.Action == "update" && outcome.TargetVersion != "" {
				request.CurrentVersion = outcome.TargetVersion
			}
		case errors.Is(applyErr, ErrNodeIdentityNotReady):
			logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", attempt, request.StateDir)
		default:
			logf("host_agent_update_loop run=%d status=failed error=%v", attempt, applyErr)
			if cfg.StopOnError {
				return applyErr
			}
		}

		if cfg.MaxRuns > 0 && attempt >= cfg.MaxRuns {
			return nil
		}
		if err := sleepContext(ctx, jitteredDuration(interval, cfg.Jitter)); err != nil {
			return err
		}
	}
}
// BinaryDefaults fills in, in place, the fields a host-agent binary update
// relies on when the caller left them empty: the host-agent product, the
// binary install type, and "linux" as the operating system.
func (req *UpdateRequest) BinaryDefaults() {
	req.OS = firstNonEmpty(req.OS, "linux")
	req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
	req.Product = firstNonEmpty(req.Product, HostAgentUpdateProduct)
}
// isBinaryInstallType reports whether value, after trimming surrounding
// whitespace, names one of the accepted binary install types. An empty value
// counts as a binary install type.
func isBinaryInstallType(value string) bool {
	kind := strings.TrimSpace(value)
	accepted := []string{
		"",
		BinaryUpdateInstallType,
		"windows_binary",
		"binary",
		"host_binary",
		"linux-amd64-binary",
		"windows-amd64-binary",
	}
	for _, candidate := range accepted {
		if kind == candidate {
			return true
		}
	}
	return false
}
// hostAgentInstallTypeFor maps a node-agent install type to the install type
// requested for the host-agent binary: "windows_binary" for Windows service
// nodes, BinaryUpdateInstallType for everything else.
func hostAgentInstallTypeFor(nodeInstallType string) string {
	switch strings.TrimSpace(nodeInstallType) {
	case WindowsUpdateInstallType:
		return "windows_binary"
	default:
		return BinaryUpdateInstallType
	}
}
// stageHostAgentBinary copies the binary at sourcePath next to binaryPath under
// the name binaryPath+".next" with mode 0755, leaving binaryPath untouched.
func stageHostAgentBinary(sourcePath, binaryPath string) error {
	stagedPath := binaryPath + ".next"
	return copyFile(sourcePath, stagedPath, 0o755)
}
@@ -0,0 +1,321 @@
package hostagent
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
)
// Defaults applied by InstallUpdateService when the corresponding
// UpdateServiceConfig fields are empty.
const (
// DefaultHostAgentInstallPath is where the host-agent binary is installed
// when BinaryInstallPath is not set.
DefaultHostAgentInstallPath = "/usr/local/bin/rap-host-agent"
// DefaultSystemdUnitDir is where unit files are written when UnitDir is not set.
DefaultSystemdUnitDir = "/etc/systemd/system"
)
// UpdateServiceConfig describes the systemd updater service(s) that
// InstallUpdateService renders and installs.
type UpdateServiceConfig struct {
// RuntimeConfig supplies backend URL, cluster ID, container name, and state
// dir; all four are required to render the updater unit.
RuntimeConfig RuntimeConfig
// Product defaults to DefaultUpdateProduct when empty.
Product string
CurrentVersion string
// Channel is passed as --channel only when non-blank.
Channel string
// IntervalSeconds defaults to 21600 when zero.
IntervalSeconds int
InitialDelaySeconds int
// Jitter defaults to 0.15 when zero.
Jitter float64
// HealthTimeoutSec defaults to 30 when zero.
HealthTimeoutSec int
// BinaryInstallPath defaults to DefaultHostAgentInstallPath when empty.
BinaryInstallPath string
// SourceBinaryPath is the binary to copy; when empty the running executable is used.
SourceBinaryPath string
// UnitDir defaults to DefaultSystemdUnitDir when empty.
UnitDir string
// ManageSystemd makes the installer run systemctl daemon-reload and enable --now.
ManageSystemd bool
// DryRun renders the unit text into the result without touching the filesystem.
DryRun bool
// InstallSelfUpdater additionally renders/installs the host-agent self-updater unit.
InstallSelfUpdater bool
// SelfUpdateVersion is the --current-version for the self-updater; it falls
// back to CurrentVersion when empty.
SelfUpdateVersion string
}
// UpdateServiceResult reports what InstallUpdateService rendered and did.
type UpdateServiceResult struct {
// Installed is set once the binary and unit file(s) have been written.
Installed bool
// Started is set once the systemctl daemon-reload/enable calls succeeded.
Started bool
UnitName string
UnitPath string
BinaryPath string
// Unit is the rendered text of the updater unit file.
Unit string
// SelfUnitName, SelfUnitPath, and SelfUnit describe the self-updater unit;
// they stay empty unless InstallSelfUpdater was requested.
SelfUnitName string
SelfUnitPath string
SelfUnit string
}
// InstallUpdateService renders the updater systemd unit for the configured
// container (and, when cfg.InstallSelfUpdater is set, the host-agent
// self-updater unit), installs the host-agent binary, writes the unit files,
// and optionally enables and starts them through systemctl.
//
// With cfg.DryRun the rendered unit text is returned in the result and nothing
// is written or executed. On error the partially filled result is returned
// alongside the error.
func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServiceConfig) (UpdateServiceResult, error) {
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
// Apply defaults for every setting the caller left at its zero value.
if cfg.Product == "" {
cfg.Product = DefaultUpdateProduct
}
if cfg.IntervalSeconds == 0 {
cfg.IntervalSeconds = 21600
}
if cfg.Jitter == 0 {
cfg.Jitter = 0.15
}
if cfg.HealthTimeoutSec == 0 {
cfg.HealthTimeoutSec = 30
}
cfg.BinaryInstallPath = firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath)
cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
// The unit name is derived from the container name so each container gets its own updater unit.
unitName := "rap-host-agent-updater-" + safeUnitSlug(cfg.RuntimeConfig.ContainerName) + ".service"
result := UpdateServiceResult{
UnitName: unitName,
UnitPath: filepath.Join(cfg.UnitDir, unitName),
BinaryPath: cfg.BinaryInstallPath,
}
unit, err := buildUpdateServiceUnit(cfg)
if err != nil {
return result, err
}
result.Unit = unit
// Dry run: report what would be installed without touching the filesystem.
if cfg.DryRun {
if cfg.InstallSelfUpdater {
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
if err != nil {
return result, err
}
result.SelfUnit = selfUnit
result.SelfUnitName = selfUnitName
result.SelfUnitPath = selfUnitPath
}
return result, nil
}
// Writing into the default systemd directory is refused off Linux; a custom
// UnitDir is still allowed on any OS.
if runtime.GOOS != "linux" && cfg.UnitDir == DefaultSystemdUnitDir {
return result, fmt.Errorf("systemd update service install is only supported on linux")
}
if err := installHostAgentBinary(cfg.SourceBinaryPath, cfg.BinaryInstallPath); err != nil {
return result, err
}
if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
return result, err
}
if err := os.WriteFile(result.UnitPath, []byte(unit), 0o644); err != nil {
return result, err
}
if cfg.InstallSelfUpdater {
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
if err != nil {
return result, err
}
if err := os.WriteFile(selfUnitPath, []byte(selfUnit), 0o644); err != nil {
return result, err
}
// The self-updater fields are only reported once its unit file is on disk.
result.SelfUnit = selfUnit
result.SelfUnitName = selfUnitName
result.SelfUnitPath = selfUnitPath
}
result.Installed = true
if cfg.ManageSystemd {
runner := m.Runner
if runner == nil {
runner = ExecRunner{}
}
// Reload first so systemd sees the freshly written unit files.
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
return result, err
}
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", unitName); err != nil {
return result, err
}
if cfg.InstallSelfUpdater && result.SelfUnitName != "" {
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.SelfUnitName); err != nil {
return result, err
}
}
result.Started = true
}
return result, nil
}
// buildUpdateServiceUnit renders the systemd unit that runs the host-agent
// "update-loop" command for the configured container. It returns an error
// naming every required runtime setting (backend-url, cluster-id,
// container-name, state-dir) that is empty after normalization.
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
	rc := cfg.RuntimeConfig.Normalize()

	required := []struct{ flag, value string }{
		{"backend-url", rc.BackendURL},
		{"cluster-id", rc.ClusterID},
		{"container-name", rc.ContainerName},
		{"state-dir", rc.StateDir},
	}
	var missing []string
	for _, setting := range required {
		if setting.value == "" {
			missing = append(missing, setting.flag)
		}
	}
	if len(missing) > 0 {
		return "", fmt.Errorf("missing required update service settings: %s", strings.Join(missing, ", "))
	}

	command := []string{cfg.BinaryInstallPath, "update-loop"}
	command = append(command,
		"--backend-url", rc.BackendURL,
		"--cluster-id", rc.ClusterID,
		"--state-dir", rc.StateDir,
		"--container-name", rc.ContainerName,
		"--product", firstNonEmpty(cfg.Product, DefaultUpdateProduct),
		"--current-version", strings.TrimSpace(cfg.CurrentVersion),
		"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
		"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds),
		"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
		"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
	)
	// The channel flag is only emitted when a channel was actually chosen.
	if channel := strings.TrimSpace(cfg.Channel); channel != "" {
		command = append(command, "--channel", channel)
	}

	const unitTemplate = `[Unit]
Description=RAP host-agent updater for %s
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
ExecStart=%s
Restart=always
RestartSec=30
[Install]
WantedBy=multi-user.target
`
	return fmt.Sprintf(unitTemplate, rc.ContainerName, systemdJoin(command)), nil
}
// buildHostAgentSelfUpdateUnit renders the systemd unit that runs the
// host-agent "update-host-agent-loop" command, which keeps the host-agent
// binary itself up to date.
//
// It returns the unit text, the unit file name, and the full unit path (under
// cfg.UnitDir, or DefaultSystemdUnitDir when empty). Backend URL, cluster ID,
// and state dir are required. The reported current version is
// cfg.SelfUpdateVersion, falling back to cfg.CurrentVersion. The initial delay
// is 30 seconds longer than cfg.InitialDelaySeconds.
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
	runtimeCfg := cfg.RuntimeConfig.Normalize()
	if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
		return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host-agent self updater")
	}
	unitName := "rap-host-agent-self-updater.service"
	unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
	currentVersion := firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion)
	// Use the same defaulted path for the executed command and for
	// --binary-path, so an empty BinaryInstallPath cannot produce an ExecStart
	// line whose command word is empty.
	binaryPath := firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath)
	args := []string{
		binaryPath,
		"update-host-agent-loop",
		"--backend-url", runtimeCfg.BackendURL,
		"--cluster-id", runtimeCfg.ClusterID,
		"--state-dir", runtimeCfg.StateDir,
		"--binary-path", binaryPath,
		"--current-version", currentVersion,
		"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
		"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
		"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
	}
	if channel := strings.TrimSpace(cfg.Channel); channel != "" {
		args = append(args, "--channel", channel)
	}
	return fmt.Sprintf(`[Unit]
Description=RAP host-agent self updater
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
ExecStart=%s
Restart=always
RestartSec=60
[Install]
WantedBy=multi-user.target
`, systemdJoin(args)), unitName, unitPath, nil
}
// installHostAgentBinary copies the binary at sourcePath to targetPath with
// mode 0755. When sourcePath is blank the running executable is used. If source
// and target are the same path, only the mode is fixed up.
//
// The copy goes to targetPath+".tmp" first and is then renamed over the target,
// so a running binary is never written in place. The temporary file is removed
// on every failure path, including a failed rename. A blank targetPath is
// rejected before anything is created.
func installHostAgentBinary(sourcePath, targetPath string) error {
	sourcePath = strings.TrimSpace(sourcePath)
	targetPath = strings.TrimSpace(targetPath)
	if targetPath == "" {
		return fmt.Errorf("host-agent install path must not be empty")
	}
	if sourcePath == "" {
		exe, err := os.Executable()
		if err != nil {
			return err
		}
		sourcePath = exe
	}
	if samePath(sourcePath, targetPath) {
		return os.Chmod(targetPath, 0o755)
	}
	src, err := os.Open(sourcePath)
	if err != nil {
		return err
	}
	defer src.Close()
	if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
		return err
	}
	tmp := targetPath + ".tmp"
	dst, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
	if err != nil {
		return err
	}
	// Best-effort cleanup: never leave a stray temp file behind on failure.
	installed := false
	defer func() {
		if !installed {
			_ = os.Remove(tmp)
		}
	}()
	if _, err := io.Copy(dst, src); err != nil {
		_ = dst.Close()
		return err
	}
	if err := dst.Close(); err != nil {
		return err
	}
	// OpenFile's mode is subject to the umask; set the final mode explicitly.
	if err := os.Chmod(tmp, 0o755); err != nil {
		return err
	}
	if err := os.Rename(tmp, targetPath); err != nil {
		return err
	}
	installed = true
	return nil
}
// samePath reports whether a and b name the same location once both are made
// absolute. If either path cannot be made absolute, the raw strings are
// compared instead. Symbolic links are not resolved.
func samePath(a, b string) bool {
	left, err := filepath.Abs(a)
	if err != nil {
		return a == b
	}
	right, err := filepath.Abs(b)
	if err != nil {
		return a == b
	}
	return left == right
}
// safeUnitSlug turns value into a lowercase slug containing only [a-z0-9]
// separated by single dashes, suitable for embedding in a unit file name.
// Runs of any other characters collapse into one dash, and leading/trailing
// dashes are dropped. A blank value, or one with no usable characters, yields
// DefaultContainerName.
func safeUnitSlug(value string) string {
	normalized := strings.ToLower(strings.TrimSpace(value))
	if normalized == "" {
		normalized = DefaultContainerName
	}
	isSeparator := func(r rune) bool {
		alnum := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
		return !alnum
	}
	// FieldsFunc drops empty fields, which collapses separator runs and trims
	// separators at both ends in one step.
	slug := strings.Join(strings.FieldsFunc(normalized, isSeparator), "-")
	if slug == "" {
		return DefaultContainerName
	}
	return slug
}
// systemdJoin quotes each argument with systemdQuote and joins the results
// with single spaces, producing a command line for an ExecStart= setting.
func systemdJoin(args []string) string {
	quoted := make([]string, len(args))
	for i := range args {
		quoted[i] = systemdQuote(args[i])
	}
	return strings.Join(quoted, " ")
}
// systemdQuote renders arg as a single word of a systemd command line.
//
// An empty argument becomes `""`. Arguments containing whitespace, quotes, or
// backslashes are wrapped in double quotes with `\` and `"` backslash-escaped;
// an embedded newline is written as the two-character escape `\n` so it cannot
// terminate the unit-file line. In every case `%` is doubled so systemd does
// not treat it as a unit specifier, and `$` is doubled so it is not treated as
// an environment variable reference.
func systemdQuote(arg string) string {
	if arg == "" {
		return `""`
	}
	// A single-pass replacer keeps the escapes from re-escaping each other.
	escaped := strings.NewReplacer(
		`\`, `\\`,
		`"`, `\"`,
		"\n", `\n`,
		"%", "%%",
		"$", "$$",
	).Replace(arg)
	if !strings.ContainsAny(arg, " \t\n\"'\\") {
		return escaped
	}
	return `"` + escaped + `"`
}
@@ -0,0 +1,171 @@
package hostagent
import (
"context"
"os"
"path/filepath"
"strings"
"testing"
)
func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
dir := t.TempDir()
source := filepath.Join(dir, "rap-host-agent-src")
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
t.Fatalf("write source: %v", err)
}
unitDir := filepath.Join(dir, "systemd")
binaryPath := filepath.Join(dir, "bin", "rap-host-agent")
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "node-a",
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
},
CurrentVersion: "0.1.0-current",
IntervalSeconds: 60,
Jitter: 0.2,
SourceBinaryPath: source,
BinaryInstallPath: binaryPath,
UnitDir: unitDir,
ManageSystemd: false,
InstallSelfUpdater: true,
SelfUpdateVersion: "0.1.0-host",
})
if err != nil {
t.Fatalf("install update service: %v", err)
}
if !result.Installed || result.Started {
t.Fatalf("unexpected result: %+v", result)
}
unit, err := os.ReadFile(result.UnitPath)
if err != nil {
t.Fatalf("read unit: %v", err)
}
text := string(unit)
for _, want := range []string{
"ExecStart=",
" update-loop",
"--backend-url http://control/api/v1",
"--cluster-id cluster-1",
"--state-dir /var/lib/rap/nodes/node-a",
"--container-name rap-node-agent-node-a",
"--current-version 0.1.0-current",
"--interval-seconds 60",
"Restart=always",
} {
if !strings.Contains(text, want) {
t.Fatalf("unit missing %q:\n%s", want, text)
}
}
if payload, err := os.ReadFile(binaryPath); err != nil || string(payload) != "binary" {
t.Fatalf("binary copy = %q, %v", payload, err)
}
if result.SelfUnitName != "rap-host-agent-self-updater.service" || result.SelfUnitPath == "" {
t.Fatalf("self updater result = %+v", result)
}
selfUnit, err := os.ReadFile(result.SelfUnitPath)
if err != nil {
t.Fatalf("read self unit: %v", err)
}
if text := string(selfUnit); !strings.Contains(text, "update-host-agent-loop") || !strings.Contains(text, "--current-version 0.1.0-host") {
t.Fatalf("unexpected self unit:\n%s", text)
}
}
// TestWindowsHostAgentUpdateScriptTargetsWindowsService renders the Windows
// updater script and checks that it loops, handles the staged ".next" binary,
// and invokes update-loop with the Windows service flags and configured
// versions and interval.
func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
	installCfg := WindowsInstallConfig{
		RuntimeConfig: RuntimeConfig{
			BackendURL: "http://control/api/v1",
			ClusterID:  "cluster-1",
		},
		NodeID:                         "node-1",
		AutoUpdateCurrentVersion:       "0.1.2",
		AutoUpdateIntervalSeconds:      120,
		AutoUpdateInitialDelaySeconds:  7,
		AutoUpdateHealthTimeoutSeconds: 11,
	}
	installed := WindowsInstallResult{
		NodeName:      "win-a",
		StateDir:      `C:\ProgramData\RAP\nodes\win-a`,
		NodeAgentPath: `C:\Program Files\RAP\win-a\rap-node-agent.exe`,
		TaskName:      "RAP Node Agent win-a",
	}

	script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, installCfg, installed)

	fragments := []string{
		":loop",
		"rap-host-agent.exe.next",
		"update-loop --backend-url",
		"--backend-url \"http://control/api/v1\"",
		"--cluster-id \"cluster-1\"",
		"--node-id \"node-1\"",
		"--state-dir \"C:\\ProgramData\\RAP\\nodes\\win-a\"",
		"--install-type windows_service",
		"--binary-path \"C:\\Program Files\\RAP\\win-a\\rap-node-agent.exe\"",
		"--host-agent-binary-path \"C:\\Program Files\\RAP\\win-a\\rap-host-agent.exe\"",
		"--windows-task-name \"RAP Node Agent win-a\"",
		"--current-version 0.1.2",
		"--host-agent-current-version 0.1.2",
		"--interval-seconds 120",
		"timeout /t 120",
	}
	for i := range fragments {
		if strings.Contains(script, fragments[i]) {
			continue
		}
		t.Fatalf("script missing %q:\n%s", fragments[i], script)
	}
}
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "win-a",
},
InstallDir: `C:\Program Files\RAP\win-a`,
Replace: true,
DryRun: true,
})
if err != nil {
t.Fatalf("replace install should not require join token: %v", err)
}
if result.NodeName != "win-a" || result.NodeAgentPath == "" {
t.Fatalf("unexpected dry-run result: %+v", result)
}
}
func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
dir := t.TempDir()
source := filepath.Join(dir, "rap-host-agent.exe")
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
t.Fatalf("write source: %v", err)
}
result, err := installWindowsHostAgentUpdater(context.Background(), WindowsManager{Runner: &recordingRunner{}}, WindowsInstallResult{
NodeName: "win-a",
InstallDir: dir,
StateDir: dir,
NodeAgentPath: filepath.Join(dir, "rap-node-agent.exe"),
TaskName: "RAP Node Agent win-a",
StartupMode: "user-task",
}, WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
},
Replace: true,
AutoUpdateEnabled: true,
HostAgentSourcePath: source,
})
if err != nil {
t.Fatalf("install updater: %v", err)
}
script, err := os.ReadFile(filepath.Join(result.InstallDir, "rap-host-agent-update.cmd"))
if err != nil {
t.Fatalf("read updater script: %v", err)
}
if !strings.Contains(string(script), "--current-version 0.0.0") {
t.Fatalf("repair updater should force unknown current version:\n%s", script)
}
}
@@ -0,0 +1,947 @@
package hostagent
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"math/rand"
"net/http"
"net/url"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// Product names, install types, and state file names used by the update flow.
const (
// DefaultUpdateProduct is the product assumed when an update request names none.
DefaultUpdateProduct = "rap-node-agent"
// HostAgentUpdateProduct identifies updates of the host-agent binary itself.
HostAgentUpdateProduct = "rap-host-agent"
// DefaultUpdateInstallType is the install type assumed when a request names
// none; it is the only artifact install type ApplyUpdate accepts.
DefaultUpdateInstallType = "docker"
BinaryUpdateInstallType = "linux_binary"
WindowsUpdateInstallType = "windows_service"
// NOTE(review): presumably the file names used for persisted update state and
// the update trigger inside the state dir — confirm against saveUpdateState
// and the trigger reader.
UpdateStateFileName = "host-update-state.json"
UpdateTriggerFileName = "update-trigger.json"
)
// ErrNodeIdentityNotReady is returned when no node ID was supplied and the
// identity file in the state dir is missing or carries a blank node ID. The
// update loops treat it as "try again later" rather than as a failure.
var ErrNodeIdentityNotReady = errors.New("node identity is not approved yet")
// UpdateRequest carries everything needed to ask the backend for an update
// plan and apply it to one node. Normalize fills defaults; Validate lists the
// settings that are still missing.
type UpdateRequest struct {
BackendURL string
ClusterID string
// NodeID may be empty when StateDir is set; it is then read from the
// identity file in StateDir.
NodeID string
StateDir string
// Product defaults to DefaultUpdateProduct.
Product string
CurrentVersion string
// OS and Arch default to runtime.GOOS and runtime.GOARCH.
OS string
Arch string
// InstallType defaults to DefaultUpdateInstallType.
InstallType string
Channel string
// ContainerName defaults to DefaultContainerName.
ContainerName string
BinaryPath string
WindowsTaskName string
SystemdUnitName string
// HealthTimeout defaults to 30 seconds when zero; negative values are rejected.
HealthTimeout time.Duration
// DryRun fetches and checks the plan without reporting status or changing anything.
DryRun bool
// AllowProductionMesh permits plans that have production forwarding enabled.
AllowProductionMesh bool
}
// UpdateResult summarizes one update attempt.
type UpdateResult struct {
// Action, Reason, and TargetVersion are copied from the fetched plan.
Action string
Reason string
TargetVersion string
ContainerName string
// PreviousImageID is the image reported for the container before replacement.
PreviousImageID string
NewImage string
ContainerID string
Loaded bool
Replaced bool
// RolledBack is set when a rollback was allowed by the plan and completed without error.
RolledBack bool
// RestartNeeded signals that the updated binary only takes effect after a restart.
RestartNeeded bool
}
// UpdateLoopConfig configures RunUpdateLoop.
type UpdateLoopConfig struct {
Request UpdateRequest
// Interval defaults to one hour when zero; negative values are rejected.
Interval time.Duration
// InitialDelay, when positive, is slept (with jitter) before the first run.
InitialDelay time.Duration
// Jitter must lie in [0, 1].
Jitter float64
// MaxRuns, when positive, stops the loop after that many runs.
MaxRuns int
// StopOnError makes the loop return the first failure other than a missing node identity.
StopOnError bool
// HostAgentUpdateEnabled also applies a host-agent update on every run,
// using HostAgentUpdateRequest with blanks filled from Request.
HostAgentUpdateEnabled bool
HostAgentUpdateRequest HostAgentUpdateRequest
// Logf receives progress lines; nil disables logging.
Logf func(format string, args ...any)
}
// UpdateState is the record saved in the state dir after a successful update.
// When a stored record for the same product has a non-empty CurrentVersion,
// that version overrides the request's CurrentVersion on later runs.
type UpdateState struct {
Product string `json:"product"`
CurrentVersion string `json:"current_version"`
TargetVersion string `json:"target_version,omitempty"`
ContainerName string `json:"container_name,omitempty"`
Image string `json:"image,omitempty"`
UpdatedAt time.Time `json:"updated_at"`
}
// UpdateTrigger is the JSON shape of an update trigger.
// NOTE(review): presumably stored as UpdateTriggerFileName in the state dir,
// with a change of Generation waking the update loop early — confirm against
// currentUpdateTriggerGeneration and sleepUntilUpdateIntervalOrTrigger.
type UpdateTrigger struct {
SchemaVersion string `json:"schema_version"`
Generation string `json:"generation"`
Products []string `json:"products,omitempty"`
Reason string `json:"reason,omitempty"`
DeliveryMode string `json:"delivery_mode,omitempty"`
SubscriptionStatus string `json:"subscription_status,omitempty"`
UpdateServiceNodeID string `json:"update_service_node_id,omitempty"`
UpdateServiceStatus string `json:"update_service_status,omitempty"`
FallbackPollSeconds int `json:"fallback_poll_seconds,omitempty"`
ObservedAt time.Time `json:"observed_at"`
}
// NodeUpdatePlanResponse is the JSON envelope the backend returns from the
// update plan endpoint; the plan sits under "node_update_plan".
type NodeUpdatePlanResponse struct {
Plan NodeUpdatePlan `json:"node_update_plan"`
}
// NodeUpdatePlan is the backend's decision about what a node should do for one
// product.
type NodeUpdatePlan struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
Product string `json:"product"`
CurrentVersion string `json:"current_version,omitempty"`
// Action "update" triggers an update; any other value is reported as a no-op.
Action string `json:"action"`
Reason string `json:"reason"`
TargetVersion string `json:"target_version,omitempty"`
Channel string `json:"channel,omitempty"`
Strategy string `json:"strategy,omitempty"`
// RollbackAllowed is passed to the rollback step after a failed install or health check.
RollbackAllowed bool `json:"rollback_allowed"`
// HealthWindowSec, when positive, replaces the default 30 second health timeout.
HealthWindowSec int `json:"health_window_seconds,omitempty"`
// Artifact is required for an "update" action.
Artifact *ReleaseArtifact `json:"artifact,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature json.RawMessage `json:"authority_signature,omitempty"`
// ProductionForwarding plans are refused unless the request sets AllowProductionMesh.
ProductionForwarding bool `json:"production_forwarding"`
}
// ReleaseArtifact describes the downloadable artifact attached to an update plan.
type ReleaseArtifact struct {
ID string `json:"id"`
ReleaseID string `json:"release_id"`
ClusterID string `json:"cluster_id"`
Product string `json:"product"`
Version string `json:"version"`
OS string `json:"os"`
Arch string `json:"arch"`
// InstallType must be empty or DefaultUpdateInstallType for the Docker update path.
InstallType string `json:"install_type"`
Kind string `json:"kind"`
URL string `json:"url"`
URLs []string `json:"urls,omitempty"`
// SHA256 and SizeBytes are handed to the installer together with the artifact URLs.
SHA256 string `json:"sha256"`
SizeBytes int64 `json:"size_bytes"`
Signature *string `json:"signature,omitempty"`
Metadata json.RawMessage `json:"metadata"`
CreatedAt time.Time `json:"created_at"`
}
// NodeUpdateStatusRequest is the JSON body posted to the backend's update
// status endpoint to report progress of an update attempt.
type NodeUpdateStatusRequest struct {
Product string `json:"product"`
CurrentVersion string `json:"current_version,omitempty"`
TargetVersion string `json:"target_version,omitempty"`
// Phase and Status name the step and its outcome, e.g. phase "health_check"
// with status "succeeded".
Phase string `json:"phase"`
Status string `json:"status"`
AttemptID string `json:"attempt_id,omitempty"`
ErrorMessage *string `json:"error_message,omitempty"`
RollbackVersion *string `json:"rollback_version,omitempty"`
// Payload carries free-form, phase-specific details.
Payload map[string]any `json:"payload,omitempty"`
ObservedAt time.Time `json:"observed_at,omitempty"`
}
// dockerInspectContainer is the subset of one `docker inspect` result entry
// that is decoded when reading an existing container's configuration.
type dockerInspectContainer struct {
ID string `json:"Id"`
// Image is the top-level image field; Config.Image is decoded separately.
Image string `json:"Image"`
Config struct {
Image string `json:"Image"`
Env []string `json:"Env"`
} `json:"Config"`
HostConfig struct {
Privileged bool `json:"Privileged"`
NetworkMode string `json:"NetworkMode"`
CapAdd []string `json:"CapAdd"`
Devices []struct {
PathOnHost string `json:"PathOnHost"`
PathInContainer string `json:"PathInContainer"`
CgroupPermissions string `json:"CgroupPermissions"`
} `json:"Devices"`
RestartPolicy struct {
Name string `json:"Name"`
} `json:"RestartPolicy"`
} `json:"HostConfig"`
Mounts []struct {
Source string `json:"Source"`
Destination string `json:"Destination"`
} `json:"Mounts"`
State struct {
Running bool `json:"Running"`
} `json:"State"`
}
// Normalize returns a copy of req with surrounding whitespace removed from its
// string settings, trailing slashes stripped from BackendURL, and defaults
// applied: DefaultUpdateProduct, the running OS and architecture,
// DefaultUpdateInstallType, DefaultContainerName, and a 30 second health
// timeout when none was given.
func (req UpdateRequest) Normalize() UpdateRequest {
	// req is a value copy, so trimming through pointers never touches the caller's request.
	trimmed := []*string{
		&req.ClusterID,
		&req.NodeID,
		&req.StateDir,
		&req.Channel,
		&req.BinaryPath,
		&req.WindowsTaskName,
		&req.SystemdUnitName,
	}
	for _, field := range trimmed {
		*field = strings.TrimSpace(*field)
	}
	req.BackendURL = strings.TrimRight(strings.TrimSpace(req.BackendURL), "/")

	req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct)
	req.OS = firstNonEmpty(req.OS, runtime.GOOS)
	req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
	req.InstallType = firstNonEmpty(req.InstallType, DefaultUpdateInstallType)
	req.ContainerName = firstNonEmpty(req.ContainerName, DefaultContainerName)

	if req.HealthTimeout == 0 {
		req.HealthTimeout = 30 * time.Second
	}
	return req
}
// Validate normalizes req and reports, in a single error, every required
// setting that is still missing. Which settings are required depends on the
// install type: Windows service installs need a binary path and task name;
// Linux binary installs of products other than the host agent need a binary
// path and systemd unit; everything else needs a container name. A negative
// health timeout is rejected as well.
func (req UpdateRequest) Validate() error {
	normalized := req.Normalize()

	var missing []string
	require := func(value, setting string) {
		if value == "" {
			missing = append(missing, setting)
		}
	}

	require(normalized.BackendURL, "backend-url")
	require(normalized.ClusterID, "cluster-id")
	if normalized.NodeID == "" && normalized.StateDir == "" {
		missing = append(missing, "node-id-or-state-dir")
	}

	switch {
	case normalized.InstallType == WindowsUpdateInstallType:
		require(normalized.BinaryPath, "binary-path")
		require(normalized.WindowsTaskName, "windows-task-name")
	case normalized.InstallType == BinaryUpdateInstallType && normalized.Product != HostAgentUpdateProduct:
		require(normalized.BinaryPath, "binary-path")
		require(normalized.SystemdUnitName, "systemd-unit")
	default:
		require(normalized.ContainerName, "container-name")
	}

	if len(missing) > 0 {
		return fmt.Errorf("missing required update settings: %s", strings.Join(missing, ", "))
	}
	if normalized.HealthTimeout < 0 {
		return errors.New("health timeout must not be negative")
	}
	return nil
}
// ApplyUpdate fetches the node's update plan from the backend and, when the
// plan's action is "update", replaces the Docker container with the planned
// artifact image, waits for the container to be running, and records the new
// version in the state dir.
//
// Progress is reported to the backend at each phase. Those reports and the
// final state save are best-effort: their errors are discarded. If the install
// or the health check fails, a rollback is attempted; RolledBack is set only
// when the plan allows rollback and the rollback returned no error. The
// original failure is returned either way.
//
// With req.DryRun nothing is reported or changed; for an "update" plan the
// result only names the image that would be used.
func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
req = req.Normalize()
var err error
req, err = resolveUpdateRequest(req)
if err != nil {
return UpdateResult{}, err
}
runner := m.Runner
if runner == nil {
runner = ExecRunner{}
}
docker := firstNonEmpty(m.Binary, "docker")
plan, err := FetchNodeUpdatePlan(ctx, req)
if err != nil {
return UpdateResult{}, err
}
// The plan's health window applies only while the timeout still equals the
// 30 second default; a default-valued explicit setting is indistinguishable.
if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
}
result := UpdateResult{
Action: plan.Action,
Reason: plan.Reason,
TargetVersion: plan.TargetVersion,
ContainerName: req.ContainerName,
}
// Anything other than "update" is a no-op that is only reported back.
if plan.Action != "update" {
if !req.DryRun {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromNoopPlan(req, plan))
}
return result, nil
}
// Preflight checks: each failure is reported and aborts before any change is made.
if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact == nil {
err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if req.DryRun {
result.NewImage = artifactImage(*plan.Artifact, "")
return result, nil
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "planned",
Status: "accepted",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason},
})
// Start from the running container's configuration so the replacement keeps its settings.
current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "inspect", "failed", err))
return result, err
}
result.PreviousImageID = current.Image
cfg.BackendURL = firstNonEmpty(cfg.BackendURL, req.BackendURL)
cfg.ClusterID = firstNonEmpty(cfg.ClusterID, req.ClusterID)
cfg.ContainerName = req.ContainerName
cfg.Image = artifactImage(*plan.Artifact, cfg.Image)
cfg.ImageArtifactURLs = artifactURLsForBackend(*plan.Artifact, req.BackendURL)
cfg.ImageArtifactSHA256 = plan.Artifact.SHA256
cfg.ImageArtifactSizeBytes = plan.Artifact.SizeBytes
cfg.Replace = true
// The join token is cleared for the replacement install.
cfg.JoinToken = ""
result.NewImage = cfg.Image
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": cfg.ImageArtifactURLs, "image": cfg.Image},
})
installed, err := m.Install(ctx, cfg)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true
}
return result, err
}
result.Loaded = installed.Loaded
result.Replaced = installed.Replaced
result.ContainerID = installed.ContainerID
if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "health_check", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true
}
return result, err
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "health_check",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"container_id": installed.ContainerID, "image": cfg.Image},
})
// Persist the new version so later runs report it as the current version.
_ = saveUpdateState(req.StateDir, UpdateState{
Product: req.Product,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: req.ContainerName,
Image: cfg.Image,
UpdatedAt: time.Now().UTC(),
})
return result, nil
}
// RunUpdateLoop validates the request and then repeatedly applies node updates
// until the context ends, cfg.MaxRuns runs have completed (when MaxRuns is
// positive), an attempt fails while cfg.StopOnError is set, or a host-agent
// update reports that a restart is needed (the loop then returns nil).
//
// A zero Interval defaults to one hour. A missing node identity is logged and
// retried after the interval; it never stops the loop and skips the host-agent
// step for that run. When cfg.HostAgentUpdateEnabled is set, each run also
// applies a host-agent update after the node update.
func (m DockerManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request.Normalize()
if err := req.Validate(); err != nil {
return err
}
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
if cfg.Interval < 0 {
return errors.New("update loop interval must not be negative")
}
if cfg.InitialDelay < 0 {
return errors.New("update loop initial delay must not be negative")
}
if cfg.Jitter < 0 || cfg.Jitter > 1 {
return errors.New("update loop jitter must be between 0 and 1")
}
logf := cfg.Logf
if logf == nil {
logf = func(string, ...any) {}
}
if cfg.InitialDelay > 0 {
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
return err
}
}
runs := 0
// Capture the trigger generation at start; it is handed to the sleep helper below.
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
for {
runs++
result, err := m.ApplyUpdate(ctx, req)
if err != nil {
if errors.Is(err, ErrNodeIdentityNotReady) {
logf("update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
// This wait uses the plain interval sleep, not the trigger-aware one.
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
return err
}
continue
}
logf("update_loop run=%d status=failed error=%v", runs, err)
if cfg.StopOnError {
return err
}
} else {
logf("update_loop run=%d action=%s reason=%s target=%s container=%s loaded=%t replaced=%t rolled_back=%t",
runs,
result.Action,
result.Reason,
result.TargetVersion,
result.ContainerName,
result.Loaded,
result.Replaced,
result.RolledBack,
)
// Only advance the reported version when the update actually stuck.
if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
req.CurrentVersion = result.TargetVersion
}
}
if cfg.HostAgentUpdateEnabled {
// Fill every blank host-agent setting from the node update request.
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
hostReq.CurrentVersion = firstNonEmpty(hostReq.CurrentVersion, req.CurrentVersion)
hostReq.OS = firstNonEmpty(hostReq.OS, req.OS)
hostReq.Arch = firstNonEmpty(hostReq.Arch, req.Arch)
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, hostAgentInstallTypeFor(req.InstallType))
result, err := m.ApplyHostAgentUpdate(ctx, hostReq)
if err != nil {
if errors.Is(err, ErrNodeIdentityNotReady) {
logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
} else {
logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
if cfg.StopOnError {
return err
}
}
} else {
logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
runs,
result.Action,
result.Reason,
result.TargetVersion,
result.NewImage,
result.Replaced,
result.RestartNeeded,
)
if result.Action == "update" && result.TargetVersion != "" {
cfg.HostAgentUpdateRequest.CurrentVersion = result.TargetVersion
}
// Returning nil ends the loop so the process can be restarted.
// NOTE(review): presumably a supervisor restarts it on the new binary — confirm.
if result.RestartNeeded {
return nil
}
}
}
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
return err
}
}
}
// FetchNodeUpdatePlan resolves req (filling in the node ID and stored current
// version where needed) and asks the backend which update, if any, the node
// should apply for the requested product, platform, and channel.
//
// The shared default HTTP client has no timeout of its own, so when ctx carries
// no deadline the request is bounded to 30 seconds; otherwise a stalled backend
// would block the calling update loop indefinitely. A non-2xx response is
// returned as an error carrying the HTTP status.
func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan, error) {
	const defaultPlanFetchTimeout = 30 * time.Second

	var err error
	req, err = resolveUpdateRequest(req)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	if _, hasDeadline := ctx.Deadline(); !hasDeadline {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, defaultPlanFetchTimeout)
		// The body is fully decoded before returning, so cancelling here is safe.
		defer cancel()
	}
	values := url.Values{}
	values.Set("product", req.Product)
	values.Set("current_version", req.CurrentVersion)
	values.Set("os", req.OS)
	values.Set("arch", req.Arch)
	values.Set("install_type", req.InstallType)
	if req.Channel != "" {
		values.Set("channel", req.Channel)
	}
	endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/plan?%s", req.BackendURL, url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	resp, err := http.DefaultClient.Do(httpReq)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return NodeUpdatePlan{}, fmt.Errorf("fetch update plan: %s", resp.Status)
	}
	var out NodeUpdatePlanResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		return NodeUpdatePlan{}, fmt.Errorf("decode update plan: %w", err)
	}
	return out.Plan, nil
}
// resolveUpdateRequest normalizes and validates req, then fills in values
// that can be recovered from the state directory.
//
// When req.NodeID is empty the node identity file under req.StateDir is
// loaded: a missing file or a blank node ID yields ErrNodeIdentityNotReady,
// any other load error is returned as-is. The identity's cluster ID is used
// only when the request has none. Finally, if a persisted update state for
// the same product exists and records a version, that version replaces
// req.CurrentVersion; failures to load the update state are ignored.
func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
	req = req.Normalize()
	if err := req.Validate(); err != nil {
		return UpdateRequest{}, err
	}

	if req.NodeID == "" {
		identity, err := state.Load(filepath.Join(req.StateDir, state.FileName))
		switch {
		case errors.Is(err, os.ErrNotExist):
			return UpdateRequest{}, ErrNodeIdentityNotReady
		case err != nil:
			return UpdateRequest{}, err
		}
		nodeID := strings.TrimSpace(identity.NodeID)
		if nodeID == "" {
			return UpdateRequest{}, ErrNodeIdentityNotReady
		}
		req.NodeID = nodeID
		if req.ClusterID == "" {
			req.ClusterID = strings.TrimSpace(identity.ClusterID)
		}
	}

	// The persisted state wins over the caller-supplied version.
	saved, err := loadUpdateState(req.StateDir, req.Product)
	if err == nil && saved.Product == req.Product && saved.CurrentVersion != "" {
		req.CurrentVersion = saved.CurrentVersion
	}
	return req, nil
}
// ReportNodeUpdateStatus POSTs request as JSON to
// <backendURL>/clusters/<clusterID>/nodes/<nodeID>/updates/status.
//
// backendURL is trimmed of surrounding whitespace and trailing slashes, and
// the cluster and node IDs are path-escaped. Any response outside the 2xx
// range is reported as an error carrying the HTTP status line; the response
// body is not inspected.
func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
	base := strings.TrimRight(strings.TrimSpace(backendURL), "/")
	statusURL := fmt.Sprintf(
		"%s/clusters/%s/nodes/%s/updates/status",
		base,
		url.PathEscape(clusterID),
		url.PathEscape(nodeID),
	)

	payload, err := json.Marshal(request)
	if err != nil {
		return err
	}
	post, err := http.NewRequestWithContext(ctx, http.MethodPost, statusURL, bytes.NewReader(payload))
	if err != nil {
		return err
	}
	post.Header.Set("Content-Type", "application/json")

	response, err := http.DefaultClient.Do(post)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return fmt.Errorf("report update status: %s", response.Status)
	}
	return nil
}
// runtimeConfigFromContainer rebuilds a RuntimeConfig from an existing
// container by running "<docker> inspect <containerName>" and reading the
// first inspected entry.
//
// Most settings come from the container's RAP_* environment variables; the
// image, network mode, restart policy and state-directory mount come from
// the inspect document itself. It returns the inspected container together
// with the normalized config, or an error when the command fails, its output
// is not valid JSON, or the result list is empty.
func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) {
	raw, err := runner.Run(ctx, docker, "inspect", containerName)
	if err != nil {
		return dockerInspectContainer{}, RuntimeConfig{}, err
	}
	var containers []dockerInspectContainer
	if err := json.Unmarshal([]byte(raw), &containers); err != nil {
		return dockerInspectContainer{}, RuntimeConfig{}, err
	}
	if len(containers) == 0 {
		return dockerInspectContainer{}, RuntimeConfig{}, fmt.Errorf("container %q not found", containerName)
	}

	current := containers[0]
	env := envMap(current.Config.Env)
	envInt := func(key string) int { return parseInt(env[key]) }
	envBool := func(key string) bool { return parseBool(env[key]) }

	cfg := RuntimeConfig{
		BackendURL:                 env["RAP_BACKEND_URL"],
		ClusterID:                  env["RAP_CLUSTER_ID"],
		NodeName:                   firstNonEmpty(env["RAP_NODE_NAME"], containerName),
		Image:                      current.Config.Image,
		ContainerName:              containerName,
		StateDir:                   hostStateDir(current),
		Network:                    firstNonEmpty(current.HostConfig.NetworkMode, DefaultNetwork),
		RestartPolicy:              firstNonEmpty(current.HostConfig.RestartPolicy.Name, "unless-stopped"),
		WorkloadSupervisionEnabled: envBool("RAP_WORKLOAD_SUPERVISION_ENABLED"),
		// NOTE(review): always enabled here rather than read from the
		// container environment - confirm this is intentional.
		MeshSyntheticRuntimeEnabled:     true,
		MeshProductionForwardingEnabled: envBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED"),
		MeshListenAddr:                  env["RAP_MESH_LISTEN_ADDR"],
		MeshListenPortMode:              env["RAP_MESH_LISTEN_PORT_MODE"],
		MeshListenAutoPortStart:         envInt("RAP_MESH_LISTEN_AUTO_PORT_START"),
		MeshListenAutoPortEnd:           envInt("RAP_MESH_LISTEN_AUTO_PORT_END"),
		MeshAdvertiseEndpoint:           env["RAP_MESH_ADVERTISE_ENDPOINT"],
		MeshAdvertiseEndpointsJSON:      env["RAP_MESH_ADVERTISE_ENDPOINTS_JSON"],
		MeshAdvertiseTransport:          env["RAP_MESH_ADVERTISE_TRANSPORT"],
		MeshConnectivityMode:            env["RAP_MESH_CONNECTIVITY_MODE"],
		MeshNATType:                     env["RAP_MESH_NAT_TYPE"],
		MeshRegion:                      env["RAP_MESH_REGION"],
		HeartbeatIntervalSeconds:        envInt("RAP_HEARTBEAT_INTERVAL_SECONDS"),
		EnrollmentPollIntervalSeconds:   envInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS"),
		EnrollmentPollTimeoutSeconds:    envInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS"),
		ProductionObservationSinkCap:    envInt("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY"),
		DockerVPNGatewayEnabled:         dockerInspectHasVPNGatewayRuntime(current),
	}
	return current, cfg.Normalize(), nil
}
// dockerInspectHasVPNGatewayRuntime reports whether the inspected container
// has /dev/net/tun mapped (on either the host or the container side) and is
// additionally either privileged or granted the NET_ADMIN capability
// (matched case-insensitively after trimming whitespace).
func dockerInspectHasVPNGatewayRuntime(container dockerInspectContainer) bool {
	const tunDevice = "/dev/net/tun"

	tunMapped := false
	for _, dev := range container.HostConfig.Devices {
		if dev.PathOnHost == tunDevice || dev.PathInContainer == tunDevice {
			tunMapped = true
			break
		}
	}
	if !tunMapped {
		return false
	}
	if container.HostConfig.Privileged {
		return true
	}
	for _, capability := range container.HostConfig.CapAdd {
		if strings.EqualFold(strings.TrimSpace(capability), "NET_ADMIN") {
			return true
		}
	}
	return false
}
// waitContainerRunning polls "<docker> inspect --format {{.State.Running}}"
// for containerName until it prints "true".
//
// A timeout of zero performs exactly one check. Otherwise the check is
// repeated roughly once per second until the deadline passes, at which point
// the last command error is returned if there was one, or a "not running"
// error otherwise. Cancellation of ctx between polls returns ctx.Err().
func (m DockerManager) waitContainerRunning(ctx context.Context, runner CommandRunner, docker, containerName string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		state, runErr := runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", containerName)
		running := runErr == nil && strings.TrimSpace(state) == "true"
		switch {
		case running:
			return nil
		case timeout == 0 || time.Now().After(deadline):
			if runErr != nil {
				return runErr
			}
			return fmt.Errorf("container %q is not running", containerName)
		}

		select {
		case <-time.After(time.Second):
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// rollbackContainer reinstalls the container using the image recorded on the
// previously inspected container.
//
// It is a no-op (nil error) when rollback is not allowed or the previous
// image is blank. The rollback config is a copy of cfg with the image
// swapped, all artifact download fields cleared and Replace forced on. After
// a successful Install the running state is inspected once, but the outcome
// of that probe is discarded; only the Install error is returned.
func (m DockerManager) rollbackContainer(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig, previous dockerInspectContainer, allowed bool) error {
	if !allowed {
		return nil
	}
	if strings.TrimSpace(previous.Image) == "" {
		return nil
	}

	restore := cfg
	restore.Image = previous.Image
	restore.ImageArtifactURLs = nil
	restore.ImageArtifactSHA256 = ""
	restore.ImageArtifactSizeBytes = 0
	restore.Replace = true

	if _, err := m.Install(ctx, restore); err != nil {
		return err
	}
	// Best-effort probe; its result does not influence the return value.
	_, _ = runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", cfg.ContainerName)
	return nil
}
// artifactImage picks the image reference to use for a release artifact.
//
// Precedence: a non-blank "image" field in the artifact's JSON metadata;
// then "<product>:<version>" when the artifact's install type equals
// DefaultUpdateInstallType and both product and version are set; finally the
// first non-empty of fallback and DefaultImage. Metadata that fails to parse
// is ignored.
func artifactImage(artifact ReleaseArtifact, fallback string) string {
	var meta struct {
		Image string `json:"image"`
	}
	if len(artifact.Metadata) > 0 && json.Unmarshal(artifact.Metadata, &meta) == nil {
		if image := strings.TrimSpace(meta.Image); image != "" {
			return image
		}
	}

	defaultInstall := artifact.InstallType == DefaultUpdateInstallType
	if defaultInstall && artifact.Product != "" && artifact.Version != "" {
		return strings.TrimSpace(artifact.Product) + ":" + strings.TrimSpace(artifact.Version)
	}
	return firstNonEmpty(fallback, DefaultImage)
}
// artifactURLs returns the artifact's download URLs: the primary URL first,
// followed by the entries of URLs, each trimmed of whitespace, with blank
// entries and duplicates removed while preserving order.
func artifactURLs(artifact ReleaseArtifact) []string {
	candidates := make([]string, 0, 1+len(artifact.URLs))
	candidates = append(candidates, artifact.URL)
	candidates = append(candidates, artifact.URLs...)

	unique := make([]string, 0, len(candidates))
	for _, candidate := range candidates {
		trimmed := strings.TrimSpace(candidate)
		if trimmed != "" && !containsArtifactURL(unique, trimmed) {
			unique = append(unique, trimmed)
		}
	}
	return unique
}
// artifactURLsForBackend returns artifactURLs(artifact) with every entry that
// starts with "/" prefixed by the scheme and host of backendURL (the
// backend's path is not included). Duplicates produced by that rewriting are
// dropped. When backendURL cannot be parsed or lacks a scheme or host, the
// URLs are returned unchanged.
func artifactURLsForBackend(artifact ReleaseArtifact, backendURL string) []string {
	urls := artifactURLs(artifact)

	backend, err := url.Parse(strings.TrimSpace(backendURL))
	if err != nil {
		return urls
	}
	if backend.Scheme == "" || backend.Host == "" {
		return urls
	}
	origin := backend.Scheme + "://" + backend.Host

	resolved := make([]string, 0, len(urls))
	for _, candidate := range urls {
		absolute := candidate
		if strings.HasPrefix(candidate, "/") {
			absolute = origin + candidate
		}
		if containsArtifactURL(resolved, absolute) {
			continue
		}
		resolved = append(resolved, absolute)
	}
	return resolved
}
// containsArtifactURL reports whether value occurs in values, using exact
// string equality.
func containsArtifactURL(values []string, value string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == value {
			return true
		}
	}
	return false
}
// statusFromError builds the status report for a failed update step. The
// product and current version come from req, the target version and attempt
// ID from plan, and err's message is attached as ErrorMessage. ObservedAt is
// the current UTC time. err must be non-nil, since err.Error() is called
// unconditionally.
func statusFromError(req UpdateRequest, plan NodeUpdatePlan, phase, status string, err error) NodeUpdateStatusRequest {
	message := err.Error()
	report := NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          phase,
		Status:         status,
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
	}
	report.ErrorMessage = &message
	return report
}
// statusFromNoopPlan builds the phase "plan" / status "noop" report sent when
// a plan requires no work. The payload echoes the plan's action, reason,
// strategy and channel; ObservedAt is the current UTC time.
func statusFromNoopPlan(req UpdateRequest, plan NodeUpdatePlan) NodeUpdateStatusRequest {
	details := map[string]any{
		"action":   plan.Action,
		"reason":   plan.Reason,
		"strategy": plan.Strategy,
		"channel":  plan.Channel,
	}
	report := NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "plan",
		Status:         "noop",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
	}
	report.Payload = details
	return report
}
// updateAttemptID derives an identifier for an update attempt by joining the
// plan's node ID, product and target version with ":"; the artifact ID is
// appended as a fourth segment when the plan carries an artifact.
func updateAttemptID(plan NodeUpdatePlan) string {
	id := plan.NodeID + ":" + plan.Product + ":" + plan.TargetVersion
	if plan.Artifact != nil {
		id += ":" + plan.Artifact.ID
	}
	return id
}
// envMap converts "KEY=VALUE" entries into a map. Each entry is split at its
// first "="; entries without "=" are skipped, and a later entry overrides an
// earlier one with the same key. The result is never nil.
func envMap(items []string) map[string]string {
	parsed := make(map[string]string)
	for _, entry := range items {
		sep := strings.IndexByte(entry, '=')
		if sep < 0 {
			continue
		}
		parsed[entry[:sep]] = entry[sep+1:]
	}
	return parsed
}
// hostStateDir returns the host-side source of the mount whose destination
// is /var/lib/rap-node-agent, skipping mounts with an empty source. When no
// such mount exists it returns DefaultStateDir.
func hostStateDir(container dockerInspectContainer) string {
	const containerStateDir = "/var/lib/rap-node-agent"
	for _, m := range container.Mounts {
		if m.Destination != containerStateDir || m.Source == "" {
			continue
		}
		return m.Source
	}
	return DefaultStateDir
}
// parseBool reports whether value, after trimming whitespace and lowering
// case, is one of "1", "true", "yes", "y" or "on". Everything else,
// including the empty string, is false.
func parseBool(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	for _, truthy := range [...]string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true
		}
	}
	return false
}
// parseInt converts value to an int after trimming whitespace. The
// conversion error is deliberately discarded, so the caller receives whatever
// strconv.Atoi produced alongside it (0 for empty or malformed input).
func parseInt(value string) int {
	trimmed := strings.TrimSpace(value)
	parsed, _ := strconv.Atoi(trimmed)
	return parsed
}
// loadUpdateState reads the persisted update state for product from stateDir.
//
// A blank stateDir yields os.ErrNotExist. The product is normalized and
// defaults to DefaultUpdateProduct. If reading the product's state file fails
// and the product is the default one, a second read of
// <stateDir>/UpdateStateFileName is attempted before giving up. A state
// without a recorded product is stamped with the requested product.
func loadUpdateState(stateDir string, product string) (UpdateState, error) {
	dir := strings.TrimSpace(stateDir)
	if dir == "" {
		return UpdateState{}, os.ErrNotExist
	}
	token := firstNonEmpty(normalizeUpdateProductToken(product), DefaultUpdateProduct)

	raw, err := os.ReadFile(updateStatePath(dir, token))
	if err != nil && token == DefaultUpdateProduct {
		// NOTE(review): updateStatePath appears to resolve the default
		// product to this same file, so this retry may be redundant - confirm.
		raw, err = os.ReadFile(filepath.Join(dir, UpdateStateFileName))
	}
	if err != nil {
		return UpdateState{}, err
	}

	var loaded UpdateState
	if err := json.Unmarshal(raw, &loaded); err != nil {
		return UpdateState{}, err
	}
	loaded.Product = firstNonEmpty(loaded.Product, token)
	return loaded, nil
}
// saveUpdateState persists item as indented JSON in the product's state file
// under stateDir.
//
// Nothing is written (and nil is returned) when stateDir is blank or the item
// has no current version. A missing product defaults to DefaultUpdateProduct
// and a zero UpdatedAt is set to the current UTC time. The directory is
// created with mode 0700 and the file written with mode 0600.
func saveUpdateState(stateDir string, item UpdateState) error {
	dir := strings.TrimSpace(stateDir)
	if dir == "" || item.CurrentVersion == "" {
		return nil
	}

	item.Product = firstNonEmpty(item.Product, DefaultUpdateProduct)
	if item.UpdatedAt.IsZero() {
		item.UpdatedAt = time.Now().UTC()
	}

	if err := os.MkdirAll(dir, 0o700); err != nil {
		return err
	}
	encoded, err := json.MarshalIndent(item, "", " ")
	if err != nil {
		return err
	}
	target := updateStatePath(dir, item.Product)
	return os.WriteFile(target, encoded, 0o600)
}
// updateStatePath returns the state file path for product inside stateDir.
// The default product (or a product that normalizes to nothing) maps to
// UpdateStateFileName; any other product maps to
// "host-update-state-<product>.json" using the normalized product token.
func updateStatePath(stateDir, product string) string {
	token := normalizeUpdateProductToken(firstNonEmpty(product, DefaultUpdateProduct))
	switch token {
	case "", DefaultUpdateProduct:
		return filepath.Join(stateDir, UpdateStateFileName)
	default:
		return filepath.Join(stateDir, "host-update-state-"+token+".json")
	}
}
// UpdateTriggerPath returns the location of the update trigger file:
// UpdateTriggerFileName inside stateDir, with stateDir trimmed of surrounding
// whitespace.
func UpdateTriggerPath(stateDir string) string {
	dir := strings.TrimSpace(stateDir)
	return filepath.Join(dir, UpdateTriggerFileName)
}
// SaveUpdateTrigger writes trigger as indented JSON to the update trigger
// file under stateDir.
//
// Nothing is written (and nil is returned) when stateDir or the trimmed
// generation is blank. A missing schema version defaults to
// "rap.node_update_trigger.v1" and a zero ObservedAt to the current UTC time.
// The directory is created with mode 0700 and the file written with mode
// 0600.
func SaveUpdateTrigger(stateDir string, trigger UpdateTrigger) error {
	dir := strings.TrimSpace(stateDir)
	trigger.Generation = strings.TrimSpace(trigger.Generation)
	if dir == "" || trigger.Generation == "" {
		return nil
	}

	if trigger.SchemaVersion == "" {
		trigger.SchemaVersion = "rap.node_update_trigger.v1"
	}
	if trigger.ObservedAt.IsZero() {
		trigger.ObservedAt = time.Now().UTC()
	}

	if err := os.MkdirAll(dir, 0o700); err != nil {
		return err
	}
	encoded, err := json.MarshalIndent(trigger, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(UpdateTriggerPath(dir), encoded, 0o600)
}
// currentUpdateTriggerGeneration returns the trimmed generation stored in the
// update trigger file under stateDir, or "" when the file cannot be read or
// does not contain valid JSON.
func currentUpdateTriggerGeneration(stateDir string) string {
	raw, readErr := os.ReadFile(UpdateTriggerPath(stateDir))
	if readErr != nil {
		return ""
	}
	var stored UpdateTrigger
	if json.Unmarshal(raw, &stored) != nil {
		return ""
	}
	return strings.TrimSpace(stored.Generation)
}
// CurrentUpdateTriggerGenerationForNodeAgent is the exported wrapper around
// currentUpdateTriggerGeneration: it returns that function's result for
// stateDir unchanged.
func CurrentUpdateTriggerGenerationForNodeAgent(stateDir string) string {
return currentUpdateTriggerGeneration(stateDir)
}
// normalizeUpdateProductToken lowercases and trims value, then drops every
// rune that is not an ASCII letter, an ASCII digit, '-', '_' or '.'.
func normalizeUpdateProductToken(value string) string {
	keep := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
			return r
		case r == '-', r == '_', r == '.':
			return r
		}
		return -1 // a negative rune tells strings.Map to drop the character
	}
	return strings.Map(keep, strings.ToLower(strings.TrimSpace(value)))
}
// sleepContext blocks for duration or until ctx is done, whichever comes
// first. It returns ctx.Err() when the context ends the wait and nil
// otherwise; a non-positive duration returns nil immediately without
// consulting ctx.
func sleepContext(ctx context.Context, duration time.Duration) error {
	if duration > 0 {
		wait := time.NewTimer(duration)
		defer wait.Stop()
		select {
		case <-wait.C:
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	return nil
}
// sleepUntilUpdateIntervalOrTrigger waits for duration, waking early when the
// update trigger file under stateDir reports a new generation.
//
// The trigger file is polled every five seconds. A wake-up happens when the
// stored generation is non-empty and differs from *lastGeneration, which is
// then updated; with a nil lastGeneration the trigger is never acted on. It
// returns ctx.Err() if the context ends first and nil otherwise. A
// non-positive duration returns nil immediately.
func sleepUntilUpdateIntervalOrTrigger(ctx context.Context, stateDir string, duration time.Duration, lastGeneration *string) error {
	if duration <= 0 {
		return nil
	}
	const pollEvery = 5 * time.Second

	expiry := time.NewTimer(duration)
	defer expiry.Stop()
	poll := time.NewTicker(pollEvery)
	defer poll.Stop()

	for {
		select {
		case <-poll.C:
			current := currentUpdateTriggerGeneration(stateDir)
			if current == "" || lastGeneration == nil || current == *lastGeneration {
				continue
			}
			*lastGeneration = current
			return nil
		case <-expiry.C:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
// jitteredDuration returns base shifted by a uniformly random offset of at
// most base*jitter in either direction.
//
// jitter is a fraction of base. Non-positive base or jitter returns base
// unchanged. The fraction is capped at 1 and the spread is kept strictly
// below base, so the result is always positive: a zero or negative result
// would make the sleep helpers return immediately and turn the update loop
// into a busy loop. Capping the fraction before the float-to-int conversion
// also avoids converting an out-of-range product.
func jitteredDuration(base time.Duration, jitter float64) time.Duration {
	if base <= 0 || jitter <= 0 {
		return base
	}
	if jitter > 1 {
		jitter = 1
	}
	spread := int64(float64(base) * jitter)
	// Keep the lowest possible result (base - spread) at 1ns or more.
	if limit := int64(base) - 1; spread > limit {
		spread = limit
	}
	if spread <= 0 {
		return base
	}
	offset := rand.Int63n(spread*2+1) - spread
	return base + time.Duration(offset)
}
@@ -0,0 +1,672 @@
package hostagent
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// updateRunner is a fake command runner for the update tests. It records
// every invocation and answers docker-style commands with canned output
// (see its Run method).
type updateRunner struct {
// calls holds one entry per Run invocation: the command name followed by
// its arguments.
calls [][]string
// healthOkay selects the reply to "inspect --format ...": "true" when set,
// "false" otherwise.
healthOkay bool
// inspectJSON is returned verbatim for a plain two-argument "inspect".
inspectJSON string
}
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
urls := artifactURLsForBackend(ReleaseArtifact{
URL: "/downloads/rap-node-agent-0.2.92.tar",
URLs: []string{"/downloads/mirror.tar", "https://cdn.example.test/agent.tar"},
}, "http://control.example.test:18080/api/v1")
want := []string{
"http://control.example.test:18080/downloads/rap-node-agent-0.2.92.tar",
"http://control.example.test:18080/downloads/mirror.tar",
"https://cdn.example.test/agent.tar",
}
if len(urls) != len(want) {
t.Fatalf("urls = %#v", urls)
}
for i := range want {
if urls[i] != want[i] {
t.Fatalf("urls[%d] = %q, want %q; all=%#v", i, urls[i], want[i], urls)
}
}
}
// Run records the invocation and returns canned output keyed on the first
// arguments:
//
//   - "inspect --format ...": "true\n" or "false\n" depending on healthOkay
//   - "inspect <name>" (exactly two arguments): inspectJSON
//   - "image inspect ...": "[]"
//   - "run ...": "updated-container\n"
//
// Every other command yields an empty string. The error is always nil.
func (r *updateRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))
	if len(args) == 0 {
		return "", nil
	}

	switch verb := args[0]; {
	case verb == "inspect" && len(args) >= 2 && args[1] == "--format":
		if r.healthOkay {
			return "true\n", nil
		}
		return "false\n", nil
	case verb == "inspect" && len(args) == 2:
		return r.inspectJSON, nil
	case verb == "image" && len(args) >= 2 && args[1] == "inspect":
		return "[]", nil
	case verb == "run":
		return "updated-container\n", nil
	}
	return "", nil
}
// TestApplyUpdateFetchesPlanLoadsImageAndRecreatesContainer runs
// DockerManager.ApplyUpdate against a fake control plane that serves an
// "update" plan plus the artifact, and checks that the image is loaded, the
// container is recreated with its previous environment, and three status
// reports (planned, download, succeeded) are posted.
//
// The handler runs on the test server's goroutines, so it reports problems
// with t.Errorf and an HTTP error response instead of t.Fatalf; FailNow may
// only be called from the goroutine running the test.
func TestApplyUpdateFetchesPlanLoadsImageAndRecreatesContainer(t *testing.T) {
	artifactBody := []byte("fake docker image tar")
	statuses := []NodeUpdateStatusRequest{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"schema_version":        "rap.node_update_plan.v1",
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               "rap-node-agent",
					"current_version":       "0.1.0-old",
					"action":                "update",
					"reason":                "matching_release_available",
					"target_version":        "0.1.0-new",
					"rollback_allowed":      true,
					"health_window_seconds": 1,
					"production_forwarding": false,
					"artifact": map[string]any{
						"id":           "artifact-1",
						"product":      "rap-node-agent",
						"version":      "0.1.0-new",
						"os":           "linux",
						"arch":         "amd64",
						"install_type": "docker",
						"url":          serverArtifactURL(r),
						"sha256":       "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
						"size_bytes":   len(artifactBody),
						"metadata":     map[string]any{"image": "rap-node-agent:test-new"},
					},
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status payload", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
	result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		CurrentVersion: "0.1.0-old",
		ContainerName:  "rap-node-agent-node-1",
		HealthTimeout:  time.Second,
	})
	if err != nil {
		t.Fatalf("apply update: %v", err)
	}
	if result.Action != "update" || !result.Loaded || !result.Replaced || result.NewImage != "rap-node-agent:test-new" {
		t.Fatalf("unexpected result: %+v", result)
	}

	// Calls are flattened and NUL-joined so adjacent arguments can be matched.
	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	for _, want := range []string{"inspect\x00rap-node-agent-node-1", "load\x00-i", "rm\x00-f\x00rap-node-agent-node-1", "run\x00-d", "RAP_NODE_NAME=node-a"} {
		if !strings.Contains(joined, want) {
			t.Fatalf("missing docker call part %q in %#v", want, runner.calls)
		}
	}
	if len(statuses) != 3 || statuses[0].Phase != "planned" || statuses[1].Phase != "download" || statuses[2].Status != "succeeded" {
		t.Fatalf("statuses = %+v", statuses)
	}
}
// TestApplyUpdatePreservesDockerVPNGatewayRuntime verifies that when the
// existing container was privileged with NET_ADMIN and /dev/net/tun, the
// replacement "docker run" carries the same flags.
//
// statHostPath is stubbed for the duration of the test and restored in a
// cleanup. The handler reports unexpected requests with t.Errorf because it
// runs on a server goroutine, where t.Fatalf must not be called.
func TestApplyUpdatePreservesDockerVPNGatewayRuntime(t *testing.T) {
	previousStatHostPath := statHostPath
	statHostPath = func(string) (os.FileInfo, error) { return nil, nil }
	t.Cleanup(func() { statHostPath = previousStatHostPath })

	artifactBody := []byte("fake docker image tar")
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"schema_version":        "rap.node_update_plan.v1",
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               "rap-node-agent",
					"current_version":       "0.2.7",
					"action":                "update",
					"reason":                "matching_release_available",
					"target_version":        "0.2.8",
					"rollback_allowed":      true,
					"health_window_seconds": 1,
					"artifact": map[string]any{
						"id":           "artifact-1",
						"product":      "rap-node-agent",
						"version":      "0.2.8",
						"os":           "linux",
						"arch":         "amd64",
						"install_type": "docker",
						"url":          serverArtifactURL(r),
						"sha256":       "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
						"size_bytes":   len(artifactBody),
						"metadata":     map[string]any{"image": "rap-node-agent:test-new"},
					},
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixtureWithVPNGatewayRuntime()}
	result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		CurrentVersion: "0.2.7",
		ContainerName:  "rap-node-agent-node-1",
		HealthTimeout:  time.Second,
	})
	if err != nil {
		t.Fatalf("ApplyUpdate failed: %v", err)
	}
	if !result.Replaced {
		t.Fatalf("expected replacement")
	}

	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	for _, want := range []string{"--privileged", "--cap-add\x00NET_ADMIN", "--device\x00/dev/net/tun:/dev/net/tun"} {
		if !strings.Contains(joined, want) {
			t.Fatalf("docker run did not preserve %q in %#v", want, runner.calls)
		}
	}
}
// TestApplyUpdateNoopsWithoutDockerWhenPlanHasNoAction checks that a plan
// with action "none" results in no docker invocations and exactly one
// plan/noop status report.
//
// Handler-side failures use t.Errorf plus an HTTP error response: the handler
// runs on a server goroutine, where t.Fatalf must not be called.
func TestApplyUpdateNoopsWithoutDockerWhenPlanHasNoAction(t *testing.T) {
	statuses := []NodeUpdateStatusRequest{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":      "cluster-1",
					"node_id":         "node-1",
					"product":         "rap-node-agent",
					"current_version": "0.1.3",
					"action":          "none",
					"reason":          "already_current",
					"target_version":  "0.1.3",
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status payload", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	runner := &updateRunner{}
	result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		CurrentVersion: "0.1.3",
		ContainerName:  "rap-node-agent-node-1",
	})
	if err != nil {
		t.Fatalf("apply update: %v", err)
	}
	if result.Action != "none" || result.Reason != "already_current" {
		t.Fatalf("result = %+v", result)
	}
	if len(runner.calls) != 0 {
		t.Fatalf("docker should not be called, got %#v", runner.calls)
	}
	if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" || statuses[0].TargetVersion != "0.1.3" {
		t.Fatalf("statuses = %+v", statuses)
	}
}
// TestWindowsApplyUpdateNoopReportsTaskStatus checks that
// WindowsManager.ApplyUpdate, given a plan with action "none", posts a single
// plan/noop status whose payload names the configured Windows task.
//
// Handler-side failures use t.Errorf plus an HTTP error response: the handler
// runs on a server goroutine, where t.Fatalf must not be called.
func TestWindowsApplyUpdateNoopReportsTaskStatus(t *testing.T) {
	statuses := []NodeUpdateStatusRequest{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":      "cluster-1",
					"node_id":         "node-1",
					"product":         "rap-node-agent",
					"current_version": "0.1.3",
					"action":          "none",
					"reason":          "already_current",
					"target_version":  "0.1.3",
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status payload", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	result, err := (WindowsManager{Runner: &updateRunner{}}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:      server.URL,
		ClusterID:       "cluster-1",
		NodeID:          "node-1",
		CurrentVersion:  "0.1.3",
		InstallType:     WindowsUpdateInstallType,
		BinaryPath:      `C:\Program Files\RAP\node\rap-node-agent.exe`,
		WindowsTaskName: "RAP Node Agent node",
	})
	if err != nil {
		t.Fatalf("windows apply update: %v", err)
	}
	if result.Action != "none" || result.Reason != "already_current" {
		t.Fatalf("result = %+v", result)
	}
	if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" {
		t.Fatalf("statuses = %+v", statuses)
	}
	if statuses[0].Payload["task"] != "RAP Node Agent node" {
		t.Fatalf("status payload = %+v", statuses[0].Payload)
	}
}
// TestRunUpdateLoopAdvancesCurrentVersionAfterSuccessfulUpdate runs the
// update loop for two iterations and checks that the second plan request
// reports the version installed by the first one.
//
// The fake control plane answers "update" until it is asked with
// current_version "0.1.0-new". Unexpected requests are reported with t.Errorf
// because the handler runs on a server goroutine, where t.Fatalf must not be
// called.
func TestRunUpdateLoopAdvancesCurrentVersionAfterSuccessfulUpdate(t *testing.T) {
	artifactBody := []byte("fake docker image tar")
	planRequests := []string{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			current := r.URL.Query().Get("current_version")
			planRequests = append(planRequests, current)
			action := "update"
			reason := "matching_release_available"
			if current == "0.1.0-new" {
				action = "none"
				reason = "already_current"
			}
			plan := map[string]any{
				"cluster_id":            "cluster-1",
				"node_id":               "node-1",
				"product":               "rap-node-agent",
				"current_version":       current,
				"action":                action,
				"reason":                reason,
				"target_version":        "0.1.0-new",
				"rollback_allowed":      true,
				"production_forwarding": false,
			}
			if action == "update" {
				plan["artifact"] = map[string]any{
					"id":           "artifact-1",
					"product":      "rap-node-agent",
					"version":      "0.1.0-new",
					"os":           "linux",
					"arch":         "amd64",
					"install_type": "docker",
					"url":          serverArtifactURL(r),
					"sha256":       "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
					"size_bytes":   len(artifactBody),
					"metadata":     map[string]any{"image": "rap-node-agent:test-new"},
				}
			}
			_ = json.NewEncoder(w).Encode(map[string]any{"node_update_plan": plan})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
	err := (DockerManager{Runner: runner}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
		Request: UpdateRequest{
			BackendURL:     server.URL,
			ClusterID:      "cluster-1",
			NodeID:         "node-1",
			CurrentVersion: "0.1.0-old",
			ContainerName:  "rap-node-agent-node-1",
			HealthTimeout:  time.Second,
		},
		Interval: time.Millisecond,
		MaxRuns:  2,
	})
	if err != nil {
		t.Fatalf("run update loop: %v", err)
	}
	if strings.Join(planRequests, ",") != "0.1.0-old,0.1.0-new" {
		t.Fatalf("plan current versions = %#v", planRequests)
	}
}
// TestRunUpdateLoopReportsHostAgentStatusWhenEnabled checks that one loop
// iteration with HostAgentUpdateEnabled requests a plan for rap-node-agent
// and then for rap-host-agent, and posts a noop status for each.
//
// Handler-side failures use t.Errorf plus an HTTP error response: the handler
// runs on a server goroutine, where t.Fatalf must not be called.
func TestRunUpdateLoopReportsHostAgentStatusWhenEnabled(t *testing.T) {
	statuses := []NodeUpdateStatusRequest{}
	planProducts := []string{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			product := r.URL.Query().Get("product")
			planProducts = append(planProducts, product)
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               product,
					"current_version":       "0.1.3",
					"action":                "none",
					"reason":                "already_current",
					"target_version":        "0.1.3",
					"rollback_allowed":      true,
					"production_forwarding": false,
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status payload", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	err := (DockerManager{}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
		Request: UpdateRequest{
			BackendURL:     server.URL,
			ClusterID:      "cluster-1",
			NodeID:         "node-1",
			CurrentVersion: "0.1.3",
			ContainerName:  "rap-node-agent-node-1",
		},
		HostAgentUpdateEnabled: true,
		HostAgentUpdateRequest: HostAgentUpdateRequest{
			CurrentVersion: "0.1.3",
			BinaryPath:     filepath.Join(t.TempDir(), "rap-host-agent"),
		},
		MaxRuns: 1,
	})
	if err != nil {
		t.Fatalf("run update loop: %v", err)
	}
	if strings.Join(planProducts, ",") != "rap-node-agent,rap-host-agent" {
		t.Fatalf("plan products = %#v", planProducts)
	}
	if len(statuses) != 2 || statuses[0].Product != "rap-node-agent" || statuses[1].Product != "rap-host-agent" {
		t.Fatalf("statuses = %+v", statuses)
	}
	if statuses[1].Phase != "plan" || statuses[1].Status != "noop" {
		t.Fatalf("host-agent status = %+v", statuses[1])
	}
}
// TestFetchNodeUpdatePlanResolvesNodeIDAndVersionFromStateDir checks that a
// request without a node ID takes it from the identity file in the state
// directory, and that the version recorded in the persisted update state
// overrides the CurrentVersion supplied by the caller.
func TestFetchNodeUpdatePlanResolvesNodeIDAndVersionFromStateDir(t *testing.T) {
	stateDir := t.TempDir()

	identity := state.Identity{
		NodeID:    "node-from-state",
		ClusterID: "cluster-1",
		NodeName:  "node-a",
	}
	if err := state.Save(filepath.Join(stateDir, state.FileName), identity); err != nil {
		t.Fatalf("save identity: %v", err)
	}
	persisted := UpdateState{
		Product:        "rap-node-agent",
		CurrentVersion: "0.1.0-state",
	}
	if err := saveUpdateState(stateDir, persisted); err != nil {
		t.Fatalf("save update state: %v", err)
	}

	// Captured by the handler for the assertions below.
	var (
		gotPath    string
		gotCurrent string
	)
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		gotPath = r.URL.Path
		gotCurrent = r.URL.Query().Get("current_version")
		_ = json.NewEncoder(w).Encode(map[string]any{
			"node_update_plan": map[string]any{
				"cluster_id": "cluster-1",
				"node_id":    "node-from-state",
				"product":    "rap-node-agent",
				"action":     "none",
				"reason":     "already_current",
			},
		})
	}))
	defer server.Close()

	request := UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		StateDir:       stateDir,
		CurrentVersion: "0.1.0-flag",
	}
	if _, err := FetchNodeUpdatePlan(context.Background(), request); err != nil {
		t.Fatalf("fetch plan: %v", err)
	}

	pathOK := strings.Contains(gotPath, "/nodes/node-from-state/updates/plan")
	if !pathOK || gotCurrent != "0.1.0-state" {
		t.Fatalf("path/current = %q/%q", gotPath, gotCurrent)
	}
}
// TestApplyHostAgentUpdateDownloadsAndReplacesBinary checks that
// ApplyHostAgentUpdate downloads the artifact named in the plan, writes it to
// BinaryPath, reports that a restart is needed, and records the new version
// in the host-agent update state.
//
// The node ID is resolved from the identity file saved in the state
// directory. Handler-side failures use t.Errorf plus an HTTP error response:
// the handler runs on a server goroutine, where t.Fatalf must not be called.
func TestApplyHostAgentUpdateDownloadsAndReplacesBinary(t *testing.T) {
	dir := t.TempDir()
	if err := state.Save(filepath.Join(dir, state.FileName), state.Identity{
		NodeID:    "node-1",
		ClusterID: "cluster-1",
		NodeName:  "node-a",
	}); err != nil {
		t.Fatalf("save identity: %v", err)
	}
	binaryPath := filepath.Join(dir, "rap-host-agent")
	artifactBody := []byte("new host agent binary")
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			if r.URL.Query().Get("product") != HostAgentUpdateProduct || r.URL.Query().Get("install_type") != BinaryUpdateInstallType {
				t.Errorf("unexpected query: %s", r.URL.RawQuery)
				http.Error(w, "unexpected query", http.StatusBadRequest)
				return
			}
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               HostAgentUpdateProduct,
					"action":                "update",
					"reason":                "matching_release_available",
					"target_version":        "0.1.0-host-new",
					"rollback_allowed":      false,
					"production_forwarding": false,
					"artifact": map[string]any{
						"id":           "artifact-host-1",
						"product":      HostAgentUpdateProduct,
						"version":      "0.1.0-host-new",
						"os":           "linux",
						"arch":         "amd64",
						"install_type": BinaryUpdateInstallType,
						"url":          serverArtifactURL(r),
						"sha256":       "adc549d9e66ef64a507dd6880590d31309e16a3be965a92d849edd103cfb1815",
						"size_bytes":   len(artifactBody),
					},
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.NotFound(w, r)
		}
	}))
	defer server.Close()

	result, err := (DockerManager{}).ApplyHostAgentUpdate(context.Background(), HostAgentUpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		StateDir:       dir,
		CurrentVersion: "0.1.0-host-old",
		BinaryPath:     binaryPath,
	})
	if err != nil {
		t.Fatalf("apply host-agent update: %v", err)
	}
	if !result.Replaced || !result.RestartNeeded {
		t.Fatalf("result = %+v", result)
	}
	payload, err := os.ReadFile(binaryPath)
	if err != nil || string(payload) != string(artifactBody) {
		t.Fatalf("binary payload = %q, %v", payload, err)
	}
	updateState, err := loadUpdateState(dir, HostAgentUpdateProduct)
	if err != nil {
		t.Fatalf("load update state: %v", err)
	}
	if updateState.Product != HostAgentUpdateProduct || updateState.CurrentVersion != "0.1.0-host-new" {
		t.Fatalf("update state = %+v", updateState)
	}
}
func TestUpdateStateIsProductScoped(t *testing.T) {
dir := t.TempDir()
if err := saveUpdateState(dir, UpdateState{Product: DefaultUpdateProduct, CurrentVersion: "node-v"}); err != nil {
t.Fatalf("save node state: %v", err)
}
if err := saveUpdateState(dir, UpdateState{Product: HostAgentUpdateProduct, CurrentVersion: "host-v"}); err != nil {
t.Fatalf("save host state: %v", err)
}
nodeState, err := loadUpdateState(dir, DefaultUpdateProduct)
if err != nil {
t.Fatalf("load node state: %v", err)
}
hostState, err := loadUpdateState(dir, HostAgentUpdateProduct)
if err != nil {
t.Fatalf("load host state: %v", err)
}
if nodeState.CurrentVersion != "node-v" || hostState.CurrentVersion != "host-v" {
t.Fatalf("states overlapped: node=%+v host=%+v", nodeState, hostState)
}
}
func TestArtifactImageDerivesDockerTagFromProductAndVersion(t *testing.T) {
got := artifactImage(ReleaseArtifact{
Product: "rap-node-agent",
Version: "0.2.77",
InstallType: DefaultUpdateInstallType,
}, "rap-node-agent:old")
if got != "rap-node-agent:0.2.77" {
t.Fatalf("expected versioned docker image, got %q", got)
}
}
// serverArtifactURL builds the absolute URL of "/artifact.tar" on the host the
// request was addressed to. The scheme is "https" when the request carries TLS
// connection state and "http" otherwise.
func serverArtifactURL(r *http.Request) string {
	if r.TLS == nil {
		return fmt.Sprintf("%s://%s/artifact.tar", "http", r.Host)
	}
	return fmt.Sprintf("%s://%s/artifact.tar", "https", r.Host)
}
// dockerInspectFixture returns a fixed JSON array containing one container
// object for use in tests. The container has Id "old-container", image
// "rap-node-agent:test-old", a set of RAP_* environment variables, host
// networking, an "unless-stopped" restart policy, one state-directory mount,
// and is marked as running. The string argument is ignored.
func dockerInspectFixture(_ string) string {
return `[
{
"Id": "old-container",
"Image": "sha256:oldimage",
"Config": {
"Image": "rap-node-agent:test-old",
"Env": [
"RAP_BACKEND_URL=http://control/api/v1",
"RAP_CLUSTER_ID=cluster-1",
"RAP_NODE_NAME=node-a",
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
"RAP_MESH_LISTEN_ADDR=:19131"
]
},
"HostConfig": {
"NetworkMode": "host",
"RestartPolicy": {"Name": "unless-stopped"}
},
"Mounts": [
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
],
"State": {"Running": true}
}
]`
}
// dockerInspectFixtureWithVPNGatewayRuntime returns a fixed JSON array
// containing one container object for use in tests. It carries the same Id,
// image, environment, mount and running state as dockerInspectFixture, and its
// HostConfig additionally sets Privileged, adds the NET_ADMIN capability and
// maps the /dev/net/tun device.
func dockerInspectFixtureWithVPNGatewayRuntime() string {
return `[
{
"Id": "old-container",
"Image": "sha256:oldimage",
"Config": {
"Image": "rap-node-agent:test-old",
"Env": [
"RAP_BACKEND_URL=http://control/api/v1",
"RAP_CLUSTER_ID=cluster-1",
"RAP_NODE_NAME=node-a",
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
"RAP_MESH_LISTEN_ADDR=:19131"
]
},
"HostConfig": {
"NetworkMode": "host",
"Privileged": true,
"CapAdd": ["NET_ADMIN"],
"Devices": [
{"PathOnHost": "/dev/net/tun", "PathInContainer": "/dev/net/tun", "CgroupPermissions": "rwm"}
],
"RestartPolicy": {"Name": "unless-stopped"}
},
"Mounts": [
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
],
"State": {"Running": true}
}
]`
}
@@ -0,0 +1,368 @@
package hostagent
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
)
// Default base directories for Windows installs. A per-node sub-directory
// (the node name passed through safeUnitSlug) is joined onto each of them.
const (
// DefaultWindowsInstallDir is the base directory for node-agent binaries and scripts.
DefaultWindowsInstallDir = `C:\Program Files\RAP`
// DefaultWindowsStateRoot is the base directory for per-node state directories.
DefaultWindowsStateRoot = `C:\ProgramData\RAP\nodes`
)
// WindowsInstallConfig is the input to WindowsManager.Install.
type WindowsInstallConfig struct {
// RuntimeConfig holds the node-agent settings; Install normalizes and
// validates it and renders it into the environment script.
RuntimeConfig RuntimeConfig
// NodeID, when non-blank, is passed to the updater loop as --node-id.
NodeID string
// InstallDir is where binaries and scripts are written; empty selects a default.
InstallDir string
// StartupMode is lower-cased by Install and defaults to "auto". The values
// handled explicitly are "auto", "system-task", "user-task" and "none".
StartupMode string
// ArtifactURLs are candidate download locations for the node-agent binary.
ArtifactURLs []string
// ArtifactSHA256 and ArtifactSizeBytes are handed to downloadFirstArtifact
// together with ArtifactURLs.
ArtifactSHA256 string
ArtifactSizeBytes int64
// Replace forces a download even when the node-agent binary already exists
// and is copied into RuntimeConfig.Replace.
Replace bool
// DryRun makes Install return the computed result without touching the host.
DryRun bool
// AutoUpdateEnabled controls whether the host-agent updater task is installed.
AutoUpdateEnabled bool
// AutoUpdateCurrentVersion is the version passed to the updater loop; it
// falls back to "0.0.0" when empty.
AutoUpdateCurrentVersion string
// AutoUpdateChannel, when non-blank, is passed to the updater loop as --channel.
AutoUpdateChannel string
// AutoUpdateIntervalSeconds defaults to 21600 when zero.
AutoUpdateIntervalSeconds int
// AutoUpdateInitialDelaySeconds defaults to 15 when zero.
AutoUpdateInitialDelaySeconds int
// AutoUpdateHealthTimeoutSeconds defaults to 30 when zero.
AutoUpdateHealthTimeoutSeconds int
// HostAgentSourcePath is passed to installHostAgentBinary as the source of
// the host-agent executable.
HostAgentSourcePath string
}
// WindowsInstallResult describes what WindowsManager.Install resolved and did.
type WindowsInstallResult struct {
// NodeName is the node name taken from the normalized runtime config.
NodeName string
// InstallDir and StateDir are the directories actually used, which may
// differ from the requested ones after a per-user fallback.
InstallDir string
StateDir string
// NodeAgentPath is "rap-node-agent.exe" inside InstallDir.
NodeAgentPath string
// WrapperPath is "rap-node-agent-run.cmd" inside InstallDir.
WrapperPath string
// StartupMode is the mode reported by installStartupTask, or the requested
// mode when Install returns before creating a task.
StartupMode string
// TaskName is the scheduled-task name used for the node agent.
TaskName string
// HostAgentPath and UpdaterTaskName are set once the updater task is installed.
HostAgentPath string
UpdaterTaskName string
// Downloaded reports that a node-agent artifact was downloaded and copied into place.
Downloaded bool
// Started reports that the node-agent scheduled task was created.
Started bool
// UpdaterStarted reports that the updater scheduled task was created.
UpdaterStarted bool
// AdminFallback reports that a per-user location or per-user task was used
// instead of the system-wide one.
AdminFallback bool
}
// WindowsManager installs and updates the node agent on Windows hosts.
type WindowsManager struct {
// Runner executes external commands such as schtasks and powershell.
// Methods fall back to ExecRunner{} when it is nil.
Runner CommandRunner
}
// WindowsInstallConfigFromProfile converts an install profile into the config
// consumed by WindowsManager.Install.
//
// Empty StateDir and InstallDir fall back to a per-node directory under
// DefaultWindowsStateRoot and DefaultWindowsInstallDir, an empty StartupMode
// becomes "auto", and the result always has Replace and AutoUpdateEnabled set.
func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInstallConfig {
	slug := safeUnitSlug(profile.NodeName)

	cfg := WindowsInstallConfig{
		InstallDir:        firstNonEmpty(profile.InstallDir, filepath.Join(DefaultWindowsInstallDir, slug)),
		StartupMode:       firstNonEmpty(profile.StartupMode, "auto"),
		ArtifactURLs:      binaryArtifactURLs(profile),
		ArtifactSHA256:    binaryArtifactSHA256(profile),
		ArtifactSizeBytes: binaryArtifactSizeBytes(profile),
		Replace:           true,
		AutoUpdateEnabled: true,
	}

	// The runtime section is a field-by-field copy of the profile, apart from
	// the defaulted state directory.
	cfg.RuntimeConfig = RuntimeConfig{
		BackendURL:                      profile.BackendURL,
		ClusterID:                       profile.ClusterID,
		JoinToken:                       profile.JoinToken,
		NodeName:                        profile.NodeName,
		StateDir:                        firstNonEmpty(profile.StateDir, filepath.Join(DefaultWindowsStateRoot, slug)),
		WorkloadSupervisionEnabled:      profile.WorkloadSupervisionEnabled,
		MeshSyntheticRuntimeEnabled:     profile.MeshSyntheticRuntimeEnabled,
		MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
		MeshListenAddr:                  profile.MeshListenAddr,
		MeshListenPortMode:              profile.MeshListenPortMode,
		MeshListenAutoPortStart:         profile.MeshListenAutoPortStart,
		MeshListenAutoPortEnd:           profile.MeshListenAutoPortEnd,
		MeshAdvertiseEndpoint:           profile.MeshAdvertiseEndpoint,
		MeshAdvertiseEndpointsJSON:      string(profile.MeshAdvertiseEndpointsJSON),
		MeshAdvertiseTransport:          profile.MeshAdvertiseTransport,
		MeshConnectivityMode:            profile.MeshConnectivityMode,
		MeshNATType:                     profile.MeshNATType,
		MeshRegion:                      profile.MeshRegion,
		HeartbeatIntervalSeconds:        profile.HeartbeatIntervalSeconds,
		EnrollmentPollIntervalSeconds:   profile.EnrollmentPollIntervalSeconds,
		EnrollmentPollTimeoutSeconds:    profile.EnrollmentPollTimeoutSeconds,
		ProductionObservationSinkCap:    profile.ProductionObservationSinkCapacity,
	}
	return cfg
}
// Install provisions the node agent on a Windows host: it resolves the install
// and state directories, optionally downloads the node-agent binary, writes the
// environment and wrapper scripts, registers the startup task, and finally
// installs the host-agent updater task.
//
// With cfg.DryRun set, the resolved paths and names are returned without
// touching the host. On any other OS than Windows a non-dry run fails.
//
// In "auto" startup mode an access-denied error while creating the install
// directory switches both the install and state directories to per-user
// locations and sets AdminFallback on the result. The returned result is
// populated as far as the installation got, also when an error is returned.
func (m WindowsManager) Install(ctx context.Context, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
	cfg.NodeID = strings.TrimSpace(cfg.NodeID)
	if strings.TrimSpace(cfg.RuntimeConfig.StateDir) == "" {
		cfg.RuntimeConfig.StateDir = filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(cfg.RuntimeConfig.NodeName))
	}
	cfg.RuntimeConfig.Replace = cfg.Replace
	cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
	if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
		return WindowsInstallResult{}, err
	}

	cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "auto"))
	noAdminPreferred := cfg.StartupMode == "user-task"
	cfg.InstallDir = firstNonEmpty(cfg.InstallDir, defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, noAdminPreferred))
	// A per-user task should not keep its state under the machine-wide state root.
	if noAdminPreferred && strings.HasPrefix(strings.ToLower(cfg.RuntimeConfig.StateDir), strings.ToLower(DefaultWindowsStateRoot)) {
		cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
	}

	result := WindowsInstallResult{
		NodeName:      cfg.RuntimeConfig.NodeName,
		InstallDir:    cfg.InstallDir,
		StateDir:      cfg.RuntimeConfig.StateDir,
		NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent.exe"),
		WrapperPath:   filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd"),
		StartupMode:   cfg.StartupMode,
		TaskName:      "RAP Node Agent " + safeUnitSlug(cfg.RuntimeConfig.NodeName),
	}
	if cfg.DryRun {
		return result, nil
	}
	if runtime.GOOS != "windows" {
		return result, fmt.Errorf("windows install is only supported on windows hosts")
	}

	if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
		if cfg.StartupMode != "auto" || !isAccessDenied(err) {
			return result, err
		}
		// No rights on the requested directory: retry in the per-user locations.
		cfg.InstallDir = defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, true)
		cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
		result.InstallDir = cfg.InstallDir
		result.StateDir = cfg.RuntimeConfig.StateDir
		result.NodeAgentPath = filepath.Join(cfg.InstallDir, "rap-node-agent.exe")
		result.WrapperPath = filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd")
		if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
			return result, err
		}
		result.AdminFallback = true
	}
	if err := os.MkdirAll(cfg.RuntimeConfig.StateDir, 0o700); err != nil {
		return result, err
	}

	if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
		// Download before stopping anything so a failed download leaves a
		// running agent untouched (same order as ApplyUpdate).
		path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
		if err != nil {
			return result, err
		}
		defer os.Remove(path)
		m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
		if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
			// Stop the agent once more and retry the copy a single time;
			// the first error is the one reported if the retry fails too.
			m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
			if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr != nil {
				return result, err
			}
		}
		result.Downloaded = true
	}
	if !fileExists(result.NodeAgentPath) {
		return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
	}

	envPath := filepath.Join(cfg.InstallDir, "rap-node-agent.env.cmd")
	// The env script is written owner-only; the wrapper is executable.
	if err := os.WriteFile(envPath, []byte(windowsEnvScript(cfg.RuntimeConfig)), 0o600); err != nil {
		return result, err
	}
	if err := os.WriteFile(result.WrapperPath, []byte(windowsWrapperScript(result.NodeAgentPath, envPath)), 0o755); err != nil {
		return result, err
	}

	logPath := filepath.Join(cfg.RuntimeConfig.StateDir, "rap-node-agent.log")
	started, fallback, mode, err := m.installStartupTask(ctx, result.TaskName, result.WrapperPath, logPath, cfg.StartupMode)
	if err != nil {
		return result, err
	}
	result.Started = started
	// Keep a fallback already recorded for the install directory instead of
	// overwriting it with the task-level flag.
	result.AdminFallback = result.AdminFallback || fallback
	result.StartupMode = mode

	return installWindowsHostAgentUpdater(ctx, m, result, cfg)
}
// stopExistingNodeAgent makes a best-effort attempt to stop a running node
// agent: it ends the scheduled task and then force-stops every
// "rap-node-agent" process whose executable path equals nodeAgentPath.
// Failures of either command are ignored.
func (m WindowsManager) stopExistingNodeAgent(ctx context.Context, taskName, nodeAgentPath string) {
	var runner CommandRunner = ExecRunner{}
	if m.Runner != nil {
		runner = m.Runner
	}

	_, _ = runner.Run(ctx, "schtasks", "/End", "/TN", taskName)

	// Single quotes are doubled so the path stays inside the PowerShell
	// single-quoted string literal.
	killScript := `Get-Process rap-node-agent -ErrorAction SilentlyContinue | Where-Object { $_.Path -eq '` +
		strings.ReplaceAll(nodeAgentPath, `'`, `''`) +
		`' } | Stop-Process -Force -ErrorAction SilentlyContinue`
	_, _ = runner.Run(ctx, "powershell", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", killScript)
}
// installStartupTask registers and starts a scheduled task that runs
// wrapperPath with its output appended to logPath.
//
// It returns (started, fallback, effectiveMode, err). Mode "none" does
// nothing. Modes "auto" and "system-task" first try an ONSTART task running as
// SYSTEM; "system-task" fails if that cannot be created, while "auto" falls
// through to an ONLOGON task and reports fallback=true. Every other mode
// creates the ONLOGON task directly. A failure of the final "schtasks /Run"
// is ignored.
func (m WindowsManager) installStartupTask(ctx context.Context, taskName, wrapperPath, logPath, mode string) (bool, bool, string, error) {
	if mode == "none" {
		return false, false, mode, nil
	}
	runner := m.Runner
	if runner == nil {
		runner = ExecRunner{}
	}

	action := windowsTaskAction(wrapperPath, logPath)
	autoMode := mode == "auto"

	if autoMode || mode == "system-task" {
		_, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONSTART", "/RU", "SYSTEM", "/RL", "HIGHEST", "/TR", action, "/F")
		switch {
		case err == nil:
			_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
			return true, false, "system-task", nil
		case !autoMode:
			// An explicit system-task request does not degrade to a user task.
			return false, false, mode, err
		}
	}

	if _, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONLOGON", "/TR", action, "/F"); err != nil {
		return false, autoMode, "user-task", err
	}
	_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
	return true, autoMode, "user-task", nil
}
// windowsTaskAction builds the command line used as a scheduled-task action:
// cmd.exe runs the quoted wrapper script and appends stdout and stderr to the
// quoted log file. The whole command is wrapped in one extra pair of quotes.
func windowsTaskAction(wrapperPath, logPath string) string {
	quotedWrapper := `"` + wrapperPath + `"`
	quotedLog := `"` + logPath + `"`
	return `cmd.exe /c "` + quotedWrapper + ` >> ` + quotedLog + ` 2>&1"`
}
// windowsEnvScript renders the node-agent environment as a batch script: one
// "set KEY=VALUE" line per NodeAgentEnv entry, CRLF-terminated, preceded by
// "@echo off". Entries without "=" are skipped.
//
// Values are escaped for cmd.exe so that characters such as "%" or "&" (for
// example in URL-encoded or query-string backend URLs) reach the variable
// unchanged instead of being expanded or run as a separate command.
func windowsEnvScript(cfg RuntimeConfig) string {
	lines := []string{"@echo off"}
	for _, env := range NodeAgentEnv(cfg) {
		key, value, ok := strings.Cut(env, "=")
		if !ok {
			continue
		}
		lines = append(lines, "set "+key+"="+escapeWindowsBatchSetValue(value))
	}
	return strings.Join(lines, "\r\n") + "\r\n"
}

// escapeWindowsBatchSetValue escapes value for use on the right-hand side of a
// batch-file "set NAME=value" line. Percent signs are always doubled; the
// characters & | < > ^ get a caret prefix only outside double quotes, because
// cmd.exe takes them literally inside a quoted region.
func escapeWindowsBatchSetValue(value string) string {
	var b strings.Builder
	b.Grow(len(value))
	inQuotes := false
	for i := 0; i < len(value); i++ {
		c := value[i]
		switch c {
		case '%':
			b.WriteString("%%")
			continue
		case '"':
			inQuotes = !inQuotes
		case '&', '|', '<', '>', '^':
			if !inQuotes {
				b.WriteByte('^')
			}
		}
		b.WriteByte(c)
	}
	return b.String()
}
// windowsWrapperScript returns the batch wrapper that loads the environment
// script with "call" and then runs the node-agent executable. Both paths are
// double-quoted and every line ends in CRLF.
func windowsWrapperScript(nodeAgentPath, envPath string) string {
	const eol = "\r\n"
	var script strings.Builder
	script.WriteString("@echo off" + eol)
	script.WriteString(`call "` + envPath + `"` + eol)
	script.WriteString(`"` + nodeAgentPath + `"` + eol)
	return script.String()
}
// binaryArtifactURLs returns the download URLs for the node-agent binary
// described by the profile.
//
// Explicit artifact URLs win and are returned as a copy. Otherwise each
// non-blank artifact endpoint is combined with the artifact file name, with
// surrounding whitespace and the joining slashes trimmed. The result is nil
// when the profile has no artifact or the artifact has neither URLs nor a
// file name.
func binaryArtifactURLs(profile WindowsInstallProfile) []string {
	artifact := profile.NodeAgentArtifact
	if artifact == nil {
		return nil
	}
	if len(artifact.URLs) > 0 {
		return append([]string(nil), artifact.URLs...)
	}

	name := strings.TrimSpace(artifact.FileName)
	if name == "" {
		return nil
	}
	name = strings.TrimLeft(name, "/")

	urls := []string{}
	for _, endpoint := range profile.ArtifactEndpoints {
		base := strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if base == "" {
			continue
		}
		urls = append(urls, base+"/"+name)
	}
	return urls
}
// binaryArtifactSHA256 returns the whitespace-trimmed SHA-256 of the profile's
// node-agent artifact, or "" when the profile has no artifact.
func binaryArtifactSHA256(profile WindowsInstallProfile) string {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return strings.TrimSpace(artifact.SHA256)
	}
	return ""
}
// binaryArtifactSizeBytes returns the size recorded for the profile's
// node-agent artifact, or 0 when the profile has no artifact.
func binaryArtifactSizeBytes(profile WindowsInstallProfile) int64 {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return artifact.SizeBytes
	}
	return 0
}
// fileExists reports whether os.Stat succeeds for path. Any stat error,
// including permission problems, counts as "does not exist".
func fileExists(path string) bool {
	if _, statErr := os.Stat(path); statErr != nil {
		return false
	}
	return true
}
// copyFile copies source to target through a sibling "<target>.tmp" file that
// is then moved into place with replaceFile. The target's parent directory is
// created if needed and the staging file is opened with the given mode. The
// staging file is removed whenever copying, closing or the final replace fails.
func copyFile(source, target string, mode os.FileMode) error {
	in, err := os.Open(source)
	if err != nil {
		return err
	}
	defer in.Close()

	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		return err
	}

	staging := target + ".tmp"
	out, err := os.OpenFile(staging, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
	if err != nil {
		return err
	}

	// discard removes the staging file and passes the causing error through.
	discard := func(cause error) error {
		_ = os.Remove(staging)
		return cause
	}

	_, copyErr := io.Copy(out, in)
	closeErr := out.Close()
	if copyErr != nil {
		return discard(copyErr)
	}
	if closeErr != nil {
		return discard(closeErr)
	}
	if err := replaceFile(staging, target); err != nil {
		return discard(err)
	}
	return nil
}
// replaceFile moves tmp onto target.
//
// Outside Windows this is a single rename. On Windows an existing target is
// first renamed to "<target>.bak"; if moving tmp into place then fails, the
// backup is renamed back before the error is returned. The backup is removed
// on a best-effort basis before and after the swap.
func replaceFile(tmp, target string) error {
	if runtime.GOOS != "windows" {
		return os.Rename(tmp, target)
	}

	backup := target + ".bak"
	_ = os.Remove(backup)

	if fileExists(target) {
		if err := os.Rename(target, backup); err != nil {
			return err
		}
	}

	swapErr := os.Rename(tmp, target)
	if swapErr == nil {
		_ = os.Remove(backup)
		return nil
	}
	// Put the previous file back so the target is not left missing.
	if fileExists(backup) {
		_ = os.Rename(backup, target)
	}
	return swapErr
}
// defaultWindowsInstallDir returns the per-node install directory.
//
// In user mode it is "RAP\<slug>" under %LOCALAPPDATA%, or under
// %USERPROFILE%\AppData\Local when LOCALAPPDATA is blank. When user mode is
// off, or neither variable is set, it is "<slug>" under
// DefaultWindowsInstallDir.
func defaultWindowsInstallDir(nodeName string, userMode bool) string {
	slug := safeUnitSlug(nodeName)
	if !userMode {
		return filepath.Join(DefaultWindowsInstallDir, slug)
	}
	if localAppData := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); localAppData != "" {
		return filepath.Join(localAppData, "RAP", slug)
	}
	if userProfile := strings.TrimSpace(os.Getenv("USERPROFILE")); userProfile != "" {
		return filepath.Join(userProfile, "AppData", "Local", "RAP", slug)
	}
	return filepath.Join(DefaultWindowsInstallDir, slug)
}
// defaultWindowsStateDir returns the per-node state directory.
//
// In user mode it is "RAP\nodes\<slug>" under %LOCALAPPDATA%, or under
// %USERPROFILE%\AppData\Local when LOCALAPPDATA is blank. When user mode is
// off, or neither variable is set, it is "<slug>" under
// DefaultWindowsStateRoot.
func defaultWindowsStateDir(nodeName string, userMode bool) string {
	slug := safeUnitSlug(nodeName)
	if !userMode {
		return filepath.Join(DefaultWindowsStateRoot, slug)
	}
	if localAppData := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); localAppData != "" {
		return filepath.Join(localAppData, "RAP", "nodes", slug)
	}
	if userProfile := strings.TrimSpace(os.Getenv("USERPROFILE")); userProfile != "" {
		return filepath.Join(userProfile, "AppData", "Local", "RAP", "nodes", slug)
	}
	return filepath.Join(DefaultWindowsStateRoot, slug)
}
// isAccessDenied reports whether err signals missing permissions.
//
// The typed check via os.IsPermission comes first so detection does not depend
// on the wording of the error. The case-insensitive text match is kept as a
// fallback for errors that only carry a message.
func isAccessDenied(err error) bool {
	if err == nil {
		return false
	}
	if os.IsPermission(err) {
		return true
	}
	value := strings.ToLower(err.Error())
	return strings.Contains(value, "access is denied") ||
		strings.Contains(value, "permission denied") ||
		strings.Contains(value, "operation not permitted")
}
@@ -0,0 +1,337 @@
package hostagent
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
// ApplyUpdate fetches the update plan for the node agent and, when the plan
// asks for an update, downloads the artifact, replaces the binary at
// req.BinaryPath and restarts the scheduled task req.WindowsTaskName.
//
// An empty or default install type is replaced by WindowsUpdateInstallType,
// and OS/Arch default to "windows"/"amd64". Plans that are not an "update" are
// reported back (unless DryRun) and returned without changes. Plans with
// production forwarding are refused unless req.AllowProductionMesh is set, and
// plans without an artifact or with a foreign install type are rejected.
//
// Status reports to the backend and the final state save are best effort:
// their errors are deliberately ignored so they cannot fail an update.
// If the binary cannot be replaced even after a second attempt, the scheduled
// task is started again so the previously installed agent is not left stopped.
func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
	if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
		req.InstallType = WindowsUpdateInstallType
	}
	req.OS = firstNonEmpty(req.OS, "windows")
	req.Arch = firstNonEmpty(req.Arch, "amd64")
	req = req.Normalize()
	var err error
	req, err = resolveUpdateRequest(req)
	if err != nil {
		return UpdateResult{}, err
	}
	runner := m.Runner
	if runner == nil {
		runner = ExecRunner{}
	}
	plan, err := FetchNodeUpdatePlan(ctx, req)
	if err != nil {
		return UpdateResult{}, err
	}
	// Adopt the plan's health window only while the timeout still has the
	// value 30s. NOTE(review): 30s is presumably the Normalize default — confirm.
	if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
		req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
	}
	// The Docker-oriented result fields are reused: ContainerName carries the
	// task name and NewImage the binary path.
	result := UpdateResult{
		Action:        plan.Action,
		Reason:        plan.Reason,
		TargetVersion: plan.TargetVersion,
		ContainerName: req.WindowsTaskName,
		NewImage:      req.BinaryPath,
	}
	if plan.Action != "update" {
		if !req.DryRun {
			status := statusFromNoopPlan(req, plan)
			if status.Payload == nil {
				status.Payload = map[string]any{}
			}
			status.Payload["task"] = req.WindowsTaskName
			status.Payload["binary_path"] = req.BinaryPath
			_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
		}
		return result, nil
	}
	if plan.ProductionForwarding && !req.AllowProductionMesh {
		err := errors.New("refusing update plan with production forwarding enabled")
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if plan.Artifact == nil {
		err := errors.New("update plan has no artifact")
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType {
		err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if req.DryRun {
		return result, nil
	}
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "planned",
		Status:         "accepted",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
	})
	urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "download",
		Status:         "started",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath},
	})
	path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
	if err != nil {
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
		return result, err
	}
	defer os.Remove(path)
	// The agent is stopped only after the download succeeded.
	m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
	if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
		// Stop once more and retry the copy a single time.
		m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
		if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil {
			// The agent was stopped above; start the task again (best effort)
			// so the existing binary keeps running after the failed update.
			_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName)
			_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
			return result, err
		}
	}
	result.Replaced = true
	if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
		return result, err
	}
	// NOTE(review): "health_check succeeded" is reported as soon as the task
	// was started; nothing in this function probes the agent afterwards.
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "health_check",
		Status:         "succeeded",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"task": req.WindowsTaskName, "binary_path": req.BinaryPath},
	})
	_ = saveUpdateState(req.StateDir, UpdateState{
		Product:        req.Product,
		CurrentVersion: plan.TargetVersion,
		TargetVersion:  plan.TargetVersion,
		Image:          req.BinaryPath,
		UpdatedAt:      time.Now().UTC(),
	})
	return result, nil
}
// RunUpdateLoop repeatedly applies node-agent updates and, when enabled,
// host-agent updates until the context ends, cfg.MaxRuns is reached, or an
// error occurs while cfg.StopOnError is set.
//
// The request is given Windows defaults and validated up front. A zero
// interval becomes one hour; negative intervals or delays and a jitter outside
// [0, 1] are rejected. Progress is reported through cfg.Logf when it is set.
func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
req.InstallType = WindowsUpdateInstallType
}
req.OS = firstNonEmpty(req.OS, "windows")
req.Arch = firstNonEmpty(req.Arch, "amd64")
req = req.Normalize()
if err := req.Validate(); err != nil {
return err
}
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
if cfg.Interval < 0 {
return errors.New("update loop interval must not be negative")
}
if cfg.InitialDelay < 0 {
return errors.New("update loop initial delay must not be negative")
}
if cfg.Jitter < 0 || cfg.Jitter > 1 {
return errors.New("update loop jitter must be between 0 and 1")
}
logf := cfg.Logf
if logf == nil {
// Logging is optional; fall back to a no-op.
logf = func(string, ...any) {}
}
if cfg.InitialDelay > 0 {
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
return err
}
}
runs := 0
// Remember the trigger generation seen at start; it is handed to the
// interval-or-trigger sleep at the end of each iteration.
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
for {
runs++
result, err := m.ApplyUpdate(ctx, req)
if err != nil {
if errors.Is(err, ErrNodeIdentityNotReady) {
// Without a node identity the iteration ends here: the host-agent
// update is skipped and the loop waits one plain interval.
logf("windows_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
return err
}
continue
}
logf("windows_update_loop run=%d status=failed error=%v", runs, err)
if cfg.StopOnError {
return err
}
} else {
logf("windows_update_loop run=%d action=%s reason=%s target=%s task=%s replaced=%t",
runs,
result.Action,
result.Reason,
result.TargetVersion,
result.ContainerName,
result.Replaced,
)
// Carry the new version into the next iteration's request.
if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
req.CurrentVersion = result.TargetVersion
}
}
if cfg.HostAgentUpdateEnabled {
// Unset host-agent request fields inherit from the node-agent request.
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
hostReq.OS = firstNonEmpty(hostReq.OS, "windows")
hostReq.Arch = firstNonEmpty(hostReq.Arch, "amd64")
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, "windows_binary")
// The host-agent update runs through DockerManager's implementation.
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
if hostErr != nil {
if errors.Is(hostErr, ErrNodeIdentityNotReady) {
logf("windows_host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
} else {
logf("windows_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
if cfg.StopOnError {
return hostErr
}
}
} else {
logf("windows_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
runs,
hostResult.Action,
hostResult.Reason,
hostResult.TargetVersion,
hostResult.NewImage,
hostResult.Replaced,
hostResult.RestartNeeded,
)
// Stored on cfg because hostReq is rebuilt from cfg on every iteration.
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
cfg.HostAgentUpdateRequest.CurrentVersion = hostResult.TargetVersion
}
}
}
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
return err
}
}
}
// installWindowsHostAgentUpdater installs the host-agent binary and a
// scheduled task that runs the generated update-loop script.
//
// It is a no-op when auto-update is disabled or the startup mode is "none".
// The version handed to the script is reset to "0.0.0" when none is configured
// or when a replace was requested but no binary was downloaded. The updated
// result is returned; on error the incoming result is returned unchanged.
func installWindowsHostAgentUpdater(ctx context.Context, m WindowsManager, result WindowsInstallResult, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
	if !cfg.AutoUpdateEnabled {
		return result, nil
	}
	if strings.EqualFold(result.StartupMode, "none") {
		return result, nil
	}

	resetVersion := cfg.AutoUpdateCurrentVersion == "" || (cfg.Replace && !result.Downloaded)
	if resetVersion {
		cfg.AutoUpdateCurrentVersion = "0.0.0"
	}

	updaterBinary := filepath.Join(result.InstallDir, "rap-host-agent.exe")
	if err := installHostAgentBinary(cfg.HostAgentSourcePath, updaterBinary); err != nil {
		return result, err
	}

	var (
		updaterWrapper = filepath.Join(result.InstallDir, "rap-host-agent-update.cmd")
		updaterLog     = filepath.Join(result.StateDir, "rap-host-agent-update.log")
		updaterTask    = "RAP Host Agent Updater " + safeUnitSlug(result.NodeName)
	)
	updaterScript := windowsHostAgentUpdateScript(updaterBinary, cfg, result)
	if err := os.WriteFile(updaterWrapper, []byte(updaterScript), 0o755); err != nil {
		return result, err
	}

	started, fallback, mode, err := m.installStartupTask(ctx, updaterTask, updaterWrapper, updaterLog, cfg.StartupMode)
	if err != nil {
		return result, err
	}

	result.HostAgentPath = updaterBinary
	result.UpdaterTaskName = updaterTask
	result.UpdaterStarted = started
	// A fallback flag is only ever raised here, never cleared.
	result.AdminFallback = result.AdminFallback || fallback
	if mode != "" {
		result.StartupMode = mode
	}
	return result, nil
}
// windowsHostAgentUpdateScript renders the batch script run by the updater
// scheduled task.
//
// After an optional initial delay the script loops forever: it promotes a
// staged "<hostAgentPath>.next" binary over the host agent when one exists,
// runs "rap-host-agent update-loop" with the node and host-agent settings, and
// waits for the interval before starting over. Zero values default to an
// interval of 21600s, an initial delay of 15s and a health timeout of 30s; an
// empty current version becomes "0.0.0". The initial delay is handled by the
// script itself, so update-loop always receives --initial-delay-seconds 0.
func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig, result WindowsInstallResult) string {
	currentVersion := firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0")
	interval := cfg.AutoUpdateIntervalSeconds
	if interval == 0 {
		interval = 21600
	}
	initialDelay := cfg.AutoUpdateInitialDelaySeconds
	if initialDelay == 0 {
		initialDelay = 15
	}
	healthTimeout := cfg.AutoUpdateHealthTimeoutSeconds
	if healthTimeout == 0 {
		healthTimeout = 30
	}
	updateLoopArgs := []string{
		`"` + hostAgentPath + `"`,
		"update-loop",
		"--backend-url", `"` + cfg.RuntimeConfig.BackendURL + `"`,
		"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
		"--state-dir", `"` + result.StateDir + `"`,
		"--current-version", currentVersion,
		"--os", "windows",
		"--arch", "amd64",
		"--install-type", WindowsUpdateInstallType,
		"--binary-path", `"` + result.NodeAgentPath + `"`,
		"--windows-task-name", `"` + result.TaskName + `"`,
		"--health-timeout-seconds", fmt.Sprintf("%d", healthTimeout),
		"--interval-seconds", fmt.Sprintf("%d", interval),
		"--initial-delay-seconds", "0",
		"--host-agent-update-status-enabled",
		"--host-agent-current-version", currentVersion,
		"--host-agent-binary-path", `"` + hostAgentPath + `"`,
	}
	if strings.TrimSpace(cfg.NodeID) != "" {
		updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
	}
	if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
		updateLoopArgs = append(updateLoopArgs, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
	}
	// The variables hold the paths including their quotes, so they can be used
	// unquoted below.
	lines := []string{
		"@echo off",
		"setlocal",
		"set RAP_HOST_AGENT=" + `"` + hostAgentPath + `"`,
		"set RAP_HOST_AGENT_NEXT=" + `"` + hostAgentPath + `.next"`,
	}
	if initialDelay > 0 {
		lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
	}
	lines = append(lines, []string{
		":loop",
		"if exist %RAP_HOST_AGENT_NEXT% (",
		"  copy /Y %RAP_HOST_AGENT_NEXT% %RAP_HOST_AGENT% >NUL",
		// "if not errorlevel 1" is evaluated when the line runs. %ERRORLEVEL%
		// would be expanded when the whole parenthesised block is parsed,
		// i.e. before copy has run, and so would not reflect the copy result.
		"  if not errorlevel 1 del /F /Q %RAP_HOST_AGENT_NEXT%",
		")",
		strings.Join(updateLoopArgs, " "),
		"timeout /t " + fmt.Sprintf("%d", interval) + " /nobreak >NUL",
		"goto loop",
		"endlocal",
		"rem initial-delay-seconds " + fmt.Sprintf("%d", initialDelay),
	}...)
	return strings.Join(lines, "\r\n") + "\r\n"
}
@@ -63,10 +63,12 @@ const (
ProductionChannelVPNPacket = "vpn_packet"
ProductionMessageVPNPacketBatch = "vpn.packet_batch"
FabricServiceClassVPNPackets = "vpn_packets"
FabricServiceClassRemoteWorkspace = "remote_workspace"
FabricServiceChannelBulk = "bulk"
FabricServiceChannelControl = "control"
FabricServiceChannelInteractive = "interactive"
FabricServiceChannelReliable = "reliable"
FabricServiceChannelDroppable = "droppable"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
@@ -59,9 +59,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
reasons := []string{"base"}
switch candidate.Transport {
case "direct_tcp_tls":
case "direct_tcp_tls", "direct_http", "direct_https":
score += 35
reasons = append(reasons, "transport:direct_tcp_tls")
reasons = append(reasons, "transport:direct")
case "wss":
score += 25
reasons = append(reasons, "transport:wss")
@@ -37,27 +37,28 @@ type PeerCacheSnapshot struct {
}
type PeerCacheEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
}
type peerCacheBuildEntry struct {
@@ -117,6 +118,10 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
MaxVerificationAge: time.Hour,
})
if len(scored) > 0 {
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
for _, scoredCandidate := range scored {
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
}
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
@@ -66,24 +66,44 @@ type PeerConnectionManagerSnapshot struct {
}
type PeerConnectionProbeResult struct {
NodeID string `json:"node_id"`
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
NodeID string `json:"node_id"`
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
SelectedCandidateID string `json:"selected_candidate_id,omitempty"`
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
CandidateResults []PeerConnectionCandidateProbeResult `json:"candidate_results,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
type PeerConnectionCandidateProbeResult struct {
CandidateID string `json:"candidate_id,omitempty"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport,omitempty"`
LinkStatus string `json:"link_status"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
type peerConnectionProbeTarget struct {
CandidateID string
Endpoint string
Transport string
}
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
@@ -137,6 +157,10 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
RendezvousLeases: rendezvousLeases,
Now: startedAt,
})
entriesByNode := map[string]PeerCacheEntry{}
for _, entry := range peerSnapshot.Entries {
entriesByNode[entry.NodeID] = entry
}
cycle := PeerConnectionManagerCycle{
Mode: recoveryPlan.Mode,
StartedAt: startedAt,
@@ -150,7 +174,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
}
for _, intent := range intentPlan.Intents {
result := m.probeIntent(ctx, intent)
result := m.probeIntent(ctx, intent, entriesByNode[intent.NodeID])
cycle.Results = append(cycle.Results, result)
switch result.LinkStatus {
case PeerConnectionProbeReachable:
@@ -200,7 +224,7 @@ func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvo
return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...)
}
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent, cacheEntry PeerCacheEntry) PeerConnectionProbeResult {
startedAt := normalizedNow(m.now())
result := PeerConnectionProbeResult{
NodeID: intent.NodeID,
@@ -254,9 +278,6 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
result.CompletedAt = normalizedNow(m.now())
return result
}
m.tracker.BeginProbe(peer, startedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
defer cancel()
target := PeerIdentity{
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
@@ -264,30 +285,118 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
if intent.RelayCandidate && intent.RelayNodeID != "" {
target.NodeID = intent.RelayNodeID
}
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
completedAt := normalizedNow(m.now())
if err != nil {
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = err.Error()
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
targets := []peerConnectionProbeTarget{{
CandidateID: intent.BestCandidateID,
Endpoint: intent.Endpoint,
Transport: intent.Transport,
}}
if intent.DirectCandidate {
targets = peerConnectionProbeTargets(intent, cacheEntry)
}
var lastFailure string
for _, probeTarget := range targets {
probePeer := peer
probePeer.Endpoint = strings.TrimRight(strings.TrimSpace(probeTarget.Endpoint), "/")
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
probePeer.BestCandidateAddr = probePeer.Endpoint
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
if probePeer.Endpoint == "" {
continue
}
candidateStartedAt := normalizedNow(m.now())
m.tracker.BeginProbe(probePeer, candidateStartedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
cancel()
completedAt := normalizedNow(m.now())
candidateResult := PeerConnectionCandidateProbeResult{
CandidateID: probePeer.BestCandidateID,
Endpoint: probePeer.Endpoint,
Transport: probePeer.BestTransport,
StartedAt: candidateStartedAt,
CompletedAt: completedAt,
}
if err != nil {
lastFailure = err.Error()
candidateResult.LinkStatus = PeerConnectionProbeUnreachable
candidateResult.FailureReason = lastFailure
result.CandidateResults = append(result.CandidateResults, candidateResult)
continue
}
latency := int(completedAt.Sub(candidateStartedAt).Milliseconds())
if latency < 0 {
latency = 0
}
candidateResult.LinkStatus = PeerConnectionProbeReachable
candidateResult.LatencyMs = latency
result.CandidateResults = append(result.CandidateResults, candidateResult)
result.LinkStatus = PeerConnectionProbeReachable
result.Endpoint = probePeer.Endpoint
result.SelectedCandidateID = probePeer.BestCandidateID
result.SelectedEndpoint = probePeer.Endpoint
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
}
result.CompletedAt = completedAt
return result
}
latency := int(completedAt.Sub(startedAt).Milliseconds())
if latency < 0 {
latency = 0
}
result.LinkStatus = PeerConnectionProbeReachable
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
completedAt := normalizedNow(m.now())
if lastFailure == "" {
lastFailure = "no_probe_endpoint_available"
}
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = lastFailure
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, lastFailure, completedAt)
result.CompletedAt = completedAt
return result
}
// peerConnectionProbeTargets builds the ordered, de-duplicated list of
// endpoints to try for an intent.
//
// Candidates from cacheEntry that pass candidateUsableForDirectProbe come
// first, in cache order, followed by the intent's own endpoint as a final
// fallback. Endpoints are stripped of surrounding whitespace and trailing
// slashes; entries whose endpoint is empty after normalization are skipped.
// Two entries are duplicates when both the trimmed candidate ID and the
// normalized endpoint match.
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
	capacity := len(cacheEntry.EndpointCandidates) + 1
	seen := make(map[string]struct{}, capacity)
	out := make([]peerConnectionProbeTarget, 0, capacity)
	add := func(candidateID, endpoint, transport string) {
		// Normalize before building the key so the key always describes the
		// values that are actually stored; otherwise " id" and "id" would be
		// treated as different candidates and the same endpoint probed twice.
		candidateID = strings.TrimSpace(candidateID)
		endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if endpoint == "" {
			return
		}
		key := candidateID + "|" + endpoint
		if _, ok := seen[key]; ok {
			return
		}
		seen[key] = struct{}{}
		out = append(out, peerConnectionProbeTarget{
			CandidateID: candidateID,
			Endpoint:    endpoint,
			Transport:   strings.TrimSpace(transport),
		})
	}
	for _, candidate := range cacheEntry.EndpointCandidates {
		if !candidateUsableForDirectProbe(candidate) {
			continue
		}
		add(candidate.EndpointID, candidate.Address, candidate.Transport)
	}
	add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
	return out
}
// candidateUsableForDirectProbe reports whether candidate may be contacted
// directly by the probe loop.
//
// A candidate is rejected when its address is empty, uses the relay:// or
// outbound:// scheme, or when its connectivity mode / reachability marks it as
// outbound-only or relay-only. Otherwise it is accepted when the transport is
// unset, mentions "direct", is "wss", or the address is an http(s) URL.
//
// All comparisons, including the URI scheme checks, are case-insensitive:
// URI schemes are case-insensitive, so "RELAY://..." must be rejected exactly
// like "relay://...".
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
	endpoint := strings.ToLower(strings.TrimSpace(candidate.Address))
	if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
		return false
	}
	connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
	reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
	switch {
	case connectivity == "outbound_only", connectivity == "relay_required":
		return false
	case reachability == "outbound_only", reachability == "relay":
		return false
	}
	transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
	return transport == "" ||
		strings.Contains(transport, "direct") ||
		transport == "wss" ||
		strings.HasPrefix(endpoint, "http://") ||
		strings.HasPrefix(endpoint, "https://")
}
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
snapshot := m.tracker.Snapshot()
for _, entry := range snapshot.Entries {
@@ -188,3 +188,71 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
}
}
// TestPeerConnectionManagerFallsBackAcrossEndpointCandidates checks that a
// probe cycle tries a peer's endpoint candidates in turn: the first candidate
// fails, the second one (a live httptest server) succeeds, and both the probe
// result and the tracker end up pointing at the live candidate.
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
current := now
// Live endpoint for node-b, served by the mesh Server handler.
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
// Candidate expected to fail: presumably nothing listens on 127.0.0.1:1.
EndpointID: "node-b-dead",
NodeID: "node-b",
Transport: "direct_http",
Address: "http://127.0.0.1:1",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
},
{
// Candidate expected to succeed: the httptest server started above.
EndpointID: "node-b-live",
NodeID: "node-b",
Transport: "direct_http",
Address: server.URL,
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 2,
},
},
},
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
ProbeTimeout: 100 * time.Millisecond,
// Stepping clock: every call advances the reported time by 10ms.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
// Exactly one intent is attempted and it must count as a success.
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Failed != 0 || len(cycle.Results) != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
t.Fatalf("fallback did not select live candidate: %+v", result)
}
// The candidate trail must show the failed attempt followed by the successful one.
if len(result.CandidateResults) != 2 ||
result.CandidateResults[0].LinkStatus != PeerConnectionProbeUnreachable ||
result.CandidateResults[1].LinkStatus != PeerConnectionProbeReachable {
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
}
// The tracker must retain the candidate that actually answered.
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
}
}
@@ -138,6 +138,32 @@ func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now
return entry
}
// RecordSuccessForPeer records a successful probe of peer and returns the
// updated connection state.
//
// The entry for peer is obtained through t.entry, its success counter is
// incremented, and failure and backoff bookkeeping is cleared. The resulting
// state is PeerConnectionReady, or PeerConnectionDegraded when latencyMs is
// 500 or more; LastTransitionAt is only touched when the state changes.
// A nil tracker returns the zero PeerConnectionState.
func (t *PeerConnectionTracker) RecordSuccessForPeer(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	at := normalizedNow(now)
	state := t.entry(peer, at)

	// Pick the target state from the measured latency.
	target := PeerConnectionDegraded
	if latencyMs < 500 {
		target = PeerConnectionReady
	}
	if state.State != target {
		state.State = target
		state.LastTransitionAt = at
	}

	// A success wipes failure and backoff bookkeeping.
	state.ConsecutiveSuccesses++
	state.ConsecutiveFailures = 0
	state.LastFailureReason = ""
	state.BackoffUntil = time.Time{}
	state.LastLatencyMs = latencyMs
	state.LastProbeAt = at

	t.entries[peer.NodeID] = state
	return state
}
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
if t == nil {
return PeerConnectionState{}
@@ -34,12 +34,20 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
return err
}
}
if envelope.ChannelClass != ProductionChannelFabricControl {
maxPayloadBytes := MaxProductionEnvelopePayloadBytes
switch envelope.ChannelClass {
case ProductionChannelFabricControl:
if envelope.MessageType != ProductionMessageFabricControl {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
case ProductionChannelVPNPacket:
if envelope.MessageType != ProductionMessageVPNPacketBatch {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
maxPayloadBytes = MaxProductionVPNPacketPayloadBytes
default:
return ErrUnauthorizedChannel
}
if envelope.MessageType != ProductionMessageFabricControl {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
if envelope.TTL <= 0 {
return ErrTTLExhausted
}
@@ -58,8 +66,8 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
if envelope.PayloadLength != len(envelope.Payload) {
return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes {
return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
if envelope.PayloadLength > maxPayloadBytes {
return fmt.Errorf("%w: payload exceeds channel limit", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadHash == "" {
return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
@@ -22,7 +22,7 @@ func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope Producti
if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) {
return ErrRouteExpired
}
if !contains(route.AllowedChannels, ProductionChannelFabricControl) {
if !contains(route.AllowedChannels, envelope.ChannelClass) {
return ErrUnauthorizedChannel
}
path := routePath(route)
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -2,6 +2,8 @@ package supervisor
import (
"context"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
)
@@ -17,24 +19,146 @@ type StubSupervisor struct {
func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) {
statuses := make([]client.WorkloadStatusRequest, 0, len(desired))
for _, workload := range desired {
state := "degraded"
if workload.DesiredState == "disabled" {
state = "stopped"
}
version := workload.Version
if version == "" {
version = s.Version
}
statuses = append(statuses, client.WorkloadStatusRequest{
ReportedState: state,
RuntimeMode: workload.RuntimeMode,
Version: version,
StatusPayload: map[string]any{
"supervisor": "stub",
"desired_state": workload.DesiredState,
"service_type": workload.ServiceType,
},
})
statuses = append(statuses, s.applyOne(workload))
}
return statuses, nil
}
// applyOne derives the reported status for a single desired workload.
//
// Inputs are normalized first: the desired state defaults to "disabled", the
// runtime mode to "native", and the version to the supervisor's own version.
// The outcome is then:
//   - "stopped" when the desired state is anything other than "enabled";
//   - "running" for the built-in core-mesh and mesh-listener services;
//   - "running" for a native synthetic.echo workload;
//   - "running" for a native rdp-worker whose config enables
//     adapter_contract_probe (contract probe only, traffic "none");
//   - "degraded" with traffic "blocked" for everything else.
func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.WorkloadStatusRequest {
	serviceType := strings.TrimSpace(workload.ServiceType)

	desiredState := strings.TrimSpace(strings.ToLower(workload.DesiredState))
	if desiredState == "" {
		desiredState = "disabled"
	}

	runtimeMode := strings.TrimSpace(strings.ToLower(workload.RuntimeMode))
	if runtimeMode == "" {
		runtimeMode = "native"
	}

	version := strings.TrimSpace(workload.Version)
	if version == "" {
		version = s.Version
	}

	payload := map[string]any{
		"schema_version": "rap.node_agent.workload_supervision.v1",
		"supervisor":     "node-agent-local",
		"desired_state":  desiredState,
		"service_type":   serviceType,
		"runtime_mode":   runtimeMode,
		"observed_at":    time.Now().UTC().Format(time.RFC3339Nano),
	}

	// report wraps the shared payload in a status with the given state.
	report := func(state string) client.WorkloadStatusRequest {
		return client.WorkloadStatusRequest{
			ReportedState: state,
			RuntimeMode:   runtimeMode,
			Version:       version,
			StatusPayload: payload,
		}
	}

	native := runtimeMode == "native"
	switch {
	case desiredState != "enabled":
		payload["reason"] = "desired_state_not_enabled"
		return report("stopped")

	case serviceType == "core-mesh" || serviceType == "mesh-listener":
		payload["reason"] = "builtin_node_agent_service_ready"
		payload["execution_mode"] = "builtin"
		payload["traffic"] = serviceTrafficMode(serviceType)
		return report("running")

	case serviceType == "synthetic.echo" && native:
		payload["reason"] = "internal_synthetic_echo_ready"
		payload["execution_mode"] = "builtin"
		payload["traffic"] = "test_service_only"
		return report("running")

	case serviceType == "rdp-worker" && native && boolConfig(workload.Config, "adapter_contract_probe"):
		payload["reason"] = "remote_workspace_adapter_contract_probe_ready"
		payload["execution_mode"] = "contract_probe"
		payload["service_class"] = "remote_workspace"
		payload["fabric_service_channel_required"] = true
		payload["backend_relay_steady_state"] = false
		payload["channels"] = remoteWorkspaceAdapterChannels()
		payload["frame_batch_contract"] = remoteWorkspaceFrameBatchContract()
		payload["traffic"] = "none"
		return report("running")

	default:
		payload["reason"] = "service_runtime_not_implemented"
		payload["traffic"] = "blocked"
		return report("degraded")
	}
}
// boolConfig reads key from values and interprets it as a boolean flag.
//
// A bool value is returned as-is; a string counts as true only when it equals
// "true" ignoring case and surrounding whitespace. A nil map, a missing key,
// or a value of any other type yields false.
func boolConfig(values map[string]any, key string) bool {
	// Indexing a nil map is safe and yields a nil interface, which matches
	// none of the cases below.
	switch flag := values[key].(type) {
	case bool:
		return flag
	case string:
		return strings.EqualFold(strings.TrimSpace(flag), "true")
	}
	return false
}
// remoteWorkspaceAdapterChannels returns the nine channel descriptors
// advertised by the remote-workspace adapter contract probe, in a fixed order.
// Every descriptor carries the keys name, direction, reliability, priority,
// droppable and may_block_input; may_block_input is false for all channels.
// A fresh slice of fresh maps is returned on each call.
func remoteWorkspaceAdapterChannels() []map[string]any {
	describe := func(name, direction, reliability, priority string, droppable bool) map[string]any {
		return map[string]any{
			"name":            name,
			"direction":       direction,
			"reliability":     reliability,
			"priority":        priority,
			"droppable":       droppable,
			"may_block_input": false,
		}
	}
	return []map[string]any{
		describe("input", "client_to_adapter", "reliable_ordered", "critical", true),
		describe("control", "bidirectional", "reliable_ordered", "high", false),
		describe("display", "adapter_to_client", "droppable_latest", "high", true),
		describe("cursor", "adapter_to_client", "droppable_latest", "high", true),
		describe("clipboard", "bidirectional", "reliable_ordered", "medium", false),
		describe("file_transfer", "bidirectional", "reliable_chunked", "medium", false),
		describe("audio", "adapter_to_client", "adaptive_droppable", "medium", true),
		describe("device", "bidirectional", "reliable_ordered", "medium", false),
		describe("telemetry", "adapter_to_client", "sampled_droppable", "low", true),
	}
}
// remoteWorkspaceFrameBatchContract returns the frame-batch contract document
// published by the rdp-worker adapter contract probe. The document marks the
// contract as probe-only with payload forwarding not implemented, lists the
// allowed flow classes and payload encodings, caps probe frames at 32, and
// embeds the channel descriptors from remoteWorkspaceAdapterChannels.
func remoteWorkspaceFrameBatchContract() map[string]any {
	contract := make(map[string]any, 9)
	contract["schema_version"] = "rap.remote_workspace_frame_batch.v1"
	contract["adapter_contract_id"] = "rap.rdp_worker.remote_workspace_adapter_contract_probe.v1"
	contract["probe_only"] = true
	contract["payload_forwarding"] = "not_implemented"
	contract["service_class"] = "remote_workspace"
	contract["allowed_flow_classes"] = []string{"control", "interactive", "reliable", "bulk", "droppable"}
	contract["allowed_payload_encodings"] = []string{"none", "base64"}
	contract["max_probe_frames"] = 32
	contract["channels"] = remoteWorkspaceAdapterChannels()
	return contract
}
// serviceTrafficMode maps a built-in service type to the traffic label
// reported in its status payload: "fabric_control" for core-mesh,
// "entry_listener" for mesh-listener, and "unknown" for anything else.
// The match is exact (case-sensitive, no trimming).
func serviceTrafficMode(serviceType string) string {
	if serviceType == "core-mesh" {
		return "fabric_control"
	}
	if serviceType == "mesh-listener" {
		return "entry_listener"
	}
	return "unknown"
}
@@ -33,3 +33,101 @@ func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
}
func TestStubSupervisorRunsInternalSyntheticEchoWorkload(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{ServiceType: "synthetic.echo", DesiredState: "enabled", RuntimeMode: "native"},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "running" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
if statuses[0].StatusPayload["reason"] != "internal_synthetic_echo_ready" {
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
}
if statuses[0].StatusPayload["execution_mode"] != "builtin" {
t.Fatalf("execution_mode = %v", statuses[0].StatusPayload["execution_mode"])
}
}
// TestStubSupervisorReportsBuiltinFabricServicesRunning verifies that the
// built-in core-mesh and mesh-listener services are reported as running even
// when the desired runtime mode is "container".
func TestStubSupervisorReportsBuiltinFabricServicesRunning(t *testing.T) {
	supervisor := StubSupervisor{Version: "test"}
	desired := []client.DesiredWorkload{
		{ServiceType: "core-mesh", DesiredState: "enabled", RuntimeMode: "container"},
		{ServiceType: "mesh-listener", DesiredState: "enabled", RuntimeMode: "container"},
	}

	statuses, err := supervisor.Apply(context.Background(), desired)
	if err != nil {
		t.Fatalf("apply desired workload: %v", err)
	}
	if count := len(statuses); count != 2 {
		t.Fatalf("statuses length = %d", count)
	}
	for idx := range statuses {
		if state := statuses[idx].ReportedState; state != "running" {
			t.Fatalf("ReportedState = %q", state)
		}
		if reason := statuses[idx].StatusPayload["reason"]; reason != "builtin_node_agent_service_ready" {
			t.Fatalf("reason = %v", reason)
		}
	}
}
func TestStubSupervisorKeepsUnsupportedEnabledWorkloadDegraded(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "degraded" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
if statuses[0].StatusPayload["reason"] != "service_runtime_not_implemented" {
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
}
}
func TestStubSupervisorRunsRDPWorkerAdapterContractProbeOnly(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{
ServiceType: "rdp-worker",
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"adapter_contract_probe": true,
},
},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "running" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
if statuses[0].StatusPayload["reason"] != "remote_workspace_adapter_contract_probe_ready" {
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
}
if statuses[0].StatusPayload["service_class"] != "remote_workspace" {
t.Fatalf("service_class = %v", statuses[0].StatusPayload["service_class"])
}
if statuses[0].StatusPayload["backend_relay_steady_state"] != false {
t.Fatalf("backend_relay_steady_state = %v", statuses[0].StatusPayload["backend_relay_steady_state"])
}
channels, ok := statuses[0].StatusPayload["channels"].([]map[string]any)
if !ok || len(channels) != 9 {
t.Fatalf("channels = %#v", statuses[0].StatusPayload["channels"])
}
if channels[0]["name"] != "input" || channels[0]["priority"] != "critical" || channels[0]["droppable"] != true || channels[0]["may_block_input"] != false {
t.Fatalf("unexpected input channel: %#v", channels[0])
}
frameBatch, ok := statuses[0].StatusPayload["frame_batch_contract"].(map[string]any)
if !ok {
t.Fatalf("frame_batch_contract = %#v", statuses[0].StatusPayload["frame_batch_contract"])
}
if frameBatch["schema_version"] != "rap.remote_workspace_frame_batch.v1" ||
frameBatch["payload_forwarding"] != "not_implemented" ||
frameBatch["service_class"] != "remote_workspace" {
t.Fatalf("unexpected frame batch contract: %#v", frameBatch)
}
}
@@ -385,32 +385,37 @@ func (s *FabricFlowScheduler) ConfigureAdaptivePolicy(policy FabricServiceChanne
}
func (s *FabricFlowScheduler) ScheduleClientPackets(packets [][]byte) []FabricScheduledPacketBatch {
return s.scheduleClientPackets("", "", packets)
scheduled, _ := s.scheduleClientPackets("", "", packets)
return scheduled
}
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnection(vpnConnectionID string, packets [][]byte) []FabricScheduledPacketBatch {
return s.scheduleClientPackets(vpnConnectionID, "", packets)
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, "", packets)
return scheduled
}
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnectionClass(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
return s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
return scheduled
}
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) ([]FabricScheduledPacketBatch, uint64) {
packets = cleanPacketBatch(packets)
if len(packets) == 0 {
return nil
return nil, 0
}
if s == nil {
s = NewFabricFlowScheduler(0, 0)
}
trafficClass = normalizeFabricTrafficClass(trafficClass)
grouped := map[string]*FabricScheduledPacketBatch{}
var droppedCount uint64
for _, packet := range packets {
flowID, shard := classifyPacketFlow(packet, s.shardCountValue())
channelID := fabricFlowChannelIDForClass(vpnConnectionID, trafficClass, shard)
queueDepth, dropped := s.enqueue(channelID, trafficClass)
if dropped {
droppedCount++
continue
}
batch := grouped[channelID]
@@ -433,7 +438,7 @@ func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, traf
out = append(out, *batch)
}
s.sortScheduledBatches(out)
return out
return out, droppedCount
}
func fabricFlowChannelID(vpnConnectionID string, shard int) string {
@@ -1441,11 +1446,9 @@ func (i *FabricClientPacketIngress) SendClientPacketBatchWithTrafficClass(ctx co
}
i.recordSendBatch(len(packets))
scheduler := i.flowScheduler()
droppedBefore := scheduler.Dropped()
scheduled := scheduler.ScheduleClientPacketsForConnectionClass(vpnConnectionID, trafficClass, packets)
droppedAfter := scheduler.Dropped()
if droppedAfter > droppedBefore {
i.recordFlowDropped(droppedAfter - droppedBefore)
scheduled, droppedCount := scheduler.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
if droppedCount > 0 {
i.recordFlowDropped(droppedCount)
}
if len(scheduled) == 0 {
i.recordError(mesh.ErrSyntheticRelayQueueFull)
@@ -1657,8 +1660,10 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
if i == nil || routesFunc == nil {
return nil
}
localClusterID := i.clusterID()
localNodeID := i.localNodeID()
if clusterID == "" {
clusterID = i.ClusterID
clusterID = localClusterID
}
now := time.Now().UTC()
var preferred []fabricClientRouteCandidate
@@ -1676,7 +1681,7 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
}
}
for _, route := range routesFunc() {
if route.ClusterID != clusterID || route.SourceNodeID != i.LocalNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
if route.ClusterID != clusterID || route.SourceNodeID != localNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
continue
}
if manager.isWithdrawn(route.RouteID) {
@@ -1685,8 +1690,8 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
if !route.ExpiresAt.IsZero() && !route.ExpiresAt.After(now) {
continue
}
nextHop := nextHopAfter(route.Hops, i.LocalNodeID, route.DestinationNodeID)
if nextHop == "" || nextHop == i.LocalNodeID {
nextHop := nextHopAfter(route.Hops, localNodeID, route.DestinationNodeID)
if nextHop == "" || nextHop == localNodeID {
continue
}
candidate := fabricClientRouteCandidate{Route: route, NextHop: nextHop}
@@ -2024,7 +2029,7 @@ func (i *FabricClientPacketIngress) routeProvenance(clusterID string) map[string
if i == nil || routesFunc == nil {
return out
}
localNodeID := strings.TrimSpace(i.LocalNodeID)
localNodeID := i.localNodeID()
for _, route := range routesFunc() {
if strings.TrimSpace(route.RouteID) == "" {
continue
@@ -2322,6 +2327,24 @@ func (i *FabricClientPacketIngress) routesFunc() func() []mesh.SyntheticRoute {
return i.Routes
}
// clusterID returns the ingress's ClusterID with surrounding whitespace
// removed. The field is read while holding i.mu. A nil receiver yields "".
func (i *FabricClientPacketIngress) clusterID() string {
	if i == nil {
		return ""
	}
	i.mu.Lock()
	raw := i.ClusterID
	i.mu.Unlock()
	return strings.TrimSpace(raw)
}
// localNodeID returns the ingress's LocalNodeID with surrounding whitespace
// removed. The field is read while holding i.mu. A nil receiver yields "".
func (i *FabricClientPacketIngress) localNodeID() string {
	if i == nil {
		return ""
	}
	i.mu.Lock()
	raw := i.LocalNodeID
	i.mu.Unlock()
	return strings.TrimSpace(raw)
}
func (i *FabricClientPacketIngress) flowScheduler() *FabricFlowScheduler {
if i == nil {
return NewFabricFlowScheduler(0, 0)
@@ -324,10 +324,13 @@ func TestFabricFlowSchedulerDropsWhenChannelQueueIsFull(t *testing.T) {
packetA := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
packetB := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
batches := scheduler.ScheduleClientPackets([][]byte{packetA, packetB})
batches, dropped := scheduler.scheduleClientPackets("", "", [][]byte{packetA, packetB})
if len(batches) != 1 || len(batches[0].Packets) != 1 {
t.Fatalf("batches = %#v, want one accepted packet", batches)
}
if dropped != 1 {
t.Fatalf("dropped = %d, want per-call drop count 1", dropped)
}
snapshot := scheduler.Snapshot()
if snapshot.Dropped != 1 || !snapshot.BackpressureActive {
t.Fatalf("snapshot = %+v, want one dropped packet and active backpressure", snapshot)
@@ -1069,6 +1072,60 @@ func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing
}
}
// TestFabricClientPacketIngressRouteSelectionUsesUpdatedRuntimeIdentity checks
// that after UpdateRuntime replaces the local node identity and route source,
// a client packet batch is sent over the route belonging to the new identity
// ("entry-2") rather than the one the ingress was constructed with ("entry-1").
func TestFabricClientPacketIngressRouteSelectionUsesUpdatedRuntimeIdentity(t *testing.T) {
transport := &captureManyProductionTransport{}
// Initial runtime: local node "entry-1" with a single route sourced at entry-1.
ingress := &FabricClientPacketIngress{
ForwardTransport: transport,
Inbox: NewFabricPacketInbox(8),
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
Routes: func() []mesh.SyntheticRoute {
return []mesh.SyntheticRoute{{
RouteID: "route-entry-1",
ClusterID: "cluster-1",
SourceNodeID: "entry-1",
DestinationNodeID: "exit-1",
Hops: []string{"entry-1", "relay-1", "exit-1"},
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
ExpiresAt: time.Now().UTC().Add(time.Minute),
MaxTTL: 8,
}}
},
}
// Swap the runtime: same cluster, local node "entry-2", and a route set that
// only contains a route sourced at entry-2.
// NOTE(review): the meaning of the nil argument and of "policy-updated" is
// defined by UpdateRuntime's signature, which is not visible here.
ingress.UpdateRuntime(
transport,
NewFabricPacketInbox(8),
"cluster-1",
"entry-2",
nil,
func() []mesh.SyntheticRoute {
return []mesh.SyntheticRoute{{
RouteID: "route-entry-2",
ClusterID: "cluster-1",
SourceNodeID: "entry-2",
DestinationNodeID: "exit-2",
Hops: []string{"entry-2", "relay-2", "exit-2"},
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
ExpiresAt: time.Now().UTC().Add(time.Minute),
MaxTTL: 8,
}}
},
"policy-updated",
)
packet := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 443)
// An empty cluster ID is passed so the ingress falls back to its own.
if err := ingress.SendClientPacketBatch(context.Background(), "", "vpn-1", [][]byte{packet}); err != nil {
t.Fatalf("send after runtime update: %v", err)
}
if len(transport.envelopes) != 1 {
t.Fatalf("envelopes = %d, want one send", len(transport.envelopes))
}
// The captured envelope must use the updated route, source node and next hop.
envelope := transport.envelopes[0]
if envelope.RouteID != "route-entry-2" || envelope.SourceNodeID != "entry-2" || transport.calls[0] != "relay-2" {
t.Fatalf("envelope route/source/next-hop = %s/%s/%s, want updated entry-2 route", envelope.RouteID, envelope.SourceNodeID, transport.calls[0])
}
}
func TestFabricClientPacketIngressParallelFlowWindowDoesNotBlockIndependentChannel(t *testing.T) {
scheduler := NewFabricFlowScheduler(8, 16)
slowPacket, fastPacket := packetsForOrderedDistinctChannels(scheduler.shardCountValue())
@@ -0,0 +1,170 @@
//go:build windows && rap_vpn_windows_tun
package vpnruntime
import (
"crypto/sha256"
_ "embed"
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"strings"
wgtun "golang.zx2c4.com/wireguard/tun"
)
const windowsGatewayMTU = 1420
//go:embed assets/windows/amd64/wintun.dll
var embeddedWintunDLL []byte
// tunDevice pairs a wireguard-go TUN device with the interface name it was
// created under.
type tunDevice struct {
// dev is the underlying device returned by wgtun.CreateTUN.
dev wgtun.Device
// name is the interface name that was passed to openGatewayTun.
name string
}
// openGatewayTun creates the Windows gateway TUN interface called name and
// configures it for addressCIDR and routeCIDR.
//
// Both CIDR strings are validated before anything is created. The function
// then calls ensureWintunDLL, creates the interface with windowsGatewayMTU,
// and runs configureGatewayInterface. If configuration fails, the freshly
// created device is closed again (its close error is discarded) and the
// configuration error is returned.
func openGatewayTun(name, addressCIDR, routeCIDR string) (*tunDevice, error) {
	_, _, addressErr := net.ParseCIDR(addressCIDR)
	if addressErr != nil {
		return nil, fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, addressErr)
	}
	_, _, routeErr := net.ParseCIDR(routeCIDR)
	if routeErr != nil {
		return nil, fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, routeErr)
	}

	dllErr := ensureWintunDLL()
	if dllErr != nil {
		return nil, dllErr
	}

	device, createErr := wgtun.CreateTUN(name, windowsGatewayMTU)
	if createErr != nil {
		return nil, fmt.Errorf("create wintun interface %s: %w", name, createErr)
	}

	configErr := configureGatewayInterface(name, addressCIDR, routeCIDR)
	if configErr == nil {
		return &tunDevice{dev: device, name: name}, nil
	}
	// Do not leave a half-configured interface behind; the configuration
	// error is the one worth reporting.
	_ = device.Close()
	return nil, configErr
}
// Read reads one packet from the device into packet and returns its size.
// The underlying batch API is called with a single buffer at offset 0. A
// non-positive count from the device is reported as (0, nil).
func (d *tunDevice) Read(packet []byte) (int, error) {
	var size [1]int
	count, err := d.dev.Read([][]byte{packet}, size[:], 0)
	switch {
	case err != nil:
		return 0, err
	case count <= 0:
		return 0, nil
	default:
		return size[0], nil
	}
}
// Write sends packet to the device as a single-element batch at offset 0.
// It returns len(packet) when the device reports a positive count, (0, nil)
// when the count is non-positive, and the device error otherwise.
func (d *tunDevice) Write(packet []byte) (int, error) {
	batch := [][]byte{packet}
	count, err := d.dev.Write(batch, 0)
	switch {
	case err != nil:
		return 0, err
	case count <= 0:
		return 0, nil
	default:
		return len(packet), nil
	}
}
// Close calls removeWindowsGatewayNat and then closes the underlying device,
// returning the device's close error. The NAT removal is best-effort: its
// error is deliberately discarded so it cannot prevent the device from being
// closed.
func (d *tunDevice) Close() error {
	_ = removeWindowsGatewayNat()
	closeErr := d.dev.Close()
	return closeErr
}
// configureGatewayInterface applies the gateway address and NAT settings to
// the network interface called name by running a PowerShell script.
//
// addressCIDR must be an IPv4 address with a non-zero prefix length; routeCIDR
// must be a valid CIDR and is used as the NAT internal prefix. The script:
//   - enables the adapter,
//   - removes IPv4 addresses on the adapter that differ from the requested
//     address/prefix and adds the requested one if it is missing,
//   - enables IPv4 forwarding on the adapter and on every connected IPv4
//     interface other than 'Loopback Pseudo-Interface 1',
//   - ensures a NetNat named 'RAPVPN' exists with the requested internal
//     prefix, replacing an existing one whose prefix differs.
//
// All string values are passed through psQuote before being placed in the
// script.
func configureGatewayInterface(name, addressCIDR, routeCIDR string) error {
	// Placeholders, in order: interface alias, address, prefix length, NAT prefix.
	const scriptTemplate = `
$ErrorActionPreference = 'Stop'
$alias = %s
$address = %s
$prefixLength = %d
$natPrefix = %s
$natName = 'RAPVPN'
$adapter = Get-NetAdapter -Name $alias -ErrorAction Stop
$adapter | Enable-NetAdapter -Confirm:$false -ErrorAction SilentlyContinue | Out-Null
$existing = Get-NetIPAddress -InterfaceAlias $alias -AddressFamily IPv4 -ErrorAction SilentlyContinue
foreach ($addr in $existing) {
if ($addr.IPAddress -ne $address -or $addr.PrefixLength -ne $prefixLength) {
Remove-NetIPAddress -InterfaceAlias $alias -IPAddress $addr.IPAddress -Confirm:$false -ErrorAction SilentlyContinue
}
}
if (-not (Get-NetIPAddress -InterfaceAlias $alias -IPAddress $address -AddressFamily IPv4 -ErrorAction SilentlyContinue)) {
New-NetIPAddress -InterfaceAlias $alias -IPAddress $address -PrefixLength $prefixLength -Type Unicast | Out-Null
}
Set-NetIPInterface -InterfaceAlias $alias -AddressFamily IPv4 -Forwarding Enabled
Get-NetIPInterface -AddressFamily IPv4 | Where-Object { $_.ConnectionState -eq 'Connected' -and $_.InterfaceAlias -ne 'Loopback Pseudo-Interface 1' } | Set-NetIPInterface -Forwarding Enabled
$existingNat = Get-NetNat -Name $natName -ErrorAction SilentlyContinue
if ($existingNat -and $existingNat.InternalIPInterfaceAddressPrefix -ne $natPrefix) {
$existingNat | Remove-NetNat -Confirm:$false
$existingNat = $null
}
if (-not $existingNat) {
New-NetNat -Name $natName -InternalIPInterfaceAddressPrefix $natPrefix | Out-Null
}
`

	gatewayIP, gatewayNet, err := net.ParseCIDR(addressCIDR)
	if err != nil {
		return fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, err)
	}

	// Only a 32-bit (IPv4) mask with a non-zero prefix length is accepted.
	prefixLen, maskBits := gatewayNet.Mask.Size()
	if !(maskBits == 32 && prefixLen > 0) {
		return fmt.Errorf("invalid vpn gateway prefix %q", addressCIDR)
	}

	_, natNet, err := net.ParseCIDR(routeCIDR)
	if err != nil {
		return fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, err)
	}

	script := fmt.Sprintf(
		scriptTemplate,
		psQuote(name),
		psQuote(gatewayIP.String()),
		prefixLen,
		psQuote(natNet.String()),
	)
	if err = runPowerShell(script); err != nil {
		return fmt.Errorf("configure windows vpn gateway interface %s: %w", name, err)
	}
	return nil
}
// removeWindowsGatewayNat deletes the NetNat object named 'RAPVPN' via
// PowerShell. The script itself suppresses lookup and removal errors with
// -ErrorAction SilentlyContinue; the returned error is whatever runPowerShell
// reports.
func removeWindowsGatewayNat() error {
	const removeNatScript = `Get-NetNat -Name 'RAPVPN' -ErrorAction SilentlyContinue | Remove-NetNat -Confirm:$false -ErrorAction SilentlyContinue`
	return runPowerShell(removeNatScript)
}
// runPowerShell executes script with powershell.exe (-NoProfile,
// -ExecutionPolicy Bypass, -Command) and waits for it to finish.
//
// On failure the returned error wraps the process error and appends the
// trimmed combined stdout/stderr output of the command.
func runPowerShell(script string) error {
	args := []string{"-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script}

	output, err := exec.Command("powershell.exe", args...).CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("powershell failed: %w: %s", err, strings.TrimSpace(string(output)))
}
// psQuote returns value as a PowerShell single-quoted string literal.
//
// PowerShell treats the typographic quotes U+2018, U+2019, U+201A and U+201B
// as single-quote delimiters in addition to the ASCII apostrophe, so escaping
// only "'" would let a value containing one of those characters terminate the
// literal early. Every such character is therefore doubled, which is how a
// quote is escaped inside a single-quoted PowerShell string. All other bytes
// are passed through unchanged.
func psQuote(value string) string {
	escaper := strings.NewReplacer(
		"'", "''",
		"\u2018", "\u2018\u2018",
		"\u2019", "\u2019\u2019",
		"\u201A", "\u201A\u201A",
		"\u201B", "\u201B\u201B",
	)
	return "'" + escaper.Replace(value) + "'"
}
// ensureWintunDLL makes sure a wintun.dll matching the embedded copy sits in
// the same directory as the running executable.
//
// If the file already exists and its SHA-256 matches the embedded bytes,
// nothing is written. Otherwise the embedded bytes are written to
// "wintun.dll.tmp", any existing wintun.dll is removed, and the temporary
// file is renamed into place. The temporary file is cleaned up if the rename
// fails.
func ensureWintunDLL() error {
	exePath, err := os.Executable()
	if err != nil {
		return fmt.Errorf("locate node-agent executable for wintun.dll: %w", err)
	}
	dllPath := filepath.Join(filepath.Dir(exePath), "wintun.dll")

	// A read error (for example a missing file) simply falls through to
	// reinstalling the DLL.
	existing, readErr := os.ReadFile(dllPath)
	if readErr == nil && sameSHA256(existing, embeddedWintunDLL) {
		return nil
	}

	stagingPath := dllPath + ".tmp"
	if err = os.WriteFile(stagingPath, embeddedWintunDLL, 0o644); err != nil {
		return fmt.Errorf("write embedded wintun.dll: %w", err)
	}

	// Best-effort removal of the old copy; the rename below reports any
	// failure that actually matters.
	_ = os.Remove(dllPath)
	if err = os.Rename(stagingPath, dllPath); err != nil {
		_ = os.Remove(stagingPath)
		return fmt.Errorf("install embedded wintun.dll: %w", err)
	}
	return nil
}
// sameSHA256 reports whether a and b have the same SHA-256 digest.
func sameSHA256(a, b []byte) bool {
	return sha256.Sum256(a) == sha256.Sum256(b)
}