Record project continuation changes
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
FROM golang:1.23-bookworm AS build
|
||||
FROM golang:1.25-bookworm AS build
|
||||
|
||||
WORKDIR /src
|
||||
COPY agents/rap-node-agent/go.mod ./
|
||||
@@ -6,8 +6,10 @@ RUN go mod download
|
||||
COPY agents/rap-node-agent/ ./
|
||||
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/rap-node-agent ./cmd/rap-node-agent
|
||||
|
||||
FROM gcr.io/distroless/static-debian12:nonroot
|
||||
FROM debian:bookworm-slim
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends ca-certificates iproute2 iptables procps \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
COPY --from=build /out/rap-node-agent /usr/local/bin/rap-node-agent
|
||||
USER nonroot:nonroot
|
||||
ENTRYPOINT ["/usr/local/bin/rap-node-agent"]
|
||||
|
||||
@@ -66,6 +66,11 @@ Implemented:
|
||||
- synthetic route-health route config refresh from Control Plane path
|
||||
decisions
|
||||
- route-health expected/observed effective path drift reporting
|
||||
- host-agent Docker update plan executor with artifact checksum/size
|
||||
verification, container replacement, health check, status reporting, and
|
||||
rollback attempt
|
||||
- host-agent update loop for service/timer placement
|
||||
- host-agent binary self-update loop for the updater service itself
|
||||
- maximum capacity guard for the local production observation sink
|
||||
- panic-safe fail-closed production envelope observation wrapper
|
||||
- explicit `4096` byte payload boundary for validated production
|
||||
@@ -98,7 +103,7 @@ Not implemented yet:
|
||||
- VPN runtime
|
||||
- production workload supervision
|
||||
- certificate issuance/rotation
|
||||
- updater runtime
|
||||
- in-agent native updater runtime
|
||||
- privileged host route/firewall control
|
||||
|
||||
## Build
|
||||
@@ -107,9 +112,237 @@ Not implemented yet:
|
||||
cd agents\rap-node-agent
|
||||
go test ./...
|
||||
go build -o bin\rap-node-agent.exe .\cmd\rap-node-agent
|
||||
go build -buildvcs=false -o bin\rap-host-agent.exe .\cmd\rap-host-agent
|
||||
go build -o bin\mesh-live-smoke.exe .\cmd\mesh-live-smoke
|
||||
```
|
||||
|
||||
## Docker Host Agent Bootstrap
|
||||
|
||||
`rap-host-agent` is the first host-level installer/updater boundary for Docker
|
||||
placement. It does not join the mesh itself. It applies the cluster's install
|
||||
intent locally by running the `rap-node-agent` container with a persistent host
|
||||
state directory. On Linux it also installs a systemd `update-loop` service by
|
||||
default, so nodes continue to update from Control Plane policy without operator
|
||||
commands on each host.
|
||||
|
||||
Preferred profile-based install:
|
||||
|
||||
```bash
|
||||
rap-host-agent install \
|
||||
--profile-url https://control.example.com/api/v1 \
|
||||
--cluster-id <cluster_id> \
|
||||
--install-token <one_time_install_token> \
|
||||
--node-name docker-node-1
|
||||
```
|
||||
|
||||
The host-agent exchanges the install token for a signed control-plane install
|
||||
profile, then applies Docker image, container, state-dir, mesh listen,
|
||||
advertise, NAT/connectivity, and region settings from that profile. The same
|
||||
token is then used by the node-agent for first enrollment, so the operator does
|
||||
not need to manually pass cluster/runtime flags.
|
||||
|
||||
Manual install is still supported:
|
||||
|
||||
```bash
|
||||
rap-host-agent install \
|
||||
--backend-url http://192.168.200.61:18080/api/v1 \
|
||||
--cluster-id <cluster_id> \
|
||||
--join-token <raw_join_token> \
|
||||
--node-name docker-node-1 \
|
||||
--image rap-node-agent:dev-enrollment-bootstrap-smoke \
|
||||
--container-name rap-node-agent-docker-node-1 \
|
||||
--state-dir /var/lib/rap/nodes/docker-node-1 \
|
||||
--network host \
|
||||
--replace
|
||||
```
|
||||
|
||||
The command creates or replaces only the local Docker container. The running
|
||||
node-agent submits the join request, waits for owner approval, stores its
|
||||
identity in the mounted state directory, and then sends heartbeats. Re-running
|
||||
with `--replace` updates the container while preserving node identity. Pass
|
||||
`--auto-update-enabled=false` only for lab/debug installs where the local
|
||||
systemd updater must not be registered.
|
||||
|
||||
Useful checks:
|
||||
|
||||
```bash
|
||||
rap-host-agent status --container-name rap-node-agent-docker-node-1
|
||||
docker logs -f rap-node-agent-docker-node-1
|
||||
```
|
||||
|
||||
For a node that was installed before the updater existed, register only the
|
||||
local updater service without recreating the node-agent container:
|
||||
|
||||
```bash
|
||||
rap-host-agent install-updater \
|
||||
--backend-url http://192.168.200.61:18080/api/v1 \
|
||||
--cluster-id <cluster_id> \
|
||||
--state-dir /var/lib/rap/nodes/docker-node-1 \
|
||||
--container-name rap-node-agent-docker-node-1
|
||||
```
|
||||
|
||||
## Docker Host Agent Updates
|
||||
|
||||
`rap-host-agent update` applies one Control Plane update plan for an already
|
||||
enrolled Docker node. The host-agent fetches the plan, downloads the selected
|
||||
Docker image tar, verifies size and sha256, loads the image, recreates the
|
||||
node-agent container from the existing Docker runtime settings, checks that the
|
||||
container is running, and reports update phases back to the Control Plane.
|
||||
|
||||
```bash
|
||||
rap-host-agent update \
|
||||
--backend-url http://192.168.200.61:18080/api/v1 \
|
||||
--cluster-id <cluster_id> \
|
||||
--node-id <node_id> \
|
||||
--container-name rap-node-agent-docker-node-1 \
|
||||
--current-version 0.1.0-c17z26
|
||||
```
|
||||
|
||||
`rap-host-agent update-loop` is the per-node executor and health boundary. It
|
||||
does not need to poll for normal releases: the node-agent receives an
|
||||
`rap.node_update_hint.v1` subscription hint from Control Plane or the assigned
|
||||
update-cache service during heartbeat, writes `<state-dir>/update-trigger.json`,
|
||||
and the host-agent wakes immediately. The interval is an emergency fallback for
|
||||
missed hints, service migration, or a dead update-cache service; keep it long
|
||||
in production. The loop keeps running after transient errors by default and
|
||||
advances its in-process current version after a successful update so it does
|
||||
not repeatedly apply the same plan. When started without `--node-id` it reads
|
||||
`<state-dir>/identity.json` and waits until the approved node identity appears,
|
||||
which lets the updater service start immediately during first install. It also
|
||||
persists the last applied node-agent version in
|
||||
`<state-dir>/host-update-state.json` so a service restart does not reapply an
|
||||
already-installed release.
|
||||
|
||||
```bash
|
||||
rap-host-agent update-loop \
|
||||
--backend-url http://192.168.200.61:18080/api/v1 \
|
||||
--cluster-id <cluster_id> \
|
||||
--node-id <node_id> \
|
||||
--container-name rap-node-agent-docker-node-1 \
|
||||
--current-version 0.1.0-c17z26 \
|
||||
--interval-seconds 21600 \
|
||||
--jitter 0.15
|
||||
```
|
||||
|
||||
Update-cache nodes are ordinary cluster nodes with the `update-cache` role.
|
||||
Control Plane assigns a healthy update-cache node in the heartbeat hint. If the
|
||||
assigned service disappears, the next hint returns `control_plane_fallback` or a
|
||||
new service assignment; the local updater stays subscribed and only uses the
|
||||
long fallback timer as a last resort.
|
||||
|
||||
`rap-host-agent update-host-agent-loop` updates the host-agent binary itself.
|
||||
Only one global systemd unit is installed per Docker host:
|
||||
`rap-host-agent-self-updater.service`. It uses one approved local node identity
|
||||
to ask Control Plane for product `rap-host-agent` with install type
|
||||
`linux_binary`, verifies the downloaded binary size and sha256, atomically
|
||||
replaces `/usr/local/bin/rap-host-agent`, and reports status. The already
|
||||
running process continues until systemd restarts it, while new invocations use
|
||||
the new binary.
|
||||
|
||||
```bash
|
||||
rap-host-agent update-host-agent-loop \
|
||||
--backend-url http://192.168.200.61:18080/api/v1 \
|
||||
--cluster-id <cluster_id> \
|
||||
--state-dir /var/lib/rap/nodes/docker-node-1 \
|
||||
--binary-path /usr/local/bin/rap-host-agent
|
||||
```
|
||||
|
||||
## Windows Host Agent Bootstrap And Updates
|
||||
|
||||
Windows uses the same Control Plane install profile, but the local placement is
|
||||
a Scheduled Task instead of Docker. In `--startup-mode auto` the installer first
|
||||
tries an elevated `ONSTART` task running as `SYSTEM`; without admin rights it
|
||||
falls back to a per-user `ONLOGON` task. The `ONSTART` mode starts after reboot
|
||||
without an interactive user session. The `ONLOGON` fallback can only start after
|
||||
that Windows user signs in.
|
||||
|
||||
```cmd
|
||||
powershell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -UseBasicParsing 'http://control.example.com/downloads/rap-host-agent-windows-amd64.exe' -OutFile $env:TEMP\rap-host-agent.exe"
|
||||
%TEMP%\rap-host-agent.exe install-windows --profile-url "http://control.example.com/api/v1" --cluster-id "<cluster_id>" --install-token "<one_time_install_token>" --node-name "office-win-1" --startup-mode "auto"
|
||||
```
|
||||
|
||||
`install-windows` installs two tasks:
|
||||
|
||||
- `RAP Node Agent <node>` runs `rap-node-agent.exe`.
|
||||
- `RAP Host Agent Updater <node>` runs `rap-host-agent update-loop` for product
|
||||
`rap-node-agent`, install type `windows_service`, and replaces the local
|
||||
`rap-node-agent.exe` from signed release artifacts.
|
||||
|
||||
During first bootstrap the updater can read `<state-dir>\identity.json` and
|
||||
will wait until the join request is approved. For an already-enrolled Windows
|
||||
node, prefer passing `--node-id` explicitly. That makes the updater wrapper
|
||||
independent from the local identity file location and is required for repair of
|
||||
older Windows installs where the node is already heartbeat-healthy but the
|
||||
host-agent updater has no usable identity file.
|
||||
|
||||
```cmd
|
||||
%TEMP%\rap-host-agent.exe install-windows --backend-url "http://control.example.com/api/v1" --cluster-id "<cluster_id>" --node-id "<node_id>" --node-name "office-win-1" --replace --startup-mode "auto" --auto-update-current-version "<current_version>"
|
||||
```
|
||||
|
||||
The admin UI node details page generates a downloadable
|
||||
`rap-repair-updater-<node>.cmd` for this repair path. It performs these steps:
|
||||
|
||||
- prints `schtasks /Query` diagnostics for the node-agent and updater tasks;
|
||||
- prints the local `rap-*.exe*` files;
|
||||
- downloads the current `rap-host-agent.exe`;
|
||||
- reinstalls the Windows updater wrapper with `--node-id`;
|
||||
- runs a foreground one-shot `update-loop --max-runs 1`;
|
||||
- applies `rap-host-agent.exe.next` if the running host-agent could not replace
|
||||
itself;
|
||||
- restarts `RAP Host Agent Updater <node>`;
|
||||
- prints post-repair diagnostics.
|
||||
|
||||
Expected successful updater reports in the admin panel:
|
||||
|
||||
```text
|
||||
rap-node-agent <target> -> <target> plan/noop
|
||||
rap-host-agent <target> -> <target> plan/noop
|
||||
```
|
||||
|
||||
If the latest host-agent report is `apply/staged`, the new host-agent binary
|
||||
was downloaded as `rap-host-agent.exe.next` but the running process still held
|
||||
the old executable. End the updater task and run it again once, or rerun the generated
|
||||
repair command:
|
||||
|
||||
```cmd
|
||||
schtasks /End /TN "RAP Host Agent Updater office-win-1"
|
||||
schtasks /Run /TN "RAP Host Agent Updater office-win-1"
|
||||
```
|
||||
|
||||
### Windows Reboot / Autostart Verification
|
||||
|
||||
After installation or repair, verify that both scheduled tasks survive a reboot:
|
||||
|
||||
1. Reboot the Windows host, or at minimum restart both scheduled tasks.
|
||||
2. Confirm the tasks exist:
|
||||
|
||||
```cmd
|
||||
schtasks /Query /TN "RAP Node Agent office-win-1" /V /FO LIST
|
||||
schtasks /Query /TN "RAP Host Agent Updater office-win-1" /V /FO LIST
|
||||
```
|
||||
|
||||
3. Confirm the admin panel shows:
|
||||
|
||||
```text
|
||||
heartbeat: fresh
|
||||
rap-node-agent: plan/noop
|
||||
rap-host-agent: plan/noop
|
||||
node version_state: current
|
||||
```
|
||||
|
||||
Without admin rights, `install-windows --startup-mode auto` may fall back to
|
||||
`user-task`. That node can still heartbeat and update after the user logs in,
|
||||
but it will not start before logon after a reboot. Use an elevated shell for
|
||||
production Windows nodes that must recover unattended.
|
||||
|
||||
Control Plane release artifacts for Windows must use:
|
||||
|
||||
- `product=rap-node-agent`
|
||||
- `os=windows`
|
||||
- `arch=amd64`
|
||||
- `install_type=windows_service`
|
||||
- `kind=binary`
|
||||
|
||||
## First Enrollment
|
||||
|
||||
Create a join token from the platform control plane, then run:
|
||||
@@ -185,9 +418,18 @@ bounded `synthetic.echo` test-service runtime, and live synthetic HTTP endpoint.
|
||||
It must not be used for RDP, VPN, file, video, or other production service
|
||||
traffic.
|
||||
|
||||
`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. While service runtime
|
||||
supervision is still a stub, the agent does not poll desired workloads or report
|
||||
workload status unless this flag is explicitly enabled.
|
||||
`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. When enabled, the agent
|
||||
polls node-scoped desired workloads and reports status. The current bounded
|
||||
runtime reports built-in `core-mesh` and `mesh-listener` services as running
|
||||
when enabled, supports the native built-in `synthetic.echo` test workload, and
|
||||
keeps unsupported production workloads such as RDP workers degraded until their
|
||||
supervisors are implemented.
|
||||
|
||||
For Remote Workspace/RDP integration work, the native `rdp-worker` desired
|
||||
workload supports only an explicit `adapter_contract_probe` mode. That mode
|
||||
reports the remote-workspace adapter channel contract and requires Fabric
|
||||
Service Channel as the future data plane; it does not start FreeRDP, create a
|
||||
remote session, or carry production RDP payloads.
|
||||
|
||||
`RAP_MESH_LISTEN_ADDR` starts the C17E/C17F/C17G synthetic HTTP endpoint only when
|
||||
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true`. `RAP_MESH_SYNTHETIC_CONFIG` points to
|
||||
@@ -423,6 +665,63 @@ observations with expected/observed hops and drift status. This probes
|
||||
replacement relay effective paths for control-plane health only and does not
|
||||
enable service payload forwarding.
|
||||
|
||||
C17Z21 defines the portable inbound listener contract for Docker, Linux
|
||||
service, Windows service, and future OS-specific node packages. The node-agent
|
||||
does not stop when the mesh listen port cannot be bound. It keeps the outbound
|
||||
Control Plane session alive and emits `c17z21.mesh_listener_report.v1` in
|
||||
heartbeat metadata with configured address, effective address, listen mode,
|
||||
listener status, inbound reachability, one-way connectivity, failure reason,
|
||||
and port-conflict diagnostics.
|
||||
|
||||
`RAP_MESH_LISTEN_PORT_MODE` controls behavior:
|
||||
|
||||
- `manual`: bind exactly `RAP_MESH_LISTEN_ADDR`; on conflict report
|
||||
`listen_failed` and wait for an operator/config change.
|
||||
- `auto`: try `RAP_MESH_LISTEN_ADDR`; on conflict scan
|
||||
`RAP_MESH_LISTEN_AUTO_PORT_START..RAP_MESH_LISTEN_AUTO_PORT_END` and report
|
||||
`auto_rebound` when a free port is selected.
|
||||
- `disabled`: do not open an inbound listener; the node is expected to be
|
||||
outbound-only, relay/rendezvous, or Control Plane only.
|
||||
|
||||
For `RAP_MESH_CONNECTIVITY_MODE=outbound_only`, inbound listener failure is not
|
||||
treated as node death. The heartbeat remains `healthy` with
|
||||
`mesh_one_way_connectivity=true` and listener diagnostics. For direct/private
|
||||
LAN modes, a listener failure degrades the node so the admin panel can show
|
||||
that the node is alive but cannot accept inbound mesh traffic. Service payload
|
||||
forwarding is still not enabled by this contract.
|
||||
|
||||
C17Z22 separates outbound Control Plane presence from inbound mesh
|
||||
reachability. When synthetic mesh testing is enabled, every heartbeat includes
|
||||
`c17z22.mesh_outbound_session_report.v1` with node-to-control-plane direction,
|
||||
keepalive transport, listener conflict state, rendezvous/relay counters, and a
|
||||
flag showing whether the current outbound session can be used as a reverse
|
||||
control-channel contract. This is the portable basis for Docker, Linux service,
|
||||
Windows service, and future packages where a node may be behind NAT or have no
|
||||
stable inbound address. It is still control-plane telemetry only and does not
|
||||
carry RDP/VPN/service payload traffic.
|
||||
|
||||
C17Z24 separates the listener bind address from advertised mesh endpoints. The
|
||||
agent never advertises loopback addresses discovered from the local listener;
|
||||
`127.0.0.1`/`::1` are test-only bind details, not cluster reachability data.
|
||||
When the listener is active, the agent enumerates active non-loopback host
|
||||
interfaces and reports usable endpoint candidates with interface metadata,
|
||||
address family, reachability, NAT/connectivity hints, and priority. Container
|
||||
bridge/veth interfaces and link-local addresses are filtered by default, while
|
||||
physical and VPN-style interfaces are kept so different cluster segments can
|
||||
choose the address that matches their network. Operator-provided
|
||||
`RAP_MESH_ADVERTISE_ENDPOINT` or endpoint-candidate JSON remains authoritative
|
||||
and is ranked ahead of auto-discovered addresses.
|
||||
|
||||
C17Z25 adds per-peer endpoint fallback probing to the control-plane mesh
|
||||
manager. A node no longer treats the top-ranked endpoint candidate as the only
|
||||
possible address for a peer. For each warm direct/private/corporate peer, the
|
||||
manager probes the ranked candidate list until one `/mesh/v1/health` endpoint
|
||||
responds or all direct candidates fail. Heartbeat metadata includes
|
||||
`c17z25.mesh_peer_connection_manager_report.v1` with `probe_results`,
|
||||
`selected_candidate_id`, `selected_endpoint`, and per-candidate success/failure
|
||||
details. This is still control-plane health and address selection telemetry; it
|
||||
does not forward RDP/VPN/service payloads.
|
||||
|
||||
Scoped synthetic config shape:
|
||||
|
||||
```json
|
||||
@@ -480,7 +779,7 @@ Expected:
|
||||
- The agent never assigns roles to itself.
|
||||
- The agent reports capabilities only.
|
||||
- Platform policy assigns roles.
|
||||
- No RDP/VPN/production service traffic is carried by the C17A-C17Z18 staged
|
||||
- No RDP/VPN/production service traffic is carried by the C17A-C17Z22 staged
|
||||
mesh runtime.
|
||||
- Production forwarding remains disabled by default and limited to
|
||||
`fabric.control` when explicitly enabled.
|
||||
|
||||
@@ -0,0 +1,744 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
|
||||
)
|
||||
|
||||
type installCommandConfig struct {
|
||||
Runtime hostagent.RuntimeConfig
|
||||
DryRun bool
|
||||
AutoUpdateEnabled bool
|
||||
AutoUpdate hostagent.UpdateServiceConfig
|
||||
}
|
||||
|
||||
func main() {
|
||||
log.SetFlags(0)
|
||||
applyStagedSelfUpdate()
|
||||
if len(os.Args) < 2 {
|
||||
usage()
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
defer stop()
|
||||
switch os.Args[1] {
|
||||
case "install":
|
||||
if err := runInstall(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install failed: %v", err)
|
||||
}
|
||||
case "install-windows":
|
||||
if err := runInstallWindows(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install-windows failed: %v", err)
|
||||
}
|
||||
case "install-linux":
|
||||
if err := runInstallLinux(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install-linux failed: %v", err)
|
||||
}
|
||||
case "status":
|
||||
if err := runStatus(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("status failed: %v", err)
|
||||
}
|
||||
case "update":
|
||||
if err := runUpdate(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update failed: %v", err)
|
||||
}
|
||||
case "update-loop":
|
||||
if err := runUpdateLoop(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-loop failed: %v", err)
|
||||
}
|
||||
case "install-updater":
|
||||
if err := runInstallUpdater(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("install-updater failed: %v", err)
|
||||
}
|
||||
case "update-host-agent":
|
||||
if err := runUpdateHostAgent(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-host-agent failed: %v", err)
|
||||
}
|
||||
case "update-host-agent-loop":
|
||||
if err := runUpdateHostAgentLoop(ctx, os.Args[2:]); err != nil {
|
||||
log.Fatalf("update-host-agent-loop failed: %v", err)
|
||||
}
|
||||
default:
|
||||
usage()
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
|
||||
// applyStagedSelfUpdate swaps in a staged replacement binary, if one exists.
//
// The staged binary is looked up next to the running executable as
// "<executable>.next". On Windows the function does nothing. Every failure is
// silent and leaves the current executable in place: this is a best-effort
// step that runs before command dispatch and must never block the command.
//
// Sequence:
//  1. ignore the staged entry unless it is a regular file (a directory or
//     other special entry must never be renamed over the binary);
//  2. make the staged file executable before it is moved, so the live path
//     never holds a binary without execute permission;
//  3. move the current executable to "<executable>.old", move the staged file
//     into place, and restore the old binary if the second move fails;
//  4. remove the backup.
func applyStagedSelfUpdate() {
	if runtime.GOOS == "windows" {
		return
	}
	executable, err := os.Executable()
	if err != nil {
		return
	}

	staged := executable + ".next"
	info, err := os.Stat(staged)
	if err != nil || !info.Mode().IsRegular() {
		return
	}

	// Fix permissions while the old binary is still live. If that fails and
	// the staged file is not executable at all, swapping it in would leave
	// the host without a runnable agent, so keep the current binary instead.
	if err := os.Chmod(staged, 0o755); err != nil && info.Mode().Perm()&0o111 == 0 {
		return
	}

	backup := executable + ".old"
	// A leftover backup from an earlier attempt is stale; failing to remove it
	// is harmless because the rename below replaces it.
	_ = os.Remove(backup)
	if err := os.Rename(executable, backup); err != nil {
		return
	}
	if err := os.Rename(staged, executable); err != nil {
		// Roll back; nothing more can be done here if the restore also fails.
		_ = os.Rename(backup, executable)
		return
	}
	// The swap succeeded; a backup that cannot be removed is only clutter.
	_ = os.Remove(backup)
}
|
||||
|
||||
func runInstallLinux(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("install-linux", flag.ContinueOnError)
|
||||
cfg := hostagent.LinuxInstallConfig{}
|
||||
var profileURL string
|
||||
var installToken string
|
||||
fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
|
||||
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/linux-install-profile URL for profile-based install.")
|
||||
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Linux install profile.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
|
||||
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
|
||||
fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_LINUX_INSTALL_DIR", ""), "Directory for rap-node-agent and rap-host-agent.")
|
||||
fs.StringVar(&cfg.ConfigDir, "config-dir", getenv("RAP_LINUX_CONFIG_DIR", ""), "Directory for node-agent env file.")
|
||||
fs.StringVar(&cfg.StartupMode, "startup-mode", getenv("RAP_LINUX_STARTUP_MODE", "systemd"), "Startup mode: systemd, auto, or none.")
|
||||
fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
|
||||
fs.BoolVar(&cfg.DryRun, "dry-run", false, "Print resolved placement without installing.")
|
||||
fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Linux host-agent update service.")
|
||||
fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
|
||||
fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
|
||||
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
|
||||
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "linux"), "Region/site hint.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
|
||||
dryRun := cfg.DryRun
|
||||
startupMode := strings.TrimSpace(cfg.StartupMode)
|
||||
autoUpdateEnabled := cfg.AutoUpdateEnabled
|
||||
autoUpdateCurrentVersion := cfg.AutoUpdateCurrentVersion
|
||||
autoUpdateChannel := cfg.AutoUpdateChannel
|
||||
autoUpdateIntervalSeconds := cfg.AutoUpdateIntervalSeconds
|
||||
autoUpdateInitialDelaySeconds := cfg.AutoUpdateInitialDelaySeconds
|
||||
autoUpdateHealthTimeoutSeconds := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
hostAgentSourcePath := cfg.HostAgentSourcePath
|
||||
profile, err := hostagent.FetchLinuxInstallProfile(ctx, hostagent.ProfileRequest{URL: profileURL, ClusterID: cfg.RuntimeConfig.ClusterID, InstallToken: installToken, NodeName: cfg.RuntimeConfig.NodeName})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cfg = hostagent.LinuxInstallConfigFromProfile(profile)
|
||||
cfg.Replace = true
|
||||
cfg.DryRun = dryRun
|
||||
cfg.AutoUpdateEnabled = autoUpdateEnabled
|
||||
cfg.AutoUpdateCurrentVersion = autoUpdateCurrentVersion
|
||||
cfg.AutoUpdateChannel = autoUpdateChannel
|
||||
cfg.AutoUpdateIntervalSeconds = autoUpdateIntervalSeconds
|
||||
cfg.AutoUpdateInitialDelaySeconds = autoUpdateInitialDelaySeconds
|
||||
cfg.AutoUpdateHealthTimeoutSeconds = autoUpdateHealthTimeoutSeconds
|
||||
cfg.HostAgentSourcePath = hostAgentSourcePath
|
||||
if startupMode != "" {
|
||||
cfg.StartupMode = startupMode
|
||||
}
|
||||
}
|
||||
result, err := (hostagent.LinuxManager{}).Install(ctx, cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("node=%s install_dir=%s state_dir=%s node_agent=%s unit=%s downloaded=%t started=%t updater_unit=%s updater_started=%t\n",
|
||||
result.NodeName, result.InstallDir, result.StateDir, result.NodeAgentPath, result.UnitName, result.Downloaded, result.Started, result.UpdaterUnitName, result.UpdaterStarted)
|
||||
fmt.Println("next: approve the join request in the platform admin panel, then the Linux node-agent will finish bootstrap and start heartbeats")
|
||||
return nil
|
||||
}
|
||||
|
||||
func runInstallWindows(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("install-windows", flag.ContinueOnError)
|
||||
cfg := hostagent.WindowsInstallConfig{}
|
||||
var profileURL string
|
||||
var installToken string
|
||||
fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
|
||||
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/windows-install-profile URL for profile-based install.")
|
||||
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Windows install profile.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
|
||||
fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_WINDOWS_INSTALL_DIR", ""), "Directory for rap-node-agent.exe and wrapper scripts.")
|
||||
fs.StringVar(&cfg.StartupMode, "startup-mode", getenv("RAP_WINDOWS_STARTUP_MODE", "auto"), "Startup mode: auto, system-task, user-task, or none.")
|
||||
fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_REPLACE", true), "Replace local node-agent binary/config when an artifact is available.")
|
||||
fs.BoolVar(&cfg.DryRun, "dry-run", false, "Print resolved placement without installing.")
|
||||
fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Windows host-agent update task.")
|
||||
fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
|
||||
fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
|
||||
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
|
||||
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", true), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ":19131"), "Synthetic mesh HTTP listen address.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "direct_http"), "Advertised transport.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
|
||||
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "windows"), "Region/site hint.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
|
||||
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
|
||||
dryRun := cfg.DryRun
|
||||
startupMode := strings.TrimSpace(cfg.StartupMode)
|
||||
autoUpdateEnabled := cfg.AutoUpdateEnabled
|
||||
autoUpdateCurrentVersion := cfg.AutoUpdateCurrentVersion
|
||||
autoUpdateChannel := cfg.AutoUpdateChannel
|
||||
autoUpdateIntervalSeconds := cfg.AutoUpdateIntervalSeconds
|
||||
autoUpdateInitialDelaySeconds := cfg.AutoUpdateInitialDelaySeconds
|
||||
autoUpdateHealthTimeoutSeconds := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
hostAgentSourcePath := cfg.HostAgentSourcePath
|
||||
profile, err := hostagent.FetchWindowsInstallProfile(ctx, hostagent.ProfileRequest{
|
||||
URL: profileURL,
|
||||
ClusterID: cfg.RuntimeConfig.ClusterID,
|
||||
InstallToken: installToken,
|
||||
NodeName: cfg.RuntimeConfig.NodeName,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cfg = hostagent.WindowsInstallConfigFromProfile(profile)
|
||||
cfg.Replace = true
|
||||
cfg.DryRun = dryRun
|
||||
cfg.AutoUpdateEnabled = autoUpdateEnabled
|
||||
cfg.AutoUpdateCurrentVersion = autoUpdateCurrentVersion
|
||||
cfg.AutoUpdateChannel = autoUpdateChannel
|
||||
cfg.AutoUpdateIntervalSeconds = autoUpdateIntervalSeconds
|
||||
cfg.AutoUpdateInitialDelaySeconds = autoUpdateInitialDelaySeconds
|
||||
cfg.AutoUpdateHealthTimeoutSeconds = autoUpdateHealthTimeoutSeconds
|
||||
cfg.HostAgentSourcePath = hostAgentSourcePath
|
||||
if startupMode != "" {
|
||||
cfg.StartupMode = startupMode
|
||||
}
|
||||
}
|
||||
result, err := (hostagent.WindowsManager{}).Install(ctx, cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("node=%s install_dir=%s state_dir=%s node_agent=%s startup_mode=%s task=%s downloaded=%t started=%t updater_task=%s updater_started=%t admin_fallback=%t\n",
|
||||
result.NodeName, result.InstallDir, result.StateDir, result.NodeAgentPath, result.StartupMode, result.TaskName, result.Downloaded, result.Started, result.UpdaterTaskName, result.UpdaterStarted, result.AdminFallback)
|
||||
fmt.Println("next: approve the join request in the platform admin panel, then the Windows node-agent will finish bootstrap and start heartbeats")
|
||||
return nil
|
||||
}
|
||||
|
||||
func runInstall(ctx context.Context, args []string) error {
|
||||
installCfg, err := parseInstall(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cfg := installCfg.Runtime.Normalize()
|
||||
cfg = cfg.Normalize()
|
||||
runArgs := hostagent.DockerRunArgs(cfg)
|
||||
if installCfg.DryRun {
|
||||
fmt.Printf("docker %s\n", shellJoin(hostagent.RedactedArgs(runArgs)))
|
||||
if installCfg.AutoUpdateEnabled {
|
||||
service := installCfg.AutoUpdate
|
||||
service.RuntimeConfig = cfg
|
||||
service.DryRun = true
|
||||
result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(result.Unit)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
result, err := (hostagent.DockerManager{}).Install(ctx, cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("container=%s image=%s id=%s pulled=%t replaced=%t\n", result.ContainerName, result.Image, result.ContainerID, result.Pulled, result.Replaced)
|
||||
if installCfg.AutoUpdateEnabled {
|
||||
service := installCfg.AutoUpdate
|
||||
service.RuntimeConfig = cfg
|
||||
service.ManageSystemd = true
|
||||
serviceResult, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t\n", serviceResult.UnitName, serviceResult.UnitPath, serviceResult.BinaryPath, serviceResult.Started)
|
||||
}
|
||||
fmt.Println("next: approve the join request in the platform admin panel, then the node-agent will finish bootstrap and start heartbeats")
|
||||
return nil
|
||||
}
|
||||
|
||||
func runStatus(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("status", flag.ContinueOnError)
|
||||
containerName := fs.String("container-name", hostagent.DefaultContainerName, "Docker container name.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
out, err := (hostagent.DockerManager{}).Status(ctx, *containerName)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Print(out)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdate(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("update", flag.ContinueOnError)
|
||||
req := hostagent.UpdateRequest{}
|
||||
var healthTimeoutSeconds int
|
||||
registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second
|
||||
if req.DryRun {
|
||||
plan, err := hostagent.FetchNodeUpdatePlan(ctx, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("action=%s reason=%s target=%s production_forwarding=%t\n", plan.Action, plan.Reason, plan.TargetVersion, plan.ProductionForwarding)
|
||||
if plan.Artifact != nil {
|
||||
fmt.Printf("artifact=%s sha256=%s size=%d\n", plan.Artifact.URL, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
var result hostagent.UpdateResult
|
||||
var err error
|
||||
if req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows" {
|
||||
result, err = (hostagent.WindowsManager{}).ApplyUpdate(ctx, req)
|
||||
} else if req.InstallType == hostagent.BinaryUpdateInstallType {
|
||||
result, err = (hostagent.LinuxManager{}).ApplyUpdate(ctx, req)
|
||||
} else {
|
||||
result, err = (hostagent.DockerManager{}).ApplyUpdate(ctx, req)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("action=%s reason=%s target=%s container=%s image=%s id=%s loaded=%t replaced=%t rolled_back=%t\n",
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.ContainerName,
|
||||
result.NewImage,
|
||||
result.ContainerID,
|
||||
result.Loaded,
|
||||
result.Replaced,
|
||||
result.RolledBack,
|
||||
)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdateLoop(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("update-loop", flag.ContinueOnError)
|
||||
req := hostagent.UpdateRequest{}
|
||||
var healthTimeoutSeconds int
|
||||
var intervalSeconds int
|
||||
var initialDelaySeconds int
|
||||
var maxRuns int
|
||||
var jitter float64
|
||||
var stopOnError bool
|
||||
var hostAgentStatusEnabled bool
|
||||
var hostAgentVersion string
|
||||
var hostAgentBinaryPath string
|
||||
registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
|
||||
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Seconds between emergency fallback update plan polls. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 0), "Seconds to wait before the first poll.")
|
||||
fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
|
||||
fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
|
||||
fs.BoolVar(&stopOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.")
|
||||
fs.BoolVar(&hostAgentStatusEnabled, "host-agent-update-status-enabled", getenvBool("RAP_HOST_AGENT_UPDATE_STATUS_ENABLED", true), "Also poll/report rap-host-agent update status from this loop.")
|
||||
fs.StringVar(&hostAgentVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Current rap-host-agent version reported by the loop.")
|
||||
fs.StringVar(&hostAgentBinaryPath, "host-agent-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path used for host-agent update status.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
req.HealthTimeout = time.Duration(healthTimeoutSeconds) * time.Second
|
||||
cfg := hostagent.UpdateLoopConfig{
|
||||
Request: req,
|
||||
Interval: time.Duration(intervalSeconds) * time.Second,
|
||||
InitialDelay: time.Duration(initialDelaySeconds) * time.Second,
|
||||
Jitter: jitter,
|
||||
MaxRuns: maxRuns,
|
||||
StopOnError: stopOnError,
|
||||
Logf: func(format string, args ...any) {
|
||||
fmt.Printf(format+"\n", args...)
|
||||
},
|
||||
}
|
||||
cfg.HostAgentUpdateEnabled = hostAgentStatusEnabled
|
||||
cfg.HostAgentUpdateRequest = hostagent.HostAgentUpdateRequest{
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
CurrentVersion: hostAgentVersion,
|
||||
Channel: req.Channel,
|
||||
OS: firstNonEmptyLocal(req.OS, runtime.GOOS),
|
||||
Arch: firstNonEmptyLocal(req.Arch, runtime.GOARCH),
|
||||
InstallType: hostagent.BinaryUpdateInstallType,
|
||||
BinaryPath: hostAgentBinaryPath,
|
||||
}
|
||||
if req.InstallType == hostagent.WindowsUpdateInstallType || runtime.GOOS == "windows" {
|
||||
cfg.HostAgentUpdateRequest.InstallType = "windows_binary"
|
||||
return (hostagent.WindowsManager{}).RunUpdateLoop(ctx, cfg)
|
||||
}
|
||||
if req.InstallType == hostagent.BinaryUpdateInstallType {
|
||||
return (hostagent.LinuxManager{}).RunUpdateLoop(ctx, cfg)
|
||||
}
|
||||
return (hostagent.DockerManager{}).RunUpdateLoop(ctx, cfg)
|
||||
}
|
||||
|
||||
// firstNonEmptyLocal returns the first argument that is not blank, i.e. that
// still has content after strings.TrimSpace. The matching argument is returned
// exactly as passed (surrounding whitespace is not stripped). It returns ""
// when every argument is blank or none are given.
func firstNonEmptyLocal(values ...string) string {
	for i := range values {
		if candidate := values[i]; strings.TrimSpace(candidate) != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
func runInstallUpdater(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("install-updater", flag.ContinueOnError)
|
||||
runtimeCfg := hostagent.RuntimeConfig{}
|
||||
service := hostagent.UpdateServiceConfig{}
|
||||
var dryRun bool
|
||||
var selfUpdater bool
|
||||
fs.StringVar(&runtimeCfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&runtimeCfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&runtimeCfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
|
||||
fs.StringVar(&runtimeCfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
|
||||
fs.StringVar(&service.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version before first successful update.")
|
||||
fs.StringVar(&service.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
|
||||
fs.IntVar(&service.IntervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&service.InitialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.Float64Var(&service.Jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
|
||||
fs.IntVar(&service.HealthTimeoutSec, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
|
||||
fs.StringVar(&service.BinaryInstallPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")
|
||||
fs.BoolVar(&selfUpdater, "self-updater-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
|
||||
fs.BoolVar(&dryRun, "dry-run", false, "Print the systemd unit without installing it.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
service.RuntimeConfig = runtimeCfg
|
||||
service.ManageSystemd = !dryRun
|
||||
service.DryRun = dryRun
|
||||
service.InstallSelfUpdater = selfUpdater
|
||||
service.SelfUpdateVersion = agent.Version
|
||||
result, err := (hostagent.DockerManager{}).InstallUpdateService(ctx, service)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if dryRun {
|
||||
fmt.Print(result.Unit)
|
||||
if result.SelfUnit != "" {
|
||||
fmt.Print(result.SelfUnit)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
fmt.Printf("updater_service=%s unit=%s binary=%s started=%t self_updater=%s\n", result.UnitName, result.UnitPath, result.BinaryPath, result.Started, result.SelfUnitName)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdateHostAgent(ctx context.Context, args []string) error {
|
||||
req, interval, initialDelay, jitter, maxRuns, stopOnError, loop, err := parseHostAgentUpdate(args)
|
||||
_, _, _, _, _ = interval, initialDelay, jitter, maxRuns, stopOnError
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if loop {
|
||||
return fmt.Errorf("internal parser error: loop flag set for one-shot update")
|
||||
}
|
||||
result, err := (hostagent.DockerManager{}).ApplyHostAgentUpdate(ctx, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t\n", result.Action, result.Reason, result.TargetVersion, result.NewImage, result.Replaced, result.RestartNeeded)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runUpdateHostAgentLoop(ctx context.Context, args []string) error {
|
||||
req, interval, initialDelay, jitter, maxRuns, stopOnError, _, err := parseHostAgentUpdate(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return (hostagent.DockerManager{}).RunHostAgentUpdateLoop(ctx, hostagent.HostAgentUpdateLoopConfig{
|
||||
Request: req,
|
||||
Interval: time.Duration(interval) * time.Second,
|
||||
InitialDelay: time.Duration(initialDelay) * time.Second,
|
||||
Jitter: jitter,
|
||||
MaxRuns: maxRuns,
|
||||
StopOnError: stopOnError,
|
||||
Logf: func(format string, args ...any) {
|
||||
fmt.Printf(format+"\n", args...)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int, int, float64, int, bool, bool, error) {
|
||||
fs := flag.NewFlagSet("update-host-agent", flag.ContinueOnError)
|
||||
req := hostagent.HostAgentUpdateRequest{}
|
||||
var intervalSeconds int
|
||||
var initialDelaySeconds int
|
||||
var maxRuns int
|
||||
var jitter float64
|
||||
var stopOnError bool
|
||||
fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json.")
|
||||
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Currently installed rap-host-agent version.")
|
||||
fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
|
||||
fs.StringVar(&req.OS, "os", getenv("RAP_HOST_AGENT_UPDATE_OS", runtime.GOOS), "Host-agent artifact OS selector.")
|
||||
fs.StringVar(&req.Arch, "arch", getenv("RAP_HOST_AGENT_UPDATE_ARCH", runtime.GOARCH), "Host-agent artifact architecture selector.")
|
||||
fs.StringVar(&req.InstallType, "install-type", getenv("RAP_HOST_AGENT_UPDATE_INSTALL_TYPE", hostagent.BinaryUpdateInstallType), "Host-agent artifact install type.")
|
||||
fs.StringVar(&req.BinaryPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path to replace atomically.")
|
||||
fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.")
|
||||
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INTERVAL_SECONDS", 900), "Seconds between host-agent update plan polls.")
|
||||
fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INITIAL_DELAY_SECONDS", 45), "Seconds to wait before the first poll.")
|
||||
fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
|
||||
fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
|
||||
fs.BoolVar(&stopOnError, "stop-on-error", getenvBool("RAP_UPDATE_STOP_ON_ERROR", false), "Stop the loop after the first failed update attempt.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return hostagent.HostAgentUpdateRequest{}, 0, 0, 0, 0, false, false, err
|
||||
}
|
||||
return req, intervalSeconds, initialDelaySeconds, jitter, maxRuns, stopOnError, false, nil
|
||||
}
|
||||
|
||||
func registerUpdateFlags(fs *flag.FlagSet, req *hostagent.UpdateRequest, healthTimeoutSeconds *int) {
|
||||
fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
|
||||
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet.")
|
||||
fs.StringVar(&req.Product, "product", getenv("RAP_UPDATE_PRODUCT", hostagent.DefaultUpdateProduct), "Update product name.")
|
||||
fs.StringVar(&req.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Currently running product version.")
|
||||
fs.StringVar(&req.OS, "os", getenv("RAP_UPDATE_OS", runtime.GOOS), "Artifact OS selector.")
|
||||
fs.StringVar(&req.Arch, "arch", getenv("RAP_UPDATE_ARCH", runtime.GOARCH), "Artifact architecture selector.")
|
||||
fs.StringVar(&req.InstallType, "install-type", getenv("RAP_UPDATE_INSTALL_TYPE", hostagent.DefaultUpdateInstallType), "Artifact install type.")
|
||||
fs.StringVar(&req.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
|
||||
fs.StringVar(&req.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
|
||||
fs.StringVar(&req.BinaryPath, "binary-path", getenv("RAP_NODE_AGENT_BINARY_PATH", ""), "Windows node-agent binary path to replace.")
|
||||
fs.StringVar(&req.WindowsTaskName, "windows-task-name", getenv("RAP_WINDOWS_TASK_NAME", ""), "Windows Scheduled Task name used to restart node-agent.")
|
||||
fs.StringVar(&req.SystemdUnitName, "systemd-unit", getenv("RAP_SYSTEMD_UNIT", ""), "Linux systemd unit used to restart node-agent.")
|
||||
fs.IntVar(healthTimeoutSeconds, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Seconds to wait for the updated container to be running.")
|
||||
fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.")
|
||||
}
|
||||
|
||||
func parseInstall(args []string) (installCommandConfig, error) {
|
||||
fs := flag.NewFlagSet("install", flag.ContinueOnError)
|
||||
cfg := hostagent.RuntimeConfig{}
|
||||
var dryRun bool
|
||||
var profileURL string
|
||||
var installToken string
|
||||
var autoUpdateEnabled bool
|
||||
autoUpdate := hostagent.UpdateServiceConfig{}
|
||||
fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
|
||||
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
|
||||
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/docker-install-profile URL for profile-based install.")
|
||||
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Docker install profile.")
|
||||
fs.StringVar(&cfg.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
|
||||
fs.StringVar(&cfg.Image, "image", getenv("RAP_NODE_AGENT_IMAGE", hostagent.DefaultImage), "Docker image for rap-node-agent.")
|
||||
fs.StringVar(&cfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name.")
|
||||
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path mounted as node-agent state.")
|
||||
fs.StringVar(&cfg.Network, "network", getenv("RAP_DOCKER_NETWORK", hostagent.DefaultNetwork), "Docker network mode/name.")
|
||||
fs.StringVar(&cfg.RestartPolicy, "restart", getenv("RAP_DOCKER_RESTART", "unless-stopped"), "Docker restart policy.")
|
||||
fs.BoolVar(&cfg.PullImage, "pull", getenvBool("RAP_DOCKER_PULL", false), "Pull image before running.")
|
||||
fs.BoolVar(&cfg.Replace, "replace", getenvBool("RAP_DOCKER_REPLACE", false), "Remove an existing container with the same name before run.")
|
||||
fs.BoolVar(&cfg.DockerVPNGatewayEnabled, "docker-vpn-gateway-enabled", getenvBool("RAP_DOCKER_VPN_GATEWAY_ENABLED", false), "Run Docker node-agent with NET_ADMIN and /dev/net/tun for VPN gateway mode.")
|
||||
fs.StringVar(&cfg.ImageArtifactSHA256, "image-artifact-sha256", getenv("RAP_NODE_AGENT_IMAGE_ARTIFACT_SHA256", ""), "Expected SHA-256 for a Docker image tar artifact.")
|
||||
fs.Int64Var(&cfg.ImageArtifactSizeBytes, "image-artifact-size-bytes", getenvInt64("RAP_NODE_AGENT_IMAGE_ARTIFACT_SIZE_BYTES", 0), "Expected byte size for a Docker image tar artifact (used as a best-effort check when sha256 is provided).")
|
||||
fs.BoolVar(&dryRun, "dry-run", false, "Print the docker command with secrets redacted.")
|
||||
fs.BoolVar(&autoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the local update-loop service.")
|
||||
fs.BoolVar(&autoUpdate.InstallSelfUpdater, "host-agent-self-update-enabled", getenvBool("RAP_HOST_AGENT_SELF_UPDATE_ENABLED", true), "Install and start one global host-agent binary self-updater service.")
|
||||
fs.StringVar(&autoUpdate.CurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
|
||||
fs.StringVar(&autoUpdate.SelfUpdateVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Initial host-agent binary version used by the self-updater.")
|
||||
fs.StringVar(&autoUpdate.Channel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
|
||||
fs.IntVar(&autoUpdate.IntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
|
||||
fs.IntVar(&autoUpdate.InitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
|
||||
fs.Float64Var(&autoUpdate.Jitter, "auto-update-jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
|
||||
fs.IntVar(&autoUpdate.HealthTimeoutSec, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
|
||||
fs.StringVar(&autoUpdate.BinaryInstallPath, "auto-update-binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "Persistent host path for rap-host-agent binary used by the service.")
|
||||
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
|
||||
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable synthetic mesh runtime.")
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Synthetic mesh HTTP listen address inside container.")
|
||||
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", ""), "Advertised transport.")
|
||||
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", ""), "Connectivity mode hint.")
|
||||
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", ""), "NAT type hint.")
|
||||
fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint.")
|
||||
fs.IntVar(&cfg.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
|
||||
fs.IntVar(&cfg.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
|
||||
fs.IntVar(&cfg.EnrollmentPollTimeoutSeconds, "enrollment-poll-timeout-seconds", getenvInt("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0), "Enrollment approval timeout seconds. Use 0 to wait indefinitely.")
|
||||
fs.IntVar(&cfg.ProductionObservationSinkCap, "production-observation-sink-capacity", getenvInt("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Production observation sink capacity.")
|
||||
extraEnv := repeatedFlag{}
|
||||
extraRunArg := repeatedFlag{}
|
||||
imageArtifactURL := repeatedFlag{}
|
||||
fs.Var(&extraEnv, "env", "Extra KEY=VALUE env passed to node-agent container; may be repeated.")
|
||||
fs.Var(&extraRunArg, "docker-run-arg", "Extra raw docker run argument; may be repeated.")
|
||||
fs.Var(&imageArtifactURL, "image-artifact-url", "Docker image tar artifact URL to docker load before running; may be repeated.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return installCommandConfig{}, err
|
||||
}
|
||||
cfg.ExtraEnv = extraEnv
|
||||
cfg.AdditionalDockerRunArgs = extraRunArg
|
||||
cfg.ImageArtifactURLs = append(cfg.ImageArtifactURLs, imageArtifactURL...)
|
||||
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
|
||||
profile, err := hostagent.FetchDockerInstallProfile(context.Background(), hostagent.ProfileRequest{
|
||||
URL: profileURL,
|
||||
ClusterID: cfg.ClusterID,
|
||||
InstallToken: installToken,
|
||||
NodeName: cfg.NodeName,
|
||||
})
|
||||
if err != nil {
|
||||
return installCommandConfig{}, err
|
||||
}
|
||||
profileCfg := hostagent.RuntimeConfigFromProfile(profile)
|
||||
profileCfg.ExtraEnv = cfg.ExtraEnv
|
||||
profileCfg.AdditionalDockerRunArgs = cfg.AdditionalDockerRunArgs
|
||||
profileCfg.DockerVPNGatewayEnabled = profileCfg.DockerVPNGatewayEnabled || cfg.DockerVPNGatewayEnabled
|
||||
if len(imageArtifactURL) > 0 {
|
||||
profileCfg.ImageArtifactURLs = append([]string(nil), imageArtifactURL...)
|
||||
}
|
||||
if cfg.ImageArtifactSHA256 != "" {
|
||||
profileCfg.ImageArtifactSHA256 = cfg.ImageArtifactSHA256
|
||||
}
|
||||
if cfg.ImageArtifactSizeBytes > 0 {
|
||||
profileCfg.ImageArtifactSizeBytes = cfg.ImageArtifactSizeBytes
|
||||
}
|
||||
cfg = profileCfg
|
||||
}
|
||||
if err := cfg.ValidateInstall(); err != nil {
|
||||
return installCommandConfig{}, err
|
||||
}
|
||||
return installCommandConfig{
|
||||
Runtime: cfg,
|
||||
DryRun: dryRun,
|
||||
AutoUpdateEnabled: autoUpdateEnabled,
|
||||
AutoUpdate: autoUpdate,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// repeatedFlag collects every occurrence of a command-line flag that may be
// given more than once. Used through a pointer it provides the String and Set
// methods that flag.FlagSet.Var requires.
type repeatedFlag []string

// String returns the collected values joined with commas. A nil receiver
// yields the empty string, because the flag package documents that String may
// be called on a zero-valued receiver such as a nil pointer.
func (f *repeatedFlag) String() string {
	if f == nil {
		return ""
	}
	return strings.Join(*f, ",")
}

// Set appends value to the collected values. It never fails.
func (f *repeatedFlag) Set(value string) error {
	*f = append(*f, value)
	return nil
}
|
||||
|
||||
// getenv returns the whitespace-trimmed value of the environment variable
// key, or fallback when the variable is unset or blank after trimming.
func getenv(key, fallback string) string {
	trimmed := strings.TrimSpace(os.Getenv(key))
	if trimmed == "" {
		return fallback
	}
	return trimmed
}
|
||||
|
||||
// getenvBool interprets the environment variable key as a boolean.
//
// After trimming and lower-casing, "1", "true", "yes", "y" and "on" mean
// true; "0", "false", "no", "n" and "off" mean false. Any other value,
// including an unset or empty variable, yields fallback.
func getenvBool(key string, fallback bool) bool {
	normalized := strings.ToLower(strings.TrimSpace(os.Getenv(key)))
	for _, truthy := range [...]string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true
		}
	}
	for _, falsy := range [...]string{"0", "false", "no", "n", "off"} {
		if normalized == falsy {
			return false
		}
	}
	return fallback
}
|
||||
|
||||
// getenvInt parses the environment variable key as a decimal int.
//
// It returns fallback when the variable is unset, blank, not an integer, or
// an integer followed by anything else (for example "12abc" or "1.5"), so a
// malformed setting is never silently truncated to its numeric prefix.
func getenvInt(key string, fallback int) int {
	var (
		out      int
		trailing string
	)
	// "%d%s" scans the integer and then tries to scan one more token.
	// Exactly one scanned item means the integer was the whole input.
	if n, _ := fmt.Sscanf(strings.TrimSpace(os.Getenv(key)), "%d%s", &out, &trailing); n == 1 {
		return out
	}
	return fallback
}
|
||||
|
||||
// getenvInt64 parses the environment variable key as a decimal int64.
//
// It returns fallback when the variable is unset, blank, not an integer, or
// an integer followed by anything else (for example "12abc" or "1.5"), so a
// malformed setting is never silently truncated to its numeric prefix.
func getenvInt64(key string, fallback int64) int64 {
	var (
		out      int64
		trailing string
	)
	// "%d%s" scans the integer and then tries to scan one more token.
	// Exactly one scanned item means the integer was the whole input.
	if n, _ := fmt.Sscanf(strings.TrimSpace(os.Getenv(key)), "%d%s", &out, &trailing); n == 1 {
		return out
	}
	return fallback
}
|
||||
|
||||
// getenvFloat parses the environment variable key as a float64.
//
// It returns fallback when the variable is unset, blank, not a number, or a
// number followed by anything else (for example "0.5x"), so a malformed
// setting is never silently truncated to its numeric prefix.
func getenvFloat(key string, fallback float64) float64 {
	var (
		out      float64
		trailing string
	)
	// "%f%s" scans the number and then tries to scan one more token.
	// Exactly one scanned item means the number was the whole input.
	if n, _ := fmt.Sscanf(strings.TrimSpace(os.Getenv(key)), "%f%s", &out, &trailing); n == 1 {
		return out
	}
	return fallback
}
|
||||
|
||||
// shellJoin renders args as a single space-separated command line.
//
// An argument that is empty or contains a space, tab, single quote or double
// quote is wrapped in double quotes, with embedded double quotes escaped as
// \". Empty arguments are rendered as "" so they stay visible as separate
// arguments instead of disappearing from the output.
func shellJoin(args []string) string {
	parts := make([]string, len(args))
	for i, arg := range args {
		if arg == "" || strings.ContainsAny(arg, " \t\"'") {
			parts[i] = `"` + strings.ReplaceAll(arg, `"`, `\"`) + `"`
			continue
		}
		parts[i] = arg
	}
	return strings.Join(parts, " ")
}
|
||||
|
||||
// usage writes the rap-host-agent command synopsis to standard error: one
// line per subcommand (install, install-windows, install-linux,
// install-updater, update-host-agent, update-host-agent-loop, update,
// update-loop and status) together with the flags each one expects.
func usage() {
|
||||
fmt.Fprintln(os.Stderr, `usage:
|
||||
rap-host-agent install -profile-url URL -install-token TOKEN [-node-name NAME] [docker options]
|
||||
rap-host-agent install -backend-url URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options]
|
||||
rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options]
|
||||
rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options]
|
||||
rap-host-agent install-updater -backend-url URL -cluster-id ID -state-dir DIR -container-name NAME
|
||||
rap-host-agent update-host-agent -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent update-host-agent-loop -backend-url URL -cluster-id ID -state-dir DIR
|
||||
rap-host-agent update -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent update-loop -backend-url URL -cluster-id ID -node-id ID [-container-name NAME]
|
||||
rap-host-agent status [-container-name NAME]`)
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -78,6 +78,202 @@ func TestLoadSyntheticMeshConfigPrefersScopedFile(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticMeshConfigAuthorityHashUsesRawConfigPayload(t *testing.T) {
|
||||
raw := json.RawMessage(`{
|
||||
"enabled": true,
|
||||
"schema_version": "c18z-test.synthetic.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"local_node_id": "node-a",
|
||||
"authority_required": true,
|
||||
"cluster_authority": {"schema_version":"rap.cluster_authority.v1"},
|
||||
"authority_payload": {"ignored": true},
|
||||
"authority_signature": {"ignored": true},
|
||||
"config_version": "config-1",
|
||||
"peer_endpoints": {},
|
||||
"routes": [],
|
||||
"production_forwarding": true,
|
||||
"future_backend_field": {"must_remain_hash_visible": true}
|
||||
}`)
|
||||
var remote client.SyntheticMeshConfig
|
||||
if err := json.Unmarshal(raw, &remote); err != nil {
|
||||
t.Fatalf("unmarshal synthetic config: %v", err)
|
||||
}
|
||||
var unsigned map[string]json.RawMessage
|
||||
if err := json.Unmarshal(raw, &unsigned); err != nil {
|
||||
t.Fatalf("unmarshal unsigned map: %v", err)
|
||||
}
|
||||
delete(unsigned, "authority_payload")
|
||||
delete(unsigned, "authority_signature")
|
||||
unsignedRaw, err := json.Marshal(unsigned)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal unsigned map: %v", err)
|
||||
}
|
||||
want, err := agentauthority.HashRaw(unsignedRaw)
|
||||
if err != nil {
|
||||
t.Fatalf("hash unsigned map: %v", err)
|
||||
}
|
||||
got, err := syntheticMeshConfigAuthorityHash(remote)
|
||||
if err != nil {
|
||||
t.Fatalf("hash synthetic config: %v", err)
|
||||
}
|
||||
if got != want {
|
||||
t.Fatalf("hash = %s, want raw-preserving hash %s", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneConsumesRemediationCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
|
||||
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
|
||||
CommandID: "cmd-1",
|
||||
Action: "prefer_alternate_route",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ServiceClass: "vpn_packets",
|
||||
PrimaryRouteID: "route-primary",
|
||||
ReplacementRouteID: "route-alternate",
|
||||
Reason: "authorized_alternate_route_available",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 1 {
|
||||
t.Fatalf("decisions = %+v, want one remediation decision", decisions)
|
||||
}
|
||||
decision := decisions[0]
|
||||
if decision.RouteID != "route-primary" ||
|
||||
decision.ReplacementRouteID != "route-alternate" ||
|
||||
decision.RebuildStatus != "applied" ||
|
||||
decision.DecisionSource != "service_channel_remediation_command" ||
|
||||
decision.RebuildRequestID != "cmd-1" {
|
||||
t.Fatalf("unexpected remediation decision: %+v", decision)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneConsumesRebuildRouteCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
|
||||
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
|
||||
CommandID: "cmd-rebuild",
|
||||
Action: "rebuild_route",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ServiceClass: "vpn_packets",
|
||||
PrimaryRouteID: "route-primary",
|
||||
Reason: "route_feedback_recommends_rebuild",
|
||||
GuardStatus: "allowed",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 1 {
|
||||
t.Fatalf("decisions = %+v, want one rebuild remediation decision", decisions)
|
||||
}
|
||||
decision := decisions[0]
|
||||
if decision.RouteID != "route-primary" ||
|
||||
decision.RebuildStatus != "pending_degraded_fallback" ||
|
||||
decision.DecisionSource != "service_channel_remediation_command" ||
|
||||
decision.RebuildRequestID != "cmd-rebuild" {
|
||||
t.Fatalf("unexpected rebuild remediation decision: %+v", decision)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneRejectsGuardedRemediationCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
decisions := routeManagerDecisionsFromControlPlane(nil, []client.FabricServiceChannelRemediationCommand{{
|
||||
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
|
||||
CommandID: "cmd-guarded",
|
||||
Action: "prefer_alternate_route",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ServiceClass: "vpn_packets",
|
||||
PrimaryRouteID: "route-primary",
|
||||
ReplacementRouteID: "route-outside-policy",
|
||||
GuardStatus: "rejected",
|
||||
GuardReason: "replacement_exit_outside_signed_pool_policy",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 0 {
|
||||
t.Fatalf("guarded remediation command must not reach route-manager: %+v", decisions)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneKeepsExplicitRemediationCommand(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{{
|
||||
RouteID: "route-primary",
|
||||
ReplacementRouteID: "route-alternate",
|
||||
RebuildRequestID: "feedback-rebuild",
|
||||
RebuildStatus: "applied",
|
||||
RebuildReason: "service_channel_feedback_rebuild_applied_to_alternate",
|
||||
DecisionSource: "service_channel_feedback_replacement",
|
||||
Generation: "gen-1",
|
||||
}}}
|
||||
decisions := routeManagerDecisionsFromControlPlane(report, []client.FabricServiceChannelRemediationCommand{{
|
||||
CommandID: "cmd-1",
|
||||
Action: "prefer_alternate_route",
|
||||
PrimaryRouteID: "route-primary",
|
||||
ReplacementRouteID: "route-alternate",
|
||||
Reason: "authorized_alternate_route_available",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 2 {
|
||||
t.Fatalf("decisions = %+v, want feedback and explicit remediation command", decisions)
|
||||
}
|
||||
if decisions[1].DecisionSource != "service_channel_remediation_command" || decisions[1].RebuildRequestID != "cmd-1" {
|
||||
t.Fatalf("remediation command was not kept as explicit route-manager input: %+v", decisions)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRouteManagerDecisionsFromControlPlaneSkipsCommandAlreadyResolvedByPlanner(t *testing.T) {
|
||||
now := time.Now().UTC()
|
||||
report := &client.RoutePathDecisionReport{Decisions: []client.RoutePathDecision{{
|
||||
RouteID: "route-primary",
|
||||
ReplacementRouteID: "route-planner",
|
||||
RebuildRequestID: "cmd-rebuild",
|
||||
RebuildStatus: "applied",
|
||||
RebuildReason: "remediation_rebuild_applied_to_alternate",
|
||||
DecisionSource: "service_channel_remediation_command",
|
||||
Generation: "config-c18z77",
|
||||
}}}
|
||||
decisions := routeManagerDecisionsFromControlPlane(report, []client.FabricServiceChannelRemediationCommand{{
|
||||
CommandID: "cmd-rebuild",
|
||||
Action: "rebuild_route",
|
||||
PrimaryRouteID: "route-primary",
|
||||
Reason: "route_feedback_recommends_rebuild",
|
||||
GuardStatus: "allowed",
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
}})
|
||||
if len(decisions) != 1 {
|
||||
t.Fatalf("decisions = %+v, want only planner-resolved decision", decisions)
|
||||
}
|
||||
if decisions[0].RebuildStatus != "applied" || decisions[0].ReplacementRouteID != "route-planner" {
|
||||
t.Fatalf("unexpected planner decision: %+v", decisions[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricServiceChannelAccessStatsReportsDataPlaneViolations(t *testing.T) {
|
||||
stats := newFabricServiceChannelAccessStats()
|
||||
stats.Observe(mesh.FabricServiceChannelAccessLogEntry{
|
||||
Event: "fabric_service_channel_data_plane_violation",
|
||||
ClusterID: "cluster-1",
|
||||
ChannelID: "channel-1",
|
||||
ResourceID: "vpn-1",
|
||||
BackendRelayPolicy: "disabled",
|
||||
ViolationStatus: "fabric_route_send_failed_backend_fallback_blocked",
|
||||
ViolationReason: "mesh synthetic route not found",
|
||||
OccurredAt: time.Unix(10, 0).UTC(),
|
||||
})
|
||||
report := stats.Report(time.Unix(20, 0).UTC())
|
||||
if report["backend_fallback_blocked"] != int64(1) ||
|
||||
report["fabric_route_send_failure"] != int64(1) ||
|
||||
report["last_data_plane_violation_status"] != "fabric_route_send_failed_backend_fallback_blocked" ||
|
||||
report["last_data_plane_violation_reason"] != "mesh synthetic route not found" {
|
||||
t.Fatalf("unexpected violation report: %+v", report)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyEnrollmentBootstrapAcceptsSignedApproval(t *testing.T) {
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
@@ -134,6 +330,134 @@ func TestVerifyEnrollmentBootstrapAcceptsSignedApproval(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyControlPlaneSyntheticMeshConfigAcceptsSignedServiceChannelFeedback(t *testing.T) {
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("generate key: %v", err)
|
||||
}
|
||||
publicKeyB64 := base64.StdEncoding.EncodeToString(publicKey)
|
||||
fingerprint := agentauthority.Fingerprint(publicKey)
|
||||
now := time.Now().UTC()
|
||||
remote := client.SyntheticMeshConfig{
|
||||
Enabled: true,
|
||||
SchemaVersion: "c17z18.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
AuthorityRequired: true,
|
||||
ClusterAuthority: &client.ClusterAuthorityDescriptor{
|
||||
SchemaVersion: agentauthority.AuthoritySchemaVersion,
|
||||
ClusterID: "cluster-1",
|
||||
AuthorityState: "authoritative",
|
||||
KeyAlgorithm: agentauthority.AlgorithmEd25519,
|
||||
PublicKey: publicKeyB64,
|
||||
PublicKeyFingerprint: fingerprint,
|
||||
},
|
||||
ConfigVersion: "config-v1",
|
||||
PeerDirectoryVersion: "config-v1",
|
||||
PolicyVersion: "config-v1",
|
||||
PeerEndpoints: map[string]string{},
|
||||
PeerEndpointCandidates: map[string][]client.PeerEndpointCandidate{},
|
||||
PeerDirectory: []client.PeerDirectoryEntry{},
|
||||
RecoverySeeds: []client.PeerRecoverySeed{},
|
||||
RendezvousLeases: []client.PeerRendezvousLease{},
|
||||
RoutePathDecisions: &client.RoutePathDecisionReport{
|
||||
SchemaVersion: "c17z18.route_path_decisions.v1",
|
||||
DecisionMode: "control_plane_effective_path_from_relay_policy_and_service_channel_feedback",
|
||||
Generation: "config-v1",
|
||||
DecisionCount: 1,
|
||||
ReplacementDecisionCount: 1,
|
||||
RebuildRequestCount: 1,
|
||||
RebuildAppliedCount: 1,
|
||||
ControlPlaneOnly: true,
|
||||
Decisions: []client.RoutePathDecision{{
|
||||
DecisionID: "route-ab-path-node-a-service-channel-feedback",
|
||||
RouteID: "route-ab",
|
||||
ReplacementRouteID: "route-ac",
|
||||
RebuildRequestID: "route-ab-node-a-config-v1-rebuild",
|
||||
RebuildStatus: "applied",
|
||||
RebuildReason: "service_channel_feedback_rebuild_applied_to_alternate",
|
||||
RebuildAttempt: 2,
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-b",
|
||||
OriginalHops: []string{"node-a", "node-b"},
|
||||
EffectiveHops: []string{"node-a", "node-c", "node-b"},
|
||||
LocalRole: "source",
|
||||
DecisionSource: "service_channel_feedback_replacement",
|
||||
Generation: "config-v1",
|
||||
PathScore: 1000,
|
||||
ScoreReasons: []string{"service_channel_rebuild_applied"},
|
||||
ControlPlaneOnly: true,
|
||||
ExpiresAt: now.Add(30 * time.Second),
|
||||
}},
|
||||
},
|
||||
ServiceChannelFeedback: &client.FabricServiceChannelFeedbackReport{
|
||||
SchemaVersion: "c18n.fabric_service_channel_route_feedback_report.v1",
|
||||
GeneratedAt: now,
|
||||
FeedbackMaxAgeSeconds: 30,
|
||||
ObservationCount: 1,
|
||||
FencedRouteCount: 1,
|
||||
Observations: []client.FabricServiceChannelFeedbackObservation{{
|
||||
ClusterID: "cluster-1",
|
||||
ReporterNodeID: "node-a",
|
||||
RouteID: "route-ab",
|
||||
ServiceClass: "vpn_packets",
|
||||
FeedbackStatus: "fenced",
|
||||
ScoreAdjustment: -1000,
|
||||
Reasons: []string{"route_rebuild_recommended"},
|
||||
ConsecutiveFailures: 2,
|
||||
Payload: json.RawMessage(`{"route_rebuild_recommended":true}`),
|
||||
ObservedAt: now,
|
||||
ExpiresAt: now.Add(30 * time.Second),
|
||||
}},
|
||||
},
|
||||
MeshListener: nil,
|
||||
Routes: []client.SyntheticMeshRouteConfig{},
|
||||
ProductionForwarding: false,
|
||||
}
|
||||
configHash, err := syntheticMeshConfigAuthorityHash(remote)
|
||||
if err != nil {
|
||||
t.Fatalf("config hash: %v", err)
|
||||
}
|
||||
payload, err := json.Marshal(controlPlaneMeshConfigAuthorityPayload{
|
||||
SchemaVersion: "rap.cluster.mesh_config_snapshot.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
ConfigVersion: "config-v1",
|
||||
ConfigSHA256: configHash,
|
||||
IssuedAt: now,
|
||||
ExpiresAt: now.Add(time.Hour),
|
||||
ControlPlaneOnly: true,
|
||||
ProductionForwarding: false,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("marshal payload: %v", err)
|
||||
}
|
||||
canonical, err := agentauthority.CanonicalJSON(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("canonical json: %v", err)
|
||||
}
|
||||
remote.AuthorityPayload = payload
|
||||
remote.AuthoritySignature = &client.ClusterSignature{
|
||||
SchemaVersion: agentauthority.SignatureSchemaVersion,
|
||||
Algorithm: agentauthority.AlgorithmEd25519,
|
||||
KeyFingerprint: fingerprint,
|
||||
Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)),
|
||||
SignedAt: now,
|
||||
}
|
||||
|
||||
err = verifyControlPlaneSyntheticMeshConfig(remote, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
ClusterAuthorityPublicKey: publicKeyB64,
|
||||
ClusterAuthorityFingerprint: fingerprint,
|
||||
}, config.Config{})
|
||||
if err != nil {
|
||||
t.Fatalf("verify control-plane synthetic mesh config: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
|
||||
bootstrap := client.NodeBootstrap{
|
||||
NodeID: "node-1",
|
||||
@@ -155,6 +479,54 @@ func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnsureApprovedIdentityKeepsPollingWhenTimeoutDisabled(t *testing.T) {
|
||||
var bootstrapPolls int
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.URL.Path == "/node-agents/enroll":
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"status": "pending",
|
||||
"join_request": map[string]any{"id": "join-request-1"},
|
||||
})
|
||||
case r.URL.Path == "/node-agents/enrollments/join-request-1/bootstrap":
|
||||
bootstrapPolls++
|
||||
if bootstrapPolls >= 2 {
|
||||
cancel()
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"status": "pending",
|
||||
"join_request": map[string]any{"id": "join-request-1"},
|
||||
})
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
dir := t.TempDir()
|
||||
identity, err := state.LoadOrCreate(dir, "cluster-1", "node-a")
|
||||
if err != nil {
|
||||
t.Fatalf("load identity: %v", err)
|
||||
}
|
||||
_, err = ensureApprovedIdentity(ctx, config.Config{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-token",
|
||||
NodeName: "node-a",
|
||||
StateDir: dir,
|
||||
EnrollmentPollInterval: time.Millisecond,
|
||||
EnrollmentPollTimeout: 0,
|
||||
}, identity, client.New(server.URL))
|
||||
if err == nil || !strings.Contains(err.Error(), "context canceled") {
|
||||
t.Fatalf("ensureApprovedIdentity err = %v, want context canceled", err)
|
||||
}
|
||||
if bootstrapPolls < 2 {
|
||||
t.Fatalf("bootstrap polls = %d, want at least 2", bootstrapPolls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticQualityScoreIsBounded(t *testing.T) {
|
||||
cases := []struct {
|
||||
latency int
|
||||
@@ -209,6 +581,168 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadReportsMeshListenerFailureWithoutKillingHeartbeat(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC)
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshConnectivityMode: "private_lan",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, &syntheticMeshState{
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
ConfiguredListenAddr: ":19131",
|
||||
ListenPortMode: "manual",
|
||||
Status: "listen_failed",
|
||||
InboundReachability: "unavailable",
|
||||
ControlPlaneReachable: true,
|
||||
OneWayConnectivity: true,
|
||||
FailureReason: "bind_failed",
|
||||
FailureError: "listen tcp :19131: bind: address already in use",
|
||||
PortConflict: true,
|
||||
},
|
||||
}, now)
|
||||
|
||||
report, ok := payload.Metadata["mesh_listener_report"].(meshListenerReport)
|
||||
if !ok {
|
||||
t.Fatalf("mesh listener report missing: %+v", payload.Metadata)
|
||||
}
|
||||
if payload.HealthStatus != "warning" || report.Status != "listen_failed" || !report.PortConflict {
|
||||
t.Fatalf("unexpected listener health report: status=%s report=%+v", payload.HealthStatus, report)
|
||||
}
|
||||
if payload.Capabilities["mesh_listener_diagnostics"] != true || payload.Capabilities["mesh_one_way_connectivity"] != true {
|
||||
t.Fatalf("listener capabilities missing: %+v", payload.Capabilities)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAdvertisedEndpointCandidatesPreferManualEndpoints(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC)
|
||||
candidates, err := advertisedEndpointCandidates(config.Config{
|
||||
MeshAdvertiseEndpointsJSON: `[{"endpoint_id":"node-a-json","node_id":"node-a","transport":"direct_http","address":"http://10.10.10.10:19131","priority":12,"connectivity_mode":"private_lan","reachability":"private"}]`,
|
||||
MeshAdvertiseEndpoint: "http://203.0.113.10:19131",
|
||||
MeshAdvertiseTransport: "direct_http",
|
||||
MeshConnectivityMode: "direct",
|
||||
MeshNATType: "port_restricted",
|
||||
MeshRegion: "edge",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, nil, now)
|
||||
if err != nil {
|
||||
t.Fatalf("advertised endpoint candidates failed: %v", err)
|
||||
}
|
||||
if len(candidates) != 2 {
|
||||
t.Fatalf("expected two manual candidates, got %d: %+v", len(candidates), candidates)
|
||||
}
|
||||
if candidates[0].Address != "http://203.0.113.10:19131" || candidates[0].Priority != 10 {
|
||||
t.Fatalf("explicit advertise endpoint must win: %+v", candidates)
|
||||
}
|
||||
if candidates[1].Address != "http://10.10.10.10:19131" || candidates[1].Priority != 12 {
|
||||
t.Fatalf("json candidate order mismatch: %+v", candidates)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNetworkInterfaceClassificationSkipsContainerNoise(t *testing.T) {
|
||||
tests := map[string]string{
|
||||
"ens160": "physical",
|
||||
"wg0": "vpn",
|
||||
"tailscale0": "vpn",
|
||||
"docker0": "container",
|
||||
"br-a1b2c3d4": "container",
|
||||
"vethabc123": "container",
|
||||
}
|
||||
for name, want := range tests {
|
||||
if got := classifyNetworkInterface(name); got != want {
|
||||
t.Fatalf("classifyNetworkInterface(%q)=%q, want %q", name, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadTreatsOutboundOnlyListenerFailureAsOneWayConnectivity(t *testing.T) {
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshConnectivityMode: "outbound_only",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, &syntheticMeshState{
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
ConfiguredListenAddr: ":19131",
|
||||
ListenPortMode: "manual",
|
||||
Status: "listen_failed",
|
||||
InboundReachability: "unavailable",
|
||||
ControlPlaneReachable: true,
|
||||
OneWayConnectivity: true,
|
||||
FailureReason: "bind_failed",
|
||||
},
|
||||
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
|
||||
|
||||
if payload.HealthStatus != "healthy" {
|
||||
t.Fatalf("HealthStatus = %q, want healthy for outbound-only listener failure", payload.HealthStatus)
|
||||
}
|
||||
report, ok := payload.Metadata["mesh_outbound_session_report"].(meshOutboundSessionReport)
|
||||
if !ok {
|
||||
t.Fatalf("mesh outbound session report missing: %+v", payload.Metadata)
|
||||
}
|
||||
if report.Status != "ready" || !report.UsableForInboundControl || report.ListenerStatus != "listen_failed" {
|
||||
t.Fatalf("unexpected outbound session report: %+v", report)
|
||||
}
|
||||
if payload.Capabilities["mesh_outbound_control_session"] != true ||
|
||||
payload.Capabilities["mesh_reverse_control_channel_contract"] != true {
|
||||
t.Fatalf("outbound session capabilities missing: %+v", payload.Capabilities)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadReportsMeshConfigLoadFailureWithoutDroppingPresence(t *testing.T) {
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshConnectivityMode: "private_lan",
|
||||
}, state.Identity{
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-a",
|
||||
}, &syntheticMeshState{
|
||||
ConfigLoadError: "control-plane synthetic mesh config unavailable",
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
ConfiguredListenAddr: ":19131",
|
||||
ListenPortMode: "manual",
|
||||
Status: "listening",
|
||||
InboundReachability: "private",
|
||||
ControlPlaneReachable: true,
|
||||
},
|
||||
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
|
||||
|
||||
report, ok := payload.Metadata["mesh_outbound_session_report"].(meshOutboundSessionReport)
|
||||
if !ok {
|
||||
t.Fatalf("mesh outbound session report missing: %+v", payload.Metadata)
|
||||
}
|
||||
if payload.HealthStatus != "warning" || report.Status != "degraded" || report.ConfigLoadError == "" {
|
||||
t.Fatalf("unexpected config-load diagnostic heartbeat: health=%s report=%+v", payload.HealthStatus, report)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOutboundSessionReportTreatsListeningPrivateLANAsUsable(t *testing.T) {
|
||||
report := meshOutboundSessionReportFromState(config.Config{
|
||||
BackendURL: "http://control/api/v1",
|
||||
MeshConnectivityMode: "private_lan",
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
}, &syntheticMeshState{
|
||||
ListenerReport: meshListenerReport{
|
||||
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
||||
Status: "listening",
|
||||
InboundReachability: reachabilityFromConnectivityMode("private_lan"),
|
||||
},
|
||||
}, time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC))
|
||||
|
||||
if !report.UsableForInboundControl {
|
||||
t.Fatalf("listening private LAN listener must be usable: %+v", report)
|
||||
}
|
||||
if reachabilityFromConnectivityMode("private_lan") != "private" {
|
||||
t.Fatalf("private_lan reachability mismatch")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadReportsMultipleMeshEndpoints(t *testing.T) {
|
||||
payload := heartbeatPayload(config.Config{
|
||||
MeshAdvertiseEndpointsJSON: `[{
|
||||
@@ -1050,17 +1584,36 @@ func TestProductionEnvelopeObservationSinkFromConfigCreatesBoundedSink(t *testin
|
||||
func TestProductionForwardingLogStateDistinguishesGateFromRuntime(t *testing.T) {
|
||||
gateEnabled, runtimeEnabled := productionForwardingLogState(config.Config{
|
||||
MeshProductionForwardingEnabled: true,
|
||||
})
|
||||
}, false)
|
||||
if !gateEnabled {
|
||||
t.Fatal("gateEnabled = false, want true")
|
||||
}
|
||||
if !runtimeEnabled {
|
||||
t.Fatal("runtimeEnabled = false, want true")
|
||||
}
|
||||
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{})
|
||||
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{}, false)
|
||||
if gateEnabled || runtimeEnabled {
|
||||
t.Fatalf("default log state = gate:%t runtime:%t, want false/false", gateEnabled, runtimeEnabled)
|
||||
}
|
||||
gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{}, true)
|
||||
if !gateEnabled || !runtimeEnabled {
|
||||
t.Fatalf("signed control-plane log state = gate:%t runtime:%t, want true/true", gateEnabled, runtimeEnabled)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshLinkStatusFromPeerProbeMapsDeferredForLatestLinks(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
mesh.PeerConnectionProbeReachable: "reachable",
|
||||
mesh.PeerConnectionProbeUnreachable: "unreachable",
|
||||
mesh.PeerConnectionProbeDeferred: "degraded",
|
||||
mesh.PeerConnectionProbeSkipped: "unknown",
|
||||
"unexpected": "unknown",
|
||||
}
|
||||
for input, want := range cases {
|
||||
if got := meshLinkStatusFromPeerProbe(input); got != want {
|
||||
t.Fatalf("meshLinkStatusFromPeerProbe(%q) = %q, want %q", input, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLogProductionObservationSinkMetricsToleratesNilState(t *testing.T) {
|
||||
|
||||
@@ -1,3 +1,14 @@
|
||||
module github.com/example/remote-access-platform/agents/rap-node-agent
|
||||
|
||||
go 1.23.2
|
||||
go 1.25.5
|
||||
|
||||
require golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb
|
||||
|
||||
require (
|
||||
github.com/gorilla/websocket v1.5.3 // indirect
|
||||
golang.org/x/net v0.53.0 // indirect
|
||||
golang.org/x/sys v0.43.0 // indirect
|
||||
golang.org/x/time v0.15.0 // indirect
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
|
||||
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943 // indirect
|
||||
)
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
|
||||
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
|
||||
golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA=
|
||||
golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs=
|
||||
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
|
||||
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
|
||||
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg=
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI=
|
||||
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb h1:whnFRlWMcXI9d+ZbWg+4sHnLp52d5yiIPUxMBSt4X9A=
|
||||
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw=
|
||||
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943 h1:YUPk0vGbex2+Jk7XXIgLIPG6oEAD9ml0x7wd6i/bmA4=
|
||||
gvisor.dev/gvisor v0.0.0-20260505022556-2306ef3db943/go.mod h1:xQ2PWgHmWJA/Ph4i1q1jBm39BKhc3W0DXqWoDSyuBOY=
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
const Version = "0.1.0-c3"
|
||||
const Version = "0.2.256-c18z82"
|
||||
|
||||
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
|
||||
return client.EnrollRequest{
|
||||
@@ -17,18 +17,26 @@ func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) cli
|
||||
NodeFingerprint: identity.NodeFingerprint,
|
||||
PublicKey: identity.PublicKey,
|
||||
ReportedCapabilities: map[string]any{
|
||||
"can_accept_client_ingress": false,
|
||||
"can_accept_node_ingress": false,
|
||||
"can_route_mesh": false,
|
||||
"can_run_rdp_worker": true,
|
||||
"can_run_vnc_worker": false,
|
||||
"can_run_vpn_exit": false,
|
||||
"can_run_vpn_connector": false,
|
||||
"can_run_file_cache": false,
|
||||
"can_run_update_cache": false,
|
||||
"can_run_video_relay": false,
|
||||
"native_node_agent_version": Version,
|
||||
"service_supervision_enabled": false,
|
||||
"can_accept_client_ingress": false,
|
||||
"can_accept_node_ingress": false,
|
||||
"can_route_mesh": false,
|
||||
"can_run_rdp_worker": true,
|
||||
"can_run_vnc_worker": false,
|
||||
"can_run_vpn_exit": true,
|
||||
"can_run_vpn_connector": true,
|
||||
"can_run_file_cache": false,
|
||||
"can_run_update_cache": false,
|
||||
"can_run_video_relay": false,
|
||||
"native_node_agent_version": Version,
|
||||
"node_update_plan_contract": "rap.node_update_plan.v1",
|
||||
"node_update_status_report": true,
|
||||
"host_agent_update_required": true,
|
||||
"service_supervision_enabled": false,
|
||||
"vpn_assignment_status": true,
|
||||
"vpn_packet_forwarding": true,
|
||||
"vpn_fabric_packet_transport": true,
|
||||
"vpn_local_gateway_shortcut": true,
|
||||
"external_backend_entry_proxy": true,
|
||||
},
|
||||
ReportedFacts: map[string]any{
|
||||
"os": runtime.GOOS,
|
||||
@@ -45,13 +53,28 @@ func HeartbeatPayload() client.HeartbeatRequest {
|
||||
HealthStatus: "healthy",
|
||||
ReportedVersion: Version,
|
||||
Capabilities: map[string]any{
|
||||
"native_node_agent": true,
|
||||
"native_node_agent": true,
|
||||
"node_update_plan_contract": "rap.node_update_plan.v1",
|
||||
"node_update_status_report": true,
|
||||
"vpn_assignment_status": true,
|
||||
"vpn_packet_forwarding": true,
|
||||
"vpn_fabric_packet_transport": true,
|
||||
"vpn_local_gateway_shortcut": true,
|
||||
"external_backend_entry_proxy": true,
|
||||
},
|
||||
ServiceStates: map[string]any{
|
||||
"workload_supervision": "not_implemented_c3",
|
||||
},
|
||||
Metadata: map[string]any{
|
||||
"stage": "c3",
|
||||
"update_runtime": map[string]any{
|
||||
"product": "rap-node-agent",
|
||||
"current_version": Version,
|
||||
"host_agent_present": true,
|
||||
"self_update_enabled": true,
|
||||
"rollback_executor_ready": true,
|
||||
"reason": "host-agent updater active",
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,6 +260,7 @@ type SyntheticMeshRouteConfig struct {
|
||||
}
|
||||
|
||||
type SyntheticMeshConfig struct {
|
||||
Raw json.RawMessage `json:"-"`
|
||||
Enabled bool `json:"enabled"`
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
@@ -286,6 +287,17 @@ type SyntheticMeshConfig struct {
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
}
|
||||
|
||||
func (c *SyntheticMeshConfig) UnmarshalJSON(data []byte) error {
|
||||
type syntheticMeshConfigAlias SyntheticMeshConfig
|
||||
var decoded syntheticMeshConfigAlias
|
||||
if err := json.Unmarshal(data, &decoded); err != nil {
|
||||
return err
|
||||
}
|
||||
*c = SyntheticMeshConfig(decoded)
|
||||
c.Raw = append(c.Raw[:0], data...)
|
||||
return nil
|
||||
}
|
||||
|
||||
type FabricServiceChannelRemediationCommand struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
CommandID string `json:"command_id"`
|
||||
|
||||
@@ -28,6 +28,9 @@ type Config struct {
|
||||
MeshProductionForwardingEnabled bool
|
||||
MeshProductionObservationSinkCapacity int
|
||||
MeshListenAddr string
|
||||
MeshListenPortMode string
|
||||
MeshListenAutoPortStart int
|
||||
MeshListenAutoPortEnd int
|
||||
MeshAdvertiseEndpoint string
|
||||
MeshAdvertiseEndpointsJSON string
|
||||
MeshAdvertiseTransport string
|
||||
@@ -58,6 +61,9 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
|
||||
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
|
||||
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls"), "Transport label for the advertised mesh endpoint.")
|
||||
@@ -70,7 +76,7 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
heartbeatSeconds := getEnvInt(env, "RAP_HEARTBEAT_INTERVAL_SECONDS", 15)
|
||||
fs.DurationVar(&cfg.HeartbeatInterval, "heartbeat-interval", time.Duration(heartbeatSeconds)*time.Second, "Heartbeat interval.")
|
||||
enrollmentPollIntervalSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5)
|
||||
enrollmentPollTimeoutSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600)
|
||||
enrollmentPollTimeoutSeconds := getEnvSignedInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0)
|
||||
fs.DurationVar(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", time.Duration(enrollmentPollIntervalSeconds)*time.Second, "Enrollment approval polling interval.")
|
||||
fs.DurationVar(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", time.Duration(enrollmentPollTimeoutSeconds)*time.Second, "Enrollment approval polling timeout.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
@@ -84,6 +90,7 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
|
||||
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
|
||||
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
|
||||
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
|
||||
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
|
||||
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
|
||||
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
|
||||
@@ -117,6 +124,20 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
|
||||
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
|
||||
}
|
||||
switch cfg.MeshListenPortMode {
|
||||
case "", "manual", "auto", "disabled":
|
||||
if cfg.MeshListenPortMode == "" {
|
||||
cfg.MeshListenPortMode = "manual"
|
||||
}
|
||||
default:
|
||||
return Config{}, errors.New("mesh listen port mode must be manual, auto, or disabled")
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart <= 0 || cfg.MeshListenAutoPortEnd <= 0 {
|
||||
return Config{}, errors.New("mesh listen auto port range must be positive")
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
|
||||
return Config{}, errors.New("mesh listen auto port start must be less than or equal to end")
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
|
||||
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
|
||||
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
|
||||
"RAP_MESH_LISTEN_PORT_MODE": "auto",
|
||||
"RAP_MESH_LISTEN_AUTO_PORT_START": "19010",
|
||||
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
|
||||
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
|
||||
@@ -65,6 +68,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
if cfg.MeshListenAddr != "127.0.0.1:19001" {
|
||||
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
|
||||
}
|
||||
if cfg.MeshListenPortMode != "auto" || cfg.MeshListenAutoPortStart != 19010 || cfg.MeshListenAutoPortEnd != 19020 {
|
||||
t.Fatalf("unexpected mesh listen port config: %+v", cfg)
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" ||
|
||||
cfg.MeshAdvertiseEndpointsJSON == "" ||
|
||||
cfg.MeshAdvertiseTransport != "wss" ||
|
||||
@@ -81,6 +87,19 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
|
||||
cfg, err := Load(nil, map[string]string{
|
||||
"RAP_BACKEND_URL": "http://backend/api/v1",
|
||||
"RAP_NODE_NAME": "node-a",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("load config: %v", err)
|
||||
}
|
||||
if cfg.EnrollmentPollTimeout != 0 {
|
||||
t.Fatalf("EnrollmentPollTimeout = %s, want no timeout", cfg.EnrollmentPollTimeout)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
|
||||
_, err := Load(nil, map[string]string{
|
||||
"RAP_BACKEND_URL": "http://backend/api/v1",
|
||||
|
||||
@@ -0,0 +1,135 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Defaults applied by RuntimeConfig.Normalize when the corresponding field is
// empty.
const (
	// DefaultContainerName is the fallback Docker container name.
	DefaultContainerName = "rap-node-agent"
	// DefaultImage is the fallback Docker image reference.
	DefaultImage = "rap-node-agent:latest"
	// DefaultStateDir is the fallback state directory.
	DefaultStateDir = "/var/lib/rap-node-agent"
	// DefaultNetwork is the fallback Docker network.
	DefaultNetwork = "host"
)

// RuntimeConfig collects the settings the host agent uses to install and run
// the node agent container. Call Normalize to trim values and fill defaults,
// and ValidateInstall to check that an install can proceed.
type RuntimeConfig struct {
	BackendURL                      string
	ClusterID                       string
	JoinToken                       string
	NodeName                        string
	Image                           string
	ContainerName                   string
	StateDir                        string
	Network                         string
	RestartPolicy                   string
	PullImage                       bool
	Replace                         bool
	DockerVPNGatewayEnabled         bool
	WorkloadSupervisionEnabled      bool
	MeshSyntheticRuntimeEnabled     bool
	MeshProductionForwardingEnabled bool
	MeshListenAddr                  string
	MeshListenPortMode              string
	MeshListenAutoPortStart         int
	MeshListenAutoPortEnd           int
	MeshAdvertiseEndpoint           string
	MeshAdvertiseEndpointsJSON      string
	MeshAdvertiseTransport          string
	MeshConnectivityMode            string
	MeshNATType                     string
	MeshRegion                      string
	HeartbeatIntervalSeconds        int
	EnrollmentPollIntervalSeconds   int
	EnrollmentPollTimeoutSeconds    int
	// ExtraEnv entries must be KEY=VALUE with a non-empty key; see
	// ValidateInstall.
	ExtraEnv                     []string
	AdditionalDockerRunArgs      []string
	ProductionObservationSinkCap int
	ImageArtifactURLs            []string
	ImageArtifactSHA256          string
	ImageArtifactSizeBytes       int64
}

// Normalize returns a copy of cfg with string settings trimmed of surrounding
// whitespace, trailing slashes removed from BackendURL and
// MeshAdvertiseEndpoint, MeshListenPortMode lower-cased, and defaults filled
// in for Image, ContainerName, StateDir, Network and RestartPolicy. A zero
// heartbeat interval becomes 15 seconds and a zero enrollment poll interval
// becomes 5 seconds; EnrollmentPollTimeoutSeconds is left as given.
func (cfg RuntimeConfig) Normalize() RuntimeConfig {
	cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
	cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
	cfg.JoinToken = strings.TrimSpace(cfg.JoinToken)
	cfg.NodeName = strings.TrimSpace(cfg.NodeName)

	cfg.Image = firstNonEmpty(cfg.Image, DefaultImage)
	cfg.ContainerName = firstNonEmpty(cfg.ContainerName, DefaultContainerName)
	cfg.StateDir = firstNonEmpty(cfg.StateDir, DefaultStateDir)
	cfg.Network = firstNonEmpty(cfg.Network, DefaultNetwork)
	cfg.RestartPolicy = firstNonEmpty(cfg.RestartPolicy, "unless-stopped")

	cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
	cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
	cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
	cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
	cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
	cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
	cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
	cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
	cfg.ImageArtifactSHA256 = strings.TrimSpace(cfg.ImageArtifactSHA256)

	if cfg.HeartbeatIntervalSeconds == 0 {
		cfg.HeartbeatIntervalSeconds = 15
	}
	if cfg.EnrollmentPollIntervalSeconds == 0 {
		cfg.EnrollmentPollIntervalSeconds = 5
	}
	return cfg
}

// ValidateInstall reports whether the normalized configuration is sufficient
// for an install. It returns an error when:
//   - backend URL, cluster ID or node name is missing (all missing names are
//     listed in one error);
//   - the join token is empty and Replace is not set;
//   - an interval is not positive, or the poll timeout or sink capacity is
//     negative;
//   - the mesh listen port mode is not "", "manual", "auto" or "disabled";
//   - the auto port range is negative, or both ends are set and start > end;
//   - an ExtraEnv entry is not KEY=VALUE with a non-empty key.
func (cfg RuntimeConfig) ValidateInstall() error {
	cfg = cfg.Normalize()

	var missing []string
	if cfg.BackendURL == "" {
		missing = append(missing, "backend-url")
	}
	if cfg.ClusterID == "" {
		missing = append(missing, "cluster-id")
	}
	if cfg.NodeName == "" {
		missing = append(missing, "node-name")
	}
	if len(missing) > 0 {
		return fmt.Errorf("missing required install settings: %s", strings.Join(missing, ", "))
	}

	if cfg.JoinToken == "" && !cfg.Replace {
		return errors.New("join-token is required for first install; pass -replace only when updating an already enrolled local state")
	}
	if cfg.HeartbeatIntervalSeconds <= 0 {
		return errors.New("heartbeat interval must be positive")
	}
	if cfg.EnrollmentPollIntervalSeconds <= 0 {
		return errors.New("enrollment poll interval must be positive")
	}
	if cfg.EnrollmentPollTimeoutSeconds < 0 {
		return errors.New("enrollment poll timeout must not be negative")
	}

	switch cfg.MeshListenPortMode {
	case "", "manual", "auto", "disabled":
	default:
		return errors.New("mesh listen port mode must be manual, auto, or disabled")
	}
	if cfg.MeshListenAutoPortStart < 0 || cfg.MeshListenAutoPortEnd < 0 {
		return errors.New("mesh listen auto port range must not be negative")
	}
	// The ordering is only checked when both ends are set; a zero end or
	// start is not compared here.
	if cfg.MeshListenAutoPortStart > 0 && cfg.MeshListenAutoPortEnd > 0 && cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
		return errors.New("mesh listen auto port start must be less than or equal to end")
	}
	if cfg.ProductionObservationSinkCap < 0 {
		return errors.New("production observation sink capacity must not be negative")
	}

	for _, item := range cfg.ExtraEnv {
		// Require a non-empty key before the first "=": an entry such as
		// "=value" contains "=" but names no variable.
		key, _, found := strings.Cut(item, "=")
		if !found || strings.TrimSpace(key) == "" {
			return fmt.Errorf("extra env %q must be KEY=VALUE", item)
		}
	}
	return nil
}

// firstNonEmpty returns value with surrounding whitespace trimmed, or
// fallback (unmodified) when value is empty or whitespace-only.
func firstNonEmpty(value, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
||||
@@ -0,0 +1,335 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CommandRunner runs an external command and returns its output. It is the
// seam through which DockerManager invokes the docker binary.
type CommandRunner interface {
	Run(ctx context.Context, name string, args ...string) (string, error)
}

// ExecRunner is the CommandRunner implementation backed by os/exec.
type ExecRunner struct{}

// Run executes name with args under ctx and returns the combined stdout and
// stderr. On failure the output is still returned, and the error wraps the
// underlying exec error together with the command line and trimmed output.
//
// NOTE(review): the error text embeds every argument verbatim, so callers that
// pass sensitive values as arguments will surface them in error messages.
func (ExecRunner) Run(ctx context.Context, name string, args ...string) (string, error) {
	raw, runErr := exec.CommandContext(ctx, name, args...).CombinedOutput()
	output := string(raw)
	if runErr == nil {
		return output, nil
	}
	commandArgs := strings.Join(args, " ")
	return output, fmt.Errorf("%s %s: %w\n%s", name, commandArgs, runErr, strings.TrimSpace(output))
}
|
||||
|
||||
type DockerManager struct {
|
||||
Runner CommandRunner
|
||||
Binary string
|
||||
}
|
||||
|
||||
var statHostPath = os.Stat
|
||||
|
||||
type InstallResult struct {
|
||||
ContainerName string
|
||||
Image string
|
||||
Replaced bool
|
||||
Pulled bool
|
||||
Loaded bool
|
||||
ContainerID string
|
||||
}
|
||||
|
||||
func (m DockerManager) Install(ctx context.Context, cfg RuntimeConfig) (InstallResult, error) {
|
||||
if err := cfg.ValidateInstall(); err != nil {
|
||||
return InstallResult{}, err
|
||||
}
|
||||
cfg = cfg.Normalize()
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
docker := firstNonEmpty(m.Binary, "docker")
|
||||
result := InstallResult{ContainerName: cfg.ContainerName, Image: cfg.Image}
|
||||
|
||||
if err := PrepareStateDir(cfg.StateDir); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.DockerVPNGatewayEnabled {
|
||||
if err := ensureHostTunDevice(ctx, runner); err != nil {
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.PullImage {
|
||||
if _, err := runner.Run(ctx, docker, "pull", cfg.Image); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Pulled = true
|
||||
} else if len(cfg.ImageArtifactURLs) > 0 {
|
||||
loaded, err := m.ensureImageFromArtifact(ctx, runner, docker, cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Loaded = loaded
|
||||
}
|
||||
|
||||
if cfg.Replace {
|
||||
if _, err := runner.Run(ctx, docker, "rm", "-f", cfg.ContainerName); err != nil && !isNoSuchContainerError(err) {
|
||||
return result, err
|
||||
}
|
||||
result.Replaced = true
|
||||
}
|
||||
|
||||
args := DockerRunArgs(cfg)
|
||||
out, err := runner.Run(ctx, docker, args...)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.ContainerID = strings.TrimSpace(out)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func ensureHostTunDevice(ctx context.Context, runner CommandRunner) error {
|
||||
if _, err := statHostPath("/dev/net/tun"); err == nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := runner.Run(ctx, "modprobe", "tun"); err != nil {
|
||||
return fmt.Errorf("docker vpn gateway requires host /dev/net/tun; modprobe tun failed: %w", err)
|
||||
}
|
||||
if _, err := statHostPath("/dev/net/tun"); err != nil {
|
||||
return fmt.Errorf("docker vpn gateway requires host /dev/net/tun after modprobe tun: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m DockerManager) ensureImageFromArtifact(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig) (bool, error) {
|
||||
if _, err := runner.Run(ctx, docker, "image", "inspect", cfg.Image); err == nil && !cfg.Replace {
|
||||
return false, nil
|
||||
}
|
||||
path, err := downloadFirstArtifact(ctx, cfg.ImageArtifactURLs, cfg.ImageArtifactSHA256, cfg.ImageArtifactSizeBytes)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if _, err := runner.Run(ctx, docker, "load", "-i", path); err != nil {
|
||||
return false, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, docker, "image", "inspect", cfg.Image); err != nil {
|
||||
return true, fmt.Errorf("loaded artifact but image %q is not available: %w", cfg.Image, err)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// downloadFirstArtifact tries each non-blank URL in order, up to three
// attempts per URL, and returns the path of the first artifact that downloads
// and verifies successfully. The caller owns the returned temporary file.
//
// Retrying stops as soon as ctx is cancelled or its deadline passes, because
// no further attempt can succeed. In that case the most recent download error
// is returned if there is one, otherwise an error wrapping ctx.Err(). When no
// usable URL is configured the error is "no artifact URLs configured".
func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
	const attemptsPerURL = 3

	var lastErr error
	for _, rawURL := range urls {
		rawURL = strings.TrimSpace(rawURL)
		if rawURL == "" {
			continue
		}
		for attempt := 1; attempt <= attemptsPerURL; attempt++ {
			if ctxErr := ctx.Err(); ctxErr != nil {
				if lastErr != nil {
					return "", lastErr
				}
				return "", fmt.Errorf("download artifact %s: %w", rawURL, ctxErr)
			}
			path, err := downloadArtifact(ctx, rawURL, expectedSHA256, expectedSizeBytes)
			if err == nil {
				return path, nil
			}
			lastErr = err
		}
	}
	if lastErr != nil {
		return "", lastErr
	}
	return "", fmt.Errorf("no artifact URLs configured")
}

// downloadArtifact fetches rawURL into a new temporary file and returns its
// path. The download is rejected, and the temporary file removed, when:
//   - the response status is not 2xx;
//   - the response declared a Content-Length and fewer or more bytes arrived;
//   - expectedSizeBytes is positive, differs from the byte count, and a
//     checksum was supplied;
//   - expectedSHA256 is non-blank and does not match (case-insensitively) the
//     SHA-256 of the received bytes.
//
// A size mismatch without a checksum is only reported on stdout and the
// artifact is still accepted, for backward-compatible installs.
func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return "", fmt.Errorf("download artifact %s: %s", rawURL, resp.Status)
	}

	file, err := os.CreateTemp("", "rap-docker-image-*.tar")
	if err != nil {
		return "", fmt.Errorf("create temp file for artifact %s: %w", rawURL, err)
	}
	path := file.Name()

	// Remove the temporary file on every failure path from here on. The
	// removal is best-effort, so its error is intentionally ignored.
	keep := false
	defer func() {
		if !keep {
			_ = os.Remove(path)
		}
	}()

	// Hash while writing so the payload is read only once.
	hasher := sha256.New()
	written, copyErr := io.Copy(io.MultiWriter(file, hasher), resp.Body)
	closeErr := file.Close()
	if copyErr != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, copyErr)
	}
	if closeErr != nil {
		return "", fmt.Errorf("write artifact %s to %s: %w", rawURL, path, closeErr)
	}

	if resp.ContentLength >= 0 && written != resp.ContentLength {
		return "", fmt.Errorf("artifact download truncated for %s: got %d bytes want content-length %d", rawURL, written, resp.ContentLength)
	}

	expected := strings.TrimSpace(expectedSHA256)
	if expectedSizeBytes > 0 && written != expectedSizeBytes {
		if expected != "" {
			return "", fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", rawURL, written, expectedSizeBytes)
		}
		fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding without checksum for backward-compatible installs\n", rawURL, written, expectedSizeBytes)
	}

	actual := hex.EncodeToString(hasher.Sum(nil))
	if expected != "" && !strings.EqualFold(actual, expected) {
		return "", fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", rawURL, actual, expected)
	}

	keep = true
	return path, nil
}
|
||||
|
||||
func (m DockerManager) Status(ctx context.Context, containerName string) (string, error) {
|
||||
containerName = firstNonEmpty(containerName, DefaultContainerName)
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
docker := firstNonEmpty(m.Binary, "docker")
|
||||
return runner.Run(ctx, docker, "ps", "-a", "--filter", "name=^/"+containerName+"$", "--format", "{{.Names}}\t{{.Image}}\t{{.Status}}")
|
||||
}
|
||||
|
||||
func PrepareStateDir(stateDir string) error {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
if stateDir == "" || !looksLikeHostPath(stateDir) {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(stateDir, 0o777); err != nil {
|
||||
return fmt.Errorf("prepare state dir %q: %w", stateDir, err)
|
||||
}
|
||||
if err := os.Chmod(stateDir, 0o777); err != nil {
|
||||
if isAccessDenied(err) {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("chmod state dir %q: %w", stateDir, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func DockerRunArgs(cfg RuntimeConfig) []string {
|
||||
cfg = cfg.Normalize()
|
||||
args := []string{
|
||||
"run", "-d",
|
||||
"--name", cfg.ContainerName,
|
||||
"--restart", cfg.RestartPolicy,
|
||||
"--network", cfg.Network,
|
||||
"-v", cfg.StateDir + ":/var/lib/rap-node-agent",
|
||||
}
|
||||
if cfg.DockerVPNGatewayEnabled {
|
||||
args = append(args,
|
||||
"--privileged",
|
||||
"--cap-add", "NET_ADMIN",
|
||||
"--device", "/dev/net/tun:/dev/net/tun",
|
||||
)
|
||||
}
|
||||
args = append(args, cfg.AdditionalDockerRunArgs...)
|
||||
for _, env := range NodeAgentEnv(cfg) {
|
||||
args = append(args, "-e", env)
|
||||
}
|
||||
args = append(args, cfg.Image)
|
||||
return args
|
||||
}
|
||||
|
||||
func NodeAgentEnv(cfg RuntimeConfig) []string {
|
||||
return NodeAgentEnvWithStateDir(cfg, "/var/lib/rap-node-agent")
|
||||
}
|
||||
|
||||
func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
|
||||
cfg = cfg.Normalize()
|
||||
stateDir = firstNonEmpty(stateDir, cfg.StateDir)
|
||||
env := []string{
|
||||
"RAP_BACKEND_URL=" + cfg.BackendURL,
|
||||
"RAP_CLUSTER_ID=" + cfg.ClusterID,
|
||||
"RAP_NODE_NAME=" + cfg.NodeName,
|
||||
"RAP_NODE_STATE_DIR=" + stateDir,
|
||||
"RAP_HEARTBEAT_INTERVAL_SECONDS=" + strconv.Itoa(cfg.HeartbeatIntervalSeconds),
|
||||
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollIntervalSeconds),
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollTimeoutSeconds),
|
||||
"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
|
||||
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
|
||||
}
|
||||
if cfg.JoinToken != "" {
|
||||
env = append(env, "RAP_JOIN_TOKEN="+cfg.JoinToken)
|
||||
}
|
||||
if cfg.MeshListenAddr != "" {
|
||||
env = append(env, "RAP_MESH_LISTEN_ADDR="+cfg.MeshListenAddr)
|
||||
}
|
||||
if cfg.MeshListenPortMode != "" {
|
||||
env = append(env, "RAP_MESH_LISTEN_PORT_MODE="+cfg.MeshListenPortMode)
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart > 0 {
|
||||
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_START="+strconv.Itoa(cfg.MeshListenAutoPortStart))
|
||||
}
|
||||
if cfg.MeshListenAutoPortEnd > 0 {
|
||||
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_END="+strconv.Itoa(cfg.MeshListenAutoPortEnd))
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpoint != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINT="+cfg.MeshAdvertiseEndpoint)
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpointsJSON != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON="+cfg.MeshAdvertiseEndpointsJSON)
|
||||
}
|
||||
if cfg.MeshAdvertiseTransport != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_TRANSPORT="+cfg.MeshAdvertiseTransport)
|
||||
}
|
||||
if cfg.MeshConnectivityMode != "" {
|
||||
env = append(env, "RAP_MESH_CONNECTIVITY_MODE="+cfg.MeshConnectivityMode)
|
||||
}
|
||||
if cfg.MeshNATType != "" {
|
||||
env = append(env, "RAP_MESH_NAT_TYPE="+cfg.MeshNATType)
|
||||
}
|
||||
if cfg.MeshRegion != "" {
|
||||
env = append(env, "RAP_MESH_REGION="+cfg.MeshRegion)
|
||||
}
|
||||
if cfg.ProductionObservationSinkCap > 0 {
|
||||
env = append(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY="+strconv.Itoa(cfg.ProductionObservationSinkCap))
|
||||
}
|
||||
env = append(env, cfg.ExtraEnv...)
|
||||
return env
|
||||
}
|
||||
|
||||
// RedactedArgs returns a copy of args that is safe to log: every
// RAP_JOIN_TOKEN value passed to docker as an environment variable is
// replaced with "***". The input slice is never modified.
//
// All spellings docker accepts for an env flag are covered: "-e KEY=VAL",
// "--env KEY=VAL", and the inline "--env=KEY=VAL" / "-e=KEY=VAL" forms. The
// extra forms matter because operator-supplied additional run arguments are
// passed through verbatim and may use any of them.
func RedactedArgs(args []string) []string {
	const (
		tokenPrefix = "RAP_JOIN_TOKEN="
		masked      = tokenPrefix + "***"
	)

	out := append([]string(nil), args...)
	for i, arg := range out {
		switch {
		case arg == "-e" || arg == "--env":
			// Flag and value are separate arguments; mask the value.
			if i+1 < len(out) && strings.HasPrefix(out[i+1], tokenPrefix) {
				out[i+1] = masked
			}
		case strings.HasPrefix(arg, "--env="+tokenPrefix):
			out[i] = "--env=" + masked
		case strings.HasPrefix(arg, "-e="+tokenPrefix):
			out[i] = "-e=" + masked
		}
	}
	return out
}
|
||||
|
||||
// isNoSuchContainerError reports whether err's message contains
// "no such container" or "no such object", compared case-insensitively.
// A nil error is never a match.
func isNoSuchContainerError(err error) bool {
	if err == nil {
		// Guard: calling Error() on a nil interface would panic.
		return false
	}
	msg := strings.ToLower(err.Error())
	return strings.Contains(msg, "no such container") || strings.Contains(msg, "no such object")
}
|
||||
|
||||
// looksLikeHostPath reports whether value is shaped like a filesystem path
// rather than a bare name: it is absolute, starts with "." or "~", or
// contains a forward slash or backslash.
func looksLikeHostPath(value string) bool {
	switch {
	case filepath.IsAbs(value):
		return true
	case strings.HasPrefix(value, "."), strings.HasPrefix(value, "~"):
		return true
	default:
		return strings.ContainsAny(value, `/\`)
	}
}
|
||||
|
||||
// boolString renders value as the literal "true" or "false".
func boolString(value bool) string {
	return strconv.FormatBool(value)
}
|
||||
@@ -0,0 +1,366 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// recordingRunner is a test double used as DockerManager.Runner. It records
// every invocation and never fails.
type recordingRunner struct {
	// calls holds one entry per Run invocation: the command name followed by
	// its arguments.
	calls [][]string
}

// Run records the invocation. It returns "container-1\n" when the first
// argument is "run" and an empty string for everything else.
func (r *recordingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	invocation := make([]string, 0, len(args)+1)
	invocation = append(invocation, name)
	invocation = append(invocation, args...)
	r.calls = append(r.calls, invocation)

	if len(args) == 0 || args[0] != "run" {
		return "", nil
	}
	return "container-1\n", nil
}
|
||||
|
||||
// imageMissingRunner is a test double whose first `image inspect` call fails
// with "No such image"; later inspect calls succeed.
type imageMissingRunner struct {
	// calls holds one entry per Run invocation: the command name followed by
	// its arguments.
	calls [][]string
	// inspectSeen counts the `image inspect` invocations observed so far.
	inspectSeen int
}

// Run records the invocation and answers by command: the first
// `image inspect ...` fails and later ones return "[]", `run` returns
// "container-1\n", and anything else returns an empty string.
func (r *imageMissingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))

	switch {
	case len(args) >= 3 && args[0] == "image" && args[1] == "inspect":
		r.inspectSeen++
		if r.inspectSeen == 1 {
			return "", fmt.Errorf("No such image")
		}
		return "[]", nil
	case len(args) > 0 && args[0] == "run":
		return "container-1\n", nil
	default:
		return "", nil
	}
}
|
||||
|
||||
// imagePresentRunner is a test double for which every non-`run` command
// succeeds with "[]", so an image inspect always reports the image as found.
type imagePresentRunner struct {
	// calls holds one entry per Run invocation: the command name followed by
	// its arguments.
	calls [][]string
}

// Run records the invocation. It returns "container-1\n" when the first
// argument is "run" and "[]" for everything else.
func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	invocation := append([]string{name}, args...)
	r.calls = append(r.calls, invocation)

	isRun := len(args) > 0 && args[0] == "run"
	if !isRun {
		return "[]", nil
	}
	return "container-1\n", nil
}
|
||||
|
||||
func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
|
||||
args := DockerRunArgs(RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1/",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "/srv/rap/node-a",
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshListenAddr: ":19131",
|
||||
MeshAdvertiseEndpoint: "http://10.0.0.11:19131/",
|
||||
MeshConnectivityMode: "private_lan",
|
||||
})
|
||||
|
||||
joined := strings.Join(args, "\x00")
|
||||
for _, want := range []string{
|
||||
"run", "-d", "--name\x00rap-node-agent-node-a", "--network\x00host",
|
||||
"-v\x00/srv/rap/node-a:/var/lib/rap-node-agent",
|
||||
"RAP_BACKEND_URL=http://control/api/v1",
|
||||
"RAP_CLUSTER_ID=cluster-1",
|
||||
"RAP_JOIN_TOKEN=join-secret",
|
||||
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
|
||||
"RAP_MESH_LISTEN_ADDR=:19131",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINT=http://10.0.0.11:19131",
|
||||
"RAP_MESH_CONNECTIVITY_MODE=private_lan",
|
||||
"rap-node-agent:test",
|
||||
} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("docker args missing %q in %#v", want, args)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
|
||||
args := DockerRunArgs(RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
StateDir: "rap-node-state",
|
||||
DockerVPNGatewayEnabled: true,
|
||||
})
|
||||
|
||||
joined := strings.Join(args, "\x00")
|
||||
for _, want := range []string{
|
||||
"--privileged",
|
||||
"--cap-add\x00NET_ADMIN",
|
||||
"--device\x00/dev/net/tun:/dev/net/tun",
|
||||
} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("docker vpn gateway args missing %q in %#v", want, args)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareStateDirCreatesWritableHostPath(t *testing.T) {
|
||||
dir := filepath.Join(t.TempDir(), "node-state")
|
||||
if err := PrepareStateDir(dir); err != nil {
|
||||
t.Fatalf("prepare state dir: %v", err)
|
||||
}
|
||||
info, err := os.Stat(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("stat state dir: %v", err)
|
||||
}
|
||||
if !info.IsDir() {
|
||||
t.Fatalf("state path is not a directory")
|
||||
}
|
||||
if info.Mode().Perm()&0o777 != 0o777 {
|
||||
t.Fatalf("state dir mode = %v, want writable for container nonroot user", info.Mode().Perm())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareStateDirSkipsNamedVolume(t *testing.T) {
|
||||
if err := PrepareStateDir("rap-node-state"); err != nil {
|
||||
t.Fatalf("named volume should be ignored: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/api/v1/node-agents/docker-install-profile" {
|
||||
t.Fatalf("path = %s", r.URL.Path)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"docker_install_profile": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"backend_url": "https://control.example.test/api/v1",
|
||||
"join_token": "rap_join_profile",
|
||||
"node_name": "node-a",
|
||||
"image": "rap-node-agent:test",
|
||||
"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
|
||||
"docker_image_artifact": map[string]any{
|
||||
"kind": "docker_image_tar",
|
||||
"image": "rap-node-agent:test",
|
||||
"file_name": "rap-node-agent-test.tar",
|
||||
"size_bytes": 21,
|
||||
},
|
||||
"container_name": "rap-node-agent-node-a",
|
||||
"state_dir": "/var/lib/rap/nodes/node-a",
|
||||
"network": "host",
|
||||
"restart_policy": "unless-stopped",
|
||||
"replace": true,
|
||||
"mesh_synthetic_runtime_enabled": true,
|
||||
"mesh_connectivity_mode": "outbound_only",
|
||||
},
|
||||
})
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
profile, err := FetchDockerInstallProfile(context.Background(), ProfileRequest{
|
||||
URL: server.URL + "/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
InstallToken: "rap_join_profile",
|
||||
NodeName: "node-a",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("fetch profile: %v", err)
|
||||
}
|
||||
cfg := RuntimeConfigFromProfile(profile).Normalize()
|
||||
if cfg.BackendURL != "https://control.example.test/api/v1" ||
|
||||
cfg.ClusterID != "cluster-1" ||
|
||||
cfg.JoinToken != "rap_join_profile" ||
|
||||
cfg.ContainerName != "rap-node-agent-node-a" ||
|
||||
len(cfg.ImageArtifactURLs) != 1 ||
|
||||
cfg.ImageArtifactSizeBytes != 21 ||
|
||||
!cfg.MeshSyntheticRuntimeEnabled ||
|
||||
cfg.MeshConnectivityMode != "outbound_only" {
|
||||
t.Fatalf("unexpected cfg: %+v", cfg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("fake docker image tar"))
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &imageMissingRunner{}
|
||||
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
ImageArtifactSizeBytes: 21,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded || result.ContainerID != "container-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
if !strings.Contains(joined, "load\x00-i") || !strings.Contains(joined, "run\x00-d") {
|
||||
t.Fatalf("expected docker load and run calls, got %#v", runner.calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
|
||||
const payload = "fake docker image tar"
|
||||
const wrongSize = 999
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(payload))
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &imageMissingRunner{}
|
||||
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
|
||||
ImageArtifactSizeBytes: wrongSize,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded || result.ContainerID != "container-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("fake docker image tar"))
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &imagePresentRunner{}
|
||||
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
ImageArtifactSizeBytes: 21,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded {
|
||||
t.Fatalf("expected image artifact reload, got %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
if !strings.Contains(joined, "load\x00-i") {
|
||||
t.Fatalf("expected docker load even when image exists during replace, got %#v", runner.calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/rap-node-agent-test.tar" {
|
||||
t.Fatalf("unexpected path %s", r.URL.Path)
|
||||
}
|
||||
_, _ = w.Write([]byte("fake docker image tar"))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &imageMissingRunner{}
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
ImageArtifactSizeBytes: 21,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded || !result.Replaced {
|
||||
t.Fatalf("expected explicit artifact load and replace, got %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
if !strings.Contains(joined, "load\x00-i") {
|
||||
t.Fatalf("expected docker load call, got %#v", runner.calls)
|
||||
}
|
||||
}
|
||||
|
||||
// flattenCalls concatenates all recorded invocations into one slice, in call
// order. The result is always non-nil, even when calls is empty.
func flattenCalls(calls [][]string) []string {
	total := 0
	for i := range calls {
		total += len(calls[i])
	}

	flat := make([]string, 0, total)
	for i := range calls {
		flat = append(flat, calls[i]...)
	}
	return flat
}
|
||||
|
||||
func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
|
||||
runner := &recordingRunner{}
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
PullImage: true,
|
||||
Replace: true,
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Pulled || !result.Replaced || result.ContainerID != "container-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if len(runner.calls) != 3 {
|
||||
t.Fatalf("calls = %#v", runner.calls)
|
||||
}
|
||||
redacted := strings.Join(RedactedArgs(runner.calls[2][1:]), " ")
|
||||
if strings.Contains(redacted, "join-secret") || !strings.Contains(redacted, "RAP_JOIN_TOKEN=***") {
|
||||
t.Fatalf("redacted args leaked token: %s", redacted)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRequiresJoinTokenUnlessReplacingExistingState(t *testing.T) {
|
||||
err := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a"}.ValidateInstall()
|
||||
if err == nil || !strings.Contains(err.Error(), "join-token") {
|
||||
t.Fatalf("expected join token validation error, got %v", err)
|
||||
}
|
||||
err = RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a", Replace: true}.ValidateInstall()
|
||||
if err != nil {
|
||||
t.Fatalf("replace update should allow missing join token: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,481 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Default parent directories for native Linux installs. A per-node
// subdirectory named after the node's unit slug is created beneath each.
const (
	// DefaultLinuxInstallRoot is the parent of the per-node install directory.
	DefaultLinuxInstallRoot = "/opt/rap"

	// DefaultLinuxStateRoot is the parent of the per-node state directory.
	DefaultLinuxStateRoot = "/var/lib/rap/nodes"

	// DefaultLinuxConfigRoot is the parent of the per-node config directory.
	DefaultLinuxConfigRoot = "/etc/rap"
)
|
||||
|
||||
type LinuxInstallConfig struct {
|
||||
RuntimeConfig RuntimeConfig
|
||||
NodeID string
|
||||
InstallDir string
|
||||
StateDir string
|
||||
ConfigDir string
|
||||
UnitDir string
|
||||
StartupMode string
|
||||
ArtifactURLs []string
|
||||
ArtifactSHA256 string
|
||||
ArtifactSizeBytes int64
|
||||
Replace bool
|
||||
DryRun bool
|
||||
AutoUpdateEnabled bool
|
||||
AutoUpdateCurrentVersion string
|
||||
AutoUpdateChannel string
|
||||
AutoUpdateIntervalSeconds int
|
||||
AutoUpdateInitialDelaySeconds int
|
||||
AutoUpdateHealthTimeoutSeconds int
|
||||
HostAgentSourcePath string
|
||||
}
|
||||
|
||||
// LinuxInstallResult reports the resolved paths and unit names of a Linux
// install, plus which steps actually ran.
type LinuxInstallResult struct {
	// NodeName is the normalized node name the install was performed for.
	NodeName string

	// Resolved directories.
	InstallDir string
	StateDir   string
	ConfigDir  string

	// Resolved file locations inside those directories.
	NodeAgentPath string
	HostAgentPath string
	EnvPath       string

	// Node-agent systemd unit name and the path its unit file is written to.
	UnitName string
	UnitPath string

	// UpdaterUnitName is set only after the updater unit has been enabled.
	UpdaterUnitName string

	// Downloaded is true when a fresh node-agent binary was installed.
	Downloaded bool
	// Started is true when the node-agent unit was enabled and started.
	Started bool
	// UpdaterStarted is true when the updater unit was enabled and started.
	UpdaterStarted bool
}
|
||||
|
||||
type LinuxManager struct {
|
||||
Runner CommandRunner
|
||||
}
|
||||
|
||||
func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConfig {
|
||||
stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultLinuxStateRoot, safeUnitSlug(profile.NodeName)))
|
||||
installDir := firstNonEmpty(profile.InstallDir, filepath.Join(DefaultLinuxInstallRoot, safeUnitSlug(profile.NodeName)))
|
||||
return LinuxInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: profile.BackendURL,
|
||||
ClusterID: profile.ClusterID,
|
||||
JoinToken: profile.JoinToken,
|
||||
NodeName: profile.NodeName,
|
||||
StateDir: stateDir,
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshListenAddr: profile.MeshListenAddr,
|
||||
MeshListenPortMode: profile.MeshListenPortMode,
|
||||
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
|
||||
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
|
||||
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
|
||||
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
|
||||
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
|
||||
MeshConnectivityMode: profile.MeshConnectivityMode,
|
||||
MeshNATType: profile.MeshNATType,
|
||||
MeshRegion: profile.MeshRegion,
|
||||
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
|
||||
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
|
||||
EnrollmentPollTimeoutSeconds: profile.EnrollmentPollTimeoutSeconds,
|
||||
ProductionObservationSinkCap: profile.ProductionObservationSinkCapacity,
|
||||
},
|
||||
InstallDir: installDir,
|
||||
StateDir: stateDir,
|
||||
ConfigDir: filepath.Join(DefaultLinuxConfigRoot, safeUnitSlug(profile.NodeName)),
|
||||
StartupMode: firstNonEmpty(profile.StartupMode, "systemd"),
|
||||
ArtifactURLs: linuxArtifactURLs(profile),
|
||||
ArtifactSHA256: linuxArtifactSHA256(profile),
|
||||
ArtifactSizeBytes: linuxArtifactSizeBytes(profile),
|
||||
Replace: true,
|
||||
AutoUpdateEnabled: true,
|
||||
}
|
||||
}
|
||||
|
||||
func linuxArtifactURLs(profile LinuxInstallProfile) []string {
|
||||
if profile.NodeAgentArtifact != nil && len(profile.NodeAgentArtifact.URLs) > 0 {
|
||||
return append([]string(nil), profile.NodeAgentArtifact.URLs...)
|
||||
}
|
||||
if profile.NodeAgentArtifact == nil || strings.TrimSpace(profile.NodeAgentArtifact.FileName) == "" {
|
||||
return nil
|
||||
}
|
||||
out := []string{}
|
||||
fileName := strings.TrimLeft(strings.TrimSpace(profile.NodeAgentArtifact.FileName), "/")
|
||||
for _, endpoint := range profile.ArtifactEndpoints {
|
||||
if trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/"); trimmed != "" {
|
||||
out = append(out, trimmed+"/"+fileName)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func linuxArtifactSHA256(profile LinuxInstallProfile) string {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(profile.NodeAgentArtifact.SHA256)
|
||||
}
|
||||
|
||||
func linuxArtifactSizeBytes(profile LinuxInstallProfile) int64 {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return 0
|
||||
}
|
||||
return profile.NodeAgentArtifact.SizeBytes
|
||||
}
|
||||
|
||||
func (m LinuxManager) Install(ctx context.Context, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
|
||||
cfg.NodeID = strings.TrimSpace(cfg.NodeID)
|
||||
cfg.RuntimeConfig.Replace = cfg.Replace
|
||||
cfg.RuntimeConfig.StateDir = firstNonEmpty(cfg.StateDir, cfg.RuntimeConfig.StateDir)
|
||||
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
|
||||
if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
|
||||
return LinuxInstallResult{}, err
|
||||
}
|
||||
slug := safeUnitSlug(cfg.RuntimeConfig.NodeName)
|
||||
cfg.InstallDir = firstNonEmpty(cfg.InstallDir, filepath.Join(DefaultLinuxInstallRoot, slug))
|
||||
cfg.StateDir = firstNonEmpty(cfg.RuntimeConfig.StateDir, filepath.Join(DefaultLinuxStateRoot, slug))
|
||||
cfg.ConfigDir = firstNonEmpty(cfg.ConfigDir, filepath.Join(DefaultLinuxConfigRoot, slug))
|
||||
cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
|
||||
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "systemd"))
|
||||
unitName := "rap-node-agent-" + slug + ".service"
|
||||
result := LinuxInstallResult{
|
||||
NodeName: cfg.RuntimeConfig.NodeName,
|
||||
InstallDir: cfg.InstallDir,
|
||||
StateDir: cfg.StateDir,
|
||||
ConfigDir: cfg.ConfigDir,
|
||||
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
|
||||
HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
|
||||
EnvPath: filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
|
||||
UnitName: unitName,
|
||||
UnitPath: filepath.Join(cfg.UnitDir, unitName),
|
||||
}
|
||||
if cfg.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
if runtime.GOOS != "linux" {
|
||||
return result, fmt.Errorf("linux install is only supported on linux hosts")
|
||||
}
|
||||
if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.MkdirAll(cfg.StateDir, 0o700); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.MkdirAll(cfg.ConfigDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
|
||||
m.stopService(ctx, result.UnitName)
|
||||
path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
|
||||
m.stopService(ctx, result.UnitName)
|
||||
if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr != nil {
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
result.Downloaded = true
|
||||
}
|
||||
if !fileExists(result.NodeAgentPath) {
|
||||
return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
|
||||
}
|
||||
if err := os.WriteFile(result.EnvPath, []byte(linuxEnvFile(cfg.RuntimeConfig, cfg.StateDir)), 0o600); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.StartupMode != "none" {
|
||||
if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(result.UnitPath, []byte(linuxNodeAgentUnit(result)), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
runner := m.runner()
|
||||
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.UnitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Started = true
|
||||
}
|
||||
return installLinuxHostAgentUpdater(ctx, m, result, cfg)
|
||||
}
|
||||
|
||||
func (m LinuxManager) stopService(ctx context.Context, unitName string) {
|
||||
if strings.TrimSpace(unitName) == "" {
|
||||
return
|
||||
}
|
||||
_, _ = m.runner().Run(ctx, "systemctl", "stop", unitName)
|
||||
}
|
||||
|
||||
func (m LinuxManager) runner() CommandRunner {
|
||||
if m.Runner != nil {
|
||||
return m.Runner
|
||||
}
|
||||
return ExecRunner{}
|
||||
}
|
||||
|
||||
func linuxEnvFile(cfg RuntimeConfig, stateDir string) string {
|
||||
lines := []string{}
|
||||
for _, env := range NodeAgentEnvWithStateDir(cfg, stateDir) {
|
||||
key, value, ok := strings.Cut(env, "=")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lines = append(lines, key+"="+systemdQuote(value))
|
||||
}
|
||||
return strings.Join(lines, "\n") + "\n"
|
||||
}
|
||||
|
||||
func linuxNodeAgentUnit(result LinuxInstallResult) string {
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP node-agent %s
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=%s
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, result.NodeName, systemdQuote(result.EnvPath), systemdQuote(result.NodeAgentPath))
|
||||
}
|
||||
|
||||
func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result LinuxInstallResult, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
|
||||
if !cfg.AutoUpdateEnabled || strings.EqualFold(cfg.StartupMode, "none") {
|
||||
return result, nil
|
||||
}
|
||||
if cfg.AutoUpdateCurrentVersion == "" || (cfg.Replace && !result.Downloaded) {
|
||||
cfg.AutoUpdateCurrentVersion = "0.0.0"
|
||||
}
|
||||
if err := installHostAgentBinary(cfg.HostAgentSourcePath, result.HostAgentPath); err != nil {
|
||||
return result, err
|
||||
}
|
||||
interval := cfg.AutoUpdateIntervalSeconds
|
||||
if interval == 0 {
|
||||
interval = 21600
|
||||
}
|
||||
initialDelay := cfg.AutoUpdateInitialDelaySeconds
|
||||
if initialDelay == 0 {
|
||||
initialDelay = 15
|
||||
}
|
||||
healthTimeout := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
if healthTimeout == 0 {
|
||||
healthTimeout = 30
|
||||
}
|
||||
args := []string{
|
||||
result.HostAgentPath,
|
||||
"update-loop",
|
||||
"--backend-url", cfg.RuntimeConfig.BackendURL,
|
||||
"--cluster-id", cfg.RuntimeConfig.ClusterID,
|
||||
"--state-dir", result.StateDir,
|
||||
"--current-version", cfg.AutoUpdateCurrentVersion,
|
||||
"--os", "linux",
|
||||
"--arch", runtime.GOARCH,
|
||||
"--install-type", BinaryUpdateInstallType,
|
||||
"--binary-path", result.NodeAgentPath,
|
||||
"--systemd-unit", result.UnitName,
|
||||
"--health-timeout-seconds", fmt.Sprintf("%d", healthTimeout),
|
||||
"--interval-seconds", fmt.Sprintf("%d", interval),
|
||||
"--initial-delay-seconds", fmt.Sprintf("%d", initialDelay),
|
||||
"--host-agent-update-status-enabled",
|
||||
"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
|
||||
"--host-agent-binary-path", result.HostAgentPath,
|
||||
}
|
||||
if strings.TrimSpace(cfg.NodeID) != "" {
|
||||
args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID))
|
||||
}
|
||||
if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
|
||||
}
|
||||
unitName := "rap-host-agent-updater-" + safeUnitSlug(result.NodeName) + ".service"
|
||||
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
|
||||
unit := fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent updater for %s
|
||||
After=network-online.target %s
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, result.NodeName, result.UnitName, systemdJoin(args))
|
||||
if err := os.WriteFile(unitPath, []byte(unit), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
runner := m.runner()
|
||||
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", unitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.UpdaterUnitName = unitName
|
||||
result.UpdaterStarted = true
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
|
||||
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
|
||||
req.OS = firstNonEmpty(req.OS, "linux")
|
||||
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
|
||||
req = req.Normalize()
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
plan, err := FetchNodeUpdatePlan(ctx, req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
result := UpdateResult{Action: plan.Action, Reason: plan.Reason, TargetVersion: plan.TargetVersion, ContainerName: req.SystemdUnitName, NewImage: req.BinaryPath}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
status := statusFromNoopPlan(req, plan)
|
||||
if status.Payload == nil {
|
||||
status.Payload = map[string]any{}
|
||||
}
|
||||
status.Payload["systemd_unit"] = req.SystemdUnitName
|
||||
status.Payload["binary_path"] = req.BinaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
runner := m.runner()
|
||||
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
|
||||
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
result.Replaced = true
|
||||
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
|
||||
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
|
||||
req := cfg.Request
|
||||
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
|
||||
req.OS = firstNonEmpty(req.OS, "linux")
|
||||
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
|
||||
cfg.Request = req
|
||||
return runLinuxUpdateLoop(ctx, m, cfg)
|
||||
}
|
||||
|
||||
func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfig) error {
|
||||
if cfg.Interval == 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
logf := cfg.Logf
|
||||
if logf == nil {
|
||||
logf = func(string, ...any) {}
|
||||
}
|
||||
if cfg.InitialDelay > 0 {
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
runs := 0
|
||||
lastTriggerGeneration := currentUpdateTriggerGeneration(cfg.Request.StateDir)
|
||||
for {
|
||||
runs++
|
||||
result, err := m.ApplyUpdate(ctx, cfg.Request)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("linux_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, cfg.Request.StateDir)
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepUntilUpdateIntervalOrTrigger(ctx, cfg.Request.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
} else {
|
||||
logf("linux_update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logf("linux_update_loop run=%d action=%s reason=%s target=%s unit=%s replaced=%t", runs, result.Action, result.Reason, result.TargetVersion, result.ContainerName, result.Replaced)
|
||||
if result.Action == "update" && result.TargetVersion != "" {
|
||||
cfg.Request.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
}
|
||||
if cfg.HostAgentUpdateEnabled {
|
||||
hostReq := cfg.HostAgentUpdateRequest
|
||||
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, cfg.Request.BackendURL)
|
||||
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, cfg.Request.ClusterID)
|
||||
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, cfg.Request.NodeID)
|
||||
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, cfg.Request.StateDir)
|
||||
hostReq.Channel = firstNonEmpty(hostReq.Channel, cfg.Request.Channel)
|
||||
hostReq.OS = firstNonEmpty(hostReq.OS, "linux")
|
||||
hostReq.Arch = firstNonEmpty(hostReq.Arch, runtime.GOARCH)
|
||||
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, BinaryUpdateInstallType)
|
||||
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
|
||||
if hostErr != nil {
|
||||
logf("linux_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
|
||||
} else {
|
||||
logf("linux_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t", runs, hostResult.Action, hostResult.Reason, hostResult.TargetVersion, hostResult.NewImage, hostResult.Replaced, hostResult.RestartNeeded)
|
||||
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
|
||||
cfg.HostAgentUpdateRequest.CurrentVersion = hostResult.TargetVersion
|
||||
}
|
||||
}
|
||||
}
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepUntilUpdateIntervalOrTrigger(ctx, cfg.Request.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,333 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type DockerInstallProfile struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
Image string `json:"image"`
|
||||
ContainerName string `json:"container_name"`
|
||||
StateDir string `json:"state_dir"`
|
||||
Network string `json:"network"`
|
||||
RestartPolicy string `json:"restart_policy"`
|
||||
PullImage bool `json:"pull_image"`
|
||||
Replace bool `json:"replace"`
|
||||
DockerVPNGatewayEnabled bool `json:"docker_vpn_gateway_enabled"`
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshListenAddr string `json:"mesh_listen_addr"`
|
||||
MeshListenPortMode string `json:"mesh_listen_port_mode"`
|
||||
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
|
||||
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
|
||||
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
|
||||
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
|
||||
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
|
||||
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
|
||||
MeshNATType string `json:"mesh_nat_type"`
|
||||
MeshRegion string `json:"mesh_region"`
|
||||
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
|
||||
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
|
||||
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
|
||||
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
|
||||
Roles []string `json:"roles"`
|
||||
}
|
||||
|
||||
// DockerArtifact describes a downloadable artifact referenced by an install
// profile. Despite the name it is also the type of the NodeAgentArtifact field
// in the Windows and Linux install profiles.
type DockerArtifact struct {
	Kind      string `json:"kind"`
	Image     string `json:"image"`
	MediaType string `json:"media_type"`
	FileName  string `json:"file_name"`
	// URLs, when non-empty, are used as-is by dockerArtifactURLs instead of
	// URLs built from the profile's artifact endpoints and FileName.
	URLs      []string `json:"urls"`
	SHA256    string   `json:"sha256"`
	SizeBytes int64    `json:"size_bytes"`
}
|
||||
|
||||
type WindowsInstallProfile struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
StateDir string `json:"state_dir"`
|
||||
InstallDir string `json:"install_dir"`
|
||||
StartupMode string `json:"startup_mode"`
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshListenAddr string `json:"mesh_listen_addr"`
|
||||
MeshListenPortMode string `json:"mesh_listen_port_mode"`
|
||||
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
|
||||
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
|
||||
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
|
||||
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
|
||||
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
|
||||
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
|
||||
MeshNATType string `json:"mesh_nat_type"`
|
||||
MeshRegion string `json:"mesh_region"`
|
||||
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
|
||||
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
|
||||
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
|
||||
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
|
||||
Roles []string `json:"roles"`
|
||||
}
|
||||
|
||||
type LinuxInstallProfile struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
StateDir string `json:"state_dir"`
|
||||
InstallDir string `json:"install_dir"`
|
||||
StartupMode string `json:"startup_mode"`
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshListenAddr string `json:"mesh_listen_addr"`
|
||||
MeshListenPortMode string `json:"mesh_listen_port_mode"`
|
||||
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
|
||||
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
|
||||
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
|
||||
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
|
||||
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
|
||||
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
|
||||
MeshNATType string `json:"mesh_nat_type"`
|
||||
MeshRegion string `json:"mesh_region"`
|
||||
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
|
||||
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
|
||||
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
|
||||
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
|
||||
Roles []string `json:"roles"`
|
||||
}
|
||||
|
||||
// ProfileRequest carries the inputs shared by the Fetch*InstallProfile
// functions.
type ProfileRequest struct {
	// URL is the backend base URL or the full profile endpoint; the fetch
	// functions append the platform-specific path when it is missing.
	URL string
	// ClusterID, InstallToken and NodeName are sent, whitespace-trimmed, in
	// the JSON request body. URL and InstallToken are required.
	ClusterID    string
	InstallToken string
	NodeName     string
	// HTTPClient is optional; when nil the fetch functions use a client with
	// a 20 second timeout.
	HTTPClient *http.Client
}
|
||||
|
||||
func FetchDockerInstallProfile(ctx context.Context, req ProfileRequest) (DockerInstallProfile, error) {
|
||||
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
|
||||
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
|
||||
return DockerInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
|
||||
}
|
||||
if !strings.HasSuffix(url, "/node-agents/docker-install-profile") {
|
||||
url += "/node-agents/docker-install-profile"
|
||||
}
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"cluster_id": strings.TrimSpace(req.ClusterID),
|
||||
"install_token": strings.TrimSpace(req.InstallToken),
|
||||
"node_name": strings.TrimSpace(req.NodeName),
|
||||
})
|
||||
if err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
httpClient := req.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{Timeout: 20 * time.Second}
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return DockerInstallProfile{}, fmt.Errorf("fetch docker install profile: %s", resp.Status)
|
||||
}
|
||||
var envelope struct {
|
||||
Profile DockerInstallProfile `json:"docker_install_profile"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
|
||||
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
|
||||
}
|
||||
return envelope.Profile, nil
|
||||
}
|
||||
|
||||
func FetchWindowsInstallProfile(ctx context.Context, req ProfileRequest) (WindowsInstallProfile, error) {
|
||||
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
|
||||
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
|
||||
return WindowsInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
|
||||
}
|
||||
if !strings.HasSuffix(url, "/node-agents/windows-install-profile") {
|
||||
url += "/node-agents/windows-install-profile"
|
||||
}
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"cluster_id": strings.TrimSpace(req.ClusterID),
|
||||
"install_token": strings.TrimSpace(req.InstallToken),
|
||||
"node_name": strings.TrimSpace(req.NodeName),
|
||||
})
|
||||
if err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
httpClient := req.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{Timeout: 20 * time.Second}
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return WindowsInstallProfile{}, fmt.Errorf("fetch windows install profile: %s", resp.Status)
|
||||
}
|
||||
var envelope struct {
|
||||
Profile WindowsInstallProfile `json:"windows_install_profile"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
|
||||
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
|
||||
}
|
||||
return envelope.Profile, nil
|
||||
}
|
||||
|
||||
func FetchLinuxInstallProfile(ctx context.Context, req ProfileRequest) (LinuxInstallProfile, error) {
|
||||
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
|
||||
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
|
||||
return LinuxInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
|
||||
}
|
||||
if !strings.HasSuffix(url, "/node-agents/linux-install-profile") {
|
||||
url += "/node-agents/linux-install-profile"
|
||||
}
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"cluster_id": strings.TrimSpace(req.ClusterID),
|
||||
"install_token": strings.TrimSpace(req.InstallToken),
|
||||
"node_name": strings.TrimSpace(req.NodeName),
|
||||
})
|
||||
if err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
httpClient := req.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{Timeout: 20 * time.Second}
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return LinuxInstallProfile{}, fmt.Errorf("fetch linux install profile: %s", resp.Status)
|
||||
}
|
||||
var envelope struct {
|
||||
Profile LinuxInstallProfile `json:"linux_install_profile"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
|
||||
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
|
||||
}
|
||||
return envelope.Profile, nil
|
||||
}
|
||||
|
||||
func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
|
||||
return RuntimeConfig{
|
||||
BackendURL: profile.BackendURL,
|
||||
ClusterID: profile.ClusterID,
|
||||
JoinToken: profile.JoinToken,
|
||||
NodeName: profile.NodeName,
|
||||
Image: profile.Image,
|
||||
ContainerName: profile.ContainerName,
|
||||
StateDir: profile.StateDir,
|
||||
Network: profile.Network,
|
||||
RestartPolicy: profile.RestartPolicy,
|
||||
PullImage: profile.PullImage,
|
||||
Replace: profile.Replace,
|
||||
DockerVPNGatewayEnabled: profile.DockerVPNGatewayEnabled,
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshListenAddr: profile.MeshListenAddr,
|
||||
MeshListenPortMode: profile.MeshListenPortMode,
|
||||
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
|
||||
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
|
||||
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
|
||||
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
|
||||
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
|
||||
MeshConnectivityMode: profile.MeshConnectivityMode,
|
||||
MeshNATType: profile.MeshNATType,
|
||||
MeshRegion: profile.MeshRegion,
|
||||
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
|
||||
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
|
||||
EnrollmentPollTimeoutSeconds: profile.EnrollmentPollTimeoutSeconds,
|
||||
ProductionObservationSinkCap: profile.ProductionObservationSinkCapacity,
|
||||
ImageArtifactURLs: dockerArtifactURLs(profile),
|
||||
ImageArtifactSHA256: dockerArtifactSHA256(profile),
|
||||
ImageArtifactSizeBytes: dockerArtifactSizeBytes(profile),
|
||||
}
|
||||
}
|
||||
|
||||
func dockerArtifactURLs(profile DockerInstallProfile) []string {
|
||||
if profile.DockerImageArtifact != nil && len(profile.DockerImageArtifact.URLs) > 0 {
|
||||
return append([]string(nil), profile.DockerImageArtifact.URLs...)
|
||||
}
|
||||
if profile.DockerImageArtifact == nil || strings.TrimSpace(profile.DockerImageArtifact.FileName) == "" {
|
||||
return nil
|
||||
}
|
||||
out := []string{}
|
||||
fileName := strings.TrimLeft(strings.TrimSpace(profile.DockerImageArtifact.FileName), "/")
|
||||
for _, endpoint := range profile.ArtifactEndpoints {
|
||||
if trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/"); trimmed != "" {
|
||||
out = append(out, trimmed+"/"+fileName)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func dockerArtifactSHA256(profile DockerInstallProfile) string {
|
||||
if profile.DockerImageArtifact == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(profile.DockerImageArtifact.SHA256)
|
||||
}
|
||||
|
||||
func dockerArtifactSizeBytes(profile DockerInstallProfile) int64 {
|
||||
if profile.DockerImageArtifact == nil {
|
||||
return 0
|
||||
}
|
||||
return profile.DockerImageArtifact.SizeBytes
|
||||
}
|
||||
@@ -0,0 +1,258 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// HostAgentUpdateRequest describes one host-agent binary update attempt as
// consumed by DockerManager.ApplyHostAgentUpdate.
type HostAgentUpdateRequest struct {
	// Control-plane coordinates and local state location.
	BackendURL string
	ClusterID  string
	NodeID     string
	StateDir   string

	// Version and platform selectors forwarded to the update plan request.
	// OS defaults to "linux" and InstallType to BinaryUpdateInstallType.
	CurrentVersion string
	Channel        string
	OS             string
	Arch           string
	InstallType    string

	// BinaryPath is the host-agent binary to replace; when empty,
	// DefaultHostAgentInstallPath is used.
	BinaryPath string
	// DryRun stops after the plan checks: nothing is downloaded or installed.
	DryRun bool

	// RestartService is the systemd unit restarted after a successful install
	// when RestartAfterApply is set; a blank name disables the restart.
	RestartService    string
	RestartAfterApply bool
}
|
||||
|
||||
type HostAgentUpdateLoopConfig struct {
|
||||
Request HostAgentUpdateRequest
|
||||
Interval time.Duration
|
||||
InitialDelay time.Duration
|
||||
Jitter float64
|
||||
MaxRuns int
|
||||
StopOnError bool
|
||||
Logf func(format string, args ...any)
|
||||
}
|
||||
|
||||
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
|
||||
return UpdateRequest{
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
OS: firstNonEmpty(req.OS, "linux"),
|
||||
Arch: req.Arch,
|
||||
InstallType: firstNonEmpty(req.InstallType, BinaryUpdateInstallType),
|
||||
Channel: req.Channel,
|
||||
ContainerName: "host-agent-service",
|
||||
DryRun: req.DryRun,
|
||||
}
|
||||
}
|
||||
|
||||
func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUpdateRequest) (UpdateResult, error) {
|
||||
binaryPath := firstNonEmpty(req.BinaryPath, DefaultHostAgentInstallPath)
|
||||
planReq := req.updateRequest()
|
||||
planReq.BinaryDefaults()
|
||||
resolved, err := resolveUpdateRequest(planReq)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
plan, err := FetchNodeUpdatePlan(ctx, resolved)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
result := UpdateResult{
|
||||
Action: plan.Action,
|
||||
Reason: plan.Reason,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: "host-agent-service",
|
||||
NewImage: binaryPath,
|
||||
}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
status := statusFromNoopPlan(resolved, plan)
|
||||
status.Product = HostAgentUpdateProduct
|
||||
if status.Payload == nil {
|
||||
status.Payload = map[string]any{}
|
||||
}
|
||||
status.Payload["binary_path"] = binaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("host-agent update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if !isBinaryInstallType(plan.Artifact.InstallType) {
|
||||
err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "download",
|
||||
Status: "started",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": binaryPath},
|
||||
})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if err := installHostAgentBinary(path, binaryPath); err != nil {
|
||||
stageErr := stageHostAgentBinary(path, binaryPath)
|
||||
if stageErr == nil {
|
||||
result.RestartNeeded = true
|
||||
_ = saveUpdateState(resolved.StateDir, UpdateState{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: plan.TargetVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: "host-agent-service",
|
||||
Image: binaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "apply",
|
||||
Status: "staged",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"binary_path": binaryPath, "staged_path": binaryPath + ".next", "restart_needed": true, "replace_error": err.Error()},
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
|
||||
return result, err
|
||||
}
|
||||
result.Loaded = true
|
||||
result.Replaced = true
|
||||
result.RestartNeeded = true
|
||||
_ = saveUpdateState(resolved.StateDir, UpdateState{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: plan.TargetVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: "host-agent-service",
|
||||
Image: binaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "apply",
|
||||
Status: "succeeded",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"binary_path": binaryPath, "restart_needed": true},
|
||||
})
|
||||
if req.RestartAfterApply && strings.TrimSpace(req.RestartService) != "" {
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
_, err = runner.Run(ctx, "systemctl", "restart", req.RestartService)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.RestartNeeded = false
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgentUpdateLoopConfig) error {
|
||||
if cfg.Interval == 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
if cfg.InitialDelay < 0 || cfg.Interval < 0 {
|
||||
return errors.New("host-agent update loop durations must not be negative")
|
||||
}
|
||||
if cfg.Jitter < 0 || cfg.Jitter > 1 {
|
||||
return errors.New("host-agent update loop jitter must be between 0 and 1")
|
||||
}
|
||||
logf := cfg.Logf
|
||||
if logf == nil {
|
||||
logf = func(string, ...any) {}
|
||||
}
|
||||
if cfg.InitialDelay > 0 {
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
runs := 0
|
||||
req := cfg.Request
|
||||
for {
|
||||
runs++
|
||||
result, err := m.ApplyHostAgentUpdate(ctx, req)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
|
||||
} else {
|
||||
logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
|
||||
runs,
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.NewImage,
|
||||
result.Replaced,
|
||||
result.RestartNeeded,
|
||||
)
|
||||
if result.Action == "update" && result.TargetVersion != "" {
|
||||
req.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
}
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (req *UpdateRequest) BinaryDefaults() {
|
||||
req.Product = firstNonEmpty(req.Product, HostAgentUpdateProduct)
|
||||
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
|
||||
req.OS = firstNonEmpty(req.OS, "linux")
|
||||
}
|
||||
|
||||
func isBinaryInstallType(value string) bool {
|
||||
switch strings.TrimSpace(value) {
|
||||
case "", BinaryUpdateInstallType, "windows_binary", "binary", "host_binary", "linux-amd64-binary", "windows-amd64-binary":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func hostAgentInstallTypeFor(nodeInstallType string) string {
|
||||
if strings.TrimSpace(nodeInstallType) == WindowsUpdateInstallType {
|
||||
return "windows_binary"
|
||||
}
|
||||
return BinaryUpdateInstallType
|
||||
}
|
||||
|
||||
func stageHostAgentBinary(sourcePath, binaryPath string) error {
|
||||
return copyFile(sourcePath, binaryPath+".next", 0o755)
|
||||
}
|
||||
@@ -0,0 +1,321 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Defaults applied by InstallUpdateService when the corresponding
// UpdateServiceConfig fields are left empty.
const (
	// DefaultHostAgentInstallPath is the default location of the installed
	// rap-host-agent binary.
	DefaultHostAgentInstallPath = "/usr/local/bin/rap-host-agent"
	// DefaultSystemdUnitDir is the default directory unit files are written to.
	DefaultSystemdUnitDir = "/etc/systemd/system"
)
|
||||
|
||||
type UpdateServiceConfig struct {
|
||||
RuntimeConfig RuntimeConfig
|
||||
Product string
|
||||
CurrentVersion string
|
||||
Channel string
|
||||
IntervalSeconds int
|
||||
InitialDelaySeconds int
|
||||
Jitter float64
|
||||
HealthTimeoutSec int
|
||||
BinaryInstallPath string
|
||||
SourceBinaryPath string
|
||||
UnitDir string
|
||||
ManageSystemd bool
|
||||
DryRun bool
|
||||
InstallSelfUpdater bool
|
||||
SelfUpdateVersion string
|
||||
}
|
||||
|
||||
// UpdateServiceResult reports what InstallUpdateService rendered and did.
type UpdateServiceResult struct {
	// Installed is set once the binary and unit file(s) have been written.
	Installed bool
	// Started is set once all systemctl commands have succeeded.
	Started bool
	// UnitName and UnitPath identify the updater unit file.
	UnitName string
	UnitPath string
	// BinaryPath is the resolved host-agent install path.
	BinaryPath string
	// Unit is the rendered updater unit text.
	Unit string
	// SelfUnitName, SelfUnitPath and SelfUnit describe the self-updater
	// unit; they stay empty unless the self updater was requested.
	SelfUnitName string
	SelfUnitPath string
	SelfUnit     string
}
|
||||
|
||||
func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServiceConfig) (UpdateServiceResult, error) {
|
||||
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
|
||||
if cfg.Product == "" {
|
||||
cfg.Product = DefaultUpdateProduct
|
||||
}
|
||||
if cfg.IntervalSeconds == 0 {
|
||||
cfg.IntervalSeconds = 21600
|
||||
}
|
||||
if cfg.Jitter == 0 {
|
||||
cfg.Jitter = 0.15
|
||||
}
|
||||
if cfg.HealthTimeoutSec == 0 {
|
||||
cfg.HealthTimeoutSec = 30
|
||||
}
|
||||
cfg.BinaryInstallPath = firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath)
|
||||
cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
|
||||
unitName := "rap-host-agent-updater-" + safeUnitSlug(cfg.RuntimeConfig.ContainerName) + ".service"
|
||||
result := UpdateServiceResult{
|
||||
UnitName: unitName,
|
||||
UnitPath: filepath.Join(cfg.UnitDir, unitName),
|
||||
BinaryPath: cfg.BinaryInstallPath,
|
||||
}
|
||||
unit, err := buildUpdateServiceUnit(cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Unit = unit
|
||||
if cfg.DryRun {
|
||||
if cfg.InstallSelfUpdater {
|
||||
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.SelfUnit = selfUnit
|
||||
result.SelfUnitName = selfUnitName
|
||||
result.SelfUnitPath = selfUnitPath
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if runtime.GOOS != "linux" && cfg.UnitDir == DefaultSystemdUnitDir {
|
||||
return result, fmt.Errorf("systemd update service install is only supported on linux")
|
||||
}
|
||||
if err := installHostAgentBinary(cfg.SourceBinaryPath, cfg.BinaryInstallPath); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(result.UnitPath, []byte(unit), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.InstallSelfUpdater {
|
||||
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(selfUnitPath, []byte(selfUnit), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.SelfUnit = selfUnit
|
||||
result.SelfUnitName = selfUnitName
|
||||
result.SelfUnitPath = selfUnitPath
|
||||
}
|
||||
result.Installed = true
|
||||
if cfg.ManageSystemd {
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", unitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.InstallSelfUpdater && result.SelfUnitName != "" {
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.SelfUnitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
result.Started = true
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
|
||||
runtimeCfg := cfg.RuntimeConfig.Normalize()
|
||||
var missing []string
|
||||
if runtimeCfg.BackendURL == "" {
|
||||
missing = append(missing, "backend-url")
|
||||
}
|
||||
if runtimeCfg.ClusterID == "" {
|
||||
missing = append(missing, "cluster-id")
|
||||
}
|
||||
if runtimeCfg.ContainerName == "" {
|
||||
missing = append(missing, "container-name")
|
||||
}
|
||||
if runtimeCfg.StateDir == "" {
|
||||
missing = append(missing, "state-dir")
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
return "", fmt.Errorf("missing required update service settings: %s", strings.Join(missing, ", "))
|
||||
}
|
||||
args := []string{
|
||||
cfg.BinaryInstallPath,
|
||||
"update-loop",
|
||||
"--backend-url", runtimeCfg.BackendURL,
|
||||
"--cluster-id", runtimeCfg.ClusterID,
|
||||
"--state-dir", runtimeCfg.StateDir,
|
||||
"--container-name", runtimeCfg.ContainerName,
|
||||
"--product", firstNonEmpty(cfg.Product, DefaultUpdateProduct),
|
||||
"--current-version", strings.TrimSpace(cfg.CurrentVersion),
|
||||
"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
|
||||
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds),
|
||||
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
|
||||
"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
|
||||
}
|
||||
if strings.TrimSpace(cfg.Channel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
|
||||
}
|
||||
execStart := systemdJoin(args)
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent updater for %s
|
||||
After=network-online.target docker.service
|
||||
Wants=network-online.target
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, runtimeCfg.ContainerName, execStart), nil
|
||||
}
|
||||
|
||||
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
|
||||
runtimeCfg := cfg.RuntimeConfig.Normalize()
|
||||
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
|
||||
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host-agent self updater")
|
||||
}
|
||||
unitName := "rap-host-agent-self-updater.service"
|
||||
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
|
||||
currentVersion := firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion)
|
||||
args := []string{
|
||||
cfg.BinaryInstallPath,
|
||||
"update-host-agent-loop",
|
||||
"--backend-url", runtimeCfg.BackendURL,
|
||||
"--cluster-id", runtimeCfg.ClusterID,
|
||||
"--state-dir", runtimeCfg.StateDir,
|
||||
"--binary-path", firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath),
|
||||
"--current-version", currentVersion,
|
||||
"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
|
||||
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
|
||||
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
|
||||
}
|
||||
if strings.TrimSpace(cfg.Channel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
|
||||
}
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent self updater
|
||||
After=network-online.target docker.service
|
||||
Wants=network-online.target
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=60
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, systemdJoin(args)), unitName, unitPath, nil
|
||||
}
|
||||
|
||||
func installHostAgentBinary(sourcePath, targetPath string) error {
|
||||
sourcePath = strings.TrimSpace(sourcePath)
|
||||
targetPath = strings.TrimSpace(targetPath)
|
||||
if sourcePath == "" {
|
||||
var err error
|
||||
sourcePath, err = os.Executable()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if samePath(sourcePath, targetPath) {
|
||||
return os.Chmod(targetPath, 0o755)
|
||||
}
|
||||
src, err := os.Open(sourcePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer src.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := targetPath + ".tmp"
|
||||
dst, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := io.Copy(dst, src); err != nil {
|
||||
_ = dst.Close()
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := dst.Close(); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := os.Chmod(tmp, 0o755); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, targetPath)
|
||||
}
|
||||
|
||||
// samePath reports whether a and b name the same location once both are made
// absolute with filepath.Abs. If either path cannot be made absolute the raw
// strings are compared instead. Symlinks are not resolved.
func samePath(a, b string) bool {
	left, err := filepath.Abs(a)
	if err != nil {
		return a == b
	}
	right, err := filepath.Abs(b)
	if err != nil {
		return a == b
	}
	return left == right
}
|
||||
|
||||
func safeUnitSlug(value string) string {
|
||||
value = strings.ToLower(strings.TrimSpace(value))
|
||||
if value == "" {
|
||||
value = DefaultContainerName
|
||||
}
|
||||
var b strings.Builder
|
||||
lastDash := false
|
||||
for _, r := range value {
|
||||
ok := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
|
||||
if ok {
|
||||
b.WriteRune(r)
|
||||
lastDash = false
|
||||
continue
|
||||
}
|
||||
if !lastDash {
|
||||
b.WriteByte('-')
|
||||
lastDash = true
|
||||
}
|
||||
}
|
||||
out := strings.Trim(b.String(), "-")
|
||||
if out == "" {
|
||||
return DefaultContainerName
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func systemdJoin(args []string) string {
|
||||
out := make([]string, 0, len(args))
|
||||
for _, arg := range args {
|
||||
out = append(out, systemdQuote(arg))
|
||||
}
|
||||
return strings.Join(out, " ")
|
||||
}
|
||||
|
||||
// systemdQuote renders arg as a single word of a systemd ExecStart= line.
//
// An empty argument becomes "" so it is not dropped. Arguments containing
// whitespace, quotes or backslashes are wrapped in double quotes, with
// backslash, double quote and newline written as C-style escapes (a raw
// newline would end the unit file line). In every argument '%' is doubled so
// it is not expanded as a unit specifier, and '$' is doubled so it is not
// expanded as an environment variable reference.
func systemdQuote(arg string) string {
	if arg == "" {
		return `""`
	}
	needsQuotes := strings.ContainsAny(arg, " \t\n\"'\\")
	// A single-pass replacer avoids re-escaping characters it just produced.
	escaped := strings.NewReplacer(
		`\`, `\\`,
		`"`, `\"`,
		"\n", `\n`,
		`%`, `%%`,
		`$`, `$$`,
	).Replace(arg)
	if !needsQuotes {
		return escaped
	}
	return `"` + escaped + `"`
}
|
||||
@@ -0,0 +1,171 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
source := filepath.Join(dir, "rap-host-agent-src")
|
||||
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
|
||||
t.Fatalf("write source: %v", err)
|
||||
}
|
||||
unitDir := filepath.Join(dir, "systemd")
|
||||
binaryPath := filepath.Join(dir, "bin", "rap-host-agent")
|
||||
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "node-a",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "/var/lib/rap/nodes/node-a",
|
||||
},
|
||||
CurrentVersion: "0.1.0-current",
|
||||
IntervalSeconds: 60,
|
||||
Jitter: 0.2,
|
||||
SourceBinaryPath: source,
|
||||
BinaryInstallPath: binaryPath,
|
||||
UnitDir: unitDir,
|
||||
ManageSystemd: false,
|
||||
InstallSelfUpdater: true,
|
||||
SelfUpdateVersion: "0.1.0-host",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install update service: %v", err)
|
||||
}
|
||||
if !result.Installed || result.Started {
|
||||
t.Fatalf("unexpected result: %+v", result)
|
||||
}
|
||||
unit, err := os.ReadFile(result.UnitPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read unit: %v", err)
|
||||
}
|
||||
text := string(unit)
|
||||
for _, want := range []string{
|
||||
"ExecStart=",
|
||||
" update-loop",
|
||||
"--backend-url http://control/api/v1",
|
||||
"--cluster-id cluster-1",
|
||||
"--state-dir /var/lib/rap/nodes/node-a",
|
||||
"--container-name rap-node-agent-node-a",
|
||||
"--current-version 0.1.0-current",
|
||||
"--interval-seconds 60",
|
||||
"Restart=always",
|
||||
} {
|
||||
if !strings.Contains(text, want) {
|
||||
t.Fatalf("unit missing %q:\n%s", want, text)
|
||||
}
|
||||
}
|
||||
if payload, err := os.ReadFile(binaryPath); err != nil || string(payload) != "binary" {
|
||||
t.Fatalf("binary copy = %q, %v", payload, err)
|
||||
}
|
||||
if result.SelfUnitName != "rap-host-agent-self-updater.service" || result.SelfUnitPath == "" {
|
||||
t.Fatalf("self updater result = %+v", result)
|
||||
}
|
||||
selfUnit, err := os.ReadFile(result.SelfUnitPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read self unit: %v", err)
|
||||
}
|
||||
if text := string(selfUnit); !strings.Contains(text, "update-host-agent-loop") || !strings.Contains(text, "--current-version 0.1.0-host") {
|
||||
t.Fatalf("unexpected self unit:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
|
||||
cfg := WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
},
|
||||
NodeID: "node-1",
|
||||
AutoUpdateCurrentVersion: "0.1.2",
|
||||
AutoUpdateIntervalSeconds: 120,
|
||||
AutoUpdateInitialDelaySeconds: 7,
|
||||
AutoUpdateHealthTimeoutSeconds: 11,
|
||||
}
|
||||
result := WindowsInstallResult{
|
||||
NodeName: "win-a",
|
||||
StateDir: `C:\ProgramData\RAP\nodes\win-a`,
|
||||
NodeAgentPath: `C:\Program Files\RAP\win-a\rap-node-agent.exe`,
|
||||
TaskName: "RAP Node Agent win-a",
|
||||
}
|
||||
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
|
||||
for _, want := range []string{
|
||||
":loop",
|
||||
"rap-host-agent.exe.next",
|
||||
"update-loop --backend-url",
|
||||
"--backend-url \"http://control/api/v1\"",
|
||||
"--cluster-id \"cluster-1\"",
|
||||
"--node-id \"node-1\"",
|
||||
"--state-dir \"C:\\ProgramData\\RAP\\nodes\\win-a\"",
|
||||
"--install-type windows_service",
|
||||
"--binary-path \"C:\\Program Files\\RAP\\win-a\\rap-node-agent.exe\"",
|
||||
"--host-agent-binary-path \"C:\\Program Files\\RAP\\win-a\\rap-host-agent.exe\"",
|
||||
"--windows-task-name \"RAP Node Agent win-a\"",
|
||||
"--current-version 0.1.2",
|
||||
"--host-agent-current-version 0.1.2",
|
||||
"--interval-seconds 120",
|
||||
"timeout /t 120",
|
||||
} {
|
||||
if !strings.Contains(script, want) {
|
||||
t.Fatalf("script missing %q:\n%s", want, script)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
|
||||
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "win-a",
|
||||
},
|
||||
InstallDir: `C:\Program Files\RAP\win-a`,
|
||||
Replace: true,
|
||||
DryRun: true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("replace install should not require join token: %v", err)
|
||||
}
|
||||
if result.NodeName != "win-a" || result.NodeAgentPath == "" {
|
||||
t.Fatalf("unexpected dry-run result: %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
source := filepath.Join(dir, "rap-host-agent.exe")
|
||||
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
|
||||
t.Fatalf("write source: %v", err)
|
||||
}
|
||||
result, err := installWindowsHostAgentUpdater(context.Background(), WindowsManager{Runner: &recordingRunner{}}, WindowsInstallResult{
|
||||
NodeName: "win-a",
|
||||
InstallDir: dir,
|
||||
StateDir: dir,
|
||||
NodeAgentPath: filepath.Join(dir, "rap-node-agent.exe"),
|
||||
TaskName: "RAP Node Agent win-a",
|
||||
StartupMode: "user-task",
|
||||
}, WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
},
|
||||
Replace: true,
|
||||
AutoUpdateEnabled: true,
|
||||
HostAgentSourcePath: source,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install updater: %v", err)
|
||||
}
|
||||
script, err := os.ReadFile(filepath.Join(result.InstallDir, "rap-host-agent-update.cmd"))
|
||||
if err != nil {
|
||||
t.Fatalf("read updater script: %v", err)
|
||||
}
|
||||
if !strings.Contains(string(script), "--current-version 0.0.0") {
|
||||
t.Fatalf("repair updater should force unknown current version:\n%s", script)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,947 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
// Identifiers shared by the update request handling in this package.
const (
	// DefaultUpdateProduct is the product assumed when a request names none.
	DefaultUpdateProduct = "rap-node-agent"
	// HostAgentUpdateProduct is the product name of the host agent itself.
	HostAgentUpdateProduct = "rap-host-agent"

	// DefaultUpdateInstallType is the install type assumed when a request
	// names none.
	DefaultUpdateInstallType = "docker"
	// BinaryUpdateInstallType is the install type of a Linux binary install.
	BinaryUpdateInstallType = "linux_binary"
	// WindowsUpdateInstallType is the install type of a Windows install.
	WindowsUpdateInstallType = "windows_service"

	// UpdateStateFileName and UpdateTriggerFileName are file names;
	// presumably they live inside the node state directory (confirm against
	// the code that reads and writes them).
	UpdateStateFileName   = "host-update-state.json"
	UpdateTriggerFileName = "update-trigger.json"
)
|
||||
|
||||
// ErrNodeIdentityNotReady signals that the node identity has not been approved
// yet. The update loops detect it with errors.Is and log it as a
// "waiting_for_node_identity" status instead of a failure.
var ErrNodeIdentityNotReady = errors.New("node identity is not approved yet")
|
||||
|
||||
type UpdateRequest struct {
|
||||
BackendURL string
|
||||
ClusterID string
|
||||
NodeID string
|
||||
StateDir string
|
||||
Product string
|
||||
CurrentVersion string
|
||||
OS string
|
||||
Arch string
|
||||
InstallType string
|
||||
Channel string
|
||||
ContainerName string
|
||||
BinaryPath string
|
||||
WindowsTaskName string
|
||||
SystemdUnitName string
|
||||
HealthTimeout time.Duration
|
||||
DryRun bool
|
||||
AllowProductionMesh bool
|
||||
}
|
||||
|
||||
// UpdateResult summarizes one update attempt.
type UpdateResult struct {
	// Action, Reason and TargetVersion are copied from the update plan.
	Action        string
	Reason        string
	TargetVersion string
	// ContainerName is the container the request targeted.
	ContainerName string
	// PreviousImageID is the image of the container before the update.
	PreviousImageID string
	// NewImage is the image selected from the plan's artifact.
	NewImage string
	// ContainerID, Loaded and Replaced come from the install step.
	ContainerID string
	Loaded      bool
	Replaced    bool
	// RolledBack is set when a permitted rollback completed without error.
	RolledBack bool
	// RestartNeeded is reported by the host-agent update loop log line.
	RestartNeeded bool
}
|
||||
|
||||
type UpdateLoopConfig struct {
|
||||
Request UpdateRequest
|
||||
Interval time.Duration
|
||||
InitialDelay time.Duration
|
||||
Jitter float64
|
||||
MaxRuns int
|
||||
StopOnError bool
|
||||
HostAgentUpdateEnabled bool
|
||||
HostAgentUpdateRequest HostAgentUpdateRequest
|
||||
Logf func(format string, args ...any)
|
||||
}
|
||||
|
||||
// UpdateState is the JSON record saved after a successful update.
type UpdateState struct {
	Product        string `json:"product"`
	CurrentVersion string `json:"current_version"`
	// TargetVersion, ContainerName and Image are omitted from the JSON when
	// empty.
	TargetVersion string    `json:"target_version,omitempty"`
	ContainerName string    `json:"container_name,omitempty"`
	Image         string    `json:"image,omitempty"`
	UpdatedAt     time.Time `json:"updated_at"`
}
|
||||
|
||||
// UpdateTrigger is the JSON shape of an update trigger record. The update
// loop tracks its Generation value between runs.
type UpdateTrigger struct {
	SchemaVersion string `json:"schema_version"`
	Generation    string `json:"generation"`
	// The remaining descriptive fields are omitted from the JSON when empty.
	Products            []string  `json:"products,omitempty"`
	Reason              string    `json:"reason,omitempty"`
	DeliveryMode        string    `json:"delivery_mode,omitempty"`
	SubscriptionStatus  string    `json:"subscription_status,omitempty"`
	UpdateServiceNodeID string    `json:"update_service_node_id,omitempty"`
	UpdateServiceStatus string    `json:"update_service_status,omitempty"`
	FallbackPollSeconds int       `json:"fallback_poll_seconds,omitempty"`
	ObservedAt          time.Time `json:"observed_at"`
}
|
||||
|
||||
type NodeUpdatePlanResponse struct {
|
||||
Plan NodeUpdatePlan `json:"node_update_plan"`
|
||||
}
|
||||
|
||||
type NodeUpdatePlan struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
NodeID string `json:"node_id"`
|
||||
Product string `json:"product"`
|
||||
CurrentVersion string `json:"current_version,omitempty"`
|
||||
Action string `json:"action"`
|
||||
Reason string `json:"reason"`
|
||||
TargetVersion string `json:"target_version,omitempty"`
|
||||
Channel string `json:"channel,omitempty"`
|
||||
Strategy string `json:"strategy,omitempty"`
|
||||
RollbackAllowed bool `json:"rollback_allowed"`
|
||||
HealthWindowSec int `json:"health_window_seconds,omitempty"`
|
||||
Artifact *ReleaseArtifact `json:"artifact,omitempty"`
|
||||
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
|
||||
AuthoritySignature json.RawMessage `json:"authority_signature,omitempty"`
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
}
|
||||
|
||||
// ReleaseArtifact describes one downloadable release artifact of an update
// plan.
type ReleaseArtifact struct {
	ID        string `json:"id"`
	ReleaseID string `json:"release_id"`
	ClusterID string `json:"cluster_id"`
	Product   string `json:"product"`
	Version   string `json:"version"`
	OS        string `json:"os"`
	Arch      string `json:"arch"`
	// InstallType must be empty or DefaultUpdateInstallType for ApplyUpdate.
	InstallType string `json:"install_type"`
	Kind        string `json:"kind"`
	// URL is the primary download location; URLs optionally lists more.
	URL  string   `json:"url"`
	URLs []string `json:"urls,omitempty"`
	// SHA256 and SizeBytes are handed to the install step along with the
	// artifact URLs.
	SHA256    string          `json:"sha256"`
	SizeBytes int64           `json:"size_bytes"`
	Signature *string         `json:"signature,omitempty"`
	Metadata  json.RawMessage `json:"metadata"`
	CreatedAt time.Time       `json:"created_at"`
}
|
||||
|
||||
// NodeUpdateStatusRequest is the status report sent to the backend for each
// phase of an update attempt.
type NodeUpdateStatusRequest struct {
	Product        string `json:"product"`
	CurrentVersion string `json:"current_version,omitempty"`
	TargetVersion  string `json:"target_version,omitempty"`
	// Phase and Status name the step and its outcome, for example
	// "download"/"started" or "health_check"/"succeeded".
	Phase           string         `json:"phase"`
	Status          string         `json:"status"`
	AttemptID       string         `json:"attempt_id,omitempty"`
	ErrorMessage    *string        `json:"error_message,omitempty"`
	RollbackVersion *string        `json:"rollback_version,omitempty"`
	Payload         map[string]any `json:"payload,omitempty"`
	// NOTE(review): omitempty does not drop a zero time.Time in
	// encoding/json, so observed_at is always emitted.
	ObservedAt time.Time `json:"observed_at,omitempty"`
}
|
||||
|
||||
// dockerInspectContainer holds the fields decoded from a container's inspect
// JSON: its identity, configured image and environment, selected host
// settings, mounts and running state.
type dockerInspectContainer struct {
	ID    string `json:"Id"`
	Image string `json:"Image"`

	// Config carries the image reference and environment of the container.
	Config struct {
		Image string   `json:"Image"`
		Env   []string `json:"Env"`
	} `json:"Config"`

	// HostConfig carries the host-level settings of the container.
	HostConfig struct {
		Privileged  bool     `json:"Privileged"`
		NetworkMode string   `json:"NetworkMode"`
		CapAdd      []string `json:"CapAdd"`
		Devices     []struct {
			PathOnHost        string `json:"PathOnHost"`
			PathInContainer   string `json:"PathInContainer"`
			CgroupPermissions string `json:"CgroupPermissions"`
		} `json:"Devices"`
		RestartPolicy struct {
			Name string `json:"Name"`
		} `json:"RestartPolicy"`
	} `json:"HostConfig"`

	// Mounts lists host source and container destination of each mount.
	Mounts []struct {
		Source      string `json:"Source"`
		Destination string `json:"Destination"`
	} `json:"Mounts"`

	State struct {
		Running bool `json:"Running"`
	} `json:"State"`
}
|
||||
|
||||
func (req UpdateRequest) Normalize() UpdateRequest {
|
||||
req.BackendURL = strings.TrimRight(strings.TrimSpace(req.BackendURL), "/")
|
||||
req.ClusterID = strings.TrimSpace(req.ClusterID)
|
||||
req.NodeID = strings.TrimSpace(req.NodeID)
|
||||
req.StateDir = strings.TrimSpace(req.StateDir)
|
||||
req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct)
|
||||
req.OS = firstNonEmpty(req.OS, runtime.GOOS)
|
||||
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
|
||||
req.InstallType = firstNonEmpty(req.InstallType, DefaultUpdateInstallType)
|
||||
req.Channel = strings.TrimSpace(req.Channel)
|
||||
req.ContainerName = firstNonEmpty(req.ContainerName, DefaultContainerName)
|
||||
req.BinaryPath = strings.TrimSpace(req.BinaryPath)
|
||||
req.WindowsTaskName = strings.TrimSpace(req.WindowsTaskName)
|
||||
req.SystemdUnitName = strings.TrimSpace(req.SystemdUnitName)
|
||||
if req.HealthTimeout == 0 {
|
||||
req.HealthTimeout = 30 * time.Second
|
||||
}
|
||||
return req
|
||||
}
|
||||
|
||||
func (req UpdateRequest) Validate() error {
|
||||
req = req.Normalize()
|
||||
var missing []string
|
||||
if req.BackendURL == "" {
|
||||
missing = append(missing, "backend-url")
|
||||
}
|
||||
if req.ClusterID == "" {
|
||||
missing = append(missing, "cluster-id")
|
||||
}
|
||||
if req.NodeID == "" && req.StateDir == "" {
|
||||
missing = append(missing, "node-id-or-state-dir")
|
||||
}
|
||||
if req.InstallType == WindowsUpdateInstallType {
|
||||
if req.BinaryPath == "" {
|
||||
missing = append(missing, "binary-path")
|
||||
}
|
||||
if req.WindowsTaskName == "" {
|
||||
missing = append(missing, "windows-task-name")
|
||||
}
|
||||
} else if req.InstallType == BinaryUpdateInstallType && req.Product != HostAgentUpdateProduct {
|
||||
if req.BinaryPath == "" {
|
||||
missing = append(missing, "binary-path")
|
||||
}
|
||||
if req.SystemdUnitName == "" {
|
||||
missing = append(missing, "systemd-unit")
|
||||
}
|
||||
} else if req.ContainerName == "" {
|
||||
missing = append(missing, "container-name")
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
return fmt.Errorf("missing required update settings: %s", strings.Join(missing, ", "))
|
||||
}
|
||||
if req.HealthTimeout < 0 {
|
||||
return errors.New("health timeout must not be negative")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
|
||||
req = req.Normalize()
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
docker := firstNonEmpty(m.Binary, "docker")
|
||||
|
||||
plan, err := FetchNodeUpdatePlan(ctx, req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
|
||||
req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
|
||||
}
|
||||
result := UpdateResult{
|
||||
Action: plan.Action,
|
||||
Reason: plan.Reason,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: req.ContainerName,
|
||||
}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromNoopPlan(req, plan))
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
result.NewImage = artifactImage(*plan.Artifact, "")
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "planned",
|
||||
Status: "accepted",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason},
|
||||
})
|
||||
|
||||
current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "inspect", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
result.PreviousImageID = current.Image
|
||||
cfg.BackendURL = firstNonEmpty(cfg.BackendURL, req.BackendURL)
|
||||
cfg.ClusterID = firstNonEmpty(cfg.ClusterID, req.ClusterID)
|
||||
cfg.ContainerName = req.ContainerName
|
||||
cfg.Image = artifactImage(*plan.Artifact, cfg.Image)
|
||||
cfg.ImageArtifactURLs = artifactURLsForBackend(*plan.Artifact, req.BackendURL)
|
||||
cfg.ImageArtifactSHA256 = plan.Artifact.SHA256
|
||||
cfg.ImageArtifactSizeBytes = plan.Artifact.SizeBytes
|
||||
cfg.Replace = true
|
||||
cfg.JoinToken = ""
|
||||
result.NewImage = cfg.Image
|
||||
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "download",
|
||||
Status: "started",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": cfg.ImageArtifactURLs, "image": cfg.Image},
|
||||
})
|
||||
installed, err := m.Install(ctx, cfg)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
|
||||
if rollbackErr == nil && plan.RollbackAllowed {
|
||||
result.RolledBack = true
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
result.Loaded = installed.Loaded
|
||||
result.Replaced = installed.Replaced
|
||||
result.ContainerID = installed.ContainerID
|
||||
|
||||
if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "health_check", "failed", err))
|
||||
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
|
||||
if rollbackErr == nil && plan.RollbackAllowed {
|
||||
result.RolledBack = true
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "health_check",
|
||||
Status: "succeeded",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"container_id": installed.ContainerID, "image": cfg.Image},
|
||||
})
|
||||
_ = saveUpdateState(req.StateDir, UpdateState{
|
||||
Product: req.Product,
|
||||
CurrentVersion: plan.TargetVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: req.ContainerName,
|
||||
Image: cfg.Image,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m DockerManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
|
||||
req := cfg.Request.Normalize()
|
||||
if err := req.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if cfg.Interval == 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
if cfg.Interval < 0 {
|
||||
return errors.New("update loop interval must not be negative")
|
||||
}
|
||||
if cfg.InitialDelay < 0 {
|
||||
return errors.New("update loop initial delay must not be negative")
|
||||
}
|
||||
if cfg.Jitter < 0 || cfg.Jitter > 1 {
|
||||
return errors.New("update loop jitter must be between 0 and 1")
|
||||
}
|
||||
logf := cfg.Logf
|
||||
if logf == nil {
|
||||
logf = func(string, ...any) {}
|
||||
}
|
||||
if cfg.InitialDelay > 0 {
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
runs := 0
|
||||
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
|
||||
for {
|
||||
runs++
|
||||
result, err := m.ApplyUpdate(ctx, req)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
logf("update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
logf("update_loop run=%d action=%s reason=%s target=%s container=%s loaded=%t replaced=%t rolled_back=%t",
|
||||
runs,
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.ContainerName,
|
||||
result.Loaded,
|
||||
result.Replaced,
|
||||
result.RolledBack,
|
||||
)
|
||||
if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
|
||||
req.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
}
|
||||
if cfg.HostAgentUpdateEnabled {
|
||||
hostReq := cfg.HostAgentUpdateRequest
|
||||
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
|
||||
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
|
||||
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
|
||||
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
|
||||
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
|
||||
hostReq.CurrentVersion = firstNonEmpty(hostReq.CurrentVersion, req.CurrentVersion)
|
||||
hostReq.OS = firstNonEmpty(hostReq.OS, req.OS)
|
||||
hostReq.Arch = firstNonEmpty(hostReq.Arch, req.Arch)
|
||||
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, hostAgentInstallTypeFor(req.InstallType))
|
||||
result, err := m.ApplyHostAgentUpdate(ctx, hostReq)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
|
||||
} else {
|
||||
logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
|
||||
runs,
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.NewImage,
|
||||
result.Replaced,
|
||||
result.RestartNeeded,
|
||||
)
|
||||
if result.Action == "update" && result.TargetVersion != "" {
|
||||
cfg.HostAgentUpdateRequest.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
if result.RestartNeeded {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan, error) {
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
values := url.Values{}
|
||||
values.Set("product", req.Product)
|
||||
values.Set("current_version", req.CurrentVersion)
|
||||
values.Set("os", req.OS)
|
||||
values.Set("arch", req.Arch)
|
||||
values.Set("install_type", req.InstallType)
|
||||
if req.Channel != "" {
|
||||
values.Set("channel", req.Channel)
|
||||
}
|
||||
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/plan?%s", req.BackendURL, url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
resp, err := http.DefaultClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return NodeUpdatePlan{}, fmt.Errorf("fetch update plan: %s", resp.Status)
|
||||
}
|
||||
var out NodeUpdatePlanResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
return out.Plan, nil
|
||||
}
|
||||
|
||||
func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
|
||||
req = req.Normalize()
|
||||
if err := req.Validate(); err != nil {
|
||||
return UpdateRequest{}, err
|
||||
}
|
||||
if req.NodeID == "" {
|
||||
identity, err := state.Load(filepath.Join(req.StateDir, state.FileName))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return UpdateRequest{}, ErrNodeIdentityNotReady
|
||||
}
|
||||
return UpdateRequest{}, err
|
||||
}
|
||||
if strings.TrimSpace(identity.NodeID) == "" {
|
||||
return UpdateRequest{}, ErrNodeIdentityNotReady
|
||||
}
|
||||
req.NodeID = strings.TrimSpace(identity.NodeID)
|
||||
if req.ClusterID == "" {
|
||||
req.ClusterID = strings.TrimSpace(identity.ClusterID)
|
||||
}
|
||||
}
|
||||
if updateState, err := loadUpdateState(req.StateDir, req.Product); err == nil && updateState.Product == req.Product && updateState.CurrentVersion != "" {
|
||||
req.CurrentVersion = updateState.CurrentVersion
|
||||
}
|
||||
return req, nil
|
||||
}
|
||||
|
||||
func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
|
||||
backendURL = strings.TrimRight(strings.TrimSpace(backendURL), "/")
|
||||
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/status", backendURL, url.PathEscape(clusterID), url.PathEscape(nodeID))
|
||||
body, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := http.DefaultClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return fmt.Errorf("report update status: %s", resp.Status)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) {
|
||||
out, err := runner.Run(ctx, docker, "inspect", containerName)
|
||||
if err != nil {
|
||||
return dockerInspectContainer{}, RuntimeConfig{}, err
|
||||
}
|
||||
var inspected []dockerInspectContainer
|
||||
if err := json.Unmarshal([]byte(out), &inspected); err != nil {
|
||||
return dockerInspectContainer{}, RuntimeConfig{}, err
|
||||
}
|
||||
if len(inspected) == 0 {
|
||||
return dockerInspectContainer{}, RuntimeConfig{}, fmt.Errorf("container %q not found", containerName)
|
||||
}
|
||||
env := envMap(inspected[0].Config.Env)
|
||||
cfg := RuntimeConfig{
|
||||
BackendURL: env["RAP_BACKEND_URL"],
|
||||
ClusterID: env["RAP_CLUSTER_ID"],
|
||||
NodeName: firstNonEmpty(env["RAP_NODE_NAME"], containerName),
|
||||
Image: inspected[0].Config.Image,
|
||||
ContainerName: containerName,
|
||||
StateDir: hostStateDir(inspected[0]),
|
||||
Network: firstNonEmpty(inspected[0].HostConfig.NetworkMode, DefaultNetwork),
|
||||
RestartPolicy: firstNonEmpty(inspected[0].HostConfig.RestartPolicy.Name, "unless-stopped"),
|
||||
WorkloadSupervisionEnabled: parseBool(env["RAP_WORKLOAD_SUPERVISION_ENABLED"]),
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshProductionForwardingEnabled: parseBool(env["RAP_MESH_PRODUCTION_FORWARDING_ENABLED"]),
|
||||
MeshListenAddr: env["RAP_MESH_LISTEN_ADDR"],
|
||||
MeshListenPortMode: env["RAP_MESH_LISTEN_PORT_MODE"],
|
||||
MeshListenAutoPortStart: parseInt(env["RAP_MESH_LISTEN_AUTO_PORT_START"]),
|
||||
MeshListenAutoPortEnd: parseInt(env["RAP_MESH_LISTEN_AUTO_PORT_END"]),
|
||||
MeshAdvertiseEndpoint: env["RAP_MESH_ADVERTISE_ENDPOINT"],
|
||||
MeshAdvertiseEndpointsJSON: env["RAP_MESH_ADVERTISE_ENDPOINTS_JSON"],
|
||||
MeshAdvertiseTransport: env["RAP_MESH_ADVERTISE_TRANSPORT"],
|
||||
MeshConnectivityMode: env["RAP_MESH_CONNECTIVITY_MODE"],
|
||||
MeshNATType: env["RAP_MESH_NAT_TYPE"],
|
||||
MeshRegion: env["RAP_MESH_REGION"],
|
||||
HeartbeatIntervalSeconds: parseInt(env["RAP_HEARTBEAT_INTERVAL_SECONDS"]),
|
||||
EnrollmentPollIntervalSeconds: parseInt(env["RAP_ENROLLMENT_POLL_INTERVAL_SECONDS"]),
|
||||
EnrollmentPollTimeoutSeconds: parseInt(env["RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS"]),
|
||||
ProductionObservationSinkCap: parseInt(env["RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY"]),
|
||||
DockerVPNGatewayEnabled: dockerInspectHasVPNGatewayRuntime(inspected[0]),
|
||||
}
|
||||
return inspected[0], cfg.Normalize(), nil
|
||||
}
|
||||
|
||||
func dockerInspectHasVPNGatewayRuntime(container dockerInspectContainer) bool {
|
||||
hasNetAdmin := false
|
||||
for _, cap := range container.HostConfig.CapAdd {
|
||||
if strings.EqualFold(strings.TrimSpace(cap), "NET_ADMIN") {
|
||||
hasNetAdmin = true
|
||||
break
|
||||
}
|
||||
}
|
||||
hasTun := false
|
||||
for _, device := range container.HostConfig.Devices {
|
||||
if device.PathOnHost == "/dev/net/tun" || device.PathInContainer == "/dev/net/tun" {
|
||||
hasTun = true
|
||||
break
|
||||
}
|
||||
}
|
||||
return (container.HostConfig.Privileged || hasNetAdmin) && hasTun
|
||||
}
|
||||
|
||||
func (m DockerManager) waitContainerRunning(ctx context.Context, runner CommandRunner, docker, containerName string, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
out, err := runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", containerName)
|
||||
if err == nil && strings.TrimSpace(out) == "true" {
|
||||
return nil
|
||||
}
|
||||
if timeout == 0 || time.Now().After(deadline) {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("container %q is not running", containerName)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m DockerManager) rollbackContainer(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig, previous dockerInspectContainer, allowed bool) error {
|
||||
if !allowed || strings.TrimSpace(previous.Image) == "" {
|
||||
return nil
|
||||
}
|
||||
rollbackCfg := cfg
|
||||
rollbackCfg.Image = previous.Image
|
||||
rollbackCfg.ImageArtifactURLs = nil
|
||||
rollbackCfg.ImageArtifactSHA256 = ""
|
||||
rollbackCfg.ImageArtifactSizeBytes = 0
|
||||
rollbackCfg.Replace = true
|
||||
_, err := m.Install(ctx, rollbackCfg)
|
||||
if err == nil {
|
||||
_, _ = runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", cfg.ContainerName)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func artifactImage(artifact ReleaseArtifact, fallback string) string {
|
||||
if len(artifact.Metadata) > 0 {
|
||||
var metadata struct {
|
||||
Image string `json:"image"`
|
||||
}
|
||||
if err := json.Unmarshal(artifact.Metadata, &metadata); err == nil && strings.TrimSpace(metadata.Image) != "" {
|
||||
return strings.TrimSpace(metadata.Image)
|
||||
}
|
||||
}
|
||||
if artifact.InstallType == DefaultUpdateInstallType && artifact.Product != "" && artifact.Version != "" {
|
||||
return strings.TrimSpace(artifact.Product) + ":" + strings.TrimSpace(artifact.Version)
|
||||
}
|
||||
return firstNonEmpty(fallback, DefaultImage)
|
||||
}
|
||||
|
||||
func artifactURLs(artifact ReleaseArtifact) []string {
|
||||
out := make([]string, 0, 1+len(artifact.URLs))
|
||||
for _, raw := range append([]string{artifact.URL}, artifact.URLs...) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" || containsArtifactURL(out, raw) {
|
||||
continue
|
||||
}
|
||||
out = append(out, raw)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func artifactURLsForBackend(artifact ReleaseArtifact, backendURL string) []string {
|
||||
urls := artifactURLs(artifact)
|
||||
base, err := url.Parse(strings.TrimSpace(backendURL))
|
||||
if err != nil || base.Scheme == "" || base.Host == "" {
|
||||
return urls
|
||||
}
|
||||
origin := base.Scheme + "://" + base.Host
|
||||
out := make([]string, 0, len(urls))
|
||||
for _, raw := range urls {
|
||||
if strings.HasPrefix(raw, "/") {
|
||||
raw = origin + raw
|
||||
}
|
||||
if !containsArtifactURL(out, raw) {
|
||||
out = append(out, raw)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// containsArtifactURL reports whether value is present in values, using exact
// string equality.
func containsArtifactURL(values []string, value string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == value {
			return true
		}
	}
	return false
}
|
||||
|
||||
func statusFromError(req UpdateRequest, plan NodeUpdatePlan, phase, status string, err error) NodeUpdateStatusRequest {
|
||||
message := err.Error()
|
||||
return NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: phase,
|
||||
Status: status,
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ErrorMessage: &message,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}
|
||||
}
|
||||
|
||||
func statusFromNoopPlan(req UpdateRequest, plan NodeUpdatePlan) NodeUpdateStatusRequest {
|
||||
return NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "plan",
|
||||
Status: "noop",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{
|
||||
"action": plan.Action,
|
||||
"reason": plan.Reason,
|
||||
"strategy": plan.Strategy,
|
||||
"channel": plan.Channel,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func updateAttemptID(plan NodeUpdatePlan) string {
|
||||
parts := []string{plan.NodeID, plan.Product, plan.TargetVersion}
|
||||
if plan.Artifact != nil {
|
||||
parts = append(parts, plan.Artifact.ID)
|
||||
}
|
||||
return strings.Join(parts, ":")
|
||||
}
|
||||
|
||||
// envMap converts "KEY=value" entries into a map, splitting at the first "=".
// Entries without "=" are skipped, and a later entry overrides an earlier one
// with the same key.
func envMap(items []string) map[string]string {
	env := make(map[string]string)
	for _, entry := range items {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			continue
		}
		env[entry[:sep]] = entry[sep+1:]
	}
	return env
}
|
||||
|
||||
func hostStateDir(container dockerInspectContainer) string {
|
||||
for _, mount := range container.Mounts {
|
||||
if mount.Destination == "/var/lib/rap-node-agent" && mount.Source != "" {
|
||||
return mount.Source
|
||||
}
|
||||
}
|
||||
return DefaultStateDir
|
||||
}
|
||||
|
||||
// parseBool interprets value as a boolean. After trimming and lower-casing,
// "1", "true", "yes", "y" and "on" are true; everything else is false.
func parseBool(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	for _, truthy := range [...]string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true
		}
	}
	return false
}
|
||||
|
||||
// parseInt converts value to an int after trimming surrounding whitespace.
// The conversion error is deliberately ignored: the result is whatever
// strconv.Atoi returns alongside it (0 for input that is not a number).
func parseInt(value string) int {
	trimmed := strings.TrimSpace(value)
	parsed, _ := strconv.Atoi(trimmed)
	return parsed
}
|
||||
|
||||
func loadUpdateState(stateDir string, product string) (UpdateState, error) {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
if stateDir == "" {
|
||||
return UpdateState{}, os.ErrNotExist
|
||||
}
|
||||
product = firstNonEmpty(normalizeUpdateProductToken(product), DefaultUpdateProduct)
|
||||
payload, err := os.ReadFile(updateStatePath(stateDir, product))
|
||||
if err != nil && product == DefaultUpdateProduct {
|
||||
payload, err = os.ReadFile(filepath.Join(stateDir, UpdateStateFileName))
|
||||
}
|
||||
if err != nil {
|
||||
return UpdateState{}, err
|
||||
}
|
||||
var item UpdateState
|
||||
if err := json.Unmarshal(payload, &item); err != nil {
|
||||
return UpdateState{}, err
|
||||
}
|
||||
item.Product = firstNonEmpty(item.Product, product)
|
||||
return item, nil
|
||||
}
|
||||
|
||||
func saveUpdateState(stateDir string, item UpdateState) error {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
if stateDir == "" || item.CurrentVersion == "" {
|
||||
return nil
|
||||
}
|
||||
item.Product = firstNonEmpty(item.Product, DefaultUpdateProduct)
|
||||
if item.UpdatedAt.IsZero() {
|
||||
item.UpdatedAt = time.Now().UTC()
|
||||
}
|
||||
if err := os.MkdirAll(stateDir, 0o700); err != nil {
|
||||
return err
|
||||
}
|
||||
payload, err := json.MarshalIndent(item, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(updateStatePath(stateDir, item.Product), payload, 0o600)
|
||||
}
|
||||
|
||||
func updateStatePath(stateDir, product string) string {
|
||||
product = normalizeUpdateProductToken(firstNonEmpty(product, DefaultUpdateProduct))
|
||||
if product == "" || product == DefaultUpdateProduct {
|
||||
return filepath.Join(stateDir, UpdateStateFileName)
|
||||
}
|
||||
return filepath.Join(stateDir, "host-update-state-"+product+".json")
|
||||
}
|
||||
|
||||
func UpdateTriggerPath(stateDir string) string {
|
||||
return filepath.Join(strings.TrimSpace(stateDir), UpdateTriggerFileName)
|
||||
}
|
||||
|
||||
func SaveUpdateTrigger(stateDir string, trigger UpdateTrigger) error {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
trigger.Generation = strings.TrimSpace(trigger.Generation)
|
||||
if stateDir == "" || trigger.Generation == "" {
|
||||
return nil
|
||||
}
|
||||
if trigger.SchemaVersion == "" {
|
||||
trigger.SchemaVersion = "rap.node_update_trigger.v1"
|
||||
}
|
||||
if trigger.ObservedAt.IsZero() {
|
||||
trigger.ObservedAt = time.Now().UTC()
|
||||
}
|
||||
if err := os.MkdirAll(stateDir, 0o700); err != nil {
|
||||
return err
|
||||
}
|
||||
payload, err := json.MarshalIndent(trigger, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(UpdateTriggerPath(stateDir), payload, 0o600)
|
||||
}
|
||||
|
||||
func currentUpdateTriggerGeneration(stateDir string) string {
|
||||
payload, err := os.ReadFile(UpdateTriggerPath(stateDir))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
var trigger UpdateTrigger
|
||||
if err := json.Unmarshal(payload, &trigger); err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(trigger.Generation)
|
||||
}
|
||||
|
||||
func CurrentUpdateTriggerGenerationForNodeAgent(stateDir string) string {
|
||||
return currentUpdateTriggerGeneration(stateDir)
|
||||
}
|
||||
|
||||
// normalizeUpdateProductToken trims and lower-cases value, then drops every
// rune other than a-z, 0-9, '-', '_' and '.'.
func normalizeUpdateProductToken(value string) string {
	keep := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
			return r
		case r == '-', r == '_', r == '.':
			return r
		default:
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
	}
	return strings.Map(keep, strings.ToLower(strings.TrimSpace(value)))
}
|
||||
|
||||
// sleepContext blocks for duration or until ctx is done, whichever comes
// first. It returns ctx.Err() when the context ends the wait and nil
// otherwise. A non-positive duration returns nil immediately without
// consulting ctx.
func sleepContext(ctx context.Context, duration time.Duration) error {
	if duration <= 0 {
		return nil
	}
	wake := time.NewTimer(duration)
	defer wake.Stop()

	select {
	case <-wake.C:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
|
||||
|
||||
func sleepUntilUpdateIntervalOrTrigger(ctx context.Context, stateDir string, duration time.Duration, lastGeneration *string) error {
|
||||
if duration <= 0 {
|
||||
return nil
|
||||
}
|
||||
deadline := time.NewTimer(duration)
|
||||
defer deadline.Stop()
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-deadline.C:
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
generation := currentUpdateTriggerGeneration(stateDir)
|
||||
if generation != "" && lastGeneration != nil && generation != *lastGeneration {
|
||||
*lastGeneration = generation
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// jitteredDuration returns base shifted by a random offset drawn uniformly
// from [-base*jitter, +base*jitter]. base is returned unchanged when base or
// jitter is non-positive, or when the spread truncates to zero.
func jitteredDuration(base time.Duration, jitter float64) time.Duration {
	if base <= 0 {
		return base
	}
	if jitter <= 0 {
		return base
	}
	spread := int64(float64(base) * jitter)
	if spread <= 0 {
		return base
	}
	// Pick a point in the window [base-spread, base+spread].
	lowest := base - time.Duration(spread)
	return lowest + time.Duration(rand.Int63n(2*spread+1))
}
|
||||
@@ -0,0 +1,672 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
// updateRunner is a scripted command runner for update tests. It records
// every invocation and answers docker inspect calls from canned data.
type updateRunner struct {
	// inspectJSON is returned for a plain `inspect <name>` call.
	inspectJSON string
	// healthOkay selects the answer to `inspect --format ...`: "true" when
	// set, "false" otherwise.
	healthOkay bool
	// calls lists each invocation as the command name followed by its args.
	calls [][]string
}
|
||||
|
||||
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
|
||||
urls := artifactURLsForBackend(ReleaseArtifact{
|
||||
URL: "/downloads/rap-node-agent-0.2.92.tar",
|
||||
URLs: []string{"/downloads/mirror.tar", "https://cdn.example.test/agent.tar"},
|
||||
}, "http://control.example.test:18080/api/v1")
|
||||
want := []string{
|
||||
"http://control.example.test:18080/downloads/rap-node-agent-0.2.92.tar",
|
||||
"http://control.example.test:18080/downloads/mirror.tar",
|
||||
"https://cdn.example.test/agent.tar",
|
||||
}
|
||||
if len(urls) != len(want) {
|
||||
t.Fatalf("urls = %#v", urls)
|
||||
}
|
||||
for i := range want {
|
||||
if urls[i] != want[i] {
|
||||
t.Fatalf("urls[%d] = %q, want %q; all=%#v", i, urls[i], want[i], urls)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *updateRunner) Run(_ context.Context, name string, args ...string) (string, error) {
|
||||
r.calls = append(r.calls, append([]string{name}, args...))
|
||||
if len(args) >= 2 && args[0] == "inspect" && args[1] == "--format" {
|
||||
if r.healthOkay {
|
||||
return "true\n", nil
|
||||
}
|
||||
return "false\n", nil
|
||||
}
|
||||
if len(args) == 2 && args[0] == "inspect" {
|
||||
return r.inspectJSON, nil
|
||||
}
|
||||
if len(args) >= 2 && args[0] == "image" && args[1] == "inspect" {
|
||||
return "[]", nil
|
||||
}
|
||||
if len(args) > 0 && args[0] == "run" {
|
||||
return "updated-container\n", nil
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestApplyUpdateFetchesPlanLoadsImageAndRecreatesContainer(t *testing.T) {
|
||||
artifactBody := []byte("fake docker image tar")
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"schema_version": "rap.node_update_plan.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.0-old",
|
||||
"action": "update",
|
||||
"reason": "matching_release_available",
|
||||
"target_version": "0.1.0-new",
|
||||
"rollback_allowed": true,
|
||||
"health_window_seconds": 1,
|
||||
"production_forwarding": false,
|
||||
"artifact": map[string]any{
|
||||
"id": "artifact-1",
|
||||
"product": "rap-node-agent",
|
||||
"version": "0.1.0-new",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": "docker",
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
"size_bytes": len(artifactBody),
|
||||
"metadata": map[string]any{"image": "rap-node-agent:test-new"},
|
||||
},
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
|
||||
result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.0-old",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
HealthTimeout: time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply update: %v", err)
|
||||
}
|
||||
if result.Action != "update" || !result.Loaded || !result.Replaced || result.NewImage != "rap-node-agent:test-new" {
|
||||
t.Fatalf("unexpected result: %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
for _, want := range []string{"inspect\x00rap-node-agent-node-1", "load\x00-i", "rm\x00-f\x00rap-node-agent-node-1", "run\x00-d", "RAP_NODE_NAME=node-a"} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("missing docker call part %q in %#v", want, runner.calls)
|
||||
}
|
||||
}
|
||||
if len(statuses) != 3 || statuses[0].Phase != "planned" || statuses[1].Phase != "download" || statuses[2].Status != "succeeded" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyUpdatePreservesDockerVPNGatewayRuntime(t *testing.T) {
|
||||
previousStatHostPath := statHostPath
|
||||
statHostPath = func(string) (os.FileInfo, error) { return nil, nil }
|
||||
t.Cleanup(func() { statHostPath = previousStatHostPath })
|
||||
|
||||
artifactBody := []byte("fake docker image tar")
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"schema_version": "rap.node_update_plan.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.2.7",
|
||||
"action": "update",
|
||||
"reason": "matching_release_available",
|
||||
"target_version": "0.2.8",
|
||||
"rollback_allowed": true,
|
||||
"health_window_seconds": 1,
|
||||
"artifact": map[string]any{
|
||||
"id": "artifact-1",
|
||||
"product": "rap-node-agent",
|
||||
"version": "0.2.8",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": "docker",
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
"size_bytes": len(artifactBody),
|
||||
"metadata": map[string]any{"image": "rap-node-agent:test-new"},
|
||||
},
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixtureWithVPNGatewayRuntime()}
|
||||
result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.2.7",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
HealthTimeout: time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ApplyUpdate failed: %v", err)
|
||||
}
|
||||
if !result.Replaced {
|
||||
t.Fatalf("expected replacement")
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
for _, want := range []string{"--privileged", "--cap-add\x00NET_ADMIN", "--device\x00/dev/net/tun:/dev/net/tun"} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("docker run did not preserve %q in %#v", want, runner.calls)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyUpdateNoopsWithoutDockerWhenPlanHasNoAction(t *testing.T) {
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.3",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
"target_version": "0.1.3",
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &updateRunner{}
|
||||
result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.3",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply update: %v", err)
|
||||
}
|
||||
if result.Action != "none" || result.Reason != "already_current" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if len(runner.calls) != 0 {
|
||||
t.Fatalf("docker should not be called, got %#v", runner.calls)
|
||||
}
|
||||
if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" || statuses[0].TargetVersion != "0.1.3" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsApplyUpdateNoopReportsTaskStatus(t *testing.T) {
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.3",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
"target_version": "0.1.3",
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
result, err := (WindowsManager{Runner: &updateRunner{}}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.3",
|
||||
InstallType: WindowsUpdateInstallType,
|
||||
BinaryPath: `C:\Program Files\RAP\node\rap-node-agent.exe`,
|
||||
WindowsTaskName: "RAP Node Agent node",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("windows apply update: %v", err)
|
||||
}
|
||||
if result.Action != "none" || result.Reason != "already_current" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
if statuses[0].Payload["task"] != "RAP Node Agent node" {
|
||||
t.Fatalf("status payload = %+v", statuses[0].Payload)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunUpdateLoopAdvancesCurrentVersionAfterSuccessfulUpdate(t *testing.T) {
|
||||
artifactBody := []byte("fake docker image tar")
|
||||
planRequests := []string{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
current := r.URL.Query().Get("current_version")
|
||||
planRequests = append(planRequests, current)
|
||||
action := "update"
|
||||
reason := "matching_release_available"
|
||||
if current == "0.1.0-new" {
|
||||
action = "none"
|
||||
reason = "already_current"
|
||||
}
|
||||
plan := map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": current,
|
||||
"action": action,
|
||||
"reason": reason,
|
||||
"target_version": "0.1.0-new",
|
||||
"rollback_allowed": true,
|
||||
"production_forwarding": false,
|
||||
}
|
||||
if action == "update" {
|
||||
plan["artifact"] = map[string]any{
|
||||
"id": "artifact-1",
|
||||
"product": "rap-node-agent",
|
||||
"version": "0.1.0-new",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": "docker",
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
"size_bytes": len(artifactBody),
|
||||
"metadata": map[string]any{"image": "rap-node-agent:test-new"},
|
||||
}
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{"node_update_plan": plan})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
|
||||
err := (DockerManager{Runner: runner}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
|
||||
Request: UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.0-old",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
HealthTimeout: time.Second,
|
||||
},
|
||||
Interval: time.Millisecond,
|
||||
MaxRuns: 2,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("run update loop: %v", err)
|
||||
}
|
||||
if strings.Join(planRequests, ",") != "0.1.0-old,0.1.0-new" {
|
||||
t.Fatalf("plan current versions = %#v", planRequests)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunUpdateLoopReportsHostAgentStatusWhenEnabled(t *testing.T) {
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
planProducts := []string{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
product := r.URL.Query().Get("product")
|
||||
planProducts = append(planProducts, product)
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": product,
|
||||
"current_version": "0.1.3",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
"target_version": "0.1.3",
|
||||
"rollback_allowed": true,
|
||||
"production_forwarding": false,
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
err := (DockerManager{}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
|
||||
Request: UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.3",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
},
|
||||
HostAgentUpdateEnabled: true,
|
||||
HostAgentUpdateRequest: HostAgentUpdateRequest{
|
||||
CurrentVersion: "0.1.3",
|
||||
BinaryPath: filepath.Join(t.TempDir(), "rap-host-agent"),
|
||||
},
|
||||
MaxRuns: 1,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("run update loop: %v", err)
|
||||
}
|
||||
if strings.Join(planProducts, ",") != "rap-node-agent,rap-host-agent" {
|
||||
t.Fatalf("plan products = %#v", planProducts)
|
||||
}
|
||||
if len(statuses) != 2 || statuses[0].Product != "rap-node-agent" || statuses[1].Product != "rap-host-agent" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
if statuses[1].Phase != "plan" || statuses[1].Status != "noop" {
|
||||
t.Fatalf("host-agent status = %+v", statuses[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchNodeUpdatePlanResolvesNodeIDAndVersionFromStateDir(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if err := state.Save(filepath.Join(dir, state.FileName), state.Identity{
|
||||
NodeID: "node-from-state",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "node-a",
|
||||
}); err != nil {
|
||||
t.Fatalf("save identity: %v", err)
|
||||
}
|
||||
if err := saveUpdateState(dir, UpdateState{
|
||||
Product: "rap-node-agent",
|
||||
CurrentVersion: "0.1.0-state",
|
||||
}); err != nil {
|
||||
t.Fatalf("save update state: %v", err)
|
||||
}
|
||||
var gotPath string
|
||||
var gotCurrent string
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotPath = r.URL.Path
|
||||
gotCurrent = r.URL.Query().Get("current_version")
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-from-state",
|
||||
"product": "rap-node-agent",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
},
|
||||
})
|
||||
}))
|
||||
defer server.Close()
|
||||
if _, err := FetchNodeUpdatePlan(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
StateDir: dir,
|
||||
CurrentVersion: "0.1.0-flag",
|
||||
}); err != nil {
|
||||
t.Fatalf("fetch plan: %v", err)
|
||||
}
|
||||
if !strings.Contains(gotPath, "/nodes/node-from-state/updates/plan") || gotCurrent != "0.1.0-state" {
|
||||
t.Fatalf("path/current = %q/%q", gotPath, gotCurrent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyHostAgentUpdateDownloadsAndReplacesBinary(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if err := state.Save(filepath.Join(dir, state.FileName), state.Identity{
|
||||
NodeID: "node-1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "node-a",
|
||||
}); err != nil {
|
||||
t.Fatalf("save identity: %v", err)
|
||||
}
|
||||
binaryPath := filepath.Join(dir, "rap-host-agent")
|
||||
artifactBody := []byte("new host agent binary")
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
if r.URL.Query().Get("product") != HostAgentUpdateProduct || r.URL.Query().Get("install_type") != BinaryUpdateInstallType {
|
||||
t.Fatalf("unexpected query: %s", r.URL.RawQuery)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": HostAgentUpdateProduct,
|
||||
"action": "update",
|
||||
"reason": "matching_release_available",
|
||||
"target_version": "0.1.0-host-new",
|
||||
"rollback_allowed": false,
|
||||
"production_forwarding": false,
|
||||
"artifact": map[string]any{
|
||||
"id": "artifact-host-1",
|
||||
"product": HostAgentUpdateProduct,
|
||||
"version": "0.1.0-host-new",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": BinaryUpdateInstallType,
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "adc549d9e66ef64a507dd6880590d31309e16a3be965a92d849edd103cfb1815",
|
||||
"size_bytes": len(artifactBody),
|
||||
},
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
result, err := (DockerManager{}).ApplyHostAgentUpdate(context.Background(), HostAgentUpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
StateDir: dir,
|
||||
CurrentVersion: "0.1.0-host-old",
|
||||
BinaryPath: binaryPath,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply host-agent update: %v", err)
|
||||
}
|
||||
if !result.Replaced || !result.RestartNeeded {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
payload, err := os.ReadFile(binaryPath)
|
||||
if err != nil || string(payload) != string(artifactBody) {
|
||||
t.Fatalf("binary payload = %q, %v", payload, err)
|
||||
}
|
||||
updateState, err := loadUpdateState(dir, HostAgentUpdateProduct)
|
||||
if err != nil {
|
||||
t.Fatalf("load update state: %v", err)
|
||||
}
|
||||
if updateState.Product != HostAgentUpdateProduct || updateState.CurrentVersion != "0.1.0-host-new" {
|
||||
t.Fatalf("update state = %+v", updateState)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStateIsProductScoped(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if err := saveUpdateState(dir, UpdateState{Product: DefaultUpdateProduct, CurrentVersion: "node-v"}); err != nil {
|
||||
t.Fatalf("save node state: %v", err)
|
||||
}
|
||||
if err := saveUpdateState(dir, UpdateState{Product: HostAgentUpdateProduct, CurrentVersion: "host-v"}); err != nil {
|
||||
t.Fatalf("save host state: %v", err)
|
||||
}
|
||||
nodeState, err := loadUpdateState(dir, DefaultUpdateProduct)
|
||||
if err != nil {
|
||||
t.Fatalf("load node state: %v", err)
|
||||
}
|
||||
hostState, err := loadUpdateState(dir, HostAgentUpdateProduct)
|
||||
if err != nil {
|
||||
t.Fatalf("load host state: %v", err)
|
||||
}
|
||||
if nodeState.CurrentVersion != "node-v" || hostState.CurrentVersion != "host-v" {
|
||||
t.Fatalf("states overlapped: node=%+v host=%+v", nodeState, hostState)
|
||||
}
|
||||
}
|
||||
|
||||
func TestArtifactImageDerivesDockerTagFromProductAndVersion(t *testing.T) {
|
||||
got := artifactImage(ReleaseArtifact{
|
||||
Product: "rap-node-agent",
|
||||
Version: "0.2.77",
|
||||
InstallType: DefaultUpdateInstallType,
|
||||
}, "rap-node-agent:old")
|
||||
if got != "rap-node-agent:0.2.77" {
|
||||
t.Fatalf("expected versioned docker image, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// serverArtifactURL builds the absolute URL of "/artifact.tar" on the server
// that received r, using "https" when the request carries TLS state and "http"
// otherwise.
func serverArtifactURL(r *http.Request) string {
	var scheme string
	switch {
	case r.TLS != nil:
		scheme = "https"
	default:
		scheme = "http"
	}
	return fmt.Sprintf("%s://%s/artifact.tar", scheme, r.Host)
}
|
||||
|
||||
// dockerInspectFixture returns a JSON array describing one running container
// ("old-container", image "rap-node-agent:test-old") with host networking, an
// "unless-stopped" restart policy, RAP_* environment entries and a single
// state-directory mount. The string argument is ignored. Tests in this file
// hand the result to updateRunner as inspectJSON; presumably it stands in for
// `docker inspect` output.
func dockerInspectFixture(_ string) string {
|
||||
return `[
|
||||
{
|
||||
"Id": "old-container",
|
||||
"Image": "sha256:oldimage",
|
||||
"Config": {
|
||||
"Image": "rap-node-agent:test-old",
|
||||
"Env": [
|
||||
"RAP_BACKEND_URL=http://control/api/v1",
|
||||
"RAP_CLUSTER_ID=cluster-1",
|
||||
"RAP_NODE_NAME=node-a",
|
||||
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
|
||||
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
|
||||
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
|
||||
"RAP_MESH_LISTEN_ADDR=:19131"
|
||||
]
|
||||
},
|
||||
"HostConfig": {
|
||||
"NetworkMode": "host",
|
||||
"RestartPolicy": {"Name": "unless-stopped"}
|
||||
},
|
||||
"Mounts": [
|
||||
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
|
||||
],
|
||||
"State": {"Running": true}
|
||||
}
|
||||
]`
|
||||
}
|
||||
|
||||
// dockerInspectFixtureWithVPNGatewayRuntime returns the same kind of
// single-container JSON array as dockerInspectFixture, but its HostConfig
// additionally marks the container as privileged, adds the NET_ADMIN
// capability and maps the /dev/net/tun device.
func dockerInspectFixtureWithVPNGatewayRuntime() string {
|
||||
return `[
|
||||
{
|
||||
"Id": "old-container",
|
||||
"Image": "sha256:oldimage",
|
||||
"Config": {
|
||||
"Image": "rap-node-agent:test-old",
|
||||
"Env": [
|
||||
"RAP_BACKEND_URL=http://control/api/v1",
|
||||
"RAP_CLUSTER_ID=cluster-1",
|
||||
"RAP_NODE_NAME=node-a",
|
||||
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
|
||||
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
|
||||
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
|
||||
"RAP_MESH_LISTEN_ADDR=:19131"
|
||||
]
|
||||
},
|
||||
"HostConfig": {
|
||||
"NetworkMode": "host",
|
||||
"Privileged": true,
|
||||
"CapAdd": ["NET_ADMIN"],
|
||||
"Devices": [
|
||||
{"PathOnHost": "/dev/net/tun", "PathInContainer": "/dev/net/tun", "CgroupPermissions": "rwm"}
|
||||
],
|
||||
"RestartPolicy": {"Name": "unless-stopped"}
|
||||
},
|
||||
"Mounts": [
|
||||
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
|
||||
],
|
||||
"State": {"Running": true}
|
||||
}
|
||||
]`
|
||||
}
|
||||
@@ -0,0 +1,368 @@
|
||||
package hostagent
|
||||
|
||||
import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"strings"
)
|
||||
|
||||
// Default Windows locations. Both are used as the parent directory of a
// per-node subdirectory when no explicit path is configured and the per-user
// fallback is not in effect.
const (
	// DefaultWindowsInstallDir is the root under which node-agent binaries and
	// scripts are installed.
	DefaultWindowsInstallDir = `C:\Program Files\RAP`

	// DefaultWindowsStateRoot is the root under which per-node state
	// directories are created.
	DefaultWindowsStateRoot = `C:\ProgramData\RAP\nodes`
)
|
||||
|
||||
type WindowsInstallConfig struct {
|
||||
RuntimeConfig RuntimeConfig
|
||||
NodeID string
|
||||
InstallDir string
|
||||
StartupMode string
|
||||
ArtifactURLs []string
|
||||
ArtifactSHA256 string
|
||||
ArtifactSizeBytes int64
|
||||
Replace bool
|
||||
DryRun bool
|
||||
AutoUpdateEnabled bool
|
||||
AutoUpdateCurrentVersion string
|
||||
AutoUpdateChannel string
|
||||
AutoUpdateIntervalSeconds int
|
||||
AutoUpdateInitialDelaySeconds int
|
||||
AutoUpdateHealthTimeoutSeconds int
|
||||
HostAgentSourcePath string
|
||||
}
|
||||
|
||||
// WindowsInstallResult reports what WindowsManager.Install computed and did.
type WindowsInstallResult struct {
	// Resolved names and locations of the installation.
	NodeName      string
	InstallDir    string
	StateDir      string
	NodeAgentPath string
	WrapperPath   string

	// StartupMode is the mode that was actually used; TaskName is the
	// scheduled task name derived from the node name.
	StartupMode string
	TaskName    string

	// Host-agent updater details, filled in by the updater installation step.
	HostAgentPath   string
	UpdaterTaskName string

	// Downloaded is true when the node-agent binary was fetched and copied.
	// Started is true when the startup task was registered and a run was
	// requested. AdminFallback is true when a per-user location or task was
	// used in "auto" mode.
	Downloaded     bool
	Started        bool
	UpdaterStarted bool
	AdminFallback  bool
}
|
||||
|
||||
type WindowsManager struct {
|
||||
Runner CommandRunner
|
||||
}
|
||||
|
||||
func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInstallConfig {
|
||||
stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(profile.NodeName)))
|
||||
return WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: profile.BackendURL,
|
||||
ClusterID: profile.ClusterID,
|
||||
JoinToken: profile.JoinToken,
|
||||
NodeName: profile.NodeName,
|
||||
StateDir: stateDir,
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshListenAddr: profile.MeshListenAddr,
|
||||
MeshListenPortMode: profile.MeshListenPortMode,
|
||||
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
|
||||
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
|
||||
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
|
||||
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
|
||||
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
|
||||
MeshConnectivityMode: profile.MeshConnectivityMode,
|
||||
MeshNATType: profile.MeshNATType,
|
||||
MeshRegion: profile.MeshRegion,
|
||||
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
|
||||
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
|
||||
EnrollmentPollTimeoutSeconds: profile.EnrollmentPollTimeoutSeconds,
|
||||
ProductionObservationSinkCap: profile.ProductionObservationSinkCapacity,
|
||||
},
|
||||
InstallDir: firstNonEmpty(profile.InstallDir, filepath.Join(DefaultWindowsInstallDir, safeUnitSlug(profile.NodeName))),
|
||||
StartupMode: firstNonEmpty(profile.StartupMode, "auto"),
|
||||
ArtifactURLs: binaryArtifactURLs(profile),
|
||||
ArtifactSHA256: binaryArtifactSHA256(profile),
|
||||
ArtifactSizeBytes: binaryArtifactSizeBytes(profile),
|
||||
Replace: true,
|
||||
AutoUpdateEnabled: true,
|
||||
}
|
||||
}
|
||||
|
||||
func (m WindowsManager) Install(ctx context.Context, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
|
||||
cfg.NodeID = strings.TrimSpace(cfg.NodeID)
|
||||
if strings.TrimSpace(cfg.RuntimeConfig.StateDir) == "" {
|
||||
cfg.RuntimeConfig.StateDir = filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(cfg.RuntimeConfig.NodeName))
|
||||
}
|
||||
cfg.RuntimeConfig.Replace = cfg.Replace
|
||||
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
|
||||
if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
|
||||
return WindowsInstallResult{}, err
|
||||
}
|
||||
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "auto"))
|
||||
noAdminPreferred := cfg.StartupMode == "user-task"
|
||||
cfg.InstallDir = firstNonEmpty(cfg.InstallDir, defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, noAdminPreferred))
|
||||
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "auto"))
|
||||
if noAdminPreferred && strings.HasPrefix(strings.ToLower(cfg.RuntimeConfig.StateDir), strings.ToLower(DefaultWindowsStateRoot)) {
|
||||
cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
|
||||
}
|
||||
result := WindowsInstallResult{
|
||||
NodeName: cfg.RuntimeConfig.NodeName,
|
||||
InstallDir: cfg.InstallDir,
|
||||
StateDir: cfg.RuntimeConfig.StateDir,
|
||||
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent.exe"),
|
||||
WrapperPath: filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd"),
|
||||
StartupMode: cfg.StartupMode,
|
||||
TaskName: "RAP Node Agent " + safeUnitSlug(cfg.RuntimeConfig.NodeName),
|
||||
}
|
||||
if cfg.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
if runtime.GOOS != "windows" {
|
||||
return result, fmt.Errorf("windows install is only supported on windows hosts")
|
||||
}
|
||||
if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
|
||||
if cfg.StartupMode != "auto" || !isAccessDenied(err) {
|
||||
return result, err
|
||||
}
|
||||
cfg.InstallDir = defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, true)
|
||||
cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
|
||||
result.InstallDir = cfg.InstallDir
|
||||
result.StateDir = cfg.RuntimeConfig.StateDir
|
||||
result.NodeAgentPath = filepath.Join(cfg.InstallDir, "rap-node-agent.exe")
|
||||
result.WrapperPath = filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd")
|
||||
if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.AdminFallback = true
|
||||
}
|
||||
if err := os.MkdirAll(cfg.RuntimeConfig.StateDir, 0o700); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
|
||||
m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
|
||||
path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
|
||||
m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
|
||||
if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr == nil {
|
||||
result.Downloaded = true
|
||||
goto binaryReady
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
result.Downloaded = true
|
||||
}
|
||||
binaryReady:
|
||||
if !fileExists(result.NodeAgentPath) {
|
||||
return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(cfg.InstallDir, "rap-node-agent.env.cmd"), []byte(windowsEnvScript(cfg.RuntimeConfig)), 0o600); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(result.WrapperPath, []byte(windowsWrapperScript(result.NodeAgentPath, filepath.Join(cfg.InstallDir, "rap-node-agent.env.cmd"))), 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
logPath := filepath.Join(cfg.RuntimeConfig.StateDir, "rap-node-agent.log")
|
||||
started, fallback, mode, err := m.installStartupTask(ctx, result.TaskName, result.WrapperPath, logPath, cfg.StartupMode)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Started = started
|
||||
result.AdminFallback = fallback
|
||||
result.StartupMode = mode
|
||||
result, err = installWindowsHostAgentUpdater(ctx, m, result, cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m WindowsManager) stopExistingNodeAgent(ctx context.Context, taskName, nodeAgentPath string) {
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
_, _ = runner.Run(ctx, "schtasks", "/End", "/TN", taskName)
|
||||
escapedPath := strings.ReplaceAll(nodeAgentPath, `'`, `''`)
|
||||
_, _ = runner.Run(ctx, "powershell", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command",
|
||||
`Get-Process rap-node-agent -ErrorAction SilentlyContinue | Where-Object { $_.Path -eq '`+escapedPath+`' } | Stop-Process -Force -ErrorAction SilentlyContinue`)
|
||||
}
|
||||
|
||||
func (m WindowsManager) installStartupTask(ctx context.Context, taskName, wrapperPath, logPath, mode string) (bool, bool, string, error) {
|
||||
if mode == "none" {
|
||||
return false, false, mode, nil
|
||||
}
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
if mode == "auto" || mode == "system-task" {
|
||||
_, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONSTART", "/RU", "SYSTEM", "/RL", "HIGHEST", "/TR", windowsTaskAction(wrapperPath, logPath), "/F")
|
||||
if err == nil {
|
||||
_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
|
||||
return true, false, "system-task", nil
|
||||
}
|
||||
if mode == "system-task" {
|
||||
return false, false, mode, err
|
||||
}
|
||||
}
|
||||
_, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONLOGON", "/TR", windowsTaskAction(wrapperPath, logPath), "/F")
|
||||
if err != nil {
|
||||
return false, mode == "auto", "user-task", err
|
||||
}
|
||||
_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
|
||||
return true, mode == "auto", "user-task", nil
|
||||
}
|
||||
|
||||
// windowsTaskAction returns the scheduled-task command line: cmd.exe runs the
// quoted wrapper script and appends its stdout and stderr to the quoted log
// file.
func windowsTaskAction(wrapperPath, logPath string) string {
	return fmt.Sprintf(`cmd.exe /c ""%s" >> "%s" 2>&1"`, wrapperPath, logPath)
}
|
||||
|
||||
func windowsEnvScript(cfg RuntimeConfig) string {
|
||||
lines := []string{"@echo off"}
|
||||
for _, env := range NodeAgentEnv(cfg) {
|
||||
key, value, ok := strings.Cut(env, "=")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lines = append(lines, "set "+key+"="+value)
|
||||
}
|
||||
return strings.Join(lines, "\r\n") + "\r\n"
|
||||
}
|
||||
|
||||
// windowsWrapperScript returns a CRLF-terminated batch script that calls the
// env script at envPath and then runs the node agent at nodeAgentPath; both
// paths are wrapped in double quotes.
func windowsWrapperScript(nodeAgentPath, envPath string) string {
	const eol = "\r\n"
	callEnv := `call "` + envPath + `"`
	runAgent := `"` + nodeAgentPath + `"`
	return "@echo off" + eol + callEnv + eol + runAgent + eol
}
|
||||
|
||||
func binaryArtifactURLs(profile WindowsInstallProfile) []string {
|
||||
if profile.NodeAgentArtifact != nil && len(profile.NodeAgentArtifact.URLs) > 0 {
|
||||
return append([]string(nil), profile.NodeAgentArtifact.URLs...)
|
||||
}
|
||||
if profile.NodeAgentArtifact == nil || strings.TrimSpace(profile.NodeAgentArtifact.FileName) == "" {
|
||||
return nil
|
||||
}
|
||||
out := []string{}
|
||||
fileName := strings.TrimLeft(strings.TrimSpace(profile.NodeAgentArtifact.FileName), "/")
|
||||
for _, endpoint := range profile.ArtifactEndpoints {
|
||||
if trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/"); trimmed != "" {
|
||||
out = append(out, trimmed+"/"+fileName)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func binaryArtifactSHA256(profile WindowsInstallProfile) string {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(profile.NodeAgentArtifact.SHA256)
|
||||
}
|
||||
|
||||
func binaryArtifactSizeBytes(profile WindowsInstallProfile) int64 {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return 0
|
||||
}
|
||||
return profile.NodeAgentArtifact.SizeBytes
|
||||
}
|
||||
|
||||
// fileExists reports whether os.Stat succeeds for path. Any Stat error, not
// only "does not exist", counts as absent; directories count as existing.
func fileExists(path string) bool {
	if _, statErr := os.Stat(path); statErr != nil {
		return false
	}
	return true
}
|
||||
|
||||
func copyFile(source, target string, mode os.FileMode) error {
|
||||
src, err := os.Open(source)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer src.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := target + ".tmp"
|
||||
dst, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := io.Copy(dst, src); err != nil {
|
||||
_ = dst.Close()
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := dst.Close(); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := replaceFile(tmp, target); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func replaceFile(tmp, target string) error {
|
||||
if runtime.GOOS != "windows" {
|
||||
return os.Rename(tmp, target)
|
||||
}
|
||||
backup := target + ".bak"
|
||||
_ = os.Remove(backup)
|
||||
if fileExists(target) {
|
||||
if err := os.Rename(target, backup); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := os.Rename(tmp, target); err != nil {
|
||||
if fileExists(backup) {
|
||||
_ = os.Rename(backup, target)
|
||||
}
|
||||
return err
|
||||
}
|
||||
_ = os.Remove(backup)
|
||||
return nil
|
||||
}
|
||||
|
||||
func defaultWindowsInstallDir(nodeName string, userMode bool) string {
|
||||
slug := safeUnitSlug(nodeName)
|
||||
if userMode {
|
||||
if base := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); base != "" {
|
||||
return filepath.Join(base, "RAP", slug)
|
||||
}
|
||||
if base := strings.TrimSpace(os.Getenv("USERPROFILE")); base != "" {
|
||||
return filepath.Join(base, "AppData", "Local", "RAP", slug)
|
||||
}
|
||||
}
|
||||
return filepath.Join(DefaultWindowsInstallDir, slug)
|
||||
}
|
||||
|
||||
func defaultWindowsStateDir(nodeName string, userMode bool) string {
|
||||
slug := safeUnitSlug(nodeName)
|
||||
if userMode {
|
||||
if base := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); base != "" {
|
||||
return filepath.Join(base, "RAP", "nodes", slug)
|
||||
}
|
||||
if base := strings.TrimSpace(os.Getenv("USERPROFILE")); base != "" {
|
||||
return filepath.Join(base, "AppData", "Local", "RAP", "nodes", slug)
|
||||
}
|
||||
}
|
||||
return filepath.Join(DefaultWindowsStateRoot, slug)
|
||||
}
|
||||
|
||||
// isAccessDenied reports whether err represents a permission failure.
//
// It first asks errors.Is whether err matches os.ErrPermission, which does not
// depend on the wording of the error message. As a fallback for errors that
// only carry text, it looks for the usual English phrases in the lowercased
// message. A nil error is never an access-denied error.
func isAccessDenied(err error) bool {
	if err == nil {
		return false
	}
	if errors.Is(err, os.ErrPermission) {
		return true
	}
	message := strings.ToLower(err.Error())
	for _, phrase := range [...]string{
		"access is denied",
		"permission denied",
		"operation not permitted",
	} {
		if strings.Contains(message, phrase) {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,337 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
|
||||
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
|
||||
req.InstallType = WindowsUpdateInstallType
|
||||
}
|
||||
req.OS = firstNonEmpty(req.OS, "windows")
|
||||
req.Arch = firstNonEmpty(req.Arch, "amd64")
|
||||
req = req.Normalize()
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
plan, err := FetchNodeUpdatePlan(ctx, req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
|
||||
req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
|
||||
}
|
||||
result := UpdateResult{
|
||||
Action: plan.Action,
|
||||
Reason: plan.Reason,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: req.WindowsTaskName,
|
||||
NewImage: req.BinaryPath,
|
||||
}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
status := statusFromNoopPlan(req, plan)
|
||||
if status.Payload == nil {
|
||||
status.Payload = map[string]any{}
|
||||
}
|
||||
status.Payload["task"] = req.WindowsTaskName
|
||||
status.Payload["binary_path"] = req.BinaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "planned",
|
||||
Status: "accepted",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
|
||||
})
|
||||
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "download",
|
||||
Status: "started",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath},
|
||||
})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
|
||||
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
|
||||
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
|
||||
if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
result.Replaced = true
|
||||
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "health_check",
|
||||
Status: "succeeded",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"task": req.WindowsTaskName, "binary_path": req.BinaryPath},
|
||||
})
|
||||
_ = saveUpdateState(req.StateDir, UpdateState{
|
||||
Product: req.Product,
|
||||
CurrentVersion: plan.TargetVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Image: req.BinaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
|
||||
req := cfg.Request
|
||||
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
|
||||
req.InstallType = WindowsUpdateInstallType
|
||||
}
|
||||
req.OS = firstNonEmpty(req.OS, "windows")
|
||||
req.Arch = firstNonEmpty(req.Arch, "amd64")
|
||||
req = req.Normalize()
|
||||
if err := req.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if cfg.Interval == 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
if cfg.Interval < 0 {
|
||||
return errors.New("update loop interval must not be negative")
|
||||
}
|
||||
if cfg.InitialDelay < 0 {
|
||||
return errors.New("update loop initial delay must not be negative")
|
||||
}
|
||||
if cfg.Jitter < 0 || cfg.Jitter > 1 {
|
||||
return errors.New("update loop jitter must be between 0 and 1")
|
||||
}
|
||||
logf := cfg.Logf
|
||||
if logf == nil {
|
||||
logf = func(string, ...any) {}
|
||||
}
|
||||
if cfg.InitialDelay > 0 {
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
runs := 0
|
||||
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
|
||||
for {
|
||||
runs++
|
||||
result, err := m.ApplyUpdate(ctx, req)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("windows_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
logf("windows_update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
logf("windows_update_loop run=%d action=%s reason=%s target=%s task=%s replaced=%t",
|
||||
runs,
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.ContainerName,
|
||||
result.Replaced,
|
||||
)
|
||||
if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
|
||||
req.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
}
|
||||
if cfg.HostAgentUpdateEnabled {
|
||||
hostReq := cfg.HostAgentUpdateRequest
|
||||
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
|
||||
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
|
||||
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
|
||||
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
|
||||
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
|
||||
hostReq.OS = firstNonEmpty(hostReq.OS, "windows")
|
||||
hostReq.Arch = firstNonEmpty(hostReq.Arch, "amd64")
|
||||
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, "windows_binary")
|
||||
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
|
||||
if hostErr != nil {
|
||||
if errors.Is(hostErr, ErrNodeIdentityNotReady) {
|
||||
logf("windows_host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
|
||||
} else {
|
||||
logf("windows_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
|
||||
if cfg.StopOnError {
|
||||
return hostErr
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logf("windows_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
|
||||
runs,
|
||||
hostResult.Action,
|
||||
hostResult.Reason,
|
||||
hostResult.TargetVersion,
|
||||
hostResult.NewImage,
|
||||
hostResult.Replaced,
|
||||
hostResult.RestartNeeded,
|
||||
)
|
||||
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
|
||||
cfg.HostAgentUpdateRequest.CurrentVersion = hostResult.TargetVersion
|
||||
}
|
||||
}
|
||||
}
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func installWindowsHostAgentUpdater(ctx context.Context, m WindowsManager, result WindowsInstallResult, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
|
||||
if !cfg.AutoUpdateEnabled || strings.EqualFold(result.StartupMode, "none") {
|
||||
return result, nil
|
||||
}
|
||||
if cfg.AutoUpdateCurrentVersion == "" || (cfg.Replace && !result.Downloaded) {
|
||||
cfg.AutoUpdateCurrentVersion = "0.0.0"
|
||||
}
|
||||
hostAgentPath := filepath.Join(result.InstallDir, "rap-host-agent.exe")
|
||||
if err := installHostAgentBinary(cfg.HostAgentSourcePath, hostAgentPath); err != nil {
|
||||
return result, err
|
||||
}
|
||||
wrapperPath := filepath.Join(result.InstallDir, "rap-host-agent-update.cmd")
|
||||
logPath := filepath.Join(result.StateDir, "rap-host-agent-update.log")
|
||||
taskName := "RAP Host Agent Updater " + safeUnitSlug(result.NodeName)
|
||||
script := windowsHostAgentUpdateScript(hostAgentPath, cfg, result)
|
||||
if err := os.WriteFile(wrapperPath, []byte(script), 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
started, fallback, mode, err := m.installStartupTask(ctx, taskName, wrapperPath, logPath, cfg.StartupMode)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.HostAgentPath = hostAgentPath
|
||||
result.UpdaterTaskName = taskName
|
||||
result.UpdaterStarted = started
|
||||
if fallback {
|
||||
result.AdminFallback = true
|
||||
}
|
||||
if mode != "" && mode != result.StartupMode {
|
||||
result.StartupMode = mode
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig, result WindowsInstallResult) string {
|
||||
currentVersion := firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0")
|
||||
interval := cfg.AutoUpdateIntervalSeconds
|
||||
if interval == 0 {
|
||||
interval = 21600
|
||||
}
|
||||
initialDelay := cfg.AutoUpdateInitialDelaySeconds
|
||||
if initialDelay == 0 {
|
||||
initialDelay = 15
|
||||
}
|
||||
healthTimeout := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
if healthTimeout == 0 {
|
||||
healthTimeout = 30
|
||||
}
|
||||
updateLoopArgs := []string{
|
||||
`"` + hostAgentPath + `"`,
|
||||
"update-loop",
|
||||
"--backend-url", `"` + cfg.RuntimeConfig.BackendURL + `"`,
|
||||
"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
|
||||
"--state-dir", `"` + result.StateDir + `"`,
|
||||
"--current-version", currentVersion,
|
||||
"--os", "windows",
|
||||
"--arch", "amd64",
|
||||
"--install-type", WindowsUpdateInstallType,
|
||||
"--binary-path", `"` + result.NodeAgentPath + `"`,
|
||||
"--windows-task-name", `"` + result.TaskName + `"`,
|
||||
"--health-timeout-seconds", fmt.Sprintf("%d", healthTimeout),
|
||||
"--interval-seconds", fmt.Sprintf("%d", interval),
|
||||
"--initial-delay-seconds", "0",
|
||||
"--host-agent-update-status-enabled",
|
||||
"--host-agent-current-version", currentVersion,
|
||||
"--host-agent-binary-path", `"` + hostAgentPath + `"`,
|
||||
}
|
||||
if strings.TrimSpace(cfg.NodeID) != "" {
|
||||
updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
|
||||
}
|
||||
if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
|
||||
updateLoopArgs = append(updateLoopArgs, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
|
||||
}
|
||||
lines := []string{
|
||||
"@echo off",
|
||||
"setlocal",
|
||||
"set RAP_HOST_AGENT=" + `"` + hostAgentPath + `"`,
|
||||
"set RAP_HOST_AGENT_NEXT=" + `"` + hostAgentPath + `.next"`,
|
||||
}
|
||||
if initialDelay > 0 {
|
||||
lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
|
||||
}
|
||||
lines = append(lines, []string{
|
||||
":loop",
|
||||
"if exist %RAP_HOST_AGENT_NEXT% (",
|
||||
" copy /Y %RAP_HOST_AGENT_NEXT% %RAP_HOST_AGENT% >NUL",
|
||||
" if %ERRORLEVEL% EQU 0 del /F /Q %RAP_HOST_AGENT_NEXT%",
|
||||
")",
|
||||
strings.Join(updateLoopArgs, " "),
|
||||
"timeout /t " + fmt.Sprintf("%d", interval) + " /nobreak >NUL",
|
||||
"goto loop",
|
||||
"endlocal",
|
||||
"rem initial-delay-seconds " + fmt.Sprintf("%d", initialDelay),
|
||||
}...)
|
||||
return strings.Join(lines, "\r\n") + "\r\n"
|
||||
}
|
||||
@@ -63,10 +63,12 @@ const (
|
||||
ProductionChannelVPNPacket = "vpn_packet"
|
||||
ProductionMessageVPNPacketBatch = "vpn.packet_batch"
|
||||
FabricServiceClassVPNPackets = "vpn_packets"
|
||||
FabricServiceClassRemoteWorkspace = "remote_workspace"
|
||||
FabricServiceChannelBulk = "bulk"
|
||||
FabricServiceChannelControl = "control"
|
||||
FabricServiceChannelInteractive = "interactive"
|
||||
FabricServiceChannelReliable = "reliable"
|
||||
FabricServiceChannelDroppable = "droppable"
|
||||
MaxProductionEnvelopePayloadBytes = 4096
|
||||
MaxProductionVPNPacketPayloadBytes = 256 * 1024
|
||||
MaxProductionEnvelopeFutureSkew = time.Minute
|
||||
|
||||
@@ -59,9 +59,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
|
||||
reasons := []string{"base"}
|
||||
|
||||
switch candidate.Transport {
|
||||
case "direct_tcp_tls":
|
||||
case "direct_tcp_tls", "direct_http", "direct_https":
|
||||
score += 35
|
||||
reasons = append(reasons, "transport:direct_tcp_tls")
|
||||
reasons = append(reasons, "transport:direct")
|
||||
case "wss":
|
||||
score += 25
|
||||
reasons = append(reasons, "transport:wss")
|
||||
|
||||
@@ -37,27 +37,28 @@ type PeerCacheSnapshot struct {
|
||||
}
|
||||
|
||||
type PeerCacheEntry struct {
|
||||
NodeID string `json:"node_id"`
|
||||
RouteIDs []string `json:"route_ids,omitempty"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
EndpointCount int `json:"endpoint_count"`
|
||||
CandidateCount int `json:"candidate_count"`
|
||||
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
|
||||
RecoverySeed bool `json:"recovery_seed"`
|
||||
Warm bool `json:"warm"`
|
||||
WarmReason string `json:"warm_reason,omitempty"`
|
||||
BestCandidateID string `json:"best_candidate_id,omitempty"`
|
||||
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
|
||||
BestTransport string `json:"best_transport,omitempty"`
|
||||
BestReachability string `json:"best_reachability,omitempty"`
|
||||
BestConnectivity string `json:"best_connectivity,omitempty"`
|
||||
BestNATType string `json:"best_nat_type,omitempty"`
|
||||
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
|
||||
BestCandidateScore int `json:"best_candidate_score,omitempty"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
RelayControl bool `json:"relay_control"`
|
||||
NodeID string `json:"node_id"`
|
||||
RouteIDs []string `json:"route_ids,omitempty"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
EndpointCount int `json:"endpoint_count"`
|
||||
CandidateCount int `json:"candidate_count"`
|
||||
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
|
||||
RecoverySeed bool `json:"recovery_seed"`
|
||||
Warm bool `json:"warm"`
|
||||
WarmReason string `json:"warm_reason,omitempty"`
|
||||
BestCandidateID string `json:"best_candidate_id,omitempty"`
|
||||
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
|
||||
BestTransport string `json:"best_transport,omitempty"`
|
||||
BestReachability string `json:"best_reachability,omitempty"`
|
||||
BestConnectivity string `json:"best_connectivity,omitempty"`
|
||||
BestNATType string `json:"best_nat_type,omitempty"`
|
||||
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
|
||||
BestCandidateScore int `json:"best_candidate_score,omitempty"`
|
||||
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
RelayControl bool `json:"relay_control"`
|
||||
}
|
||||
|
||||
type peerCacheBuildEntry struct {
|
||||
@@ -117,6 +118,10 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
MaxVerificationAge: time.Hour,
|
||||
})
|
||||
if len(scored) > 0 {
|
||||
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
|
||||
for _, scoredCandidate := range scored {
|
||||
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
|
||||
}
|
||||
entry.BestCandidateID = scored[0].Candidate.EndpointID
|
||||
entry.BestCandidateAddr = scored[0].Candidate.Address
|
||||
entry.BestTransport = scored[0].Candidate.Transport
|
||||
|
||||
@@ -66,24 +66,44 @@ type PeerConnectionManagerSnapshot struct {
|
||||
}
|
||||
|
||||
type PeerConnectionProbeResult struct {
|
||||
NodeID string `json:"node_id"`
|
||||
LinkStatus string `json:"link_status"`
|
||||
Action string `json:"action"`
|
||||
Reason string `json:"reason"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
ConnectionState PeerConnectionState `json:"connection_state"`
|
||||
TransportMode string `json:"transport_mode"`
|
||||
RequiresRendezvous bool `json:"requires_rendezvous"`
|
||||
RendezvousResolved bool `json:"rendezvous_resolved"`
|
||||
DirectCandidate bool `json:"direct_candidate"`
|
||||
RelayCandidate bool `json:"relay_candidate"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
LatencyMs int `json:"latency_ms,omitempty"`
|
||||
FailureReason string `json:"failure_reason,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
CompletedAt time.Time `json:"completed_at"`
|
||||
NodeID string `json:"node_id"`
|
||||
LinkStatus string `json:"link_status"`
|
||||
Action string `json:"action"`
|
||||
Reason string `json:"reason"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
SelectedCandidateID string `json:"selected_candidate_id,omitempty"`
|
||||
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
|
||||
ConnectionState PeerConnectionState `json:"connection_state"`
|
||||
TransportMode string `json:"transport_mode"`
|
||||
RequiresRendezvous bool `json:"requires_rendezvous"`
|
||||
RendezvousResolved bool `json:"rendezvous_resolved"`
|
||||
DirectCandidate bool `json:"direct_candidate"`
|
||||
RelayCandidate bool `json:"relay_candidate"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
LatencyMs int `json:"latency_ms,omitempty"`
|
||||
FailureReason string `json:"failure_reason,omitempty"`
|
||||
CandidateResults []PeerConnectionCandidateProbeResult `json:"candidate_results,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
CompletedAt time.Time `json:"completed_at"`
|
||||
}
|
||||
|
||||
// PeerConnectionCandidateProbeResult records the outcome of probing a single
// endpoint candidate of a peer. One entry is produced per attempted candidate.
type PeerConnectionCandidateProbeResult struct {
	// CandidateID identifies the probed endpoint candidate, when known.
	CandidateID string `json:"candidate_id,omitempty"`
	// Endpoint is the address that was probed.
	Endpoint string `json:"endpoint"`
	// Transport is the candidate's transport label, when known.
	Transport string `json:"transport,omitempty"`
	// LinkStatus is the probe verdict for this candidate.
	LinkStatus string `json:"link_status"`
	// LatencyMs is the measured probe duration in milliseconds.
	LatencyMs int `json:"latency_ms,omitempty"`
	// FailureReason carries the probe error text for failed probes.
	FailureReason string `json:"failure_reason,omitempty"`
	// StartedAt and CompletedAt bracket this candidate's probe.
	StartedAt   time.Time `json:"started_at"`
	CompletedAt time.Time `json:"completed_at"`
}
|
||||
|
||||
// peerConnectionProbeTarget is one endpoint to try while probing a peer: the
// candidate it came from (if any), its address and its transport label.
type peerConnectionProbeTarget struct {
	CandidateID string
	Endpoint    string
	Transport   string
}
|
||||
|
||||
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
|
||||
@@ -137,6 +157,10 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
|
||||
RendezvousLeases: rendezvousLeases,
|
||||
Now: startedAt,
|
||||
})
|
||||
entriesByNode := map[string]PeerCacheEntry{}
|
||||
for _, entry := range peerSnapshot.Entries {
|
||||
entriesByNode[entry.NodeID] = entry
|
||||
}
|
||||
cycle := PeerConnectionManagerCycle{
|
||||
Mode: recoveryPlan.Mode,
|
||||
StartedAt: startedAt,
|
||||
@@ -150,7 +174,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
|
||||
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
|
||||
}
|
||||
for _, intent := range intentPlan.Intents {
|
||||
result := m.probeIntent(ctx, intent)
|
||||
result := m.probeIntent(ctx, intent, entriesByNode[intent.NodeID])
|
||||
cycle.Results = append(cycle.Results, result)
|
||||
switch result.LinkStatus {
|
||||
case PeerConnectionProbeReachable:
|
||||
@@ -200,7 +224,7 @@ func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvo
|
||||
return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...)
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
|
||||
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent, cacheEntry PeerCacheEntry) PeerConnectionProbeResult {
|
||||
startedAt := normalizedNow(m.now())
|
||||
result := PeerConnectionProbeResult{
|
||||
NodeID: intent.NodeID,
|
||||
@@ -254,9 +278,6 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
result.CompletedAt = normalizedNow(m.now())
|
||||
return result
|
||||
}
|
||||
m.tracker.BeginProbe(peer, startedAt)
|
||||
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
|
||||
defer cancel()
|
||||
target := PeerIdentity{
|
||||
ClusterID: m.local.ClusterID,
|
||||
NodeID: intent.NodeID,
|
||||
@@ -264,30 +285,118 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
if intent.RelayCandidate && intent.RelayNodeID != "" {
|
||||
target.NodeID = intent.RelayNodeID
|
||||
}
|
||||
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
|
||||
completedAt := normalizedNow(m.now())
|
||||
if err != nil {
|
||||
result.LinkStatus = PeerConnectionProbeUnreachable
|
||||
result.FailureReason = err.Error()
|
||||
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
|
||||
targets := []peerConnectionProbeTarget{{
|
||||
CandidateID: intent.BestCandidateID,
|
||||
Endpoint: intent.Endpoint,
|
||||
Transport: intent.Transport,
|
||||
}}
|
||||
if intent.DirectCandidate {
|
||||
targets = peerConnectionProbeTargets(intent, cacheEntry)
|
||||
}
|
||||
var lastFailure string
|
||||
for _, probeTarget := range targets {
|
||||
probePeer := peer
|
||||
probePeer.Endpoint = strings.TrimRight(strings.TrimSpace(probeTarget.Endpoint), "/")
|
||||
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
|
||||
probePeer.BestCandidateAddr = probePeer.Endpoint
|
||||
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
|
||||
if probePeer.Endpoint == "" {
|
||||
continue
|
||||
}
|
||||
candidateStartedAt := normalizedNow(m.now())
|
||||
m.tracker.BeginProbe(probePeer, candidateStartedAt)
|
||||
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
|
||||
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
|
||||
cancel()
|
||||
completedAt := normalizedNow(m.now())
|
||||
candidateResult := PeerConnectionCandidateProbeResult{
|
||||
CandidateID: probePeer.BestCandidateID,
|
||||
Endpoint: probePeer.Endpoint,
|
||||
Transport: probePeer.BestTransport,
|
||||
StartedAt: candidateStartedAt,
|
||||
CompletedAt: completedAt,
|
||||
}
|
||||
if err != nil {
|
||||
lastFailure = err.Error()
|
||||
candidateResult.LinkStatus = PeerConnectionProbeUnreachable
|
||||
candidateResult.FailureReason = lastFailure
|
||||
result.CandidateResults = append(result.CandidateResults, candidateResult)
|
||||
continue
|
||||
}
|
||||
latency := int(completedAt.Sub(candidateStartedAt).Milliseconds())
|
||||
if latency < 0 {
|
||||
latency = 0
|
||||
}
|
||||
candidateResult.LinkStatus = PeerConnectionProbeReachable
|
||||
candidateResult.LatencyMs = latency
|
||||
result.CandidateResults = append(result.CandidateResults, candidateResult)
|
||||
result.LinkStatus = PeerConnectionProbeReachable
|
||||
result.Endpoint = probePeer.Endpoint
|
||||
result.SelectedCandidateID = probePeer.BestCandidateID
|
||||
result.SelectedEndpoint = probePeer.Endpoint
|
||||
result.LatencyMs = latency
|
||||
if intent.RelayCandidate {
|
||||
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
|
||||
} else {
|
||||
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
|
||||
}
|
||||
result.CompletedAt = completedAt
|
||||
return result
|
||||
}
|
||||
latency := int(completedAt.Sub(startedAt).Milliseconds())
|
||||
if latency < 0 {
|
||||
latency = 0
|
||||
}
|
||||
result.LinkStatus = PeerConnectionProbeReachable
|
||||
result.LatencyMs = latency
|
||||
if intent.RelayCandidate {
|
||||
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
|
||||
} else {
|
||||
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
|
||||
completedAt := normalizedNow(m.now())
|
||||
if lastFailure == "" {
|
||||
lastFailure = "no_probe_endpoint_available"
|
||||
}
|
||||
result.LinkStatus = PeerConnectionProbeUnreachable
|
||||
result.FailureReason = lastFailure
|
||||
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, lastFailure, completedAt)
|
||||
result.CompletedAt = completedAt
|
||||
return result
|
||||
}
|
||||
|
||||
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
|
||||
seen := map[string]struct{}{}
|
||||
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
|
||||
add := func(candidateID, endpoint, transport string) {
|
||||
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
|
||||
if endpoint == "" {
|
||||
return
|
||||
}
|
||||
key := candidateID + "|" + endpoint
|
||||
if _, ok := seen[key]; ok {
|
||||
return
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
out = append(out, peerConnectionProbeTarget{
|
||||
CandidateID: strings.TrimSpace(candidateID),
|
||||
Endpoint: endpoint,
|
||||
Transport: strings.TrimSpace(transport),
|
||||
})
|
||||
}
|
||||
for _, candidate := range cacheEntry.EndpointCandidates {
|
||||
if !candidateUsableForDirectProbe(candidate) {
|
||||
continue
|
||||
}
|
||||
add(candidate.EndpointID, candidate.Address, candidate.Transport)
|
||||
}
|
||||
add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
|
||||
return out
|
||||
}
|
||||
|
||||
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
|
||||
endpoint := strings.TrimSpace(candidate.Address)
|
||||
if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
|
||||
return false
|
||||
}
|
||||
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
|
||||
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
|
||||
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
|
||||
if connectivity == "outbound_only" || connectivity == "relay_required" || reachability == "outbound_only" || reachability == "relay" {
|
||||
return false
|
||||
}
|
||||
return transport == "" || strings.Contains(transport, "direct") || transport == "wss" || strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://")
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
|
||||
snapshot := m.tracker.Snapshot()
|
||||
for _, entry := range snapshot.Entries {
|
||||
|
||||
@@ -188,3 +188,71 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
|
||||
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-dead",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: "http://127.0.0.1:1",
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "private_lan",
|
||||
Priority: 1,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-live",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: server.URL,
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "private_lan",
|
||||
Priority: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
|
||||
ProbeTimeout: 100 * time.Millisecond,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
},
|
||||
})
|
||||
|
||||
cycle := manager.ProbeOnce(context.Background())
|
||||
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Failed != 0 || len(cycle.Results) != 1 {
|
||||
t.Fatalf("unexpected cycle: %+v", cycle)
|
||||
}
|
||||
result := cycle.Results[0]
|
||||
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
|
||||
t.Fatalf("fallback did not select live candidate: %+v", result)
|
||||
}
|
||||
if len(result.CandidateResults) != 2 ||
|
||||
result.CandidateResults[0].LinkStatus != PeerConnectionProbeUnreachable ||
|
||||
result.CandidateResults[1].LinkStatus != PeerConnectionProbeReachable {
|
||||
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
|
||||
}
|
||||
snapshot := tracker.Snapshot()
|
||||
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
|
||||
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,6 +138,32 @@ func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordSuccessForPeer(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
now = normalizedNow(now)
|
||||
entry := t.entry(peer, now)
|
||||
entry.ConsecutiveSuccesses++
|
||||
entry.ConsecutiveFailures = 0
|
||||
entry.LastLatencyMs = latencyMs
|
||||
entry.LastFailureReason = ""
|
||||
entry.LastProbeAt = now
|
||||
entry.BackoffUntil = time.Time{}
|
||||
nextState := PeerConnectionReady
|
||||
if latencyMs >= 500 {
|
||||
nextState = PeerConnectionDegraded
|
||||
}
|
||||
if entry.State != nextState {
|
||||
entry.State = nextState
|
||||
entry.LastTransitionAt = now
|
||||
}
|
||||
t.entries[peer.NodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
|
||||
@@ -34,12 +34,20 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
|
||||
return err
|
||||
}
|
||||
}
|
||||
if envelope.ChannelClass != ProductionChannelFabricControl {
|
||||
maxPayloadBytes := MaxProductionEnvelopePayloadBytes
|
||||
switch envelope.ChannelClass {
|
||||
case ProductionChannelFabricControl:
|
||||
if envelope.MessageType != ProductionMessageFabricControl {
|
||||
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
case ProductionChannelVPNPacket:
|
||||
if envelope.MessageType != ProductionMessageVPNPacketBatch {
|
||||
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
maxPayloadBytes = MaxProductionVPNPacketPayloadBytes
|
||||
default:
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
if envelope.MessageType != ProductionMessageFabricControl {
|
||||
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.TTL <= 0 {
|
||||
return ErrTTLExhausted
|
||||
}
|
||||
@@ -58,8 +66,8 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
|
||||
if envelope.PayloadLength != len(envelope.Payload) {
|
||||
return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes {
|
||||
return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
|
||||
if envelope.PayloadLength > maxPayloadBytes {
|
||||
return fmt.Errorf("%w: payload exceeds channel limit", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.PayloadHash == "" {
|
||||
return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
|
||||
|
||||
@@ -22,7 +22,7 @@ func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope Producti
|
||||
if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) {
|
||||
return ErrRouteExpired
|
||||
}
|
||||
if !contains(route.AllowedChannels, ProductionChannelFabricControl) {
|
||||
if !contains(route.AllowedChannels, envelope.ChannelClass) {
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
path := routePath(route)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,8 @@ package supervisor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
)
|
||||
@@ -17,24 +19,146 @@ type StubSupervisor struct {
|
||||
func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) {
|
||||
statuses := make([]client.WorkloadStatusRequest, 0, len(desired))
|
||||
for _, workload := range desired {
|
||||
state := "degraded"
|
||||
if workload.DesiredState == "disabled" {
|
||||
state = "stopped"
|
||||
}
|
||||
version := workload.Version
|
||||
if version == "" {
|
||||
version = s.Version
|
||||
}
|
||||
statuses = append(statuses, client.WorkloadStatusRequest{
|
||||
ReportedState: state,
|
||||
RuntimeMode: workload.RuntimeMode,
|
||||
Version: version,
|
||||
StatusPayload: map[string]any{
|
||||
"supervisor": "stub",
|
||||
"desired_state": workload.DesiredState,
|
||||
"service_type": workload.ServiceType,
|
||||
},
|
||||
})
|
||||
statuses = append(statuses, s.applyOne(workload))
|
||||
}
|
||||
return statuses, nil
|
||||
}
|
||||
|
||||
func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.WorkloadStatusRequest {
|
||||
serviceType := strings.TrimSpace(workload.ServiceType)
|
||||
desiredState := strings.TrimSpace(strings.ToLower(workload.DesiredState))
|
||||
if desiredState == "" {
|
||||
desiredState = "disabled"
|
||||
}
|
||||
runtimeMode := strings.TrimSpace(strings.ToLower(workload.RuntimeMode))
|
||||
if runtimeMode == "" {
|
||||
runtimeMode = "native"
|
||||
}
|
||||
version := strings.TrimSpace(workload.Version)
|
||||
if version == "" {
|
||||
version = s.Version
|
||||
}
|
||||
payload := map[string]any{
|
||||
"schema_version": "rap.node_agent.workload_supervision.v1",
|
||||
"supervisor": "node-agent-local",
|
||||
"desired_state": desiredState,
|
||||
"service_type": serviceType,
|
||||
"runtime_mode": runtimeMode,
|
||||
"observed_at": time.Now().UTC().Format(time.RFC3339Nano),
|
||||
}
|
||||
if desiredState != "enabled" {
|
||||
payload["reason"] = "desired_state_not_enabled"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "stopped",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
if serviceType == "core-mesh" || serviceType == "mesh-listener" {
|
||||
payload["reason"] = "builtin_node_agent_service_ready"
|
||||
payload["execution_mode"] = "builtin"
|
||||
payload["traffic"] = serviceTrafficMode(serviceType)
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "running",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
if serviceType == "synthetic.echo" && runtimeMode == "native" {
|
||||
payload["reason"] = "internal_synthetic_echo_ready"
|
||||
payload["execution_mode"] = "builtin"
|
||||
payload["traffic"] = "test_service_only"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "running",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
if serviceType == "rdp-worker" && runtimeMode == "native" && boolConfig(workload.Config, "adapter_contract_probe") {
|
||||
payload["reason"] = "remote_workspace_adapter_contract_probe_ready"
|
||||
payload["execution_mode"] = "contract_probe"
|
||||
payload["service_class"] = "remote_workspace"
|
||||
payload["fabric_service_channel_required"] = true
|
||||
payload["backend_relay_steady_state"] = false
|
||||
payload["channels"] = remoteWorkspaceAdapterChannels()
|
||||
payload["frame_batch_contract"] = remoteWorkspaceFrameBatchContract()
|
||||
payload["traffic"] = "none"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "running",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
payload["reason"] = "service_runtime_not_implemented"
|
||||
payload["traffic"] = "blocked"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "degraded",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
|
||||
// boolConfig reads key from values as a boolean flag.
//
// A bool value is returned as-is; a string value is true only when it equals
// "true" ignoring case and surrounding whitespace. A nil map, a missing key,
// or any other value type yields false.
func boolConfig(values map[string]any, key string) bool {
	// Indexing a nil map is safe and a missing key produces a nil interface;
	// both fail the type assertions below and fall through to false.
	raw := values[key]
	if flag, isBool := raw.(bool); isBool {
		return flag
	}
	if text, isString := raw.(string); isString {
		return strings.EqualFold(strings.TrimSpace(text), "true")
	}
	return false
}
|
||||
|
||||
// remoteWorkspaceAdapterChannels returns the nine channel descriptors reported
// by the remote-workspace adapter contract probe, in a fixed order starting
// with "input". Every call builds fresh maps, so callers may mutate the
// result. No channel is allowed to block input.
//
// NOTE(review): "input" is declared reliable_ordered yet droppable; the
// accompanying test pins that combination, so it is kept as-is here.
func remoteWorkspaceAdapterChannels() []map[string]any {
	channel := func(name, direction, reliability, priority string, droppable bool) map[string]any {
		return map[string]any{
			"name":            name,
			"direction":       direction,
			"reliability":     reliability,
			"priority":        priority,
			"droppable":       droppable,
			"may_block_input": false,
		}
	}
	return []map[string]any{
		channel("input", "client_to_adapter", "reliable_ordered", "critical", true),
		channel("control", "bidirectional", "reliable_ordered", "high", false),
		channel("display", "adapter_to_client", "droppable_latest", "high", true),
		channel("cursor", "adapter_to_client", "droppable_latest", "high", true),
		channel("clipboard", "bidirectional", "reliable_ordered", "medium", false),
		channel("file_transfer", "bidirectional", "reliable_chunked", "medium", false),
		channel("audio", "adapter_to_client", "adaptive_droppable", "medium", true),
		channel("device", "bidirectional", "reliable_ordered", "medium", false),
		channel("telemetry", "adapter_to_client", "sampled_droppable", "low", true),
	}
}
|
||||
|
||||
func remoteWorkspaceFrameBatchContract() map[string]any {
|
||||
return map[string]any{
|
||||
"schema_version": "rap.remote_workspace_frame_batch.v1",
|
||||
"adapter_contract_id": "rap.rdp_worker.remote_workspace_adapter_contract_probe.v1",
|
||||
"probe_only": true,
|
||||
"payload_forwarding": "not_implemented",
|
||||
"service_class": "remote_workspace",
|
||||
"allowed_flow_classes": []string{"control", "interactive", "reliable", "bulk", "droppable"},
|
||||
"allowed_payload_encodings": []string{
|
||||
"none",
|
||||
"base64",
|
||||
},
|
||||
"max_probe_frames": 32,
|
||||
"channels": remoteWorkspaceAdapterChannels(),
|
||||
}
|
||||
}
|
||||
|
||||
// serviceTrafficMode maps a builtin service type to the traffic label placed
// in its status payload: "fabric_control" for core-mesh, "entry_listener" for
// mesh-listener, and "unknown" for anything else. The match is exact.
func serviceTrafficMode(serviceType string) string {
	if serviceType == "core-mesh" {
		return "fabric_control"
	}
	if serviceType == "mesh-listener" {
		return "entry_listener"
	}
	return "unknown"
}
|
||||
|
||||
@@ -33,3 +33,101 @@ func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorRunsInternalSyntheticEchoWorkload(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "synthetic.echo", DesiredState: "enabled", RuntimeMode: "native"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if statuses[0].ReportedState != "running" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
if statuses[0].StatusPayload["reason"] != "internal_synthetic_echo_ready" {
|
||||
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
|
||||
}
|
||||
if statuses[0].StatusPayload["execution_mode"] != "builtin" {
|
||||
t.Fatalf("execution_mode = %v", statuses[0].StatusPayload["execution_mode"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorReportsBuiltinFabricServicesRunning(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "core-mesh", DesiredState: "enabled", RuntimeMode: "container"},
|
||||
{ServiceType: "mesh-listener", DesiredState: "enabled", RuntimeMode: "container"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if len(statuses) != 2 {
|
||||
t.Fatalf("statuses length = %d", len(statuses))
|
||||
}
|
||||
for _, status := range statuses {
|
||||
if status.ReportedState != "running" {
|
||||
t.Fatalf("ReportedState = %q", status.ReportedState)
|
||||
}
|
||||
if status.StatusPayload["reason"] != "builtin_node_agent_service_ready" {
|
||||
t.Fatalf("reason = %v", status.StatusPayload["reason"])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorKeepsUnsupportedEnabledWorkloadDegraded(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if statuses[0].ReportedState != "degraded" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
if statuses[0].StatusPayload["reason"] != "service_runtime_not_implemented" {
|
||||
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorRunsRDPWorkerAdapterContractProbeOnly(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{
|
||||
ServiceType: "rdp-worker",
|
||||
DesiredState: "enabled",
|
||||
RuntimeMode: "native",
|
||||
Config: map[string]any{
|
||||
"adapter_contract_probe": true,
|
||||
},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if statuses[0].ReportedState != "running" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
if statuses[0].StatusPayload["reason"] != "remote_workspace_adapter_contract_probe_ready" {
|
||||
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
|
||||
}
|
||||
if statuses[0].StatusPayload["service_class"] != "remote_workspace" {
|
||||
t.Fatalf("service_class = %v", statuses[0].StatusPayload["service_class"])
|
||||
}
|
||||
if statuses[0].StatusPayload["backend_relay_steady_state"] != false {
|
||||
t.Fatalf("backend_relay_steady_state = %v", statuses[0].StatusPayload["backend_relay_steady_state"])
|
||||
}
|
||||
channels, ok := statuses[0].StatusPayload["channels"].([]map[string]any)
|
||||
if !ok || len(channels) != 9 {
|
||||
t.Fatalf("channels = %#v", statuses[0].StatusPayload["channels"])
|
||||
}
|
||||
if channels[0]["name"] != "input" || channels[0]["priority"] != "critical" || channels[0]["droppable"] != true || channels[0]["may_block_input"] != false {
|
||||
t.Fatalf("unexpected input channel: %#v", channels[0])
|
||||
}
|
||||
frameBatch, ok := statuses[0].StatusPayload["frame_batch_contract"].(map[string]any)
|
||||
if !ok {
|
||||
t.Fatalf("frame_batch_contract = %#v", statuses[0].StatusPayload["frame_batch_contract"])
|
||||
}
|
||||
if frameBatch["schema_version"] != "rap.remote_workspace_frame_batch.v1" ||
|
||||
frameBatch["payload_forwarding"] != "not_implemented" ||
|
||||
frameBatch["service_class"] != "remote_workspace" {
|
||||
t.Fatalf("unexpected frame batch contract: %#v", frameBatch)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -385,32 +385,37 @@ func (s *FabricFlowScheduler) ConfigureAdaptivePolicy(policy FabricServiceChanne
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) ScheduleClientPackets(packets [][]byte) []FabricScheduledPacketBatch {
|
||||
return s.scheduleClientPackets("", "", packets)
|
||||
scheduled, _ := s.scheduleClientPackets("", "", packets)
|
||||
return scheduled
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnection(vpnConnectionID string, packets [][]byte) []FabricScheduledPacketBatch {
|
||||
return s.scheduleClientPackets(vpnConnectionID, "", packets)
|
||||
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, "", packets)
|
||||
return scheduled
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnectionClass(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
|
||||
return s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
|
||||
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
|
||||
return scheduled
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
|
||||
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) ([]FabricScheduledPacketBatch, uint64) {
|
||||
packets = cleanPacketBatch(packets)
|
||||
if len(packets) == 0 {
|
||||
return nil
|
||||
return nil, 0
|
||||
}
|
||||
if s == nil {
|
||||
s = NewFabricFlowScheduler(0, 0)
|
||||
}
|
||||
trafficClass = normalizeFabricTrafficClass(trafficClass)
|
||||
grouped := map[string]*FabricScheduledPacketBatch{}
|
||||
var droppedCount uint64
|
||||
for _, packet := range packets {
|
||||
flowID, shard := classifyPacketFlow(packet, s.shardCountValue())
|
||||
channelID := fabricFlowChannelIDForClass(vpnConnectionID, trafficClass, shard)
|
||||
queueDepth, dropped := s.enqueue(channelID, trafficClass)
|
||||
if dropped {
|
||||
droppedCount++
|
||||
continue
|
||||
}
|
||||
batch := grouped[channelID]
|
||||
@@ -433,7 +438,7 @@ func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, traf
|
||||
out = append(out, *batch)
|
||||
}
|
||||
s.sortScheduledBatches(out)
|
||||
return out
|
||||
return out, droppedCount
|
||||
}
|
||||
|
||||
func fabricFlowChannelID(vpnConnectionID string, shard int) string {
|
||||
@@ -1441,11 +1446,9 @@ func (i *FabricClientPacketIngress) SendClientPacketBatchWithTrafficClass(ctx co
|
||||
}
|
||||
i.recordSendBatch(len(packets))
|
||||
scheduler := i.flowScheduler()
|
||||
droppedBefore := scheduler.Dropped()
|
||||
scheduled := scheduler.ScheduleClientPacketsForConnectionClass(vpnConnectionID, trafficClass, packets)
|
||||
droppedAfter := scheduler.Dropped()
|
||||
if droppedAfter > droppedBefore {
|
||||
i.recordFlowDropped(droppedAfter - droppedBefore)
|
||||
scheduled, droppedCount := scheduler.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
|
||||
if droppedCount > 0 {
|
||||
i.recordFlowDropped(droppedCount)
|
||||
}
|
||||
if len(scheduled) == 0 {
|
||||
i.recordError(mesh.ErrSyntheticRelayQueueFull)
|
||||
@@ -1657,8 +1660,10 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
|
||||
if i == nil || routesFunc == nil {
|
||||
return nil
|
||||
}
|
||||
localClusterID := i.clusterID()
|
||||
localNodeID := i.localNodeID()
|
||||
if clusterID == "" {
|
||||
clusterID = i.ClusterID
|
||||
clusterID = localClusterID
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
var preferred []fabricClientRouteCandidate
|
||||
@@ -1676,7 +1681,7 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
|
||||
}
|
||||
}
|
||||
for _, route := range routesFunc() {
|
||||
if route.ClusterID != clusterID || route.SourceNodeID != i.LocalNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
|
||||
if route.ClusterID != clusterID || route.SourceNodeID != localNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
|
||||
continue
|
||||
}
|
||||
if manager.isWithdrawn(route.RouteID) {
|
||||
@@ -1685,8 +1690,8 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
|
||||
if !route.ExpiresAt.IsZero() && !route.ExpiresAt.After(now) {
|
||||
continue
|
||||
}
|
||||
nextHop := nextHopAfter(route.Hops, i.LocalNodeID, route.DestinationNodeID)
|
||||
if nextHop == "" || nextHop == i.LocalNodeID {
|
||||
nextHop := nextHopAfter(route.Hops, localNodeID, route.DestinationNodeID)
|
||||
if nextHop == "" || nextHop == localNodeID {
|
||||
continue
|
||||
}
|
||||
candidate := fabricClientRouteCandidate{Route: route, NextHop: nextHop}
|
||||
@@ -2024,7 +2029,7 @@ func (i *FabricClientPacketIngress) routeProvenance(clusterID string) map[string
|
||||
if i == nil || routesFunc == nil {
|
||||
return out
|
||||
}
|
||||
localNodeID := strings.TrimSpace(i.LocalNodeID)
|
||||
localNodeID := i.localNodeID()
|
||||
for _, route := range routesFunc() {
|
||||
if strings.TrimSpace(route.RouteID) == "" {
|
||||
continue
|
||||
@@ -2322,6 +2327,24 @@ func (i *FabricClientPacketIngress) routesFunc() func() []mesh.SyntheticRoute {
|
||||
return i.Routes
|
||||
}
|
||||
|
||||
func (i *FabricClientPacketIngress) clusterID() string {
|
||||
if i == nil {
|
||||
return ""
|
||||
}
|
||||
i.mu.Lock()
|
||||
defer i.mu.Unlock()
|
||||
return strings.TrimSpace(i.ClusterID)
|
||||
}
|
||||
|
||||
func (i *FabricClientPacketIngress) localNodeID() string {
|
||||
if i == nil {
|
||||
return ""
|
||||
}
|
||||
i.mu.Lock()
|
||||
defer i.mu.Unlock()
|
||||
return strings.TrimSpace(i.LocalNodeID)
|
||||
}
|
||||
|
||||
func (i *FabricClientPacketIngress) flowScheduler() *FabricFlowScheduler {
|
||||
if i == nil {
|
||||
return NewFabricFlowScheduler(0, 0)
|
||||
|
||||
@@ -324,10 +324,13 @@ func TestFabricFlowSchedulerDropsWhenChannelQueueIsFull(t *testing.T) {
|
||||
packetA := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
|
||||
packetB := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
|
||||
|
||||
batches := scheduler.ScheduleClientPackets([][]byte{packetA, packetB})
|
||||
batches, dropped := scheduler.scheduleClientPackets("", "", [][]byte{packetA, packetB})
|
||||
if len(batches) != 1 || len(batches[0].Packets) != 1 {
|
||||
t.Fatalf("batches = %#v, want one accepted packet", batches)
|
||||
}
|
||||
if dropped != 1 {
|
||||
t.Fatalf("dropped = %d, want per-call drop count 1", dropped)
|
||||
}
|
||||
snapshot := scheduler.Snapshot()
|
||||
if snapshot.Dropped != 1 || !snapshot.BackpressureActive {
|
||||
t.Fatalf("snapshot = %+v, want one dropped packet and active backpressure", snapshot)
|
||||
@@ -1069,6 +1072,60 @@ func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricClientPacketIngressRouteSelectionUsesUpdatedRuntimeIdentity(t *testing.T) {
|
||||
transport := &captureManyProductionTransport{}
|
||||
ingress := &FabricClientPacketIngress{
|
||||
ForwardTransport: transport,
|
||||
Inbox: NewFabricPacketInbox(8),
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "entry-1",
|
||||
Routes: func() []mesh.SyntheticRoute {
|
||||
return []mesh.SyntheticRoute{{
|
||||
RouteID: "route-entry-1",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "entry-1",
|
||||
DestinationNodeID: "exit-1",
|
||||
Hops: []string{"entry-1", "relay-1", "exit-1"},
|
||||
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Minute),
|
||||
MaxTTL: 8,
|
||||
}}
|
||||
},
|
||||
}
|
||||
ingress.UpdateRuntime(
|
||||
transport,
|
||||
NewFabricPacketInbox(8),
|
||||
"cluster-1",
|
||||
"entry-2",
|
||||
nil,
|
||||
func() []mesh.SyntheticRoute {
|
||||
return []mesh.SyntheticRoute{{
|
||||
RouteID: "route-entry-2",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "entry-2",
|
||||
DestinationNodeID: "exit-2",
|
||||
Hops: []string{"entry-2", "relay-2", "exit-2"},
|
||||
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Minute),
|
||||
MaxTTL: 8,
|
||||
}}
|
||||
},
|
||||
"policy-updated",
|
||||
)
|
||||
|
||||
packet := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 443)
|
||||
if err := ingress.SendClientPacketBatch(context.Background(), "", "vpn-1", [][]byte{packet}); err != nil {
|
||||
t.Fatalf("send after runtime update: %v", err)
|
||||
}
|
||||
if len(transport.envelopes) != 1 {
|
||||
t.Fatalf("envelopes = %d, want one send", len(transport.envelopes))
|
||||
}
|
||||
envelope := transport.envelopes[0]
|
||||
if envelope.RouteID != "route-entry-2" || envelope.SourceNodeID != "entry-2" || transport.calls[0] != "relay-2" {
|
||||
t.Fatalf("envelope route/source/next-hop = %s/%s/%s, want updated entry-2 route", envelope.RouteID, envelope.SourceNodeID, transport.calls[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricClientPacketIngressParallelFlowWindowDoesNotBlockIndependentChannel(t *testing.T) {
|
||||
scheduler := NewFabricFlowScheduler(8, 16)
|
||||
slowPacket, fastPacket := packetsForOrderedDistinctChannels(scheduler.shardCountValue())
|
||||
|
||||
@@ -0,0 +1,170 @@
|
||||
//go:build windows && rap_vpn_windows_tun
|
||||
|
||||
package vpnruntime
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
_ "embed"
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
wgtun "golang.zx2c4.com/wireguard/tun"
|
||||
)
|
||||
|
||||
const windowsGatewayMTU = 1420
|
||||
|
||||
//go:embed assets/windows/amd64/wintun.dll
|
||||
var embeddedWintunDLL []byte
|
||||
|
||||
type tunDevice struct {
|
||||
dev wgtun.Device
|
||||
name string
|
||||
}
|
||||
|
||||
func openGatewayTun(name, addressCIDR, routeCIDR string) (*tunDevice, error) {
|
||||
if _, _, err := net.ParseCIDR(addressCIDR); err != nil {
|
||||
return nil, fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, err)
|
||||
}
|
||||
if _, _, err := net.ParseCIDR(routeCIDR); err != nil {
|
||||
return nil, fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, err)
|
||||
}
|
||||
if err := ensureWintunDLL(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dev, err := wgtun.CreateTUN(name, windowsGatewayMTU)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create wintun interface %s: %w", name, err)
|
||||
}
|
||||
if err := configureGatewayInterface(name, addressCIDR, routeCIDR); err != nil {
|
||||
_ = dev.Close()
|
||||
return nil, err
|
||||
}
|
||||
return &tunDevice{dev: dev, name: name}, nil
|
||||
}
|
||||
|
||||
func (d *tunDevice) Read(packet []byte) (int, error) {
|
||||
bufs := [][]byte{packet}
|
||||
sizes := []int{0}
|
||||
n, err := d.dev.Read(bufs, sizes, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if n <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
return sizes[0], nil
|
||||
}
|
||||
|
||||
func (d *tunDevice) Write(packet []byte) (int, error) {
|
||||
n, err := d.dev.Write([][]byte{packet}, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if n <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
return len(packet), nil
|
||||
}
|
||||
|
||||
func (d *tunDevice) Close() error {
|
||||
_ = removeWindowsGatewayNat()
|
||||
return d.dev.Close()
|
||||
}
|
||||
|
||||
func configureGatewayInterface(name, addressCIDR, routeCIDR string) error {
|
||||
ip, network, err := net.ParseCIDR(addressCIDR)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, err)
|
||||
}
|
||||
ones, bits := network.Mask.Size()
|
||||
if bits != 32 || ones <= 0 {
|
||||
return fmt.Errorf("invalid vpn gateway prefix %q", addressCIDR)
|
||||
}
|
||||
_, route, err := net.ParseCIDR(routeCIDR)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, err)
|
||||
}
|
||||
|
||||
script := fmt.Sprintf(`
|
||||
$ErrorActionPreference = 'Stop'
|
||||
$alias = %s
|
||||
$address = %s
|
||||
$prefixLength = %d
|
||||
$natPrefix = %s
|
||||
$natName = 'RAPVPN'
|
||||
$adapter = Get-NetAdapter -Name $alias -ErrorAction Stop
|
||||
$adapter | Enable-NetAdapter -Confirm:$false -ErrorAction SilentlyContinue | Out-Null
|
||||
$existing = Get-NetIPAddress -InterfaceAlias $alias -AddressFamily IPv4 -ErrorAction SilentlyContinue
|
||||
foreach ($addr in $existing) {
|
||||
if ($addr.IPAddress -ne $address -or $addr.PrefixLength -ne $prefixLength) {
|
||||
Remove-NetIPAddress -InterfaceAlias $alias -IPAddress $addr.IPAddress -Confirm:$false -ErrorAction SilentlyContinue
|
||||
}
|
||||
}
|
||||
if (-not (Get-NetIPAddress -InterfaceAlias $alias -IPAddress $address -AddressFamily IPv4 -ErrorAction SilentlyContinue)) {
|
||||
New-NetIPAddress -InterfaceAlias $alias -IPAddress $address -PrefixLength $prefixLength -Type Unicast | Out-Null
|
||||
}
|
||||
Set-NetIPInterface -InterfaceAlias $alias -AddressFamily IPv4 -Forwarding Enabled
|
||||
Get-NetIPInterface -AddressFamily IPv4 | Where-Object { $_.ConnectionState -eq 'Connected' -and $_.InterfaceAlias -ne 'Loopback Pseudo-Interface 1' } | Set-NetIPInterface -Forwarding Enabled
|
||||
$existingNat = Get-NetNat -Name $natName -ErrorAction SilentlyContinue
|
||||
if ($existingNat -and $existingNat.InternalIPInterfaceAddressPrefix -ne $natPrefix) {
|
||||
$existingNat | Remove-NetNat -Confirm:$false
|
||||
$existingNat = $null
|
||||
}
|
||||
if (-not $existingNat) {
|
||||
New-NetNat -Name $natName -InternalIPInterfaceAddressPrefix $natPrefix | Out-Null
|
||||
}
|
||||
`, psQuote(name), psQuote(ip.String()), ones, psQuote(route.String()))
|
||||
|
||||
if err := runPowerShell(script); err != nil {
|
||||
return fmt.Errorf("configure windows vpn gateway interface %s: %w", name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func removeWindowsGatewayNat() error {
|
||||
return runPowerShell(`Get-NetNat -Name 'RAPVPN' -ErrorAction SilentlyContinue | Remove-NetNat -Confirm:$false -ErrorAction SilentlyContinue`)
|
||||
}
|
||||
|
||||
// runPowerShell executes script with powershell.exe (no profile, execution
// policy bypassed) and waits for it to finish. On failure the returned error
// wraps the process error and appends the trimmed combined stdout/stderr.
func runPowerShell(script string) error {
	args := []string{"-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script}
	output, err := exec.Command("powershell.exe", args...).CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("powershell failed: %w: %s", err, strings.TrimSpace(string(output)))
}
|
||||
|
||||
// psQuote renders value as a PowerShell single-quoted string literal by
// doubling every embedded single quote and wrapping the result in quotes.
func psQuote(value string) string {
	escaped := strings.ReplaceAll(value, "'", "''")
	var quoted strings.Builder
	quoted.Grow(len(escaped) + 2)
	quoted.WriteByte('\'')
	quoted.WriteString(escaped)
	quoted.WriteByte('\'')
	return quoted.String()
}
|
||||
|
||||
func ensureWintunDLL() error {
|
||||
exePath, err := os.Executable()
|
||||
if err != nil {
|
||||
return fmt.Errorf("locate node-agent executable for wintun.dll: %w", err)
|
||||
}
|
||||
target := filepath.Join(filepath.Dir(exePath), "wintun.dll")
|
||||
if payload, err := os.ReadFile(target); err == nil && sameSHA256(payload, embeddedWintunDLL) {
|
||||
return nil
|
||||
}
|
||||
tmp := target + ".tmp"
|
||||
if err := os.WriteFile(tmp, embeddedWintunDLL, 0o644); err != nil {
|
||||
return fmt.Errorf("write embedded wintun.dll: %w", err)
|
||||
}
|
||||
_ = os.Remove(target)
|
||||
if err := os.Rename(tmp, target); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return fmt.Errorf("install embedded wintun.dll: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// sameSHA256 reports whether a and b have identical SHA-256 digests.
func sameSHA256(a, b []byte) bool {
	return sha256.Sum256(a) == sha256.Sum256(b)
}
|
||||
Reference in New Issue
Block a user