Record project continuation changes

This commit is contained in:
2026-05-12 21:02:29 +03:00
parent 3059d1d7a3
commit 8f69d53193
339 changed files with 101111 additions and 1769 deletions
+37 -14
View File
@@ -7,7 +7,7 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
const Version = "0.1.0-c3"
const Version = "0.2.256-c18z82"
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
return client.EnrollRequest{
@@ -17,18 +17,26 @@ func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) cli
NodeFingerprint: identity.NodeFingerprint,
PublicKey: identity.PublicKey,
ReportedCapabilities: map[string]any{
"can_accept_client_ingress": false,
"can_accept_node_ingress": false,
"can_route_mesh": false,
"can_run_rdp_worker": true,
"can_run_vnc_worker": false,
"can_run_vpn_exit": false,
"can_run_vpn_connector": false,
"can_run_file_cache": false,
"can_run_update_cache": false,
"can_run_video_relay": false,
"native_node_agent_version": Version,
"service_supervision_enabled": false,
"can_accept_client_ingress": false,
"can_accept_node_ingress": false,
"can_route_mesh": false,
"can_run_rdp_worker": true,
"can_run_vnc_worker": false,
"can_run_vpn_exit": true,
"can_run_vpn_connector": true,
"can_run_file_cache": false,
"can_run_update_cache": false,
"can_run_video_relay": false,
"native_node_agent_version": Version,
"node_update_plan_contract": "rap.node_update_plan.v1",
"node_update_status_report": true,
"host_agent_update_required": true,
"service_supervision_enabled": false,
"vpn_assignment_status": true,
"vpn_packet_forwarding": true,
"vpn_fabric_packet_transport": true,
"vpn_local_gateway_shortcut": true,
"external_backend_entry_proxy": true,
},
ReportedFacts: map[string]any{
"os": runtime.GOOS,
@@ -45,13 +53,28 @@ func HeartbeatPayload() client.HeartbeatRequest {
HealthStatus: "healthy",
ReportedVersion: Version,
Capabilities: map[string]any{
"native_node_agent": true,
"native_node_agent": true,
"node_update_plan_contract": "rap.node_update_plan.v1",
"node_update_status_report": true,
"vpn_assignment_status": true,
"vpn_packet_forwarding": true,
"vpn_fabric_packet_transport": true,
"vpn_local_gateway_shortcut": true,
"external_backend_entry_proxy": true,
},
ServiceStates: map[string]any{
"workload_supervision": "not_implemented_c3",
},
Metadata: map[string]any{
"stage": "c3",
"update_runtime": map[string]any{
"product": "rap-node-agent",
"current_version": Version,
"host_agent_present": true,
"self_update_enabled": true,
"rollback_executor_ready": true,
"reason": "host-agent updater active",
},
},
}
}
@@ -260,6 +260,7 @@ type SyntheticMeshRouteConfig struct {
}
type SyntheticMeshConfig struct {
Raw json.RawMessage `json:"-"`
Enabled bool `json:"enabled"`
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
@@ -286,6 +287,17 @@ type SyntheticMeshConfig struct {
ProductionForwarding bool `json:"production_forwarding"`
}
// UnmarshalJSON decodes data into c and stores a private copy of the raw
// payload in c.Raw.
//
// Decoding goes through a locally declared type with the same fields but no
// methods, so json.Unmarshal does not call back into this method.
func (c *SyntheticMeshConfig) UnmarshalJSON(data []byte) error {
	type plainConfig SyntheticMeshConfig

	var parsed plainConfig
	err := json.Unmarshal(data, &parsed)
	if err != nil {
		return err
	}

	*c = SyntheticMeshConfig(parsed)
	// Append into a zero-length slice so Raw holds its own copy of data
	// instead of aliasing the caller's buffer.
	c.Raw = append(c.Raw[:0], data...)
	return nil
}
type FabricServiceChannelRemediationCommand struct {
SchemaVersion string `json:"schema_version"`
CommandID string `json:"command_id"`
@@ -28,6 +28,9 @@ type Config struct {
MeshProductionForwardingEnabled bool
MeshProductionObservationSinkCapacity int
MeshListenAddr string
MeshListenPortMode string
MeshListenAutoPortStart int
MeshListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
MeshAdvertiseTransport string
@@ -58,6 +61,9 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls"), "Transport label for the advertised mesh endpoint.")
@@ -70,7 +76,7 @@ func Load(args []string, env map[string]string) (Config, error) {
heartbeatSeconds := getEnvInt(env, "RAP_HEARTBEAT_INTERVAL_SECONDS", 15)
fs.DurationVar(&cfg.HeartbeatInterval, "heartbeat-interval", time.Duration(heartbeatSeconds)*time.Second, "Heartbeat interval.")
enrollmentPollIntervalSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5)
enrollmentPollTimeoutSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600)
enrollmentPollTimeoutSeconds := getEnvSignedInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0)
fs.DurationVar(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", time.Duration(enrollmentPollIntervalSeconds)*time.Second, "Enrollment approval polling interval.")
fs.DurationVar(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", time.Duration(enrollmentPollTimeoutSeconds)*time.Second, "Enrollment approval polling timeout.")
if err := fs.Parse(args); err != nil {
@@ -84,6 +90,7 @@ func Load(args []string, env map[string]string) (Config, error) {
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
@@ -117,6 +124,20 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
}
switch cfg.MeshListenPortMode {
case "", "manual", "auto", "disabled":
if cfg.MeshListenPortMode == "" {
cfg.MeshListenPortMode = "manual"
}
default:
return Config{}, errors.New("mesh listen port mode must be manual, auto, or disabled")
}
if cfg.MeshListenAutoPortStart <= 0 || cfg.MeshListenAutoPortEnd <= 0 {
return Config{}, errors.New("mesh listen auto port range must be positive")
}
if cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return Config{}, errors.New("mesh listen auto port start must be less than or equal to end")
}
return cfg, nil
}
@@ -22,6 +22,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
"RAP_MESH_LISTEN_PORT_MODE": "auto",
"RAP_MESH_LISTEN_AUTO_PORT_START": "19010",
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
@@ -65,6 +68,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if cfg.MeshListenAddr != "127.0.0.1:19001" {
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
}
if cfg.MeshListenPortMode != "auto" || cfg.MeshListenAutoPortStart != 19010 || cfg.MeshListenAutoPortEnd != 19020 {
t.Fatalf("unexpected mesh listen port config: %+v", cfg)
}
if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" ||
cfg.MeshAdvertiseEndpointsJSON == "" ||
cfg.MeshAdvertiseTransport != "wss" ||
@@ -81,6 +87,19 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
}
}
// TestLoadConfigDefaultsEnrollmentPollingToNoTimeout checks that Load leaves
// EnrollmentPollTimeout at zero when RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS is
// absent from the environment map.
func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
	cfg, err := Load(nil, map[string]string{
		"RAP_BACKEND_URL": "http://backend/api/v1",
		"RAP_NODE_NAME": "node-a",
	})
	if err != nil {
		t.Fatalf("load config: %v", err)
	}
	if cfg.EnrollmentPollTimeout != 0 {
		t.Fatalf("EnrollmentPollTimeout = %s, want no timeout", cfg.EnrollmentPollTimeout)
	}
}
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
@@ -0,0 +1,135 @@
package hostagent
import (
"errors"
"fmt"
"strings"
)
// Fallback values that RuntimeConfig.Normalize substitutes when the
// corresponding field is blank.
const (
	// DefaultContainerName is used when RuntimeConfig.ContainerName is blank.
	DefaultContainerName = "rap-node-agent"
	// DefaultImage is used when RuntimeConfig.Image is blank.
	DefaultImage = "rap-node-agent:latest"
	// DefaultStateDir is used when RuntimeConfig.StateDir is blank.
	DefaultStateDir = "/var/lib/rap-node-agent"
	// DefaultNetwork is used when RuntimeConfig.Network is blank.
	DefaultNetwork = "host"
)
// RuntimeConfig collects the settings used to build the docker run command
// line and the RAP_* environment passed to the node agent container.
type RuntimeConfig struct {
	BackendURL string
	ClusterID string
	// JoinToken is required by ValidateInstall unless Replace is set.
	JoinToken string
	NodeName string
	Image string
	ContainerName string
	// StateDir is mounted at /var/lib/rap-node-agent inside the container.
	StateDir string
	Network string
	RestartPolicy string
	// PullImage makes Install run "docker pull" for Image.
	PullImage bool
	// Replace makes Install force-remove the named container before running.
	Replace bool
	// DockerVPNGatewayEnabled adds the privileged/NET_ADMIN//dev/net/tun
	// docker run flags.
	DockerVPNGatewayEnabled bool
	WorkloadSupervisionEnabled bool
	MeshSyntheticRuntimeEnabled bool
	MeshProductionForwardingEnabled bool
	MeshListenAddr string
	// MeshListenPortMode must be empty, "manual", "auto", or "disabled".
	MeshListenPortMode string
	MeshListenAutoPortStart int
	MeshListenAutoPortEnd int
	MeshAdvertiseEndpoint string
	MeshAdvertiseEndpointsJSON string
	MeshAdvertiseTransport string
	MeshConnectivityMode string
	MeshNATType string
	MeshRegion string
	// HeartbeatIntervalSeconds defaults to 15 in Normalize when zero.
	HeartbeatIntervalSeconds int
	// EnrollmentPollIntervalSeconds defaults to 5 in Normalize when zero.
	EnrollmentPollIntervalSeconds int
	EnrollmentPollTimeoutSeconds int
	// ExtraEnv entries must be KEY=VALUE; they are appended after the
	// generated RAP_* variables.
	ExtraEnv []string
	// AdditionalDockerRunArgs are appended to the docker run flags verbatim.
	AdditionalDockerRunArgs []string
	// ProductionObservationSinkCap is exported as
	// RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY when greater than zero.
	ProductionObservationSinkCap int
	// ImageArtifactURLs are tried in order by Install when PullImage is false.
	ImageArtifactURLs []string
	// ImageArtifactSHA256 is the expected hex digest of the downloaded
	// artifact; verification is skipped when it is blank.
	ImageArtifactSHA256 string
	ImageArtifactSizeBytes int64
}
// Normalize returns a cleaned copy of cfg: string settings are trimmed,
// URL-like settings lose trailing slashes, the port mode is lower-cased, and
// blank or zero settings that have a default receive it.
func (cfg RuntimeConfig) Normalize() RuntimeConfig {
	trimEndpoint := func(raw string) string {
		return strings.TrimRight(strings.TrimSpace(raw), "/")
	}

	cfg.BackendURL = trimEndpoint(cfg.BackendURL)
	cfg.MeshAdvertiseEndpoint = trimEndpoint(cfg.MeshAdvertiseEndpoint)

	// Settings that only need surrounding whitespace removed.
	for _, field := range []*string{
		&cfg.ClusterID,
		&cfg.JoinToken,
		&cfg.NodeName,
		&cfg.MeshListenAddr,
		&cfg.MeshAdvertiseEndpointsJSON,
		&cfg.MeshAdvertiseTransport,
		&cfg.MeshConnectivityMode,
		&cfg.MeshNATType,
		&cfg.MeshRegion,
		&cfg.ImageArtifactSHA256,
	} {
		*field = strings.TrimSpace(*field)
	}
	cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))

	// Settings with a fallback when left blank.
	cfg.Image = firstNonEmpty(cfg.Image, DefaultImage)
	cfg.ContainerName = firstNonEmpty(cfg.ContainerName, DefaultContainerName)
	cfg.StateDir = firstNonEmpty(cfg.StateDir, DefaultStateDir)
	cfg.Network = firstNonEmpty(cfg.Network, DefaultNetwork)
	cfg.RestartPolicy = firstNonEmpty(cfg.RestartPolicy, "unless-stopped")

	// Only an exact zero is replaced; negative values are left for
	// validation to reject.
	if cfg.HeartbeatIntervalSeconds == 0 {
		cfg.HeartbeatIntervalSeconds = 15
	}
	if cfg.EnrollmentPollIntervalSeconds == 0 {
		cfg.EnrollmentPollIntervalSeconds = 5
	}
	return cfg
}
// ValidateInstall checks the normalized configuration and returns the first
// problem that would prevent an install, or nil when the settings are usable.
//
// Missing backend-url, cluster-id and node-name are reported together in one
// error. A blank join token is accepted only when Replace is set. Each
// ExtraEnv entry must be KEY=VALUE with a non-empty KEY.
func (cfg RuntimeConfig) ValidateInstall() error {
	cfg = cfg.Normalize()

	var missing []string
	if cfg.BackendURL == "" {
		missing = append(missing, "backend-url")
	}
	if cfg.ClusterID == "" {
		missing = append(missing, "cluster-id")
	}
	if cfg.NodeName == "" {
		missing = append(missing, "node-name")
	}
	if len(missing) > 0 {
		return fmt.Errorf("missing required install settings: %s", strings.Join(missing, ", "))
	}

	if cfg.JoinToken == "" && !cfg.Replace {
		return errors.New("join-token is required for first install; pass -replace only when updating an already enrolled local state")
	}
	if cfg.HeartbeatIntervalSeconds <= 0 {
		return errors.New("heartbeat interval must be positive")
	}
	if cfg.EnrollmentPollIntervalSeconds <= 0 {
		return errors.New("enrollment poll interval must be positive")
	}
	if cfg.EnrollmentPollTimeoutSeconds < 0 {
		return errors.New("enrollment poll timeout must not be negative")
	}

	switch cfg.MeshListenPortMode {
	case "", "manual", "auto", "disabled":
	default:
		return errors.New("mesh listen port mode must be manual, auto, or disabled")
	}
	if cfg.MeshListenAutoPortStart < 0 || cfg.MeshListenAutoPortEnd < 0 {
		return errors.New("mesh listen auto port range must not be negative")
	}
	// The ordering check only applies when both ends of the range are set.
	if cfg.MeshListenAutoPortStart > 0 && cfg.MeshListenAutoPortEnd > 0 && cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
		return errors.New("mesh listen auto port start must be less than or equal to end")
	}
	if cfg.ProductionObservationSinkCap < 0 {
		return errors.New("production observation sink capacity must not be negative")
	}

	for _, item := range cfg.ExtraEnv {
		// The first "=" separates key from value. An index of -1 means no
		// "=" at all and 0 means an empty key; neither is KEY=VALUE.
		if strings.Index(item, "=") <= 0 {
			return fmt.Errorf("extra env %q must be KEY=VALUE", item)
		}
	}
	return nil
}
// firstNonEmpty returns value with surrounding whitespace removed, or
// fallback (returned as given, untrimmed) when value is blank.
func firstNonEmpty(value, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return fallback
}
@@ -0,0 +1,335 @@
package hostagent
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
// CommandRunner runs an external command and returns its output as a string.
// DockerManager issues every docker invocation through this interface.
type CommandRunner interface {
	Run(ctx context.Context, name string, args ...string) (string, error)
}
// ExecRunner is the CommandRunner that executes real processes via os/exec.
type ExecRunner struct{}

// Run executes name with args and returns the combined stdout and stderr.
//
// On failure the returned error wraps the underlying error and includes the
// command line and trimmed output. The command line is passed through
// redactedCommandLine first so a join token supplied as
// "-e RAP_JOIN_TOKEN=..." does not leak into error messages or logs.
func (ExecRunner) Run(ctx context.Context, name string, args ...string) (string, error) {
	cmd := exec.CommandContext(ctx, name, args...)
	out, err := cmd.CombinedOutput()
	if err != nil {
		return string(out), fmt.Errorf("%s %s: %w\n%s", name, redactedCommandLine(args), err, strings.TrimSpace(string(out)))
	}
	return string(out), nil
}

// redactedCommandLine joins args with spaces for display, masking the value
// of any RAP_JOIN_TOKEN assignment that directly follows a "-e" flag. The
// input slice is not modified.
func redactedCommandLine(args []string) string {
	const tokenPrefix = "RAP_JOIN_TOKEN="
	shown := make([]string, len(args))
	for i, arg := range args {
		if i > 0 && args[i-1] == "-e" && strings.HasPrefix(arg, tokenPrefix) {
			arg = tokenPrefix + "***"
		}
		shown[i] = arg
	}
	return strings.Join(shown, " ")
}
// DockerManager installs and inspects the node agent container by invoking
// the docker CLI.
type DockerManager struct {
	// Runner executes the commands; Install and Status use ExecRunner{} when
	// it is nil.
	Runner CommandRunner
	// Binary is the docker executable name or path; "docker" is used when it
	// is blank.
	Binary string
}
// statHostPath is the stat function ensureHostTunDevice uses to probe
// /dev/net/tun. NOTE(review): presumably a package variable so tests can
// substitute it — confirm.
var statHostPath = os.Stat
// InstallResult reports what DockerManager.Install did.
type InstallResult struct {
	ContainerName string
	Image string
	// Replaced is set once the "docker rm -f" step has run for a Replace
	// install, including when no such container existed.
	Replaced bool
	// Pulled is set after a successful "docker pull".
	Pulled bool
	// Loaded is set when the image was loaded from a downloaded artifact.
	Loaded bool
	// ContainerID is the trimmed output of "docker run".
	ContainerID string
}
// Install validates cfg, prepares the host, obtains the image and starts the
// node agent container.
//
// Steps run in this order: validate, prepare the state directory, ensure
// /dev/net/tun when the VPN gateway is enabled, pull the image or load it
// from an artifact, force-remove the old container when Replace is set, then
// "docker run". On failure the partially filled InstallResult is returned
// with the error; a validation failure returns the zero InstallResult.
func (m DockerManager) Install(ctx context.Context, cfg RuntimeConfig) (InstallResult, error) {
	if err := cfg.ValidateInstall(); err != nil {
		return InstallResult{}, err
	}
	cfg = cfg.Normalize()

	var runner CommandRunner = ExecRunner{}
	if m.Runner != nil {
		runner = m.Runner
	}
	dockerBin := firstNonEmpty(m.Binary, "docker")
	result := InstallResult{ContainerName: cfg.ContainerName, Image: cfg.Image}

	if err := PrepareStateDir(cfg.StateDir); err != nil {
		return result, err
	}
	if cfg.DockerVPNGatewayEnabled {
		if err := ensureHostTunDevice(ctx, runner); err != nil {
			return result, err
		}
	}

	// An explicit pull takes precedence over artifact loading.
	switch {
	case cfg.PullImage:
		if _, err := runner.Run(ctx, dockerBin, "pull", cfg.Image); err != nil {
			return result, err
		}
		result.Pulled = true
	case len(cfg.ImageArtifactURLs) > 0:
		loaded, err := m.ensureImageFromArtifact(ctx, runner, dockerBin, cfg)
		if err != nil {
			return result, err
		}
		result.Loaded = loaded
	}

	if cfg.Replace {
		// A missing container is not a failure: there is simply nothing to
		// remove.
		_, err := runner.Run(ctx, dockerBin, "rm", "-f", cfg.ContainerName)
		if err != nil && !isNoSuchContainerError(err) {
			return result, err
		}
		result.Replaced = true
	}

	output, err := runner.Run(ctx, dockerBin, DockerRunArgs(cfg)...)
	if err != nil {
		return result, err
	}
	result.ContainerID = strings.TrimSpace(output)
	return result, nil
}
// ensureHostTunDevice makes sure /dev/net/tun exists on the host. When the
// first stat fails it runs "modprobe tun" through runner and stats again,
// returning an error if either the modprobe or the second stat fails.
func ensureHostTunDevice(ctx context.Context, runner CommandRunner) error {
	const tunDevice = "/dev/net/tun"

	_, statErr := statHostPath(tunDevice)
	if statErr == nil {
		return nil
	}

	if _, loadErr := runner.Run(ctx, "modprobe", "tun"); loadErr != nil {
		return fmt.Errorf("docker vpn gateway requires host /dev/net/tun; modprobe tun failed: %w", loadErr)
	}

	_, statErr = statHostPath(tunDevice)
	if statErr != nil {
		return fmt.Errorf("docker vpn gateway requires host /dev/net/tun after modprobe tun: %w", statErr)
	}
	return nil
}
// ensureImageFromArtifact makes cfg.Image available by downloading an image
// artifact and running "docker load".
//
// The image is inspected first; if it is already present and cfg.Replace is
// false nothing is downloaded and false is returned. Otherwise the artifact
// is downloaded, loaded and the image inspected again. The boolean reports
// whether "docker load" succeeded, and is true even when the final inspect
// fails.
func (m DockerManager) ensureImageFromArtifact(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig) (bool, error) {
	_, inspectErr := runner.Run(ctx, docker, "image", "inspect", cfg.Image)
	imagePresent := inspectErr == nil
	if imagePresent && !cfg.Replace {
		return false, nil
	}

	artifactPath, err := downloadFirstArtifact(ctx, cfg.ImageArtifactURLs, cfg.ImageArtifactSHA256, cfg.ImageArtifactSizeBytes)
	if err != nil {
		return false, err
	}
	// The downloaded tarball is a temporary file; drop it once loading is done.
	defer os.Remove(artifactPath)

	if _, err = runner.Run(ctx, docker, "load", "-i", artifactPath); err != nil {
		return false, err
	}
	if _, err = runner.Run(ctx, docker, "image", "inspect", cfg.Image); err != nil {
		return true, fmt.Errorf("loaded artifact but image %q is not available: %w", cfg.Image, err)
	}
	return true, nil
}
// downloadFirstArtifact tries each non-blank URL in order, up to three
// attempts per URL, and returns the path of the first download that passes
// verification. If every attempt fails the most recent error is returned; if
// there was no usable URL at all a "no artifact URLs configured" error is
// returned.
func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
	const attemptsPerURL = 3

	var lastErr error
	for i := range urls {
		candidate := strings.TrimSpace(urls[i])
		if candidate == "" {
			continue
		}
		for remaining := attemptsPerURL; remaining > 0; remaining-- {
			path, err := downloadArtifact(ctx, candidate, expectedSHA256, expectedSizeBytes)
			if err == nil {
				return path, nil
			}
			lastErr = err
		}
	}

	if lastErr == nil {
		return "", fmt.Errorf("no artifact URLs configured")
	}
	return "", lastErr
}
// downloadArtifact streams rawURL into a new temporary file and returns the
// file's path once the payload has passed the integrity checks. The caller
// owns the returned file and is responsible for removing it.
//
// Checks, in order:
//   - the HTTP status must be 2xx;
//   - when the response declares a Content-Length, the byte count must match;
//   - when expectedSizeBytes > 0, the byte count must match it; a mismatch is
//     fatal only if a checksum is also expected, otherwise it is reported on
//     stdout and the download proceeds;
//   - when expectedSHA256 is non-blank, the SHA-256 hex digest must match
//     (case-insensitively).
//
// On any failure the temporary file is removed and an error naming rawURL is
// returned.
func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, err)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return "", fmt.Errorf("download artifact %s: %s", rawURL, resp.Status)
	}

	file, err := os.CreateTemp("", "rap-docker-image-*.tar")
	if err != nil {
		return "", fmt.Errorf("download artifact %s: create temp file: %w", rawURL, err)
	}
	path := file.Name()
	// One cleanup for every failure path below: the file is kept only when
	// all checks have passed. Removal is best-effort.
	verified := false
	defer func() {
		if !verified {
			os.Remove(path)
		}
	}()

	// Hash while writing so the payload is read only once.
	hasher := sha256.New()
	written, copyErr := io.Copy(io.MultiWriter(file, hasher), resp.Body)
	closeErr := file.Close()
	if copyErr != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, copyErr)
	}
	if closeErr != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, closeErr)
	}

	if resp.ContentLength >= 0 && written != resp.ContentLength {
		return "", fmt.Errorf("artifact download truncated for %s: got %d bytes want content-length %d", rawURL, written, resp.ContentLength)
	}

	expected := strings.TrimSpace(expectedSHA256)
	if expectedSizeBytes > 0 && written != expectedSizeBytes {
		if expected != "" {
			return "", fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", rawURL, written, expectedSizeBytes)
		}
		fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding without checksum for backward-compatible installs\n", rawURL, written, expectedSizeBytes)
	}

	actual := hex.EncodeToString(hasher.Sum(nil))
	if expected != "" && !strings.EqualFold(actual, expected) {
		return "", fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", rawURL, actual, expected)
	}

	verified = true
	return path, nil
}
// Status runs "docker ps -a" filtered to the exact container name and returns
// the runner's output: tab-separated name, image and status. A blank
// containerName falls back to DefaultContainerName.
func (m DockerManager) Status(ctx context.Context, containerName string) (string, error) {
	var runner CommandRunner = ExecRunner{}
	if m.Runner != nil {
		runner = m.Runner
	}

	const rowFormat = "{{.Names}}\t{{.Image}}\t{{.Status}}"
	// Anchor the filter on both ends so only the exact name matches.
	nameFilter := "name=^/" + firstNonEmpty(containerName, DefaultContainerName) + "$"

	return runner.Run(ctx, firstNonEmpty(m.Binary, "docker"),
		"ps", "-a",
		"--filter", nameFilter,
		"--format", rowFormat,
	)
}
// PrepareStateDir creates stateDir on the host and sets its mode to 0o777.
//
// Blank values and values that do not look like a host path (see
// looksLikeHostPath) are left alone and nil is returned. A chmod failure that
// isAccessDenied recognises is ignored; any other failure is returned.
func PrepareStateDir(stateDir string) error {
	const dirMode os.FileMode = 0o777

	dir := strings.TrimSpace(stateDir)
	if dir == "" {
		return nil
	}
	if !looksLikeHostPath(dir) {
		return nil
	}

	if err := os.MkdirAll(dir, dirMode); err != nil {
		return fmt.Errorf("prepare state dir %q: %w", dir, err)
	}

	// MkdirAll applies the mode before the umask, so chmod explicitly to
	// get the intended permission bits.
	err := os.Chmod(dir, dirMode)
	switch {
	case err == nil:
		return nil
	case isAccessDenied(err):
		return nil
	default:
		return fmt.Errorf("chmod state dir %q: %w", dir, err)
	}
}
// DockerRunArgs builds the argument list for "docker run" from the
// normalized cfg. The order is: fixed flags (name, restart policy, network,
// state volume), VPN gateway flags when enabled, AdditionalDockerRunArgs,
// one "-e" pair per NodeAgentEnv entry, and finally the image.
func DockerRunArgs(cfg RuntimeConfig) []string {
	cfg = cfg.Normalize()

	stateMount := cfg.StateDir + ":/var/lib/rap-node-agent"
	args := []string{"run", "-d"}
	args = append(args, "--name", cfg.ContainerName)
	args = append(args, "--restart", cfg.RestartPolicy)
	args = append(args, "--network", cfg.Network)
	args = append(args, "-v", stateMount)

	if cfg.DockerVPNGatewayEnabled {
		args = append(args, "--privileged")
		args = append(args, "--cap-add", "NET_ADMIN")
		args = append(args, "--device", "/dev/net/tun:/dev/net/tun")
	}

	args = append(args, cfg.AdditionalDockerRunArgs...)
	for _, assignment := range NodeAgentEnv(cfg) {
		args = append(args, "-e", assignment)
	}
	return append(args, cfg.Image)
}
// NodeAgentEnv returns the node agent environment for cfg with
// RAP_NODE_STATE_DIR fixed to /var/lib/rap-node-agent.
func NodeAgentEnv(cfg RuntimeConfig) []string {
	const containerStateDir = "/var/lib/rap-node-agent"
	return NodeAgentEnvWithStateDir(cfg, containerStateDir)
}
// NodeAgentEnvWithStateDir returns the KEY=VALUE environment entries for the
// node agent, built from the normalized cfg.
//
// stateDir becomes RAP_NODE_STATE_DIR; when blank, cfg.StateDir is used.
// A fixed set of variables is always emitted, optional string settings only
// when non-empty, optional numeric settings only when positive, and
// cfg.ExtraEnv is appended last.
func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
	cfg = cfg.Normalize()
	stateDir = firstNonEmpty(stateDir, cfg.StateDir)

	var env []string
	set := func(prefix, value string) {
		env = append(env, prefix+value)
	}
	setIfPresent := func(prefix, value string) {
		if value != "" {
			set(prefix, value)
		}
	}
	setIfPositive := func(prefix string, value int) {
		if value > 0 {
			set(prefix, strconv.Itoa(value))
		}
	}

	// Always emitted.
	set("RAP_BACKEND_URL=", cfg.BackendURL)
	set("RAP_CLUSTER_ID=", cfg.ClusterID)
	set("RAP_NODE_NAME=", cfg.NodeName)
	set("RAP_NODE_STATE_DIR=", stateDir)
	set("RAP_HEARTBEAT_INTERVAL_SECONDS=", strconv.Itoa(cfg.HeartbeatIntervalSeconds))
	set("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=", strconv.Itoa(cfg.EnrollmentPollIntervalSeconds))
	set("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=", strconv.Itoa(cfg.EnrollmentPollTimeoutSeconds))
	set("RAP_WORKLOAD_SUPERVISION_ENABLED=", boolString(cfg.WorkloadSupervisionEnabled))
	set("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=", boolString(cfg.MeshSyntheticRuntimeEnabled))
	set("RAP_MESH_PRODUCTION_FORWARDING_ENABLED=", boolString(cfg.MeshProductionForwardingEnabled))

	// Emitted only when configured.
	setIfPresent("RAP_JOIN_TOKEN=", cfg.JoinToken)
	setIfPresent("RAP_MESH_LISTEN_ADDR=", cfg.MeshListenAddr)
	setIfPresent("RAP_MESH_LISTEN_PORT_MODE=", cfg.MeshListenPortMode)
	setIfPositive("RAP_MESH_LISTEN_AUTO_PORT_START=", cfg.MeshListenAutoPortStart)
	setIfPositive("RAP_MESH_LISTEN_AUTO_PORT_END=", cfg.MeshListenAutoPortEnd)
	setIfPresent("RAP_MESH_ADVERTISE_ENDPOINT=", cfg.MeshAdvertiseEndpoint)
	setIfPresent("RAP_MESH_ADVERTISE_ENDPOINTS_JSON=", cfg.MeshAdvertiseEndpointsJSON)
	setIfPresent("RAP_MESH_ADVERTISE_TRANSPORT=", cfg.MeshAdvertiseTransport)
	setIfPresent("RAP_MESH_CONNECTIVITY_MODE=", cfg.MeshConnectivityMode)
	setIfPresent("RAP_MESH_NAT_TYPE=", cfg.MeshNATType)
	setIfPresent("RAP_MESH_REGION=", cfg.MeshRegion)
	setIfPositive("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY=", cfg.ProductionObservationSinkCap)

	return append(env, cfg.ExtraEnv...)
}
// RedactedArgs returns a copy of args in which the value of every
// RAP_JOIN_TOKEN assignment that directly follows a "-e" flag is replaced by
// "***". The input slice is left untouched.
func RedactedArgs(args []string) []string {
	redacted := append([]string(nil), args...)
	for i := 1; i < len(redacted); i++ {
		if redacted[i-1] != "-e" {
			continue
		}
		if strings.HasPrefix(redacted[i], "RAP_JOIN_TOKEN=") {
			redacted[i] = "RAP_JOIN_TOKEN=***"
		}
	}
	return redacted
}
// isNoSuchContainerError reports whether err's message, compared
// case-insensitively, contains "no such container" or "no such object".
// err must be non-nil.
func isNoSuchContainerError(err error) bool {
	message := strings.ToLower(err.Error())
	for _, marker := range []string{"no such container", "no such object"} {
		if strings.Contains(message, marker) {
			return true
		}
	}
	return false
}
// looksLikeHostPath reports whether value looks like a filesystem path
// rather than a bare name: it is absolute, starts with "." or "~", or
// contains a forward slash or backslash.
func looksLikeHostPath(value string) bool {
	switch {
	case filepath.IsAbs(value):
		return true
	case strings.HasPrefix(value, "."), strings.HasPrefix(value, "~"):
		return true
	default:
		return strings.ContainsAny(value, `/\`)
	}
}
// boolString renders value as the literal "true" or "false".
func boolString(value bool) string {
	switch value {
	case true:
		return "true"
	default:
		return "false"
	}
}
@@ -0,0 +1,366 @@
package hostagent
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
)
// recordingRunner is a CommandRunner test double that records every
// invocation and never fails.
type recordingRunner struct {
	calls [][]string
}

// Run appends the command line (name followed by args) to r.calls. It returns
// "container-1\n" when the first argument is "run" and an empty string
// otherwise.
func (r *recordingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	invocation := make([]string, 0, len(args)+1)
	invocation = append(invocation, name)
	invocation = append(invocation, args...)
	r.calls = append(r.calls, invocation)

	if len(args) == 0 || args[0] != "run" {
		return "", nil
	}
	return "container-1\n", nil
}
// imageMissingRunner is a CommandRunner test double in which the first
// "image inspect" fails, as if the image were absent, and later ones succeed.
type imageMissingRunner struct {
	calls [][]string
	inspectSeen int
}

// Run records the command line and answers by subcommand: the first
// "image inspect" returns a "No such image" error and later ones "[]";
// "run" returns "container-1\n"; everything else returns an empty string.
func (r *imageMissingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))

	switch {
	case len(args) >= 3 && args[0] == "image" && args[1] == "inspect":
		r.inspectSeen++
		if r.inspectSeen != 1 {
			return "[]", nil
		}
		return "", fmt.Errorf("No such image")
	case len(args) > 0 && args[0] == "run":
		return "container-1\n", nil
	default:
		return "", nil
	}
}
// imagePresentRunner is a CommandRunner test double in which every command
// succeeds, so "image inspect" always reports the image as present.
type imagePresentRunner struct {
	calls [][]string
}

// Run records the command line and returns "container-1\n" for "run" and
// "[]" for any other command.
func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))

	isRun := len(args) > 0 && args[0] == "run"
	if !isRun {
		return "[]", nil
	}
	return "container-1\n", nil
}
// TestDockerRunArgsBuildNodeRuntimePlacement checks that DockerRunArgs emits
// the expected flags, volume mount, RAP_* environment entries and image, and
// that trailing slashes are stripped from the backend and advertise URLs.
func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
	args := DockerRunArgs(RuntimeConfig{
		BackendURL: "http://control/api/v1/",
		ClusterID: "cluster-1",
		JoinToken: "join-secret",
		NodeName: "node-a",
		Image: "rap-node-agent:test",
		ContainerName: "rap-node-agent-node-a",
		StateDir: "/srv/rap/node-a",
		MeshSyntheticRuntimeEnabled: true,
		MeshListenAddr: ":19131",
		MeshAdvertiseEndpoint: "http://10.0.0.11:19131/",
		MeshConnectivityMode: "private_lan",
	})
	// Join with NUL so a flag and its value can be matched as an adjacent pair.
	joined := strings.Join(args, "\x00")
	for _, want := range []string{
		"run", "-d", "--name\x00rap-node-agent-node-a", "--network\x00host",
		"-v\x00/srv/rap/node-a:/var/lib/rap-node-agent",
		"RAP_BACKEND_URL=http://control/api/v1",
		"RAP_CLUSTER_ID=cluster-1",
		"RAP_JOIN_TOKEN=join-secret",
		"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
		"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
		"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
		"RAP_MESH_LISTEN_ADDR=:19131",
		"RAP_MESH_ADVERTISE_ENDPOINT=http://10.0.0.11:19131",
		"RAP_MESH_CONNECTIVITY_MODE=private_lan",
		"rap-node-agent:test",
	} {
		if !strings.Contains(joined, want) {
			t.Fatalf("docker args missing %q in %#v", want, args)
		}
	}
}
// TestDockerRunArgsEnableVPNGatewayDevice checks that enabling the docker VPN
// gateway adds --privileged, --cap-add NET_ADMIN and the /dev/net/tun device
// mapping to the docker run arguments.
func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
	args := DockerRunArgs(RuntimeConfig{
		BackendURL: "http://control/api/v1",
		ClusterID: "cluster-1",
		JoinToken: "join-secret",
		NodeName: "node-a",
		StateDir: "rap-node-state",
		DockerVPNGatewayEnabled: true,
	})
	joined := strings.Join(args, "\x00")
	for _, want := range []string{
		"--privileged",
		"--cap-add\x00NET_ADMIN",
		"--device\x00/dev/net/tun:/dev/net/tun",
	} {
		if !strings.Contains(joined, want) {
			t.Fatalf("docker vpn gateway args missing %q in %#v", want, args)
		}
	}
}
// TestPrepareStateDirCreatesWritableHostPath checks that PrepareStateDir
// creates a missing directory under a temp dir and leaves it with all 0o777
// permission bits set.
func TestPrepareStateDirCreatesWritableHostPath(t *testing.T) {
	dir := filepath.Join(t.TempDir(), "node-state")
	if err := PrepareStateDir(dir); err != nil {
		t.Fatalf("prepare state dir: %v", err)
	}
	info, err := os.Stat(dir)
	if err != nil {
		t.Fatalf("stat state dir: %v", err)
	}
	if !info.IsDir() {
		t.Fatalf("state path is not a directory")
	}
	if info.Mode().Perm()&0o777 != 0o777 {
		t.Fatalf("state dir mode = %v, want writable for container nonroot user", info.Mode().Perm())
	}
}
// TestPrepareStateDirSkipsNamedVolume checks that a bare name with no path
// separators is accepted by PrepareStateDir without error.
func TestPrepareStateDirSkipsNamedVolume(t *testing.T) {
	if err := PrepareStateDir("rap-node-state"); err != nil {
		t.Fatalf("named volume should be ignored: %v", err)
	}
}
// TestFetchDockerInstallProfileBuildsRuntimeConfig serves a canned install
// profile from a local HTTP server, fetches it with FetchDockerInstallProfile
// and checks the RuntimeConfig derived from it via RuntimeConfigFromProfile.
func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/api/v1/node-agents/docker-install-profile" {
			t.Fatalf("path = %s", r.URL.Path)
		}
		_ = json.NewEncoder(w).Encode(map[string]any{
			"docker_install_profile": map[string]any{
				"cluster_id": "cluster-1",
				"backend_url": "https://control.example.test/api/v1",
				"join_token": "rap_join_profile",
				"node_name": "node-a",
				"image": "rap-node-agent:test",
				"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
				"docker_image_artifact": map[string]any{
					"kind": "docker_image_tar",
					"image": "rap-node-agent:test",
					"file_name": "rap-node-agent-test.tar",
					"size_bytes": 21,
				},
				"container_name": "rap-node-agent-node-a",
				"state_dir": "/var/lib/rap/nodes/node-a",
				"network": "host",
				"restart_policy": "unless-stopped",
				"replace": true,
				"mesh_synthetic_runtime_enabled": true,
				"mesh_connectivity_mode": "outbound_only",
			},
		})
	}))
	defer server.Close()
	profile, err := FetchDockerInstallProfile(context.Background(), ProfileRequest{
		URL: server.URL + "/api/v1",
		ClusterID: "cluster-1",
		InstallToken: "rap_join_profile",
		NodeName: "node-a",
	})
	if err != nil {
		t.Fatalf("fetch profile: %v", err)
	}
	cfg := RuntimeConfigFromProfile(profile).Normalize()
	if cfg.BackendURL != "https://control.example.test/api/v1" ||
		cfg.ClusterID != "cluster-1" ||
		cfg.JoinToken != "rap_join_profile" ||
		cfg.ContainerName != "rap-node-agent-node-a" ||
		len(cfg.ImageArtifactURLs) != 1 ||
		cfg.ImageArtifactSizeBytes != 21 ||
		!cfg.MeshSyntheticRuntimeEnabled ||
		cfg.MeshConnectivityMode != "outbound_only" {
		t.Fatalf("unexpected cfg: %+v", cfg)
	}
}
// TestInstallLoadsImageArtifactWhenImageMissing checks that Install downloads
// the artifact and issues "docker load" and "docker run" when the first
// image inspect reports the image as missing.
func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte("fake docker image tar"))
	}))
	defer server.Close()
	runner := &imageMissingRunner{}
	result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
		BackendURL: "http://control/api/v1",
		ClusterID: "cluster-1",
		JoinToken: "join-secret",
		NodeName: "node-a",
		Image: "rap-node-agent:test",
		ContainerName: "rap-node-agent-node-a",
		StateDir: "rap-node-state",
		Replace: true,
		ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
		ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
		ImageArtifactSizeBytes: 21,
	})
	if err != nil {
		t.Fatalf("install: %v", err)
	}
	if !result.Loaded || result.ContainerID != "container-1" {
		t.Fatalf("result = %+v", result)
	}
	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	if !strings.Contains(joined, "load\x00-i") || !strings.Contains(joined, "run\x00-d") {
		t.Fatalf("expected docker load and run calls, got %#v", runner.calls)
	}
}
// TestInstallAcceptsSizeMismatchWhenChecksumMissing checks that Install still
// loads the artifact when the declared size is wrong but no SHA-256 is
// configured.
func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
	const payload = "fake docker image tar"
	const wrongSize = 999
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte(payload))
	}))
	defer server.Close()
	runner := &imageMissingRunner{}
	result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
		BackendURL: "http://control/api/v1",
		ClusterID: "cluster-1",
		JoinToken: "join-secret",
		NodeName: "node-a",
		Image: "rap-node-agent:test",
		ContainerName: "rap-node-agent-node-a",
		StateDir: "rap-node-state",
		Replace: true,
		ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
		ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
		ImageArtifactSizeBytes: wrongSize,
	})
	if err != nil {
		t.Fatalf("install: %v", err)
	}
	if !result.Loaded || result.ContainerID != "container-1" {
		t.Fatalf("result = %+v", result)
	}
}
// TestInstallReloadsImageArtifactWhenReplacingMutableTag checks that a
// Replace install issues "docker load" even though image inspect reports the
// image as already present.
func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		_, _ = w.Write([]byte("fake docker image tar"))
	}))
	defer server.Close()
	runner := &imagePresentRunner{}
	result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
		BackendURL: "http://control/api/v1",
		ClusterID: "cluster-1",
		JoinToken: "join-secret",
		NodeName: "node-a",
		Image: "rap-node-agent:test",
		ContainerName: "rap-node-agent-node-a",
		StateDir: "rap-node-state",
		Replace: true,
		ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
		ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
		ImageArtifactSizeBytes: 21,
	})
	if err != nil {
		t.Fatalf("install: %v", err)
	}
	if !result.Loaded {
		t.Fatalf("expected image artifact reload, got %+v", result)
	}
	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	if !strings.Contains(joined, "load\x00-i") {
		t.Fatalf("expected docker load even when image exists during replace, got %#v", runner.calls)
	}
}
// TestDockerInstallLoadsExplicitArtifactBeforeReplace checks that an install
// with an explicit image artifact URL downloads it from the expected path,
// runs `docker load -i`, and reports both Loaded and Replaced.
func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/rap-node-agent-test.tar" {
			// The handler runs on the server's goroutine, where t.Fatalf
			// (FailNow) must not be called. Record the failure with Errorf
			// and answer 404 so the install under test fails visibly too.
			t.Errorf("unexpected path %s", r.URL.Path)
			http.NotFound(w, r)
			return
		}
		_, _ = w.Write([]byte("fake docker image tar"))
	}))
	defer server.Close()

	runner := &imageMissingRunner{}
	result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
		BackendURL:             "http://control/api/v1",
		ClusterID:              "cluster-1",
		JoinToken:              "join-secret",
		NodeName:               "node-a",
		Image:                  "rap-node-agent:test",
		ContainerName:          "rap-node-agent-node-a",
		StateDir:               "rap-node-state",
		Replace:                true,
		ImageArtifactURLs:      []string{server.URL + "/rap-node-agent-test.tar"},
		ImageArtifactSHA256:    "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
		ImageArtifactSizeBytes: 21,
	})
	if err != nil {
		t.Fatalf("install: %v", err)
	}
	if !result.Loaded || !result.Replaced {
		t.Fatalf("expected explicit artifact load and replace, got %+v", result)
	}
	// NUL-join the recorded arguments so "load" immediately followed by "-i"
	// can be matched as one adjacent pair.
	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	if !strings.Contains(joined, "load\x00-i") {
		t.Fatalf("expected docker load call, got %#v", runner.calls)
	}
}
// flattenCalls concatenates the arguments of every recorded command, in call
// order, into a single slice. The result is always non-nil, even when calls is
// empty.
func flattenCalls(calls [][]string) []string {
	total := 0
	for i := range calls {
		total += len(calls[i])
	}
	flat := make([]string, 0, total)
	for i := range calls {
		flat = append(flat, calls[i]...)
	}
	return flat
}
func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
runner := &recordingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
PullImage: true,
Replace: true,
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
})
if err != nil {
t.Fatalf("install: %v", err)
}
if !result.Pulled || !result.Replaced || result.ContainerID != "container-1" {
t.Fatalf("result = %+v", result)
}
if len(runner.calls) != 3 {
t.Fatalf("calls = %#v", runner.calls)
}
redacted := strings.Join(RedactedArgs(runner.calls[2][1:]), " ")
if strings.Contains(redacted, "join-secret") || !strings.Contains(redacted, "RAP_JOIN_TOKEN=***") {
t.Fatalf("redacted args leaked token: %s", redacted)
}
}
// TestValidateRequiresJoinTokenUnlessReplacingExistingState verifies that
// ValidateInstall rejects a config without a join token, and accepts the same
// config once Replace is set.
func TestValidateRequiresJoinTokenUnlessReplacingExistingState(t *testing.T) {
	fresh := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a"}
	if err := fresh.ValidateInstall(); err == nil || !strings.Contains(err.Error(), "join-token") {
		t.Fatalf("expected join token validation error, got %v", err)
	}

	replacing := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a", Replace: true}
	if err := replacing.ValidateInstall(); err != nil {
		t.Fatalf("replace update should allow missing join token: %v", err)
	}
}
@@ -0,0 +1,481 @@
package hostagent
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"time"
)
// Default filesystem roots for a Linux node-agent install. Install and
// LinuxInstallConfigFromProfile append a per-node subdirectory (the
// safeUnitSlug of the node name) to each root when no explicit directory is
// configured.
const (
// DefaultLinuxInstallRoot is the parent of the per-node directory that holds
// the rap-node-agent and rap-host-agent binaries.
DefaultLinuxInstallRoot = "/opt/rap"
// DefaultLinuxStateRoot is the parent of the per-node state directory.
DefaultLinuxStateRoot = "/var/lib/rap/nodes"
// DefaultLinuxConfigRoot is the parent of the per-node directory that holds
// the rap-node-agent.env environment file.
DefaultLinuxConfigRoot = "/etc/rap"
)
// LinuxInstallConfig is the input to LinuxManager.Install: where to place the
// node-agent on a Linux host, where to fetch its binary from, and how to
// configure the companion host-agent updater unit.
type LinuxInstallConfig struct {
// RuntimeConfig is the node-agent runtime configuration. Install overwrites
// its Replace and StateDir from the fields below, then normalizes and
// validates it.
RuntimeConfig RuntimeConfig
// NodeID, when non-blank, is passed to the updater as --node-id.
NodeID string
// InstallDir, StateDir, ConfigDir and UnitDir default to per-node paths under
// the DefaultLinux* roots (UnitDir to DefaultSystemdUnitDir) when empty.
InstallDir string
StateDir string
ConfigDir string
UnitDir string
// StartupMode is lower-cased by Install and defaults to "systemd"; "none"
// skips writing and enabling any systemd unit.
StartupMode string
// ArtifactURLs, ArtifactSHA256 and ArtifactSizeBytes describe the node-agent
// binary and are handed to downloadFirstArtifact.
ArtifactURLs []string
ArtifactSHA256 string
ArtifactSizeBytes int64
// Replace forces a re-download of the binary even if one already exists.
Replace bool
// DryRun makes Install return the computed paths without touching the host.
DryRun bool
// AutoUpdateEnabled installs and starts the host-agent updater unit.
AutoUpdateEnabled bool
// AutoUpdateCurrentVersion is reported to the updater; it becomes "0.0.0"
// when empty, or when Replace is set but nothing was downloaded.
AutoUpdateCurrentVersion string
// AutoUpdateChannel, when non-blank, is passed to the updater as --channel.
AutoUpdateChannel string
// Zero values default to 21600 (interval), 15 (initial delay) and 30
// (health timeout) seconds.
AutoUpdateIntervalSeconds int
AutoUpdateInitialDelaySeconds int
AutoUpdateHealthTimeoutSeconds int
// HostAgentSourcePath is passed to installHostAgentBinary as the source of
// the host-agent binary.
HostAgentSourcePath string
}
// LinuxInstallResult reports the paths and unit names LinuxManager.Install
// resolved, plus which steps actually ran. The path fields are populated even
// on a dry run or when a later step fails.
type LinuxInstallResult struct {
NodeName string
InstallDir string
StateDir string
ConfigDir string
// NodeAgentPath and HostAgentPath are the binary locations inside InstallDir.
NodeAgentPath string
HostAgentPath string
// EnvPath is the environment file referenced by the node-agent unit.
EnvPath string
// UnitName and UnitPath identify the node-agent systemd unit.
UnitName string
UnitPath string
// UpdaterUnitName is set once the host-agent updater unit has been enabled.
UpdaterUnitName string
// Downloaded is true when the node-agent binary was fetched and copied.
Downloaded bool
// Started is true after `systemctl enable --now` succeeded for the node-agent.
Started bool
// UpdaterStarted is true after `systemctl enable --now` succeeded for the updater.
UpdaterStarted bool
}
// LinuxManager installs and updates a node-agent that runs as a systemd
// service on a Linux host.
type LinuxManager struct {
// Runner executes external commands (systemctl). When nil, the runner()
// accessor falls back to ExecRunner{}.
Runner CommandRunner
}
// LinuxInstallConfigFromProfile translates a backend-issued Linux install
// profile into a LinuxInstallConfig. Directories missing from the profile
// default to per-node paths under the DefaultLinux* roots, the startup mode
// defaults to "systemd", and Replace and AutoUpdateEnabled are always set.
func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConfig {
	slug := safeUnitSlug(profile.NodeName)
	stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultLinuxStateRoot, slug))

	var runtimeCfg RuntimeConfig
	runtimeCfg.BackendURL = profile.BackendURL
	runtimeCfg.ClusterID = profile.ClusterID
	runtimeCfg.JoinToken = profile.JoinToken
	runtimeCfg.NodeName = profile.NodeName
	runtimeCfg.StateDir = stateDir
	runtimeCfg.WorkloadSupervisionEnabled = profile.WorkloadSupervisionEnabled
	runtimeCfg.MeshSyntheticRuntimeEnabled = profile.MeshSyntheticRuntimeEnabled
	runtimeCfg.MeshProductionForwardingEnabled = profile.MeshProductionForwardingEnabled
	runtimeCfg.MeshListenAddr = profile.MeshListenAddr
	runtimeCfg.MeshListenPortMode = profile.MeshListenPortMode
	runtimeCfg.MeshListenAutoPortStart = profile.MeshListenAutoPortStart
	runtimeCfg.MeshListenAutoPortEnd = profile.MeshListenAutoPortEnd
	runtimeCfg.MeshAdvertiseEndpoint = profile.MeshAdvertiseEndpoint
	// The raw JSON is carried through verbatim as a string.
	runtimeCfg.MeshAdvertiseEndpointsJSON = string(profile.MeshAdvertiseEndpointsJSON)
	runtimeCfg.MeshAdvertiseTransport = profile.MeshAdvertiseTransport
	runtimeCfg.MeshConnectivityMode = profile.MeshConnectivityMode
	runtimeCfg.MeshNATType = profile.MeshNATType
	runtimeCfg.MeshRegion = profile.MeshRegion
	runtimeCfg.HeartbeatIntervalSeconds = profile.HeartbeatIntervalSeconds
	runtimeCfg.EnrollmentPollIntervalSeconds = profile.EnrollmentPollIntervalSeconds
	runtimeCfg.EnrollmentPollTimeoutSeconds = profile.EnrollmentPollTimeoutSeconds
	runtimeCfg.ProductionObservationSinkCap = profile.ProductionObservationSinkCapacity

	var cfg LinuxInstallConfig
	cfg.RuntimeConfig = runtimeCfg
	cfg.InstallDir = firstNonEmpty(profile.InstallDir, filepath.Join(DefaultLinuxInstallRoot, slug))
	cfg.StateDir = stateDir
	cfg.ConfigDir = filepath.Join(DefaultLinuxConfigRoot, slug)
	cfg.StartupMode = firstNonEmpty(profile.StartupMode, "systemd")
	cfg.ArtifactURLs = linuxArtifactURLs(profile)
	cfg.ArtifactSHA256 = linuxArtifactSHA256(profile)
	cfg.ArtifactSizeBytes = linuxArtifactSizeBytes(profile)
	cfg.Replace = true
	cfg.AutoUpdateEnabled = true
	return cfg
}
// linuxArtifactURLs returns the download URLs for the node-agent binary.
// Explicit artifact URLs win and are returned as a copy. Otherwise each
// non-blank artifact endpoint is joined with the artifact file name. It
// returns nil when the profile has no artifact or the artifact has neither
// URLs nor a file name.
func linuxArtifactURLs(profile LinuxInstallProfile) []string {
	artifact := profile.NodeAgentArtifact
	if artifact == nil {
		return nil
	}
	if n := len(artifact.URLs); n > 0 {
		urls := make([]string, n)
		copy(urls, artifact.URLs)
		return urls
	}
	name := strings.TrimSpace(artifact.FileName)
	if name == "" {
		return nil
	}
	// Avoid a double slash when the file name is given with a leading "/".
	name = strings.TrimLeft(name, "/")
	urls := make([]string, 0, len(profile.ArtifactEndpoints))
	for _, endpoint := range profile.ArtifactEndpoints {
		base := strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if base == "" {
			continue
		}
		urls = append(urls, base+"/"+name)
	}
	return urls
}
// linuxArtifactSHA256 returns the whitespace-trimmed SHA-256 recorded for the
// node-agent artifact, or "" when the profile carries no artifact.
func linuxArtifactSHA256(profile LinuxInstallProfile) string {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return strings.TrimSpace(artifact.SHA256)
	}
	return ""
}
// linuxArtifactSizeBytes returns the size recorded for the node-agent
// artifact, or 0 when the profile carries no artifact.
func linuxArtifactSizeBytes(profile LinuxInstallProfile) int64 {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return artifact.SizeBytes
	}
	return 0
}
// Install lays out a node-agent installation on a Linux host.
//
// It normalizes and validates cfg.RuntimeConfig, resolves the install, state,
// config and unit directories (defaulting to per-node paths), and then:
//
//  1. creates the directories;
//  2. downloads the node-agent binary when artifact URLs are configured and
//     either cfg.Replace is set or no binary exists yet;
//  3. writes the environment file;
//  4. unless StartupMode is "none", writes the systemd unit, reloads systemd
//     and enables/starts the service;
//  5. hands over to installLinuxHostAgentUpdater for the updater unit.
//
// With cfg.DryRun the resolved paths are returned without touching the host.
// On failure the returned result still carries the resolved paths and any
// progress flags set so far, alongside the error.
func (m LinuxManager) Install(ctx context.Context, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
	cfg.NodeID = strings.TrimSpace(cfg.NodeID)
	cfg.RuntimeConfig.Replace = cfg.Replace
	cfg.RuntimeConfig.StateDir = firstNonEmpty(cfg.StateDir, cfg.RuntimeConfig.StateDir)
	cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
	if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
		return LinuxInstallResult{}, err
	}

	slug := safeUnitSlug(cfg.RuntimeConfig.NodeName)
	cfg.InstallDir = firstNonEmpty(cfg.InstallDir, filepath.Join(DefaultLinuxInstallRoot, slug))
	cfg.StateDir = firstNonEmpty(cfg.RuntimeConfig.StateDir, filepath.Join(DefaultLinuxStateRoot, slug))
	cfg.ConfigDir = firstNonEmpty(cfg.ConfigDir, filepath.Join(DefaultLinuxConfigRoot, slug))
	cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
	cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "systemd"))

	unitName := "rap-node-agent-" + slug + ".service"
	result := LinuxInstallResult{
		NodeName:      cfg.RuntimeConfig.NodeName,
		InstallDir:    cfg.InstallDir,
		StateDir:      cfg.StateDir,
		ConfigDir:     cfg.ConfigDir,
		NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
		HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
		EnvPath:       filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
		UnitName:      unitName,
		UnitPath:      filepath.Join(cfg.UnitDir, unitName),
	}
	if cfg.DryRun {
		return result, nil
	}
	if runtime.GOOS != "linux" {
		return result, fmt.Errorf("linux install is only supported on linux hosts")
	}

	if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
		return result, err
	}
	// State is private to the service account; install and config are not.
	if err := os.MkdirAll(cfg.StateDir, 0o700); err != nil {
		return result, err
	}
	if err := os.MkdirAll(cfg.ConfigDir, 0o755); err != nil {
		return result, err
	}

	if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
		// Download (and verify) before stopping anything, so a failed
		// download or checksum mismatch never takes a running node-agent
		// offline. This matches the order used by ApplyUpdate.
		path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
		if err != nil {
			return result, err
		}
		defer os.Remove(path)

		// Best effort: stop the service before overwriting its binary.
		m.stopService(ctx, result.UnitName)
		if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
			// The service may have been restarted in between; stop it once
			// more and retry. The first copy error is the one reported.
			m.stopService(ctx, result.UnitName)
			if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr != nil {
				return result, err
			}
		}
		result.Downloaded = true
	}
	if !fileExists(result.NodeAgentPath) {
		return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
	}

	// The env file carries the runtime configuration, so keep it owner-only.
	if err := os.WriteFile(result.EnvPath, []byte(linuxEnvFile(cfg.RuntimeConfig, cfg.StateDir)), 0o600); err != nil {
		return result, err
	}

	if cfg.StartupMode != "none" {
		if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
			return result, err
		}
		if err := os.WriteFile(result.UnitPath, []byte(linuxNodeAgentUnit(result)), 0o644); err != nil {
			return result, err
		}
		runner := m.runner()
		if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
			return result, err
		}
		if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.UnitName); err != nil {
			return result, err
		}
		result.Started = true
	}
	return installLinuxHostAgentUpdater(ctx, m, result, cfg)
}
// stopService asks systemd to stop unitName. It is best effort: a blank unit
// name is ignored and any error from `systemctl stop` is discarded.
func (m LinuxManager) stopService(ctx context.Context, unitName string) {
	if strings.TrimSpace(unitName) != "" {
		_, _ = m.runner().Run(ctx, "systemctl", "stop", unitName)
	}
}
// runner returns the configured command runner, falling back to ExecRunner{}
// when none was injected.
func (m LinuxManager) runner() CommandRunner {
	if m.Runner == nil {
		return ExecRunner{}
	}
	return m.Runner
}
// linuxEnvFile renders the node-agent environment as the content of a systemd
// EnvironmentFile: one KEY=<quoted value> line per entry returned by
// NodeAgentEnvWithStateDir, terminated by a newline. Entries without an "="
// are skipped.
func linuxEnvFile(cfg RuntimeConfig, stateDir string) string {
	var b strings.Builder
	wrote := false
	for _, entry := range NodeAgentEnvWithStateDir(cfg, stateDir) {
		name, raw, found := strings.Cut(entry, "=")
		if !found {
			continue
		}
		if wrote {
			b.WriteByte('\n')
		}
		b.WriteString(name)
		b.WriteByte('=')
		b.WriteString(systemdQuote(raw))
		wrote = true
	}
	// A trailing newline is always emitted, even when there are no entries.
	b.WriteByte('\n')
	return b.String()
}
// linuxNodeAgentUnit renders the systemd unit file for the node-agent service.
// The unit starts after network-online.target, loads its environment from
// result.EnvPath, executes result.NodeAgentPath and always restarts after a
// 10 second delay.
func linuxNodeAgentUnit(result LinuxInstallResult) string {
	const unitTemplate = `[Unit]
Description=RAP node-agent %s
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
EnvironmentFile=%s
ExecStart=%s
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
`
	envFile := systemdQuote(result.EnvPath)
	execStart := systemdQuote(result.NodeAgentPath)
	return fmt.Sprintf(unitTemplate, result.NodeName, envFile, execStart)
}
// installLinuxHostAgentUpdater installs the host-agent binary and a systemd
// unit that runs it in `update-loop` mode for the node described by result.
//
// It is a no-op when auto-update is disabled or StartupMode is "none". On
// success the returned result has UpdaterUnitName and UpdaterStarted set.
func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result LinuxInstallResult, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
	if !cfg.AutoUpdateEnabled || strings.EqualFold(cfg.StartupMode, "none") {
		return result, nil
	}

	// With no known version, or when a replace was requested but no new
	// binary was downloaded, report "0.0.0" to the updater.
	version := cfg.AutoUpdateCurrentVersion
	if version == "" || (cfg.Replace && !result.Downloaded) {
		version = "0.0.0"
	}

	if err := installHostAgentBinary(cfg.HostAgentSourcePath, result.HostAgentPath); err != nil {
		return result, err
	}

	// secondsFlag renders a seconds setting, substituting fallback for zero.
	secondsFlag := func(value, fallback int) string {
		if value == 0 {
			value = fallback
		}
		return fmt.Sprintf("%d", value)
	}

	args := []string{
		result.HostAgentPath,
		"update-loop",
		"--backend-url", cfg.RuntimeConfig.BackendURL,
		"--cluster-id", cfg.RuntimeConfig.ClusterID,
		"--state-dir", result.StateDir,
		"--current-version", version,
		"--os", "linux",
		"--arch", runtime.GOARCH,
		"--install-type", BinaryUpdateInstallType,
		"--binary-path", result.NodeAgentPath,
		"--systemd-unit", result.UnitName,
		"--health-timeout-seconds", secondsFlag(cfg.AutoUpdateHealthTimeoutSeconds, 30),
		"--interval-seconds", secondsFlag(cfg.AutoUpdateIntervalSeconds, 21600),
		"--initial-delay-seconds", secondsFlag(cfg.AutoUpdateInitialDelaySeconds, 15),
		"--host-agent-update-status-enabled",
		"--host-agent-current-version", firstNonEmpty(version, "0.0.0"),
		"--host-agent-binary-path", result.HostAgentPath,
	}
	if nodeID := strings.TrimSpace(cfg.NodeID); nodeID != "" {
		args = append(args, "--node-id", nodeID)
	}
	if channel := strings.TrimSpace(cfg.AutoUpdateChannel); channel != "" {
		args = append(args, "--channel", channel)
	}

	const unitTemplate = `[Unit]
Description=RAP host-agent updater for %s
After=network-online.target %s
Wants=network-online.target
[Service]
Type=simple
ExecStart=%s
Restart=always
RestartSec=30
[Install]
WantedBy=multi-user.target
`
	updaterUnit := "rap-host-agent-updater-" + safeUnitSlug(result.NodeName) + ".service"
	updaterPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), updaterUnit)
	content := fmt.Sprintf(unitTemplate, result.NodeName, result.UnitName, systemdJoin(args))
	if err := os.WriteFile(updaterPath, []byte(content), 0o644); err != nil {
		return result, err
	}

	systemctl := m.runner()
	if _, err := systemctl.Run(ctx, "systemctl", "daemon-reload"); err != nil {
		return result, err
	}
	if _, err := systemctl.Run(ctx, "systemctl", "enable", "--now", updaterUnit); err != nil {
		return result, err
	}
	result.UpdaterUnitName = updaterUnit
	result.UpdaterStarted = true
	return result, nil
}
// ApplyUpdate fetches the node update plan for req and, when the plan's action
// is "update", downloads the plan's artifact, copies it over req.BinaryPath
// and restarts the systemd unit req.SystemdUnitName.
//
// InstallType, OS and Arch default to BinaryUpdateInstallType, "linux" and
// runtime.GOARCH. Progress and failures are reported to the backend through
// ReportNodeUpdateStatus; errors from those reports are deliberately ignored
// so that reporting problems never fail the update itself. With req.DryRun
// nothing is downloaded or changed, and no-op plans are not reported.
//
// In the returned UpdateResult, ContainerName carries the systemd unit name
// and NewImage carries the binary path.
func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
req.OS = firstNonEmpty(req.OS, "linux")
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
req = req.Normalize()
var err error
req, err = resolveUpdateRequest(req)
if err != nil {
return UpdateResult{}, err
}
plan, err := FetchNodeUpdatePlan(ctx, req)
if err != nil {
return UpdateResult{}, err
}
result := UpdateResult{Action: plan.Action, Reason: plan.Reason, TargetVersion: plan.TargetVersion, ContainerName: req.SystemdUnitName, NewImage: req.BinaryPath}
// Any action other than "update" is a no-op: report it (unless dry-running)
// and return without touching the host.
if plan.Action != "update" {
if !req.DryRun {
status := statusFromNoopPlan(req, plan)
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["systemd_unit"] = req.SystemdUnitName
status.Payload["binary_path"] = req.BinaryPath
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
}
return result, nil
}
// Preflight checks: each failure is reported with phase "preflight" and
// returned to the caller.
if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact == nil {
err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
// A dry run stops after preflight, before any download or host change.
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
return result, err
}
// The downloaded file is temporary; remove it once it has been copied.
defer os.Remove(path)
runner := m.runner()
// Best-effort stop before overwriting the binary; the error is ignored.
// NOTE(review): if the copy below fails, the function returns without
// restarting the unit, so the service stays stopped.
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
return result, err
}
result.Replaced = true
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
// "health_check succeeded" is reported as soon as the restart command
// returns; no separate health probe is performed in this function.
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
// Persist the new version; a failure to save the state is ignored.
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
return result, nil
}
// RunUpdateLoop fills in the Linux defaults for the update request (binary
// install type, "linux" OS, the running architecture) and then runs the
// periodic update loop until it stops or ctx is cancelled.
func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
	cfg.Request.InstallType = firstNonEmpty(cfg.Request.InstallType, BinaryUpdateInstallType)
	cfg.Request.OS = firstNonEmpty(cfg.Request.OS, "linux")
	cfg.Request.Arch = firstNonEmpty(cfg.Request.Arch, runtime.GOARCH)
	return runLinuxUpdateLoop(ctx, m, cfg)
}
// runLinuxUpdateLoop repeatedly applies node-agent updates and, when enabled,
// host-agent updates, sleeping between rounds until the interval elapses or an
// update trigger fires.
//
// A zero Interval defaults to one hour and a nil Logf discards log lines. The
// loop returns nil after cfg.MaxRuns rounds (when MaxRuns > 0), returns the
// update error when cfg.StopOnError is set, and returns the sleep error
// (typically context cancellation) otherwise. While the node identity is not
// ready, the round is logged as waiting and the host-agent update is skipped.
func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfig) error {
	if cfg.Interval == 0 {
		cfg.Interval = time.Hour
	}
	logf := cfg.Logf
	if logf == nil {
		logf = func(string, ...any) {}
	}
	if cfg.InitialDelay > 0 {
		if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
			return err
		}
	}

	lastTriggerGeneration := currentUpdateTriggerGeneration(cfg.Request.StateDir)
	for runs := 1; ; runs++ {
		result, err := m.ApplyUpdate(ctx, cfg.Request)
		identityPending := err != nil && errors.Is(err, ErrNodeIdentityNotReady)
		switch {
		case err == nil:
			logf("linux_update_loop run=%d action=%s reason=%s target=%s unit=%s replaced=%t", runs, result.Action, result.Reason, result.TargetVersion, result.ContainerName, result.Replaced)
			// Remember the applied version so the next round asks for a plan
			// relative to it.
			if result.Action == "update" && result.TargetVersion != "" {
				cfg.Request.CurrentVersion = result.TargetVersion
			}
		case identityPending:
			logf("linux_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, cfg.Request.StateDir)
		default:
			logf("linux_update_loop run=%d status=failed error=%v", runs, err)
			if cfg.StopOnError {
				return err
			}
		}

		if !identityPending && cfg.HostAgentUpdateEnabled {
			// Host-agent request fields left blank inherit from the node
			// request or fall back to the Linux defaults.
			hostReq := cfg.HostAgentUpdateRequest
			hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, cfg.Request.BackendURL)
			hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, cfg.Request.ClusterID)
			hostReq.NodeID = firstNonEmpty(hostReq.NodeID, cfg.Request.NodeID)
			hostReq.StateDir = firstNonEmpty(hostReq.StateDir, cfg.Request.StateDir)
			hostReq.Channel = firstNonEmpty(hostReq.Channel, cfg.Request.Channel)
			hostReq.OS = firstNonEmpty(hostReq.OS, "linux")
			hostReq.Arch = firstNonEmpty(hostReq.Arch, runtime.GOARCH)
			hostReq.InstallType = firstNonEmpty(hostReq.InstallType, BinaryUpdateInstallType)

			hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
			if hostErr != nil {
				logf("linux_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
			} else {
				logf("linux_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t", runs, hostResult.Action, hostResult.Reason, hostResult.TargetVersion, hostResult.NewImage, hostResult.Replaced, hostResult.RestartNeeded)
				if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
					cfg.HostAgentUpdateRequest.CurrentVersion = hostResult.TargetVersion
				}
			}
		}

		if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
			return nil
		}
		if err := sleepUntilUpdateIntervalOrTrigger(ctx, cfg.Request.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
			return err
		}
	}
}
@@ -0,0 +1,333 @@
package hostagent
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
)
// DockerInstallProfile is the install profile for a Docker-hosted node-agent.
// FetchDockerInstallProfile decodes it from the "docker_install_profile" key
// of the backend response, and RuntimeConfigFromProfile converts it into a
// RuntimeConfig.
type DockerInstallProfile struct {
SchemaVersion                     string          `json:"schema_version"`
ClusterID                         string          `json:"cluster_id"`
// BackendURL, when blank in the response, is filled by
// FetchDockerInstallProfile with the first ControlPlaneEndpoints entry.
BackendURL                        string          `json:"backend_url"`
ControlPlaneEndpoints             []string        `json:"control_plane_endpoints"`
// ArtifactEndpoints are base URLs joined with DockerImageArtifact.FileName
// when the artifact lists no explicit URLs.
ArtifactEndpoints                 []string        `json:"artifact_endpoints"`
DockerImageArtifact               *DockerArtifact `json:"docker_image_artifact"`
JoinToken                         string          `json:"join_token"`
NodeName                          string          `json:"node_name"`
Image                             string          `json:"image"`
ContainerName                     string          `json:"container_name"`
StateDir                          string          `json:"state_dir"`
Network                           string          `json:"network"`
RestartPolicy                     string          `json:"restart_policy"`
PullImage                         bool            `json:"pull_image"`
Replace                           bool            `json:"replace"`
DockerVPNGatewayEnabled           bool            `json:"docker_vpn_gateway_enabled"`
WorkloadSupervisionEnabled        bool            `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled       bool            `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled   bool            `json:"mesh_production_forwarding_enabled"`
MeshListenAddr                    string          `json:"mesh_listen_addr"`
MeshListenPortMode                string          `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart           int             `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd             int             `json:"mesh_listen_auto_port_end"`
MeshAdvertiseEndpoint             string          `json:"mesh_advertise_endpoint"`
// MeshAdvertiseEndpointsJSON is kept as raw JSON and copied into
// RuntimeConfig verbatim as a string.
MeshAdvertiseEndpointsJSON        json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport            string          `json:"mesh_advertise_transport"`
MeshConnectivityMode              string          `json:"mesh_connectivity_mode"`
MeshNATType                       string          `json:"mesh_nat_type"`
MeshRegion                        string          `json:"mesh_region"`
HeartbeatIntervalSeconds          int             `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds     int             `json:"enrollment_poll_interval_seconds"`
EnrollmentPollTimeoutSeconds      int             `json:"enrollment_poll_timeout_seconds"`
ProductionObservationSinkCapacity int             `json:"production_observation_sink_capacity"`
// Roles is decoded but not copied by RuntimeConfigFromProfile.
Roles                             []string        `json:"roles"`
}
// DockerArtifact describes a downloadable artifact referenced by an install
// profile. Despite the name it is used both for Docker image archives
// (DockerInstallProfile.DockerImageArtifact) and for node-agent binaries
// (the NodeAgentArtifact field of the Windows and Linux profiles).
type DockerArtifact struct {
Kind      string   `json:"kind"`
Image     string   `json:"image"`
MediaType string   `json:"media_type"`
// FileName is appended to each profile artifact endpoint when URLs is empty.
FileName  string   `json:"file_name"`
// URLs, when non-empty, are used as-is and take precedence over FileName.
URLs      []string `json:"urls"`
// SHA256 and SizeBytes are forwarded to the artifact download step.
SHA256    string   `json:"sha256"`
SizeBytes int64    `json:"size_bytes"`
}
// WindowsInstallProfile is the install profile for a node-agent on a Windows
// host. FetchWindowsInstallProfile decodes it from the
// "windows_install_profile" key of the backend response.
type WindowsInstallProfile struct {
SchemaVersion                     string          `json:"schema_version"`
ClusterID                         string          `json:"cluster_id"`
// BackendURL, when blank in the response, is filled by
// FetchWindowsInstallProfile with the first ControlPlaneEndpoints entry.
BackendURL                        string          `json:"backend_url"`
ControlPlaneEndpoints             []string        `json:"control_plane_endpoints"`
ArtifactEndpoints                 []string        `json:"artifact_endpoints"`
NodeAgentArtifact                 *DockerArtifact `json:"node_agent_artifact"`
JoinToken                         string          `json:"join_token"`
NodeName                          string          `json:"node_name"`
StateDir                          string          `json:"state_dir"`
InstallDir                        string          `json:"install_dir"`
StartupMode                       string          `json:"startup_mode"`
WorkloadSupervisionEnabled        bool            `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled       bool            `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled   bool            `json:"mesh_production_forwarding_enabled"`
MeshListenAddr                    string          `json:"mesh_listen_addr"`
MeshListenPortMode                string          `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart           int             `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd             int             `json:"mesh_listen_auto_port_end"`
MeshAdvertiseEndpoint             string          `json:"mesh_advertise_endpoint"`
// MeshAdvertiseEndpointsJSON is kept as raw, undecoded JSON.
MeshAdvertiseEndpointsJSON        json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport            string          `json:"mesh_advertise_transport"`
MeshConnectivityMode              string          `json:"mesh_connectivity_mode"`
MeshNATType                       string          `json:"mesh_nat_type"`
MeshRegion                        string          `json:"mesh_region"`
HeartbeatIntervalSeconds          int             `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds     int             `json:"enrollment_poll_interval_seconds"`
EnrollmentPollTimeoutSeconds      int             `json:"enrollment_poll_timeout_seconds"`
ProductionObservationSinkCapacity int             `json:"production_observation_sink_capacity"`
Roles                             []string        `json:"roles"`
}
// LinuxInstallProfile is the install profile for a node-agent on a Linux
// host. FetchLinuxInstallProfile decodes it from the "linux_install_profile"
// key of the backend response, and LinuxInstallConfigFromProfile converts it
// into a LinuxInstallConfig.
type LinuxInstallProfile struct {
SchemaVersion                     string          `json:"schema_version"`
ClusterID                         string          `json:"cluster_id"`
// BackendURL, when blank in the response, is filled by
// FetchLinuxInstallProfile with the first ControlPlaneEndpoints entry.
BackendURL                        string          `json:"backend_url"`
ControlPlaneEndpoints             []string        `json:"control_plane_endpoints"`
// ArtifactEndpoints are base URLs joined with NodeAgentArtifact.FileName
// when the artifact lists no explicit URLs.
ArtifactEndpoints                 []string        `json:"artifact_endpoints"`
NodeAgentArtifact                 *DockerArtifact `json:"node_agent_artifact"`
JoinToken                         string          `json:"join_token"`
NodeName                          string          `json:"node_name"`
// StateDir and InstallDir default to per-node directories under
// DefaultLinuxStateRoot and DefaultLinuxInstallRoot when empty.
StateDir                          string          `json:"state_dir"`
InstallDir                        string          `json:"install_dir"`
// StartupMode defaults to "systemd" when empty.
StartupMode                       string          `json:"startup_mode"`
WorkloadSupervisionEnabled        bool            `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled       bool            `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled   bool            `json:"mesh_production_forwarding_enabled"`
MeshListenAddr                    string          `json:"mesh_listen_addr"`
MeshListenPortMode                string          `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart           int             `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd             int             `json:"mesh_listen_auto_port_end"`
MeshAdvertiseEndpoint             string          `json:"mesh_advertise_endpoint"`
// MeshAdvertiseEndpointsJSON is kept as raw JSON and copied into
// RuntimeConfig verbatim as a string.
MeshAdvertiseEndpointsJSON        json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport            string          `json:"mesh_advertise_transport"`
MeshConnectivityMode              string          `json:"mesh_connectivity_mode"`
MeshNATType                       string          `json:"mesh_nat_type"`
MeshRegion                        string          `json:"mesh_region"`
HeartbeatIntervalSeconds          int             `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds     int             `json:"enrollment_poll_interval_seconds"`
EnrollmentPollTimeoutSeconds      int             `json:"enrollment_poll_timeout_seconds"`
ProductionObservationSinkCapacity int             `json:"production_observation_sink_capacity"`
// Roles is decoded but not copied by LinuxInstallConfigFromProfile.
Roles                             []string        `json:"roles"`
}
// ProfileRequest carries the inputs shared by the Fetch*InstallProfile
// functions.
type ProfileRequest struct {
// URL is the backend base URL (required). Trailing slashes are trimmed and
// the platform-specific endpoint path is appended unless already present.
URL          string
// ClusterID and NodeName are sent, whitespace-trimmed, in the request body.
ClusterID    string
// InstallToken is required and is sent as "install_token".
InstallToken string
NodeName     string
// HTTPClient is optional; when nil a client with a 20 second timeout is used.
HTTPClient   *http.Client
}
// FetchDockerInstallProfile requests the Docker install profile for the node
// described by req from the backend's
// /node-agents/docker-install-profile endpoint.
//
// req.URL and req.InstallToken are required. When the returned profile has no
// BackendURL, the first control-plane endpoint is used instead. On any error
// the zero profile is returned.
func FetchDockerInstallProfile(ctx context.Context, req ProfileRequest) (DockerInstallProfile, error) {
	var envelope struct {
		Profile DockerInstallProfile `json:"docker_install_profile"`
	}
	if err := postInstallProfileRequest(ctx, req, "/node-agents/docker-install-profile", "docker", &envelope); err != nil {
		return DockerInstallProfile{}, err
	}
	envelope.Profile.BackendURL = backendURLOrFirstEndpoint(envelope.Profile.BackendURL, envelope.Profile.ControlPlaneEndpoints)
	return envelope.Profile, nil
}

// FetchWindowsInstallProfile requests the Windows install profile for the node
// described by req from the backend's
// /node-agents/windows-install-profile endpoint.
//
// req.URL and req.InstallToken are required. When the returned profile has no
// BackendURL, the first control-plane endpoint is used instead. On any error
// the zero profile is returned.
func FetchWindowsInstallProfile(ctx context.Context, req ProfileRequest) (WindowsInstallProfile, error) {
	var envelope struct {
		Profile WindowsInstallProfile `json:"windows_install_profile"`
	}
	if err := postInstallProfileRequest(ctx, req, "/node-agents/windows-install-profile", "windows", &envelope); err != nil {
		return WindowsInstallProfile{}, err
	}
	envelope.Profile.BackendURL = backendURLOrFirstEndpoint(envelope.Profile.BackendURL, envelope.Profile.ControlPlaneEndpoints)
	return envelope.Profile, nil
}

// FetchLinuxInstallProfile requests the Linux install profile for the node
// described by req from the backend's
// /node-agents/linux-install-profile endpoint.
//
// req.URL and req.InstallToken are required. When the returned profile has no
// BackendURL, the first control-plane endpoint is used instead. On any error
// the zero profile is returned.
func FetchLinuxInstallProfile(ctx context.Context, req ProfileRequest) (LinuxInstallProfile, error) {
	var envelope struct {
		Profile LinuxInstallProfile `json:"linux_install_profile"`
	}
	if err := postInstallProfileRequest(ctx, req, "/node-agents/linux-install-profile", "linux", &envelope); err != nil {
		return LinuxInstallProfile{}, err
	}
	envelope.Profile.BackendURL = backendURLOrFirstEndpoint(envelope.Profile.BackendURL, envelope.Profile.ControlPlaneEndpoints)
	return envelope.Profile, nil
}

// postInstallProfileRequest performs the HTTP exchange shared by the
// Fetch*InstallProfile functions: it POSTs the trimmed cluster ID, install
// token and node name as JSON to req.URL + endpointPath (the path is not
// appended twice if req.URL already ends with it) and decodes a 2xx response
// body into envelope. platform only labels the non-2xx error message.
func postInstallProfileRequest(ctx context.Context, req ProfileRequest, endpointPath, platform string, envelope any) error {
	target := strings.TrimRight(strings.TrimSpace(req.URL), "/")
	installToken := strings.TrimSpace(req.InstallToken)
	if target == "" || installToken == "" {
		return fmt.Errorf("profile-url and install-token are required")
	}
	if !strings.HasSuffix(target, endpointPath) {
		target += endpointPath
	}

	body, err := json.Marshal(map[string]string{
		"cluster_id":    strings.TrimSpace(req.ClusterID),
		"install_token": installToken,
		"node_name":     strings.TrimSpace(req.NodeName),
	})
	if err != nil {
		return err
	}

	httpClient := req.HTTPClient
	if httpClient == nil {
		// Never issue the request without a timeout.
		httpClient = &http.Client{Timeout: 20 * time.Second}
	}
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, target, bytes.NewReader(body))
	if err != nil {
		return err
	}
	httpReq.Header.Set("Content-Type", "application/json")

	resp, err := httpClient.Do(httpReq)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("fetch %s install profile: %s", platform, resp.Status)
	}
	return json.NewDecoder(resp.Body).Decode(envelope)
}

// backendURLOrFirstEndpoint returns backendURL unchanged unless it is blank
// and at least one control-plane endpoint is known, in which case the first
// endpoint is returned.
func backendURLOrFirstEndpoint(backendURL string, endpoints []string) string {
	if strings.TrimSpace(backendURL) == "" && len(endpoints) > 0 {
		return endpoints[0]
	}
	return backendURL
}
// RuntimeConfigFromProfile translates a backend-issued Docker install profile
// into the RuntimeConfig used to install the container. Fields are copied
// one-to-one; the image artifact URLs, checksum and size are derived from the
// profile's DockerImageArtifact and artifact endpoints.
func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
	var cfg RuntimeConfig

	// Identity and enrollment.
	cfg.BackendURL = profile.BackendURL
	cfg.ClusterID = profile.ClusterID
	cfg.JoinToken = profile.JoinToken
	cfg.NodeName = profile.NodeName

	// Container settings.
	cfg.Image = profile.Image
	cfg.ContainerName = profile.ContainerName
	cfg.StateDir = profile.StateDir
	cfg.Network = profile.Network
	cfg.RestartPolicy = profile.RestartPolicy
	cfg.PullImage = profile.PullImage
	cfg.Replace = profile.Replace
	cfg.DockerVPNGatewayEnabled = profile.DockerVPNGatewayEnabled
	cfg.WorkloadSupervisionEnabled = profile.WorkloadSupervisionEnabled

	// Mesh settings.
	cfg.MeshSyntheticRuntimeEnabled = profile.MeshSyntheticRuntimeEnabled
	cfg.MeshProductionForwardingEnabled = profile.MeshProductionForwardingEnabled
	cfg.MeshListenAddr = profile.MeshListenAddr
	cfg.MeshListenPortMode = profile.MeshListenPortMode
	cfg.MeshListenAutoPortStart = profile.MeshListenAutoPortStart
	cfg.MeshListenAutoPortEnd = profile.MeshListenAutoPortEnd
	cfg.MeshAdvertiseEndpoint = profile.MeshAdvertiseEndpoint
	// The raw JSON is carried through verbatim as a string.
	cfg.MeshAdvertiseEndpointsJSON = string(profile.MeshAdvertiseEndpointsJSON)
	cfg.MeshAdvertiseTransport = profile.MeshAdvertiseTransport
	cfg.MeshConnectivityMode = profile.MeshConnectivityMode
	cfg.MeshNATType = profile.MeshNATType
	cfg.MeshRegion = profile.MeshRegion

	// Timing and capacity.
	cfg.HeartbeatIntervalSeconds = profile.HeartbeatIntervalSeconds
	cfg.EnrollmentPollIntervalSeconds = profile.EnrollmentPollIntervalSeconds
	cfg.EnrollmentPollTimeoutSeconds = profile.EnrollmentPollTimeoutSeconds
	cfg.ProductionObservationSinkCap = profile.ProductionObservationSinkCapacity

	// Image artifact.
	cfg.ImageArtifactURLs = dockerArtifactURLs(profile)
	cfg.ImageArtifactSHA256 = dockerArtifactSHA256(profile)
	cfg.ImageArtifactSizeBytes = dockerArtifactSizeBytes(profile)
	return cfg
}
// dockerArtifactURLs returns the download URLs for the profile's Docker image
// artifact. Explicit artifact URLs win and are returned as a copy. Otherwise
// one URL is built per non-blank artifact endpoint by joining the endpoint and
// the artifact file name with a single "/". It returns nil when the profile
// has no artifact or the artifact has neither URLs nor a file name.
func dockerArtifactURLs(profile DockerInstallProfile) []string {
	artifact := profile.DockerImageArtifact
	if artifact == nil {
		return nil
	}
	if len(artifact.URLs) > 0 {
		// Copy so callers cannot mutate the profile's slice.
		return append([]string(nil), artifact.URLs...)
	}
	name := strings.TrimSpace(artifact.FileName)
	if name == "" {
		return nil
	}
	name = strings.TrimLeft(name, "/")
	urls := []string{}
	for _, endpoint := range profile.ArtifactEndpoints {
		base := strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if base == "" {
			continue
		}
		urls = append(urls, base+"/"+name)
	}
	return urls
}
// dockerArtifactSHA256 returns the whitespace-trimmed SHA256 of the profile's
// Docker image artifact, or "" when the profile carries no artifact.
func dockerArtifactSHA256(profile DockerInstallProfile) string {
	if artifact := profile.DockerImageArtifact; artifact != nil {
		return strings.TrimSpace(artifact.SHA256)
	}
	return ""
}
// dockerArtifactSizeBytes returns the declared size of the profile's Docker
// image artifact, or 0 when the profile carries no artifact.
func dockerArtifactSizeBytes(profile DockerInstallProfile) int64 {
	if artifact := profile.DockerImageArtifact; artifact != nil {
		return artifact.SizeBytes
	}
	return 0
}
@@ -0,0 +1,258 @@
package hostagent
import (
"context"
"errors"
"fmt"
"os"
"strings"
"time"
)
// HostAgentUpdateRequest describes one host-agent binary update attempt. It is
// converted into an UpdateRequest by updateRequest and consumed by
// ApplyHostAgentUpdate.
type HostAgentUpdateRequest struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
CurrentVersion string
Channel string
// OS falls back to "linux" when empty (see updateRequest).
OS string
Arch string
// InstallType falls back to BinaryUpdateInstallType when empty.
InstallType string
// BinaryPath is the install target; ApplyHostAgentUpdate falls back to
// DefaultHostAgentInstallPath when it is empty.
BinaryPath string
// DryRun makes ApplyHostAgentUpdate return after the plan preflight checks,
// before any download or install.
DryRun bool
// RestartService and RestartAfterApply work together: when
// RestartAfterApply is true and RestartService is non-blank,
// ApplyHostAgentUpdate runs "systemctl restart <RestartService>" after a
// successful in-place install.
RestartService string
RestartAfterApply bool
}
// HostAgentUpdateLoopConfig configures RunHostAgentUpdateLoop.
type HostAgentUpdateLoopConfig struct {
// Request is the update request applied on every run.
Request HostAgentUpdateRequest
// Interval is the pause between runs; zero is replaced by one hour and a
// negative value is rejected.
Interval time.Duration
// InitialDelay is slept (with jitter) before the first run; negative values
// are rejected.
InitialDelay time.Duration
// Jitter must be within [0, 1]; it is passed to jitteredDuration.
Jitter float64
// MaxRuns stops the loop after that many runs when positive.
MaxRuns int
// StopOnError makes the loop return the first error other than
// ErrNodeIdentityNotReady.
StopOnError bool
// Logf receives progress lines; nil disables logging.
Logf func(format string, args ...any)
}
// updateRequest translates the host-agent request into the generic
// UpdateRequest used to fetch an update plan. The product is always
// HostAgentUpdateProduct and the container name is the fixed label
// "host-agent-service"; OS and InstallType fall back to "linux" and
// BinaryUpdateInstallType respectively.
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
	var out UpdateRequest
	out.BackendURL = req.BackendURL
	out.ClusterID = req.ClusterID
	out.NodeID = req.NodeID
	out.StateDir = req.StateDir
	out.Product = HostAgentUpdateProduct
	out.CurrentVersion = req.CurrentVersion
	out.OS = firstNonEmpty(req.OS, "linux")
	out.Arch = req.Arch
	out.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
	out.Channel = req.Channel
	out.ContainerName = "host-agent-service"
	out.DryRun = req.DryRun
	return out
}
// ApplyHostAgentUpdate fetches the node update plan for the host-agent product
// and, when the plan action is "update", downloads the artifact and installs
// it at the binary path (req.BinaryPath or DefaultHostAgentInstallPath).
//
// Behaviour by outcome:
//   - Plan action other than "update": returns the plan summary; a noop status
//     is reported unless req.DryRun is set.
//   - Missing artifact or unsupported install type: reports a failed
//     "preflight" status and returns the error.
//   - req.DryRun: returns after the preflight checks without downloading.
//   - In-place install fails but staging succeeds: the binary is copied to
//     binaryPath+".next", a "staged" status is reported, and the call returns
//     a nil error with RestartNeeded set.
//   - In-place install succeeds: Loaded, Replaced and RestartNeeded are set; if
//     req.RestartAfterApply is true and req.RestartService is non-blank the
//     service is restarted through systemctl and RestartNeeded is cleared.
//
// Status reports and update-state writes are best-effort: their errors are
// deliberately discarded so that reporting problems never fail the update.
func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUpdateRequest) (UpdateResult, error) {
binaryPath := firstNonEmpty(req.BinaryPath, DefaultHostAgentInstallPath)
planReq := req.updateRequest()
planReq.BinaryDefaults()
resolved, err := resolveUpdateRequest(planReq)
if err != nil {
return UpdateResult{}, err
}
plan, err := FetchNodeUpdatePlan(ctx, resolved)
if err != nil {
return UpdateResult{}, err
}
// NewImage carries the binary path here; the result type is shared with the
// container update flow.
result := UpdateResult{
Action: plan.Action,
Reason: plan.Reason,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
NewImage: binaryPath,
}
if plan.Action != "update" {
if !req.DryRun {
status := statusFromNoopPlan(resolved, plan)
status.Product = HostAgentUpdateProduct
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["binary_path"] = binaryPath
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, status)
}
return result, nil
}
if plan.Artifact == nil {
err := errors.New("host-agent update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err
}
if !isBinaryInstallType(plan.Artifact.InstallType) {
err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
return result, err
}
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": binaryPath},
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "download", "failed", err))
return result, err
}
// The downloaded file is temporary; it is copied into place below.
defer os.Remove(path)
if err := installHostAgentBinary(path, binaryPath); err != nil {
// Replacing the binary in place failed; fall back to staging the new
// binary next to the target as "<binaryPath>.next".
stageErr := stageHostAgentBinary(path, binaryPath)
if stageErr == nil {
result.RestartNeeded = true
// NOTE(review): the saved state records the target version as current even
// though the binary is only staged — confirm this is intended.
_ = saveUpdateState(resolved.StateDir, UpdateState{
Product: HostAgentUpdateProduct,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "apply",
Status: "staged",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"binary_path": binaryPath, "staged_path": binaryPath + ".next", "restart_needed": true, "replace_error": err.Error()},
})
return result, nil
}
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
// The caller receives the original install error; the stage error is only
// included in the reported status.
return result, err
}
result.Loaded = true
result.Replaced = true
result.RestartNeeded = true
_ = saveUpdateState(resolved.StateDir, UpdateState{
Product: HostAgentUpdateProduct,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "apply",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"binary_path": binaryPath, "restart_needed": true},
})
if req.RestartAfterApply && strings.TrimSpace(req.RestartService) != "" {
runner := m.Runner
if runner == nil {
runner = ExecRunner{}
}
_, err = runner.Run(ctx, "systemctl", "restart", req.RestartService)
if err != nil {
// The binary is already replaced at this point; RestartNeeded stays true.
return result, err
}
result.RestartNeeded = false
}
return result, nil
}
// RunHostAgentUpdateLoop repeatedly calls ApplyHostAgentUpdate until the
// context is cancelled, cfg.MaxRuns is reached, or (with cfg.StopOnError) a
// run fails with an error other than ErrNodeIdentityNotReady.
//
// A zero Interval defaults to one hour. Negative durations and a Jitter
// outside [0, 1] are rejected before the first run. After a successful
// "update" action the request's CurrentVersion is advanced to the target
// version so the next plan is computed against it.
func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgentUpdateLoopConfig) error {
	if cfg.Interval == 0 {
		cfg.Interval = time.Hour
	}
	if cfg.InitialDelay < 0 || cfg.Interval < 0 {
		return errors.New("host-agent update loop durations must not be negative")
	}
	if cfg.Jitter < 0 || cfg.Jitter > 1 {
		return errors.New("host-agent update loop jitter must be between 0 and 1")
	}

	logf := cfg.Logf
	if logf == nil {
		logf = func(string, ...any) {}
	}

	if cfg.InitialDelay > 0 {
		if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
			return err
		}
	}

	req := cfg.Request
	for runs := 1; ; runs++ {
		result, err := m.ApplyHostAgentUpdate(ctx, req)
		switch {
		case err == nil:
			logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
				runs,
				result.Action,
				result.Reason,
				result.TargetVersion,
				result.NewImage,
				result.Replaced,
				result.RestartNeeded,
			)
			if result.Action == "update" && result.TargetVersion != "" {
				req.CurrentVersion = result.TargetVersion
			}
		case errors.Is(err, ErrNodeIdentityNotReady):
			// Not a failure: the node simply is not approved yet, so keep polling.
			logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
		default:
			logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
			if cfg.StopOnError {
				return err
			}
		}

		if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
			return nil
		}
		if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
			return err
		}
	}
}
// BinaryDefaults fills in the binary-update defaults in place: Product falls
// back to HostAgentUpdateProduct, InstallType to BinaryUpdateInstallType and
// OS to "linux". Fields that firstNonEmpty already treats as set are kept.
func (req *UpdateRequest) BinaryDefaults() {
req.Product = firstNonEmpty(req.Product, HostAgentUpdateProduct)
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
req.OS = firstNonEmpty(req.OS, "linux")
}
// isBinaryInstallType reports whether value (after trimming whitespace) names
// an install type that the host-agent binary updater can apply. An empty value
// is accepted.
func isBinaryInstallType(value string) bool {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return true
	}
	accepted := []string{
		BinaryUpdateInstallType,
		"windows_binary",
		"binary",
		"host_binary",
		"linux-amd64-binary",
		"windows-amd64-binary",
	}
	for _, candidate := range accepted {
		if trimmed == candidate {
			return true
		}
	}
	return false
}
// hostAgentInstallTypeFor maps a node install type to the matching host-agent
// install type: WindowsUpdateInstallType becomes "windows_binary", anything
// else becomes BinaryUpdateInstallType.
func hostAgentInstallTypeFor(nodeInstallType string) string {
	switch strings.TrimSpace(nodeInstallType) {
	case WindowsUpdateInstallType:
		return "windows_binary"
	default:
		return BinaryUpdateInstallType
	}
}
// stageHostAgentBinary copies sourcePath to "<binaryPath>.next" with mode 0755
// so the new binary can be swapped in later.
func stageHostAgentBinary(sourcePath, binaryPath string) error {
	stagedPath := binaryPath + ".next"
	return copyFile(sourcePath, stagedPath, 0o755)
}
@@ -0,0 +1,321 @@
package hostagent
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
)
// Default install locations used when the caller leaves the corresponding
// setting empty.
const (
// DefaultHostAgentInstallPath is the fallback path of the host-agent binary.
DefaultHostAgentInstallPath = "/usr/local/bin/rap-host-agent"
// DefaultSystemdUnitDir is the fallback directory for generated unit files.
DefaultSystemdUnitDir = "/etc/systemd/system"
)
// UpdateServiceConfig configures InstallUpdateService, which renders and
// optionally installs the systemd unit(s) that run the update loops.
type UpdateServiceConfig struct {
// RuntimeConfig supplies backend URL, cluster ID, container name and state
// directory; it is normalized before use.
RuntimeConfig RuntimeConfig
// Product defaults to DefaultUpdateProduct when empty.
Product string
CurrentVersion string
Channel string
// IntervalSeconds defaults to 21600 when zero.
IntervalSeconds int
InitialDelaySeconds int
// Jitter defaults to 0.15 when zero, so a zero jitter cannot be requested.
Jitter float64
// HealthTimeoutSec defaults to 30 when zero.
HealthTimeoutSec int
// BinaryInstallPath defaults to DefaultHostAgentInstallPath when empty.
BinaryInstallPath string
// SourceBinaryPath is the binary copied to BinaryInstallPath; when blank the
// running executable is used (see installHostAgentBinary).
SourceBinaryPath string
// UnitDir defaults to DefaultSystemdUnitDir when empty.
UnitDir string
// ManageSystemd makes the installer run systemctl daemon-reload and
// enable --now for the written units.
ManageSystemd bool
// DryRun renders the units without touching the filesystem or systemd.
DryRun bool
// InstallSelfUpdater additionally renders/installs the host-agent
// self-updater unit.
InstallSelfUpdater bool
// SelfUpdateVersion overrides CurrentVersion for the self-updater unit when
// non-empty.
SelfUpdateVersion string
}
// UpdateServiceResult reports what InstallUpdateService rendered and did.
type UpdateServiceResult struct {
// Installed is set once the binary and unit file(s) have been written.
Installed bool
// Started is set after the systemctl enable --now calls succeeded.
Started bool
UnitName string
UnitPath string
BinaryPath string
// Unit is the rendered text of the updater unit.
Unit string
// SelfUnitName, SelfUnitPath and SelfUnit describe the self-updater unit and
// are only populated when InstallSelfUpdater was requested.
SelfUnitName string
SelfUnitPath string
SelfUnit string
}
// InstallUpdateService renders the systemd unit that runs the node update loop
// and, unless cfg.DryRun is set, installs the host-agent binary and writes the
// unit file(s) to cfg.UnitDir.
//
// Defaults applied to cfg: Product (DefaultUpdateProduct), IntervalSeconds
// (21600), Jitter (0.15), HealthTimeoutSec (30), BinaryInstallPath
// (DefaultHostAgentInstallPath) and UnitDir (DefaultSystemdUnitDir).
//
// With cfg.DryRun the rendered unit text is returned without touching the
// filesystem. Otherwise the binary is copied into place, the unit (and, with
// cfg.InstallSelfUpdater, the self-updater unit) is written, and when
// cfg.ManageSystemd is set systemd is reloaded and the units are enabled and
// started. The partially filled result is returned together with any error.
func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServiceConfig) (UpdateServiceResult, error) {
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
if cfg.Product == "" {
cfg.Product = DefaultUpdateProduct
}
if cfg.IntervalSeconds == 0 {
cfg.IntervalSeconds = 21600
}
// A zero jitter is indistinguishable from "unset" and becomes 0.15.
if cfg.Jitter == 0 {
cfg.Jitter = 0.15
}
if cfg.HealthTimeoutSec == 0 {
cfg.HealthTimeoutSec = 30
}
cfg.BinaryInstallPath = firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath)
cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
// The unit name embeds a slug of the container name.
unitName := "rap-host-agent-updater-" + safeUnitSlug(cfg.RuntimeConfig.ContainerName) + ".service"
result := UpdateServiceResult{
UnitName: unitName,
UnitPath: filepath.Join(cfg.UnitDir, unitName),
BinaryPath: cfg.BinaryInstallPath,
}
unit, err := buildUpdateServiceUnit(cfg)
if err != nil {
return result, err
}
result.Unit = unit
if cfg.DryRun {
if cfg.InstallSelfUpdater {
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
if err != nil {
return result, err
}
result.SelfUnit = selfUnit
result.SelfUnitName = selfUnitName
result.SelfUnitPath = selfUnitPath
}
return result, nil
}
// Only the default systemd directory is refused off Linux; a custom UnitDir
// is still written.
if runtime.GOOS != "linux" && cfg.UnitDir == DefaultSystemdUnitDir {
return result, fmt.Errorf("systemd update service install is only supported on linux")
}
if err := installHostAgentBinary(cfg.SourceBinaryPath, cfg.BinaryInstallPath); err != nil {
return result, err
}
if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
return result, err
}
if err := os.WriteFile(result.UnitPath, []byte(unit), 0o644); err != nil {
return result, err
}
if cfg.InstallSelfUpdater {
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
if err != nil {
return result, err
}
if err := os.WriteFile(selfUnitPath, []byte(selfUnit), 0o644); err != nil {
return result, err
}
result.SelfUnit = selfUnit
result.SelfUnitName = selfUnitName
result.SelfUnitPath = selfUnitPath
}
result.Installed = true
if cfg.ManageSystemd {
runner := m.Runner
if runner == nil {
runner = ExecRunner{}
}
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
return result, err
}
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", unitName); err != nil {
return result, err
}
if cfg.InstallSelfUpdater && result.SelfUnitName != "" {
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.SelfUnitName); err != nil {
return result, err
}
}
result.Started = true
}
return result, nil
}
// buildUpdateServiceUnit renders the systemd unit text for the node update
// loop. It returns an error naming every missing required setting
// (backend-url, cluster-id, container-name, state-dir) after normalizing the
// runtime config. The ExecStart line invokes cfg.BinaryInstallPath with the
// "update-loop" subcommand; "--channel" is appended only when a channel is set.
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
	normalized := cfg.RuntimeConfig.Normalize()

	required := []struct{ flag, value string }{
		{"backend-url", normalized.BackendURL},
		{"cluster-id", normalized.ClusterID},
		{"container-name", normalized.ContainerName},
		{"state-dir", normalized.StateDir},
	}
	var missing []string
	for _, setting := range required {
		if setting.value == "" {
			missing = append(missing, setting.flag)
		}
	}
	if len(missing) > 0 {
		return "", fmt.Errorf("missing required update service settings: %s", strings.Join(missing, ", "))
	}

	command := []string{
		cfg.BinaryInstallPath,
		"update-loop",
		"--backend-url", normalized.BackendURL,
		"--cluster-id", normalized.ClusterID,
		"--state-dir", normalized.StateDir,
		"--container-name", normalized.ContainerName,
		"--product", firstNonEmpty(cfg.Product, DefaultUpdateProduct),
		"--current-version", strings.TrimSpace(cfg.CurrentVersion),
		"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
		"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds),
		"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
		"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
	}
	if channel := strings.TrimSpace(cfg.Channel); channel != "" {
		command = append(command, "--channel", channel)
	}

	const unitTemplate = `[Unit]
Description=RAP host-agent updater for %s
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
ExecStart=%s
Restart=always
RestartSec=30
[Install]
WantedBy=multi-user.target
`
	return fmt.Sprintf(unitTemplate, normalized.ContainerName, systemdJoin(command)), nil
}
// buildHostAgentSelfUpdateUnit renders the systemd unit that keeps the
// host-agent binary itself up to date. It returns the unit text, the unit
// name and the unit file path.
//
// backend-url, cluster-id and state-dir are required after normalizing the
// runtime config. The reported current version is cfg.SelfUpdateVersion,
// falling back to cfg.CurrentVersion. The initial delay is cfg's delay plus
// 30 seconds.
//
// The executable in ExecStart and the "--binary-path" argument both use the
// same path: cfg.BinaryInstallPath, or DefaultHostAgentInstallPath when that
// is empty. Previously only "--binary-path" received the default, so a
// direct call with an empty BinaryInstallPath produced an ExecStart with an
// empty executable.
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
	runtimeCfg := cfg.RuntimeConfig.Normalize()
	if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
		return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host-agent self updater")
	}
	unitName := "rap-host-agent-self-updater.service"
	unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
	currentVersion := firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion)
	binaryPath := firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath)
	args := []string{
		binaryPath,
		"update-host-agent-loop",
		"--backend-url", runtimeCfg.BackendURL,
		"--cluster-id", runtimeCfg.ClusterID,
		"--state-dir", runtimeCfg.StateDir,
		"--binary-path", binaryPath,
		"--current-version", currentVersion,
		"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
		"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
		"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
	}
	if channel := strings.TrimSpace(cfg.Channel); channel != "" {
		args = append(args, "--channel", channel)
	}
	return fmt.Sprintf(`[Unit]
Description=RAP host-agent self updater
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
ExecStart=%s
Restart=always
RestartSec=60
[Install]
WantedBy=multi-user.target
`, systemdJoin(args)), unitName, unitPath, nil
}
// installHostAgentBinary copies the binary at sourcePath to targetPath with
// mode 0755. Both paths are whitespace-trimmed; a blank sourcePath means the
// currently running executable.
//
// When source and target are the same path only the mode is fixed up.
// Otherwise the content is written to "<targetPath>.tmp" and renamed over the
// target, so the target is never observed half-written. The temp file is
// removed on every failure path, including a failed rename (for example when
// the target is busy or is a directory), so no stale ".tmp" file is left
// behind.
func installHostAgentBinary(sourcePath, targetPath string) error {
	sourcePath = strings.TrimSpace(sourcePath)
	targetPath = strings.TrimSpace(targetPath)
	if sourcePath == "" {
		exe, err := os.Executable()
		if err != nil {
			return err
		}
		sourcePath = exe
	}
	if samePath(sourcePath, targetPath) {
		return os.Chmod(targetPath, 0o755)
	}
	src, err := os.Open(sourcePath)
	if err != nil {
		return err
	}
	defer src.Close()
	if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
		return err
	}
	tmp := targetPath + ".tmp"
	dst, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
	if err != nil {
		return err
	}
	if _, err := io.Copy(dst, src); err != nil {
		_ = dst.Close()
		_ = os.Remove(tmp)
		return err
	}
	if err := dst.Close(); err != nil {
		_ = os.Remove(tmp)
		return err
	}
	// OpenFile's mode is subject to the umask and is ignored for an existing
	// file, so set the final mode explicitly.
	if err := os.Chmod(tmp, 0o755); err != nil {
		_ = os.Remove(tmp)
		return err
	}
	if err := os.Rename(tmp, targetPath); err != nil {
		_ = os.Remove(tmp)
		return err
	}
	return nil
}

// samePath reports whether a and b refer to the same absolute path. When
// either path cannot be made absolute the raw strings are compared instead.
// Symlinks are not resolved.
func samePath(a, b string) bool {
	absA, errA := filepath.Abs(a)
	absB, errB := filepath.Abs(b)
	if errA == nil && errB == nil {
		return absA == absB
	}
	return a == b
}
// safeUnitSlug turns value into a lowercase slug usable inside a systemd unit
// name. The input is trimmed and lowercased; every run of characters outside
// [a-z0-9] collapses into a single '-', and leading/trailing dashes are
// stripped. A blank input is replaced by DefaultContainerName before
// slugging, and an empty slug yields DefaultContainerName unchanged.
func safeUnitSlug(value string) string {
	normalized := strings.ToLower(strings.TrimSpace(value))
	if normalized == "" {
		normalized = DefaultContainerName
	}

	var slug strings.Builder
	previousWasDash := false
	for _, r := range normalized {
		switch {
		case 'a' <= r && r <= 'z', '0' <= r && r <= '9':
			slug.WriteRune(r)
			previousWasDash = false
		case !previousWasDash:
			// First character of a run of disallowed characters.
			slug.WriteByte('-')
			previousWasDash = true
		}
	}

	if trimmed := strings.Trim(slug.String(), "-"); trimmed != "" {
		return trimmed
	}
	return DefaultContainerName
}
// systemdJoin quotes every argument with systemdQuote and joins the results
// with single spaces, producing a command line for an ExecStart= setting.
func systemdJoin(args []string) string {
	quoted := make([]string, len(args))
	for i := range args {
		quoted[i] = systemdQuote(args[i])
	}
	return strings.Join(quoted, " ")
}
// systemdQuote renders arg as a single word of a systemd command line.
//
// An empty argument becomes `""`. '%' is doubled because systemd expands
// "%x" specifiers in unit settings, and '$' is doubled because systemd
// performs environment-variable substitution on ExecStart= arguments; without
// this a URL such as "http://host/a%20b" or a value containing "$" would be
// rewritten by systemd. Arguments containing whitespace, quotes or backslashes
// are wrapped in double quotes with '\' and '"' backslash-escaped. A newline
// is emitted as the C-style escape "\n" so it cannot terminate the unit-file
// line. Arguments needing none of this are returned unchanged.
func systemdQuote(arg string) string {
	if arg == "" {
		return `""`
	}
	arg = strings.ReplaceAll(arg, "%", "%%")
	arg = strings.ReplaceAll(arg, "$", "$$")
	if !strings.ContainsAny(arg, " \t\n\"'\\") {
		return arg
	}
	// Backslashes first, so the escapes added below are not doubled again.
	arg = strings.ReplaceAll(arg, `\`, `\\`)
	arg = strings.ReplaceAll(arg, `"`, `\"`)
	arg = strings.ReplaceAll(arg, "\n", `\n`)
	return `"` + arg + `"`
}
@@ -0,0 +1,171 @@
package hostagent
import (
"context"
"os"
"path/filepath"
"strings"
"testing"
)
// TestInstallUpdateServiceWritesSystemdUnit installs the update service into a
// temporary unit directory with systemd management disabled and checks that
// the updater unit, the copied binary and the self-updater unit are written
// with the expected arguments.
func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
dir := t.TempDir()
source := filepath.Join(dir, "rap-host-agent-src")
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
t.Fatalf("write source: %v", err)
}
// A custom unit directory keeps the install inside the temp dir.
unitDir := filepath.Join(dir, "systemd")
binaryPath := filepath.Join(dir, "bin", "rap-host-agent")
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "node-a",
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
},
CurrentVersion: "0.1.0-current",
IntervalSeconds: 60,
Jitter: 0.2,
SourceBinaryPath: source,
BinaryInstallPath: binaryPath,
UnitDir: unitDir,
ManageSystemd: false,
InstallSelfUpdater: true,
SelfUpdateVersion: "0.1.0-host",
})
if err != nil {
t.Fatalf("install update service: %v", err)
}
// Files are written, but nothing is started because ManageSystemd is false.
if !result.Installed || result.Started {
t.Fatalf("unexpected result: %+v", result)
}
unit, err := os.ReadFile(result.UnitPath)
if err != nil {
t.Fatalf("read unit: %v", err)
}
text := string(unit)
for _, want := range []string{
"ExecStart=",
" update-loop",
"--backend-url http://control/api/v1",
"--cluster-id cluster-1",
"--state-dir /var/lib/rap/nodes/node-a",
"--container-name rap-node-agent-node-a",
"--current-version 0.1.0-current",
"--interval-seconds 60",
"Restart=always",
} {
if !strings.Contains(text, want) {
t.Fatalf("unit missing %q:\n%s", want, text)
}
}
if payload, err := os.ReadFile(binaryPath); err != nil || string(payload) != "binary" {
t.Fatalf("binary copy = %q, %v", payload, err)
}
if result.SelfUnitName != "rap-host-agent-self-updater.service" || result.SelfUnitPath == "" {
t.Fatalf("self updater result = %+v", result)
}
selfUnit, err := os.ReadFile(result.SelfUnitPath)
if err != nil {
t.Fatalf("read self unit: %v", err)
}
// The self-updater must report SelfUpdateVersion, not CurrentVersion.
if text := string(selfUnit); !strings.Contains(text, "update-host-agent-loop") || !strings.Contains(text, "--current-version 0.1.0-host") {
t.Fatalf("unexpected self unit:\n%s", text)
}
}
// TestWindowsHostAgentUpdateScriptTargetsWindowsService renders the Windows
// host-agent update script for a sample install and checks that it contains
// the loop label, the staged ".next" binary name and the expected update-loop
// arguments.
func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
cfg := WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
},
NodeID: "node-1",
AutoUpdateCurrentVersion: "0.1.2",
AutoUpdateIntervalSeconds: 120,
AutoUpdateInitialDelaySeconds: 7,
AutoUpdateHealthTimeoutSeconds: 11,
}
result := WindowsInstallResult{
NodeName: "win-a",
StateDir: `C:\ProgramData\RAP\nodes\win-a`,
NodeAgentPath: `C:\Program Files\RAP\win-a\rap-node-agent.exe`,
TaskName: "RAP Node Agent win-a",
}
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
// Each entry must appear verbatim somewhere in the generated script.
for _, want := range []string{
":loop",
"rap-host-agent.exe.next",
"update-loop --backend-url",
"--backend-url \"http://control/api/v1\"",
"--cluster-id \"cluster-1\"",
"--node-id \"node-1\"",
"--state-dir \"C:\\ProgramData\\RAP\\nodes\\win-a\"",
"--install-type windows_service",
"--binary-path \"C:\\Program Files\\RAP\\win-a\\rap-node-agent.exe\"",
"--host-agent-binary-path \"C:\\Program Files\\RAP\\win-a\\rap-host-agent.exe\"",
"--windows-task-name \"RAP Node Agent win-a\"",
"--current-version 0.1.2",
"--host-agent-current-version 0.1.2",
"--interval-seconds 120",
"timeout /t 120",
} {
if !strings.Contains(script, want) {
t.Fatalf("script missing %q:\n%s", want, script)
}
}
}
// TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken checks that a
// dry-run Windows install with Replace set succeeds even though no join token
// is supplied, and that the result carries the node name and agent path.
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "win-a",
},
InstallDir: `C:\Program Files\RAP\win-a`,
Replace: true,
DryRun: true,
})
if err != nil {
t.Fatalf("replace install should not require join token: %v", err)
}
if result.NodeName != "win-a" || result.NodeAgentPath == "" {
t.Fatalf("unexpected dry-run result: %+v", result)
}
}
// TestWindowsRepairUpdaterStartsFromUnknownVersion installs the Windows
// host-agent updater in replace mode without a configured current version and
// checks that the generated update script passes "--current-version 0.0.0".
func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
dir := t.TempDir()
source := filepath.Join(dir, "rap-host-agent.exe")
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
t.Fatalf("write source: %v", err)
}
// recordingRunner stands in for the real command runner.
result, err := installWindowsHostAgentUpdater(context.Background(), WindowsManager{Runner: &recordingRunner{}}, WindowsInstallResult{
NodeName: "win-a",
InstallDir: dir,
StateDir: dir,
NodeAgentPath: filepath.Join(dir, "rap-node-agent.exe"),
TaskName: "RAP Node Agent win-a",
StartupMode: "user-task",
}, WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
},
Replace: true,
AutoUpdateEnabled: true,
HostAgentSourcePath: source,
})
if err != nil {
t.Fatalf("install updater: %v", err)
}
script, err := os.ReadFile(filepath.Join(result.InstallDir, "rap-host-agent-update.cmd"))
if err != nil {
t.Fatalf("read updater script: %v", err)
}
if !strings.Contains(string(script), "--current-version 0.0.0") {
t.Fatalf("repair updater should force unknown current version:\n%s", script)
}
}
@@ -0,0 +1,947 @@
package hostagent
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"math/rand"
"net/http"
"net/url"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// Product names, install types and state file names used by the update flows.
const (
// DefaultUpdateProduct is the product assumed by UpdateRequest.Normalize.
DefaultUpdateProduct = "rap-node-agent"
// HostAgentUpdateProduct identifies the host-agent binary itself.
HostAgentUpdateProduct = "rap-host-agent"
// DefaultUpdateInstallType is the install type assumed by
// UpdateRequest.Normalize and the only artifact type ApplyUpdate accepts.
DefaultUpdateInstallType = "docker"
BinaryUpdateInstallType = "linux_binary"
WindowsUpdateInstallType = "windows_service"
// Presumably the file names used inside the state directory for the saved
// update state and the update trigger — confirm against the helpers that
// read and write them.
UpdateStateFileName = "host-update-state.json"
UpdateTriggerFileName = "update-trigger.json"
)
// ErrNodeIdentityNotReady signals that the node identity is not approved yet.
// The update loops treat it as "keep waiting" rather than as a failure; match
// it with errors.Is.
var ErrNodeIdentityNotReady = errors.New("node identity is not approved yet")
// UpdateRequest describes one update attempt for a product on this node.
// Normalize fills defaults and Validate checks required fields.
type UpdateRequest struct {
BackendURL string
ClusterID string
// NodeID or StateDir must be set (see Validate).
NodeID string
StateDir string
// Product defaults to DefaultUpdateProduct.
Product string
CurrentVersion string
// OS and Arch default to runtime.GOOS and runtime.GOARCH.
OS string
Arch string
// InstallType defaults to DefaultUpdateInstallType.
InstallType string
Channel string
// ContainerName defaults to DefaultContainerName.
ContainerName string
// BinaryPath is required for the Windows and (non host-agent) binary install
// types.
BinaryPath string
// WindowsTaskName is required for WindowsUpdateInstallType.
WindowsTaskName string
// SystemdUnitName is required for BinaryUpdateInstallType unless the product
// is HostAgentUpdateProduct.
SystemdUnitName string
// HealthTimeout defaults to 30 seconds when zero; negative values fail
// validation.
HealthTimeout time.Duration
// DryRun plans the update without applying it.
DryRun bool
// AllowProductionMesh lets ApplyUpdate accept plans that have production
// forwarding enabled.
AllowProductionMesh bool
}
// UpdateResult summarizes the outcome of ApplyUpdate or ApplyHostAgentUpdate.
type UpdateResult struct {
// Action, Reason and TargetVersion are copied from the update plan.
Action string
Reason string
TargetVersion string
ContainerName string
// PreviousImageID is the image of the container before the update.
PreviousImageID string
// NewImage is the new container image; for host-agent updates it carries the
// binary path instead.
NewImage string
ContainerID string
Loaded bool
Replaced bool
// RolledBack is set when a failed update was rolled back successfully.
RolledBack bool
// RestartNeeded is set by host-agent updates until the service has been
// restarted.
RestartNeeded bool
}
// UpdateLoopConfig configures RunUpdateLoop.
type UpdateLoopConfig struct {
// Request is normalized and validated before the loop starts.
Request UpdateRequest
// Interval defaults to one hour when zero; negative values are rejected.
Interval time.Duration
// InitialDelay is slept (with jitter) before the first run; negative values
// are rejected.
InitialDelay time.Duration
// Jitter must be within [0, 1].
Jitter float64
// MaxRuns bounds the number of runs when positive.
MaxRuns int
// StopOnError makes the loop return on errors other than
// ErrNodeIdentityNotReady.
StopOnError bool
// HostAgentUpdateEnabled additionally runs a host-agent update per
// iteration, using HostAgentUpdateRequest with empty fields filled from
// Request.
HostAgentUpdateEnabled bool
HostAgentUpdateRequest HostAgentUpdateRequest
// Logf receives progress lines; nil disables logging.
Logf func(format string, args ...any)
}
// UpdateState is the JSON record passed to saveUpdateState after an update.
// For host-agent updates Image holds the binary path and ContainerName the
// fixed label "host-agent-service".
type UpdateState struct {
Product string `json:"product"`
CurrentVersion string `json:"current_version"`
TargetVersion string `json:"target_version,omitempty"`
ContainerName string `json:"container_name,omitempty"`
Image string `json:"image,omitempty"`
UpdatedAt time.Time `json:"updated_at"`
}
// UpdateTrigger is the JSON shape of an update trigger.
// NOTE(review): presumably stored as UpdateTriggerFileName in the state
// directory and identified by Generation (RunUpdateLoop tracks a trigger
// generation) — confirm against the code that reads it.
type UpdateTrigger struct {
SchemaVersion string `json:"schema_version"`
Generation string `json:"generation"`
Products []string `json:"products,omitempty"`
Reason string `json:"reason,omitempty"`
DeliveryMode string `json:"delivery_mode,omitempty"`
SubscriptionStatus string `json:"subscription_status,omitempty"`
UpdateServiceNodeID string `json:"update_service_node_id,omitempty"`
UpdateServiceStatus string `json:"update_service_status,omitempty"`
FallbackPollSeconds int `json:"fallback_poll_seconds,omitempty"`
ObservedAt time.Time `json:"observed_at"`
}
// NodeUpdatePlanResponse is the JSON envelope that wraps a NodeUpdatePlan
// under the "node_update_plan" key.
type NodeUpdatePlanResponse struct {
Plan NodeUpdatePlan `json:"node_update_plan"`
}
// NodeUpdatePlan is the update plan returned by FetchNodeUpdatePlan for one
// node and product.
type NodeUpdatePlan struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
Product string `json:"product"`
CurrentVersion string `json:"current_version,omitempty"`
// Action is compared against "update"; any other value is treated as a
// no-op by the apply functions.
Action string `json:"action"`
Reason string `json:"reason"`
TargetVersion string `json:"target_version,omitempty"`
Channel string `json:"channel,omitempty"`
Strategy string `json:"strategy,omitempty"`
// RollbackAllowed is passed to rollbackContainer when an update fails.
RollbackAllowed bool `json:"rollback_allowed"`
// HealthWindowSec, when positive, replaces the default 30s health timeout in
// ApplyUpdate.
HealthWindowSec int `json:"health_window_seconds,omitempty"`
// Artifact must be present for an "update" action.
Artifact *ReleaseArtifact `json:"artifact,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature json.RawMessage `json:"authority_signature,omitempty"`
// ProductionForwarding plans are refused by ApplyUpdate unless the request
// sets AllowProductionMesh.
ProductionForwarding bool `json:"production_forwarding"`
}
// ReleaseArtifact describes a downloadable release artifact referenced by an
// update plan.
type ReleaseArtifact struct {
ID string `json:"id"`
ReleaseID string `json:"release_id"`
ClusterID string `json:"cluster_id"`
Product string `json:"product"`
Version string `json:"version"`
OS string `json:"os"`
Arch string `json:"arch"`
// InstallType is checked by the apply functions before anything is
// downloaded.
InstallType string `json:"install_type"`
Kind string `json:"kind"`
URL string `json:"url"`
URLs []string `json:"urls,omitempty"`
// SHA256 and SizeBytes are handed to the download/install step together
// with the artifact URLs.
SHA256 string `json:"sha256"`
SizeBytes int64 `json:"size_bytes"`
Signature *string `json:"signature,omitempty"`
Metadata json.RawMessage `json:"metadata"`
CreatedAt time.Time `json:"created_at"`
}
// NodeUpdateStatusRequest is the status report sent through
// ReportNodeUpdateStatus at each phase of an update attempt.
type NodeUpdateStatusRequest struct {
Product string `json:"product"`
CurrentVersion string `json:"current_version,omitempty"`
TargetVersion string `json:"target_version,omitempty"`
// Phase and Status values used in this package include "planned"/"accepted",
// "download"/"started", "apply"/"succeeded", "apply"/"staged" and
// "health_check"/"succeeded".
Phase string `json:"phase"`
Status string `json:"status"`
AttemptID string `json:"attempt_id,omitempty"`
ErrorMessage *string `json:"error_message,omitempty"`
RollbackVersion *string `json:"rollback_version,omitempty"`
Payload map[string]any `json:"payload,omitempty"`
// NOTE(review): encoding/json's omitempty never omits a struct value, so a
// zero time.Time is still serialized here.
ObservedAt time.Time `json:"observed_at,omitempty"`
}
// dockerInspectContainer holds the container attributes this package decodes
// from JSON: identity, image, environment, host configuration, mounts and
// running state.
// NOTE(review): presumably the subset of `docker inspect` output needed to
// rebuild a RuntimeConfig from an existing container — confirm against
// runtimeConfigFromContainer.
type dockerInspectContainer struct {
ID string `json:"Id"`
Image string `json:"Image"`
Config struct {
Image string `json:"Image"`
Env []string `json:"Env"`
} `json:"Config"`
HostConfig struct {
Privileged bool `json:"Privileged"`
NetworkMode string `json:"NetworkMode"`
CapAdd []string `json:"CapAdd"`
Devices []struct {
PathOnHost string `json:"PathOnHost"`
PathInContainer string `json:"PathInContainer"`
CgroupPermissions string `json:"CgroupPermissions"`
} `json:"Devices"`
RestartPolicy struct {
Name string `json:"Name"`
} `json:"RestartPolicy"`
} `json:"HostConfig"`
Mounts []struct {
Source string `json:"Source"`
Destination string `json:"Destination"`
} `json:"Mounts"`
State struct {
Running bool `json:"Running"`
} `json:"State"`
}
// Normalize returns a copy of the request with whitespace trimmed and defaults
// applied: trailing slashes are removed from BackendURL; Product, OS, Arch,
// InstallType and ContainerName fall back to DefaultUpdateProduct,
// runtime.GOOS, runtime.GOARCH, DefaultUpdateInstallType and
// DefaultContainerName; a zero HealthTimeout becomes 30 seconds. The receiver
// is not modified.
func (req UpdateRequest) Normalize() UpdateRequest {
req.BackendURL = strings.TrimRight(strings.TrimSpace(req.BackendURL), "/")
req.ClusterID = strings.TrimSpace(req.ClusterID)
req.NodeID = strings.TrimSpace(req.NodeID)
req.StateDir = strings.TrimSpace(req.StateDir)
req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct)
req.OS = firstNonEmpty(req.OS, runtime.GOOS)
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
req.InstallType = firstNonEmpty(req.InstallType, DefaultUpdateInstallType)
req.Channel = strings.TrimSpace(req.Channel)
req.ContainerName = firstNonEmpty(req.ContainerName, DefaultContainerName)
req.BinaryPath = strings.TrimSpace(req.BinaryPath)
req.WindowsTaskName = strings.TrimSpace(req.WindowsTaskName)
req.SystemdUnitName = strings.TrimSpace(req.SystemdUnitName)
// Only an unset timeout is defaulted; a negative one is left for Validate to
// reject.
if req.HealthTimeout == 0 {
req.HealthTimeout = 30 * time.Second
}
return req
}
// Validate normalizes the request and reports every missing required setting
// in a single error. backend-url and cluster-id are always required, as is at
// least one of node-id or state-dir. The remaining requirements depend on the
// install type:
//   - WindowsUpdateInstallType: binary-path and windows-task-name
//   - BinaryUpdateInstallType for products other than the host agent:
//     binary-path and systemd-unit
//   - anything else: container-name
//
// A negative health timeout is rejected separately.
func (req UpdateRequest) Validate() error {
	req = req.Normalize()

	var missing []string
	require := func(value, name string) {
		if value == "" {
			missing = append(missing, name)
		}
	}

	require(req.BackendURL, "backend-url")
	require(req.ClusterID, "cluster-id")
	if req.NodeID == "" && req.StateDir == "" {
		missing = append(missing, "node-id-or-state-dir")
	}

	switch {
	case req.InstallType == WindowsUpdateInstallType:
		require(req.BinaryPath, "binary-path")
		require(req.WindowsTaskName, "windows-task-name")
	case req.InstallType == BinaryUpdateInstallType && req.Product != HostAgentUpdateProduct:
		require(req.BinaryPath, "binary-path")
		require(req.SystemdUnitName, "systemd-unit")
	case req.ContainerName == "":
		missing = append(missing, "container-name")
	}

	if len(missing) > 0 {
		return fmt.Errorf("missing required update settings: %s", strings.Join(missing, ", "))
	}
	if req.HealthTimeout < 0 {
		return errors.New("health timeout must not be negative")
	}
	return nil
}
// ApplyUpdate fetches the update plan for the request and, when the plan
// action is "update", replaces the running container with one based on the
// plan's artifact.
//
// Steps: normalize and resolve the request, fetch the plan, run preflight
// checks (production forwarding, artifact present, artifact install type),
// inspect the current container to rebuild its runtime config, install the
// new image with Replace set, wait for the container to run, and save the new
// update state. If install or the health wait fails, rollbackContainer is
// invoked and RolledBack is set when the rollback succeeded and the plan
// allows it; the original error is still returned.
//
// With req.DryRun the function returns after the preflight checks with
// NewImage set to the image the artifact resolves to. Status reports and the
// state write are best-effort: their errors are deliberately discarded.
func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
req = req.Normalize()
var err error
req, err = resolveUpdateRequest(req)
if err != nil {
return UpdateResult{}, err
}
runner := m.Runner
if runner == nil {
runner = ExecRunner{}
}
docker := firstNonEmpty(m.Binary, "docker")
plan, err := FetchNodeUpdatePlan(ctx, req)
if err != nil {
return UpdateResult{}, err
}
// The plan's health window only replaces the timeout when it still equals the
// 30s default set by Normalize; an explicit 30s is indistinguishable from it.
if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
}
result := UpdateResult{
Action: plan.Action,
Reason: plan.Reason,
TargetVersion: plan.TargetVersion,
ContainerName: req.ContainerName,
}
if plan.Action != "update" {
if !req.DryRun {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromNoopPlan(req, plan))
}
return result, nil
}
if plan.ProductionForwarding && !req.AllowProductionMesh {
err := errors.New("refusing update plan with production forwarding enabled")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if plan.Artifact == nil {
err := errors.New("update plan has no artifact")
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
// An empty artifact install type is accepted; anything else must be the
// default ("docker") type.
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType {
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
return result, err
}
if req.DryRun {
result.NewImage = artifactImage(*plan.Artifact, "")
return result, nil
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "planned",
Status: "accepted",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason},
})
current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "inspect", "failed", err))
return result, err
}
result.PreviousImageID = current.Image
// Values recovered from the container win; the request only fills gaps.
cfg.BackendURL = firstNonEmpty(cfg.BackendURL, req.BackendURL)
cfg.ClusterID = firstNonEmpty(cfg.ClusterID, req.ClusterID)
cfg.ContainerName = req.ContainerName
cfg.Image = artifactImage(*plan.Artifact, cfg.Image)
cfg.ImageArtifactURLs = artifactURLsForBackend(*plan.Artifact, req.BackendURL)
cfg.ImageArtifactSHA256 = plan.Artifact.SHA256
cfg.ImageArtifactSizeBytes = plan.Artifact.SizeBytes
cfg.Replace = true
// The join token is cleared for the replacement install.
cfg.JoinToken = ""
result.NewImage = cfg.Image
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": cfg.ImageArtifactURLs, "image": cfg.Image},
})
installed, err := m.Install(ctx, cfg)
if err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true
}
// The install error is returned; a rollback error is not surfaced.
return result, err
}
result.Loaded = installed.Loaded
result.Replaced = installed.Replaced
result.ContainerID = installed.ContainerID
if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil {
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "health_check", "failed", err))
rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
if rollbackErr == nil && plan.RollbackAllowed {
result.RolledBack = true
}
return result, err
}
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "health_check",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"container_id": installed.ContainerID, "image": cfg.Image},
})
_ = saveUpdateState(req.StateDir, UpdateState{
Product: req.Product,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: req.ContainerName,
Image: cfg.Image,
UpdatedAt: time.Now().UTC(),
})
return result, nil
}
// RunUpdateLoop runs ApplyUpdate for the node agent in a loop and, when
// cfg.HostAgentUpdateEnabled is set, ApplyHostAgentUpdate after each pass.
//
// The loop returns nil when cfg.MaxRuns (if positive) has been reached or a
// host-agent update reports RestartNeeded. It returns an error when the
// request or loop settings are invalid, when a sleep is interrupted by ctx,
// or when an update fails while cfg.StopOnError is set. Any other failure is
// only reported through cfg.Logf and the loop keeps going.
func (m DockerManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request.Normalize()
if err := req.Validate(); err != nil {
return err
}
// A zero interval falls back to one hour.
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
if cfg.Interval < 0 {
return errors.New("update loop interval must not be negative")
}
if cfg.InitialDelay < 0 {
return errors.New("update loop initial delay must not be negative")
}
if cfg.Jitter < 0 || cfg.Jitter > 1 {
return errors.New("update loop jitter must be between 0 and 1")
}
// Logging is optional; substitute a no-op so the calls below need no nil check.
logf := cfg.Logf
if logf == nil {
logf = func(string, ...any) {}
}
if cfg.InitialDelay > 0 {
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
return err
}
}
runs := 0
// Remember the trigger generation present at start so that only a later
// change cuts a sleep short.
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
for {
runs++
result, err := m.ApplyUpdate(ctx, req)
if err != nil {
// A missing node identity is not treated as a failure: log it, wait one
// interval and retry. This path skips the host-agent pass and ignores
// StopOnError.
if errors.Is(err, ErrNodeIdentityNotReady) {
logf("update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
return err
}
continue
}
logf("update_loop run=%d status=failed error=%v", runs, err)
if cfg.StopOnError {
return err
}
} else {
logf("update_loop run=%d action=%s reason=%s target=%s container=%s loaded=%t replaced=%t rolled_back=%t",
runs,
result.Action,
result.Reason,
result.TargetVersion,
result.ContainerName,
result.Loaded,
result.Replaced,
result.RolledBack,
)
// Carry the new version into later plan requests unless the update was
// rolled back.
if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
req.CurrentVersion = result.TargetVersion
}
}
if cfg.HostAgentUpdateEnabled {
// Fields left empty on the host-agent request inherit the values of the
// node-agent request.
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
hostReq.CurrentVersion = firstNonEmpty(hostReq.CurrentVersion, req.CurrentVersion)
hostReq.OS = firstNonEmpty(hostReq.OS, req.OS)
hostReq.Arch = firstNonEmpty(hostReq.Arch, req.Arch)
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, hostAgentInstallTypeFor(req.InstallType))
// result and err are deliberately scoped to this block; they shadow the
// node-agent result above.
result, err := m.ApplyHostAgentUpdate(ctx, hostReq)
if err != nil {
if errors.Is(err, ErrNodeIdentityNotReady) {
logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
} else {
logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
if cfg.StopOnError {
return err
}
}
} else {
logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
runs,
result.Action,
result.Reason,
result.TargetVersion,
result.NewImage,
result.Replaced,
result.RestartNeeded,
)
// Store the new host-agent version on cfg so later iterations start
// from it.
if result.Action == "update" && result.TargetVersion != "" {
cfg.HostAgentUpdateRequest.CurrentVersion = result.TargetVersion
}
// Leave the loop once the host-agent update reports that a restart is
// needed.
if result.RestartNeeded {
return nil
}
}
}
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
// Sleep for the jittered interval, waking early if the trigger generation
// changes.
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
return err
}
}
}
// FetchNodeUpdatePlan resolves req (filling in node identity and the
// persisted current version where available) and asks the backend for the
// node's update plan.
//
// It returns the decoded plan, or an error when the request cannot be
// resolved, the HTTP call fails, the backend answers with a non-2xx status,
// or the response body is not valid JSON.
func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan, error) {
	resolved, err := resolveUpdateRequest(req)
	if err != nil {
		return NodeUpdatePlan{}, err
	}

	query := url.Values{
		"product":         {resolved.Product},
		"current_version": {resolved.CurrentVersion},
		"os":              {resolved.OS},
		"arch":            {resolved.Arch},
		"install_type":    {resolved.InstallType},
	}
	// The channel is optional and is only sent when one is configured.
	if resolved.Channel != "" {
		query.Set("channel", resolved.Channel)
	}

	endpoint := fmt.Sprintf(
		"%s/clusters/%s/nodes/%s/updates/plan?%s",
		resolved.BackendURL,
		url.PathEscape(resolved.ClusterID),
		url.PathEscape(resolved.NodeID),
		query.Encode(),
	)

	planReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	resp, err := http.DefaultClient.Do(planReq)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < 200 || code >= 300 {
		return NodeUpdatePlan{}, fmt.Errorf("fetch update plan: %s", resp.Status)
	}

	var envelope NodeUpdatePlanResponse
	if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
		return NodeUpdatePlan{}, err
	}
	return envelope.Plan, nil
}
// resolveUpdateRequest normalizes and validates req, then fills in values
// that can be read from the state directory.
//
// When req.NodeID is empty the node identity file is loaded: a missing file
// or a blank node ID yields ErrNodeIdentityNotReady, and the identity's
// cluster ID is used only if req.ClusterID is empty. A persisted update
// state for the same product overrides req.CurrentVersion when it carries a
// non-empty version.
func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
	req = req.Normalize()
	if err := req.Validate(); err != nil {
		return UpdateRequest{}, err
	}

	if req.NodeID == "" {
		identity, err := state.Load(filepath.Join(req.StateDir, state.FileName))
		switch {
		case err == nil:
			// Identity loaded; continue below.
		case errors.Is(err, os.ErrNotExist):
			return UpdateRequest{}, ErrNodeIdentityNotReady
		default:
			return UpdateRequest{}, err
		}

		nodeID := strings.TrimSpace(identity.NodeID)
		if nodeID == "" {
			return UpdateRequest{}, ErrNodeIdentityNotReady
		}
		req.NodeID = nodeID
		if req.ClusterID == "" {
			req.ClusterID = strings.TrimSpace(identity.ClusterID)
		}
	}

	// A load failure is ignored on purpose: the caller-supplied version stays.
	persisted, err := loadUpdateState(req.StateDir, req.Product)
	if err == nil && persisted.Product == req.Product && persisted.CurrentVersion != "" {
		req.CurrentVersion = persisted.CurrentVersion
	}
	return req, nil
}
// ReportNodeUpdateStatus POSTs request as JSON to the backend's update status
// endpoint for the given cluster and node.
//
// Surrounding whitespace and trailing slashes are removed from backendURL
// before the endpoint is built. An error is returned when encoding, building
// or sending the request fails, or when the backend replies with a non-2xx
// status.
func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
	payload, err := json.Marshal(request)
	if err != nil {
		return err
	}

	base := strings.TrimRight(strings.TrimSpace(backendURL), "/")
	endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/status", base, url.PathEscape(clusterID), url.PathEscape(nodeID))

	statusReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return err
	}
	statusReq.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(statusReq)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < 200 || code >= 300 {
		return fmt.Errorf("report update status: %s", resp.Status)
	}
	return nil
}
// runtimeConfigFromContainer runs `docker inspect` on containerName and
// rebuilds a RuntimeConfig from the container's environment, mounts and host
// configuration.
//
// It returns the first inspected container together with the normalized
// config, or an error when the inspect command fails, its output is not
// valid JSON, or it lists no containers.
func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) {
	raw, err := runner.Run(ctx, docker, "inspect", containerName)
	if err != nil {
		return dockerInspectContainer{}, RuntimeConfig{}, err
	}

	var containers []dockerInspectContainer
	if err := json.Unmarshal([]byte(raw), &containers); err != nil {
		return dockerInspectContainer{}, RuntimeConfig{}, err
	}
	if len(containers) == 0 {
		return dockerInspectContainer{}, RuntimeConfig{}, fmt.Errorf("container %q not found", containerName)
	}

	container := containers[0]
	env := envMap(container.Config.Env)
	flag := func(key string) bool { return parseBool(env[key]) }
	number := func(key string) int { return parseInt(env[key]) }

	cfg := RuntimeConfig{
		BackendURL:                      env["RAP_BACKEND_URL"],
		ClusterID:                       env["RAP_CLUSTER_ID"],
		NodeName:                        firstNonEmpty(env["RAP_NODE_NAME"], containerName),
		Image:                           container.Config.Image,
		ContainerName:                   containerName,
		StateDir:                        hostStateDir(container),
		Network:                         firstNonEmpty(container.HostConfig.NetworkMode, DefaultNetwork),
		RestartPolicy:                   firstNonEmpty(container.HostConfig.RestartPolicy.Name, "unless-stopped"),
		WorkloadSupervisionEnabled:      flag("RAP_WORKLOAD_SUPERVISION_ENABLED"),
		MeshSyntheticRuntimeEnabled:     true,
		MeshProductionForwardingEnabled: flag("RAP_MESH_PRODUCTION_FORWARDING_ENABLED"),
		MeshListenAddr:                  env["RAP_MESH_LISTEN_ADDR"],
		MeshListenPortMode:              env["RAP_MESH_LISTEN_PORT_MODE"],
		MeshListenAutoPortStart:         number("RAP_MESH_LISTEN_AUTO_PORT_START"),
		MeshListenAutoPortEnd:           number("RAP_MESH_LISTEN_AUTO_PORT_END"),
		MeshAdvertiseEndpoint:           env["RAP_MESH_ADVERTISE_ENDPOINT"],
		MeshAdvertiseEndpointsJSON:      env["RAP_MESH_ADVERTISE_ENDPOINTS_JSON"],
		MeshAdvertiseTransport:          env["RAP_MESH_ADVERTISE_TRANSPORT"],
		MeshConnectivityMode:            env["RAP_MESH_CONNECTIVITY_MODE"],
		MeshNATType:                     env["RAP_MESH_NAT_TYPE"],
		MeshRegion:                      env["RAP_MESH_REGION"],
		HeartbeatIntervalSeconds:        number("RAP_HEARTBEAT_INTERVAL_SECONDS"),
		EnrollmentPollIntervalSeconds:   number("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS"),
		EnrollmentPollTimeoutSeconds:    number("RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS"),
		ProductionObservationSinkCap:    number("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY"),
		DockerVPNGatewayEnabled:         dockerInspectHasVPNGatewayRuntime(container),
	}
	return container, cfg.Normalize(), nil
}
// dockerInspectHasVPNGatewayRuntime reports whether the inspected container
// was started with what a VPN gateway needs: the /dev/net/tun device plus
// either privileged mode or the NET_ADMIN capability.
//
// Docker accepts capability names with or without the "CAP_" prefix and
// also the special value "ALL", so "NET_ADMIN", "CAP_NET_ADMIN" and "ALL"
// (in any letter case) all count as granting NET_ADMIN.
func dockerInspectHasVPNGatewayRuntime(container dockerInspectContainer) bool {
	hasTun := false
	for _, device := range container.HostConfig.Devices {
		if device.PathOnHost == "/dev/net/tun" || device.PathInContainer == "/dev/net/tun" {
			hasTun = true
			break
		}
	}
	if !hasTun {
		return false
	}
	if container.HostConfig.Privileged {
		return true
	}
	for _, capability := range container.HostConfig.CapAdd {
		// Normalize to the bare upper-case name so both spellings compare equal.
		name := strings.TrimPrefix(strings.ToUpper(strings.TrimSpace(capability)), "CAP_")
		if name == "NET_ADMIN" || name == "ALL" {
			return true
		}
	}
	return false
}
// waitContainerRunning polls `docker inspect` once per second until the
// container reports State.Running == true.
//
// With a zero timeout only a single check is made. When the deadline passes
// the last inspect error is returned if there was one, otherwise a
// "not running" error. Cancellation of ctx between polls returns ctx.Err().
func (m DockerManager) waitContainerRunning(ctx context.Context, runner CommandRunner, docker, containerName string, timeout time.Duration) error {
	giveUpAt := time.Now().Add(timeout)
	for {
		state, inspectErr := runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", containerName)
		running := inspectErr == nil && strings.TrimSpace(state) == "true"

		switch {
		case running:
			return nil
		case timeout == 0 || time.Now().After(giveUpAt):
			if inspectErr != nil {
				return inspectErr
			}
			return fmt.Errorf("container %q is not running", containerName)
		}

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second):
		}
	}
}
// rollbackContainer reinstalls the container from the previously inspected
// image, reusing cfg with the artifact download fields cleared and Replace
// forced on.
//
// It is a no-op returning nil when rollback is not allowed or the previous
// container has no image recorded. After a successful install the running
// state is inspected once; the outcome of that inspect is ignored.
func (m DockerManager) rollbackContainer(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig, previous dockerInspectContainer, allowed bool) error {
	if !allowed {
		return nil
	}
	if strings.TrimSpace(previous.Image) == "" {
		return nil
	}

	restore := cfg
	restore.Replace = true
	restore.Image = previous.Image
	restore.ImageArtifactURLs = nil
	restore.ImageArtifactSHA256 = ""
	restore.ImageArtifactSizeBytes = 0

	if _, err := m.Install(ctx, restore); err != nil {
		return err
	}
	_, _ = runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", cfg.ContainerName)
	return nil
}
// artifactImage picks the Docker image reference to run for artifact.
//
// Order of preference:
//  1. a non-blank "image" field in the artifact's JSON metadata;
//  2. "<product>:<version>" when the artifact's install type is
//     DefaultUpdateInstallType and both values are non-blank after trimming;
//  3. fallback, or DefaultImage when fallback is empty.
//
// Product and version are trimmed before the emptiness check so that
// whitespace-only values fall through to the fallback instead of producing
// a malformed reference such as ":1.0".
func artifactImage(artifact ReleaseArtifact, fallback string) string {
	if len(artifact.Metadata) > 0 {
		var metadata struct {
			Image string `json:"image"`
		}
		// Unparseable metadata is ignored and the next rule is tried.
		if err := json.Unmarshal(artifact.Metadata, &metadata); err == nil {
			if image := strings.TrimSpace(metadata.Image); image != "" {
				return image
			}
		}
	}

	product := strings.TrimSpace(artifact.Product)
	version := strings.TrimSpace(artifact.Version)
	if artifact.InstallType == DefaultUpdateInstallType && product != "" && version != "" {
		return product + ":" + version
	}
	return firstNonEmpty(fallback, DefaultImage)
}
// artifactURLs returns the artifact's download URLs in order: the primary
// URL first, then the entries of URLs. Each value is trimmed; blank values
// and repeats are dropped. The result is never nil.
func artifactURLs(artifact ReleaseArtifact) []string {
	unique := make([]string, 0, 1+len(artifact.URLs))
	add := func(candidate string) {
		candidate = strings.TrimSpace(candidate)
		if candidate != "" && !containsArtifactURL(unique, candidate) {
			unique = append(unique, candidate)
		}
	}

	add(artifact.URL)
	for _, mirror := range artifact.URLs {
		add(mirror)
	}
	return unique
}
// artifactURLsForBackend returns the artifact's URLs with every entry that
// starts with "/" prefixed by the scheme and host of backendURL. Other
// entries are kept as they are, and duplicates arising after the rewrite are
// dropped.
//
// When backendURL does not parse or lacks a scheme or host, the URLs are
// returned without any rewriting.
func artifactURLsForBackend(artifact ReleaseArtifact, backendURL string) []string {
	candidates := artifactURLs(artifact)

	backend, err := url.Parse(strings.TrimSpace(backendURL))
	if err != nil {
		return candidates
	}
	if backend.Scheme == "" || backend.Host == "" {
		return candidates
	}

	// Only scheme and host are used; any path on the backend URL is dropped.
	prefix := backend.Scheme + "://" + backend.Host
	resolved := make([]string, 0, len(candidates))
	for _, candidate := range candidates {
		absolute := candidate
		if strings.HasPrefix(candidate, "/") {
			absolute = prefix + candidate
		}
		if containsArtifactURL(resolved, absolute) {
			continue
		}
		resolved = append(resolved, absolute)
	}
	return resolved
}
// containsArtifactURL reports whether value occurs in values, using exact
// string comparison.
func containsArtifactURL(values []string, value string) bool {
	found := false
	for i := 0; i < len(values) && !found; i++ {
		found = values[i] == value
	}
	return found
}
// statusFromError builds a status report for the given phase and status that
// carries err's message. err must be non-nil, because its Error method is
// called unconditionally.
func statusFromError(req UpdateRequest, plan NodeUpdatePlan, phase, status string, err error) NodeUpdateStatusRequest {
	detail := err.Error()
	report := NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          phase,
		Status:         status,
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
	}
	report.ErrorMessage = &detail
	return report
}
// statusFromNoopPlan builds the "plan"/"noop" status report sent when a plan
// requires no work, echoing the plan's action, reason, strategy and channel
// in the payload.
func statusFromNoopPlan(req UpdateRequest, plan NodeUpdatePlan) NodeUpdateStatusRequest {
	details := map[string]any{
		"action":   plan.Action,
		"reason":   plan.Reason,
		"strategy": plan.Strategy,
		"channel":  plan.Channel,
	}
	return NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "plan",
		Status:         "noop",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        details,
	}
}
// updateAttemptID derives an identifier for an update attempt by joining the
// plan's node ID, product and target version with ":"; the artifact ID is
// appended as a fourth part when the plan carries an artifact.
func updateAttemptID(plan NodeUpdatePlan) string {
	id := plan.NodeID + ":" + plan.Product + ":" + plan.TargetVersion
	if plan.Artifact == nil {
		return id
	}
	return id + ":" + plan.Artifact.ID
}
// envMap converts "KEY=value" entries into a map. Each entry is split at its
// first "="; entries without "=" are skipped, and a later entry overrides an
// earlier one with the same key. The result is never nil.
func envMap(items []string) map[string]string {
	parsed := make(map[string]string)
	for _, entry := range items {
		sep := strings.IndexByte(entry, '=')
		if sep < 0 {
			continue
		}
		parsed[entry[:sep]] = entry[sep+1:]
	}
	return parsed
}
// hostStateDir returns the host-side source of the first mount whose
// destination is /var/lib/rap-node-agent and whose source is non-empty. It
// falls back to DefaultStateDir when no such mount exists.
func hostStateDir(container dockerInspectContainer) string {
	for _, mount := range container.Mounts {
		if mount.Source == "" || mount.Destination != "/var/lib/rap-node-agent" {
			continue
		}
		return mount.Source
	}
	return DefaultStateDir
}
// parseBool reports whether value, after trimming and lower-casing, is one of
// "1", "true", "yes", "y" or "on". Everything else, including the empty
// string, is false.
func parseBool(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	for _, truthy := range [...]string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true
		}
	}
	return false
}
// parseInt converts the trimmed value to an int with strconv.Atoi. The
// conversion error is deliberately discarded and whatever Atoi returned is
// passed through, so unparseable input such as "" or "abc" yields 0.
func parseInt(value string) int {
	trimmed := strings.TrimSpace(value)
	parsed, _ := strconv.Atoi(trimmed)
	return parsed
}
// loadUpdateState reads the persisted update state for product from
// stateDir.
//
// A blank stateDir yields os.ErrNotExist. The product is normalized and
// defaults to DefaultUpdateProduct; for the default product a failed read of
// the product-specific path is retried against UpdateStateFileName. A state
// file without a product is attributed to the requested product.
func loadUpdateState(stateDir string, product string) (UpdateState, error) {
	dir := strings.TrimSpace(stateDir)
	if dir == "" {
		return UpdateState{}, os.ErrNotExist
	}

	token := firstNonEmpty(normalizeUpdateProductToken(product), DefaultUpdateProduct)
	raw, err := os.ReadFile(updateStatePath(dir, token))
	if err != nil && token == DefaultUpdateProduct {
		raw, err = os.ReadFile(filepath.Join(dir, UpdateStateFileName))
	}
	if err != nil {
		return UpdateState{}, err
	}

	var loaded UpdateState
	if err := json.Unmarshal(raw, &loaded); err != nil {
		return UpdateState{}, err
	}
	loaded.Product = firstNonEmpty(loaded.Product, token)
	return loaded, nil
}
// saveUpdateState writes item as indented JSON to the state file of its
// product inside stateDir, creating the directory (mode 0700) if needed and
// writing the file with mode 0600.
//
// Nothing is written, and nil is returned, when stateDir is blank or the
// item has no current version. An empty product defaults to
// DefaultUpdateProduct and a zero UpdatedAt is set to the current UTC time.
func saveUpdateState(stateDir string, item UpdateState) error {
	dir := strings.TrimSpace(stateDir)
	if dir == "" {
		return nil
	}
	if item.CurrentVersion == "" {
		return nil
	}

	item.Product = firstNonEmpty(item.Product, DefaultUpdateProduct)
	if item.UpdatedAt.IsZero() {
		item.UpdatedAt = time.Now().UTC()
	}

	if err := os.MkdirAll(dir, 0o700); err != nil {
		return err
	}
	encoded, err := json.MarshalIndent(item, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(updateStatePath(dir, item.Product), encoded, 0o600)
}
// updateStatePath returns the state file path for product inside stateDir.
// The default product (or a product that normalizes to nothing) uses
// UpdateStateFileName; any other product gets
// "host-update-state-<product>.json".
func updateStatePath(stateDir, product string) string {
	token := normalizeUpdateProductToken(firstNonEmpty(product, DefaultUpdateProduct))
	if token != "" && token != DefaultUpdateProduct {
		return filepath.Join(stateDir, "host-update-state-"+token+".json")
	}
	return filepath.Join(stateDir, UpdateStateFileName)
}
// UpdateTriggerPath returns the location of the update trigger file inside
// stateDir; surrounding whitespace in stateDir is ignored.
func UpdateTriggerPath(stateDir string) string {
	dir := strings.TrimSpace(stateDir)
	return filepath.Join(dir, UpdateTriggerFileName)
}
// SaveUpdateTrigger writes trigger as indented JSON to the trigger file in
// stateDir, creating the directory (mode 0700) if needed and writing the file
// with mode 0600.
//
// Nothing is written, and nil is returned, when stateDir or the trimmed
// generation is empty. A missing schema version defaults to
// "rap.node_update_trigger.v1" and a zero ObservedAt to the current UTC time.
func SaveUpdateTrigger(stateDir string, trigger UpdateTrigger) error {
	dir := strings.TrimSpace(stateDir)
	trigger.Generation = strings.TrimSpace(trigger.Generation)
	if dir == "" {
		return nil
	}
	if trigger.Generation == "" {
		return nil
	}

	if trigger.SchemaVersion == "" {
		trigger.SchemaVersion = "rap.node_update_trigger.v1"
	}
	if trigger.ObservedAt.IsZero() {
		trigger.ObservedAt = time.Now().UTC()
	}

	if err := os.MkdirAll(dir, 0o700); err != nil {
		return err
	}
	encoded, err := json.MarshalIndent(trigger, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(UpdateTriggerPath(dir), encoded, 0o600)
}
// currentUpdateTriggerGeneration returns the trimmed generation stored in the
// trigger file of stateDir, or "" when the file cannot be read or parsed.
func currentUpdateTriggerGeneration(stateDir string) string {
	raw, readErr := os.ReadFile(UpdateTriggerPath(stateDir))
	if readErr != nil {
		return ""
	}

	var stored UpdateTrigger
	if json.Unmarshal(raw, &stored) != nil {
		return ""
	}
	return strings.TrimSpace(stored.Generation)
}
// CurrentUpdateTriggerGenerationForNodeAgent is the exported form of
// currentUpdateTriggerGeneration: it returns the trigger generation recorded
// in stateDir, or "" when none can be read.
func CurrentUpdateTriggerGenerationForNodeAgent(stateDir string) string {
	generation := currentUpdateTriggerGeneration(stateDir)
	return generation
}
// normalizeUpdateProductToken trims and lower-cases value, then removes every
// character other than a-z, 0-9, '-', '_' and '.'.
func normalizeUpdateProductToken(value string) string {
	keep := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
			return r
		case r == '-', r == '_', r == '.':
			return r
		}
		// A negative result tells strings.Map to drop the rune.
		return -1
	}
	return strings.Map(keep, strings.ToLower(strings.TrimSpace(value)))
}
// sleepContext blocks for duration or until ctx is done, whichever comes
// first. It returns ctx.Err() when the context ends the wait and nil
// otherwise. A non-positive duration returns nil immediately without
// consulting ctx.
func sleepContext(ctx context.Context, duration time.Duration) error {
	if duration > 0 {
		wake := time.NewTimer(duration)
		defer wake.Stop()

		select {
		case <-wake.C:
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	return nil
}
// sleepUntilUpdateIntervalOrTrigger waits until duration has elapsed, ctx is
// done, or the trigger generation stored in stateDir changes.
//
// The trigger file is re-read every five seconds. When it holds a non-empty
// generation different from *lastGeneration, that value is stored in
// *lastGeneration and the function returns nil early. A nil lastGeneration
// disables the early wake-up. A non-positive duration returns nil at once;
// context cancellation returns ctx.Err().
func sleepUntilUpdateIntervalOrTrigger(ctx context.Context, stateDir string, duration time.Duration, lastGeneration *string) error {
	if duration <= 0 {
		return nil
	}

	wake := time.NewTimer(duration)
	defer wake.Stop()
	poll := time.NewTicker(5 * time.Second)
	defer poll.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-wake.C:
			return nil
		case <-poll.C:
		}

		// Reached only after a poll tick.
		current := currentUpdateTriggerGeneration(stateDir)
		if current == "" || lastGeneration == nil || current == *lastGeneration {
			continue
		}
		*lastGeneration = current
		return nil
	}
}
// jitteredDuration returns base shifted by a random offset drawn uniformly
// from [-base*jitter, +base*jitter]. base is returned unchanged when base or
// jitter is non-positive, or when the spread truncates to zero.
func jitteredDuration(base time.Duration, jitter float64) time.Duration {
	if base > 0 && jitter > 0 {
		if spread := int64(float64(base) * jitter); spread > 0 {
			// Int63n yields [0, 2*spread]; shifting by spread centres it on zero.
			shift := rand.Int63n(2*spread+1) - spread
			return base + time.Duration(shift)
		}
	}
	return base
}
@@ -0,0 +1,672 @@
package hostagent
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// updateRunner is a test double for the docker command runner. Its Run method
// records every invocation and returns canned output instead of executing
// anything.
type updateRunner struct {
// calls holds one entry per Run invocation: the command name followed by
// its arguments.
calls [][]string
// healthOkay selects the answer to `inspect --format ...`: "true" when set,
// "false" otherwise.
healthOkay bool
// inspectJSON is returned verbatim for a two-argument `inspect <name>` call.
inspectJSON string
}
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
urls := artifactURLsForBackend(ReleaseArtifact{
URL: "/downloads/rap-node-agent-0.2.92.tar",
URLs: []string{"/downloads/mirror.tar", "https://cdn.example.test/agent.tar"},
}, "http://control.example.test:18080/api/v1")
want := []string{
"http://control.example.test:18080/downloads/rap-node-agent-0.2.92.tar",
"http://control.example.test:18080/downloads/mirror.tar",
"https://cdn.example.test/agent.tar",
}
if len(urls) != len(want) {
t.Fatalf("urls = %#v", urls)
}
for i := range want {
if urls[i] != want[i] {
t.Fatalf("urls[%d] = %q, want %q; all=%#v", i, urls[i], want[i], urls)
}
}
}
// Run records the invocation and returns canned output chosen from the docker
// subcommand in args:
//   - `inspect --format ...` answers "true" or "false" according to healthOkay;
//   - `inspect <name>` (exactly two arguments) returns inspectJSON;
//   - `image inspect ...` returns an empty JSON array;
//   - `run ...` returns a fixed container ID;
//   - anything else returns an empty string.
//
// The returned error is always nil.
func (r *updateRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))

	// Case order matters: the --format probe must win over the plain inspect.
	switch {
	case len(args) >= 2 && args[0] == "inspect" && args[1] == "--format":
		if r.healthOkay {
			return "true\n", nil
		}
		return "false\n", nil
	case len(args) == 2 && args[0] == "inspect":
		return r.inspectJSON, nil
	case len(args) >= 2 && args[0] == "image" && args[1] == "inspect":
		return "[]", nil
	case len(args) > 0 && args[0] == "run":
		return "updated-container\n", nil
	default:
		return "", nil
	}
}
// TestApplyUpdateFetchesPlanLoadsImageAndRecreatesContainer drives a full
// update against a fake backend: the plan is fetched, the artifact is
// downloaded and loaded, the container is recreated, and three status
// reports (planned, download, succeeded) are posted.
//
// Failures inside the HTTP handler use t.Errorf plus an error response rather
// than t.Fatalf, because the handler runs on a server goroutine and FailNow
// may only be called from the goroutine running the test.
func TestApplyUpdateFetchesPlanLoadsImageAndRecreatesContainer(t *testing.T) {
	artifactBody := []byte("fake docker image tar")
	statuses := []NodeUpdateStatusRequest{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"schema_version":        "rap.node_update_plan.v1",
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               "rap-node-agent",
					"current_version":       "0.1.0-old",
					"action":                "update",
					"reason":                "matching_release_available",
					"target_version":        "0.1.0-new",
					"rollback_allowed":      true,
					"health_window_seconds": 1,
					"production_forwarding": false,
					"artifact": map[string]any{
						"id":           "artifact-1",
						"product":      "rap-node-agent",
						"version":      "0.1.0-new",
						"os":           "linux",
						"arch":         "amd64",
						"install_type": "docker",
						"url":          serverArtifactURL(r),
						"sha256":       "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
						"size_bytes":   len(artifactBody),
						"metadata":     map[string]any{"image": "rap-node-agent:test-new"},
					},
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status body", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusInternalServerError)
		}
	}))
	defer server.Close()

	runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
	result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		CurrentVersion: "0.1.0-old",
		ContainerName:  "rap-node-agent-node-1",
		HealthTimeout:  time.Second,
	})
	if err != nil {
		t.Fatalf("apply update: %v", err)
	}
	if result.Action != "update" || !result.Loaded || !result.Replaced || result.NewImage != "rap-node-agent:test-new" {
		t.Fatalf("unexpected result: %+v", result)
	}

	// Calls are flattened and NUL-joined so argument sequences can be matched
	// as substrings.
	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	for _, want := range []string{"inspect\x00rap-node-agent-node-1", "load\x00-i", "rm\x00-f\x00rap-node-agent-node-1", "run\x00-d", "RAP_NODE_NAME=node-a"} {
		if !strings.Contains(joined, want) {
			t.Fatalf("missing docker call part %q in %#v", want, runner.calls)
		}
	}
	if len(statuses) != 3 || statuses[0].Phase != "planned" || statuses[1].Phase != "download" || statuses[2].Status != "succeeded" {
		t.Fatalf("statuses = %+v", statuses)
	}
}
// TestApplyUpdatePreservesDockerVPNGatewayRuntime checks that a container
// inspected with VPN gateway runtime settings is recreated with
// --privileged, --cap-add NET_ADMIN and the /dev/net/tun device.
//
// An unexpected request is reported from the handler with t.Errorf plus an
// error response rather than t.Fatalf, because the handler runs on a server
// goroutine and FailNow may only be called from the goroutine running the
// test.
func TestApplyUpdatePreservesDockerVPNGatewayRuntime(t *testing.T) {
	// Stub the host path check so it reports no error for any path, and
	// restore the real one afterwards.
	previousStatHostPath := statHostPath
	statHostPath = func(string) (os.FileInfo, error) { return nil, nil }
	t.Cleanup(func() { statHostPath = previousStatHostPath })

	artifactBody := []byte("fake docker image tar")
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"schema_version":        "rap.node_update_plan.v1",
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               "rap-node-agent",
					"current_version":       "0.2.7",
					"action":                "update",
					"reason":                "matching_release_available",
					"target_version":        "0.2.8",
					"rollback_allowed":      true,
					"health_window_seconds": 1,
					"artifact": map[string]any{
						"id":           "artifact-1",
						"product":      "rap-node-agent",
						"version":      "0.2.8",
						"os":           "linux",
						"arch":         "amd64",
						"install_type": "docker",
						"url":          serverArtifactURL(r),
						"sha256":       "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
						"size_bytes":   len(artifactBody),
						"metadata":     map[string]any{"image": "rap-node-agent:test-new"},
					},
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusInternalServerError)
		}
	}))
	defer server.Close()

	runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixtureWithVPNGatewayRuntime()}
	result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		CurrentVersion: "0.2.7",
		ContainerName:  "rap-node-agent-node-1",
		HealthTimeout:  time.Second,
	})
	if err != nil {
		t.Fatalf("ApplyUpdate failed: %v", err)
	}
	if !result.Replaced {
		t.Fatalf("expected replacement")
	}

	joined := strings.Join(flattenCalls(runner.calls), "\x00")
	for _, want := range []string{"--privileged", "--cap-add\x00NET_ADMIN", "--device\x00/dev/net/tun:/dev/net/tun"} {
		if !strings.Contains(joined, want) {
			t.Fatalf("docker run did not preserve %q in %#v", want, runner.calls)
		}
	}
}
// TestApplyUpdateNoopsWithoutDockerWhenPlanHasNoAction checks that a plan
// with action "none" results in no docker calls and exactly one "plan"/"noop"
// status report.
//
// Failures inside the HTTP handler use t.Errorf plus an error response rather
// than t.Fatalf, because the handler runs on a server goroutine and FailNow
// may only be called from the goroutine running the test.
func TestApplyUpdateNoopsWithoutDockerWhenPlanHasNoAction(t *testing.T) {
	statuses := []NodeUpdateStatusRequest{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":      "cluster-1",
					"node_id":         "node-1",
					"product":         "rap-node-agent",
					"current_version": "0.1.3",
					"action":          "none",
					"reason":          "already_current",
					"target_version":  "0.1.3",
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status body", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusInternalServerError)
		}
	}))
	defer server.Close()

	runner := &updateRunner{}
	result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		NodeID:         "node-1",
		CurrentVersion: "0.1.3",
		ContainerName:  "rap-node-agent-node-1",
	})
	if err != nil {
		t.Fatalf("apply update: %v", err)
	}
	if result.Action != "none" || result.Reason != "already_current" {
		t.Fatalf("result = %+v", result)
	}
	if len(runner.calls) != 0 {
		t.Fatalf("docker should not be called, got %#v", runner.calls)
	}
	if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" || statuses[0].TargetVersion != "0.1.3" {
		t.Fatalf("statuses = %+v", statuses)
	}
}
// TestWindowsApplyUpdateNoopReportsTaskStatus checks that WindowsManager
// reports a single "plan"/"noop" status for a plan with action "none" and
// includes the scheduled task name in the status payload.
//
// Failures inside the HTTP handler use t.Errorf plus an error response rather
// than t.Fatalf, because the handler runs on a server goroutine and FailNow
// may only be called from the goroutine running the test.
func TestWindowsApplyUpdateNoopReportsTaskStatus(t *testing.T) {
	statuses := []NodeUpdateStatusRequest{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":      "cluster-1",
					"node_id":         "node-1",
					"product":         "rap-node-agent",
					"current_version": "0.1.3",
					"action":          "none",
					"reason":          "already_current",
					"target_version":  "0.1.3",
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status body", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusInternalServerError)
		}
	}))
	defer server.Close()

	result, err := (WindowsManager{Runner: &updateRunner{}}).ApplyUpdate(context.Background(), UpdateRequest{
		BackendURL:      server.URL,
		ClusterID:       "cluster-1",
		NodeID:          "node-1",
		CurrentVersion:  "0.1.3",
		InstallType:     WindowsUpdateInstallType,
		BinaryPath:      `C:\Program Files\RAP\node\rap-node-agent.exe`,
		WindowsTaskName: "RAP Node Agent node",
	})
	if err != nil {
		t.Fatalf("windows apply update: %v", err)
	}
	if result.Action != "none" || result.Reason != "already_current" {
		t.Fatalf("result = %+v", result)
	}
	if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" {
		t.Fatalf("statuses = %+v", statuses)
	}
	if statuses[0].Payload["task"] != "RAP Node Agent node" {
		t.Fatalf("status payload = %+v", statuses[0].Payload)
	}
}
// TestRunUpdateLoopAdvancesCurrentVersionAfterSuccessfulUpdate runs the
// update loop twice and checks that the second plan request carries the
// version installed by the first run.
//
// An unexpected request is reported from the handler with t.Errorf plus an
// error response rather than t.Fatalf, because the handler runs on a server
// goroutine and FailNow may only be called from the goroutine running the
// test.
func TestRunUpdateLoopAdvancesCurrentVersionAfterSuccessfulUpdate(t *testing.T) {
	artifactBody := []byte("fake docker image tar")
	planRequests := []string{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			current := r.URL.Query().Get("current_version")
			planRequests = append(planRequests, current)
			// Offer the update until the node reports the new version.
			action := "update"
			reason := "matching_release_available"
			if current == "0.1.0-new" {
				action = "none"
				reason = "already_current"
			}
			plan := map[string]any{
				"cluster_id":            "cluster-1",
				"node_id":               "node-1",
				"product":               "rap-node-agent",
				"current_version":       current,
				"action":                action,
				"reason":                reason,
				"target_version":        "0.1.0-new",
				"rollback_allowed":      true,
				"production_forwarding": false,
			}
			if action == "update" {
				plan["artifact"] = map[string]any{
					"id":           "artifact-1",
					"product":      "rap-node-agent",
					"version":      "0.1.0-new",
					"os":           "linux",
					"arch":         "amd64",
					"install_type": "docker",
					"url":          serverArtifactURL(r),
					"sha256":       "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
					"size_bytes":   len(artifactBody),
					"metadata":     map[string]any{"image": "rap-node-agent:test-new"},
				}
			}
			_ = json.NewEncoder(w).Encode(map[string]any{"node_update_plan": plan})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusInternalServerError)
		}
	}))
	defer server.Close()

	runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
	err := (DockerManager{Runner: runner}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
		Request: UpdateRequest{
			BackendURL:     server.URL,
			ClusterID:      "cluster-1",
			NodeID:         "node-1",
			CurrentVersion: "0.1.0-old",
			ContainerName:  "rap-node-agent-node-1",
			HealthTimeout:  time.Second,
		},
		Interval: time.Millisecond,
		MaxRuns:  2,
	})
	if err != nil {
		t.Fatalf("run update loop: %v", err)
	}
	if strings.Join(planRequests, ",") != "0.1.0-old,0.1.0-new" {
		t.Fatalf("plan current versions = %#v", planRequests)
	}
}
// TestRunUpdateLoopReportsHostAgentStatusWhenEnabled runs one loop iteration
// with host-agent updates enabled and checks that a plan is requested, and a
// noop status reported, first for rap-node-agent and then for rap-host-agent.
//
// Failures inside the HTTP handler use t.Errorf plus an error response rather
// than t.Fatalf, because the handler runs on a server goroutine and FailNow
// may only be called from the goroutine running the test.
func TestRunUpdateLoopReportsHostAgentStatusWhenEnabled(t *testing.T) {
	statuses := []NodeUpdateStatusRequest{}
	planProducts := []string{}
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			product := r.URL.Query().Get("product")
			planProducts = append(planProducts, product)
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               product,
					"current_version":       "0.1.3",
					"action":                "none",
					"reason":                "already_current",
					"target_version":        "0.1.3",
					"rollback_allowed":      true,
					"production_forwarding": false,
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			var status NodeUpdateStatusRequest
			if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
				t.Errorf("decode status: %v", err)
				http.Error(w, "invalid status body", http.StatusBadRequest)
				return
			}
			statuses = append(statuses, status)
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusInternalServerError)
		}
	}))
	defer server.Close()

	err := (DockerManager{}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
		Request: UpdateRequest{
			BackendURL:     server.URL,
			ClusterID:      "cluster-1",
			NodeID:         "node-1",
			CurrentVersion: "0.1.3",
			ContainerName:  "rap-node-agent-node-1",
		},
		HostAgentUpdateEnabled: true,
		HostAgentUpdateRequest: HostAgentUpdateRequest{
			CurrentVersion: "0.1.3",
			BinaryPath:     filepath.Join(t.TempDir(), "rap-host-agent"),
		},
		MaxRuns: 1,
	})
	if err != nil {
		t.Fatalf("run update loop: %v", err)
	}
	if strings.Join(planProducts, ",") != "rap-node-agent,rap-host-agent" {
		t.Fatalf("plan products = %#v", planProducts)
	}
	if len(statuses) != 2 || statuses[0].Product != "rap-node-agent" || statuses[1].Product != "rap-host-agent" {
		t.Fatalf("statuses = %+v", statuses)
	}
	if statuses[1].Phase != "plan" || statuses[1].Status != "noop" {
		t.Fatalf("host-agent status = %+v", statuses[1])
	}
}
// TestFetchNodeUpdatePlanResolvesNodeIDAndVersionFromStateDir checks that,
// with no NodeID on the request, FetchNodeUpdatePlan takes the node ID from
// the identity file and lets the persisted update state override the
// caller-supplied current version.
func TestFetchNodeUpdatePlanResolvesNodeIDAndVersionFromStateDir(t *testing.T) {
	stateDir := t.TempDir()

	identity := state.Identity{
		NodeID:    "node-from-state",
		ClusterID: "cluster-1",
		NodeName:  "node-a",
	}
	if err := state.Save(filepath.Join(stateDir, state.FileName), identity); err != nil {
		t.Fatalf("save identity: %v", err)
	}

	persisted := UpdateState{
		Product:        "rap-node-agent",
		CurrentVersion: "0.1.0-state",
	}
	if err := saveUpdateState(stateDir, persisted); err != nil {
		t.Fatalf("save update state: %v", err)
	}

	// The handler captures what the client asked for so it can be checked
	// after the call returns.
	var requestedPath, requestedVersion string
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		requestedPath = r.URL.Path
		requestedVersion = r.URL.Query().Get("current_version")
		_ = json.NewEncoder(w).Encode(map[string]any{
			"node_update_plan": map[string]any{
				"cluster_id": "cluster-1",
				"node_id":    "node-from-state",
				"product":    "rap-node-agent",
				"action":     "none",
				"reason":     "already_current",
			},
		})
	}))
	defer server.Close()

	request := UpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		StateDir:       stateDir,
		CurrentVersion: "0.1.0-flag",
	}
	if _, err := FetchNodeUpdatePlan(context.Background(), request); err != nil {
		t.Fatalf("fetch plan: %v", err)
	}

	pathOK := strings.Contains(requestedPath, "/nodes/node-from-state/updates/plan")
	if !pathOK || requestedVersion != "0.1.0-state" {
		t.Fatalf("path/current = %q/%q", requestedPath, requestedVersion)
	}
}
// TestApplyHostAgentUpdateDownloadsAndReplacesBinary drives ApplyHostAgentUpdate
// against a fake backend that offers an "update" plan with a downloadable
// artifact, then checks that the binary on disk was replaced, that a restart is
// requested, and that the product-scoped update state records the new version.
func TestApplyHostAgentUpdateDownloadsAndReplacesBinary(t *testing.T) {
	dir := t.TempDir()
	if err := state.Save(filepath.Join(dir, state.FileName), state.Identity{
		NodeID:    "node-1",
		ClusterID: "cluster-1",
		NodeName:  "node-a",
	}); err != nil {
		t.Fatalf("save identity: %v", err)
	}
	binaryPath := filepath.Join(dir, "rap-host-agent")
	artifactBody := []byte("new host agent binary")
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on the server's goroutine, where t.Fatalf/FailNow must
		// not be called. Failures are recorded with t.Errorf and surfaced to the
		// client as an HTTP error so the call under test fails as well.
		switch {
		case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
			if r.URL.Query().Get("product") != HostAgentUpdateProduct || r.URL.Query().Get("install_type") != BinaryUpdateInstallType {
				t.Errorf("unexpected query: %s", r.URL.RawQuery)
				http.Error(w, "unexpected query", http.StatusBadRequest)
				return
			}
			_ = json.NewEncoder(w).Encode(map[string]any{
				"node_update_plan": map[string]any{
					"cluster_id":            "cluster-1",
					"node_id":               "node-1",
					"product":               HostAgentUpdateProduct,
					"action":                "update",
					"reason":                "matching_release_available",
					"target_version":        "0.1.0-host-new",
					"rollback_allowed":      false,
					"production_forwarding": false,
					"artifact": map[string]any{
						"id":           "artifact-host-1",
						"product":      HostAgentUpdateProduct,
						"version":      "0.1.0-host-new",
						"os":           "linux",
						"arch":         "amd64",
						"install_type": BinaryUpdateInstallType,
						"url":          serverArtifactURL(r),
						"sha256":       "adc549d9e66ef64a507dd6880590d31309e16a3be965a92d849edd103cfb1815",
						"size_bytes":   len(artifactBody),
					},
				},
			})
		case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
			w.WriteHeader(http.StatusOK)
			_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
		case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
			_, _ = w.Write(artifactBody)
		default:
			t.Errorf("unexpected request %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusNotFound)
		}
	}))
	defer server.Close()
	result, err := (DockerManager{}).ApplyHostAgentUpdate(context.Background(), HostAgentUpdateRequest{
		BackendURL:     server.URL,
		ClusterID:      "cluster-1",
		StateDir:       dir,
		CurrentVersion: "0.1.0-host-old",
		BinaryPath:     binaryPath,
	})
	if err != nil {
		t.Fatalf("apply host-agent update: %v", err)
	}
	if !result.Replaced || !result.RestartNeeded {
		t.Fatalf("result = %+v", result)
	}
	payload, err := os.ReadFile(binaryPath)
	if err != nil || string(payload) != string(artifactBody) {
		t.Fatalf("binary payload = %q, %v", payload, err)
	}
	updateState, err := loadUpdateState(dir, HostAgentUpdateProduct)
	if err != nil {
		t.Fatalf("load update state: %v", err)
	}
	if updateState.Product != HostAgentUpdateProduct || updateState.CurrentVersion != "0.1.0-host-new" {
		t.Fatalf("update state = %+v", updateState)
	}
}
func TestUpdateStateIsProductScoped(t *testing.T) {
dir := t.TempDir()
if err := saveUpdateState(dir, UpdateState{Product: DefaultUpdateProduct, CurrentVersion: "node-v"}); err != nil {
t.Fatalf("save node state: %v", err)
}
if err := saveUpdateState(dir, UpdateState{Product: HostAgentUpdateProduct, CurrentVersion: "host-v"}); err != nil {
t.Fatalf("save host state: %v", err)
}
nodeState, err := loadUpdateState(dir, DefaultUpdateProduct)
if err != nil {
t.Fatalf("load node state: %v", err)
}
hostState, err := loadUpdateState(dir, HostAgentUpdateProduct)
if err != nil {
t.Fatalf("load host state: %v", err)
}
if nodeState.CurrentVersion != "node-v" || hostState.CurrentVersion != "host-v" {
t.Fatalf("states overlapped: node=%+v host=%+v", nodeState, hostState)
}
}
func TestArtifactImageDerivesDockerTagFromProductAndVersion(t *testing.T) {
got := artifactImage(ReleaseArtifact{
Product: "rap-node-agent",
Version: "0.2.77",
InstallType: DefaultUpdateInstallType,
}, "rap-node-agent:old")
if got != "rap-node-agent:0.2.77" {
t.Fatalf("expected versioned docker image, got %q", got)
}
}
// serverArtifactURL builds the absolute URL of the "/artifact.tar" resource on
// the server that received r, using https when the request arrived over TLS and
// http otherwise.
func serverArtifactURL(r *http.Request) string {
	if r.TLS == nil {
		return fmt.Sprintf("%s://%s/artifact.tar", "http", r.Host)
	}
	return fmt.Sprintf("%s://%s/artifact.tar", "https", r.Host)
}
// dockerInspectFixture returns canned `docker inspect`-style JSON describing a
// single running container ("old-container") with host networking, an
// unless-stopped restart policy, one state-dir mount and the node agent's
// environment. The string argument is ignored.
func dockerInspectFixture(_ string) string {
	const inspectJSON = `[
{
"Id": "old-container",
"Image": "sha256:oldimage",
"Config": {
"Image": "rap-node-agent:test-old",
"Env": [
"RAP_BACKEND_URL=http://control/api/v1",
"RAP_CLUSTER_ID=cluster-1",
"RAP_NODE_NAME=node-a",
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
"RAP_MESH_LISTEN_ADDR=:19131"
]
},
"HostConfig": {
"NetworkMode": "host",
"RestartPolicy": {"Name": "unless-stopped"}
},
"Mounts": [
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
],
"State": {"Running": true}
}
]`
	return inspectJSON
}
// dockerInspectFixtureWithVPNGatewayRuntime returns the same canned
// `docker inspect`-style JSON as the plain fixture, but with a HostConfig that is
// privileged, adds the NET_ADMIN capability and maps the /dev/net/tun device.
func dockerInspectFixtureWithVPNGatewayRuntime() string {
	const inspectJSON = `[
{
"Id": "old-container",
"Image": "sha256:oldimage",
"Config": {
"Image": "rap-node-agent:test-old",
"Env": [
"RAP_BACKEND_URL=http://control/api/v1",
"RAP_CLUSTER_ID=cluster-1",
"RAP_NODE_NAME=node-a",
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
"RAP_MESH_LISTEN_ADDR=:19131"
]
},
"HostConfig": {
"NetworkMode": "host",
"Privileged": true,
"CapAdd": ["NET_ADMIN"],
"Devices": [
{"PathOnHost": "/dev/net/tun", "PathInContainer": "/dev/net/tun", "CgroupPermissions": "rwm"}
],
"RestartPolicy": {"Name": "unless-stopped"}
},
"Mounts": [
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
],
"State": {"Running": true}
}
]`
	return inspectJSON
}
@@ -0,0 +1,368 @@
package hostagent
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
)
// Default Windows locations. Per-node directories are created beneath these
// roots by joining them with a slug derived from the node name.
const (
// DefaultWindowsInstallDir is the root under which per-node install directories live.
DefaultWindowsInstallDir = `C:\Program Files\RAP`
// DefaultWindowsStateRoot is the root under which per-node state directories live.
DefaultWindowsStateRoot = `C:\ProgramData\RAP\nodes`
)
// WindowsInstallConfig is the input to WindowsManager.Install: the node agent's
// runtime settings plus where and how to install it on a Windows host.
type WindowsInstallConfig struct {
// RuntimeConfig holds the node agent runtime settings; Install normalizes and
// validates it and writes it out as an environment script.
RuntimeConfig RuntimeConfig
// NodeID, when non-empty after trimming, is passed to the updater as --node-id.
NodeID string
// InstallDir is the directory receiving the binaries and wrapper scripts;
// when empty Install picks a default based on the node name.
InstallDir string
// StartupMode selects how the agent is started. Install lower-cases it and
// defaults it to "auto"; the values handled are "auto", "system-task",
// "user-task" and "none".
StartupMode string
// ArtifactURLs are the candidate download locations for the node-agent binary.
ArtifactURLs []string
// ArtifactSHA256 and ArtifactSizeBytes are handed to the artifact download
// together with ArtifactURLs.
ArtifactSHA256 string
ArtifactSizeBytes int64
// Replace requests a fresh download even when the binary already exists.
Replace bool
// DryRun makes Install return the planned result without touching the host.
DryRun bool
// AutoUpdateEnabled controls whether the host-agent updater task is installed.
AutoUpdateEnabled bool
// AutoUpdateCurrentVersion is the version reported by the updater; "0.0.0" is
// used when it is empty.
AutoUpdateCurrentVersion string
// AutoUpdateChannel, when non-empty after trimming, is passed as --channel.
AutoUpdateChannel string
// The AutoUpdate*Seconds values tune the generated updater script; a zero
// value selects that script's built-in default.
AutoUpdateIntervalSeconds int
AutoUpdateInitialDelaySeconds int
AutoUpdateHealthTimeoutSeconds int
// HostAgentSourcePath is passed to installHostAgentBinary as the source of
// the host-agent executable.
HostAgentSourcePath string
}
// WindowsInstallResult reports what WindowsManager.Install planned or did.
type WindowsInstallResult struct {
// NodeName is the node name from the runtime configuration.
NodeName string
// InstallDir and StateDir are the directories actually used, which may differ
// from the requested ones after a per-user fallback.
InstallDir string
StateDir string
// NodeAgentPath is InstallDir/rap-node-agent.exe.
NodeAgentPath string
// WrapperPath is InstallDir/rap-node-agent-run.cmd.
WrapperPath string
// StartupMode is the mode that ended up being used for the startup task.
StartupMode string
// TaskName is the scheduled task name for the node agent.
TaskName string
// HostAgentPath and UpdaterTaskName are filled in once the updater task is installed.
HostAgentPath string
UpdaterTaskName string
// Downloaded is true when the node-agent binary was downloaded and copied into place.
Downloaded bool
// Started and UpdaterStarted report the "started" value returned when the
// respective scheduled task was installed.
Started bool
UpdaterStarted bool
// AdminFallback marks that a per-user location or user-level task was used
// instead of the system-wide one.
AdminFallback bool
}
// WindowsManager installs and updates the node agent on Windows hosts.
type WindowsManager struct {
// Runner executes external commands (schtasks, powershell). When nil, the
// methods fall back to ExecRunner{}.
Runner CommandRunner
}
// WindowsInstallConfigFromProfile translates an install profile into a
// WindowsInstallConfig. Missing state and install directories default to a
// per-node directory under the Windows default roots, the startup mode defaults
// to "auto", and Replace and AutoUpdateEnabled are always switched on.
func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInstallConfig {
	defaultStateDir := filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(profile.NodeName))

	runtimeCfg := RuntimeConfig{
		BackendURL:                      profile.BackendURL,
		ClusterID:                       profile.ClusterID,
		JoinToken:                       profile.JoinToken,
		NodeName:                        profile.NodeName,
		StateDir:                        firstNonEmpty(profile.StateDir, defaultStateDir),
		WorkloadSupervisionEnabled:      profile.WorkloadSupervisionEnabled,
		MeshSyntheticRuntimeEnabled:     profile.MeshSyntheticRuntimeEnabled,
		MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
		MeshListenAddr:                  profile.MeshListenAddr,
		MeshListenPortMode:              profile.MeshListenPortMode,
		MeshListenAutoPortStart:         profile.MeshListenAutoPortStart,
		MeshListenAutoPortEnd:           profile.MeshListenAutoPortEnd,
		MeshAdvertiseEndpoint:           profile.MeshAdvertiseEndpoint,
		MeshAdvertiseEndpointsJSON:      string(profile.MeshAdvertiseEndpointsJSON),
		MeshAdvertiseTransport:          profile.MeshAdvertiseTransport,
		MeshConnectivityMode:            profile.MeshConnectivityMode,
		MeshNATType:                     profile.MeshNATType,
		MeshRegion:                      profile.MeshRegion,
		HeartbeatIntervalSeconds:        profile.HeartbeatIntervalSeconds,
		EnrollmentPollIntervalSeconds:   profile.EnrollmentPollIntervalSeconds,
		EnrollmentPollTimeoutSeconds:    profile.EnrollmentPollTimeoutSeconds,
		ProductionObservationSinkCap:    profile.ProductionObservationSinkCapacity,
	}

	defaultInstallDir := filepath.Join(DefaultWindowsInstallDir, safeUnitSlug(profile.NodeName))
	installCfg := WindowsInstallConfig{
		RuntimeConfig:     runtimeCfg,
		Replace:           true,
		AutoUpdateEnabled: true,
	}
	installCfg.InstallDir = firstNonEmpty(profile.InstallDir, defaultInstallDir)
	installCfg.StartupMode = firstNonEmpty(profile.StartupMode, "auto")
	installCfg.ArtifactURLs = binaryArtifactURLs(profile)
	installCfg.ArtifactSHA256 = binaryArtifactSHA256(profile)
	installCfg.ArtifactSizeBytes = binaryArtifactSizeBytes(profile)
	return installCfg
}
// Install provisions the node agent on a Windows host.
//
// It normalizes and validates cfg, computes the install/state paths and task
// name, and (unless cfg.DryRun is set) creates the directories, optionally
// downloads the node-agent binary, writes the environment and wrapper scripts,
// registers the startup task and finally installs the host-agent updater task.
// With DryRun the planned result is returned before any host changes; on
// non-Windows hosts a non-dry run fails.
//
// In "auto" startup mode an access-denied error while creating the install
// directory switches both install and state directories to per-user locations
// and sets AdminFallback on the result. The returned result carries whatever was
// determined up to the point of failure when an error is returned.
func (m WindowsManager) Install(ctx context.Context, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
	cfg.NodeID = strings.TrimSpace(cfg.NodeID)
	if strings.TrimSpace(cfg.RuntimeConfig.StateDir) == "" {
		cfg.RuntimeConfig.StateDir = filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(cfg.RuntimeConfig.NodeName))
	}
	cfg.RuntimeConfig.Replace = cfg.Replace
	cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
	if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
		return WindowsInstallResult{}, err
	}
	cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "auto"))
	noAdminPreferred := cfg.StartupMode == "user-task"
	cfg.InstallDir = firstNonEmpty(cfg.InstallDir, defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, noAdminPreferred))
	// A user-level install must not keep its state under the system-wide root.
	if noAdminPreferred && strings.HasPrefix(strings.ToLower(cfg.RuntimeConfig.StateDir), strings.ToLower(DefaultWindowsStateRoot)) {
		cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
	}
	result := WindowsInstallResult{
		NodeName:      cfg.RuntimeConfig.NodeName,
		InstallDir:    cfg.InstallDir,
		StateDir:      cfg.RuntimeConfig.StateDir,
		NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent.exe"),
		WrapperPath:   filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd"),
		StartupMode:   cfg.StartupMode,
		TaskName:      "RAP Node Agent " + safeUnitSlug(cfg.RuntimeConfig.NodeName),
	}
	if cfg.DryRun {
		return result, nil
	}
	if runtime.GOOS != "windows" {
		return result, fmt.Errorf("windows install is only supported on windows hosts")
	}
	if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
		if cfg.StartupMode != "auto" || !isAccessDenied(err) {
			return result, err
		}
		// No rights for the system-wide location: retry in the per-user one.
		cfg.InstallDir = defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, true)
		cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
		result.InstallDir = cfg.InstallDir
		result.StateDir = cfg.RuntimeConfig.StateDir
		result.NodeAgentPath = filepath.Join(cfg.InstallDir, "rap-node-agent.exe")
		result.WrapperPath = filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd")
		if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
			return result, err
		}
		result.AdminFallback = true
	}
	if err := os.MkdirAll(cfg.RuntimeConfig.StateDir, 0o700); err != nil {
		return result, err
	}
	if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
		// Download before stopping anything so a failed download does not leave a
		// previously running agent stopped.
		path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
		if err != nil {
			return result, err
		}
		defer os.Remove(path)
		m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
		if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
			// Stop the agent once more and retry the copy a single time; the first
			// error is the one reported if the retry fails as well.
			m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
			if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr != nil {
				return result, err
			}
		}
		result.Downloaded = true
	}
	if !fileExists(result.NodeAgentPath) {
		return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
	}
	envPath := filepath.Join(cfg.InstallDir, "rap-node-agent.env.cmd")
	if err := os.WriteFile(envPath, []byte(windowsEnvScript(cfg.RuntimeConfig)), 0o600); err != nil {
		return result, err
	}
	if err := os.WriteFile(result.WrapperPath, []byte(windowsWrapperScript(result.NodeAgentPath, envPath)), 0o755); err != nil {
		return result, err
	}
	logPath := filepath.Join(cfg.RuntimeConfig.StateDir, "rap-node-agent.log")
	started, fallback, mode, err := m.installStartupTask(ctx, result.TaskName, result.WrapperPath, logPath, cfg.StartupMode)
	if err != nil {
		return result, err
	}
	result.Started = started
	// Keep a fallback already recorded by the directory handling above instead of
	// overwriting it with the task-level outcome.
	result.AdminFallback = result.AdminFallback || fallback
	result.StartupMode = mode
	return installWindowsHostAgentUpdater(ctx, m, result, cfg)
}
// stopExistingNodeAgent makes a best-effort attempt to stop a running node
// agent: it ends the scheduled task taskName and then force-stops any
// rap-node-agent process whose executable path equals nodeAgentPath. Errors from
// both commands are ignored.
func (m WindowsManager) stopExistingNodeAgent(ctx context.Context, taskName, nodeAgentPath string) {
	var runner CommandRunner = ExecRunner{}
	if m.Runner != nil {
		runner = m.Runner
	}

	_, _ = runner.Run(ctx, "schtasks", "/End", "/TN", taskName)

	// Single quotes are doubled so the path stays one PowerShell string literal.
	quotedPath := `'` + strings.ReplaceAll(nodeAgentPath, `'`, `''`) + `'`
	script := `Get-Process rap-node-agent -ErrorAction SilentlyContinue | Where-Object { $_.Path -eq ` +
		quotedPath +
		` } | Stop-Process -Force -ErrorAction SilentlyContinue`
	_, _ = runner.Run(ctx, "powershell", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script)
}
// installStartupTask registers and starts a scheduled task that runs
// wrapperPath with its output appended to logPath.
//
// It returns (started, fallback, effectiveMode, err). Mode "none" does nothing.
// Modes "auto" and "system-task" first try an ONSTART task running as SYSTEM;
// if that fails, "system-task" returns the error while "auto" falls through to
// an ONLOGON user task. Any other mode goes straight to the user task. fallback
// is true only when "auto" ended up on the user-task path. The result of the
// subsequent `schtasks /Run` is ignored, so started reflects successful task
// creation rather than a confirmed launch.
func (m WindowsManager) installStartupTask(ctx context.Context, taskName, wrapperPath, logPath, mode string) (bool, bool, string, error) {
	if mode == "none" {
		return false, false, mode, nil
	}
	var runner CommandRunner = ExecRunner{}
	if m.Runner != nil {
		runner = m.Runner
	}
	action := windowsTaskAction(wrapperPath, logPath)
	autoMode := mode == "auto"

	if autoMode || mode == "system-task" {
		_, sysErr := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONSTART", "/RU", "SYSTEM", "/RL", "HIGHEST", "/TR", action, "/F")
		switch {
		case sysErr == nil:
			_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
			return true, false, "system-task", nil
		case !autoMode:
			return false, false, mode, sysErr
		}
	}

	_, userErr := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONLOGON", "/TR", action, "/F")
	if userErr != nil {
		return false, autoMode, "user-task", userErr
	}
	_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
	return true, autoMode, "user-task", nil
}
// windowsTaskAction returns the command line used as the scheduled task action:
// cmd.exe runs the quoted wrapper script and appends both stdout and stderr to
// the quoted log file.
func windowsTaskAction(wrapperPath, logPath string) string {
	quotedWrapper := `"` + wrapperPath + `"`
	quotedLog := `"` + logPath + `"`
	return `cmd.exe /c "` + quotedWrapper + ` >> ` + quotedLog + ` 2>&1"`
}
// windowsEnvScript renders a batch script that starts with "@echo off" and
// contains one "set KEY=VALUE" line per KEY=VALUE entry returned by
// NodeAgentEnv(cfg). Entries without "=" are skipped. Lines end with CRLF and
// values are written verbatim, without any batch escaping.
func windowsEnvScript(cfg RuntimeConfig) string {
	var script strings.Builder
	script.WriteString("@echo off")
	script.WriteString("\r\n")
	for _, entry := range NodeAgentEnv(cfg) {
		if key, value, ok := strings.Cut(entry, "="); ok {
			script.WriteString("set " + key + "=" + value)
			script.WriteString("\r\n")
		}
	}
	return script.String()
}
// windowsWrapperScript renders the batch wrapper that first calls the
// environment script at envPath and then runs the node agent at nodeAgentPath.
// Both paths are double-quoted and every line ends with CRLF.
func windowsWrapperScript(nodeAgentPath, envPath string) string {
	const eol = "\r\n"
	callEnv := `call "` + envPath + `"`
	runAgent := `"` + nodeAgentPath + `"`
	return strings.Join([]string{"@echo off", callEnv, runAgent, ""}, eol)
}
// binaryArtifactURLs returns the download URLs for the node-agent binary
// described by profile. Explicit artifact URLs win and are returned as a copy.
// Otherwise one URL per non-blank artifact endpoint is built by joining the
// endpoint (trailing slashes removed) with the artifact file name (leading
// slashes removed). It returns nil when there is no artifact or no file name.
func binaryArtifactURLs(profile WindowsInstallProfile) []string {
	artifact := profile.NodeAgentArtifact
	if artifact == nil {
		return nil
	}
	if len(artifact.URLs) > 0 {
		return append([]string(nil), artifact.URLs...)
	}
	trimmedName := strings.TrimSpace(artifact.FileName)
	if trimmedName == "" {
		return nil
	}
	fileName := strings.TrimLeft(trimmedName, "/")
	urls := []string{}
	for _, endpoint := range profile.ArtifactEndpoints {
		base := strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if base == "" {
			continue
		}
		urls = append(urls, base+"/"+fileName)
	}
	return urls
}
// binaryArtifactSHA256 returns the whitespace-trimmed SHA-256 value of the
// profile's node-agent artifact, or "" when the profile has no artifact.
func binaryArtifactSHA256(profile WindowsInstallProfile) string {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return strings.TrimSpace(artifact.SHA256)
	}
	return ""
}
// binaryArtifactSizeBytes returns the size in bytes recorded for the profile's
// node-agent artifact, or 0 when the profile has no artifact.
func binaryArtifactSizeBytes(profile WindowsInstallProfile) int64 {
	if artifact := profile.NodeAgentArtifact; artifact != nil {
		return artifact.SizeBytes
	}
	return 0
}
// fileExists reports whether os.Stat succeeds for path. Any stat error,
// including ones other than "not found", counts as the file not existing.
func fileExists(path string) bool {
	if _, statErr := os.Stat(path); statErr != nil {
		return false
	}
	return true
}
// copyFile copies source to target via a staging file named target+".tmp".
// The target's parent directory is created if needed, the staging file is
// created (or truncated) with the given mode, and replaceFile moves it over
// target once the copy has been written and closed. The staging file is removed
// whenever copying, closing or replacing fails.
func copyFile(source, target string, mode os.FileMode) error {
	in, err := os.Open(source)
	if err != nil {
		return err
	}
	defer in.Close()

	if err = os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		return err
	}

	staging := target + ".tmp"
	out, err := os.OpenFile(staging, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
	if err != nil {
		return err
	}

	if _, err = io.Copy(out, in); err != nil {
		// The copy error is the one reported; the close result is secondary.
		_ = out.Close()
	} else {
		err = out.Close()
	}
	if err == nil {
		err = replaceFile(staging, target)
	}
	if err != nil {
		_ = os.Remove(staging)
		return err
	}
	return nil
}
// replaceFile moves tmp over target. Off Windows this is a single rename. On
// Windows any existing target is first renamed to target+".bak" (after removing
// a stale backup), tmp is renamed into place, and the backup is deleted; if the
// final rename fails the backup is renamed back to target before the error is
// returned.
func replaceFile(tmp, target string) error {
	if runtime.GOOS != "windows" {
		return os.Rename(tmp, target)
	}

	backup := target + ".bak"
	_ = os.Remove(backup)

	if fileExists(target) {
		if moveErr := os.Rename(target, backup); moveErr != nil {
			return moveErr
		}
	}

	swapErr := os.Rename(tmp, target)
	if swapErr == nil {
		_ = os.Remove(backup)
		return nil
	}
	// Put the previous file back so the target is not left missing.
	if fileExists(backup) {
		_ = os.Rename(backup, target)
	}
	return swapErr
}
// defaultWindowsInstallDir returns the default install directory for nodeName.
// In user mode it prefers %LOCALAPPDATA%\RAP\<slug>, then
// %USERPROFILE%\AppData\Local\RAP\<slug>; otherwise, or when neither variable
// is set, it returns <DefaultWindowsInstallDir>\<slug>.
func defaultWindowsInstallDir(nodeName string, userMode bool) string {
	slug := safeUnitSlug(nodeName)
	if !userMode {
		return filepath.Join(DefaultWindowsInstallDir, slug)
	}
	if localAppData := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); localAppData != "" {
		return filepath.Join(localAppData, "RAP", slug)
	}
	if profileDir := strings.TrimSpace(os.Getenv("USERPROFILE")); profileDir != "" {
		return filepath.Join(profileDir, "AppData", "Local", "RAP", slug)
	}
	return filepath.Join(DefaultWindowsInstallDir, slug)
}
// defaultWindowsStateDir returns the default state directory for nodeName.
// In user mode it prefers %LOCALAPPDATA%\RAP\nodes\<slug>, then
// %USERPROFILE%\AppData\Local\RAP\nodes\<slug>; otherwise, or when neither
// variable is set, it returns <DefaultWindowsStateRoot>\<slug>.
func defaultWindowsStateDir(nodeName string, userMode bool) string {
	slug := safeUnitSlug(nodeName)
	if !userMode {
		return filepath.Join(DefaultWindowsStateRoot, slug)
	}
	if localAppData := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); localAppData != "" {
		return filepath.Join(localAppData, "RAP", "nodes", slug)
	}
	if profileDir := strings.TrimSpace(os.Getenv("USERPROFILE")); profileDir != "" {
		return filepath.Join(profileDir, "AppData", "Local", "RAP", "nodes", slug)
	}
	return filepath.Join(DefaultWindowsStateRoot, slug)
}
// isAccessDenied reports whether err indicates missing permissions.
//
// The typed check via os.IsPermission comes first because it does not depend on
// the wording of the error message, which the operating system may localize.
// The message-text match is kept as a fallback for errors that only carry the
// information in their text.
func isAccessDenied(err error) bool {
	if err == nil {
		return false
	}
	if os.IsPermission(err) {
		return true
	}
	message := strings.ToLower(err.Error())
	for _, marker := range []string{"access is denied", "permission denied", "operation not permitted"} {
		if strings.Contains(message, marker) {
			return true
		}
	}
	return false
}
@@ -0,0 +1,337 @@
package hostagent
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
// ApplyUpdate fetches the update plan for the node agent and, when the plan's
// action is "update", downloads the artifact, swaps the binary at
// req.BinaryPath and restarts the scheduled task req.WindowsTaskName.
//
// An empty or default InstallType is replaced by WindowsUpdateInstallType, and
// OS/Arch default to "windows"/"amd64". Plans that enable production forwarding
// (unless req.AllowProductionMesh), carry no artifact, or carry an artifact of
// another install type are rejected in a "preflight" phase. With req.DryRun no
// status is reported and nothing is changed. Status reports and the final update
// state write are best-effort: their errors are ignored.
//
// The returned UpdateResult reuses ContainerName for the task name and NewImage
// for the binary path. On error the result reflects progress so far.
func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
	if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
		req.InstallType = WindowsUpdateInstallType
	}
	req.OS = firstNonEmpty(req.OS, "windows")
	req.Arch = firstNonEmpty(req.Arch, "amd64")
	req = req.Normalize()
	var err error
	req, err = resolveUpdateRequest(req)
	if err != nil {
		return UpdateResult{}, err
	}
	runner := m.Runner
	if runner == nil {
		runner = ExecRunner{}
	}
	plan, err := FetchNodeUpdatePlan(ctx, req)
	if err != nil {
		return UpdateResult{}, err
	}
	// Adopt the plan's health window only while the timeout still has the value
	// 30s; any other value is left untouched.
	if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
		req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
	}
	result := UpdateResult{
		Action:        plan.Action,
		Reason:        plan.Reason,
		TargetVersion: plan.TargetVersion,
		ContainerName: req.WindowsTaskName,
		NewImage:      req.BinaryPath,
	}
	if plan.Action != "update" {
		if !req.DryRun {
			status := statusFromNoopPlan(req, plan)
			if status.Payload == nil {
				status.Payload = map[string]any{}
			}
			status.Payload["task"] = req.WindowsTaskName
			status.Payload["binary_path"] = req.BinaryPath
			_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
		}
		return result, nil
	}
	if plan.ProductionForwarding && !req.AllowProductionMesh {
		err := errors.New("refusing update plan with production forwarding enabled")
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if plan.Artifact == nil {
		err := errors.New("update plan has no artifact")
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType {
		err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if req.DryRun {
		return result, nil
	}
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "planned",
		Status:         "accepted",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
	})
	urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "download",
		Status:         "started",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath},
	})
	path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
	if err != nil {
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
		return result, err
	}
	defer os.Remove(path)
	m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
	if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
		// Stop the agent once more and retry the copy a single time.
		m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
		if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil {
			// The agent was stopped for the swap. Start the task again on a
			// best-effort basis so a failed update does not leave the node down.
			_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName)
			_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
			return result, err
		}
	}
	result.Replaced = true
	if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
		return result, err
	}
	// No separate health probe runs here: a successful task start is what gets
	// reported as the health_check phase.
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "health_check",
		Status:         "succeeded",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"task": req.WindowsTaskName, "binary_path": req.BinaryPath},
	})
	_ = saveUpdateState(req.StateDir, UpdateState{
		Product:        req.Product,
		CurrentVersion: plan.TargetVersion,
		TargetVersion:  plan.TargetVersion,
		Image:          req.BinaryPath,
		UpdatedAt:      time.Now().UTC(),
	})
	return result, nil
}
// RunUpdateLoop repeatedly applies node-agent updates (and, when enabled,
// host-agent updates) until ctx ends, cfg.MaxRuns runs have completed, or an
// error occurs with cfg.StopOnError set.
//
// The request gets the same defaults as ApplyUpdate (Windows install type,
// "windows"/"amd64") and is validated up front. A zero Interval becomes one
// hour; negative Interval or InitialDelay and a Jitter outside [0, 1] are
// rejected. cfg.Logf receives one line per step and may be nil.
func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
req.InstallType = WindowsUpdateInstallType
}
req.OS = firstNonEmpty(req.OS, "windows")
req.Arch = firstNonEmpty(req.Arch, "amd64")
req = req.Normalize()
if err := req.Validate(); err != nil {
return err
}
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
if cfg.Interval < 0 {
return errors.New("update loop interval must not be negative")
}
if cfg.InitialDelay < 0 {
return errors.New("update loop initial delay must not be negative")
}
if cfg.Jitter < 0 || cfg.Jitter > 1 {
return errors.New("update loop jitter must be between 0 and 1")
}
logf := cfg.Logf
if logf == nil {
// Fall back to a no-op logger so the loop body can log unconditionally.
logf = func(string, ...any) {}
}
if cfg.InitialDelay > 0 {
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
return err
}
}
runs := 0
// NOTE(review): presumably tracks an on-disk trigger counter so the sleep at the
// end of each iteration can be cut short — confirm against
// sleepUntilUpdateIntervalOrTrigger.
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
for {
runs++
result, err := m.ApplyUpdate(ctx, req)
if err != nil {
if errors.Is(err, ErrNodeIdentityNotReady) {
// Not treated as a failure: log, wait one interval and retry. This path
// skips the host-agent step below and ignores StopOnError.
logf("windows_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
return err
}
continue
}
logf("windows_update_loop run=%d status=failed error=%v", runs, err)
if cfg.StopOnError {
return err
}
} else {
logf("windows_update_loop run=%d action=%s reason=%s target=%s task=%s replaced=%t",
runs,
result.Action,
result.Reason,
result.TargetVersion,
result.ContainerName,
result.Replaced,
)
// After a successful update, later iterations report the new version.
if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
req.CurrentVersion = result.TargetVersion
}
}
if cfg.HostAgentUpdateEnabled {
// Host-agent request fields left empty inherit the node-agent request's values.
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
hostReq.OS = firstNonEmpty(hostReq.OS, "windows")
hostReq.Arch = firstNonEmpty(hostReq.Arch, "amd64")
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, "windows_binary")
// The host-agent update goes through a zero-value DockerManager, not through m.
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
if hostErr != nil {
if errors.Is(hostErr, ErrNodeIdentityNotReady) {
logf("windows_host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
} else {
logf("windows_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
if cfg.StopOnError {
return hostErr
}
}
} else {
logf("windows_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
runs,
hostResult.Action,
hostResult.Reason,
hostResult.TargetVersion,
hostResult.NewImage,
hostResult.Replaced,
hostResult.RestartNeeded,
)
// Written back to cfg (not hostReq) so the next iteration's copy starts
// from the updated version.
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
cfg.HostAgentUpdateRequest.CurrentVersion = hostResult.TargetVersion
}
}
}
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
return err
}
}
}
// installWindowsHostAgentUpdater installs the host-agent binary and a scheduled
// task that runs the generated update-loop script, and records the outcome on
// result. It is a no-op when auto-update is disabled or the startup mode is
// "none". The reported current version is forced to "0.0.0" when none is
// configured or when a replace was requested but nothing was downloaded.
func installWindowsHostAgentUpdater(ctx context.Context, m WindowsManager, result WindowsInstallResult, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
	updaterWanted := cfg.AutoUpdateEnabled && !strings.EqualFold(result.StartupMode, "none")
	if !updaterWanted {
		return result, nil
	}

	replaceWithoutDownload := cfg.Replace && !result.Downloaded
	if cfg.AutoUpdateCurrentVersion == "" || replaceWithoutDownload {
		cfg.AutoUpdateCurrentVersion = "0.0.0"
	}

	agentBinary := filepath.Join(result.InstallDir, "rap-host-agent.exe")
	if err := installHostAgentBinary(cfg.HostAgentSourcePath, agentBinary); err != nil {
		return result, err
	}

	updaterScript := filepath.Join(result.InstallDir, "rap-host-agent-update.cmd")
	updaterLog := filepath.Join(result.StateDir, "rap-host-agent-update.log")
	updaterTask := "RAP Host Agent Updater " + safeUnitSlug(result.NodeName)

	contents := []byte(windowsHostAgentUpdateScript(agentBinary, cfg, result))
	if err := os.WriteFile(updaterScript, contents, 0o755); err != nil {
		return result, err
	}

	started, fallback, mode, err := m.installStartupTask(ctx, updaterTask, updaterScript, updaterLog, cfg.StartupMode)
	if err != nil {
		return result, err
	}

	result.HostAgentPath = agentBinary
	result.UpdaterTaskName = updaterTask
	result.UpdaterStarted = started
	// A fallback is only ever added here, never cleared.
	result.AdminFallback = result.AdminFallback || fallback
	if mode != "" {
		result.StartupMode = mode
	}
	return result, nil
}
// windowsHostAgentUpdateScript renders the batch script run by the updater
// scheduled task. After an optional initial delay the script loops forever:
// it promotes a staged "<hostAgentPath>.next" binary over the host agent if one
// exists, runs the host agent's update-loop command, sleeps for the configured
// interval and starts over.
//
// Zero values in cfg select defaults: interval 21600s, initial delay 15s and
// health timeout 30s; an empty current version becomes "0.0.0". The same current
// version is passed for both the node agent and the host agent. --node-id and
// --channel are appended only when set. Lines end with CRLF.
func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig, result WindowsInstallResult) string {
	currentVersion := firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0")
	interval := cfg.AutoUpdateIntervalSeconds
	if interval == 0 {
		interval = 21600
	}
	initialDelay := cfg.AutoUpdateInitialDelaySeconds
	if initialDelay == 0 {
		initialDelay = 15
	}
	healthTimeout := cfg.AutoUpdateHealthTimeoutSeconds
	if healthTimeout == 0 {
		healthTimeout = 30
	}
	updateLoopArgs := []string{
		`"` + hostAgentPath + `"`,
		"update-loop",
		"--backend-url", `"` + cfg.RuntimeConfig.BackendURL + `"`,
		"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
		"--state-dir", `"` + result.StateDir + `"`,
		"--current-version", currentVersion,
		"--os", "windows",
		"--arch", "amd64",
		"--install-type", WindowsUpdateInstallType,
		"--binary-path", `"` + result.NodeAgentPath + `"`,
		"--windows-task-name", `"` + result.TaskName + `"`,
		"--health-timeout-seconds", fmt.Sprintf("%d", healthTimeout),
		"--interval-seconds", fmt.Sprintf("%d", interval),
		// The initial delay is handled by the script's own timeout below.
		"--initial-delay-seconds", "0",
		"--host-agent-update-status-enabled",
		"--host-agent-current-version", currentVersion,
		"--host-agent-binary-path", `"` + hostAgentPath + `"`,
	}
	if strings.TrimSpace(cfg.NodeID) != "" {
		updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
	}
	if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
		updateLoopArgs = append(updateLoopArgs, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
	}
	lines := []string{
		"@echo off",
		"setlocal",
		"set RAP_HOST_AGENT=" + `"` + hostAgentPath + `"`,
		"set RAP_HOST_AGENT_NEXT=" + `"` + hostAgentPath + `.next"`,
	}
	if initialDelay > 0 {
		lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
	}
	lines = append(lines, []string{
		":loop",
		"if exist %RAP_HOST_AGENT_NEXT% (",
		"  copy /Y %RAP_HOST_AGENT_NEXT% %RAP_HOST_AGENT% >NUL",
		// %ERRORLEVEL% would be expanded when the parenthesized block is parsed,
		// i.e. before copy runs. "if not errorlevel 1" is evaluated at run time,
		// so the staged file is deleted only after a successful copy.
		"  if not errorlevel 1 del /F /Q %RAP_HOST_AGENT_NEXT%",
		")",
		strings.Join(updateLoopArgs, " "),
		"timeout /t " + fmt.Sprintf("%d", interval) + " /nobreak >NUL",
		"goto loop",
		// The two lines below follow an unconditional goto and never execute; they
		// are kept so the script text still records the initial delay.
		"endlocal",
		"rem initial-delay-seconds " + fmt.Sprintf("%d", initialDelay),
	}...)
	return strings.Join(lines, "\r\n") + "\r\n"
}
@@ -63,10 +63,12 @@ const (
ProductionChannelVPNPacket = "vpn_packet"
ProductionMessageVPNPacketBatch = "vpn.packet_batch"
FabricServiceClassVPNPackets = "vpn_packets"
FabricServiceClassRemoteWorkspace = "remote_workspace"
FabricServiceChannelBulk = "bulk"
FabricServiceChannelControl = "control"
FabricServiceChannelInteractive = "interactive"
FabricServiceChannelReliable = "reliable"
FabricServiceChannelDroppable = "droppable"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
@@ -59,9 +59,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
reasons := []string{"base"}
switch candidate.Transport {
case "direct_tcp_tls":
case "direct_tcp_tls", "direct_http", "direct_https":
score += 35
reasons = append(reasons, "transport:direct_tcp_tls")
reasons = append(reasons, "transport:direct")
case "wss":
score += 25
reasons = append(reasons, "transport:wss")
@@ -37,27 +37,28 @@ type PeerCacheSnapshot struct {
}
type PeerCacheEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
}
type peerCacheBuildEntry struct {
@@ -117,6 +118,10 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
MaxVerificationAge: time.Hour,
})
if len(scored) > 0 {
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
for _, scoredCandidate := range scored {
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
}
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
@@ -66,24 +66,44 @@ type PeerConnectionManagerSnapshot struct {
}
type PeerConnectionProbeResult struct {
NodeID string `json:"node_id"`
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
NodeID string `json:"node_id"`
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
SelectedCandidateID string `json:"selected_candidate_id,omitempty"`
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
CandidateResults []PeerConnectionCandidateProbeResult `json:"candidate_results,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
// PeerConnectionCandidateProbeResult records the outcome of probing a single
// endpoint candidate while trying to reach one peer.
type PeerConnectionCandidateProbeResult struct {
// CandidateID identifies the probed endpoint candidate; omitted when empty.
CandidateID string `json:"candidate_id,omitempty"`
// Endpoint is the address that was probed.
Endpoint string `json:"endpoint"`
// Transport is the transport label of the probed candidate; omitted when empty.
Transport string `json:"transport,omitempty"`
// LinkStatus is the probe verdict for this candidate.
LinkStatus string `json:"link_status"`
// LatencyMs is the measured probe latency in milliseconds; omitted when zero.
LatencyMs int `json:"latency_ms,omitempty"`
// FailureReason carries the probe error text; omitted when empty.
FailureReason string `json:"failure_reason,omitempty"`
// StartedAt and CompletedAt bracket this candidate's probe attempt.
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
// peerConnectionProbeTarget is one endpoint to try during a probe: the
// candidate it came from, its address, and its transport label.
type peerConnectionProbeTarget struct {
CandidateID string
Endpoint string
Transport string
}
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
@@ -137,6 +157,10 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
RendezvousLeases: rendezvousLeases,
Now: startedAt,
})
entriesByNode := map[string]PeerCacheEntry{}
for _, entry := range peerSnapshot.Entries {
entriesByNode[entry.NodeID] = entry
}
cycle := PeerConnectionManagerCycle{
Mode: recoveryPlan.Mode,
StartedAt: startedAt,
@@ -150,7 +174,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
}
for _, intent := range intentPlan.Intents {
result := m.probeIntent(ctx, intent)
result := m.probeIntent(ctx, intent, entriesByNode[intent.NodeID])
cycle.Results = append(cycle.Results, result)
switch result.LinkStatus {
case PeerConnectionProbeReachable:
@@ -200,7 +224,7 @@ func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvo
return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...)
}
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent, cacheEntry PeerCacheEntry) PeerConnectionProbeResult {
startedAt := normalizedNow(m.now())
result := PeerConnectionProbeResult{
NodeID: intent.NodeID,
@@ -254,9 +278,6 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
result.CompletedAt = normalizedNow(m.now())
return result
}
m.tracker.BeginProbe(peer, startedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
defer cancel()
target := PeerIdentity{
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
@@ -264,30 +285,118 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
if intent.RelayCandidate && intent.RelayNodeID != "" {
target.NodeID = intent.RelayNodeID
}
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
completedAt := normalizedNow(m.now())
if err != nil {
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = err.Error()
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
targets := []peerConnectionProbeTarget{{
CandidateID: intent.BestCandidateID,
Endpoint: intent.Endpoint,
Transport: intent.Transport,
}}
if intent.DirectCandidate {
targets = peerConnectionProbeTargets(intent, cacheEntry)
}
var lastFailure string
for _, probeTarget := range targets {
probePeer := peer
probePeer.Endpoint = strings.TrimRight(strings.TrimSpace(probeTarget.Endpoint), "/")
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
probePeer.BestCandidateAddr = probePeer.Endpoint
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
if probePeer.Endpoint == "" {
continue
}
candidateStartedAt := normalizedNow(m.now())
m.tracker.BeginProbe(probePeer, candidateStartedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
cancel()
completedAt := normalizedNow(m.now())
candidateResult := PeerConnectionCandidateProbeResult{
CandidateID: probePeer.BestCandidateID,
Endpoint: probePeer.Endpoint,
Transport: probePeer.BestTransport,
StartedAt: candidateStartedAt,
CompletedAt: completedAt,
}
if err != nil {
lastFailure = err.Error()
candidateResult.LinkStatus = PeerConnectionProbeUnreachable
candidateResult.FailureReason = lastFailure
result.CandidateResults = append(result.CandidateResults, candidateResult)
continue
}
latency := int(completedAt.Sub(candidateStartedAt).Milliseconds())
if latency < 0 {
latency = 0
}
candidateResult.LinkStatus = PeerConnectionProbeReachable
candidateResult.LatencyMs = latency
result.CandidateResults = append(result.CandidateResults, candidateResult)
result.LinkStatus = PeerConnectionProbeReachable
result.Endpoint = probePeer.Endpoint
result.SelectedCandidateID = probePeer.BestCandidateID
result.SelectedEndpoint = probePeer.Endpoint
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
}
result.CompletedAt = completedAt
return result
}
latency := int(completedAt.Sub(startedAt).Milliseconds())
if latency < 0 {
latency = 0
}
result.LinkStatus = PeerConnectionProbeReachable
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
completedAt := normalizedNow(m.now())
if lastFailure == "" {
lastFailure = "no_probe_endpoint_available"
}
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = lastFailure
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, lastFailure, completedAt)
result.CompletedAt = completedAt
return result
}
// peerConnectionProbeTargets builds the ordered, de-duplicated list of
// endpoints to try when probing a peer directly.
//
// Cached endpoint candidates accepted by candidateUsableForDirectProbe come
// first, in cache order. The intent's own endpoint is appended last so it is
// still attempted when it is not among the cached candidates. Endpoints are
// trimmed of surrounding whitespace and trailing slashes; empty endpoints are
// skipped.
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
	capacity := len(cacheEntry.EndpointCandidates) + 1
	seen := make(map[string]struct{}, capacity)
	out := make([]peerConnectionProbeTarget, 0, capacity)
	add := func(candidateID, endpoint, transport string) {
		// Normalize both parts BEFORE building the dedupe key: the stored
		// target uses the trimmed values, so keying on the raw candidate ID
		// would let IDs that differ only in whitespace produce duplicates.
		candidateID = strings.TrimSpace(candidateID)
		endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
		if endpoint == "" {
			return
		}
		key := candidateID + "|" + endpoint
		if _, ok := seen[key]; ok {
			return
		}
		seen[key] = struct{}{}
		out = append(out, peerConnectionProbeTarget{
			CandidateID: candidateID,
			Endpoint:    endpoint,
			Transport:   strings.TrimSpace(transport),
		})
	}
	for _, candidate := range cacheEntry.EndpointCandidates {
		if !candidateUsableForDirectProbe(candidate) {
			continue
		}
		add(candidate.EndpointID, candidate.Address, candidate.Transport)
	}
	add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
	return out
}
// candidateUsableForDirectProbe reports whether an endpoint candidate can be
// dialed directly. Candidates with an empty address, a relay:// or outbound://
// address, or a connectivity/reachability label that marks them as
// outbound-only or relay-bound are rejected. Of the rest, a candidate is
// accepted when its transport is empty, "wss", or contains "direct", or when
// its address is an http:// or https:// URL.
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
	addr := strings.TrimSpace(candidate.Address)
	switch {
	case addr == "":
		return false
	case strings.HasPrefix(addr, "relay://"), strings.HasPrefix(addr, "outbound://"):
		return false
	}

	// Labels are compared case-insensitively and ignoring surrounding spaces.
	label := func(raw string) string {
		return strings.ToLower(strings.TrimSpace(raw))
	}
	switch label(candidate.ConnectivityMode) {
	case "outbound_only", "relay_required":
		return false
	}
	switch label(candidate.Reachability) {
	case "outbound_only", "relay":
		return false
	}

	transport := label(candidate.Transport)
	switch {
	case transport == "", transport == "wss":
		return true
	case strings.Contains(transport, "direct"):
		return true
	}
	return strings.HasPrefix(addr, "http://") || strings.HasPrefix(addr, "https://")
}
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
snapshot := m.tracker.Snapshot()
for _, entry := range snapshot.Entries {
@@ -188,3 +188,71 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
}
}
// TestPeerConnectionManagerFallsBackAcrossEndpointCandidates checks that one
// probe cycle moves on to the next endpoint candidate when the first one
// fails, reports a per-candidate trail, and leaves the tracker pointing at the
// candidate that answered.
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
current := now
// Live peer endpoint for node-b.
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
// Candidate that is expected not to answer (nothing should be listening on port 1).
EndpointID: "node-b-dead",
NodeID: "node-b",
Transport: "direct_http",
Address: "http://127.0.0.1:1",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
},
{
// Candidate backed by the httptest server above.
EndpointID: "node-b-live",
NodeID: "node-b",
Transport: "direct_http",
Address: server.URL,
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 2,
},
},
},
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
ProbeTimeout: 100 * time.Millisecond,
// Fake clock: every call advances by 10ms so timestamps are deterministic.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
// One peer attempted and it must count as a success overall.
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Failed != 0 || len(cycle.Results) != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
t.Fatalf("fallback did not select live candidate: %+v", result)
}
// The trail must show the dead candidate failing first, then the live one succeeding.
if len(result.CandidateResults) != 2 ||
result.CandidateResults[0].LinkStatus != PeerConnectionProbeUnreachable ||
result.CandidateResults[1].LinkStatus != PeerConnectionProbeReachable {
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
}
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
}
}
@@ -138,6 +138,32 @@ func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now
return entry
}
// RecordSuccessForPeer records a successful probe for the given peer entry
// and returns the updated connection state. The state is looked up through
// t.entry using the supplied peer, failure bookkeeping is cleared, and the
// state becomes PeerConnectionReady, or PeerConnectionDegraded when the
// latency is 500ms or more. A nil tracker yields the zero state.
func (t *PeerConnectionTracker) RecordSuccessForPeer(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	now = normalizedNow(now)
	state := t.entry(peer, now)

	// A success wipes any failure/backoff history.
	state.ConsecutiveFailures = 0
	state.LastFailureReason = ""
	state.BackoffUntil = time.Time{}

	state.ConsecutiveSuccesses++
	state.LastLatencyMs = latencyMs
	state.LastProbeAt = now

	target := PeerConnectionDegraded
	if latencyMs < 500 {
		target = PeerConnectionReady
	}
	// Only stamp a transition when the state actually changes.
	if state.State != target {
		state.State = target
		state.LastTransitionAt = now
	}

	t.entries[peer.NodeID] = state
	return state
}
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
if t == nil {
return PeerConnectionState{}
@@ -34,12 +34,20 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
return err
}
}
if envelope.ChannelClass != ProductionChannelFabricControl {
maxPayloadBytes := MaxProductionEnvelopePayloadBytes
switch envelope.ChannelClass {
case ProductionChannelFabricControl:
if envelope.MessageType != ProductionMessageFabricControl {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
case ProductionChannelVPNPacket:
if envelope.MessageType != ProductionMessageVPNPacketBatch {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
maxPayloadBytes = MaxProductionVPNPacketPayloadBytes
default:
return ErrUnauthorizedChannel
}
if envelope.MessageType != ProductionMessageFabricControl {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
if envelope.TTL <= 0 {
return ErrTTLExhausted
}
@@ -58,8 +66,8 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
if envelope.PayloadLength != len(envelope.Payload) {
return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes {
return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
if envelope.PayloadLength > maxPayloadBytes {
return fmt.Errorf("%w: payload exceeds channel limit", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadHash == "" {
return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
@@ -22,7 +22,7 @@ func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope Producti
if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) {
return ErrRouteExpired
}
if !contains(route.AllowedChannels, ProductionChannelFabricControl) {
if !contains(route.AllowedChannels, envelope.ChannelClass) {
return ErrUnauthorizedChannel
}
path := routePath(route)
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -2,6 +2,8 @@ package supervisor
import (
"context"
"strings"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
)
@@ -17,24 +19,146 @@ type StubSupervisor struct {
func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) {
statuses := make([]client.WorkloadStatusRequest, 0, len(desired))
for _, workload := range desired {
state := "degraded"
if workload.DesiredState == "disabled" {
state = "stopped"
}
version := workload.Version
if version == "" {
version = s.Version
}
statuses = append(statuses, client.WorkloadStatusRequest{
ReportedState: state,
RuntimeMode: workload.RuntimeMode,
Version: version,
StatusPayload: map[string]any{
"supervisor": "stub",
"desired_state": workload.DesiredState,
"service_type": workload.ServiceType,
},
})
statuses = append(statuses, s.applyOne(workload))
}
return statuses, nil
}
// applyOne derives the status reported for a single desired workload. The
// result is computed from the workload's service type, desired state, runtime
// mode and config only:
//
//   - anything not "enabled" is reported as "stopped";
//   - core-mesh and mesh-listener are reported as builtin, running services;
//   - native synthetic.echo is reported as a builtin test service;
//   - native rdp-worker with adapter_contract_probe set is reported as a
//     running contract probe;
//   - everything else is "degraded" with traffic blocked.
//
// An empty desired state defaults to "disabled", an empty runtime mode to
// "native", and an empty version to the supervisor's own version.
func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.WorkloadStatusRequest {
	serviceType := strings.TrimSpace(workload.ServiceType)

	desiredState := strings.TrimSpace(strings.ToLower(workload.DesiredState))
	if desiredState == "" {
		desiredState = "disabled"
	}

	runtimeMode := strings.TrimSpace(strings.ToLower(workload.RuntimeMode))
	if runtimeMode == "" {
		runtimeMode = "native"
	}

	version := strings.TrimSpace(workload.Version)
	if version == "" {
		version = s.Version
	}

	payload := map[string]any{
		"schema_version": "rap.node_agent.workload_supervision.v1",
		"supervisor":     "node-agent-local",
		"desired_state":  desiredState,
		"service_type":   serviceType,
		"runtime_mode":   runtimeMode,
		"observed_at":    time.Now().UTC().Format(time.RFC3339Nano),
	}

	// report stamps the reason and wraps the shared payload into a status.
	report := func(reportedState, reason string) client.WorkloadStatusRequest {
		payload["reason"] = reason
		return client.WorkloadStatusRequest{
			ReportedState: reportedState,
			RuntimeMode:   runtimeMode,
			Version:       version,
			StatusPayload: payload,
		}
	}

	isNative := runtimeMode == "native"
	switch {
	case desiredState != "enabled":
		return report("stopped", "desired_state_not_enabled")

	case serviceType == "core-mesh" || serviceType == "mesh-listener":
		payload["execution_mode"] = "builtin"
		payload["traffic"] = serviceTrafficMode(serviceType)
		return report("running", "builtin_node_agent_service_ready")

	case serviceType == "synthetic.echo" && isNative:
		payload["execution_mode"] = "builtin"
		payload["traffic"] = "test_service_only"
		return report("running", "internal_synthetic_echo_ready")

	case serviceType == "rdp-worker" && isNative && boolConfig(workload.Config, "adapter_contract_probe"):
		payload["execution_mode"] = "contract_probe"
		payload["service_class"] = "remote_workspace"
		payload["fabric_service_channel_required"] = true
		payload["backend_relay_steady_state"] = false
		payload["channels"] = remoteWorkspaceAdapterChannels()
		payload["frame_batch_contract"] = remoteWorkspaceFrameBatchContract()
		payload["traffic"] = "none"
		return report("running", "remote_workspace_adapter_contract_probe_ready")

	default:
		payload["traffic"] = "blocked"
		return report("degraded", "service_runtime_not_implemented")
	}
}
// boolConfig reads key from values as a boolean flag. A bool value is
// returned as-is; a string value counts as true only when it equals "true"
// ignoring case and surrounding whitespace. A nil map, a missing key, or any
// other value type yields false.
func boolConfig(values map[string]any, key string) bool {
	// Indexing a nil map is safe and reports the key as absent.
	raw, present := values[key]
	if !present {
		return false
	}
	if flag, isBool := raw.(bool); isBool {
		return flag
	}
	if text, isString := raw.(string); isString {
		return strings.EqualFold(strings.TrimSpace(text), "true")
	}
	return false
}
// remoteWorkspaceAdapterChannels returns the channel descriptors advertised
// for the remote-workspace adapter contract, in a fixed order. Every call
// returns freshly allocated maps, and every channel reports
// may_block_input=false.
func remoteWorkspaceAdapterChannels() []map[string]any {
	type channelSpec struct {
		name, direction, reliability, priority string
		droppable                              bool
	}
	// NOTE(review): "input" is reliable_ordered yet marked droppable, unlike
	// the other reliable channels; value kept as-is — confirm this is intended.
	specs := []channelSpec{
		{"input", "client_to_adapter", "reliable_ordered", "critical", true},
		{"control", "bidirectional", "reliable_ordered", "high", false},
		{"display", "adapter_to_client", "droppable_latest", "high", true},
		{"cursor", "adapter_to_client", "droppable_latest", "high", true},
		{"clipboard", "bidirectional", "reliable_ordered", "medium", false},
		{"file_transfer", "bidirectional", "reliable_chunked", "medium", false},
		{"audio", "adapter_to_client", "adaptive_droppable", "medium", true},
		{"device", "bidirectional", "reliable_ordered", "medium", false},
		{"telemetry", "adapter_to_client", "sampled_droppable", "low", true},
	}
	channels := make([]map[string]any, 0, len(specs))
	for _, spec := range specs {
		channels = append(channels, map[string]any{
			"name":            spec.name,
			"direction":       spec.direction,
			"reliability":     spec.reliability,
			"priority":        spec.priority,
			"droppable":       spec.droppable,
			"may_block_input": false,
		})
	}
	return channels
}
// remoteWorkspaceFrameBatchContract returns the frame-batch contract
// descriptor reported alongside the rdp-worker adapter contract probe. It is
// marked probe-only with payload forwarding "not_implemented", and embeds the
// channel list from remoteWorkspaceAdapterChannels.
func remoteWorkspaceFrameBatchContract() map[string]any {
	flowClasses := []string{"control", "interactive", "reliable", "bulk", "droppable"}
	payloadEncodings := []string{
		"none",
		"base64",
	}

	contract := make(map[string]any, 9)
	contract["schema_version"] = "rap.remote_workspace_frame_batch.v1"
	contract["adapter_contract_id"] = "rap.rdp_worker.remote_workspace_adapter_contract_probe.v1"
	contract["probe_only"] = true
	contract["payload_forwarding"] = "not_implemented"
	contract["service_class"] = "remote_workspace"
	contract["allowed_flow_classes"] = flowClasses
	contract["allowed_payload_encodings"] = payloadEncodings
	contract["max_probe_frames"] = 32
	contract["channels"] = remoteWorkspaceAdapterChannels()
	return contract
}
// serviceTrafficMode maps a builtin service type to the traffic label
// reported in its status payload: "fabric_control" for core-mesh,
// "entry_listener" for mesh-listener, and "unknown" for anything else.
func serviceTrafficMode(serviceType string) string {
	if serviceType == "core-mesh" {
		return "fabric_control"
	}
	if serviceType == "mesh-listener" {
		return "entry_listener"
	}
	return "unknown"
}
@@ -33,3 +33,101 @@ func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
}
// TestStubSupervisorRunsInternalSyntheticEchoWorkload checks that an enabled,
// native synthetic.echo workload is reported as running with the builtin
// execution mode and the synthetic-echo reason.
func TestStubSupervisorRunsInternalSyntheticEchoWorkload(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{ServiceType: "synthetic.echo", DesiredState: "enabled", RuntimeMode: "native"},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "running" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
if statuses[0].StatusPayload["reason"] != "internal_synthetic_echo_ready" {
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
}
if statuses[0].StatusPayload["execution_mode"] != "builtin" {
t.Fatalf("execution_mode = %v", statuses[0].StatusPayload["execution_mode"])
}
}
// TestStubSupervisorReportsBuiltinFabricServicesRunning checks that enabled
// core-mesh and mesh-listener workloads are reported as running builtin
// services even when the requested runtime mode is "container".
func TestStubSupervisorReportsBuiltinFabricServicesRunning(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{ServiceType: "core-mesh", DesiredState: "enabled", RuntimeMode: "container"},
{ServiceType: "mesh-listener", DesiredState: "enabled", RuntimeMode: "container"},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if len(statuses) != 2 {
t.Fatalf("statuses length = %d", len(statuses))
}
for _, status := range statuses {
if status.ReportedState != "running" {
t.Fatalf("ReportedState = %q", status.ReportedState)
}
if status.StatusPayload["reason"] != "builtin_node_agent_service_ready" {
t.Fatalf("reason = %v", status.StatusPayload["reason"])
}
}
}
// TestStubSupervisorKeepsUnsupportedEnabledWorkloadDegraded checks that an
// enabled rdp-worker in "container" mode (no contract probe configured) is
// reported as degraded with the not-implemented reason.
func TestStubSupervisorKeepsUnsupportedEnabledWorkloadDegraded(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "degraded" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
if statuses[0].StatusPayload["reason"] != "service_runtime_not_implemented" {
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
}
}
// TestStubSupervisorRunsRDPWorkerAdapterContractProbeOnly checks that a
// native rdp-worker with adapter_contract_probe enabled is reported as a
// running contract probe, and pins the shape of the advertised channel list
// and frame-batch contract.
func TestStubSupervisorRunsRDPWorkerAdapterContractProbeOnly(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{
ServiceType: "rdp-worker",
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"adapter_contract_probe": true,
},
},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "running" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
if statuses[0].StatusPayload["reason"] != "remote_workspace_adapter_contract_probe_ready" {
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
}
if statuses[0].StatusPayload["service_class"] != "remote_workspace" {
t.Fatalf("service_class = %v", statuses[0].StatusPayload["service_class"])
}
if statuses[0].StatusPayload["backend_relay_steady_state"] != false {
t.Fatalf("backend_relay_steady_state = %v", statuses[0].StatusPayload["backend_relay_steady_state"])
}
// The payload carries the in-memory slice, so the concrete type can be asserted directly.
channels, ok := statuses[0].StatusPayload["channels"].([]map[string]any)
if !ok || len(channels) != 9 {
t.Fatalf("channels = %#v", statuses[0].StatusPayload["channels"])
}
// The first channel must be the critical, droppable, non-blocking input channel.
if channels[0]["name"] != "input" || channels[0]["priority"] != "critical" || channels[0]["droppable"] != true || channels[0]["may_block_input"] != false {
t.Fatalf("unexpected input channel: %#v", channels[0])
}
frameBatch, ok := statuses[0].StatusPayload["frame_batch_contract"].(map[string]any)
if !ok {
t.Fatalf("frame_batch_contract = %#v", statuses[0].StatusPayload["frame_batch_contract"])
}
if frameBatch["schema_version"] != "rap.remote_workspace_frame_batch.v1" ||
frameBatch["payload_forwarding"] != "not_implemented" ||
frameBatch["service_class"] != "remote_workspace" {
t.Fatalf("unexpected frame batch contract: %#v", frameBatch)
}
}
@@ -385,32 +385,37 @@ func (s *FabricFlowScheduler) ConfigureAdaptivePolicy(policy FabricServiceChanne
}
func (s *FabricFlowScheduler) ScheduleClientPackets(packets [][]byte) []FabricScheduledPacketBatch {
return s.scheduleClientPackets("", "", packets)
scheduled, _ := s.scheduleClientPackets("", "", packets)
return scheduled
}
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnection(vpnConnectionID string, packets [][]byte) []FabricScheduledPacketBatch {
return s.scheduleClientPackets(vpnConnectionID, "", packets)
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, "", packets)
return scheduled
}
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnectionClass(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
return s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
return scheduled
}
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) ([]FabricScheduledPacketBatch, uint64) {
packets = cleanPacketBatch(packets)
if len(packets) == 0 {
return nil
return nil, 0
}
if s == nil {
s = NewFabricFlowScheduler(0, 0)
}
trafficClass = normalizeFabricTrafficClass(trafficClass)
grouped := map[string]*FabricScheduledPacketBatch{}
var droppedCount uint64
for _, packet := range packets {
flowID, shard := classifyPacketFlow(packet, s.shardCountValue())
channelID := fabricFlowChannelIDForClass(vpnConnectionID, trafficClass, shard)
queueDepth, dropped := s.enqueue(channelID, trafficClass)
if dropped {
droppedCount++
continue
}
batch := grouped[channelID]
@@ -433,7 +438,7 @@ func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, traf
out = append(out, *batch)
}
s.sortScheduledBatches(out)
return out
return out, droppedCount
}
func fabricFlowChannelID(vpnConnectionID string, shard int) string {
@@ -1441,11 +1446,9 @@ func (i *FabricClientPacketIngress) SendClientPacketBatchWithTrafficClass(ctx co
}
i.recordSendBatch(len(packets))
scheduler := i.flowScheduler()
droppedBefore := scheduler.Dropped()
scheduled := scheduler.ScheduleClientPacketsForConnectionClass(vpnConnectionID, trafficClass, packets)
droppedAfter := scheduler.Dropped()
if droppedAfter > droppedBefore {
i.recordFlowDropped(droppedAfter - droppedBefore)
scheduled, droppedCount := scheduler.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
if droppedCount > 0 {
i.recordFlowDropped(droppedCount)
}
if len(scheduled) == 0 {
i.recordError(mesh.ErrSyntheticRelayQueueFull)
@@ -1657,8 +1660,10 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
if i == nil || routesFunc == nil {
return nil
}
localClusterID := i.clusterID()
localNodeID := i.localNodeID()
if clusterID == "" {
clusterID = i.ClusterID
clusterID = localClusterID
}
now := time.Now().UTC()
var preferred []fabricClientRouteCandidate
@@ -1676,7 +1681,7 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
}
}
for _, route := range routesFunc() {
if route.ClusterID != clusterID || route.SourceNodeID != i.LocalNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
if route.ClusterID != clusterID || route.SourceNodeID != localNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
continue
}
if manager.isWithdrawn(route.RouteID) {
@@ -1685,8 +1690,8 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
if !route.ExpiresAt.IsZero() && !route.ExpiresAt.After(now) {
continue
}
nextHop := nextHopAfter(route.Hops, i.LocalNodeID, route.DestinationNodeID)
if nextHop == "" || nextHop == i.LocalNodeID {
nextHop := nextHopAfter(route.Hops, localNodeID, route.DestinationNodeID)
if nextHop == "" || nextHop == localNodeID {
continue
}
candidate := fabricClientRouteCandidate{Route: route, NextHop: nextHop}
@@ -2024,7 +2029,7 @@ func (i *FabricClientPacketIngress) routeProvenance(clusterID string) map[string
if i == nil || routesFunc == nil {
return out
}
localNodeID := strings.TrimSpace(i.LocalNodeID)
localNodeID := i.localNodeID()
for _, route := range routesFunc() {
if strings.TrimSpace(route.RouteID) == "" {
continue
@@ -2322,6 +2327,24 @@ func (i *FabricClientPacketIngress) routesFunc() func() []mesh.SyntheticRoute {
return i.Routes
}
// clusterID returns the ingress's cluster ID, read under i.mu and trimmed of
// surrounding whitespace. A nil receiver yields "".
func (i *FabricClientPacketIngress) clusterID() string {
	if i == nil {
		return ""
	}
	i.mu.Lock()
	raw := i.ClusterID
	i.mu.Unlock()
	return strings.TrimSpace(raw)
}
// localNodeID returns the ingress's local node ID, read under i.mu and
// trimmed of surrounding whitespace. A nil receiver yields "".
func (i *FabricClientPacketIngress) localNodeID() string {
	if i == nil {
		return ""
	}
	i.mu.Lock()
	raw := i.LocalNodeID
	i.mu.Unlock()
	return strings.TrimSpace(raw)
}
func (i *FabricClientPacketIngress) flowScheduler() *FabricFlowScheduler {
if i == nil {
return NewFabricFlowScheduler(0, 0)
@@ -324,10 +324,13 @@ func TestFabricFlowSchedulerDropsWhenChannelQueueIsFull(t *testing.T) {
packetA := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
packetB := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
batches := scheduler.ScheduleClientPackets([][]byte{packetA, packetB})
batches, dropped := scheduler.scheduleClientPackets("", "", [][]byte{packetA, packetB})
if len(batches) != 1 || len(batches[0].Packets) != 1 {
t.Fatalf("batches = %#v, want one accepted packet", batches)
}
if dropped != 1 {
t.Fatalf("dropped = %d, want per-call drop count 1", dropped)
}
snapshot := scheduler.Snapshot()
if snapshot.Dropped != 1 || !snapshot.BackpressureActive {
t.Fatalf("snapshot = %+v, want one dropped packet and active backpressure", snapshot)
@@ -1069,6 +1072,60 @@ func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing
}
}
// TestFabricClientPacketIngressRouteSelectionUsesUpdatedRuntimeIdentity
// checks that after UpdateRuntime swaps the local node ID and route source,
// a send picks the route of the new identity (entry-2 via relay-2) rather
// than the one the ingress was constructed with.
func TestFabricClientPacketIngressRouteSelectionUsesUpdatedRuntimeIdentity(t *testing.T) {
transport := &captureManyProductionTransport{}
// Initial identity: entry-1 with a single route entry-1 -> relay-1 -> exit-1.
ingress := &FabricClientPacketIngress{
ForwardTransport: transport,
Inbox: NewFabricPacketInbox(8),
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
Routes: func() []mesh.SyntheticRoute {
return []mesh.SyntheticRoute{{
RouteID: "route-entry-1",
ClusterID: "cluster-1",
SourceNodeID: "entry-1",
DestinationNodeID: "exit-1",
Hops: []string{"entry-1", "relay-1", "exit-1"},
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
ExpiresAt: time.Now().UTC().Add(time.Minute),
MaxTTL: 8,
}}
},
}
// Switch the runtime to entry-2 with a route entry-2 -> relay-2 -> exit-2.
// NOTE(review): positional argument meanings are inferred from the values
// passed here — confirm against the UpdateRuntime signature.
ingress.UpdateRuntime(
transport,
NewFabricPacketInbox(8),
"cluster-1",
"entry-2",
nil,
func() []mesh.SyntheticRoute {
return []mesh.SyntheticRoute{{
RouteID: "route-entry-2",
ClusterID: "cluster-1",
SourceNodeID: "entry-2",
DestinationNodeID: "exit-2",
Hops: []string{"entry-2", "relay-2", "exit-2"},
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
ExpiresAt: time.Now().UTC().Add(time.Minute),
MaxTTL: 8,
}}
},
"policy-updated",
)
packet := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 443)
if err := ingress.SendClientPacketBatch(context.Background(), "", "vpn-1", [][]byte{packet}); err != nil {
t.Fatalf("send after runtime update: %v", err)
}
if len(transport.envelopes) != 1 {
t.Fatalf("envelopes = %d, want one send", len(transport.envelopes))
}
// The single envelope must use the updated route, source node and next hop.
envelope := transport.envelopes[0]
if envelope.RouteID != "route-entry-2" || envelope.SourceNodeID != "entry-2" || transport.calls[0] != "relay-2" {
t.Fatalf("envelope route/source/next-hop = %s/%s/%s, want updated entry-2 route", envelope.RouteID, envelope.SourceNodeID, transport.calls[0])
}
}
func TestFabricClientPacketIngressParallelFlowWindowDoesNotBlockIndependentChannel(t *testing.T) {
scheduler := NewFabricFlowScheduler(8, 16)
slowPacket, fastPacket := packetsForOrderedDistinctChannels(scheduler.shardCountValue())
@@ -0,0 +1,170 @@
//go:build windows && rap_vpn_windows_tun
package vpnruntime
import (
"crypto/sha256"
_ "embed"
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"strings"
wgtun "golang.zx2c4.com/wireguard/tun"
)
// windowsGatewayMTU is the MTU value passed to wgtun.CreateTUN when the
// gateway interface is created.
const windowsGatewayMTU = 1420
// embeddedWintunDLL holds the contents of assets/windows/amd64/wintun.dll,
// embedded into the binary at build time. ensureWintunDLL writes these bytes
// next to the running executable when the installed copy is missing or differs.
//
//go:embed assets/windows/amd64/wintun.dll
var embeddedWintunDLL []byte
// tunDevice wraps a wgtun.Device and exposes single-packet Read, Write and
// Close methods on top of the device's batch-oriented API.
type tunDevice struct {
// dev is the underlying device returned by wgtun.CreateTUN.
dev wgtun.Device
// name is the interface name the device was created with.
name string
}
// openGatewayTun creates the gateway TUN interface called name and configures
// it with addressCIDR and routeCIDR.
//
// Both CIDR strings are validated before anything touches the system. The
// embedded wintun.dll is then installed via ensureWintunDLL, the interface is
// created with windowsGatewayMTU, and configureGatewayInterface applies the
// address and NAT settings. If configuration fails the freshly created device
// is closed again and the configuration error is returned.
func openGatewayTun(name, addressCIDR, routeCIDR string) (*tunDevice, error) {
	_, _, addressErr := net.ParseCIDR(addressCIDR)
	if addressErr != nil {
		return nil, fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, addressErr)
	}
	_, _, routeErr := net.ParseCIDR(routeCIDR)
	if routeErr != nil {
		return nil, fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, routeErr)
	}

	if dllErr := ensureWintunDLL(); dllErr != nil {
		return nil, dllErr
	}

	device, createErr := wgtun.CreateTUN(name, windowsGatewayMTU)
	if createErr != nil {
		return nil, fmt.Errorf("create wintun interface %s: %w", name, createErr)
	}

	configErr := configureGatewayInterface(name, addressCIDR, routeCIDR)
	if configErr == nil {
		return &tunDevice{dev: device, name: name}, nil
	}

	// Configuration failed: release the device; its close error is
	// deliberately dropped in favour of the configuration error.
	_ = device.Close()
	return nil, configErr
}
// Read reads one packet from the device into packet and returns its size.
//
// The underlying device API is batch based, so packet is wrapped in a
// one-element batch read at offset 0. A device error is returned as (0, err);
// a read that reports no packets returns (0, nil).
func (d *tunDevice) Read(packet []byte) (int, error) {
	var (
		batch   = [][]byte{packet}
		lengths = make([]int, 1)
	)
	count, err := d.dev.Read(batch, lengths, 0)
	switch {
	case err != nil:
		return 0, err
	case count <= 0:
		return 0, nil
	default:
		return lengths[0], nil
	}
}
// Write sends packet to the device as a one-element batch at offset 0.
//
// It returns len(packet) when the device reports at least one packet written,
// (0, nil) when it reports none, and (0, err) on a device error.
func (d *tunDevice) Write(packet []byte) (int, error) {
	written, err := d.dev.Write([][]byte{packet}, 0)
	switch {
	case err != nil:
		return 0, err
	case written > 0:
		return len(packet), nil
	}
	return 0, nil
}
// Close removes the gateway NAT entry and then closes the underlying device,
// returning the device's close error.
func (d *tunDevice) Close() error {
	// NAT removal is best-effort: its error is intentionally discarded so the
	// device is always closed and only the close result reaches the caller.
	_ = removeWindowsGatewayNat()

	closeErr := d.dev.Close()
	return closeErr
}
// configureGatewayInterface applies addressing, forwarding and NAT settings to
// the network adapter called name by running a PowerShell script.
//
// addressCIDR must be an IPv4 address with a non-zero prefix length; routeCIDR
// is used as the NAT internal prefix. The script:
//   - enables the adapter,
//   - removes IPv4 addresses on it that differ from the requested one and adds
//     the requested address if it is missing,
//   - enables forwarding on the adapter and on every connected IPv4 interface
//     except 'Loopback Pseudo-Interface 1',
//   - recreates the 'RAPVPN' NAT when its internal prefix differs, or creates
//     it when absent.
//
// Validation errors are returned before PowerShell is invoked; a script
// failure is wrapped with the interface name.
func configureGatewayInterface(name, addressCIDR, routeCIDR string) error {
	// The template takes, in order: adapter alias, IPv4 address, prefix length
	// and NAT prefix. String values are passed through psQuote.
	const scriptTemplate = `
$ErrorActionPreference = 'Stop'
$alias = %s
$address = %s
$prefixLength = %d
$natPrefix = %s
$natName = 'RAPVPN'
$adapter = Get-NetAdapter -Name $alias -ErrorAction Stop
$adapter | Enable-NetAdapter -Confirm:$false -ErrorAction SilentlyContinue | Out-Null
$existing = Get-NetIPAddress -InterfaceAlias $alias -AddressFamily IPv4 -ErrorAction SilentlyContinue
foreach ($addr in $existing) {
if ($addr.IPAddress -ne $address -or $addr.PrefixLength -ne $prefixLength) {
Remove-NetIPAddress -InterfaceAlias $alias -IPAddress $addr.IPAddress -Confirm:$false -ErrorAction SilentlyContinue
}
}
if (-not (Get-NetIPAddress -InterfaceAlias $alias -IPAddress $address -AddressFamily IPv4 -ErrorAction SilentlyContinue)) {
New-NetIPAddress -InterfaceAlias $alias -IPAddress $address -PrefixLength $prefixLength -Type Unicast | Out-Null
}
Set-NetIPInterface -InterfaceAlias $alias -AddressFamily IPv4 -Forwarding Enabled
Get-NetIPInterface -AddressFamily IPv4 | Where-Object { $_.ConnectionState -eq 'Connected' -and $_.InterfaceAlias -ne 'Loopback Pseudo-Interface 1' } | Set-NetIPInterface -Forwarding Enabled
$existingNat = Get-NetNat -Name $natName -ErrorAction SilentlyContinue
if ($existingNat -and $existingNat.InternalIPInterfaceAddressPrefix -ne $natPrefix) {
$existingNat | Remove-NetNat -Confirm:$false
$existingNat = $null
}
if (-not $existingNat) {
New-NetNat -Name $natName -InternalIPInterfaceAddressPrefix $natPrefix | Out-Null
}
`

	gatewayIP, gatewayNet, err := net.ParseCIDR(addressCIDR)
	if err != nil {
		return fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, err)
	}
	// A 32-bit mask means IPv4; a zero-length prefix is rejected as well.
	prefixLen, maskBits := gatewayNet.Mask.Size()
	if prefixLen <= 0 || maskBits != 32 {
		return fmt.Errorf("invalid vpn gateway prefix %q", addressCIDR)
	}
	_, natNet, err := net.ParseCIDR(routeCIDR)
	if err != nil {
		return fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, err)
	}

	script := fmt.Sprintf(
		scriptTemplate,
		psQuote(name),
		psQuote(gatewayIP.String()),
		prefixLen,
		psQuote(natNet.String()),
	)
	err = runPowerShell(script)
	if err == nil {
		return nil
	}
	return fmt.Errorf("configure windows vpn gateway interface %s: %w", name, err)
}
// removeWindowsGatewayNat deletes the 'RAPVPN' NAT entry through PowerShell.
// Both the lookup and the removal run with -ErrorAction SilentlyContinue; the
// result of runPowerShell is returned unchanged.
func removeWindowsGatewayNat() error {
	const removeNatCommand = `Get-NetNat -Name 'RAPVPN' -ErrorAction SilentlyContinue | Remove-NetNat -Confirm:$false -ErrorAction SilentlyContinue`
	return runPowerShell(removeNatCommand)
}
// runPowerShell executes script with powershell.exe using -NoProfile and
// -ExecutionPolicy Bypass. On failure the returned error wraps the underlying
// error and appends the trimmed combined stdout/stderr output.
func runPowerShell(script string) error {
	args := []string{"-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script}
	output, err := exec.Command("powershell.exe", args...).CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("powershell failed: %w: %s", err, strings.TrimSpace(string(output)))
}
// psSingleQuoteEscaper doubles every character that PowerShell accepts as a
// single-quote delimiter: the ASCII apostrophe and the typographic quotes
// U+2018, U+2019, U+201A and U+201B.
var psSingleQuoteEscaper = strings.NewReplacer(
	"'", "''",
	"\u2018", "\u2018\u2018",
	"\u2019", "\u2019\u2019",
	"\u201a", "\u201a\u201a",
	"\u201b", "\u201b\u201b",
)

// psQuote returns value as a PowerShell single-quoted string literal.
//
// Inside a single-quoted literal the only special characters are the quote
// delimiters themselves, which are escaped by doubling. PowerShell treats the
// typographic single quotes as delimiters too, so they are doubled alongside
// the ASCII apostrophe; otherwise a value containing one could terminate the
// literal early and inject script text.
func psQuote(value string) string {
	return "'" + psSingleQuoteEscaper.Replace(value) + "'"
}
// ensureWintunDLL makes sure a wintun.dll matching the embedded copy sits next
// to the running executable.
//
// If the file already exists with the same SHA-256 as embeddedWintunDLL
// nothing is written. Otherwise the embedded bytes are written to
// "wintun.dll.tmp" in the same directory and moved into place.
//
// Returns an error when the executable path cannot be determined, the
// temporary file cannot be written, or the file cannot be moved into place.
func ensureWintunDLL() error {
	exePath, err := os.Executable()
	if err != nil {
		return fmt.Errorf("locate node-agent executable for wintun.dll: %w", err)
	}
	target := filepath.Join(filepath.Dir(exePath), "wintun.dll")
	if payload, err := os.ReadFile(target); err == nil && sameSHA256(payload, embeddedWintunDLL) {
		return nil
	}

	tmp := target + ".tmp"
	if err := os.WriteFile(tmp, embeddedWintunDLL, 0o644); err != nil {
		// Do not leave a partially written temp file behind.
		_ = os.Remove(tmp)
		return fmt.Errorf("write embedded wintun.dll: %w", err)
	}

	// os.Rename replaces an existing regular file, so try that first: if it
	// fails, the previously installed DLL is still intact.
	if err := os.Rename(tmp, target); err == nil {
		return nil
	}

	// Fallback for targets that block a replacing rename (for example a
	// read-only file): remove the old file best-effort, then retry.
	_ = os.Remove(target)
	if err := os.Rename(tmp, target); err != nil {
		_ = os.Remove(tmp)
		return fmt.Errorf("install embedded wintun.dll: %w", err)
	}
	return nil
}
// sameSHA256 reports whether a and b have identical SHA-256 digests.
func sameSHA256(a, b []byte) bool {
	return sha256.Sum256(a) == sha256.Sum256(b)
}