commit 8ba0561f4f0f21d11550b6b2db2f37ef4087bda9 Author: Mikhail Date: Tue Apr 28 22:29:50 2026 +0300 Initial project snapshot diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..c9381e2 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,22 @@ +FROM debian:12.10-slim + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + gdb \ + git \ + freerdp2-dev \ + libboost-dev \ + libssl-dev \ + libwinpr2-dev \ + ninja-build \ + pkg-config \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /workspaces/rdp-proxy diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..0d82632 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,22 @@ +{ + "name": "rdp-proxy-worker", + "build": { + "dockerfile": "Dockerfile", + "context": ".." + }, + "workspaceFolder": "/workspaces/rdp-proxy", + "customizations": { + "vscode": { + "settings": { + "cmake.sourceDirectory": "${workspaceFolder}/workers/rdp-worker", + "cmake.buildDirectory": "${workspaceFolder}/workers/rdp-worker/build" + }, + "extensions": [ + "ms-vscode.cmake-tools", + "ms-vscode.cpptools" + ] + } + }, + "postCreateCommand": "cmake --preset dev --directory workers/rdp-worker", + "remoteUser": "root" +} diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..bfa33bb --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,28 @@ +name: build + +on: + push: + pull_request: + +jobs: + backend: + runs-on: ubuntu-24.04 + defaults: + run: + working-directory: backend + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.23.8' + - run: go mod download + - run: go build ./... 
+ + worker: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Build worker image + run: docker build --tag rap-rdp-worker:ci --file workers/rdp-worker/Dockerfile workers/rdp-worker + - name: Verify worker binary path + run: docker run --rm --entrypoint /bin/sh rap-rdp-worker:ci -lc "test -x /usr/local/bin/rdp-worker && test -x /usr/local/bin/rdp-worker-dataplane-token-probe && test -x /usr/local/bin/rdp-worker-dataplane-bind-probe" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ebdbfcb --- /dev/null +++ b/.gitignore @@ -0,0 +1,64 @@ +# OS +.DS_Store +Thumbs.db + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Logs +*.log +logs/ +tmp/ +temp/ + +# Env +.env +.env.* +!.env.example + +# Go +backend/bin/ +backend/.cache/ +backend/vendor/ +*.test +coverage.out + +# C/C++ +build/ +cmake-build-*/ +CMakeFiles/ +CMakeCache.txt +compile_commands.json +*.o +*.obj +*.so +*.a +*.dll +*.exe + +# .NET / WPF +bin/ +obj/ +*.user +*.suo + +# Node / React +node_modules/ +dist/ +build/ +.next/ +coverage/ + +# Python (if scripts appear later) +__pycache__/ +*.pyc + +# Docker +*.local.yml + +# Generated artifacts +artifacts/ +out/ \ No newline at end of file diff --git a/CODEX_CONTEXT.md b/CODEX_CONTEXT.md new file mode 100644 index 0000000..aa88f3c --- /dev/null +++ b/CODEX_CONTEXT.md @@ -0,0 +1,618 @@ +# CODEX CONTEXT + +## Project identity + +This project is a production-grade distributed secure access platform. 
+ +It started as a custom RDP proxy with persistent server-side sessions, but the final target architecture is broader: + +- distributed secure access fabric +- multi-tenant platform +- session broker for GUI and future non-GUI protocols +- cluster mesh of nodes +- connector/VPN layer +- customer-managed and platform-managed nodes +- node-agent based self-update / rollback / health supervision + +## Current proven foundation + +The current codebase already proved the most risky low-level lifecycle assumptions for RDP: + +- real FreeRDP connect works +- session state transitions to active work +- terminate works +- detach works without killing the remote session +- reattach works without recreating the remote session +- takeover works without recreating the remote session +- per-resource certificate verification policy exists +- `certificate_verification_mode = strict | ignore` +- `strict` is default +- `ignore` works on a per-resource basis +- worker build is reproducible +- backend build is reproducible + +This proven lifecycle must NOT be broken by future architecture work. + +## Current architecture baseline + +Current audit and baseline snapshot: + +- `docs/audits/PROJECT_AUDIT_2026-04-26.md` +- `docs/audits/CURRENT_BASELINE_MATRIX.md` + +### Test environment +- Canonical test Docker host: `192.168.200.61` +- Canonical Docker context: `test-ubuntu` +- Canonical SSH alias: `docker-test` +- Backend API for local/client smoke runs: `http://192.168.200.61:8080/api/v1` +- WebSocket gateway for local/client smoke runs: `ws://192.168.200.61:8080/api/v1/gateway/ws` +- Stage C17 planning is completed. +- C17A synthetic mesh runtime skeleton is implemented and test-proven in + `rap-node-agent` only. It is disabled by default and carries synthetic + `fabric.probe` / `fabric.probe_ack` messages only. +- C17B route health and failover probes are implemented and test-proven in + `rap-node-agent` only. 
They are disabled by default and carry synthetic + `fabric.route_health` / `fabric.route_health_ack` messages only. +- C17C relay semantic hardening is implemented and test-proven in + `rap-node-agent` only. It is disabled by default and models synthetic + per-channel queues/QoS/backpressure only. +- C17D non-production test-service path is implemented and test-proven in + `rap-node-agent` only. It is disabled by default and carries only bounded + `synthetic.echo` test payloads. +- C17E/C17F/C17G are implemented and proven for live synthetic HTTP transport, + scoped synthetic route config, and Control Plane scoped synthetic config + consumption. +- C17H deployed multi-agent synthetic config smoke is runtime-proven on + `docker-test`: five running `rap-node-agent` containers consume + backend-issued node-scoped synthetic config, direct and single-relay + synthetic route-health observations return to the Control Plane, and + production forwarding remains disabled. +- C17I production forwarding gate foundation is implemented and test-proven: + `rap-node-agent` has an explicit production-forwarding gate, while + `/mesh/v1/forward` still refuses production payload forwarding until a later + approved runtime stage. +- C17J production envelope contract is implemented and test-proven: + `/mesh/v1/forward` validates route-bound production envelopes for + `fabric_control` / `fabric.control` only when the gate is enabled, rejects + service channels, and still refuses production forwarding. +- C17K production envelope observation is implemented and test-proven: + valid accepted envelopes can be observed locally as metadata-only records + after validation; rejected envelopes are not observed, observation failure + fails closed, and production forwarding remains unavailable. 
+- C17L bounded production observation sink is implemented and test-proven: + accepted metadata-only observations can be retained locally with fixed + capacity, oldest-entry drop behavior, and no payload body storage. +- C17M production observation sink wiring is implemented and test-proven: + node-agent can wire the bounded local metadata-only sink when + `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` is explicitly greater than + zero; the wiring is disabled by default and exposes no read API. +- C17N production observation sink metrics are implemented and test-proven: + local sink metrics expose only capacity, current depth, accepted total, and + dropped-oldest total; they expose no observation records or payload metadata. +- C17O production observation sink local metrics logging is implemented and + test-proven: node-agent logs aggregate sink metrics locally when the sink is + explicitly enabled; no read API or Control Plane reporting is added. +- C17P production observation sink change-driven metrics logging is implemented + and test-proven: node-agent suppresses repeated identical local sink metrics + logs; no read API or Control Plane reporting is added. +- C17Q production forwarding gate/runtime log boundary is implemented and + test-proven: node-agent logs production forwarding gate state separately from + production forwarding runtime state. Runtime state remained false until + C17Z introduced gate-controlled `fabric.control` direct forwarding. +- C17R production observation sink capacity guard is implemented and + test-proven: `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` is rejected + above `10000`. +- C17S production observation panic fail-closed hardening is implemented and + test-proven: observer errors and observer panics both fail closed as + observation failure. 
+- C17T production envelope payload boundary is implemented and test-proven: + validated production `fabric.control` envelope payloads are bounded to + `4096` bytes and oversized envelopes are rejected before observation. +- C17U production envelope created-at skew boundary is implemented and + test-proven: validated production `fabric.control` envelopes whose + `created_at` is more than one minute in the future are rejected before + observation. +- C17V peer endpoint candidate model is implemented and test-proven: + node-scoped synthetic mesh config now carries route-scoped endpoint + candidates with transport, address, reachability, NAT type, connectivity + mode, priority, policy tags, verification time, and metadata. This is a + model/config boundary only; no production route scoring, NAT traversal, + shortcut routing, or forwarding runtime is implemented. +- C17W peer endpoint candidate scoring model is implemented and test-proven: + `rap-node-agent` can rank already-scoped endpoint candidates using soft + inputs such as transport, reachability, connectivity mode, NAT type, + priority, region, policy tags, channel class, and verification age. This is + a scoring helper only; it does not open connections, choose production + routes, or forward payloads. +- C17X health-aware endpoint candidate scoring overlay is implemented and + test-proven: endpoint candidate scoring can optionally use local health + observations keyed by `endpoint_id`, including latency, success/failure + history, recent failure reason, reliability score, and observation freshness. + This remains advisory scoring only and is not wired into production route + execution. +- C17Y Platform Owner synthetic mesh visibility is implemented and + build/test-proven: `web-admin` reads node-scoped synthetic mesh config and + shows config enabled state, route counts, peer endpoints, endpoint + candidates, C17X advisory scoring boundary, and `production_forwarding`. 
+ This remains platform-owner visibility only and does not enable production + forwarding. +- C17Z production fabric-control direct forwarding boundary is implemented and + test-proven: when `RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`, + `/mesh/v1/forward` can deliver valid route-bound `fabric.control` envelopes + at the local destination or forward them to a direct next hop from explicit + peer endpoint config. Service channels, arbitrary relay forwarding, + multi-hop production route execution, and RDP/VPN/file/video/service payloads + remain unavailable. +- C17Z1 production fabric-control multi-hop route-path boundary is implemented + and test-proven: production `fabric.control` envelopes can carry + `route_path` and `visited_node_ids`; relay nodes validate path position, + forward only to the next path node, update TTL/hop/visited metadata, and + reject loops. Service payloads remain unavailable. +- C17Z2 production fabric-control forwarding observability boundary is + implemented and test-proven: node-agent emits local + `mesh_production_forward_event` logs for accepted, forwarded, delivered, and + rejected production `fabric.control` envelopes. Logs are metadata-only and + include no payload bodies or read API. +- C17Z3 production fabric-control route-config boundary is implemented and + test-proven: when scoped/control-plane mesh routes are available locally, + production `fabric.control` envelopes must match configured route_id/path/ + next-hop/channel/expiry/TTL/hop limits before forwarding. +- C17Z4 scoped peer directory and recovery seeds boundary is implemented and + test/build-proven: node-scoped mesh config carries scoped `peer_directory` + and explicit bounded `recovery_seeds`; node-agent parses/validates them and + web-admin shows counts. 
+- C17Z5 node-agent peer cache runtime boundary is implemented and test-proven: + node-agent builds a local `PeerCache`, selects bounded warm peers, probes warm + peers with `/mesh/v1/health`, and reports metadata-only mesh-link + observations when synthetic mesh testing is enabled. +- C17Z6 dynamic endpoint reporting boundary is implemented and test-proven: + node-agent reports explicit advertised mesh endpoint metadata in heartbeat, + and Control Plane projects latest reported endpoints/candidates into + node-scoped synthetic mesh config. +- C17Z7 private/corporate endpoint candidate boundary is implemented and + test-proven: node-agent reports multiple advertised endpoint candidates, + scoring rewards private/corporate same-site candidates, and peer cache can + use the best candidate address for warm health. +- C17Z8 peer connection state machine boundary is implemented and test-proven: + node-agent tracks warm-peer states `disconnected`, `connecting`, `ready`, + `degraded`, and `backoff`, with bounded backoff after repeated health probe + failures. +- C17Z9 peer recovery planner boundary is implemented and test-proven: + node-agent targets a bounded stable ready-peer set, enters recovery when + ready peers fall below target, and selects bounded recovery probes from warm + peers, recovery seeds, and other connectable scoped peers. +- C17Z10 peer connection intent planner boundary is implemented and + test-proven: node-agent classifies bounded peer work as maintain/probe/ + recover and classifies transport readiness as direct/private_lan/ + corporate_lan/outbound_only/relay_required, with rendezvous-required + metadata only. +- C17Z11 peer connection manager runtime boundary is implemented and + test-proven: node-agent uses a reusable HTTP keep-alive client for real + control-plane health probes of direct/private/corporate peers and records + `waiting_rendezvous` for outbound-only/relay-required peers. 
+- C17Z12 rendezvous/relay control-plane contract is implemented and + docker-test-runtime-proven: backend issues node-scoped `rendezvous_leases`, + node-agent resolves matching `waiting_rendezvous` intents into + `relay_control`, probes relay `/mesh/v1/health`, records and maintains + `relay_ready`, and keeps service payload forwarding disabled. +- C17Z13 rendezvous lease telemetry is implemented and + docker-test-runtime-proven: node-agent reports + `mesh_rendezvous_lease_report` with relay admission, peer admission, + TTL/renewal posture, `relay_ready`, and explicit no-payload boundary flags; + web-admin shows `rv leases` in recent heartbeat tables. +- C17Z14 rendezvous lease refresh contract is implemented and + docker-test-runtime-proven: node-agent refreshes renewal-needed/stale + rendezvous leases through node-scoped synthetic config reload, updates the + running peer cache/route/lease state, and reports refresh plus stale relay + withdrawal/reselection telemetry. Service payload forwarding remains + unavailable. +- C17Z15 backend relay replacement policy is implemented and + docker-test-runtime-proven: backend consumes recent stale-relay heartbeat + feedback, withdraws stale explicit rendezvous leases, scores alternate relay + candidates from route adjacency, endpoint priority, policy tags, and recent + mesh-link health, and returns replacement leases plus + `rendezvous_relay_policy` decisions in node-scoped synthetic config. + Node-agent reports `c17z15.mesh_rendezvous_lease_report.v1` and keeps stale + state scoped to the exact lease/relay, so replacement leases for the same + peer are not marked stale by association. Service payload forwarding remains + unavailable. 
+- C17Z16 route/path decision artifact is implemented and + docker-test-runtime-proven: backend `c17z16.synthetic.v1` config includes + `route_path_decisions` with original hops, effective hops, local previous/ + next hop, selected replacement relay, generation, score reasons, and + no-payload boundary flags. Node-agent stores the control-plane route + generation and reports `c17z16.mesh_route_path_decision_report.v1` plus + `c17z16.mesh_rendezvous_lease_report.v1`. Service payload forwarding remains + unavailable. +- C17Z17 node-side route generation tracker is implemented and + docker-test-runtime-proven: backend `c17z17.synthetic.v1` config and + node-agent `mesh_route_generation_report` track active/applied/unchanged/ + withdrawn route decisions, generation changes, total counters, and + `withdrawn_by_replacement` records for stale relay paths when replacement is + first observed. Service payload forwarding remains unavailable. +- C17Z18 synthetic route-health effective path runtime is implemented and + docker-test-runtime-proven: backend `c17z18.synthetic.v1` config and + node-agent `mesh_route_health_config_report` apply Control Plane + `route_path_decisions` to synthetic route-health route config only. The + synthetic runtime probes selected effective paths through replacement relays, + reports expected/observed hops and drift state, and backend latest mesh links + preserve route-health observations separately from connection-manager + observations. Service payload forwarding remains unavailable. +- C17Z19 synthetic route-health feedback scoring is implemented and + docker-test-runtime-proven: backend consumes recent `synthetic_route_health` + observations in relay scoring, uses drift/unreachable/failure metadata to + mark the exact selected relay stale, boosts healthy low-latency relay + candidates, and returns replacement leases/route decisions through the + existing synthetic config contract. Migration `000022` adds the `synthetic` + mesh service class. 
Service payload forwarding remains unavailable. +- C17Z20 node-side route-health feedback refresh is implemented and + docker-test-runtime-proven: after reporting synthetic route-health + drift/unreachable/failure, node-agent performs a bounded node-scoped + synthetic-config refresh, applies returned replacement route decisions to + route-health config immediately, and reports + `c17z20.mesh_route_health_feedback_refresh_report.v1`. Service payload + forwarding remains unavailable. +- Installation Authority foundation is implemented: production requires strict + Product Root public key config, first-owner bootstrap uses signed Ed25519 + activation manifests, `installation_authority` and signed + `platform_role_grants` are persisted, and strict platform-admin checks ignore + direct `users.platform_role` database edits without a valid signed grant. + Web-admin exposes installation status/first-owner bootstrap, and + `scripts/installation/product-root-tool.go` generates keys/manifests for + offline product-root operations. +- Cluster Authority and node enrollment bootstrap are docker-test lifecycle + smoke-proven in run `dev-bootstrap-20260428-201430`: a fresh dev install + bootstrapped the first owner, created a cluster, issued a signed join token, + accepted real `rap-node-agent` enrollment, owner-approved the join request, + agent-polled signed bootstrap, persisted cluster authority pin, heartbeated, + and verified signed `c17z18.synthetic.v1` Control Plane config. Production + service payload forwarding remains unavailable. +- Migration `000021_cluster_authority_keys` drops/recreates + `cluster_admin_summaries` because fresh replay proved PostgreSQL cannot + change that view layout via `CREATE OR REPLACE VIEW`. +- `rap-node-agent` desired-workload polling/status reporting is gated by + `RAP_WORKLOAD_SUPERVISION_ENABLED=false` by default while service runtime + supervision remains a stub. 
+- C18 VPN/IP tunnel service target design is completed as documentation only. +- C18A VPN/IP tunnel control-plane data model foundation is implemented and + backend-test-proven. +- C18B VPN/IP tunnel lease/fencing hardening is implemented and + backend-test-proven. +- C18C VPN/IP tunnel node-agent desired-state consumption/reporting is + implemented and backend-test-proven. +- No next platform-core implementation step is automatically authorized after + C17Z20. The next mesh layer should stay limited to route-health feedback + refresh dampening/no-change cooldown unless the user explicitly chooses + another staged task. +- Latest RDP performance reference image: + `rap-rdp-worker:rdp-perf6-dirty-region` +- Stage 5.2 file-download runtime artifacts remain preserved for when RDP work + resumes, but they are not the active next task. +- Do not use `docker.cin.su` for this project unless explicitly requested for a separate one-off check. + +### Backend +- Go +- PostgreSQL = source of truth +- Redis = live coordination / routing only +- REST for control plane +- WebSocket for live session channel + +### Worker +- C++ worker +- FreeRDP integration +- worker runtime hides FreeRDP details from backend +- The C++ worker remains the primary RDP runtime. +- Target RDP performance direction: `docs/architecture/RDP_SERVICE_CPP_PERFORMANCE_TARGET.md`. +- The RDP performance rewrite scope is limited to C++ RDP service adapter + internals. It must not redesign backend control plane, cluster transport, + organizations, leases, or session lifecycle. +- The C# RDP service skeleton is inactive research scaffolding and is not the + current runtime direction. +- Current RDP Adapter baseline: RDP-Perf-6 dirty-region direct binary rendering + is completed and smoke-proven on `docker-test`. RDP work is paused by product + decision; next active work is Fabric Core / cluster foundation. 
+- P3/P3.1 security-readiness foundation exists: production mode rejects + plaintext credential-like resource metadata, requires `secret_ref` for + RDP/VNC/SSH resources, and has an encrypted PostgreSQL-backed resource secret + storage/resolver MVP. P3.2 direct-worker TLS/PKI guard exists. +- P3.3 production-like test-stand smoke is complete on `docker-test`: backend + runs in `APP_ENV=production` with a test-only secret key file, a secret-backed + RDP resource starts real sessions through the resolver path, metadata/audit do + not contain plaintext credentials, and backend gateway fallback remains + available when direct worker WSS trust is `smoke_insecure`. +- P3.4 production direct-worker WSS trust model is documented in + `docs/architecture/PRODUCTION_DIRECT_WORKER_WSS_TRUST.md`; it defines + platform CA/public CA behavior, worker certificate SAN/identity requirements, + app-local Windows trust direction, rotation/revocation, and the future + `platform_ca` smoke plan. No RDP runtime behavior changed in P3.4. +- P3.5 app-local platform CA trust is implemented and runtime-proven on + `docker-test`: Windows client validates direct worker WSS with an app-local + platform CA bundle, keeps hostname/SAN validation enabled, selects + `direct_worker_wss` without insecure TLS bypass, and falls back to backend + gateway for unknown CA / smoke-only production cases. +- P3.6 stale Redis worker/live event idempotency is implemented and + runtime-proven: stale worker events for terminal PostgreSQL sessions are + ignored, backend restart survives stale Redis events, and terminal sessions + are not reopened. +- Stage 5.2 server-to-client file download core data path is runtime-proven: + direct worker WSS and backend gateway fallback both download text/binary + files from `RAP_Transfers\ToClient` with matching size/hash, and direct + policy blocking is proven for `disabled` and `client_to_server`. 
Lifecycle + blocking is also runtime-proven for detach, old-client takeover, and worker + failure. Runtime report: + `artifacts/stage5-2-file-download-runtime-report.md`. +- Stage 5.2 is not fully accepted yet. Remaining proof: Windows desktop UI + download path and regression matrix for rendering/input/clipboard/upload/ + reconnect/takeover. + +### Clients +- future native clients: + - Windows: native desktop client first + - Linux: native desktop client later +- web UI is admin/control plane, not the primary power-user client + +## Final architecture direction + +The long-term target architecture is documented in: + +- `docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md` +- `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md` +- `docs/architecture/WEB_INGRESS_AND_ADMIN_UI_MODEL.md` + +`SECURE_ACCESS_FABRIC_TARGET.md` defines the target Secure Access Fabric architecture only. It is not the current implementation scope and must not be used as permission to start mesh, VPN, multi-cluster, updater, or realtime data-plane migration work without an explicit staged prompt. + +`CLUSTER_NODE_ADMIN_FOUNDATION.md` defines the next platform-core planning +baseline for clusters, node enrollment, native node-agent identity, platform +admin console, multi-cluster administration, and future organization admin +visibility. It is a staged foundation document, not permission to implement +mesh packet routing or VPN runtime. + +`WEB_INGRESS_AND_ADMIN_UI_MODEL.md` defines WEB as HTTP/HTTPS ingress and +Admin UI presentation only. Cluster configuration remains Control Plane +ownership through scoped APIs, PostgreSQL source-of-truth mutations, and audit. +Dynamic pages must be safe schema-driven projections and must not embed +internal topology, peer caches, route caches, secrets, raw credentials, or +arbitrary executable code. + +Admin endpoint placement is explicit. Fabric Storage / Config Storage nodes do +not automatically host or move the cluster panel. 
Platform Owner Console +remains global platform-owner scope. Cluster Admin Endpoint requires explicit +admin/web ingress role assignment, cluster health/trust readiness, and Control +Plane authorization. Organization Admin Panel remains a tenant-safe projection. + +The final platform must support: + +1. Multi-tenancy / Organizations +- platform has many organizations +- each organization has isolated users, groups, resources, policies, audit, connectors +- users may belong to multiple organizations +- organization admins only see their organization +- platform admins see platform scope + +2. Identity federation +- local users +- LDAP / Active Directory +- OIDC +- future extensibility for more identity sources +- access mappings based on external groups / claims + +3. Cluster of nodes +- no mandatory single central node +- many nodes across many sites +- nodes can be platform-managed or customer-managed +- customer-managed nodes are sandboxed cluster participants, not full cluster owners + +4. Node agent +- small stable always-running agent on every node +- supervises services +- downloads updates +- verifies signed artifacts +- can rollback to previous version +- can restart crashed services +- can work on thin or thick nodes + +5. Service-based node model +Each node is not monolithic. +A node has: +- capabilities: what it can do physically/technically +- enabled services: what it is allowed/assigned to do + +Possible services include: +- ingress-gateway +- mesh-router +- relay +- connector-host +- vpn-adapter +- session-worker +- media-relay +- file-relay +- update-cache +- config-replica +- audit-sink +- metrics-exporter + +6. Cluster mesh and routing +- encrypted inter-node communication +- dynamic topology +- no need for full mesh +- multi-hop routing allowed +- route failover +- client failover between ingress nodes +- connector failover between nodes + +7. 
Split-brain prevention +- quorum-based cluster behavior +- minority partition must not become a second authoritative cluster +- degraded / recovery / isolated modes +- manual recovery / promote decision by platform recovery admin + +8. Connector / VPN layer +- connectors are reusable network access methods +- one connector may be used by multiple resources +- connector placement and failover are controlled by policy +- nodes may be allowed or disallowed to host connectors +- direct access, VPN, relay and future egress modes must fit this model + +9. Future exit mode +- split tunnel +- full tunnel +- internet access through cluster +- not first implementation priority + +## Non-negotiable design rules + +- Do not rewrite proven session lifecycle carelessly. +- Do not turn Redis into a source of truth. +- Do not make certificate-ignore a global worker setting. +- Do not make customer-managed nodes platform-wide trusted by default. +- Do not create a separate cluster per organization. +- Do not assume a single permanently reachable central node. +- Do not rely on “secret protocol with no docs” as security. +- Security must come from crypto, auth, isolation, policy and observability. +- Prefer incremental evolution from current proven system. +- Do not collapse platform control plane and data plane into one vague layer. + +## Implementation strategy + +The codebase must evolve in phases. 
+ +Current implementation focus remains: +- RDP work is paused by product decision +- preserve the accepted RDP Adapter baseline and Stage 5.x file-transfer work +- do not delete or rewrite the current RDP MVP while platform-core work starts +- C1-C9 platform-core foundations are implemented and verified: clusters, + node enrollment, node-agent scaffold, platform admin console, workload + supervision contract, mesh control-plane prep, mesh skeleton, multi-cluster + hardening, and organization admin foundation +- C10 Fabric Core configuration distribution design is completed +- C11 signed scoped cluster snapshot model is completed +- C12 node local state store is completed +- C13 Fabric Storage / Config Storage service foundation is completed +- C14 peer directory and cache model is completed +- C15 Fabric Routing Engine skeleton is completed +- C16 secure node-to-node channel lifecycle is completed +- C17 mesh routing runtime implementation plan is completed +- C17A synthetic mesh runtime skeleton is implemented and test-proven with + synthetic fabric messages only, no RDP/VPN/production service traffic +- C17B route health and failover probes are implemented and test-proven with + synthetic traffic only, no RDP/VPN/production service traffic +- C17C relay semantic hardening is implemented and test-proven with synthetic + channel classes only, no RDP/VPN/production service traffic +- C17D non-production test-service path is implemented and test-proven with + bounded `synthetic.echo` traffic only, no RDP/VPN/production service traffic +- C17E live node-to-node synthetic HTTP transport is implemented and + smoke-proven with synthetic traffic only +- C17F scoped synthetic route config loading and route-health reporting is + implemented and smoke-proven with synthetic traffic only +- C17G Control Plane scoped synthetic config read/consume is implemented and + test-proven with synthetic traffic only +- C17H deployed multi-agent synthetic config smoke is implemented and 
+ runtime-proven on `docker-test` with synthetic traffic only +- C17I production forwarding gate foundation is implemented and test-proven; + production forwarding remains unavailable +- C17J production envelope contract validation is implemented and test-proven; + production forwarding remains unavailable +- C17K production envelope observation is implemented and test-proven; + production forwarding remains unavailable +- C17L bounded production observation sink is implemented and test-proven; + production forwarding remains unavailable +- C17M production observation sink wiring is implemented and test-proven; + production forwarding remains unavailable +- C17N production observation sink metrics are implemented and test-proven; + production forwarding remains unavailable +- C17O production observation sink local metrics logging is implemented and + test-proven; production forwarding remains unavailable +- C17P production observation sink change-driven metrics logging is implemented + and test-proven; production forwarding remains unavailable +- C17Q production forwarding gate/runtime log boundary is implemented and + test-proven; production forwarding remains unavailable +- C17R production observation sink capacity guard is implemented and + test-proven; production forwarding remains unavailable +- C17S production observation panic fail-closed hardening is implemented and + test-proven; production forwarding remains unavailable +- C17T production envelope payload boundary is implemented and test-proven; + production forwarding remains unavailable +- C17U production envelope created-at skew boundary is implemented and + test-proven; production forwarding remains unavailable +- C17V peer endpoint candidate model and NAT/connectivity hints are + implemented and test-proven; production forwarding remains unavailable +- C17W peer endpoint candidate scoring model is implemented and test-proven; + production forwarding remains unavailable +- C17X health-aware endpoint 
candidate scoring overlay is implemented and + test-proven; production forwarding remains unavailable +- C17Y Platform Owner synthetic mesh visibility is implemented and + build/test-proven; production forwarding remains unavailable +- C17Z production fabric-control direct forwarding is implemented and + test-proven; production service traffic remains unavailable +- C17Z1 production fabric-control multi-hop route-path forwarding is + implemented and test-proven; production service traffic remains unavailable +- C17Z2 production fabric-control forwarding observability is implemented and + test-proven; production service traffic remains unavailable +- C17Z3 production fabric-control route-config boundary is implemented and + test-proven; production service traffic remains unavailable +- C17Z4 scoped peer directory/recovery seed boundary is implemented and + test/build-proven; production service traffic remains unavailable +- C17Z5 node-agent peer cache runtime boundary is implemented and test-proven; + production service traffic remains unavailable +- C17Z6 dynamic endpoint reporting boundary is implemented and test-proven; + production service traffic remains unavailable +- C17Z7 private/corporate endpoint candidate boundary is implemented and + test-proven; production service traffic remains unavailable +- C17Z8 peer connection state machine boundary is implemented and test-proven; + production service traffic remains unavailable +- C17Z9 peer recovery planner boundary is implemented and test-proven; + production service traffic remains unavailable +- C17Z10 peer connection intent planner boundary is implemented and + test-proven; production service traffic remains unavailable +- C17Z11 peer connection manager runtime boundary is implemented and + test-proven; production service traffic remains unavailable +- C17Z12 rendezvous/relay control-plane contract is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z13 
rendezvous lease telemetry is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z14 rendezvous lease refresh contract is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z15 backend relay replacement policy is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z16 route/path decision artifact is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z17 node-side route generation tracker is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z18 synthetic route-health effective path runtime is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z19 synthetic route-health feedback scoring is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- C17Z20 node-side route-health feedback refresh is implemented and + docker-test-runtime-proven; production service traffic remains unavailable +- Cluster Authority plus node enrollment bootstrap polling are docker-test + lifecycle-smoke-proven; fresh install migration replay is fixed for + `cluster_admin_summaries` +- C18 VPN/IP tunnel service target design is completed as documentation only +- C18A VPN/IP tunnel control-plane data model foundation is implemented and + backend-test-proven +- C18B VPN/IP tunnel lease/fencing hardening is implemented and + backend-test-proven +- C18C VPN/IP tunnel node-agent desired-state consumption/reporting is + implemented and backend-test-proven +- Version Storage / Update Repository is documented as a future Fabric Core + service for signed release manifests, OS/arch artifacts, + stable/current/candidate channels, update-cache mirroring, node-agent + update supervision, rollback, and explicit data-structure migration bundles. 
+ Runtime updater behavior is not implemented. +- no next platform-core implementation step is automatically authorized after + C17Z20; choose the next narrow staged prompt explicitly before continuing +- preserve the proven RDP lifecycle behavior +- keep the current backend gateway available as the active/fallback implementation path + +The current phase is NOT: +- full mesh routing implementation +- full VPN orchestration +- multi-cluster runtime traffic handling +- production data-plane migration +- updater runtime +- video meetings +- final native client UI redesign + +Future mesh, VPN, multi-cluster, node-agent updater, and production realtime data-plane work must be introduced only through explicit, narrow, staged implementation prompts. + +Always keep the project production-oriented. Do not simplify it into a toy app. diff --git a/README.md b/README.md new file mode 100644 index 0000000..3c04cff --- /dev/null +++ b/README.md @@ -0,0 +1,119 @@ +# Remote Access Platform + +Production-oriented Secure Access Fabric platform. + +The project started as an RDP proxy, but the target architecture is broader: + +- multi-tenant control plane +- direct realtime data plane +- service adapters for RDP now and VNC/SSH/VPN/file/video later +- native Access Clients +- future secure mesh / node-agent / updater / connector model + +RDP is the first proven service baseline. RDP work is currently paused by +product decision while the project moves to the Secure Access Fabric +platform-core foundation: clusters, node enrollment, node-agent identity, role +assignments, and platform administration. + +## Current Baseline + +- Backend: Go control plane with PostgreSQL as source of truth and Redis for + live coordination/routing only. +- Worker: active C++ RDP Adapter worker based on FreeRDP. +- Windows client: C# / WPF native Access Client. +- Data plane: direct worker WSS for realtime RDP when available, backend + WebSocket gateway retained as fallback/debug. 
+- Current test Docker host: `docker-test` / `192.168.200.61`. +- Current test Docker deployment for Stage 5.2 proof: + `rap-backend-smoke:stage5-2-download` and + `rap-rdp-worker:stage5-2-download`. + +See the current audit and baseline matrix before starting new work: + +- `docs/audits/PROJECT_AUDIT_2026-04-26.md` +- `docs/audits/CURRENT_BASELINE_MATRIX.md` + +## Proven RDP Capabilities + +- real RDP connect through worker +- active/detach/reattach/takeover/terminate lifecycle +- takeover without remote session recreation +- worker-death/orphan-active-session recovery +- direct worker WSS data plane +- binary direct render frames +- backend gateway JSON/base64 fallback +- Windows client rendering and input +- text clipboard with policy enforcement +- client-to-server file upload into controlled worker storage +- restricted transfer-drive visibility through `RAP_Transfers`, runtime-proven +- server-to-client file download implementation through + `RAP_Transfers\ToClient`; core direct/fallback data path is runtime-proven, + lifecycle blocking is runtime-proven, and manual desktop UI acceptance remains + pending + +## Repository Structure + +- `backend/` - Go control plane +- `workers/rdp-worker/` - active C++ RDP Adapter worker +- `workers/rdp-service-csharp/` - inactive research scaffold, not current runtime +- `clients/windows/` - Windows native Access Client +- `docs/architecture/` - target and staged architecture documents +- `docs/codex/` - current Codex status and next-step prompts +- `docs/audits/` - current audits and baseline matrices +- `scripts/` - smoke and helper scripts +- `deploy/` - deployment assets +- `web-admin/` - future/admin UI area + +## Read Order + +1. `CODEX_CONTEXT.md` +2. `docs/audits/PROJECT_AUDIT_2026-04-26.md` +3. `docs/audits/CURRENT_BASELINE_MATRIX.md` +4. `docs/codex/CURRENT_STATUS.md` +5. `docs/codex/ARCHITECTURE_GUARDRAILS.md` +6. `docs/architecture/RDP_ADAPTER_RUNTIME.md` +7. `docs/architecture/DATA_PLANE_V1.md` +8. 
`docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md` +9. `docs/codex/NEXT_STEP_PROMPT.md` + +Do not use `docs/_legacy_v1` for implementation decisions. Legacy files are +historical reference only. + +## Current Next Step + +RDP work is paused. Platform-core stages C1-C9 are implemented and verified: +cluster/node model foundation, node enrollment hardening, native +`rap-node-agent` MVP, Platform Admin Console MVP, service workload supervision +contract, mesh control-plane preparation, Mesh MVP skeleton, multi-cluster +hardening, and organization admin foundation. + +Planning baseline: + +- `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md` +- `artifacts/c1-cluster-node-foundation-report.md` +- `artifacts/c2-node-enrollment-hardening-report.md` +- `artifacts/c3-rap-node-agent-mvp-report.md` +- `artifacts/c4-platform-admin-console-report.md` +- `artifacts/c5-service-workload-supervision-contract-report.md` +- `artifacts/c6-mesh-control-plane-preparation-report.md` +- `artifacts/c7-mesh-mvp-skeleton-report.md` +- `artifacts/c8-multi-cluster-hardening-report.md` +- `artifacts/c9-organization-admin-foundation-report.md` + +Stage C1 implemented the backend foundation for: + +- explicit clusters +- cluster memberships +- node join tokens with hashed tokens only +- node join requests +- node identity/certificate metadata +- node role assignments +- node heartbeat/latest health +- cluster audit events +- safe migration/backfill of existing node data into a default cluster + +Recommended next step is a decision between platform auth hardening for +web-admin, node PKI hardening, node-agent runtime hardening, mesh runtime DP-0, +or returning to paused RDP work. + +Do not start all continuation paths at once. 
diff --git a/agents/rap-node-agent/Dockerfile b/agents/rap-node-agent/Dockerfile new file mode 100644 index 0000000..cc31f74 --- /dev/null +++ b/agents/rap-node-agent/Dockerfile @@ -0,0 +1,13 @@ +FROM golang:1.23-bookworm AS build + +WORKDIR /src +COPY agents/rap-node-agent/go.mod ./ +RUN go mod download +COPY agents/rap-node-agent/ ./ +RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/rap-node-agent ./cmd/rap-node-agent + +FROM gcr.io/distroless/static-debian12:nonroot + +COPY --from=build /out/rap-node-agent /usr/local/bin/rap-node-agent +USER nonroot:nonroot +ENTRYPOINT ["/usr/local/bin/rap-node-agent"] diff --git a/agents/rap-node-agent/README.md b/agents/rap-node-agent/README.md new file mode 100644 index 0000000..b355b5e --- /dev/null +++ b/agents/rap-node-agent/README.md @@ -0,0 +1,487 @@ +# rap-node-agent + +Native node agent MVP for the Secure Access Fabric. + +Status: Stage C17Z18 synthetic route-health effective path boundary. + +This agent is intentionally native. Containers may package service workloads, +but the host-level node identity belongs to `rap-node-agent`. 
+ +## Current Scope + +Implemented: + +- config loading from flags/environment +- local identity state file +- enrollment request client +- heartbeat client +- capability/facts payload +- status-only service reporting payload +- mesh control-channel skeleton +- route-health message skeleton +- relay skeleton that refuses production payload forwarding +- disabled-by-default synthetic mesh runtime for `fabric.probe` / + `fabric.probe_ack` +- direct and single-relay synthetic route tests +- synthetic `fabric.route_health` / `fabric.route_health_ack` +- local route success/failure observations +- fallback route selection for test topology +- route cache invalidation on version changes +- synthetic relay envelope validation +- per-channel bounded queues for synthetic traffic +- QoS dequeue order: `fabric_control`, then `route_control`, then `telemetry` +- telemetry-only stale message drop under backpressure +- reliable fabric/control queue rejection when full +- bounded non-production `synthetic.echo` test-service path +- direct, single-relay, and forced-fallback test-service proofs +- live HTTP peer transport for synthetic mesh envelopes +- disabled-by-default synthetic mesh HTTP endpoint in `rap-node-agent` +- `mesh-live-smoke` harness proving direct and single-relay synthetic traffic + over real local HTTP endpoints +- scoped synthetic mesh config file loading for peer endpoints and routes +- Control Plane synthetic mesh config read fallback when no local scoped config + file is set +- synthetic route-health observations reported to the Control Plane when test + flags allow synthetic links +- explicit production mesh forwarding gate config; production forwarding still + has no runtime implementation and remains unavailable +- route-bound production mesh envelope contract and fail-closed validation on + `/mesh/v1/forward` +- metadata-only production envelope observation hook for valid envelopes, still + without forwarding payloads +- bounded metadata-only production 
envelope observation sink for accepted + observations +- disabled-by-default node-agent wiring for the bounded observation sink +- local metrics for the bounded observation sink without exposing observation + records +- local node-agent logging for bounded observation sink metrics +- change-driven suppression for unchanged bounded observation sink metrics logs +- explicit local log distinction between production forwarding gate state and + production forwarding runtime state +- node-scoped rendezvous lease refresh through Control Plane synthetic config +- stale relay withdrawal/reselection telemetry +- relay replacement contract reporting for stale rendezvous relays +- route/path decision contract reporting for control-plane route generations +- route generation apply/withdraw tracking for control-plane path decisions +- synthetic route-health route config refresh from Control Plane path + decisions +- route-health expected/observed effective path drift reporting +- maximum capacity guard for the local production observation sink +- panic-safe fail-closed production envelope observation wrapper +- explicit `4096` byte payload boundary for validated production + fabric-control envelopes +- explicit future-skew boundary for validated production envelope `created_at` +- scoped synthetic peer endpoint candidate config with reachability, + NAT/connectivity hints, priority, policy tags, and metadata +- deterministic local peer endpoint candidate scoring model for synthetic + config candidates +- optional local health observation overlay for endpoint candidate scoring +- gate-controlled production `fabric.control` direct next-hop delivery +- route-path-bound production `fabric.control` multi-hop forwarding +- local metadata-only production `fabric.control` forwarding event logs +- route-config-bound production `fabric.control` forwarding validation +- scoped peer directory and bounded recovery seed config parsing/validation +- node-local peer cache with bounded warm peer 
health probes +- advertised mesh endpoint reporting through heartbeat metadata +- multiple advertised endpoint candidates, including private/corporate LAN +- peer connection state machine for warm-peer health +- bounded peer recovery planner over peer cache and connection states +- peer connection intent planner with transport readiness classification +- peer connection manager for real control-plane health over reusable + HTTP keep-alive transport +- route-health effective-path runtime through replacement relay control paths + +Not implemented yet: + +- mesh packet routing +- production mesh service traffic +- VPN runtime +- production workload supervision +- certificate issuance/rotation +- updater runtime +- privileged host route/firewall control + +## Build + +```powershell +cd agents\rap-node-agent +go test ./... +go build -o bin\rap-node-agent.exe .\cmd\rap-node-agent +go build -o bin\mesh-live-smoke.exe .\cmd\mesh-live-smoke +``` + +## First Enrollment + +Create a join token from the platform control plane, then run: + +```powershell +.\bin\rap-node-agent.exe ` + -backend-url http://192.168.200.61:8080/api/v1 ` + -cluster-id <cluster-id> ` + -join-token <join-token> ` + -node-name test-node-1 ` + -state-dir C:\ProgramData\RapNodeAgent +``` + +The agent submits a pending join request and exits. It does not self-activate. +A platform admin must approve the join request. + +## Enrollment Approval + +When the agent enrolls, it stores the returned `pending_join_request_id` and +polls the Control Plane bootstrap endpoint until the platform owner approves +the request or the enrollment timeout expires. After approval, the agent +verifies the signed bootstrap contract and writes the approved `node_id`, +`cluster_id`, `identity_status=active`, `cluster_authority_public_key`, and +`cluster_authority_fingerprint` into `identity.json`. + +Future C3 hardening can add signed node certificates and automatic secure +certificate material exchange. 
+ +Then run the agent again: + +```powershell +.\bin\rap-node-agent.exe ` + -backend-url http://192.168.200.61:8080/api/v1 ` + -state-dir C:\ProgramData\RapNodeAgent +``` + +It sends periodic heartbeats to: + +```text +/api/v1/clusters/{clusterID}/nodes/{nodeID}/heartbeats +``` + +## Environment Variables + +- `RAP_BACKEND_URL` +- `RAP_CLUSTER_ID` +- `RAP_CLUSTER_AUTHORITY_PUBLIC_KEY` +- `RAP_CLUSTER_AUTHORITY_FINGERPRINT` +- `RAP_JOIN_TOKEN` +- `RAP_NODE_NAME` +- `RAP_NODE_STATE_DIR` +- `RAP_WORKLOAD_SUPERVISION_ENABLED` +- `RAP_HEARTBEAT_INTERVAL_SECONDS` +- `RAP_ENROLLMENT_POLL_INTERVAL_SECONDS` +- `RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS` +- `RAP_MESH_SYNTHETIC_RUNTIME_ENABLED` +- `RAP_MESH_LISTEN_ADDR` +- `RAP_MESH_ADVERTISE_ENDPOINT` +- `RAP_MESH_ADVERTISE_ENDPOINTS_JSON` +- `RAP_MESH_ADVERTISE_TRANSPORT` +- `RAP_MESH_CONNECTIVITY_MODE` +- `RAP_MESH_NAT_TYPE` +- `RAP_MESH_REGION` +- `RAP_MESH_SYNTHETIC_CONFIG` +- `RAP_MESH_PEER_ENDPOINTS_JSON` +- `RAP_MESH_SYNTHETIC_ROUTES_JSON` +- `RAP_MESH_PRODUCTION_FORWARDING_ENABLED` +- `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` + +`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED` defaults to `false`. It gates only the +C17A/C17B/C17C/C17D/C17E synthetic probe, route-health, relay scheduling, +bounded `synthetic.echo` test-service runtime, and live synthetic HTTP endpoint. +It must not be used for RDP, VPN, file, video, or other production service +traffic. + +`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. While service runtime +supervision is still a stub, the agent does not poll desired workloads or report +workload status unless this flag is explicitly enabled. + +`RAP_MESH_LISTEN_ADDR` starts the C17E/C17F/C17G synthetic HTTP endpoint only when +`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true`. `RAP_MESH_SYNTHETIC_CONFIG` points to +a scoped synthetic mesh config snapshot and is preferred over debug JSON. +`RAP_MESH_PEER_ENDPOINTS_JSON` is a JSON object mapping peer node IDs to +endpoint URLs. 
`RAP_MESH_SYNTHETIC_ROUTES_JSON` is a JSON array of synthetic +route objects. If no local scoped config file is set, the agent asks the +Control Plane for: + +```text +/clusters/{clusterID}/nodes/{nodeID}/mesh/synthetic-config +``` + +The JSON variables are debug fallback only. + +Control Plane synthetic config with `authority_required=true` must include a +signed `authority_payload` / `authority_signature` envelope and a +`cluster_authority` descriptor. The agent verifies the signature, validates the +config hash, and rejects mismatched pinned authority values when +`RAP_CLUSTER_AUTHORITY_PUBLIC_KEY`, `RAP_CLUSTER_AUTHORITY_FINGERPRINT`, or the +same fields in `identity.json` are set. + +`RAP_MESH_PRODUCTION_FORWARDING_ENABLED` defaults to `false`. It is a future +production-forwarding gate only. Turning it on does not enable production mesh +payload forwarding; `/mesh/v1/forward` still returns an unavailable runtime +response after validating the route-bound production envelope contract, until +a later approved production mesh stage implements route-bound, policy-bound +forwarding. + +The production envelope contract requires route, hop, TTL, expiry, payload +length, and SHA-256 payload hash fields. C17J accepts only the +`fabric_control` channel class and `fabric.control` message type for +validation. RDP, VPN, render, file, video, and service workload channels are +rejected. + +C17K adds a local metadata-only observation hook after successful production +envelope validation. Observations include route/message/hop/channel metadata and +payload length/hash, not the payload body. Observation failure fails closed, and +the endpoint still does not forward payloads. + +C17L adds a bounded in-memory observation sink for accepted metadata-only +observations. The sink drops the oldest observation when full and still stores +no payload bodies. + +`RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` defaults to `0`. 
When set above +zero, C17M wires the bounded metadata-only sink into the node-agent mesh server. +This remains local-only, exposes no read API, stores no payload bodies, and +does not enable production forwarding. C17R rejects values above `10000`. + +C17N adds local sink metrics: configured capacity, current depth, accepted +total, and dropped-oldest total. Metrics do not expose observation records, +route IDs, message IDs, hashes, payload metadata, or payload bodies. + +C17O logs those aggregate metrics locally from the node-agent loop when the +sink is explicitly enabled. This does not add a read API or Control Plane +reporting. + +C17P logs aggregate sink metrics only when they change, so steady heartbeat +loops do not repeat identical local metrics lines. + +C17Q logs `production_forwarding_gate_enabled` separately from +`production_forwarding_runtime_enabled`. The runtime field remains `false`; +turning on the gate still does not enable production forwarding. + +C17S makes production envelope observation panic-safe. Observer errors and +observer panics both fail closed as observation failure; forwarding remains +unavailable. + +C17T limits validated production `fabric.control` envelope payloads to 4096 +bytes. Oversized envelopes are rejected before observation. + +C17U rejects production `fabric.control` envelopes whose `created_at` is more +than one minute in the future. + +C17V adds scoped peer endpoint candidates to synthetic mesh config. Candidate +entries describe possible per-node endpoints with transport, address, +reachability, NAT type, connectivity mode, priority, policy tags, verification +time, and metadata. They are model/config hints only; no production route +scoring, NAT traversal, shortcut routing, or forwarding runtime is implemented. + +C17W adds deterministic local scoring for scoped endpoint candidates. 
Scoring +uses transport, reachability, connectivity mode, NAT type, priority, preferred +region, policy tags, channel class, and verification age. It returns ranked +candidates and reason labels only; it does not select production routes, open +connections, perform NAT traversal, or forward payloads. + +C17X extends candidate scoring with optional local health observations keyed by +`endpoint_id`. Observations can contribute latency, success/failure history, +recent failure reason, reliability score, and freshness/staleness signals. +The score remains advisory only and is not wired into production forwarding. + +C17Z adds the first narrow production forwarding runtime. When +`RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`, `/mesh/v1/forward` can deliver +route-bound `fabric.control` envelopes at the local destination or forward them +to a direct next hop from explicit peer endpoint config. Service channels, +RDP/VPN/file/video payloads, arbitrary relay forwarding, and multi-hop +production route execution remain unavailable. + +C17Z1 adds route-path-bound multi-hop forwarding for production +`fabric.control` only. Envelopes may carry `route_path` and +`visited_node_ids`; each relay validates its path position, forwards only to +the next route-path node, updates TTL/hop/visited metadata, and rejects loops. +Service payloads remain unavailable. + +C17Z2 emits local `mesh_production_forward_event` logs for production +`fabric.control` forwarding outcomes: accepted, forwarded, delivered, and +rejected. Logs include route/message/hop/channel/status/reason/TTL/hop count/ +route path length/visited count/payload length metadata only. Payload bodies +are not logged, no observation read API is added, and service payloads remain +unavailable. + +C17Z3 binds production `fabric.control` forwarding to loaded scoped or +Control Plane route config when routes are available locally. 
Configured +envelopes must match `route_id`, cluster, source, destination, route path, +next hop, allowed channel, expiry, max TTL, and max hop count before +forwarding. If no route config is present, existing C17Z1 behavior is +preserved. Service payloads remain unavailable. + +C17Z4 adds scoped peer directory and recovery seed config. `peer_directory` +describes only peers needed by the node-scoped mesh config. `recovery_seeds` +is an explicit, bounded bootstrap/recovery list and is not a full cluster node +list. The node-agent parses and validates these fields, but does not yet +implement a persistent connection manager, NAT traversal, or +relay/rendezvous runtime. + +C17Z5 turns scoped peer directory and recovery seed config into node-local +runtime `PeerCache` state. The cache builds a bounded warm peer set from +route-adjacent peers, recovery seeds, peer endpoints, and endpoint candidates. +When synthetic mesh testing is enabled, the node-agent probes warm peers with +`/mesh/v1/health` and reports metadata-only mesh-link observations. This is not +a persistent connection manager and does not forward service payloads. + +C17Z6 adds advertised mesh endpoint reporting. When +`RAP_MESH_ADVERTISE_ENDPOINT` is set, node-agent includes a +`mesh_endpoint_report` in heartbeat metadata with transport, connectivity mode, +NAT hint, region, observed time, and endpoint candidate metadata. Control Plane +can project the latest reported endpoint into node-scoped synthetic mesh config +for route-path peers. This does not perform automatic public IP discovery, +STUN/TURN/ICE NAT classification, or service payload forwarding. + +C17Z7 adds `RAP_MESH_ADVERTISE_ENDPOINTS_JSON` for multiple advertised +endpoints per node. Candidates can describe public, private, corporate/LAN, +outbound, or relay-style addresses. Endpoint scoring rewards `private-lan`, +`corp-lan`, and `same-site` policy tags, and peer cache can use the best +candidate address for warm-peer health probes. 
This supports corporate-network +cluster segments without enabling service payload forwarding. + +C17Z8 adds a node-local peer connection state machine on top of warm-peer +health probes. Warm peers move through `disconnected`, `connecting`, `ready`, +`degraded`, and `backoff`; repeated probe failures enter bounded backoff, and +successful probes recover to `ready`. Mesh-link observations include +metadata-only connection state. This is not a persistent socket/session manager +and does not forward service payloads. + +C17Z9 adds a node-local peer recovery planner. The node targets a bounded +stable ready-peer set, defaulting to three connectable peers when available, +instead of probing every known cluster node. When ready peers fall below target, +the planner selects bounded recovery probes from warm peers, recovery seeds, +and other connectable scoped peers, skipping active backoff entries. Heartbeats +include metadata-only `mesh_peer_recovery_report` state. This is not persistent +connection transport, NAT traversal, relay/rendezvous runtime, or service +payload forwarding. + +C17Z10 adds a node-local peer connection intent planner over the C17Z9 recovery +plan. It classifies bounded peer work as `maintain`, `probe`, or `recover`, +and classifies transport readiness as `direct`, `private_lan`, +`corporate_lan`, `outbound_only`, or `relay_required`. Heartbeats include +metadata-only `mesh_peer_connection_intent_report` counts. This is not +persistent connection transport, STUN/TURN/ICE, NAT traversal, relay runtime, +or service payload forwarding. + +C17Z11 adds the first real node-local peer connection manager for mesh +control-plane health. It uses a reusable HTTP keep-alive client to probe +direct/private/corporate peer endpoints selected by C17Z10 intents, updates +the shared peer connection tracker, and records `waiting_rendezvous` for +outbound-only or relay-required peers. Heartbeats include metadata-only +`mesh_peer_connection_manager_report` state. 
This is not STUN/TURN/ICE, +relay/rendezvous runtime, route lease generation, VPN runtime, or service +payload forwarding. + +C17Z12 adds a node-scoped rendezvous/relay control-plane lease contract for +peers that would otherwise remain `waiting_rendezvous`. The agent consumes +`rendezvous_leases`, resolves matching intents into `relay_control`, probes the +relay node `/mesh/v1/health`, and records `relay_ready` for the peer control +path. This remains control-plane health only and does not enable RDP/VPN/file/ +video/service payload forwarding, arbitrary relay packet forwarding, +STUN/TURN/ICE, or host networking changes. + +C17Z13 adds heartbeat telemetry for rendezvous lease admission and renewal +posture. The agent emits `mesh_rendezvous_lease_report` with local role, +relay/peer admission counts, TTL, renewal-after time, renewal-needed status, +`relay_ready`, and explicit no-payload boundary flags. This remains +metadata-only control-plane telemetry and does not enable service payload +forwarding. + +C17Z14 adds a control-plane refresh contract for rendezvous leases. When a +lease is renewal-needed, expired, invalid, or tied to a stale relay state, the +agent reloads node-scoped synthetic config from Control Plane, updates the +running peer cache/route/lease state, and reports refresh counters plus stale +relay withdrawal/reselection fields. This remains control-plane health only +and does not enable service payload forwarding. + +C17Z15 adds the node side of backend relay replacement policy. The agent +advertises the relay replacement contract capability and emits +`c17z15.mesh_rendezvous_lease_report.v1`; stale relay state is matched to the +exact rendezvous lease/relay when that metadata is present, so an alternate +replacement lease for the same peer is not treated as stale by association. +This remains control-plane health only and does not enable service payload +forwarding. + +C17Z16 adds route/path decision reporting. 
The agent consumes +`route_path_decisions` from Control Plane synthetic config, keeps the latest +control-plane generation in local state, and emits +`c17z18.mesh_route_path_decision_report.v1` with effective hops, previous/next +hop, selected replacement relay, generation, and no-payload boundary flags. +This remains metadata-only route planning and does not enable service payload +forwarding. + +C17Z17 adds node-side route generation tracking for Control Plane +`route_path_decisions`. The agent emits +`c17z18.mesh_route_generation_report.v1` with active, applied, unchanged, and +withdrawn decision counts, total counters, generation change state, active +decision details, and withdrawn decision details. When the first observed +config already contains a stale relay replacement, the tracker emits a +`withdrawn_by_replacement` record for the old relay path. This remains +metadata-only route planning and does not enable service payload forwarding. + +C17Z18 applies Control Plane `route_path_decisions` to synthetic route-health +route config only. The agent keeps base routes separate from route-health +routes, periodically refreshes scoped config, emits +`c17z18.mesh_route_health_config_report.v1`, and reports route-health +observations with expected/observed hops and drift status. This probes +replacement relay effective paths for control-plane health only and does not +enable service payload forwarding. 
+ +Scoped synthetic config shape: + +```json +{ + "schema_version": "c17z18.synthetic.v1", + "cluster_id": "cluster-1", + "local_node_id": "node-a", + "config_version": "config-v1", + "peer_directory_version": "peers-v1", + "policy_version": "policy-v1", + "peer_endpoints": { + "node-b": "http://127.0.0.1:19002" + }, + "peer_endpoint_candidates": { + "node-b": [ + { + "endpoint_id": "node-b-public", + "node_id": "node-b", + "transport": "direct_tcp_tls", + "address": "203.0.113.20:443", + "reachability": "public", + "nat_type": "restricted", + "connectivity_mode": "direct", + "priority": 10 + } + ] + }, + "routes": [], + "route_path_decisions": { + "schema_version": "c17z18.route_path_decisions.v1", + "decisions": [] + } +} +``` + +## C17E Live Synthetic Smoke + +Run: + +```powershell +cd agents\rap-node-agent +go run .\cmd\mesh-live-smoke +``` + +Expected: + +- scoped synthetic config loads +- direct `node-a -> node-b` synthetic probe succeeds +- relay `node-a -> node-r -> node-b` synthetic probe succeeds +- bounded `synthetic.echo` test-service succeeds +- `production_forwarding=false` + +## Safety Rules + +- The agent never assigns roles to itself. +- The agent reports capabilities only. +- Platform policy assigns roles. +- No RDP/VPN/production service traffic is carried by the C17A-C17Z18 staged + mesh runtime. +- Production forwarding remains disabled by default and limited to + `fabric.control` when explicitly enabled. +- No privileged operations are performed by the current agent. 
diff --git a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go new file mode 100644 index 0000000..b7f5e07 --- /dev/null +++ b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go @@ -0,0 +1,186 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "time" + + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh" +) + +type smokeNode struct { + Local mesh.PeerIdentity + Runtime *mesh.SyntheticRuntime + URL string + server *httptest.Server +} + +type smokeReport struct { + Stage string `json:"stage"` + ProductionForwarding bool `json:"production_forwarding"` + ScopedConfigLoaded bool `json:"scoped_config_loaded"` + DirectProbeAccepted bool `json:"direct_probe_accepted"` + DirectPath []string `json:"direct_path"` + RelayProbeAccepted bool `json:"relay_probe_accepted"` + RelayPath []string `json:"relay_path"` + TestServiceAccepted bool `json:"test_service_accepted"` + TestServiceEchoPayload string `json:"test_service_echo_payload"` + PeerEndpoints map[string]any `json:"peer_endpoints"` +} + +func main() { + report, err := run(context.Background()) + if err != nil { + fmt.Fprintf(os.Stderr, "mesh live smoke failed: %v\n", err) + os.Exit(1) + } + payload, err := json.MarshalIndent(report, "", " ") + if err != nil { + fmt.Fprintf(os.Stderr, "marshal report: %v\n", err) + os.Exit(1) + } + fmt.Println(string(payload)) +} + +func run(ctx context.Context) (smokeReport, error) { + nodeA := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + defer nodeA.Close() + nodeR := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}) + defer nodeR.Close() + nodeB := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}) + defer nodeB.Close() + + directRoute := smokeRoute("route-direct", []string{"node-a", "node-b"}) + relayRoute := smokeRoute("route-relay", []string{"node-a", 
"node-r", "node-b"}) + routes := []mesh.SyntheticRoute{directRoute, relayRoute} + nodeAConfigPath, err := writeSmokeScopedConfig(nodeA.Local, map[string]string{ + "node-r": nodeR.URL, + "node-b": nodeB.URL, + }, routes) + if err != nil { + return smokeReport{}, err + } + nodeAConfig, err := mesh.LoadScopedSyntheticConfig(nodeAConfigPath, nodeA.Local) + if err != nil { + return smokeReport{}, fmt.Errorf("load node-a scoped config: %w", err) + } + + nodeA.Runtime = smokeRuntime(nodeA.Local, nodeAConfig.Routes, nodeAConfig.PeerEndpoints) + nodeR.Runtime = smokeRuntime(nodeR.Local, routes, map[string]string{ + "node-b": nodeB.URL, + }) + nodeB.Runtime = smokeRuntime(nodeB.Local, routes, map[string]string{}) + + directAck, err := nodeA.Runtime.SendProbe(ctx, directRoute.RouteID, mesh.SyntheticChannelFabricControl, "smoke-direct") + if err != nil { + return smokeReport{}, fmt.Errorf("direct probe: %w", err) + } + relayAck, err := nodeA.Runtime.SendProbe(ctx, relayRoute.RouteID, mesh.SyntheticChannelFabricControl, "smoke-relay") + if err != nil { + return smokeReport{}, fmt.Errorf("relay probe: %w", err) + } + testService, err := nodeA.Runtime.SendTestService(ctx, relayRoute.RouteID, mesh.SyntheticChannelRouteControl, mesh.SyntheticTestServiceRequest{ + RequestID: "smoke-test-service", + OrganizationID: mesh.SyntheticDefaultTestOrganizationID, + ServiceType: mesh.SyntheticTestServiceType, + Payload: "hello-c17e", + SentAt: time.Now().UTC(), + }) + if err != nil { + return smokeReport{}, fmt.Errorf("test service: %w", err) + } + + return smokeReport{ + Stage: "C17F scoped synthetic config plus live HTTP transport", + ProductionForwarding: false, + ScopedConfigLoaded: nodeAConfig.ConfigVersion == "smoke-config-v1", + DirectProbeAccepted: directAck.MessageType == mesh.SyntheticMessageProbeAck, + DirectPath: decodeProbePath(directAck), + RelayProbeAccepted: relayAck.MessageType == mesh.SyntheticMessageProbeAck, + RelayPath: decodeProbePath(relayAck), + TestServiceAccepted: 
testService.Ack.MessageType == mesh.SyntheticMessageTestServiceAck, + TestServiceEchoPayload: testService.Response.EchoPayload, + PeerEndpoints: map[string]any{ + "node-a": nodeA.URL, + "node-r": nodeR.URL, + "node-b": nodeB.URL, + }, + }, nil +} + +func writeSmokeScopedConfig(local mesh.PeerIdentity, peers map[string]string, routes []mesh.SyntheticRoute) (string, error) { + path := filepath.Join(os.TempDir(), "rap-c17e-node-a-scoped-mesh.json") + payload, err := json.Marshal(mesh.ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: local.ClusterID, + LocalNodeID: local.NodeID, + ConfigVersion: "smoke-config-v1", + PeerDirectoryVersion: "smoke-peers-v1", + PolicyVersion: "smoke-policy-v1", + PeerEndpoints: peers, + Routes: routes, + }) + if err != nil { + return "", err + } + if err := os.WriteFile(path, payload, 0o600); err != nil { + return "", err + } + return path, nil +} + +func newSmokeNode(local mesh.PeerIdentity) *smokeNode { + node := &smokeNode{Local: local} + node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r) + })) + node.URL = node.server.URL + return node +} + +func (n *smokeNode) Close() { + if n.server != nil { + n.server.Close() + } +} + +func smokeRuntime(local mesh.PeerIdentity, routes []mesh.SyntheticRoute, peers map[string]string) *mesh.SyntheticRuntime { + return mesh.NewSyntheticRuntime(mesh.SyntheticRuntimeConfig{ + Enabled: true, + Local: local, + Routes: routes, + AllowedChannels: []string{ + mesh.SyntheticChannelFabricControl, + mesh.SyntheticChannelRouteControl, + }, + Transport: mesh.NewHTTPPeerTransport(peers), + }) +} + +func smokeRoute(routeID string, hops []string) mesh.SyntheticRoute { + return mesh.SyntheticRoute{ + RouteID: routeID, + ClusterID: "cluster-1", + SourceNodeID: hops[0], + DestinationNodeID: hops[len(hops)-1], + Hops: hops, + AllowedChannels: 
[]string{mesh.SyntheticChannelFabricControl, mesh.SyntheticChannelRouteControl}, + MaxTTL: 8, + MaxHops: 8, + ExpiresAt: time.Now().UTC().Add(time.Hour), + RouteVersion: "route-v1", + PolicyVersion: "policy-v1", + PeerDirectoryVersion: "peers-v1", + } +} + +func decodeProbePath(envelope mesh.SyntheticEnvelope) []string { + var payload mesh.SyntheticProbeAckPayload + _ = json.Unmarshal(envelope.Payload, &payload) + return payload.Path +} diff --git a/agents/rap-node-agent/cmd/rap-node-agent/main.go b/agents/rap-node-agent/cmd/rap-node-agent/main.go new file mode 100644 index 0000000..c78daf5 --- /dev/null +++ b/agents/rap-node-agent/cmd/rap-node-agent/main.go @@ -0,0 +1,2862 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "log" + "net/http" + "os" + "os/signal" + "path/filepath" + "sort" + "strings" + "syscall" + "time" + + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/client" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/config" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/state" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/supervisor" +) + +const ( + meshRendezvousLeaseRenewalWindow = time.Minute + meshRendezvousLeaseRefreshBackoff = 30 * time.Second + meshSyntheticConfigRefreshInterval = 20 * time.Second + meshRouteHealthFeedbackRefreshBackoff = 5 * time.Second + maxMeshRendezvousLeaseReportEntries = 20 + meshRendezvousLeaseReportSchema = "c17z18.mesh_rendezvous_lease_report.v1" + meshRendezvousLeaseTelemetryCapability = "mesh_rendezvous_lease_telemetry" + meshRendezvousLeaseRefreshCapability = "mesh_rendezvous_lease_refresh_contract" + meshRendezvousRelayReplacementCapability 
= "mesh_rendezvous_relay_replacement_contract" + meshRoutePathDecisionReportSchema = "c17z18.mesh_route_path_decision_report.v1" + meshRoutePathDecisionCapability = "mesh_route_path_decision_contract" + meshRouteGenerationReportSchema = "c17z18.mesh_route_generation_report.v1" + meshRouteGenerationTrackerCapability = "mesh_route_generation_tracker" + meshRouteHealthConfigReportSchema = "c17z20.mesh_route_health_config_report.v1" + meshRouteHealthConfigCapability = "mesh_route_health_config_from_path_decisions" + meshRouteHealthFeedbackRefreshSchema = "c17z20.mesh_route_health_feedback_refresh_report.v1" + meshRouteHealthFeedbackRefreshCapability = "mesh_route_health_feedback_refresh_contract" +) + +func main() { + cfg, err := config.Load(os.Args[1:], nil) + if err != nil { + log.Fatalf("load config: %v", err) + } + + signalCtx, stopSignals := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stopSignals() + ctx, cancel := context.WithCancel(signalCtx) + defer cancel() + + identity, err := state.LoadOrCreate(cfg.StateDir, cfg.ClusterID, cfg.NodeName) + if err != nil { + log.Fatalf("load identity state: %v", err) + } + + api := client.New(cfg.BackendURL) + if identity.NodeID == "" { + identity, err = ensureApprovedIdentity(ctx, cfg, identity, api) + if err != nil { + log.Fatalf("bootstrap node identity: %v", err) + } + if identity.NodeID == "" { + log.Printf("enrollment still pending: join_request_id=%s identity_file=%s", identity.PendingJoinRequestID, filepath.Join(cfg.StateDir, state.FileName)) + return + } + } + + log.Printf("node-agent started: node_id=%s cluster_id=%s backend=%s", identity.NodeID, identity.ClusterID, cfg.BackendURL) + meshState, stopMeshEndpoint, err := startSyntheticMeshEndpoint(ctx, cancel, cfg, identity, api) + if err != nil { + log.Fatalf("start synthetic mesh endpoint: %v", err) + } + defer stopMeshEndpoint() + + supervisor := supervisor.StubSupervisor{Version: agent.Version} + startedAt := time.Now().UTC() 
+ ticker := time.NewTicker(cfg.HeartbeatInterval) + defer ticker.Stop() + for { + flags, err := sendHeartbeat(ctx, api, cfg, identity, meshState) + if err != nil { + log.Printf("heartbeat failed: %v", err) + } + if flags.Enabled && flags.TelemetryEnabled { + if err := api.ReportTelemetry(ctx, identity.ClusterID, identity.NodeID, agent.TelemetryPayload(identity, startedAt)); err != nil { + log.Printf("telemetry failed: %v", err) + } else { + log.Printf("telemetry sent: node_id=%s cluster_id=%s scopes=%v", identity.NodeID, identity.ClusterID, flags.AppliedScopes) + } + } + if cfg.WorkloadSupervisionEnabled { + if err := reportWorkloadStatus(ctx, api, supervisor, identity); err != nil { + log.Printf("workload status failed: %v", err) + } + } + logProductionObservationSinkMetrics(meshState) + if flags.Enabled && flags.SyntheticLinksEnabled { + if err := api.ReportMeshLink(ctx, identity.ClusterID, agent.MeshSelfObservationPayload(identity)); err != nil { + log.Printf("mesh self-observation failed: %v", err) + } else { + log.Printf("mesh self-observation sent: node_id=%s cluster_id=%s scopes=%v", identity.NodeID, identity.ClusterID, flags.AppliedScopes) + } + if err := refreshRendezvousLeasesIfNeeded(ctx, cfg, identity, api, meshState, time.Now().UTC()); err != nil { + log.Printf("mesh rendezvous lease refresh failed: %v", err) + } + if err := refreshSyntheticMeshConfigIfDue(ctx, cfg, identity, api, meshState, time.Now().UTC()); err != nil { + log.Printf("mesh synthetic config refresh failed: %v", err) + } + if err := reportSyntheticRouteHealth(ctx, cfg, api, identity, meshState); err != nil { + log.Printf("mesh synthetic route health failed: %v", err) + } + if err := probeWarmPeerHealth(ctx, api, identity, meshState); err != nil { + log.Printf("mesh warm peer health failed: %v", err) + } + } + select { + case <-ctx.Done(): + log.Print("node-agent stopped") + return + case <-ticker.C: + } + } +} + +type joinRequestEnvelope struct { + ID string `json:"id"` +} + +type 
nodeApprovalAuthorityPayload struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + JoinRequestID string `json:"join_request_id"` + NodeID string `json:"node_id"` + NodeFingerprint string `json:"node_fingerprint"` + IdentityStatus string `json:"identity_status"` + HeartbeatEndpoint string `json:"heartbeat_endpoint"` + ApprovedByUserID string `json:"approved_by_user_id"` + IssuedAt time.Time `json:"issued_at"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` +} + +func ensureApprovedIdentity(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client) (state.Identity, error) { + clusterID := firstNonEmpty(identity.ClusterID, cfg.ClusterID) + if clusterID == "" { + return state.Identity{}, fmt.Errorf("cluster ID is required for enrollment") + } + identity.ClusterID = clusterID + if identity.PendingJoinRequestID == "" { + if cfg.JoinToken == "" { + return state.Identity{}, fmt.Errorf("join token is required for first enrollment") + } + response, err := api.Enroll(ctx, agent.EnrollmentPayload(clusterID, cfg.JoinToken, identity)) + if err != nil { + return state.Identity{}, fmt.Errorf("enroll node: %w", err) + } + joinRequestID, err := parseJoinRequestID(response.JoinRequest) + if err != nil { + return state.Identity{}, err + } + identity, err = state.MarkEnrollmentSubmitted(cfg.StateDir, clusterID, joinRequestID) + if err != nil { + return state.Identity{}, fmt.Errorf("persist pending enrollment: %w", err) + } + log.Printf("enrollment submitted: status=%s join_request_id=%s identity_file=%s", response.Status, joinRequestID, filepath.Join(cfg.StateDir, state.FileName)) + } + + deadline := time.Time{} + if cfg.EnrollmentPollTimeout > 0 { + deadline = time.Now().UTC().Add(cfg.EnrollmentPollTimeout) + } + for { + response, err := api.BootstrapEnrollment(ctx, identity.PendingJoinRequestID, client.EnrollmentBootstrapRequest{ + ClusterID: clusterID, 
+ NodeFingerprint: identity.NodeFingerprint, + PublicKey: identity.PublicKey, + }) + if err == nil { + switch response.Status { + case "approved": + if response.Bootstrap == nil { + return state.Identity{}, fmt.Errorf("approved enrollment missing bootstrap contract") + } + if err := verifyEnrollmentBootstrap(*response.Bootstrap, identity, cfg); err != nil { + return state.Identity{}, err + } + approved, err := state.MarkApprovedWithAuthority( + cfg.StateDir, + response.Bootstrap.NodeID, + response.Bootstrap.ClusterID, + response.Bootstrap.IdentityStatus, + response.Bootstrap.ClusterAuthority.PublicKey, + response.Bootstrap.ClusterAuthority.PublicKeyFingerprint, + ) + if err != nil { + return state.Identity{}, fmt.Errorf("persist approved identity: %w", err) + } + log.Printf("enrollment approved: node_id=%s cluster_id=%s authority=%s", approved.NodeID, approved.ClusterID, approved.ClusterAuthorityFingerprint) + return approved, nil + case "rejected", "cancelled": + return state.Identity{}, fmt.Errorf("enrollment %s", response.Status) + default: + log.Printf("enrollment waiting for approval: status=%s join_request_id=%s", response.Status, identity.PendingJoinRequestID) + } + } else { + log.Printf("enrollment bootstrap poll failed: %v", err) + } + if cfg.EnrollmentPollTimeout == 0 || (!deadline.IsZero() && !time.Now().UTC().Before(deadline)) { + return identity, nil + } + select { + case <-ctx.Done(): + return state.Identity{}, ctx.Err() + case <-time.After(cfg.EnrollmentPollInterval): + } + } +} + +func parseJoinRequestID(raw json.RawMessage) (string, error) { + var envelope joinRequestEnvelope + if err := json.Unmarshal(raw, &envelope); err != nil { + return "", fmt.Errorf("decode join request: %w", err) + } + if strings.TrimSpace(envelope.ID) == "" { + return "", fmt.Errorf("join request id missing from enrollment response") + } + return strings.TrimSpace(envelope.ID), nil +} + +func verifyEnrollmentBootstrap(bootstrap client.NodeBootstrap, identity state.Identity, 
cfg config.Config) error { + if bootstrap.ClusterAuthority == nil { + return fmt.Errorf("node bootstrap missing cluster authority") + } + if bootstrap.AuthoritySignature == nil || rawMessageEmpty(bootstrap.AuthorityPayload) { + return fmt.Errorf("node bootstrap missing authority payload or signature") + } + if bootstrap.ClusterID != identity.ClusterID || bootstrap.NodeID == "" || bootstrap.IdentityStatus == "" { + return fmt.Errorf("node bootstrap identity mismatch") + } + if bootstrap.ClusterAuthority.SchemaVersion != authority.AuthoritySchemaVersion || + bootstrap.ClusterAuthority.ClusterID != bootstrap.ClusterID || + bootstrap.ClusterAuthority.KeyAlgorithm != authority.AlgorithmEd25519 { + return fmt.Errorf("node bootstrap cluster authority descriptor mismatch") + } + if bootstrap.AuthoritySignature.KeyFingerprint != bootstrap.ClusterAuthority.PublicKeyFingerprint { + return fmt.Errorf("node bootstrap authority fingerprint mismatch") + } + if pinned := firstNonEmpty(identity.ClusterAuthorityFingerprint, cfg.ClusterAuthorityFingerprint); pinned != "" && pinned != bootstrap.ClusterAuthority.PublicKeyFingerprint { + return fmt.Errorf("node bootstrap pinned authority fingerprint mismatch") + } + if pinned := firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey); pinned != "" && pinned != bootstrap.ClusterAuthority.PublicKey { + return fmt.Errorf("node bootstrap pinned authority public key mismatch") + } + signature := authority.Signature{ + SchemaVersion: bootstrap.AuthoritySignature.SchemaVersion, + Algorithm: bootstrap.AuthoritySignature.Algorithm, + KeyFingerprint: bootstrap.AuthoritySignature.KeyFingerprint, + Signature: bootstrap.AuthoritySignature.Signature, + } + if err := authority.VerifyRaw(bootstrap.ClusterAuthority.PublicKey, bootstrap.AuthorityPayload, signature); err != nil { + return fmt.Errorf("verify node bootstrap authority signature: %w", err) + } + var payload nodeApprovalAuthorityPayload + if err := 
json.Unmarshal(bootstrap.AuthorityPayload, &payload); err != nil { + return fmt.Errorf("decode node bootstrap authority payload: %w", err) + } + if payload.SchemaVersion != "rap.cluster.node_approval.v1" || + payload.ClusterID != bootstrap.ClusterID || + payload.NodeID != bootstrap.NodeID || + payload.NodeFingerprint != identity.NodeFingerprint || + payload.IdentityStatus != bootstrap.IdentityStatus || + payload.HeartbeatEndpoint != bootstrap.HeartbeatEndpoint || + !payload.ControlPlaneOnly || + payload.ProductionForwarding { + return fmt.Errorf("node bootstrap authority payload mismatch") + } + if identity.PendingJoinRequestID != "" && payload.JoinRequestID != identity.PendingJoinRequestID { + return fmt.Errorf("node bootstrap authority payload join request mismatch") + } + return nil +} + +type syntheticMeshState struct { + Runtime *mesh.SyntheticRuntime + Routes []mesh.SyntheticRoute + RouteHealthRoutes []mesh.SyntheticRoute + Source string + PeerCache *mesh.PeerCache + RendezvousLeases []mesh.PeerRendezvousLease + RoutePathDecisions *client.RoutePathDecisionReport + RouteGenerationTracker *meshRouteGenerationTracker + ConfigVersion string + PeerDirectoryVersion string + PolicyVersion string + PeerConnections *mesh.PeerConnectionTracker + PeerConnectionManager *mesh.PeerConnectionManager + LastPeerRecoveryPlan *mesh.PeerRecoveryPlan + LastPeerConnectionIntent *mesh.PeerConnectionIntentPlan + LastConfigRefreshAt time.Time + LastLeaseRefresh *meshRendezvousLeaseRefreshState + LeaseRefreshAttempts int + LeaseRefreshSuccesses int + LeaseRefreshFailures int + LastRouteHealthRefresh *meshRouteHealthFeedbackRefreshState + RouteHealthRefreshAttempts int + RouteHealthRefreshSuccesses int + RouteHealthRefreshFailures int + RouteHealthRefreshSuppressed int + ProductionObservationSink *mesh.ProductionEnvelopeObservationSink + LastProductionSinkMetrics *mesh.ProductionEnvelopeObservationSinkMetrics +} + +type meshRendezvousLeaseRefreshState struct { + Status string + Reason 
string + Error string + AttemptedAt time.Time + CompletedAt time.Time + PreviousLeaseCount int + RefreshedLeaseCount int + RefreshNeededCount int + RenewalNeededCount int + ExpiredCount int + StaleRelayCount int + ConfigVersion string +} + +type meshRouteHealthFeedbackTrigger struct { + Reason string + RouteID string + PeerNodeID string + SelectedRelayID string + LinkStatus string + FailureReason string + DriftDetected bool + ObservedAt time.Time +} + +type meshRouteHealthFeedbackRefreshState struct { + Status string + Reason string + Error string + AttemptedAt time.Time + CompletedAt time.Time + RouteID string + PeerNodeID string + SelectedRelayID string + LinkStatus string + FailureReason string + DriftDetected bool + PreviousConfigVersion string + RefreshedConfigVersion string + PreviousRouteHealthRouteCount int + RefreshedRouteHealthRouteCount int +} + +type loadedSyntheticMeshConfig struct { + PeerEndpoints map[string]string + PeerEndpointCandidates map[string][]mesh.PeerEndpointCandidate + PeerDirectory []mesh.PeerDirectoryEntry + RecoverySeeds []mesh.PeerRecoverySeed + RendezvousLeases []mesh.PeerRendezvousLease + RoutePathDecisions *client.RoutePathDecisionReport + Routes []mesh.SyntheticRoute + Source string + ConfigVersion string + PeerDirectoryVersion string + PolicyVersion string +} + +func startSyntheticMeshEndpoint(ctx context.Context, cancel context.CancelFunc, cfg config.Config, identity state.Identity, api *client.Client) (*syntheticMeshState, func(), error) { + noop := func() {} + if !cfg.MeshSyntheticRuntimeEnabled { + return nil, noop, nil + } + if cfg.MeshListenAddr == "" { + log.Print("synthetic mesh runtime enabled, but RAP_MESH_LISTEN_ADDR is empty; endpoint not started") + return nil, noop, nil + } + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api) + if err != nil { + return nil, noop, err + } + peerEndpoints := 
loadedConfig.PeerEndpoints + routes := loadedConfig.Routes + routeHealthRoutes := routeHealthRoutesFromPathDecisions(routes, loadedConfig.RoutePathDecisions) + peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: local, + PeerEndpoints: loadedConfig.PeerEndpoints, + PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates, + PeerDirectory: loadedConfig.PeerDirectory, + RecoverySeeds: loadedConfig.RecoverySeeds, + RendezvousLeases: loadedConfig.RendezvousLeases, + Routes: loadedConfig.Routes, + WarmPeerLimit: mesh.DefaultWarmPeerLimit, + PreferredRegion: cfg.MeshRegion, + Now: time.Now().UTC(), + }) + peerCacheSnapshot := peerCache.Snapshot() + peerConnections := mesh.NewPeerConnectionTracker(peerCacheSnapshot, time.Now().UTC()) + peerConnectionSnapshot := peerConnections.Snapshot() + peerRecoveryPlan := mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{ + PeerCache: peerCacheSnapshot, + Connections: peerConnectionSnapshot, + TargetReadyPeers: mesh.DefaultStablePeerTarget, + MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit, + Now: time.Now().UTC(), + }) + peerConnectionIntentPlan := mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{ + PeerCache: peerCacheSnapshot, + RecoveryPlan: peerRecoveryPlan, + RendezvousLeases: loadedConfig.RendezvousLeases, + Now: time.Now().UTC(), + }) + peerConnectionManager := mesh.NewPeerConnectionManager(mesh.PeerConnectionManagerConfig{ + Local: local, + PeerCache: peerCache, + Tracker: peerConnections, + RendezvousLeases: loadedConfig.RendezvousLeases, + }) + routeGenerationTracker := newMeshRouteGenerationTracker(loadedConfig.RoutePathDecisions, time.Now().UTC()) + gateEnabled, runtimeEnabled := productionForwardingLogState(cfg) + log.Printf( + "synthetic mesh config loaded: source=%s node_id=%s cluster_id=%s peers=%d routes=%d peer_cache_peers=%d warm_peers=%d recovery_seeds=%d rendezvous_leases=%d peer_connection_states=%d peer_recovery_mode=%s peer_recovery_target_ready_peers=%d 
peer_connection_intents=%d rendezvous_required=%d rendezvous_resolved=%d production_forwarding_gate_enabled=%t production_forwarding_runtime_enabled=%t", + loadedConfig.Source, + identity.NodeID, + identity.ClusterID, + len(peerEndpoints), + len(routes), + peerCacheSnapshot.PeerCount, + peerCacheSnapshot.WarmPeerCount, + peerCacheSnapshot.RecoverySeedCount, + peerCacheSnapshot.RendezvousLeaseCount, + peerConnectionSnapshot.Total, + peerRecoveryPlan.Mode, + peerRecoveryPlan.TargetReadyPeers, + peerConnectionIntentPlan.IntentCount, + peerConnectionIntentPlan.RendezvousRequiredCount, + peerConnectionIntentPlan.RendezvousResolvedCount, + gateEnabled, + runtimeEnabled, + ) + runtime := mesh.NewSyntheticRuntime(mesh.SyntheticRuntimeConfig{ + Enabled: true, + Local: local, + Routes: routes, + RouteHealthRoutes: routeHealthRoutes, + Transport: mesh.NewHTTPPeerTransport(peerEndpoints), + Logger: func(entry mesh.SyntheticLogEntry) { + payload, err := json.Marshal(entry) + if err != nil { + log.Printf("mesh synthetic event marshal failed: %v", err) + return + } + log.Printf("mesh_synthetic_event=%s", string(payload)) + }, + }) + productionObservationSink := productionEnvelopeObservationSinkFromConfig(cfg) + var productionEnvelopeObserver mesh.ProductionEnvelopeObserver + if productionObservationSink != nil { + productionEnvelopeObserver = productionObservationSink.Observe + } + var productionForwardTransport mesh.ProductionForwardTransport + if cfg.MeshProductionForwardingEnabled { + productionForwardTransport = mesh.NewHTTPProductionForwardTransport(peerEndpoints) + } + server := &http.Server{ + Addr: cfg.MeshListenAddr, + Handler: mesh.Server{ + Local: local, + SyntheticRuntime: runtime, + ProductionForwardingEnabled: cfg.MeshProductionForwardingEnabled, + ProductionEnvelopeObserver: productionEnvelopeObserver, + ProductionForwardTransport: productionForwardTransport, + ProductionForwardLogger: func(entry mesh.ProductionForwardLogEntry) { + payload, err := 
json.Marshal(entry) + if err != nil { + log.Printf("mesh production forward event marshal failed: %v", err) + return + } + log.Printf("mesh_production_forward_event=%s", string(payload)) + }, + ProductionRoutes: routes, + }.Handler(), + ReadHeaderTimeout: 5 * time.Second, + } + go func() { + log.Printf( + "synthetic mesh endpoint starting: listen_addr=%s node_id=%s cluster_id=%s peers=%d routes=%d production_forwarding_gate_enabled=%t production_forwarding_runtime_enabled=%t", + cfg.MeshListenAddr, + identity.NodeID, + identity.ClusterID, + len(peerEndpoints), + len(routes), + gateEnabled, + runtimeEnabled, + ) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Printf("synthetic mesh endpoint failed: %v", err) + cancel() + } + }() + go func() { + <-ctx.Done() + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + if err := server.Shutdown(shutdownCtx); err != nil { + log.Printf("synthetic mesh endpoint shutdown failed: %v", err) + } + }() + return &syntheticMeshState{ + Runtime: runtime, + Routes: routes, + RouteHealthRoutes: routeHealthRoutes, + Source: loadedConfig.Source, + PeerCache: peerCache, + RendezvousLeases: loadedConfig.RendezvousLeases, + RoutePathDecisions: loadedConfig.RoutePathDecisions, + RouteGenerationTracker: routeGenerationTracker, + ConfigVersion: loadedConfig.ConfigVersion, + PeerDirectoryVersion: loadedConfig.PeerDirectoryVersion, + PolicyVersion: loadedConfig.PolicyVersion, + LastConfigRefreshAt: time.Now().UTC(), + PeerConnections: peerConnections, + PeerConnectionManager: peerConnectionManager, + LastPeerRecoveryPlan: &peerRecoveryPlan, + LastPeerConnectionIntent: &peerConnectionIntentPlan, + ProductionObservationSink: productionObservationSink, + }, func() { + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + _ = server.Shutdown(shutdownCtx) + }, nil +} + +func 
productionForwardingLogState(cfg config.Config) (gateEnabled bool, runtimeEnabled bool) { + return cfg.MeshProductionForwardingEnabled, cfg.MeshProductionForwardingEnabled +} + +func productionEnvelopeObservationSinkFromConfig(cfg config.Config) *mesh.ProductionEnvelopeObservationSink { + if cfg.MeshProductionObservationSinkCapacity <= 0 { + return nil + } + sink := mesh.NewProductionEnvelopeObservationSink(cfg.MeshProductionObservationSinkCapacity) + log.Printf("production envelope observation sink enabled: capacity=%d payload_storage=false", sink.Capacity()) + return sink +} + +func logProductionObservationSinkMetrics(meshState *syntheticMeshState) { + if meshState == nil || meshState.ProductionObservationSink == nil { + return + } + metrics := meshState.ProductionObservationSink.Metrics() + if meshState.LastProductionSinkMetrics != nil && productionObservationSinkMetricsEqual(*meshState.LastProductionSinkMetrics, metrics) { + return + } + meshState.LastProductionSinkMetrics = &metrics + log.Printf( + "production envelope observation sink metrics: capacity=%d current_depth=%d accepted_total=%d dropped_oldest=%d payload_storage=false", + metrics.Capacity, + metrics.CurrentDepth, + metrics.AcceptedTotal, + metrics.DroppedOldest, + ) +} + +func productionObservationSinkMetricsEqual(a, b mesh.ProductionEnvelopeObservationSinkMetrics) bool { + return a.Capacity == b.Capacity && + a.CurrentDepth == b.CurrentDepth && + a.AcceptedTotal == b.AcceptedTotal && + a.DroppedOldest == b.DroppedOldest +} + +func loadSyntheticMeshConfig(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client) (loadedSyntheticMeshConfig, error) { + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + if cfg.MeshSyntheticConfigPath != "" { + scoped, err := mesh.LoadScopedSyntheticConfig(cfg.MeshSyntheticConfigPath, local) + if err != nil { + return loadedSyntheticMeshConfig{}, err + } + return loadedSyntheticMeshConfig{ + PeerEndpoints: 
scoped.PeerEndpoints, + PeerEndpointCandidates: scoped.PeerEndpointCandidates, + PeerDirectory: scoped.PeerDirectory, + RecoverySeeds: scoped.RecoverySeeds, + RendezvousLeases: scoped.RendezvousLeases, + RoutePathDecisions: nil, + Routes: scoped.Routes, + Source: "scoped_config", + ConfigVersion: scoped.ConfigVersion, + PeerDirectoryVersion: scoped.PeerDirectoryVersion, + PolicyVersion: scoped.PolicyVersion, + }, nil + } + if api != nil { + remote, err := api.SyntheticMeshConfig(ctx, local.ClusterID, local.NodeID) + if err == nil { + if verifyErr := verifyControlPlaneSyntheticMeshConfig(remote, identity, cfg); verifyErr != nil { + return loadedSyntheticMeshConfig{}, verifyErr + } + } + if err == nil && remote.Enabled { + return loadedSyntheticMeshConfig{ + PeerEndpoints: remote.PeerEndpoints, + PeerEndpointCandidates: peerEndpointCandidatesFromControlPlane(remote.PeerEndpointCandidates), + PeerDirectory: peerDirectoryFromControlPlane(remote.PeerDirectory), + RecoverySeeds: recoverySeedsFromControlPlane(remote.RecoverySeeds), + RendezvousLeases: rendezvousLeasesFromControlPlane(remote.RendezvousLeases), + RoutePathDecisions: remote.RoutePathDecisions, + Routes: syntheticRoutesFromControlPlane(remote.Routes), + Source: "control_plane", + ConfigVersion: remote.ConfigVersion, + PeerDirectoryVersion: remote.PeerDirectoryVersion, + PolicyVersion: remote.PolicyVersion, + }, nil + } + if err != nil { + log.Printf("control-plane synthetic mesh config unavailable, falling back to debug JSON: %v", err) + } + } + peerEndpoints, err := parseMeshPeerEndpoints(cfg.MeshPeerEndpointsJSON) + if err != nil { + return loadedSyntheticMeshConfig{}, err + } + routes, err := parseMeshSyntheticRoutes(cfg.MeshSyntheticRoutesJSON) + if err != nil { + return loadedSyntheticMeshConfig{}, err + } + return loadedSyntheticMeshConfig{ + PeerEndpoints: peerEndpoints, + Routes: routes, + Source: "debug_json", + }, nil +} + +type controlPlaneMeshConfigAuthorityPayload struct { + SchemaVersion string 
`json:"schema_version"` + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + ConfigVersion string `json:"config_version"` + ConfigSHA256 string `json:"config_sha256"` + IssuedAt time.Time `json:"issued_at"` + ExpiresAt time.Time `json:"expires_at"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` +} + +func verifyControlPlaneSyntheticMeshConfig(remote client.SyntheticMeshConfig, identity state.Identity, cfg config.Config) error { + signaturePresent := remote.ClusterAuthority != nil || len(remote.AuthorityPayload) > 0 || remote.AuthoritySignature != nil + if !remote.AuthorityRequired && !signaturePresent { + return nil + } + if remote.ClusterAuthority == nil { + return fmt.Errorf("control-plane synthetic mesh config requires cluster authority") + } + if remote.AuthoritySignature == nil || rawMessageEmpty(remote.AuthorityPayload) { + return fmt.Errorf("control-plane synthetic mesh config requires authority payload and signature") + } + if remote.ClusterAuthority.SchemaVersion != authority.AuthoritySchemaVersion { + return fmt.Errorf("control-plane synthetic mesh config authority schema mismatch") + } + if remote.ClusterAuthority.ClusterID != identity.ClusterID || remote.ClusterAuthority.ClusterID != remote.ClusterID { + return fmt.Errorf("control-plane synthetic mesh config authority cluster mismatch") + } + if remote.ClusterAuthority.KeyAlgorithm != authority.AlgorithmEd25519 { + return fmt.Errorf("control-plane synthetic mesh config authority algorithm mismatch") + } + if remote.AuthoritySignature.KeyFingerprint != remote.ClusterAuthority.PublicKeyFingerprint { + return fmt.Errorf("control-plane synthetic mesh config signature fingerprint mismatch") + } + if pinned := firstNonEmpty(identity.ClusterAuthorityFingerprint, cfg.ClusterAuthorityFingerprint); pinned != "" && pinned != remote.ClusterAuthority.PublicKeyFingerprint { + return fmt.Errorf("control-plane synthetic mesh 
config authority fingerprint mismatch") + } + if pinned := firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey); pinned != "" && pinned != remote.ClusterAuthority.PublicKey { + return fmt.Errorf("control-plane synthetic mesh config authority public key mismatch") + } + signature := authority.Signature{ + SchemaVersion: remote.AuthoritySignature.SchemaVersion, + Algorithm: remote.AuthoritySignature.Algorithm, + KeyFingerprint: remote.AuthoritySignature.KeyFingerprint, + Signature: remote.AuthoritySignature.Signature, + } + if err := authority.VerifyRaw(remote.ClusterAuthority.PublicKey, remote.AuthorityPayload, signature); err != nil { + return fmt.Errorf("verify control-plane synthetic mesh config authority signature: %w", err) + } + var payload controlPlaneMeshConfigAuthorityPayload + if err := json.Unmarshal(remote.AuthorityPayload, &payload); err != nil { + return fmt.Errorf("decode control-plane synthetic mesh config authority payload: %w", err) + } + if payload.SchemaVersion != "rap.cluster.mesh_config_snapshot.v1" { + return fmt.Errorf("control-plane synthetic mesh config authority payload schema mismatch") + } + if payload.ClusterID != identity.ClusterID || payload.ClusterID != remote.ClusterID { + return fmt.Errorf("control-plane synthetic mesh config authority payload cluster mismatch") + } + if payload.LocalNodeID != identity.NodeID || payload.LocalNodeID != remote.LocalNodeID { + return fmt.Errorf("control-plane synthetic mesh config authority payload node mismatch") + } + if payload.ConfigVersion != remote.ConfigVersion { + return fmt.Errorf("control-plane synthetic mesh config authority payload version mismatch") + } + if !payload.ControlPlaneOnly || payload.ProductionForwarding || remote.ProductionForwarding { + return fmt.Errorf("control-plane synthetic mesh config authority payload forbids production forwarding") + } + if !payload.ExpiresAt.IsZero() && !payload.ExpiresAt.After(time.Now().UTC()) { + return 
fmt.Errorf("control-plane synthetic mesh config authority payload expired") + } + configHash, err := syntheticMeshConfigAuthorityHash(remote) + if err != nil { + return err + } + if payload.ConfigSHA256 != configHash { + return fmt.Errorf("control-plane synthetic mesh config authority payload hash mismatch") + } + return nil +} + +func syntheticMeshConfigAuthorityHash(remote client.SyntheticMeshConfig) (string, error) { + unsigned := remote + unsigned.AuthorityPayload = nil + unsigned.AuthoritySignature = nil + raw, err := json.Marshal(unsigned) + if err != nil { + return "", fmt.Errorf("marshal control-plane synthetic mesh config for authority hash: %w", err) + } + hash, err := authority.HashRaw(raw) + if err != nil { + return "", fmt.Errorf("hash control-plane synthetic mesh config authority payload: %w", err) + } + return hash, nil +} + +func rawMessageEmpty(raw json.RawMessage) bool { + value := strings.TrimSpace(string(raw)) + return value == "" || value == "{}" || value == "null" +} + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if strings.TrimSpace(value) != "" { + return strings.TrimSpace(value) + } + } + return "" +} + +type meshRendezvousLeasePosture struct { + RefreshNeeded bool + Reason string + RefreshNeededCount int + RenewalNeededCount int + ExpiredCount int + InvalidCount int + StaleRelayCount int +} + +func refreshRendezvousLeasesIfNeeded(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, observedAt time.Time) error { + if meshState == nil || meshState.PeerCache == nil { + return nil + } + observedAt = observedAt.UTC() + posture := meshRendezvousLeasePostureForState(meshState, identity, observedAt) + if !posture.RefreshNeeded { + return nil + } + if meshState.LastLeaseRefresh != nil && meshState.LastLeaseRefresh.AttemptedAt.Add(meshRendezvousLeaseRefreshBackoff).After(observedAt) { + return nil + } + + refresh := &meshRendezvousLeaseRefreshState{ + 
Status: "attempted", + Reason: posture.Reason, + AttemptedAt: observedAt, + PreviousLeaseCount: len(meshState.RendezvousLeases), + RefreshNeededCount: posture.RefreshNeededCount, + RenewalNeededCount: posture.RenewalNeededCount, + ExpiredCount: posture.ExpiredCount, + StaleRelayCount: posture.StaleRelayCount, + ConfigVersion: meshState.ConfigVersion, + } + meshState.LastLeaseRefresh = refresh + meshState.LeaseRefreshAttempts++ + + if api == nil || meshState.Source != "control_plane" || cfg.MeshSyntheticConfigPath != "" { + refresh.Status = "unsupported" + refresh.Error = "control_plane_synthetic_config_required" + refresh.CompletedAt = observedAt + meshState.LeaseRefreshFailures++ + return nil + } + + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api) + completedAt := time.Now().UTC() + refresh.CompletedAt = completedAt + if err != nil { + refresh.Status = "failed" + refresh.Error = err.Error() + meshState.LeaseRefreshFailures++ + return err + } + applyRefreshedSyntheticMeshConfig(meshState, loadedConfig, local, cfg.MeshRegion, completedAt) + refresh.Status = "succeeded" + refresh.RefreshedLeaseCount = len(loadedConfig.RendezvousLeases) + refresh.ConfigVersion = loadedConfig.ConfigVersion + meshState.LeaseRefreshSuccesses++ + log.Printf( + "mesh rendezvous lease refresh succeeded: reason=%s previous_leases=%d refreshed_leases=%d config_version=%s", + refresh.Reason, + refresh.PreviousLeaseCount, + refresh.RefreshedLeaseCount, + refresh.ConfigVersion, + ) + return nil +} + +func refreshSyntheticMeshConfigIfDue(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, observedAt time.Time) error { + if meshState == nil || meshState.PeerCache == nil { + return nil + } + observedAt = observedAt.UTC() + if !meshState.LastConfigRefreshAt.IsZero() && 
meshState.LastConfigRefreshAt.Add(meshSyntheticConfigRefreshInterval).After(observedAt) { + return nil + } + if api == nil || meshState.Source != "control_plane" || cfg.MeshSyntheticConfigPath != "" { + meshState.LastConfigRefreshAt = observedAt + return nil + } + if identity.NodeID == "" || identity.ClusterID == "" { + return nil + } + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api) + completedAt := time.Now().UTC() + if err != nil { + meshState.LastConfigRefreshAt = observedAt + return err + } + previousVersion := meshState.ConfigVersion + applyRefreshedSyntheticMeshConfig(meshState, loadedConfig, local, cfg.MeshRegion, completedAt) + log.Printf( + "mesh synthetic config refreshed: previous_config_version=%s refreshed_config_version=%s route_health_routes=%d", + previousVersion, + loadedConfig.ConfigVersion, + len(meshState.RouteHealthRoutes), + ) + return nil +} + +func refreshSyntheticMeshConfigForRouteHealthFeedback(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, trigger meshRouteHealthFeedbackTrigger, observedAt time.Time) error { + if meshState == nil || meshState.PeerCache == nil { + return nil + } + observedAt = observedAt.UTC() + if trigger.RouteID == "" { + return nil + } + if meshState.LastRouteHealthRefresh != nil && meshState.LastRouteHealthRefresh.AttemptedAt.Add(meshRouteHealthFeedbackRefreshBackoff).After(observedAt) { + meshState.RouteHealthRefreshSuppressed++ + return nil + } + + refresh := &meshRouteHealthFeedbackRefreshState{ + Status: "attempted", + Reason: trigger.Reason, + AttemptedAt: observedAt, + RouteID: trigger.RouteID, + PeerNodeID: trigger.PeerNodeID, + SelectedRelayID: trigger.SelectedRelayID, + LinkStatus: trigger.LinkStatus, + FailureReason: trigger.FailureReason, + DriftDetected: trigger.DriftDetected, + PreviousConfigVersion: meshState.ConfigVersion, + 
PreviousRouteHealthRouteCount: len(meshState.RouteHealthRoutes), + } + meshState.LastRouteHealthRefresh = refresh + meshState.RouteHealthRefreshAttempts++ + + if api == nil || meshState.Source != "control_plane" || cfg.MeshSyntheticConfigPath != "" { + refresh.Status = "unsupported" + refresh.Error = "control_plane_synthetic_config_required" + refresh.CompletedAt = observedAt + meshState.RouteHealthRefreshFailures++ + return nil + } + if identity.NodeID == "" || identity.ClusterID == "" { + refresh.Status = "unsupported" + refresh.Error = "approved_identity_required" + refresh.CompletedAt = observedAt + meshState.RouteHealthRefreshFailures++ + return nil + } + + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api) + completedAt := time.Now().UTC() + refresh.CompletedAt = completedAt + if err != nil { + refresh.Status = "failed" + refresh.Error = err.Error() + meshState.RouteHealthRefreshFailures++ + return err + } + applyRefreshedSyntheticMeshConfig(meshState, loadedConfig, local, cfg.MeshRegion, completedAt) + refresh.Status = "succeeded" + refresh.RefreshedConfigVersion = loadedConfig.ConfigVersion + refresh.RefreshedRouteHealthRouteCount = len(meshState.RouteHealthRoutes) + meshState.RouteHealthRefreshSuccesses++ + log.Printf( + "mesh route-health feedback refresh succeeded: reason=%s route_id=%s previous_config_version=%s refreshed_config_version=%s route_health_routes=%d", + refresh.Reason, + refresh.RouteID, + refresh.PreviousConfigVersion, + refresh.RefreshedConfigVersion, + refresh.RefreshedRouteHealthRouteCount, + ) + return nil +} + +func applyRefreshedSyntheticMeshConfig(meshState *syntheticMeshState, loadedConfig loadedSyntheticMeshConfig, local mesh.PeerIdentity, preferredRegion string, observedAt time.Time) { + routeHealthRoutes := routeHealthRoutesFromPathDecisions(loadedConfig.Routes, loadedConfig.RoutePathDecisions) + peerCache := 
mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: local, + PeerEndpoints: loadedConfig.PeerEndpoints, + PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates, + PeerDirectory: loadedConfig.PeerDirectory, + RecoverySeeds: loadedConfig.RecoverySeeds, + RendezvousLeases: loadedConfig.RendezvousLeases, + Routes: loadedConfig.Routes, + WarmPeerLimit: mesh.DefaultWarmPeerLimit, + PreferredRegion: preferredRegion, + Now: observedAt, + }) + if meshState.PeerConnections == nil { + meshState.PeerConnections = mesh.NewPeerConnectionTracker(peerCache.Snapshot(), observedAt) + } + peerConnectionSnapshot := meshState.PeerConnections.Snapshot() + peerRecoveryPlan := mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{ + PeerCache: peerCache.Snapshot(), + Connections: peerConnectionSnapshot, + TargetReadyPeers: mesh.DefaultStablePeerTarget, + MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit, + Now: observedAt, + }) + peerConnectionIntentPlan := mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{ + PeerCache: peerCache.Snapshot(), + RecoveryPlan: peerRecoveryPlan, + RendezvousLeases: loadedConfig.RendezvousLeases, + Now: observedAt, + }) + if meshState.PeerConnectionManager == nil { + meshState.PeerConnectionManager = mesh.NewPeerConnectionManager(mesh.PeerConnectionManagerConfig{ + Local: local, + PeerCache: peerCache, + Tracker: meshState.PeerConnections, + RendezvousLeases: loadedConfig.RendezvousLeases, + }) + } else { + meshState.PeerConnectionManager.UpdatePeerConfig(peerCache, loadedConfig.RendezvousLeases) + } + if meshState.Runtime != nil { + meshState.Runtime.UpdateConfig(loadedConfig.Routes, mesh.NewHTTPPeerTransport(loadedConfig.PeerEndpoints)) + meshState.Runtime.UpdateRouteHealthConfig(routeHealthRoutes) + } + if meshState.RouteGenerationTracker == nil { + meshState.RouteGenerationTracker = newMeshRouteGenerationTracker(loadedConfig.RoutePathDecisions, observedAt) + } else { + meshState.RouteGenerationTracker.Apply(loadedConfig.RoutePathDecisions, 
observedAt) + } + meshState.Routes = loadedConfig.Routes + meshState.RouteHealthRoutes = routeHealthRoutes + meshState.Source = loadedConfig.Source + meshState.PeerCache = peerCache + meshState.RendezvousLeases = loadedConfig.RendezvousLeases + meshState.RoutePathDecisions = loadedConfig.RoutePathDecisions + meshState.ConfigVersion = loadedConfig.ConfigVersion + meshState.PeerDirectoryVersion = loadedConfig.PeerDirectoryVersion + meshState.PolicyVersion = loadedConfig.PolicyVersion + meshState.LastConfigRefreshAt = observedAt + meshState.LastPeerRecoveryPlan = &peerRecoveryPlan + meshState.LastPeerConnectionIntent = &peerConnectionIntentPlan +} + +func meshRendezvousLeasePostureForState(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) meshRendezvousLeasePosture { + posture := meshRendezvousLeasePosture{} + if meshState == nil { + return posture + } + connectionByPeer := meshRendezvousConnectionsByPeer(meshState) + for _, lease := range meshState.RendezvousLeases { + valid := meshRendezvousLeaseBaseValid(lease) + expired := valid && !lease.ExpiresAt.After(observedAt) + usable := valid && !expired + renewalNeeded := meshRendezvousLeaseRenewalNeeded(lease, observedAt, usable) + staleRelay := usable && meshRendezvousLeaseStaleRelay(lease, connectionByPeer[lease.PeerNodeID]) + switch { + case !valid: + posture.InvalidCount++ + case expired: + posture.ExpiredCount++ + case renewalNeeded: + posture.RenewalNeededCount++ + case staleRelay: + posture.StaleRelayCount++ + } + if !valid || expired || renewalNeeded || staleRelay { + posture.RefreshNeededCount++ + if posture.Reason == "" { + posture.Reason = meshRendezvousLeaseRefreshReason(valid, expired, renewalNeeded, staleRelay) + } + } + } + posture.RefreshNeeded = posture.RefreshNeededCount > 0 && identity.NodeID != "" + if posture.Reason == "" { + posture.Reason = "none" + } + return posture +} + +func meshRendezvousLeaseRefreshReason(valid bool, expired bool, renewalNeeded bool, staleRelay 
bool) string { + switch { + case !valid: + return "invalid_lease" + case expired: + return "expired_lease" + case staleRelay: + return "stale_relay" + case renewalNeeded: + return "renewal_needed" + default: + return "none" + } +} + +func syntheticRoutesFromControlPlane(routes []client.SyntheticMeshRouteConfig) []mesh.SyntheticRoute { + out := make([]mesh.SyntheticRoute, 0, len(routes)) + for _, route := range routes { + out = append(out, mesh.SyntheticRoute{ + RouteID: route.RouteID, + ClusterID: route.ClusterID, + SourceNodeID: route.SourceNodeID, + DestinationNodeID: route.DestinationNodeID, + Hops: route.Hops, + AllowedChannels: route.AllowedChannels, + ExpiresAt: route.ExpiresAt, + MaxTTL: route.MaxTTL, + MaxHops: route.MaxHops, + RouteVersion: route.RouteVersion, + PolicyVersion: route.PolicyVersion, + PeerDirectoryVersion: route.PeerDirectoryVersion, + }) + } + return out +} + +func routeHealthRoutesFromPathDecisions(routes []mesh.SyntheticRoute, report *client.RoutePathDecisionReport) []mesh.SyntheticRoute { + out := make([]mesh.SyntheticRoute, 0, len(routes)) + routeIndex := map[string]int{} + for _, route := range routes { + if route.RouteID == "" { + continue + } + routeIndex[route.RouteID] = len(out) + out = append(out, cloneSyntheticRoute(route)) + } + if report == nil { + return out + } + for _, decision := range report.Decisions { + if strings.TrimSpace(decision.RouteID) == "" || decision.ProductionForwarding || !decision.ControlPlaneOnly { + continue + } + hops := cleanNodePath(decision.EffectiveHops) + if len(hops) < 2 { + continue + } + route, ok := mesh.SyntheticRoute{}, false + if index, exists := routeIndex[decision.RouteID]; exists { + route = out[index] + ok = true + } + if !ok { + route = mesh.SyntheticRoute{ + RouteID: decision.RouteID, + ClusterID: decision.ClusterID, + AllowedChannels: []string{mesh.SyntheticChannelFabricControl, mesh.SyntheticChannelRouteControl}, + } + routeIndex[decision.RouteID] = len(out) + out = append(out, route) + 
} + route.Hops = hops + route.SourceNodeID = defaultString(decision.SourceNodeID, hops[0]) + route.DestinationNodeID = defaultString(decision.DestinationNodeID, hops[len(hops)-1]) + route.ClusterID = defaultString(decision.ClusterID, route.ClusterID) + if !decision.ExpiresAt.IsZero() { + route.ExpiresAt = decision.ExpiresAt + } + if strings.TrimSpace(decision.Generation) != "" { + route.RouteVersion = strings.TrimSpace(decision.Generation) + } + if route.MaxTTL < len(hops) { + route.MaxTTL = len(hops) + } + if route.MaxHops < len(hops)-1 { + route.MaxHops = len(hops) - 1 + } + out[routeIndex[decision.RouteID]] = route + } + return out +} + +func cloneSyntheticRoute(route mesh.SyntheticRoute) mesh.SyntheticRoute { + route.Hops = append([]string{}, route.Hops...) + route.AllowedChannels = append([]string{}, route.AllowedChannels...) + return route +} + +func cleanNodePath(items []string) []string { + out := make([]string, 0, len(items)) + for _, item := range items { + item = strings.TrimSpace(item) + if item != "" { + out = append(out, item) + } + } + return out +} + +func peerEndpointCandidatesFromControlPlane(candidates map[string][]client.PeerEndpointCandidate) map[string][]mesh.PeerEndpointCandidate { + out := make(map[string][]mesh.PeerEndpointCandidate, len(candidates)) + for nodeID, items := range candidates { + for _, item := range items { + out[nodeID] = append(out[nodeID], mesh.PeerEndpointCandidate{ + EndpointID: item.EndpointID, + NodeID: item.NodeID, + Transport: item.Transport, + Address: item.Address, + AddressFamily: item.AddressFamily, + Reachability: item.Reachability, + NATType: item.NATType, + ConnectivityMode: item.ConnectivityMode, + Region: item.Region, + Priority: item.Priority, + PolicyTags: item.PolicyTags, + LastVerifiedAt: item.LastVerifiedAt, + Metadata: item.Metadata, + }) + } + } + return out +} + +func peerDirectoryFromControlPlane(entries []client.PeerDirectoryEntry) []mesh.PeerDirectoryEntry { + out := 
make([]mesh.PeerDirectoryEntry, 0, len(entries)) + for _, item := range entries { + out = append(out, mesh.PeerDirectoryEntry{ + NodeID: item.NodeID, + RouteIDs: item.RouteIDs, + EndpointCount: item.EndpointCount, + CandidateCount: item.CandidateCount, + ConnectivityModes: item.ConnectivityModes, + RecoverySeed: item.RecoverySeed, + }) + } + return out +} + +func recoverySeedsFromControlPlane(seeds []client.PeerRecoverySeed) []mesh.PeerRecoverySeed { + out := make([]mesh.PeerRecoverySeed, 0, len(seeds)) + for _, item := range seeds { + out = append(out, mesh.PeerRecoverySeed{ + NodeID: item.NodeID, + Endpoint: item.Endpoint, + Transport: item.Transport, + ConnectivityMode: item.ConnectivityMode, + Region: item.Region, + Priority: item.Priority, + LastVerifiedAt: item.LastVerifiedAt, + Metadata: item.Metadata, + }) + } + return out +} + +func rendezvousLeasesFromControlPlane(leases []client.PeerRendezvousLease) []mesh.PeerRendezvousLease { + out := make([]mesh.PeerRendezvousLease, 0, len(leases)) + for _, item := range leases { + out = append(out, mesh.PeerRendezvousLease{ + LeaseID: item.LeaseID, + PeerNodeID: item.PeerNodeID, + RelayNodeID: item.RelayNodeID, + RelayEndpoint: item.RelayEndpoint, + Transport: item.Transport, + ConnectivityMode: item.ConnectivityMode, + RouteIDs: item.RouteIDs, + AllowedChannels: item.AllowedChannels, + Priority: item.Priority, + ControlPlaneOnly: item.ControlPlaneOnly, + IssuedAt: item.IssuedAt, + ExpiresAt: item.ExpiresAt, + Reason: item.Reason, + Metadata: item.Metadata, + }) + } + return out +} + +func parseMeshPeerEndpoints(raw string) (map[string]string, error) { + if raw == "" { + return map[string]string{}, nil + } + var peerEndpoints map[string]string + if err := json.Unmarshal([]byte(raw), &peerEndpoints); err != nil { + return nil, fmt.Errorf("parse synthetic mesh peer endpoints: %w", err) + } + return peerEndpoints, nil +} + +func parseMeshSyntheticRoutes(raw string) ([]mesh.SyntheticRoute, error) { + if raw == "" { + 
return nil, nil + } + var routes []mesh.SyntheticRoute + if err := json.Unmarshal([]byte(raw), &routes); err != nil { + return nil, fmt.Errorf("parse synthetic mesh routes: %w", err) + } + return routes, nil +} + +func reportSyntheticRouteHealth(ctx context.Context, cfg config.Config, api *client.Client, identity state.Identity, meshState *syntheticMeshState) error { + if meshState == nil || meshState.Runtime == nil || api == nil { + return nil + } + routes := meshState.RouteHealthRoutes + if len(routes) == 0 { + routes = meshState.Routes + } + decisionsByRoute := routePathDecisionsByRoute(meshState.RoutePathDecisions) + var refreshTrigger *meshRouteHealthFeedbackTrigger + for _, route := range routes { + if route.SourceNodeID != identity.NodeID { + continue + } + decision, decisionApplied := decisionsByRoute[route.RouteID] + probeCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + result, err := meshState.Runtime.SendRouteHealthProbe(probeCtx, route.RouteID, mesh.SyntheticChannelFabricControl, "route-health-"+route.RouteID) + cancel() + if err != nil { + metadata := routeHealthObservationMetadata(meshState, route, decision, decisionApplied, nil) + metadata["failure_reason"] = err.Error() + if reportErr := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{ + SourceNodeID: identity.NodeID, + TargetNodeID: route.DestinationNodeID, + LinkStatus: "unreachable", + Metadata: metadata, + }); reportErr != nil { + return reportErr + } + if trigger, ok := routeHealthFeedbackTriggerFromObservation(route, decision, decisionApplied, "unreachable", metadata, time.Now().UTC()); ok && refreshTrigger == nil { + refreshTrigger = &trigger + } + continue + } + latency := int(result.Observation.LastLatencyMs) + qualityScore := syntheticQualityScore(latency) + ackPath := routeHealthAckPath(result.Ack) + metadata := routeHealthObservationMetadata(meshState, route, decision, decisionApplied, ackPath) + metadata["selected_route_id"] = result.SelectedRouteID 
+ metadata["fallback_used"] = result.FallbackUsed + metadata["route_version"] = result.Observation.RouteVersion + metadata["policy_version"] = result.Observation.PolicyVersion + metadata["peer_directory_version"] = result.Observation.PeerDirectoryVersion + metadata["synthetic_message_type"] = result.Ack.MessageType + if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{ + SourceNodeID: identity.NodeID, + TargetNodeID: route.DestinationNodeID, + LinkStatus: "reachable", + LatencyMs: &latency, + QualityScore: &qualityScore, + Metadata: metadata, + }); err != nil { + return err + } + if trigger, ok := routeHealthFeedbackTriggerFromObservation(route, decision, decisionApplied, "reachable", metadata, time.Now().UTC()); ok && refreshTrigger == nil { + refreshTrigger = &trigger + } + } + if refreshTrigger != nil { + return refreshSyntheticMeshConfigForRouteHealthFeedback(ctx, cfg, identity, api, meshState, *refreshTrigger, time.Now().UTC()) + } + return nil +} + +func routePathDecisionsByRoute(report *client.RoutePathDecisionReport) map[string]client.RoutePathDecision { + out := map[string]client.RoutePathDecision{} + if report == nil { + return out + } + for _, decision := range report.Decisions { + if strings.TrimSpace(decision.RouteID) == "" { + continue + } + previous, exists := out[decision.RouteID] + if !exists || (previous.DecisionSource != "stale_relay_replacement" && decision.DecisionSource == "stale_relay_replacement") { + out[decision.RouteID] = decision + } + } + return out +} + +func routeHealthObservationMetadata(meshState *syntheticMeshState, route mesh.SyntheticRoute, decision client.RoutePathDecision, decisionApplied bool, ackPath []string) map[string]any { + driftDetected := false + if len(ackPath) > 0 { + driftDetected = !sameStringSlice(ackPath, route.Hops) + } + metadata := map[string]any{ + "stage": "c17z20", + "traffic_forwarding": false, + "production_forwarding": false, + "production_payload_forwarding": false, + 
"service_workload_traffic": false, + "observation_type": "synthetic_route_health", + "route_id": route.RouteID, + "config_source": meshState.Source, + "route_health_route_config_contract": "control_plane_route_path_decisions_to_synthetic_route_health", + "route_health_only": true, + "synthetic_route_health_route_path_runtime": true, + "production_route_path_forwarding_runtime": false, + "route_path_decision_applied": decisionApplied, + "expected_effective_hops": append([]string{}, route.Hops...), + "observed_ack_path": append([]string{}, ackPath...), + "route_path_drift_detected": driftDetected, + "control_plane_only": true, + "route_health_service_payload_forwarding": false, + "route_health_production_payload_forwarding": false, + } + if decisionApplied { + metadata["route_path_decision_id"] = decision.DecisionID + metadata["route_path_decision_generation"] = decision.Generation + metadata["route_path_decision_source"] = decision.DecisionSource + metadata["route_path_decision_next_hop_id"] = decision.NextHopID + metadata["route_path_decision_selected_relay_id"] = decision.SelectedRelayID + metadata["route_path_decision_stale_relay_node_id"] = decision.StaleRelayNodeID + metadata["route_path_decision_rendezvous_peer_node_id"] = decision.RendezvousPeerNodeID + metadata["route_path_decision_rendezvous_lease_id"] = decision.RendezvousLeaseID + metadata["route_path_decision_rendezvous_lease_reason"] = decision.RendezvousLeaseReason + metadata["route_path_decision_effective_hops"] = append([]string{}, decision.EffectiveHops...) + metadata["route_path_decision_original_hops"] = append([]string{}, decision.OriginalHops...) 
+ } + return metadata +} + +func routeHealthFeedbackTriggerFromObservation(route mesh.SyntheticRoute, decision client.RoutePathDecision, decisionApplied bool, linkStatus string, metadata map[string]any, observedAt time.Time) (meshRouteHealthFeedbackTrigger, bool) { + if strings.TrimSpace(route.RouteID) == "" { + return meshRouteHealthFeedbackTrigger{}, false + } + linkStatus = strings.TrimSpace(linkStatus) + failureReason, _ := metadata["failure_reason"].(string) + driftDetected, _ := metadata["route_path_drift_detected"].(bool) + reason := "" + switch { + case strings.TrimSpace(failureReason) != "": + reason = "synthetic_route_health_failure" + case linkStatus != "" && linkStatus != "reachable": + reason = "synthetic_route_health_unreachable" + case driftDetected: + reason = "synthetic_route_health_drift" + default: + return meshRouteHealthFeedbackTrigger{}, false + } + trigger := meshRouteHealthFeedbackTrigger{ + Reason: reason, + RouteID: route.RouteID, + PeerNodeID: route.DestinationNodeID, + LinkStatus: linkStatus, + FailureReason: failureReason, + DriftDetected: driftDetected, + ObservedAt: observedAt.UTC(), + } + if decisionApplied { + if decision.RendezvousPeerNodeID != "" { + trigger.PeerNodeID = decision.RendezvousPeerNodeID + } + trigger.SelectedRelayID = decision.SelectedRelayID + } + return trigger, true +} + +func routeHealthAckPath(ack mesh.SyntheticEnvelope) []string { + if len(ack.Payload) == 0 { + return nil + } + var payload mesh.SyntheticProbeAckPayload + if err := json.Unmarshal(ack.Payload, &payload); err != nil { + return nil + } + return append([]string{}, payload.Path...) 
+} + +func probeWarmPeerHealth(ctx context.Context, api *client.Client, identity state.Identity, meshState *syntheticMeshState) error { + if meshState == nil || meshState.PeerCache == nil { + return nil + } + if meshState.PeerConnectionManager != nil { + cycle := meshState.PeerConnectionManager.ProbeOnce(ctx) + meshState.LastPeerRecoveryPlan = &cycle.RecoveryPlan + meshState.LastPeerConnectionIntent = &cycle.IntentPlan + for _, result := range cycle.Results { + metadata := map[string]any{ + "stage": "c17z20", + "traffic_forwarding": false, + "observation_type": "peer_connection_manager", + "config_source": meshState.Source, + "manager_mode": cycle.Mode, + "manager_attempted": cycle.Attempted, + "manager_succeeded": cycle.Succeeded, + "manager_failed": cycle.Failed, + "manager_deferred": cycle.Deferred, + "manager_rendezvous_required": cycle.RendezvousRequiredCount, + "manager_rendezvous_resolved": cycle.RendezvousResolvedCount, + "manager_relay_control": cycle.RelayControlCount, + "connection_intent_action": result.Action, + "connection_intent_reason": result.Reason, + "transport_mode": result.TransportMode, + "requires_rendezvous": result.RequiresRendezvous, + "rendezvous_resolved": result.RendezvousResolved, + "direct_candidate": result.DirectCandidate, + "relay_candidate": result.RelayCandidate, + "rendezvous_lease_id": result.RendezvousLeaseID, + "relay_node_id": result.RelayNodeID, + "relay_endpoint": result.RelayEndpoint, + "connection_state": result.ConnectionState.State, + "consecutive_successes": result.ConnectionState.ConsecutiveSuccesses, + "consecutive_failures": result.ConnectionState.ConsecutiveFailures, + "backoff_until": result.ConnectionState.BackoffUntil, + "service_workload_traffic": false, + "persistent_connection_manager": true, + "persistent_connection_kind": "http_keepalive_control_health_or_relay_control_health", + } + if result.FailureReason != "" { + metadata["failure_reason"] = result.FailureReason + } + var latency *int + var 
qualityScore *int + if result.LinkStatus == mesh.PeerConnectionProbeReachable { + latency = &result.LatencyMs + score := syntheticQualityScore(result.LatencyMs) + qualityScore = &score + } + if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{ + SourceNodeID: identity.NodeID, + TargetNodeID: result.NodeID, + LinkStatus: result.LinkStatus, + LatencyMs: latency, + QualityScore: qualityScore, + Metadata: metadata, + }); err != nil { + return err + } + } + return nil + } + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + plan := peerRecoveryPlan(meshState, time.Now().UTC()) + meshState.LastPeerRecoveryPlan = &plan + intentPlan := peerConnectionIntentPlan(meshState, plan, time.Now().UTC()) + meshState.LastPeerConnectionIntent = &intentPlan + intentsByNode := peerConnectionIntentsByNode(intentPlan) + for _, candidate := range plan.Candidates { + if strings.TrimSpace(candidate.Endpoint) == "" { + continue + } + intent := intentsByNode[candidate.NodeID] + now := time.Now().UTC() + if meshState.PeerConnections != nil && !meshState.PeerConnections.ShouldProbe(candidate.NodeID, now) { + continue + } + entry := mesh.PeerCacheEntry{ + NodeID: candidate.NodeID, + Endpoint: candidate.Endpoint, + Warm: candidate.Warm, + WarmReason: candidate.WarmReason, + RecoverySeed: candidate.RecoverySeed, + BestCandidateID: candidate.BestCandidateID, + BestTransport: candidate.BestTransport, + } + if meshState.PeerConnections != nil { + meshState.PeerConnections.BeginProbe(entry, now) + } + startedAt := time.Now() + probeCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + _, err := mesh.NewClient(strings.TrimRight(entry.Endpoint, "/")).SendHealth(probeCtx, mesh.NewHealthMessage(local, mesh.PeerIdentity{ + ClusterID: identity.ClusterID, + NodeID: candidate.NodeID, + })) + cancel() + if err != nil { + connectionState := mesh.PeerConnectionState{} + if meshState.PeerConnections != nil { + connectionState = 
meshState.PeerConnections.RecordFailure(candidate.NodeID, err.Error(), time.Now().UTC()) + } + if reportErr := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{ + SourceNodeID: identity.NodeID, + TargetNodeID: candidate.NodeID, + LinkStatus: "unreachable", + Metadata: map[string]any{ + "stage": "c17z10", + "traffic_forwarding": false, + "observation_type": "warm_peer_health", + "config_source": meshState.Source, + "warm_reason": entry.WarmReason, + "best_candidate_id": entry.BestCandidateID, + "best_transport": entry.BestTransport, + "recovery_seed": entry.RecoverySeed, + "recovery_plan_mode": plan.Mode, + "recovery_probe_reason": candidate.Reason, + "recovery_target_ready": plan.TargetReadyPeers, + "recovery_ready_peers": plan.ReadyPeerCount, + "recovery_deficit": plan.Deficit, + "connection_intent_action": intent.Action, + "transport_mode": intent.TransportMode, + "requires_rendezvous": intent.RequiresRendezvous, + "direct_candidate": intent.DirectCandidate, + "connection_state": connectionState.State, + "consecutive_failures": connectionState.ConsecutiveFailures, + "backoff_until": connectionState.BackoffUntil, + "failure_reason": err.Error(), + "service_workload_traffic": false, + }, + }); reportErr != nil { + return reportErr + } + continue + } + latency := int(time.Since(startedAt).Milliseconds()) + qualityScore := syntheticQualityScore(latency) + connectionState := mesh.PeerConnectionState{} + if meshState.PeerConnections != nil { + connectionState = meshState.PeerConnections.RecordSuccess(candidate.NodeID, latency, time.Now().UTC()) + } + if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{ + SourceNodeID: identity.NodeID, + TargetNodeID: candidate.NodeID, + LinkStatus: "reachable", + LatencyMs: &latency, + QualityScore: &qualityScore, + Metadata: map[string]any{ + "stage": "c17z10", + "traffic_forwarding": false, + "observation_type": "warm_peer_health", + "config_source": meshState.Source, + 
"warm_reason": entry.WarmReason, + "best_candidate_id": entry.BestCandidateID, + "best_transport": entry.BestTransport, + "recovery_seed": entry.RecoverySeed, + "recovery_plan_mode": plan.Mode, + "recovery_probe_reason": candidate.Reason, + "recovery_target_ready": plan.TargetReadyPeers, + "recovery_ready_peers": plan.ReadyPeerCount, + "recovery_deficit": plan.Deficit, + "connection_intent_action": intent.Action, + "transport_mode": intent.TransportMode, + "requires_rendezvous": intent.RequiresRendezvous, + "direct_candidate": intent.DirectCandidate, + "connection_state": connectionState.State, + "consecutive_successes": connectionState.ConsecutiveSuccesses, + "service_workload_traffic": false, + }, + }); err != nil { + return err + } + } + return nil +} + +func peerRecoveryPlan(meshState *syntheticMeshState, now time.Time) mesh.PeerRecoveryPlan { + if meshState == nil || meshState.PeerCache == nil { + return mesh.PeerRecoveryPlan{} + } + var connections mesh.PeerConnectionSnapshot + if meshState.PeerConnections != nil { + connections = meshState.PeerConnections.Snapshot() + } + return mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{ + PeerCache: meshState.PeerCache.Snapshot(), + Connections: connections, + TargetReadyPeers: mesh.DefaultStablePeerTarget, + MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit, + Now: now, + }) +} + +func peerConnectionIntentPlan(meshState *syntheticMeshState, recoveryPlan mesh.PeerRecoveryPlan, now time.Time) mesh.PeerConnectionIntentPlan { + if meshState == nil || meshState.PeerCache == nil { + return mesh.PeerConnectionIntentPlan{} + } + return mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{ + PeerCache: meshState.PeerCache.Snapshot(), + RecoveryPlan: recoveryPlan, + RendezvousLeases: meshState.RendezvousLeases, + Now: now, + }) +} + +func peerConnectionIntentsByNode(plan mesh.PeerConnectionIntentPlan) map[string]mesh.PeerConnectionIntent { + out := map[string]mesh.PeerConnectionIntent{} + for _, intent := 
range plan.Intents { + if strings.TrimSpace(intent.NodeID) != "" { + out[intent.NodeID] = intent + } + } + return out +} + +func syntheticQualityScore(latencyMs int) int { + switch { + case latencyMs <= 10: + return 100 + case latencyMs >= 1000: + return 1 + default: + score := 100 - latencyMs/10 + if score < 1 { + return 1 + } + return score + } +} + +func sendHeartbeat(ctx context.Context, api *client.Client, cfg config.Config, identity state.Identity, meshState *syntheticMeshState) (client.EffectiveTestingFlags, error) { + if identity.NodeID == "" || identity.ClusterID == "" { + return client.EffectiveTestingFlags{}, fmt.Errorf("node identity is not approved") + } + response, err := api.Heartbeat(ctx, identity.ClusterID, identity.NodeID, heartbeatPayload(cfg, identity, meshState, time.Now().UTC())) + if err == nil { + log.Printf("heartbeat sent: node_id=%s cluster_id=%s", identity.NodeID, identity.ClusterID) + } + return response.TestingFlags, err +} + +func heartbeatPayload(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) client.HeartbeatRequest { + payload := agent.HeartbeatPayload() + candidates, err := advertisedEndpointCandidates(cfg, identity, observedAt) + if err != nil { + log.Printf("mesh endpoint report skipped: %v", err) + return payload + } + if len(candidates) == 0 && (meshState == nil || meshState.PeerCache == nil) { + return payload + } + if payload.Metadata == nil { + payload.Metadata = map[string]any{} + } + if payload.Capabilities == nil { + payload.Capabilities = map[string]any{} + } + payload.Metadata["stage"] = "c17z20" + if len(candidates) > 0 { + payload.Metadata["mesh_endpoint_report"] = meshEndpointReport(cfg, identity, meshState, observedAt, candidates) + payload.Capabilities["mesh_dynamic_endpoint_reporting"] = true + } + if meshState != nil && meshState.PeerCache != nil { + payload.Metadata["mesh_peer_recovery_report"] = meshPeerRecoveryReport(meshState, observedAt) + 
payload.Metadata["mesh_peer_connection_intent_report"] = meshPeerConnectionIntentReport(meshState, observedAt) + payload.Metadata["mesh_peer_connection_manager_report"] = meshPeerConnectionManagerReport(meshState, observedAt) + payload.Metadata["mesh_rendezvous_lease_report"] = meshRendezvousLeaseReport(meshState, identity, observedAt) + payload.Metadata["mesh_route_path_decision_report"] = meshRoutePathDecisionReport(meshState, identity, observedAt) + payload.Metadata["mesh_route_generation_report"] = meshRouteGenerationReport(meshState, identity, observedAt) + payload.Metadata["mesh_route_health_config_report"] = meshRouteHealthConfigReport(meshState, identity, observedAt) + payload.Metadata["mesh_route_health_feedback_refresh_report"] = meshRouteHealthFeedbackRefreshReport(meshState, identity, observedAt) + payload.Capabilities["mesh_peer_recovery_planning"] = true + payload.Capabilities["mesh_peer_connection_intent_planning"] = true + payload.Capabilities["mesh_peer_connection_manager"] = true + payload.Capabilities["mesh_rendezvous_relay_control_contract"] = true + payload.Capabilities[meshRendezvousLeaseTelemetryCapability] = true + payload.Capabilities[meshRendezvousLeaseRefreshCapability] = true + payload.Capabilities[meshRendezvousRelayReplacementCapability] = true + payload.Capabilities[meshRoutePathDecisionCapability] = true + payload.Capabilities[meshRouteGenerationTrackerCapability] = true + payload.Capabilities[meshRouteHealthConfigCapability] = true + payload.Capabilities[meshRouteHealthFeedbackRefreshCapability] = true + } + return payload +} + +func meshEndpointReport(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time, candidates []mesh.PeerEndpointCandidate) map[string]any { + transport := cfg.MeshAdvertiseTransport + if transport == "" { + transport = "direct_tcp_tls" + } + connectivityMode := cfg.MeshConnectivityMode + if connectivityMode == "" { + connectivityMode = "direct" + } + natType := 
cfg.MeshNATType + if natType == "" { + natType = "unknown" + } + report := map[string]any{ + "schema_version": "c17z6.mesh_endpoint_report.v1", + "cluster_id": identity.ClusterID, + "node_id": identity.NodeID, + "peer_endpoint": candidates[0].Address, + "transport": transport, + "connectivity_mode": connectivityMode, + "nat_type": natType, + "observed_at": observedAt.UTC().Format(time.RFC3339Nano), + "endpoint_candidates": candidates, + } + if meshState != nil && meshState.PeerCache != nil { + snapshot := meshState.PeerCache.Snapshot() + report["peer_cache_peers"] = snapshot.PeerCount + report["warm_peers"] = snapshot.WarmPeerCount + report["recovery_seeds"] = snapshot.RecoverySeedCount + report["rendezvous_leases"] = snapshot.RendezvousLeaseCount + } + if meshState != nil && meshState.PeerConnections != nil { + snapshot := meshState.PeerConnections.Snapshot() + report["peer_connection_total"] = snapshot.Total + report["peer_connection_ready"] = snapshot.Ready + report["peer_connection_relay_ready"] = snapshot.RelayReady + report["peer_connection_degraded"] = snapshot.Degraded + report["peer_connection_backoff"] = snapshot.Backoff + report["peer_connection_waiting_rendezvous"] = snapshot.Waiting + report["peer_connection_connecting"] = snapshot.Connecting + report["peer_connection_disconnected"] = snapshot.Disconnected + } + if meshState != nil && meshState.PeerCache != nil { + plan := peerRecoveryPlan(meshState, observedAt) + meshState.LastPeerRecoveryPlan = &plan + intentPlan := peerConnectionIntentPlan(meshState, plan, observedAt) + meshState.LastPeerConnectionIntent = &intentPlan + report["peer_recovery_mode"] = plan.Mode + report["peer_recovery_healthy"] = plan.Healthy + report["peer_recovery_target_ready"] = plan.TargetReadyPeers + report["peer_recovery_ready"] = plan.ReadyPeerCount + report["peer_recovery_deficit"] = plan.Deficit + report["peer_recovery_probe_candidates"] = plan.ProbeCandidateCount + report["peer_recovery_seed_candidates"] = 
plan.RecoverySeedCandidateCount + report["peer_connection_intents"] = intentPlan.IntentCount + report["peer_connection_intent_direct"] = intentPlan.DirectCount + report["peer_connection_intent_private_lan"] = intentPlan.PrivateLANCount + report["peer_connection_intent_corp_lan"] = intentPlan.CorporateLANCount + report["peer_connection_intent_outbound_only"] = intentPlan.OutboundOnlyCount + report["peer_connection_intent_relay_required"] = intentPlan.RelayRequiredCount + report["peer_connection_intent_relay_control"] = intentPlan.RelayControlCount + report["peer_connection_intent_rendezvous_required"] = intentPlan.RendezvousRequiredCount + report["peer_connection_intent_rendezvous_resolved"] = intentPlan.RendezvousResolvedCount + report["rendezvous_lease_count"] = intentPlan.RendezvousLeaseCount + } + if cfg.MeshRegion != "" { + report["region"] = cfg.MeshRegion + } + return report +} + +func meshPeerRecoveryReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any { + plan := peerRecoveryPlan(meshState, observedAt) + meshState.LastPeerRecoveryPlan = &plan + intentPlan := peerConnectionIntentPlan(meshState, plan, observedAt) + meshState.LastPeerConnectionIntent = &intentPlan + report := map[string]any{ + "schema_version": "c17z9.mesh_peer_recovery_report.v1", + "mode": plan.Mode, + "healthy": plan.Healthy, + "target_ready_peers": plan.TargetReadyPeers, + "ready_peer_count": plan.ReadyPeerCount, + "degraded_peer_count": plan.DegradedPeerCount, + "backoff_peer_count": plan.BackoffPeerCount, + "connectable_peer_count": plan.ConnectablePeerCount, + "deficit": plan.Deficit, + "probe_candidate_count": plan.ProbeCandidateCount, + "recovery_seed_candidate_count": plan.RecoverySeedCandidateCount, + "service_workload_traffic": false, + "production_payload_forwarding": false, + "persistent_connection_transport": false, + "observed_at": observedAt.UTC().Format(time.RFC3339Nano), + "connection_intent_count": intentPlan.IntentCount, + "rendezvous_required_count": 
intentPlan.RendezvousRequiredCount, + "rendezvous_resolved_count": intentPlan.RendezvousResolvedCount, + "rendezvous_lease_count": intentPlan.RendezvousLeaseCount, + "relay_control_count": intentPlan.RelayControlCount, + } + if meshState != nil && meshState.PeerConnections != nil { + snapshot := meshState.PeerConnections.Snapshot() + report["peer_connection_total"] = snapshot.Total + report["peer_connection_ready"] = snapshot.Ready + report["peer_connection_relay_ready"] = snapshot.RelayReady + report["peer_connection_degraded"] = snapshot.Degraded + report["peer_connection_backoff"] = snapshot.Backoff + report["peer_connection_waiting_rendezvous"] = snapshot.Waiting + report["peer_connection_connecting"] = snapshot.Connecting + report["peer_connection_disconnected"] = snapshot.Disconnected + } + return report +} + +func meshPeerConnectionIntentReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any { + recoveryPlan := peerRecoveryPlan(meshState, observedAt) + meshState.LastPeerRecoveryPlan = &recoveryPlan + intentPlan := peerConnectionIntentPlan(meshState, recoveryPlan, observedAt) + meshState.LastPeerConnectionIntent = &intentPlan + return map[string]any{ + "schema_version": "c17z12.mesh_peer_connection_intent_report.v1", + "mode": intentPlan.Mode, + "intent_count": intentPlan.IntentCount, + "maintain_count": intentPlan.MaintainCount, + "probe_count": intentPlan.ProbeCount, + "recover_count": intentPlan.RecoverCount, + "direct_count": intentPlan.DirectCount, + "private_lan_count": intentPlan.PrivateLANCount, + "corporate_lan_count": intentPlan.CorporateLANCount, + "outbound_only_count": intentPlan.OutboundOnlyCount, + "relay_required_count": intentPlan.RelayRequiredCount, + "relay_control_count": intentPlan.RelayControlCount, + "rendezvous_required_count": intentPlan.RendezvousRequiredCount, + "rendezvous_resolved_count": intentPlan.RendezvousResolvedCount, + "rendezvous_lease_count": intentPlan.RendezvousLeaseCount, + 
"service_workload_traffic": false, + "production_payload_forwarding": false, + "persistent_connection_transport": false, + "observed_at": observedAt.UTC().Format(time.RFC3339Nano), + } +} + +func meshPeerConnectionManagerReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any { + report := map[string]any{ + "schema_version": "c17z12.mesh_peer_connection_manager_report.v1", + "service_workload_traffic": false, + "production_payload_forwarding": false, + "persistent_connection_transport": true, + "persistent_connection_kind": "http_keepalive_control_health_or_relay_control_health", + "observed_at": observedAt.UTC().Format(time.RFC3339Nano), + } + if meshState == nil || meshState.PeerConnectionManager == nil { + report["enabled"] = false + return report + } + report["enabled"] = true + snapshot := meshState.PeerConnectionManager.Snapshot() + cycle := snapshot.LastCycle + report["mode"] = cycle.Mode + report["intent_count"] = cycle.IntentCount + report["attempted"] = cycle.Attempted + report["succeeded"] = cycle.Succeeded + report["failed"] = cycle.Failed + report["deferred"] = cycle.Deferred + report["skipped"] = cycle.Skipped + report["rendezvous_required_count"] = cycle.RendezvousRequiredCount + report["rendezvous_resolved_count"] = cycle.RendezvousResolvedCount + report["relay_control_count"] = cycle.RelayControlCount + report["last_started_at"] = cycle.StartedAt + report["last_completed_at"] = cycle.CompletedAt + if meshState.PeerConnections != nil { + connectionSnapshot := meshState.PeerConnections.Snapshot() + report["peer_connection_ready"] = connectionSnapshot.Ready + report["peer_connection_relay_ready"] = connectionSnapshot.RelayReady + report["peer_connection_degraded"] = connectionSnapshot.Degraded + report["peer_connection_backoff"] = connectionSnapshot.Backoff + report["peer_connection_waiting_rendezvous"] = connectionSnapshot.Waiting + } + return report +} + +func meshRendezvousLeaseReport(meshState *syntheticMeshState, identity 
state.Identity, observedAt time.Time) map[string]any { + observedAt = observedAt.UTC() + posture := meshRendezvousLeasePostureForState(meshState, identity, observedAt) + report := map[string]any{ + "schema_version": meshRendezvousLeaseReportSchema, + "cluster_id": identity.ClusterID, + "node_id": identity.NodeID, + "lease_count": len(meshState.RendezvousLeases), + "config_source": meshState.Source, + "config_version": meshState.ConfigVersion, + "peer_directory_version": meshState.PeerDirectoryVersion, + "policy_version": meshState.PolicyVersion, + "renewal_window_ms": int64(meshRendezvousLeaseRenewalWindow / time.Millisecond), + "refresh_backoff_ms": int64(meshRendezvousLeaseRefreshBackoff / time.Millisecond), + "refresh_contract": "node_scoped_synthetic_config_get", + "refresh_supported": meshState.Source == "control_plane", + "control_plane_only": true, + "relay_payload_forwarding": false, + "service_workload_traffic": false, + "production_payload_forwarding": false, + "persistent_connection_transport": true, + "observed_at": observedAt.Format(time.RFC3339Nano), + } + connectionByPeer := map[string]mesh.PeerConnectionState{} + if meshState != nil && meshState.PeerConnections != nil { + snapshot := meshState.PeerConnections.Snapshot() + report["peer_connection_total"] = snapshot.Total + report["peer_connection_ready"] = snapshot.Ready + report["peer_connection_relay_ready"] = snapshot.RelayReady + report["peer_connection_degraded"] = snapshot.Degraded + report["peer_connection_backoff"] = snapshot.Backoff + report["peer_connection_waiting_rendezvous"] = snapshot.Waiting + report["peer_connection_connecting"] = snapshot.Connecting + report["peer_connection_disconnected"] = snapshot.Disconnected + for _, entry := range snapshot.Entries { + connectionByPeer[entry.NodeID] = entry + } + } + if meshState == nil { + report["leases"] = []map[string]any{} + return report + } + + leaseDetails := make([]map[string]any, 0, minInt(len(meshState.RendezvousLeases), 
maxMeshRendezvousLeaseReportEntries)) + activeCount := 0 + usableCount := 0 + controlPlaneOnlyCount := 0 + invalidCount := 0 + expiredCount := 0 + expiringSoonCount := 0 + renewalNeededCount := 0 + admittedAsRelayCount := 0 + admittedAsPeerCount := 0 + entryObserverCount := 0 + relayControlReadyCount := 0 + staleRelayCount := 0 + withdrawalNeededCount := 0 + reselectionNeededCount := 0 + for index, lease := range meshState.RendezvousLeases { + role := meshRendezvousLeaseRole(lease, identity.NodeID) + valid := meshRendezvousLeaseBaseValid(lease) + expired := valid && !lease.ExpiresAt.After(observedAt) + usable := valid && !expired + if lease.ControlPlaneOnly { + controlPlaneOnlyCount++ + } + if !valid { + invalidCount++ + } + if expired { + expiredCount++ + } + if usable { + activeCount++ + usableCount++ + switch role { + case "relay": + admittedAsRelayCount++ + case "peer": + admittedAsPeerCount++ + default: + entryObserverCount++ + } + } + ttlRemaining := lease.ExpiresAt.Sub(observedAt) + renewalAfter := meshRendezvousLeaseRenewalAfter(lease) + expiringSoon := usable && ttlRemaining <= meshRendezvousLeaseRenewalWindow + renewalNeeded := meshRendezvousLeaseRenewalNeeded(lease, observedAt, usable) + if expiringSoon { + expiringSoonCount++ + } + if renewalNeeded { + renewalNeededCount++ + } + connectionState := connectionByPeer[lease.PeerNodeID] + staleRelay := usable && meshRendezvousLeaseStaleRelay(lease, connectionState) + withdrawalNeeded := staleRelay && role == "relay" + reselectionNeeded := staleRelay && role != "relay" + if staleRelay { + staleRelayCount++ + } + if withdrawalNeeded { + withdrawalNeededCount++ + } + if reselectionNeeded { + reselectionNeededCount++ + } + relayReady := usable && connectionState.State == mesh.PeerConnectionRelayReady + if relayReady { + relayControlReadyCount++ + } + if index < maxMeshRendezvousLeaseReportEntries { + leaseDetails = append(leaseDetails, map[string]any{ + "lease_id": lease.LeaseID, + "peer_node_id": 
lease.PeerNodeID, + "relay_node_id": lease.RelayNodeID, + "relay_endpoint": strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/"), + "transport": defaultString(lease.Transport, "relay_control"), + "connectivity_mode": defaultString(lease.ConnectivityMode, "relay_required"), + "route_ids": append([]string{}, lease.RouteIDs...), + "allowed_channels": append([]string{}, lease.AllowedChannels...), + "priority": lease.Priority, + "role": role, + "status": meshRendezvousLeaseStatus(valid, expired, renewalNeeded, role), + "usable": usable, + "admitted": usable && role == "relay", + "renewal_needed": renewalNeeded, + "expiring_soon": expiringSoon, + "stale_relay": staleRelay, + "withdrawal_needed": withdrawalNeeded, + "reselection_needed": reselectionNeeded, + "relay_ready": relayReady, + "connection_state": connectionState.State, + "ttl_remaining_ms": int64(ttlRemaining / time.Millisecond), + "issued_at": formatOptionalTime(lease.IssuedAt), + "expires_at": formatOptionalTime(lease.ExpiresAt), + "renewal_after": formatOptionalTime(renewalAfter), + "reason": lease.Reason, + }) + } + } + report["active_count"] = activeCount + report["usable_count"] = usableCount + report["control_plane_only_count"] = controlPlaneOnlyCount + report["invalid_count"] = invalidCount + report["expired_count"] = expiredCount + report["expiring_soon_count"] = expiringSoonCount + report["renewal_needed_count"] = renewalNeededCount + report["admitted_as_relay_count"] = admittedAsRelayCount + report["admitted_as_peer_count"] = admittedAsPeerCount + report["entry_observer_count"] = entryObserverCount + report["relay_control_ready_count"] = relayControlReadyCount + report["stale_relay_count"] = staleRelayCount + report["withdrawal_needed_count"] = withdrawalNeededCount + report["reselection_needed_count"] = reselectionNeededCount + report["refresh_needed"] = posture.RefreshNeeded + report["refresh_reason"] = posture.Reason + report["refresh_needed_count"] = posture.RefreshNeededCount + 
report["refresh_attempt_count"] = meshState.LeaseRefreshAttempts + report["refresh_success_count"] = meshState.LeaseRefreshSuccesses + report["refresh_failure_count"] = meshState.LeaseRefreshFailures + if meshState.LastLeaseRefresh != nil { + report["last_refresh_status"] = meshState.LastLeaseRefresh.Status + report["last_refresh_reason"] = meshState.LastLeaseRefresh.Reason + report["last_refresh_error"] = meshState.LastLeaseRefresh.Error + report["last_refresh_attempted_at"] = formatOptionalTime(meshState.LastLeaseRefresh.AttemptedAt) + report["last_refresh_completed_at"] = formatOptionalTime(meshState.LastLeaseRefresh.CompletedAt) + report["last_refresh_previous_lease_count"] = meshState.LastLeaseRefresh.PreviousLeaseCount + report["last_refresh_refreshed_lease_count"] = meshState.LastLeaseRefresh.RefreshedLeaseCount + report["last_refresh_config_version"] = meshState.LastLeaseRefresh.ConfigVersion + } + report["truncated"] = len(meshState.RendezvousLeases) > maxMeshRendezvousLeaseReportEntries + report["leases"] = leaseDetails + return report +} + +type meshRouteGenerationDecisionState struct { + DecisionID string + RouteID string + Generation string + DecisionSource string + LocalRole string + PreviousHopID string + NextHopID string + SelectedRelayID string + StaleRelayNodeID string + RendezvousLeaseID string + EffectiveHops []string + OriginalHops []string + PathScore int + Status string + ApplyStatus string + WithdrawStatus string + AppliedAt time.Time + WithdrawnAt time.Time + ControlPlaneOnly bool + ProductionForwarding bool +} + +type meshRouteGenerationTracker struct { + Generation string + PreviousGeneration string + LastAppliedAt time.Time + LastChangedAt time.Time + LastAppliedCount int + LastWithdrawnCount int + LastUnchangedCount int + TotalAppliedCount int + TotalWithdrawnCount int + Active map[string]meshRouteGenerationDecisionState + Withdrawn []meshRouteGenerationDecisionState +} + +func newMeshRouteGenerationTracker(report 
*client.RoutePathDecisionReport, observedAt time.Time) *meshRouteGenerationTracker { + tracker := &meshRouteGenerationTracker{ + Active: map[string]meshRouteGenerationDecisionState{}, + } + tracker.Apply(report, observedAt) + return tracker +} + +func (t *meshRouteGenerationTracker) Apply(report *client.RoutePathDecisionReport, observedAt time.Time) { + if t == nil { + return + } + if observedAt.IsZero() { + observedAt = time.Now().UTC() + } else { + observedAt = observedAt.UTC() + } + nextGeneration := "" + decisions := []client.RoutePathDecision{} + if report != nil { + nextGeneration = strings.TrimSpace(report.Generation) + decisions = append(decisions, report.Decisions...) + } + t.PreviousGeneration = t.Generation + t.Generation = nextGeneration + t.LastAppliedAt = observedAt + t.LastAppliedCount = 0 + t.LastWithdrawnCount = 0 + t.LastUnchangedCount = 0 + nextActive := map[string]meshRouteGenerationDecisionState{} + seen := map[string]struct{}{} + withdrawnRelayKeys := map[string]struct{}{} + for _, previous := range t.Withdrawn { + if key := routeGenerationWithdrawnRelayKey(previous); key != "" { + withdrawnRelayKeys[key] = struct{}{} + } + } + appliedReplacementDecisions := []client.RoutePathDecision{} + for _, decision := range decisions { + state := routeGenerationDecisionState(decision, observedAt) + key := routeGenerationDecisionKey(decision) + if key == "" { + continue + } + seen[key] = struct{}{} + if previous, ok := t.Active[key]; ok && routeGenerationDecisionSame(previous, decision) { + state.Status = "active" + state.ApplyStatus = "unchanged" + state.AppliedAt = previous.AppliedAt + t.LastUnchangedCount++ + } else { + state.Status = "active" + state.ApplyStatus = "applied" + state.AppliedAt = observedAt + t.LastAppliedCount++ + t.TotalAppliedCount++ + if decision.DecisionSource == "stale_relay_replacement" && strings.TrimSpace(decision.StaleRelayNodeID) != "" { + appliedReplacementDecisions = append(appliedReplacementDecisions, decision) + } + } + 
nextActive[key] = state + } + for key, previous := range t.Active { + if _, ok := seen[key]; ok { + continue + } + previous.Status = "withdrawn" + previous.ApplyStatus = "not_active" + previous.WithdrawStatus = "withdrawn" + previous.WithdrawnAt = observedAt + t.Withdrawn = append([]meshRouteGenerationDecisionState{previous}, t.Withdrawn...) + if relayKey := routeGenerationWithdrawnRelayKey(previous); relayKey != "" { + withdrawnRelayKeys[relayKey] = struct{}{} + } + t.LastWithdrawnCount++ + t.TotalWithdrawnCount++ + } + for _, decision := range appliedReplacementDecisions { + relayKey := routeGenerationRouteRelayKey(decision.RouteID, decision.StaleRelayNodeID) + if relayKey == "" { + continue + } + if _, alreadyWithdrawn := withdrawnRelayKeys[relayKey]; alreadyWithdrawn { + continue + } + withdrawn := routeGenerationReplacementWithdrawnDecisionState(decision, observedAt) + t.Withdrawn = append([]meshRouteGenerationDecisionState{withdrawn}, t.Withdrawn...) + withdrawnRelayKeys[relayKey] = struct{}{} + t.LastWithdrawnCount++ + t.TotalWithdrawnCount++ + } + if len(t.Withdrawn) > maxMeshRendezvousLeaseReportEntries { + t.Withdrawn = t.Withdrawn[:maxMeshRendezvousLeaseReportEntries] + } + if t.LastAppliedCount > 0 || t.LastWithdrawnCount > 0 || t.PreviousGeneration != t.Generation { + t.LastChangedAt = observedAt + } + t.Active = nextActive +} + +func routeGenerationDecisionState(decision client.RoutePathDecision, observedAt time.Time) meshRouteGenerationDecisionState { + return meshRouteGenerationDecisionState{ + DecisionID: decision.DecisionID, + RouteID: decision.RouteID, + Generation: decision.Generation, + DecisionSource: decision.DecisionSource, + LocalRole: decision.LocalRole, + PreviousHopID: decision.PreviousHopID, + NextHopID: decision.NextHopID, + SelectedRelayID: decision.SelectedRelayID, + StaleRelayNodeID: decision.StaleRelayNodeID, + RendezvousLeaseID: decision.RendezvousLeaseID, + EffectiveHops: append([]string{}, decision.EffectiveHops...), + 
OriginalHops: append([]string{}, decision.OriginalHops...), + PathScore: decision.PathScore, + Status: "active", + ApplyStatus: "applied", + WithdrawStatus: "not_withdrawn", + AppliedAt: observedAt, + ControlPlaneOnly: decision.ControlPlaneOnly, + ProductionForwarding: decision.ProductionForwarding, + } +} + +func routeGenerationReplacementWithdrawnDecisionState(decision client.RoutePathDecision, observedAt time.Time) meshRouteGenerationDecisionState { + withdrawnDecisionID := strings.TrimSpace(decision.RouteID) + "-path-withdrawn-stale-relay-" + strings.TrimSpace(decision.StaleRelayNodeID) + effectiveHops := append([]string{}, decision.OriginalHops...) + if len(effectiveHops) == 0 { + effectiveHops = append([]string{}, decision.EffectiveHops...) + } + return meshRouteGenerationDecisionState{ + DecisionID: withdrawnDecisionID, + RouteID: decision.RouteID, + Generation: decision.Generation, + DecisionSource: "stale_relay_withdrawn", + LocalRole: decision.LocalRole, + PreviousHopID: decision.PreviousHopID, + NextHopID: decision.StaleRelayNodeID, + SelectedRelayID: decision.SelectedRelayID, + StaleRelayNodeID: decision.StaleRelayNodeID, + RendezvousLeaseID: decision.RendezvousLeaseID, + EffectiveHops: effectiveHops, + OriginalHops: append([]string{}, decision.OriginalHops...), + PathScore: decision.PathScore, + Status: "withdrawn", + ApplyStatus: "not_active", + WithdrawStatus: "withdrawn_by_replacement", + WithdrawnAt: observedAt, + ControlPlaneOnly: decision.ControlPlaneOnly, + ProductionForwarding: decision.ProductionForwarding, + } +} + +func routeGenerationDecisionKey(decision client.RoutePathDecision) string { + if strings.TrimSpace(decision.DecisionID) != "" { + return strings.TrimSpace(decision.DecisionID) + } + if strings.TrimSpace(decision.RouteID) == "" { + return "" + } + return strings.TrimSpace(decision.RouteID) + "\x00" + strings.TrimSpace(decision.LocalNodeID) +} + +func routeGenerationRouteRelayKey(routeID string, relayNodeID string) string { + 
routeID = strings.TrimSpace(routeID) + relayNodeID = strings.TrimSpace(relayNodeID) + if routeID == "" || relayNodeID == "" { + return "" + } + return routeID + "\x00" + relayNodeID +} + +func routeGenerationWithdrawnRelayKey(state meshRouteGenerationDecisionState) string { + relayNodeID := strings.TrimSpace(state.StaleRelayNodeID) + if relayNodeID == "" { + relayNodeID = strings.TrimSpace(state.NextHopID) + } + return routeGenerationRouteRelayKey(state.RouteID, relayNodeID) +} + +func routeGenerationDecisionSame(previous meshRouteGenerationDecisionState, decision client.RoutePathDecision) bool { + return previous.Generation == decision.Generation && + previous.RouteID == decision.RouteID && + previous.DecisionSource == decision.DecisionSource && + previous.NextHopID == decision.NextHopID && + previous.SelectedRelayID == decision.SelectedRelayID && + strings.Join(previous.EffectiveHops, "\x00") == strings.Join(decision.EffectiveHops, "\x00") +} + +func meshRouteGenerationReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any { + observedAt = observedAt.UTC() + report := map[string]any{ + "schema_version": meshRouteGenerationReportSchema, + "cluster_id": identity.ClusterID, + "node_id": identity.NodeID, + "config_source": "", + "config_version": "", + "generation": "", + "previous_generation": "", + "tracker_contract": "node_side_route_generation_apply_withdraw", + "control_plane_only": true, + "production_payload_forwarding": false, + "service_workload_traffic": false, + "route_path_forwarding_runtime": false, + "observed_at": observedAt.Format(time.RFC3339Nano), + "active_decision_count": 0, + "applied_decision_count": 0, + "unchanged_decision_count": 0, + "withdrawn_decision_count": 0, + "total_applied_decision_count": 0, + "total_withdrawn_decision_count": 0, + "generation_changed": false, + "active_decisions": []map[string]any{}, + "withdrawn_decisions": []map[string]any{}, + } + if meshState == nil || 
meshState.RouteGenerationTracker == nil { + return report + } + tracker := meshState.RouteGenerationTracker + report["config_source"] = meshState.Source + report["config_version"] = meshState.ConfigVersion + report["generation"] = tracker.Generation + report["previous_generation"] = tracker.PreviousGeneration + report["last_applied_at"] = formatOptionalTime(tracker.LastAppliedAt) + report["last_changed_at"] = formatOptionalTime(tracker.LastChangedAt) + report["active_decision_count"] = len(tracker.Active) + report["applied_decision_count"] = tracker.LastAppliedCount + report["unchanged_decision_count"] = tracker.LastUnchangedCount + report["withdrawn_decision_count"] = tracker.LastWithdrawnCount + report["total_applied_decision_count"] = tracker.TotalAppliedCount + report["total_withdrawn_decision_count"] = tracker.TotalWithdrawnCount + report["generation_changed"] = tracker.PreviousGeneration != tracker.Generation + report["active_decisions"] = routeGenerationDecisionDetails(tracker.activeList(), maxMeshRendezvousLeaseReportEntries) + report["withdrawn_decisions"] = routeGenerationDecisionDetails(tracker.Withdrawn, maxMeshRendezvousLeaseReportEntries) + report["truncated"] = len(tracker.Active) > maxMeshRendezvousLeaseReportEntries || len(tracker.Withdrawn) > maxMeshRendezvousLeaseReportEntries + return report +} + +func (t *meshRouteGenerationTracker) activeList() []meshRouteGenerationDecisionState { + if t == nil { + return nil + } + out := make([]meshRouteGenerationDecisionState, 0, len(t.Active)) + for _, state := range t.Active { + out = append(out, state) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].RouteID != out[j].RouteID { + return out[i].RouteID < out[j].RouteID + } + return out[i].DecisionID < out[j].DecisionID + }) + return out +} + +func routeGenerationDecisionDetails(states []meshRouteGenerationDecisionState, limit int) []map[string]any { + out := make([]map[string]any, 0, minInt(len(states), limit)) + for index, state := range 
// meshRouteHealthConfigReport builds the route-health configuration report map
// for the given node identity.
//
// The report always carries the schema version, identity fields, the fixed
// boundary flags and zeroed counters. When meshState is nil that baseline is
// returned as is. Otherwise the config source/version, route counts, per-route
// details and the feedback refresh counters are filled in from meshState.
//
// Counters are computed over every route, while the "routes" detail list is
// capped at maxMeshRendezvousLeaseReportEntries; "truncated" reports whether
// the cap was exceeded.
func meshRouteHealthConfigReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	observedAt = observedAt.UTC()
	// Baseline report: returned unchanged when there is no mesh state.
	report := map[string]any{
		"schema_version":                              meshRouteHealthConfigReportSchema,
		"cluster_id":                                  identity.ClusterID,
		"node_id":                                     identity.NodeID,
		"config_source":                               "",
		"config_version":                              "",
		"route_health_config_contract":                "control_plane_route_path_decisions_to_synthetic_route_health",
		"control_plane_only":                          true,
		"route_health_only":                           true,
		"synthetic_route_health_route_path_runtime":   true,
		"production_route_path_forwarding_runtime":    false,
		"production_payload_forwarding":               false,
		"service_workload_traffic":                    false,
		"test_service_route_config_changed":           false,
		"observed_at":                                 observedAt.Format(time.RFC3339Nano),
		"config_refresh_interval_ms":                  int64(meshSyntheticConfigRefreshInterval / time.Millisecond),
		"feedback_refresh_backoff_ms":                 int64(meshRouteHealthFeedbackRefreshBackoff / time.Millisecond),
		"base_route_count":                            0,
		"route_health_route_count":                    0,
		"route_path_decision_applied_count":           0,
		"replacement_route_health_route_count":        0,
		"route_health_decision_drift_candidate_count": 0,
		"routes": []map[string]any{},
	}
	if meshState == nil {
		return report
	}
	report["config_source"] = meshState.Source
	report["config_version"] = meshState.ConfigVersion
	// The refresh timestamp key is only present once a refresh time is set.
	if !meshState.LastConfigRefreshAt.IsZero() {
		report["last_config_refresh_at"] = meshState.LastConfigRefreshAt.UTC().Format(time.RFC3339Nano)
	}
	report["base_route_count"] = len(meshState.Routes)
	// Fall back to the base routes when no route-health routes are set.
	routes := meshState.RouteHealthRoutes
	if len(routes) == 0 {
		routes = meshState.Routes
	}
	report["route_health_route_count"] = len(routes)
	// NOTE(review): routePathDecisionsByRoute is defined elsewhere; it is used
	// here as a lookup keyed by route ID — confirm how it handles a nil report.
	decisionsByRoute := routePathDecisionsByRoute(meshState.RoutePathDecisions)
	applied := 0
	replacements := 0
	driftCandidates := 0
	details := make([]map[string]any, 0, minInt(len(routes), maxMeshRendezvousLeaseReportEntries))
	for index, route := range routes {
		decision, ok := decisionsByRoute[route.RouteID]
		if ok {
			applied++
			if decision.DecisionSource == "stale_relay_replacement" {
				replacements++
			}
			// A route whose hops differ from the decision's effective hops
			// is counted as a drift candidate.
			if !sameStringSlice(route.Hops, decision.EffectiveHops) {
				driftCandidates++
			}
		}
		// Past the cap we keep counting but stop emitting detail entries,
		// hence continue rather than break.
		if index >= maxMeshRendezvousLeaseReportEntries {
			continue
		}
		item := map[string]any{
			"route_id":                    route.RouteID,
			"source_node_id":              route.SourceNodeID,
			"destination_node_id":         route.DestinationNodeID,
			"effective_hops":              append([]string{}, route.Hops...), // copy so the report does not alias route.Hops
			"route_version":               route.RouteVersion,
			"policy_version":              route.PolicyVersion,
			"peer_directory_version":      route.PeerDirectoryVersion,
			"route_path_decision_applied": ok,
		}
		// Decision-specific keys are only present when a decision matched.
		if ok {
			item["route_path_decision_id"] = decision.DecisionID
			item["route_path_decision_generation"] = decision.Generation
			item["route_path_decision_source"] = decision.DecisionSource
			item["selected_relay_id"] = decision.SelectedRelayID
			item["stale_relay_node_id"] = decision.StaleRelayNodeID
			item["next_hop_id"] = decision.NextHopID
			item["original_hops"] = append([]string{}, decision.OriginalHops...)
		}
		details = append(details, item)
	}
	report["route_path_decision_applied_count"] = applied
	report["replacement_route_health_route_count"] = replacements
	report["route_health_decision_drift_candidate_count"] = driftCandidates
	report["feedback_refresh_attempt_count"] = meshState.RouteHealthRefreshAttempts
	report["feedback_refresh_success_count"] = meshState.RouteHealthRefreshSuccesses
	report["feedback_refresh_failure_count"] = meshState.RouteHealthRefreshFailures
	report["feedback_refresh_suppressed_count"] = meshState.RouteHealthRefreshSuppressed
	report["routes"] = details
	report["truncated"] = len(routes) > maxMeshRendezvousLeaseReportEntries
	return report
}
// meshRoutePathDecisionReport builds the route-path decision report map for
// the given node identity.
//
// The baseline report (schema version, identity, fixed boundary flags, zeroed
// counters and an empty "decisions" list) is returned unchanged when meshState
// is nil. With a mesh state but no RoutePathDecisions, only the config source
// and version are added. Otherwise the control-plane report header is copied
// in, local counters are computed over every decision, and per-decision
// details are emitted up to maxMeshRendezvousLeaseReportEntries entries, with
// "truncated" reporting whether that cap was exceeded.
func meshRoutePathDecisionReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	observedAt = observedAt.UTC()
	report := map[string]any{
		"schema_version":                meshRoutePathDecisionReportSchema,
		"cluster_id":                    identity.ClusterID,
		"node_id":                       identity.NodeID,
		"config_source":                 "",
		"config_version":                "",
		"decision_contract":             "control_plane_route_path_decisions",
		"control_plane_only":            true,
		"production_payload_forwarding": false,
		"service_workload_traffic":      false,
		"route_path_forwarding_runtime": false,
		"observed_at":                   observedAt.Format(time.RFC3339Nano),
		"decision_count":                0,
		"replacement_decision_count":    0,
		"local_effective_path_count":    0,
		"withdrawn_local_relay_count":   0,
		"selected_local_relay_count":    0,
		"next_hop_available_count":      0,
		"decisions":                     []map[string]any{},
	}
	if meshState == nil {
		return report
	}
	report["config_source"] = meshState.Source
	report["config_version"] = meshState.ConfigVersion
	decisionReport := meshState.RoutePathDecisions
	if decisionReport == nil {
		return report
	}
	// Header values are copied from the decision report as stored, including
	// its own DecisionCount and ReplacementDecisionCount fields.
	report["control_plane_schema_version"] = decisionReport.SchemaVersion
	report["decision_mode"] = decisionReport.DecisionMode
	report["generation"] = decisionReport.Generation
	report["decision_count"] = decisionReport.DecisionCount
	report["replacement_decision_count"] = decisionReport.ReplacementDecisionCount
	report["control_plane_report_only"] = decisionReport.ControlPlaneOnly
	report["control_plane_report_production_forwarding"] = decisionReport.ProductionForwarding
	decisions := make([]map[string]any, 0, minInt(len(decisionReport.Decisions), maxMeshRendezvousLeaseReportEntries))
	localEffective := 0
	withdrawnLocalRelay := 0
	selectedLocalRelay := 0
	nextHopAvailable := 0
	for index, decision := range decisionReport.Decisions {
		// Counts decisions whose effective hops include this node.
		if containsString(decision.EffectiveHops, identity.NodeID) {
			localEffective++
		}
		if decision.LocalRole == "withdrawn_relay" {
			withdrawnLocalRelay++
		}
		if decision.LocalRole == "selected_relay" {
			selectedLocalRelay++
		}
		// A whitespace-only next hop is treated as unavailable.
		if strings.TrimSpace(decision.NextHopID) != "" {
			nextHopAvailable++
		}
		// Past the cap we keep counting but stop emitting detail entries,
		// hence continue rather than break.
		if index >= maxMeshRendezvousLeaseReportEntries {
			continue
		}
		decisions = append(decisions, map[string]any{
			"decision_id":             decision.DecisionID,
			"route_id":                decision.RouteID,
			"source_node_id":          decision.SourceNodeID,
			"destination_node_id":     decision.DestinationNodeID,
			"original_hops":           append([]string{}, decision.OriginalHops...), // copies keep the report from aliasing decision slices
			"effective_hops":          append([]string{}, decision.EffectiveHops...),
			"previous_hop_id":         decision.PreviousHopID,
			"next_hop_id":             decision.NextHopID,
			"local_role":              decision.LocalRole,
			"selected_relay_id":       decision.SelectedRelayID,
			"selected_relay_endpoint": decision.SelectedRelayEndpoint,
			"stale_relay_node_id":     decision.StaleRelayNodeID,
			"rendezvous_peer_node_id": decision.RendezvousPeerNodeID,
			"rendezvous_lease_id":     decision.RendezvousLeaseID,
			"rendezvous_lease_reason": decision.RendezvousLeaseReason,
			"decision_source":         decision.DecisionSource,
			"generation":              decision.Generation,
			"path_score":              decision.PathScore,
			"score_reasons":           append([]string{}, decision.ScoreReasons...),
			"control_plane_only":      decision.ControlPlaneOnly,
			"production_forwarding":   decision.ProductionForwarding,
			"expires_at":              formatOptionalTime(decision.ExpiresAt),
		})
	}
	report["local_effective_path_count"] = localEffective
	report["withdrawn_local_relay_count"] = withdrawnLocalRelay
	report["selected_local_relay_count"] = selectedLocalRelay
	report["next_hop_available_count"] = nextHopAvailable
	report["truncated"] = len(decisionReport.Decisions) > maxMeshRendezvousLeaseReportEntries
	report["decisions"] = decisions
	return report
}
|| meshState.PeerConnections == nil { + return out + } + for _, entry := range meshState.PeerConnections.Snapshot().Entries { + if strings.TrimSpace(entry.NodeID) != "" { + out[entry.NodeID] = entry + } + } + return out +} + +func meshRendezvousLeaseRenewalNeeded(lease mesh.PeerRendezvousLease, observedAt time.Time, usable bool) bool { + if !usable { + return false + } + ttlRemaining := lease.ExpiresAt.Sub(observedAt) + if ttlRemaining <= meshRendezvousLeaseRenewalWindow { + return true + } + renewalAfter := meshRendezvousLeaseRenewalAfter(lease) + return !renewalAfter.IsZero() && !renewalAfter.After(observedAt) +} + +func meshRendezvousLeaseStaleRelay(lease mesh.PeerRendezvousLease, connection mesh.PeerConnectionState) bool { + if strings.TrimSpace(lease.LeaseID) == "" || strings.TrimSpace(connection.NodeID) == "" { + return false + } + if !meshRendezvousLeaseMatchesConnection(lease, connection) { + return false + } + switch connection.State { + case mesh.PeerConnectionBackoff: + return true + case mesh.PeerConnectionDegraded: + return connection.ConsecutiveFailures > 0 + case mesh.PeerConnectionWaiting: + return connection.RendezvousLeaseID == lease.LeaseID && connection.LastFailureReason != "" + default: + return false + } +} + +func meshRendezvousLeaseMatchesConnection(lease mesh.PeerRendezvousLease, connection mesh.PeerConnectionState) bool { + if connection.RendezvousLeaseID != "" && connection.RendezvousLeaseID != lease.LeaseID { + return false + } + if connection.RelayNodeID != "" && connection.RelayNodeID != lease.RelayNodeID { + return false + } + return true +} + +func meshRendezvousLeaseRole(lease mesh.PeerRendezvousLease, localNodeID string) string { + localNodeID = strings.TrimSpace(localNodeID) + switch { + case localNodeID == "" || (lease.PeerNodeID != localNodeID && lease.RelayNodeID != localNodeID): + return "entry_or_observer" + case lease.PeerNodeID == localNodeID && lease.RelayNodeID == localNodeID: + return "self" + case lease.RelayNodeID == 
localNodeID: + return "relay" + case lease.PeerNodeID == localNodeID: + return "peer" + default: + return "entry_or_observer" + } +} + +func meshRendezvousLeaseStatus(valid bool, expired bool, renewalNeeded bool, role string) string { + switch { + case !valid: + return "invalid" + case expired: + return "expired" + case renewalNeeded: + return "renewal_needed" + case role == "relay": + return "admitted" + default: + return "active" + } +} + +func meshRendezvousLeaseRenewalAfter(lease mesh.PeerRendezvousLease) time.Time { + if lease.ExpiresAt.IsZero() { + return time.Time{} + } + if lease.IssuedAt.IsZero() || !lease.ExpiresAt.After(lease.IssuedAt) { + return lease.ExpiresAt.Add(-meshRendezvousLeaseRenewalWindow).UTC() + } + ttl := lease.ExpiresAt.Sub(lease.IssuedAt) + return lease.IssuedAt.Add(ttl * 2 / 3).UTC() +} + +func formatOptionalTime(value time.Time) string { + if value.IsZero() { + return "" + } + return value.UTC().Format(time.RFC3339Nano) +} + +func advertisedEndpointCandidates(cfg config.Config, identity state.Identity, observedAt time.Time) ([]mesh.PeerEndpointCandidate, error) { + var candidates []mesh.PeerEndpointCandidate + if cfg.MeshAdvertiseEndpointsJSON != "" { + if err := json.Unmarshal([]byte(cfg.MeshAdvertiseEndpointsJSON), &candidates); err != nil { + return nil, fmt.Errorf("parse RAP_MESH_ADVERTISE_ENDPOINTS_JSON: %w", err) + } + } + if cfg.MeshAdvertiseEndpoint != "" { + candidates = append(candidates, mesh.PeerEndpointCandidate{ + EndpointID: identity.NodeID + "-advertised", + NodeID: identity.NodeID, + Transport: cfg.MeshAdvertiseTransport, + Address: cfg.MeshAdvertiseEndpoint, + Reachability: reachabilityFromConnectivityMode(cfg.MeshConnectivityMode), + NATType: cfg.MeshNATType, + ConnectivityMode: cfg.MeshConnectivityMode, + Region: cfg.MeshRegion, + Priority: 10, + }) + } + for i := range candidates { + if candidates[i].EndpointID == "" { + candidates[i].EndpointID = fmt.Sprintf("%s-advertised-%d", identity.NodeID, i+1) + } + if 
// defaultString returns value unless it is empty or whitespace-only, in which
// case fallback is returned. A non-blank value is returned untrimmed.
func defaultString(value string, fallback string) string {
	if strings.TrimSpace(value) != "" {
		return value
	}
	return fallback
}

// containsString reports whether items holds value, comparing both sides with
// surrounding whitespace trimmed.
func containsString(items []string, value string) bool {
	needle := strings.TrimSpace(value)
	for i := range items {
		if strings.TrimSpace(items[i]) == needle {
			return true
		}
	}
	return false
}

// sameStringSlice reports whether left and right have the same length and
// pairwise-equal elements after trimming surrounding whitespace.
func sameStringSlice(left []string, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for i, candidate := range left {
		if strings.TrimSpace(candidate) != strings.TrimSpace(right[i]) {
			return false
		}
	}
	return true
}

// minInt returns the smaller of left and right.
func minInt(left, right int) int {
	if right <= left {
		return right
	}
	return left
}

// reachabilityFromConnectivityMode translates a connectivity mode into the
// reachability label reported for an endpoint; unrecognised modes map to
// "unknown".
func reachabilityFromConnectivityMode(connectivityMode string) string {
	reachabilityByMode := map[string]string{
		"outbound_only":  "outbound_only",
		"relay_required": "relay",
		"direct":         "public",
	}
	if reachability, known := reachabilityByMode[connectivityMode]; known {
		return reachability
	}
	return "unknown"
}
TestLoadSyntheticMeshConfigPrefersScopedFile(t *testing.T) { + route := mesh.SyntheticRoute{ + RouteID: "route-file", + ClusterID: "cluster-1", + SourceNodeID: "node-a", + DestinationNodeID: "node-b", + Hops: []string{"node-a", "node-b"}, + AllowedChannels: []string{mesh.SyntheticChannelFabricControl}, + ExpiresAt: time.Now().UTC().Add(time.Hour), + RouteVersion: "route-v1", + PolicyVersion: "policy-v1", + PeerDirectoryVersion: "peers-v1", + } + payload, err := json.Marshal(mesh.ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"}, + PeerDirectory: []mesh.PeerDirectoryEntry{ + {NodeID: "node-b", RouteIDs: []string{"route-file"}, EndpointCount: 1}, + }, + RecoverySeeds: []mesh.PeerRecoverySeed{ + {NodeID: "node-b", Endpoint: "http://127.0.0.1:19002", Transport: "direct_tcp_tls", Priority: 10}, + }, + Routes: []mesh.SyntheticRoute{route}, + }) + if err != nil { + t.Fatalf("marshal scoped config: %v", err) + } + path := filepath.Join(t.TempDir(), "mesh-scoped.json") + if err := os.WriteFile(path, payload, 0o600); err != nil { + t.Fatalf("write scoped config: %v", err) + } + + loaded, err := loadSyntheticMeshConfig(context.Background(), config.Config{ + MeshSyntheticConfigPath: path, + MeshPeerEndpointsJSON: `{"node-b":"http://debug.invalid"}`, + MeshSyntheticRoutesJSON: `[]`, + }, state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, nil) + if err != nil { + t.Fatalf("load synthetic config: %v", err) + } + if loaded.Source != "scoped_config" { + t.Fatalf("source = %q, want scoped_config", loaded.Source) + } + if loaded.PeerEndpoints["node-b"] != "http://127.0.0.1:19002" { + t.Fatalf("peer endpoint = %q", loaded.PeerEndpoints["node-b"]) + } + if len(loaded.Routes) != 1 || loaded.Routes[0].RouteID != "route-file" { + t.Fatalf("routes = %+v", loaded.Routes) + } + if len(loaded.PeerDirectory) != 1 || len(loaded.RecoverySeeds) != 1 { + 
t.Fatalf("peer runtime config missing: directory=%+v seeds=%+v", loaded.PeerDirectory, loaded.RecoverySeeds) + } +} + +func TestVerifyEnrollmentBootstrapAcceptsSignedApproval(t *testing.T) { + publicKey, privateKey, err := ed25519.GenerateKey(nil) + if err != nil { + t.Fatalf("generate key: %v", err) + } + publicKeyB64 := base64.StdEncoding.EncodeToString(publicKey) + fingerprint := agentauthority.Fingerprint(publicKey) + payload := json.RawMessage(`{ + "schema_version":"rap.cluster.node_approval.v1", + "cluster_id":"cluster-1", + "join_request_id":"join-request-1", + "node_id":"node-1", + "node_fingerprint":"fp-1", + "identity_status":"active", + "heartbeat_endpoint":"/api/v1/clusters/cluster-1/nodes/node-1/heartbeats", + "approved_by_user_id":"admin-1", + "issued_at":"2026-04-28T12:00:00Z", + "control_plane_only":true, + "production_forwarding":false + }`) + canonical, err := agentauthority.CanonicalJSON(payload) + if err != nil { + t.Fatalf("canonical json: %v", err) + } + bootstrap := client.NodeBootstrap{ + NodeID: "node-1", + ClusterID: "cluster-1", + IdentityStatus: "active", + HeartbeatEndpoint: "/api/v1/clusters/cluster-1/nodes/node-1/heartbeats", + ClusterAuthority: &client.ClusterAuthorityDescriptor{ + SchemaVersion: agentauthority.AuthoritySchemaVersion, + ClusterID: "cluster-1", + AuthorityState: "active", + KeyAlgorithm: agentauthority.AlgorithmEd25519, + PublicKey: publicKeyB64, + PublicKeyFingerprint: fingerprint, + }, + AuthorityPayload: payload, + AuthoritySignature: &client.ClusterSignature{ + SchemaVersion: agentauthority.SignatureSchemaVersion, + Algorithm: agentauthority.AlgorithmEd25519, + KeyFingerprint: fingerprint, + Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)), + SignedAt: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC), + }, + } + + err = verifyEnrollmentBootstrap(bootstrap, state.Identity{ + ClusterID: "cluster-1", + NodeFingerprint: "fp-1", + }, config.Config{ClusterAuthorityFingerprint: fingerprint}) 
+ if err != nil { + t.Fatalf("verify enrollment bootstrap: %v", err) + } +} + +func TestVerifyEnrollmentBootstrapRejectsPinnedAuthorityMismatch(t *testing.T) { + bootstrap := client.NodeBootstrap{ + NodeID: "node-1", + ClusterID: "cluster-1", + IdentityStatus: "active", + ClusterAuthority: &client.ClusterAuthorityDescriptor{ + SchemaVersion: agentauthority.AuthoritySchemaVersion, + ClusterID: "cluster-1", + KeyAlgorithm: agentauthority.AlgorithmEd25519, + PublicKeyFingerprint: "rap-ca-ed25519-other", + }, + } + err := verifyEnrollmentBootstrap(bootstrap, state.Identity{ + ClusterID: "cluster-1", + NodeFingerprint: "fp-1", + }, config.Config{ClusterAuthorityFingerprint: "rap-ca-ed25519-expected"}) + if err == nil { + t.Fatal("expected pinned authority mismatch") + } +} + +func TestSyntheticQualityScoreIsBounded(t *testing.T) { + cases := []struct { + latency int + min int + max int + }{ + {latency: 0, min: 100, max: 100}, + {latency: 50, min: 90, max: 100}, + {latency: 10000, min: 1, max: 1}, + } + for _, tc := range cases { + score := syntheticQualityScore(tc.latency) + if score < tc.min || score > tc.max { + t.Fatalf("syntheticQualityScore(%d) = %d, want [%d,%d]", tc.latency, score, tc.min, tc.max) + } + } +} + +func TestProductionEnvelopeObservationSinkFromConfigIsDisabledByDefault(t *testing.T) { + sink := productionEnvelopeObservationSinkFromConfig(config.Config{}) + if sink != nil { + t.Fatal("sink is enabled by default") + } +} + +func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) { + payload := heartbeatPayload(config.Config{ + MeshAdvertiseEndpoint: "https://node-a.example.test:443", + MeshAdvertiseTransport: "wss", + MeshConnectivityMode: "outbound_only", + MeshNATType: "symmetric", + MeshRegion: "eu", + MeshSyntheticRuntimeEnabled: true, + MeshProductionForwardingEnabled: true, + }, state.Identity{ + ClusterID: "cluster-1", + NodeID: "node-a", + }, nil, time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)) + + report, ok := 
payload.Metadata["mesh_endpoint_report"].(map[string]any) + if !ok { + t.Fatalf("mesh endpoint report missing: %+v", payload.Metadata) + } + if report["peer_endpoint"] != "https://node-a.example.test:443" || + report["connectivity_mode"] != "outbound_only" || + report["nat_type"] != "symmetric" || + report["region"] != "eu" { + t.Fatalf("unexpected endpoint report: %+v", report) + } + if payload.Capabilities["mesh_dynamic_endpoint_reporting"] != true { + t.Fatalf("dynamic endpoint capability missing: %+v", payload.Capabilities) + } +} + +func TestHeartbeatPayloadReportsMultipleMeshEndpoints(t *testing.T) { + payload := heartbeatPayload(config.Config{ + MeshAdvertiseEndpointsJSON: `[{ + "endpoint_id": "node-a-lan", + "address": "http://10.24.10.10:19001", + "transport": "direct_tcp_tls", + "reachability": "private", + "connectivity_mode": "direct", + "nat_type": "none", + "region": "corp-eu", + "priority": 1, + "policy_tags": ["corp-lan", "same-site"] + },{ + "endpoint_id": "node-a-public", + "address": "https://node-a.example.test:443", + "transport": "direct_tcp_tls", + "reachability": "public", + "connectivity_mode": "direct", + "nat_type": "none", + "priority": 10 + }]`, + MeshRegion: "corp-eu", + }, state.Identity{ + ClusterID: "cluster-1", + NodeID: "node-a", + }, nil, time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)) + + report, ok := payload.Metadata["mesh_endpoint_report"].(map[string]any) + if !ok { + t.Fatalf("mesh endpoint report missing: %+v", payload.Metadata) + } + candidates, ok := report["endpoint_candidates"].([]mesh.PeerEndpointCandidate) + if !ok || len(candidates) != 2 { + t.Fatalf("unexpected endpoint candidates: %#v", report["endpoint_candidates"]) + } + if candidates[0].EndpointID != "node-a-lan" || candidates[0].Reachability != "private" { + t.Fatalf("internal endpoint candidate not preserved: %+v", candidates[0]) + } + if report["peer_endpoint"] != "http://10.24.10.10:19001" { + t.Fatalf("default peer endpoint = %v", report["peer_endpoint"]) + 
} +} + +func TestHeartbeatPayloadIncludesPeerRecoveryReportWithoutAdvertisedEndpoint(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + local := mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} + peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: local, + PeerEndpoints: map[string]string{ + "node-b": "http://node-b:19001", + "node-c": "http://node-c:19001", + "node-d": "http://node-d:19001", + }, + WarmPeerLimit: 3, + Now: now, + }) + peerConnections := mesh.NewPeerConnectionTracker(peerCache.Snapshot(), now) + peerConnections.RecordSuccess("node-b", 20, now) + meshState := &syntheticMeshState{ + PeerCache: peerCache, + PeerConnections: peerConnections, + } + + payload := heartbeatPayload(config.Config{}, state.Identity{ + ClusterID: "cluster-1", + NodeID: "node-a", + }, meshState, now) + + report, ok := payload.Metadata["mesh_peer_recovery_report"].(map[string]any) + if !ok { + t.Fatalf("mesh peer recovery report missing: %+v", payload.Metadata) + } + if report["schema_version"] != "c17z9.mesh_peer_recovery_report.v1" || + report["mode"] != mesh.PeerRecoveryModeRecovery || + report["ready_peer_count"] != 1 || + report["target_ready_peers"] != mesh.DefaultStablePeerTarget || + report["deficit"] != 2 { + t.Fatalf("unexpected recovery report: %+v", report) + } + if payload.Capabilities["mesh_peer_recovery_planning"] != true { + t.Fatalf("peer recovery capability missing: %+v", payload.Capabilities) + } + intentReport, ok := payload.Metadata["mesh_peer_connection_intent_report"].(map[string]any) + if !ok { + t.Fatalf("mesh peer connection intent report missing: %+v", payload.Metadata) + } + if intentReport["schema_version"] != "c17z12.mesh_peer_connection_intent_report.v1" || + intentReport["intent_count"] != 3 || + intentReport["recover_count"] != 2 { + t.Fatalf("unexpected connection intent report: %+v", intentReport) + } + if payload.Capabilities["mesh_peer_connection_intent_planning"] != true { + t.Fatalf("connection 
intent capability missing: %+v", payload.Capabilities) + } + if _, ok := payload.Metadata["mesh_endpoint_report"]; ok { + t.Fatalf("endpoint report should not be emitted without advertised endpoint: %+v", payload.Metadata) + } +} + +func TestHeartbeatPayloadIncludesRendezvousLeaseAdmissionReport(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"} + leases := []mesh.PeerRendezvousLease{ + { + LeaseID: "lease-node-b-via-node-a", + PeerNodeID: "node-b", + RelayNodeID: "node-a", + RelayEndpoint: "http://node-a:19001", + Transport: "relay_control", + ConnectivityMode: "relay_required", + RouteIDs: []string{"route-ab"}, + AllowedChannels: []string{mesh.SyntheticChannelFabricControl}, + Priority: 10, + ControlPlaneOnly: true, + IssuedAt: now.Add(-time.Minute), + ExpiresAt: now.Add(5 * time.Minute), + }, + { + LeaseID: "lease-node-a-via-node-r", + PeerNodeID: "node-a", + RelayNodeID: "node-r", + RelayEndpoint: "http://node-r:19001", + Transport: "relay_control", + ConnectivityMode: "relay_required", + RouteIDs: []string{"route-ra"}, + Priority: 20, + ControlPlaneOnly: true, + IssuedAt: now.Add(-2 * time.Minute), + ExpiresAt: now.Add(30 * time.Second), + }, + { + LeaseID: "lease-node-c-via-node-r-expired", + PeerNodeID: "node-c", + RelayNodeID: "node-r", + RelayEndpoint: "http://node-r:19001", + Transport: "relay_control", + ConnectivityMode: "relay_required", + RouteIDs: []string{"route-cr"}, + Priority: 30, + ControlPlaneOnly: true, + IssuedAt: now.Add(-10 * time.Minute), + ExpiresAt: now.Add(-time.Second), + }, + } + cache := mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}, + RendezvousLeases: leases, + WarmPeerLimit: 3, + Now: now, + }) + tracker := mesh.NewPeerConnectionTracker(cache.Snapshot(), now) + tracker.RecordRelayReady(mesh.PeerCacheEntry{ + NodeID: "node-b", + Endpoint: "http://node-a:19001", + 
Warm: true, + RendezvousLeaseID: "lease-node-b-via-node-a", + RelayNodeID: "node-a", + RelayEndpoint: "http://node-a:19001", + RelayControl: true, + BestTransport: "relay_control", + BestReachability: "relay", + BestConnectivity: "relay_required", + BestCandidateScore: 500, + }, 12, now.Add(time.Second)) + meshState := &syntheticMeshState{ + PeerCache: cache, + RendezvousLeases: leases, + PeerConnections: tracker, + } + + payload := heartbeatPayload(config.Config{}, identity, meshState, now) + + report, ok := payload.Metadata["mesh_rendezvous_lease_report"].(map[string]any) + if !ok { + t.Fatalf("rendezvous lease report missing: %+v", payload.Metadata) + } + if report["schema_version"] != meshRendezvousLeaseReportSchema || + report["lease_count"] != 3 || + report["active_count"] != 2 || + report["expired_count"] != 1 || + report["admitted_as_relay_count"] != 1 || + report["admitted_as_peer_count"] != 1 || + report["renewal_needed_count"] != 1 || + report["relay_control_ready_count"] != 1 { + t.Fatalf("unexpected lease report: %+v", report) + } + if report["control_plane_only"] != true || + report["relay_payload_forwarding"] != false || + report["production_payload_forwarding"] != false { + t.Fatalf("payload boundary flags not preserved: %+v", report) + } + leaseDetails, ok := report["leases"].([]map[string]any) + if !ok || len(leaseDetails) != 3 { + t.Fatalf("unexpected lease details: %#v", report["leases"]) + } + if leaseDetails[0]["role"] != "relay" || + leaseDetails[0]["status"] != "admitted" || + leaseDetails[0]["admitted"] != true || + leaseDetails[0]["relay_ready"] != true { + t.Fatalf("relay admission detail missing: %+v", leaseDetails[0]) + } + if leaseDetails[1]["role"] != "peer" || + leaseDetails[1]["status"] != "renewal_needed" || + leaseDetails[1]["renewal_needed"] != true { + t.Fatalf("peer renewal detail missing: %+v", leaseDetails[1]) + } + if payload.Capabilities[meshRendezvousLeaseTelemetryCapability] != true { + t.Fatalf("lease telemetry 
capability missing: %+v", payload.Capabilities) + } +} + +func TestHeartbeatPayloadReportsStaleRelayWithdrawalTelemetry(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-r"} + lease := mesh.PeerRendezvousLease{ + LeaseID: "lease-node-b-via-node-r", + PeerNodeID: "node-b", + RelayNodeID: "node-r", + RelayEndpoint: "http://node-r:19001", + Transport: "relay_control", + ConnectivityMode: "relay_required", + RouteIDs: []string{"route-rb"}, + Priority: 10, + ControlPlaneOnly: true, + IssuedAt: now.Add(-time.Minute), + ExpiresAt: now.Add(10 * time.Minute), + } + altLease := lease + altLease.LeaseID = "lease-node-b-via-node-r2" + altLease.RelayNodeID = "node-r2" + altLease.RelayEndpoint = "http://node-r2:19001" + altLease.Priority = 20 + cache := mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}, + RendezvousLeases: []mesh.PeerRendezvousLease{lease, altLease}, + WarmPeerLimit: 1, + Now: now, + }) + tracker := mesh.NewPeerConnectionTracker(cache.Snapshot(), now) + peer := mesh.PeerCacheEntry{ + NodeID: "node-b", + Endpoint: "http://node-r:19001", + Warm: true, + RendezvousLeaseID: "lease-node-b-via-node-r", + RelayNodeID: "node-r", + RelayEndpoint: "http://node-r:19001", + RelayControl: true, + } + tracker.RecordRelayReady(peer, 10, now.Add(time.Second)) + tracker.RecordFailure("node-b", "relay health failed", now.Add(2*time.Second)) + tracker.RecordFailure("node-b", "relay health failed", now.Add(3*time.Second)) + tracker.RecordFailure("node-b", "relay health failed", now.Add(4*time.Second)) + meshState := &syntheticMeshState{ + PeerCache: cache, + RendezvousLeases: []mesh.PeerRendezvousLease{lease, altLease}, + PeerConnections: tracker, + Source: "control_plane", + } + + payload := heartbeatPayload(config.Config{}, identity, meshState, now.Add(5*time.Second)) + + report := 
payload.Metadata["mesh_rendezvous_lease_report"].(map[string]any) + if report["stale_relay_count"] != 1 || + report["withdrawal_needed_count"] != 1 || + report["reselection_needed_count"] != 0 || + report["refresh_needed_count"] != 1 { + t.Fatalf("unexpected stale relay report: %+v", report) + } + leaseDetails := report["leases"].([]map[string]any) + if leaseDetails[0]["stale_relay"] != true || + leaseDetails[0]["withdrawal_needed"] != true || + leaseDetails[0]["connection_state"] != mesh.PeerConnectionBackoff { + t.Fatalf("stale relay detail missing: %+v", leaseDetails[0]) + } + if leaseDetails[1]["stale_relay"] != false || + leaseDetails[1]["withdrawal_needed"] != false { + t.Fatalf("alternate relay lease should not inherit stale state: %+v", leaseDetails[1]) + } +} + +func TestRefreshRendezvousLeasesIfNeededReloadsControlPlaneConfig(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"} + oldLease := mesh.PeerRendezvousLease{ + LeaseID: "lease-node-b-via-node-r-old", + PeerNodeID: "node-b", + RelayNodeID: "node-r-old", + RelayEndpoint: "http://node-r-old:19001", + Transport: "relay_control", + ConnectivityMode: "relay_required", + RouteIDs: []string{"route-ab"}, + Priority: 10, + ControlPlaneOnly: true, + IssuedAt: now.Add(-2 * time.Minute), + ExpiresAt: now.Add(30 * time.Second), + } + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + oldCache := mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: local, + RendezvousLeases: []mesh.PeerRendezvousLease{oldLease}, + WarmPeerLimit: 1, + Now: now, + }) + tracker := mesh.NewPeerConnectionTracker(oldCache.Snapshot(), now) + oldPathDecisions := &client.RoutePathDecisionReport{ + SchemaVersion: "c17z18.route_path_decisions.v1", + DecisionMode: "control_plane_effective_path_from_relay_policy", + Generation: "old-config", + DecisionCount: 1, + ReplacementDecisionCount: 0, + ControlPlaneOnly: true, + 
ProductionForwarding: false, + Decisions: []client.RoutePathDecision{ + { + DecisionID: "route-ab-path-node-a-via-node-r-old", + RouteID: "route-ab", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + SourceNodeID: "node-a", + DestinationNodeID: "node-b", + OriginalHops: []string{"node-a", "node-r-old", "node-b"}, + EffectiveHops: []string{"node-a", "node-r-old", "node-b"}, + NextHopID: "node-r-old", + LocalRole: "entry", + DecisionSource: "route_intent", + Generation: "old-config", + PathScore: 1000, + ControlPlaneOnly: true, + ProductionForwarding: false, + ExpiresAt: now.Add(10 * time.Minute), + }, + }, + } + meshState := &syntheticMeshState{ + PeerCache: oldCache, + RendezvousLeases: []mesh.PeerRendezvousLease{oldLease}, + RoutePathDecisions: oldPathDecisions, + RouteGenerationTracker: newMeshRouteGenerationTracker( + oldPathDecisions, + now.Add(-time.Minute), + ), + PeerConnections: tracker, + PeerConnectionManager: mesh.NewPeerConnectionManager(mesh.PeerConnectionManagerConfig{Local: local, PeerCache: oldCache, Tracker: tracker, RendezvousLeases: []mesh.PeerRendezvousLease{oldLease}}), + Source: "control_plane", + ConfigVersion: "old-config", + } + + requests := 0 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/clusters/cluster-1/nodes/node-a/mesh/synthetic-config" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + requests++ + response := map[string]any{ + "synthetic_mesh_config": map[string]any{ + "enabled": true, + "schema_version": "c17z18.synthetic.v1", + "cluster_id": "cluster-1", + "local_node_id": "node-a", + "config_version": "new-config", + "peer_directory_version": "new-config", + "policy_version": "new-config", + "peer_endpoints": map[string]string{"node-r-new": "http://node-r-new:19001"}, + "peer_endpoint_candidates": map[string]any{ + "node-b": []map[string]any{ + { + "endpoint_id": "node-b-outbound-only", + "node_id": "node-b", + "transport": "outbound_reverse", + 
"address": "http://node-b:19002", + "address_family": "ipv4", + "reachability": "outbound_only", + "connectivity_mode": "outbound_only", + "nat_type": "symmetric", + "region": "test", + "priority": 5, + }, + }, + }, + "peer_directory": []map[string]any{ + {"node_id": "node-b", "route_ids": []string{"route-ab"}, "endpoint_count": 0, "candidate_count": 1, "connectivity_modes": []string{"relay_required"}, "recovery_seed": false}, + }, + "rendezvous_leases": []map[string]any{ + { + "lease_id": "lease-node-b-via-node-r-new", + "peer_node_id": "node-b", + "relay_node_id": "node-r-new", + "relay_endpoint": "http://node-r-new:19001", + "transport": "relay_control", + "connectivity_mode": "relay_required", + "route_ids": []string{"route-ab"}, + "allowed_channels": []string{mesh.SyntheticChannelFabricControl}, + "priority": 5, + "control_plane_only": true, + "issued_at": now, + "expires_at": now.Add(10 * time.Minute), + "reason": "refresh_test", + }, + }, + "route_path_decisions": map[string]any{ + "schema_version": "c17z18.route_path_decisions.v1", + "decision_mode": "control_plane_effective_path_from_relay_policy", + "generation": "new-config", + "decision_count": 1, + "replacement_decision_count": 1, + "control_plane_only": true, + "production_forwarding": false, + "decisions": []map[string]any{ + { + "decision_id": "route-ab-path-node-a-via-node-r-new", + "route_id": "route-ab", + "cluster_id": "cluster-1", + "local_node_id": "node-a", + "source_node_id": "node-a", + "destination_node_id": "node-b", + "original_hops": []string{"node-a", "node-r-old", "node-r-new", "node-b"}, + "effective_hops": []string{"node-a", "node-r-new", "node-b"}, + "next_hop_id": "node-r-new", + "local_role": "entry", + "selected_relay_id": "node-r-new", + "selected_relay_endpoint": "http://node-r-new:19001", + "stale_relay_node_id": "node-r-old", + "rendezvous_lease_id": "lease-node-b-via-node-r-new", + "rendezvous_lease_reason": "stale_relay_replacement", + "decision_source": 
"stale_relay_replacement", + "generation": "new-config", + "path_score": 900, + "score_reasons": []string{"relay_replacement_policy"}, + "control_plane_only": true, + "production_forwarding": false, + "expires_at": now.Add(10 * time.Minute), + }, + }, + }, + "routes": []map[string]any{ + { + "route_id": "route-ab", + "cluster_id": "cluster-1", + "source_node_id": "node-a", + "destination_node_id": "node-b", + "hops": []string{"node-a", "node-r-old", "node-r-new", "node-b"}, + "allowed_channels": []string{mesh.SyntheticChannelFabricControl}, + "expires_at": now.Add(10 * time.Minute), + "max_ttl": 6, + "max_hops": 6, + }, + }, + "production_forwarding": false, + }, + } + if err := json.NewEncoder(w).Encode(response); err != nil { + t.Fatalf("write response: %v", err) + } + })) + defer server.Close() + + err := refreshRendezvousLeasesIfNeeded(context.Background(), config.Config{}, identity, client.New(server.URL), meshState, now) + if err != nil { + t.Fatalf("refresh leases: %v", err) + } + if requests != 1 { + t.Fatalf("requests = %d, want 1", requests) + } + if meshState.ConfigVersion != "new-config" || + len(meshState.RendezvousLeases) != 1 || + meshState.RendezvousLeases[0].RelayNodeID != "node-r-new" { + t.Fatalf("mesh state was not refreshed: version=%s leases=%+v", meshState.ConfigVersion, meshState.RendezvousLeases) + } + if meshState.RoutePathDecisions == nil || + meshState.RoutePathDecisions.SchemaVersion != "c17z18.route_path_decisions.v1" || + meshState.RoutePathDecisions.ReplacementDecisionCount != 1 || + len(meshState.RoutePathDecisions.Decisions) != 1 || + meshState.RoutePathDecisions.Decisions[0].NextHopID != "node-r-new" { + t.Fatalf("route path decisions were not refreshed: %+v", meshState.RoutePathDecisions) + } + if len(meshState.Routes) != 1 || + !sameStringSlice(meshState.Routes[0].Hops, []string{"node-a", "node-r-old", "node-r-new", "node-b"}) { + t.Fatalf("base routes should remain original for non-route-health runtime: %+v", meshState.Routes) 
+ } + if len(meshState.RouteHealthRoutes) != 1 || + !sameStringSlice(meshState.RouteHealthRoutes[0].Hops, []string{"node-a", "node-r-new", "node-b"}) || + meshState.RouteHealthRoutes[0].RouteVersion != "new-config" { + t.Fatalf("route health routes were not generated from path decisions: %+v", meshState.RouteHealthRoutes) + } + if meshState.LeaseRefreshAttempts != 1 || meshState.LeaseRefreshSuccesses != 1 || meshState.LeaseRefreshFailures != 0 { + t.Fatalf("unexpected refresh counters: attempts=%d successes=%d failures=%d", meshState.LeaseRefreshAttempts, meshState.LeaseRefreshSuccesses, meshState.LeaseRefreshFailures) + } + if meshState.LastLeaseRefresh == nil || + meshState.LastLeaseRefresh.Status != "succeeded" || + meshState.LastLeaseRefresh.Reason != "renewal_needed" { + t.Fatalf("unexpected refresh state: %+v", meshState.LastLeaseRefresh) + } + if meshState.LastConfigRefreshAt.IsZero() { + t.Fatalf("last config refresh time was not updated") + } + recoveryPlan := peerRecoveryPlan(meshState, now.Add(time.Second)) + intentPlan := peerConnectionIntentPlan(meshState, recoveryPlan, now.Add(time.Second)) + var peerIntent mesh.PeerConnectionIntent + for _, intent := range intentPlan.Intents { + if intent.NodeID == "node-b" { + peerIntent = intent + break + } + } + if intentPlan.RendezvousResolvedCount != 1 || + peerIntent.RendezvousLeaseID != "lease-node-b-via-node-r-new" || + peerIntent.RelayNodeID != "node-r-new" { + t.Fatalf("refreshed lease was not selected: %+v", intentPlan) + } + pathReport := meshRoutePathDecisionReport(meshState, identity, now.Add(time.Second)) + if pathReport["schema_version"] != meshRoutePathDecisionReportSchema || + pathReport["replacement_decision_count"] != 1 || + pathReport["next_hop_available_count"] != 1 || + pathReport["production_payload_forwarding"] != false { + t.Fatalf("unexpected route path decision report: %+v", pathReport) + } + generationReport := meshRouteGenerationReport(meshState, identity, now.Add(time.Second)) + if 
generationReport["schema_version"] != meshRouteGenerationReportSchema || + generationReport["active_decision_count"] != 1 || + generationReport["applied_decision_count"] != 1 || + generationReport["withdrawn_decision_count"] != 1 || + generationReport["generation_changed"] != true || + generationReport["production_payload_forwarding"] != false { + t.Fatalf("unexpected route generation report: %+v", generationReport) + } + routeHealthConfigReport := meshRouteHealthConfigReport(meshState, identity, now.Add(time.Second)) + if routeHealthConfigReport["schema_version"] != meshRouteHealthConfigReportSchema || + routeHealthConfigReport["route_path_decision_applied_count"] != 1 || + routeHealthConfigReport["replacement_route_health_route_count"] != 1 || + routeHealthConfigReport["synthetic_route_health_route_path_runtime"] != true || + routeHealthConfigReport["test_service_route_config_changed"] != false || + routeHealthConfigReport["config_refresh_interval_ms"] != int64(meshSyntheticConfigRefreshInterval/time.Millisecond) || + routeHealthConfigReport["production_payload_forwarding"] != false { + t.Fatalf("unexpected route health config report: %+v", routeHealthConfigReport) + } +} + +func TestRouteHealthFeedbackRefreshAppliesReplacementConfig(t *testing.T) { + now := time.Date(2026, 4, 28, 22, 0, 0, 0, time.UTC) + identity := state.Identity{ClusterID: "cluster-1", NodeID: "node-a"} + local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID} + oldRoute := mesh.SyntheticRoute{ + RouteID: "route-ab", + ClusterID: "cluster-1", + SourceNodeID: "node-a", + DestinationNodeID: "node-b", + Hops: []string{"node-a", "node-r-old", "node-b"}, + AllowedChannels: []string{mesh.SyntheticChannelFabricControl}, + ExpiresAt: now.Add(10 * time.Minute), + RouteVersion: "old-config", + } + oldCache := mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: local, + PeerEndpoints: map[string]string{"node-r-old": "http://node-r-old:19001"}, + Routes: 
[]mesh.SyntheticRoute{oldRoute}, + WarmPeerLimit: 1, + Now: now, + }) + meshState := &syntheticMeshState{ + PeerCache: oldCache, + Routes: []mesh.SyntheticRoute{oldRoute}, + RouteHealthRoutes: []mesh.SyntheticRoute{oldRoute}, + Source: "control_plane", + ConfigVersion: "old-config", + PeerConnections: mesh.NewPeerConnectionTracker(oldCache.Snapshot(), now), + } + + requests := 0 + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/clusters/cluster-1/nodes/node-a/mesh/synthetic-config" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + requests++ + response := map[string]any{ + "synthetic_mesh_config": map[string]any{ + "enabled": true, + "schema_version": "c17z20.synthetic.v1", + "cluster_id": "cluster-1", + "local_node_id": "node-a", + "config_version": "new-config", + "peer_directory_version": "new-config", + "policy_version": "new-config", + "peer_endpoints": map[string]string{"node-r-new": "http://node-r-new:19001"}, + "rendezvous_leases": []map[string]any{ + { + "lease_id": "lease-node-b-via-node-r-new", + "peer_node_id": "node-b", + "relay_node_id": "node-r-new", + "relay_endpoint": "http://node-r-new:19001", + "transport": "relay_control", + "connectivity_mode": "relay_required", + "route_ids": []string{"route-ab"}, + "allowed_channels": []string{mesh.SyntheticChannelFabricControl}, + "priority": 5, + "control_plane_only": true, + "issued_at": now, + "expires_at": now.Add(10 * time.Minute), + "reason": "stale_relay_replacement", + }, + }, + "route_path_decisions": map[string]any{ + "schema_version": "c17z20.route_path_decisions.v1", + "decision_mode": "control_plane_effective_path_from_relay_policy", + "generation": "new-config", + "decision_count": 1, + "replacement_decision_count": 1, + "control_plane_only": true, + "production_forwarding": false, + "decisions": []map[string]any{ + { + "decision_id": "route-ab-path-node-a-via-node-r-new", + "route_id": "route-ab", + "cluster_id": 
"cluster-1", + "local_node_id": "node-a", + "source_node_id": "node-a", + "destination_node_id": "node-b", + "original_hops": []string{"node-a", "node-r-old", "node-r-new", "node-b"}, + "effective_hops": []string{"node-a", "node-r-new", "node-b"}, + "next_hop_id": "node-r-new", + "local_role": "entry", + "selected_relay_id": "node-r-new", + "selected_relay_endpoint": "http://node-r-new:19001", + "stale_relay_node_id": "node-r-old", + "rendezvous_peer_node_id": "node-b", + "rendezvous_lease_id": "lease-node-b-via-node-r-new", + "rendezvous_lease_reason": "stale_relay_replacement", + "decision_source": "stale_relay_replacement", + "generation": "new-config", + "path_score": 900, + "score_reasons": []string{"route_health_feedback"}, + "control_plane_only": true, + "production_forwarding": false, + "expires_at": now.Add(10 * time.Minute), + }, + }, + }, + "routes": []map[string]any{ + { + "route_id": "route-ab", + "cluster_id": "cluster-1", + "source_node_id": "node-a", + "destination_node_id": "node-b", + "hops": []string{"node-a", "node-r-old", "node-r-new", "node-b"}, + "allowed_channels": []string{mesh.SyntheticChannelFabricControl}, + "expires_at": now.Add(10 * time.Minute), + "max_ttl": 6, + "max_hops": 6, + }, + }, + "production_forwarding": false, + }, + } + if err := json.NewEncoder(w).Encode(response); err != nil { + t.Fatalf("write response: %v", err) + } + })) + defer server.Close() + + trigger := meshRouteHealthFeedbackTrigger{ + Reason: "synthetic_route_health_drift", + RouteID: "route-ab", + PeerNodeID: "node-b", + SelectedRelayID: "node-r-old", + LinkStatus: "reachable", + DriftDetected: true, + ObservedAt: now, + } + err := refreshSyntheticMeshConfigForRouteHealthFeedback(context.Background(), config.Config{}, identity, client.New(server.URL), meshState, trigger, now) + if err != nil { + t.Fatalf("refresh route health feedback: %v", err) + } + if requests != 1 { + t.Fatalf("requests = %d, want 1", requests) + } + if meshState.ConfigVersion != 
"new-config" || + len(meshState.RouteHealthRoutes) != 1 || + !sameStringSlice(meshState.RouteHealthRoutes[0].Hops, []string{"node-a", "node-r-new", "node-b"}) { + t.Fatalf("replacement config was not applied: version=%s route_health_routes=%+v", meshState.ConfigVersion, meshState.RouteHealthRoutes) + } + if meshState.RouteHealthRefreshAttempts != 1 || meshState.RouteHealthRefreshSuccesses != 1 || meshState.RouteHealthRefreshFailures != 0 { + t.Fatalf("unexpected refresh counters: attempts=%d successes=%d failures=%d", meshState.RouteHealthRefreshAttempts, meshState.RouteHealthRefreshSuccesses, meshState.RouteHealthRefreshFailures) + } + if meshState.LastRouteHealthRefresh == nil || + meshState.LastRouteHealthRefresh.Status != "succeeded" || + meshState.LastRouteHealthRefresh.Reason != "synthetic_route_health_drift" || + meshState.LastRouteHealthRefresh.RefreshedConfigVersion != "new-config" { + t.Fatalf("unexpected refresh state: %+v", meshState.LastRouteHealthRefresh) + } + report := meshRouteHealthFeedbackRefreshReport(meshState, identity, now.Add(time.Second)) + if report["schema_version"] != meshRouteHealthFeedbackRefreshSchema || + report["feedback_refresh_attempt_count"] != 1 || + report["feedback_refresh_success_count"] != 1 || + report["last_feedback_refresh_status"] != "succeeded" || + report["last_feedback_refresh_reason"] != "synthetic_route_health_drift" || + report["production_payload_forwarding"] != false { + t.Fatalf("unexpected feedback refresh report: %+v", report) + } + routeHealthConfigReport := meshRouteHealthConfigReport(meshState, identity, now.Add(time.Second)) + if routeHealthConfigReport["schema_version"] != meshRouteHealthConfigReportSchema || + routeHealthConfigReport["feedback_refresh_backoff_ms"] != int64(meshRouteHealthFeedbackRefreshBackoff/time.Millisecond) || + routeHealthConfigReport["feedback_refresh_attempt_count"] != 1 || + routeHealthConfigReport["feedback_refresh_success_count"] != 1 || + 
routeHealthConfigReport["production_payload_forwarding"] != false { + t.Fatalf("unexpected route health config report: %+v", routeHealthConfigReport) + } +} + +func TestRouteHealthFeedbackRefreshBackoffSuppressesRepeatedTrigger(t *testing.T) { + now := time.Date(2026, 4, 28, 22, 5, 0, 0, time.UTC) + local := mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} + cache := mesh.NewPeerCache(mesh.PeerCacheConfig{ + Local: local, + PeerEndpoints: map[string]string{"node-b": "http://node-b:19002"}, + WarmPeerLimit: 1, + Now: now, + }) + meshState := &syntheticMeshState{ + PeerCache: cache, + Source: "control_plane", + LastRouteHealthRefresh: &meshRouteHealthFeedbackRefreshState{ + Status: "succeeded", + Reason: "synthetic_route_health_drift", + AttemptedAt: now.Add(-time.Second), + RouteID: "route-ab", + }, + } + trigger := meshRouteHealthFeedbackTrigger{ + Reason: "synthetic_route_health_failure", + RouteID: "route-ab", + PeerNodeID: "node-b", + LinkStatus: "unreachable", + FailureReason: "probe failed", + ObservedAt: now, + } + err := refreshSyntheticMeshConfigForRouteHealthFeedback(context.Background(), config.Config{}, state.Identity{ClusterID: "cluster-1", NodeID: "node-a"}, client.New("http://127.0.0.1:1"), meshState, trigger, now) + if err != nil { + t.Fatalf("refresh should have been suppressed without backend call: %v", err) + } + if meshState.RouteHealthRefreshAttempts != 0 || meshState.RouteHealthRefreshSuppressed != 1 { + t.Fatalf("unexpected counters: attempts=%d suppressed=%d", meshState.RouteHealthRefreshAttempts, meshState.RouteHealthRefreshSuppressed) + } + if meshState.LastRouteHealthRefresh.Reason != "synthetic_route_health_drift" { + t.Fatalf("suppressed refresh should not replace last state: %+v", meshState.LastRouteHealthRefresh) + } +} + +func TestRouteHealthFeedbackTriggerFromObservation(t *testing.T) { + now := time.Date(2026, 4, 28, 22, 10, 0, 0, time.UTC) + route := mesh.SyntheticRoute{ + RouteID: "route-ab", + SourceNodeID: "node-a", + 
DestinationNodeID: "node-b", + Hops: []string{"node-a", "node-r-old", "node-b"}, + } + decision := client.RoutePathDecision{ + RouteID: "route-ab", + RendezvousPeerNodeID: "node-b", + SelectedRelayID: "node-r-old", + RendezvousLeaseID: "lease-old", + RendezvousLeaseReason: "auto_outbound_only", + } + trigger, ok := routeHealthFeedbackTriggerFromObservation(route, decision, true, "reachable", map[string]any{ + "route_path_drift_detected": true, + }, now) + if !ok || + trigger.Reason != "synthetic_route_health_drift" || + trigger.RouteID != "route-ab" || + trigger.PeerNodeID != "node-b" || + trigger.SelectedRelayID != "node-r-old" || + !trigger.DriftDetected { + t.Fatalf("unexpected drift trigger: %+v ok=%t", trigger, ok) + } + if _, ok := routeHealthFeedbackTriggerFromObservation(route, decision, true, "reachable", map[string]any{ + "route_path_drift_detected": false, + }, now); ok { + t.Fatal("healthy route-health observation should not trigger refresh") + } +} + +func TestMeshRouteGenerationTrackerReportsReplacementWithdrawOnFirstApply(t *testing.T) { + now := time.Date(2026, 4, 28, 13, 50, 0, 0, time.UTC) + report := &client.RoutePathDecisionReport{ + SchemaVersion: "c17z18.route_path_decisions.v1", + Generation: "config-replacement", + ReplacementDecisionCount: 1, + Decisions: []client.RoutePathDecision{ + { + DecisionID: "route-1-path-node-a-via-node-r-new", + RouteID: "route-1", + LocalNodeID: "node-a", + OriginalHops: []string{"node-a", "node-r-old", "node-r-new", "node-c"}, + EffectiveHops: []string{"node-a", "node-r-new", "node-c"}, + DecisionSource: "stale_relay_replacement", + Generation: "config-replacement", + LocalRole: "entry", + NextHopID: "node-r-new", + SelectedRelayID: "node-r-new", + StaleRelayNodeID: "node-r-old", + RendezvousLeaseID: "lease-node-c-via-node-r-new", + PathScore: 760, + ControlPlaneOnly: true, + ProductionForwarding: false, + RendezvousLeaseReason: "stale_relay_replacement", + SelectedRelayEndpoint: "http://node-r-new:19124", + }, 
+ }, + } + meshState := &syntheticMeshState{ + Source: "control_plane", + ConfigVersion: "config-replacement", + RoutePathDecisions: report, + RouteGenerationTracker: newMeshRouteGenerationTracker(report, now), + } + generationReport := meshRouteGenerationReport(meshState, state.Identity{ + ClusterID: "cluster-1", + NodeID: "node-a", + }, now.Add(time.Second)) + if generationReport["active_decision_count"] != 1 || + generationReport["applied_decision_count"] != 1 || + generationReport["withdrawn_decision_count"] != 1 || + generationReport["total_withdrawn_decision_count"] != 1 || + generationReport["generation_changed"] != true { + t.Fatalf("unexpected first-apply route generation report: %+v", generationReport) + } +} + +func TestProductionEnvelopeObservationSinkFromConfigCreatesBoundedSink(t *testing.T) { + sink := productionEnvelopeObservationSinkFromConfig(config.Config{ + MeshProductionObservationSinkCapacity: 2, + }) + if sink == nil { + t.Fatal("sink is nil") + } + if sink.Capacity() != 2 { + t.Fatalf("sink capacity = %d, want 2", sink.Capacity()) + } +} + +func TestProductionForwardingLogStateDistinguishesGateFromRuntime(t *testing.T) { + gateEnabled, runtimeEnabled := productionForwardingLogState(config.Config{ + MeshProductionForwardingEnabled: true, + }) + if !gateEnabled { + t.Fatal("gateEnabled = false, want true") + } + if !runtimeEnabled { + t.Fatal("runtimeEnabled = false, want true") + } + gateEnabled, runtimeEnabled = productionForwardingLogState(config.Config{}) + if gateEnabled || runtimeEnabled { + t.Fatalf("default log state = gate:%t runtime:%t, want false/false", gateEnabled, runtimeEnabled) + } +} + +func TestLogProductionObservationSinkMetricsToleratesNilState(t *testing.T) { + logProductionObservationSinkMetrics(nil) + logProductionObservationSinkMetrics(&syntheticMeshState{}) +} + +func TestLogProductionObservationSinkMetricsOnlyWhenChanged(t *testing.T) { + sink := mesh.NewProductionEnvelopeObservationSink(2) + meshState := 
&syntheticMeshState{ProductionObservationSink: sink} + var logs strings.Builder + previousOutput := log.Writer() + log.SetOutput(&logs) + defer log.SetOutput(previousOutput) + defer log.SetOutput(io.Discard) + + logProductionObservationSinkMetrics(meshState) + firstLen := logs.Len() + if firstLen == 0 { + t.Fatal("first metrics log was not written") + } + logProductionObservationSinkMetrics(meshState) + if logs.Len() != firstLen { + t.Fatal("metrics log was written again without metric changes") + } + if err := sink.Observe(context.Background(), mesh.ProductionEnvelopeObservation{MessageID: "message-1"}); err != nil { + t.Fatalf("observe: %v", err) + } + logProductionObservationSinkMetrics(meshState) + if logs.Len() == firstLen { + t.Fatal("metrics log was not written after metric changes") + } +} + +func TestProductionObservationSinkMetricsEqual(t *testing.T) { + a := mesh.ProductionEnvelopeObservationSinkMetrics{ + Capacity: 2, + CurrentDepth: 1, + AcceptedTotal: 1, + DroppedOldest: 0, + } + if !productionObservationSinkMetricsEqual(a, a) { + t.Fatal("identical metrics were not equal") + } + b := a + b.DroppedOldest = 1 + if productionObservationSinkMetricsEqual(a, b) { + t.Fatal("different metrics were equal") + } +} diff --git a/agents/rap-node-agent/go.mod b/agents/rap-node-agent/go.mod new file mode 100644 index 0000000..31bd64b --- /dev/null +++ b/agents/rap-node-agent/go.mod @@ -0,0 +1,3 @@ +module github.com/example/remote-access-platform/agents/rap-node-agent + +go 1.23.2 diff --git a/agents/rap-node-agent/internal/agent/payload.go b/agents/rap-node-agent/internal/agent/payload.go new file mode 100644 index 0000000..9f4430f --- /dev/null +++ b/agents/rap-node-agent/internal/agent/payload.go @@ -0,0 +1,70 @@ +package agent + +import ( + "runtime" + + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/client" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/state" +) + +const Version = "0.1.0-c3" + +func 
EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest { + return client.EnrollRequest{ + ClusterID: clusterID, + JoinToken: joinToken, + NodeName: identity.NodeName, + NodeFingerprint: identity.NodeFingerprint, + PublicKey: identity.PublicKey, + ReportedCapabilities: map[string]any{ + "can_accept_client_ingress": false, + "can_accept_node_ingress": false, + "can_route_mesh": false, + "can_run_rdp_worker": true, + "can_run_vnc_worker": false, + "can_run_vpn_exit": false, + "can_run_vpn_connector": false, + "can_run_file_cache": false, + "can_run_update_cache": false, + "can_run_video_relay": false, + "native_node_agent_version": Version, + "service_supervision_enabled": false, + }, + ReportedFacts: map[string]any{ + "os": runtime.GOOS, + "arch": runtime.GOARCH, + "agent": "rap-node-agent", + "agent_ver": Version, + }, + RequestedRoles: []string{}, + } +} + +func HeartbeatPayload() client.HeartbeatRequest { + return client.HeartbeatRequest{ + HealthStatus: "healthy", + ReportedVersion: Version, + Capabilities: map[string]any{ + "native_node_agent": true, + }, + ServiceStates: map[string]any{ + "workload_supervision": "not_implemented_c3", + }, + Metadata: map[string]any{ + "stage": "c3", + }, + } +} + +func MeshSelfObservationPayload(identity state.Identity) client.MeshLinkObservationRequest { + return client.MeshLinkObservationRequest{ + SourceNodeID: identity.NodeID, + TargetNodeID: identity.NodeID, + LinkStatus: "reachable", + Metadata: map[string]any{ + "stage": "c6", + "traffic_forwarding": false, + "observation_type": "self", + }, + } +} diff --git a/agents/rap-node-agent/internal/agent/payload_test.go b/agents/rap-node-agent/internal/agent/payload_test.go new file mode 100644 index 0000000..80fe7f4 --- /dev/null +++ b/agents/rap-node-agent/internal/agent/payload_test.go @@ -0,0 +1,44 @@ +package agent + +import ( + "testing" + + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/state" +) + +func 
TestEnrollmentPayloadDoesNotRequestRolesByDefault(t *testing.T) { + payload := EnrollmentPayload("cluster-1", "join-token", state.Identity{ + NodeName: "node-a", + NodeFingerprint: "fp", + PublicKey: "pub", + }) + if payload.ClusterID != "cluster-1" || payload.JoinToken != "join-token" { + t.Fatalf("unexpected enrollment payload: %+v", payload) + } + if len(payload.RequestedRoles) != 0 { + t.Fatalf("agent must not self-assign roles: %+v", payload.RequestedRoles) + } + if payload.ReportedCapabilities["can_run_rdp_worker"] != true { + t.Fatalf("expected rdp capability in MVP payload: %+v", payload.ReportedCapabilities) + } +} + +func TestHeartbeatPayloadIsStatusOnly(t *testing.T) { + payload := HeartbeatPayload() + if payload.HealthStatus != "healthy" { + t.Fatalf("HealthStatus = %q", payload.HealthStatus) + } + if payload.ServiceStates["workload_supervision"] == "running" { + t.Fatal("C3 must not pretend workload supervision is implemented") + } +} + +func TestMeshSelfObservationDoesNotEnableTrafficForwarding(t *testing.T) { + payload := MeshSelfObservationPayload(state.Identity{NodeID: "node-1"}) + if payload.SourceNodeID != "node-1" || payload.TargetNodeID != "node-1" { + t.Fatalf("unexpected mesh self observation payload: %+v", payload) + } + if payload.Metadata["traffic_forwarding"] != false { + t.Fatalf("traffic forwarding must stay disabled in C6: %+v", payload.Metadata) + } +} diff --git a/agents/rap-node-agent/internal/agent/telemetry.go b/agents/rap-node-agent/internal/agent/telemetry.go new file mode 100644 index 0000000..98796e7 --- /dev/null +++ b/agents/rap-node-agent/internal/agent/telemetry.go @@ -0,0 +1,33 @@ +package agent + +import ( + "runtime" + "time" + + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/client" + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/state" +) + +func TelemetryPayload(identity state.Identity, startedAt time.Time) client.TelemetryRequest { + var mem runtime.MemStats + 
runtime.ReadMemStats(&mem) + used := int64(mem.Alloc) + total := int64(mem.Sys) + processCount := runtime.NumGoroutine() + return client.TelemetryRequest{ + MemoryUsedBytes: &used, + MemoryTotalBytes: &total, + ProcessCount: &processCount, + Payload: map[string]any{ + "agent": "rap-node-agent", + "agent_version": Version, + "node_name": identity.NodeName, + "os": runtime.GOOS, + "arch": runtime.GOARCH, + "goroutines": runtime.NumGoroutine(), + "uptime_seconds": int64(time.Since(startedAt).Seconds()), + "telemetry_source": "testing_flag", + }, + ObservedAt: time.Now().UTC(), + } +} diff --git a/agents/rap-node-agent/internal/authority/authority.go b/agents/rap-node-agent/internal/authority/authority.go new file mode 100644 index 0000000..7c1cb52 --- /dev/null +++ b/agents/rap-node-agent/internal/authority/authority.go @@ -0,0 +1,110 @@ +package authority + +import ( + "crypto/ed25519" + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "strings" +) + +const ( + AuthoritySchemaVersion = "rap.cluster_authority.v1" + SignatureSchemaVersion = "rap.cluster_authority.signature.v1" + AlgorithmEd25519 = "ed25519" +) + +var ( + ErrInvalidKey = errors.New("invalid cluster authority key") + ErrInvalidSignature = errors.New("invalid cluster authority signature") + ErrInvalidPayload = errors.New("invalid cluster authority payload") +) + +type Signature struct { + SchemaVersion string `json:"schema_version"` + Algorithm string `json:"algorithm"` + KeyFingerprint string `json:"key_fingerprint"` + Signature string `json:"signature"` +} + +func VerifyRaw(publicKeyB64 string, payload json.RawMessage, signature Signature) error { + if signature.SchemaVersion != SignatureSchemaVersion { + return fmt.Errorf("%w: schema_version must be %s", ErrInvalidSignature, SignatureSchemaVersion) + } + if signature.Algorithm != AlgorithmEd25519 { + return fmt.Errorf("%w: algorithm must be %s", ErrInvalidSignature, AlgorithmEd25519) + } + publicKey, err := 
decodePublicKey(publicKeyB64) + if err != nil { + return err + } + if signature.KeyFingerprint != Fingerprint(publicKey) { + return fmt.Errorf("%w: key fingerprint mismatch", ErrInvalidSignature) + } + canonical, err := CanonicalJSON(payload) + if err != nil { + return err + } + decodedSignature, err := decodeBase64(strings.TrimSpace(signature.Signature)) + if err != nil || len(decodedSignature) != ed25519.SignatureSize { + return fmt.Errorf("%w: signature must be base64 ed25519 signature", ErrInvalidSignature) + } + if !ed25519.Verify(publicKey, canonical, decodedSignature) { + return ErrInvalidSignature + } + return nil +} + +func Fingerprint(publicKey ed25519.PublicKey) string { + sum := sha256.Sum256(publicKey) + return "rap-ca-ed25519-" + hex.EncodeToString(sum[:16]) +} + +func HashRaw(raw json.RawMessage) (string, error) { + canonical, err := CanonicalJSON(raw) + if err != nil { + return "", err + } + sum := sha256.Sum256(canonical) + return hex.EncodeToString(sum[:]), nil +} + +func CanonicalJSON(raw json.RawMessage) ([]byte, error) { + if len(raw) == 0 { + return nil, fmt.Errorf("%w: empty payload", ErrInvalidPayload) + } + var value any + if err := json.Unmarshal(raw, &value); err != nil { + return nil, fmt.Errorf("%w: invalid json: %v", ErrInvalidPayload, err) + } + canonical, err := json.Marshal(value) + if err != nil { + return nil, fmt.Errorf("%w: canonical json: %v", ErrInvalidPayload, err) + } + return canonical, nil +} + +func decodePublicKey(value string) (ed25519.PublicKey, error) { + decoded, err := decodeBase64(strings.TrimSpace(value)) + if err != nil { + return nil, fmt.Errorf("%w: public key must be base64 encoded", ErrInvalidKey) + } + if len(decoded) != ed25519.PublicKeySize { + return nil, fmt.Errorf("%w: public key must decode to %d bytes", ErrInvalidKey, ed25519.PublicKeySize) + } + return ed25519.PublicKey(decoded), nil +} + +func decodeBase64(value string) ([]byte, error) { + if value == "" { + return nil, errors.New("empty base64 
value") + } + decoded, err := base64.StdEncoding.DecodeString(value) + if err == nil { + return decoded, nil + } + return base64.RawStdEncoding.DecodeString(value) +} diff --git a/agents/rap-node-agent/internal/authority/authority_test.go b/agents/rap-node-agent/internal/authority/authority_test.go new file mode 100644 index 0000000..2c64fb9 --- /dev/null +++ b/agents/rap-node-agent/internal/authority/authority_test.go @@ -0,0 +1,52 @@ +package authority + +import ( + "crypto/ed25519" + "encoding/base64" + "encoding/json" + "errors" + "testing" +) + +func TestVerifyRawAcceptsSignedPayload(t *testing.T) { + publicKey, privateKey, err := ed25519.GenerateKey(nil) + if err != nil { + t.Fatalf("GenerateKey: %v", err) + } + payload := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`) + canonical, err := CanonicalJSON(payload) + if err != nil { + t.Fatalf("CanonicalJSON: %v", err) + } + signature := Signature{ + SchemaVersion: SignatureSchemaVersion, + Algorithm: AlgorithmEd25519, + KeyFingerprint: Fingerprint(publicKey), + Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)), + } + if err := VerifyRaw(base64.StdEncoding.EncodeToString(publicKey), payload, signature); err != nil { + t.Fatalf("VerifyRaw: %v", err) + } +} + +func TestVerifyRawRejectsTamperedPayload(t *testing.T) { + publicKey, privateKey, err := ed25519.GenerateKey(nil) + if err != nil { + t.Fatalf("GenerateKey: %v", err) + } + payload := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`) + canonical, err := CanonicalJSON(payload) + if err != nil { + t.Fatalf("CanonicalJSON: %v", err) + } + signature := Signature{ + SchemaVersion: SignatureSchemaVersion, + Algorithm: AlgorithmEd25519, + KeyFingerprint: Fingerprint(publicKey), + Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)), + } + tampered := json.RawMessage(`{"cluster_id":"cluster-2","schema_version":"test.v1"}`) + if err := 
VerifyRaw(base64.StdEncoding.EncodeToString(publicKey), tampered, signature); !errors.Is(err, ErrInvalidSignature) { + t.Fatalf("err = %v, want ErrInvalidSignature", err) + } +} diff --git a/agents/rap-node-agent/internal/client/client.go b/agents/rap-node-agent/internal/client/client.go new file mode 100644 index 0000000..2bced1c --- /dev/null +++ b/agents/rap-node-agent/internal/client/client.go @@ -0,0 +1,400 @@ +package client + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "time" +) + +type Client struct { + baseURL string + httpClient *http.Client +} + +type EnrollRequest struct { + ClusterID string `json:"cluster_id"` + JoinToken string `json:"join_token"` + NodeName string `json:"node_name"` + NodeFingerprint string `json:"node_fingerprint"` + PublicKey string `json:"public_key"` + ReportedCapabilities map[string]any `json:"reported_capabilities"` + ReportedFacts map[string]any `json:"reported_facts"` + RequestedRoles []string `json:"requested_roles"` +} + +type EnrollResponse struct { + Status string `json:"status"` + JoinRequest json.RawMessage `json:"join_request"` +} + +type EnrollmentBootstrapRequest struct { + ClusterID string `json:"cluster_id"` + NodeFingerprint string `json:"node_fingerprint"` + PublicKey string `json:"public_key"` +} + +type EnrollmentBootstrapResponse struct { + Status string `json:"status"` + JoinRequest json.RawMessage `json:"join_request"` + Bootstrap *NodeBootstrap `json:"node_bootstrap,omitempty"` +} + +type NodeBootstrap struct { + NodeID string `json:"node_id"` + ClusterID string `json:"cluster_id"` + IdentityStatus string `json:"identity_status"` + Certificate map[string]any `json:"certificate"` + HeartbeatEndpoint string `json:"heartbeat_endpoint"` + ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"` + AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"` + AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"` +} + +type 
HeartbeatRequest struct { + HealthStatus string `json:"health_status"` + ReportedVersion string `json:"reported_version,omitempty"` + Capabilities map[string]any `json:"capabilities"` + ServiceStates map[string]any `json:"service_states"` + Metadata map[string]any `json:"metadata"` +} + +type HeartbeatResponse struct { + Heartbeat json.RawMessage `json:"heartbeat"` + TestingFlags EffectiveTestingFlags `json:"testing_flags"` +} + +type EffectiveTestingFlags struct { + Enabled bool `json:"enabled"` + TelemetryEnabled bool `json:"telemetry_enabled"` + SyntheticLinksEnabled bool `json:"synthetic_links_enabled"` + HistoryRetentionHours int `json:"history_retention_hours"` + AppliedScopes []string `json:"applied_scopes"` +} + +type DesiredWorkload struct { + ServiceType string `json:"service_type"` + DesiredState string `json:"desired_state"` + Version string `json:"version,omitempty"` + RuntimeMode string `json:"runtime_mode"` + ArtifactRef string `json:"artifact_ref,omitempty"` + Config map[string]any `json:"config"` + Environment map[string]any `json:"environment"` +} + +type WorkloadStatusRequest struct { + ReportedState string `json:"reported_state"` + RuntimeMode string `json:"runtime_mode"` + Version string `json:"version,omitempty"` + StatusPayload map[string]any `json:"status_payload"` +} + +type MeshLinkObservationRequest struct { + SourceNodeID string `json:"source_node_id"` + TargetNodeID string `json:"target_node_id"` + LinkStatus string `json:"link_status"` + LatencyMs *int `json:"latency_ms,omitempty"` + QualityScore *int `json:"quality_score,omitempty"` + Metadata map[string]any `json:"metadata"` +} + +type TelemetryRequest struct { + CPUPercent *float64 `json:"cpu_percent,omitempty"` + MemoryUsedBytes *int64 `json:"memory_used_bytes,omitempty"` + MemoryTotalBytes *int64 `json:"memory_total_bytes,omitempty"` + DiskUsedBytes *int64 `json:"disk_used_bytes,omitempty"` + DiskTotalBytes *int64 `json:"disk_total_bytes,omitempty"` + NetworkRxBytes *int64 
`json:"network_rx_bytes,omitempty"` + NetworkTxBytes *int64 `json:"network_tx_bytes,omitempty"` + ProcessCount *int `json:"process_count,omitempty"` + Payload map[string]any `json:"payload"` + ObservedAt time.Time `json:"observed_at"` +} + +type SyntheticMeshRouteConfig struct { + RouteID string `json:"route_id"` + ClusterID string `json:"cluster_id"` + SourceNodeID string `json:"source_node_id"` + DestinationNodeID string `json:"destination_node_id"` + Hops []string `json:"hops"` + AllowedChannels []string `json:"allowed_channels"` + ExpiresAt time.Time `json:"expires_at"` + MaxTTL int `json:"max_ttl"` + MaxHops int `json:"max_hops"` + RouteVersion string `json:"route_version,omitempty"` + PolicyVersion string `json:"policy_version,omitempty"` + PeerDirectoryVersion string `json:"peer_directory_version,omitempty"` +} + +type SyntheticMeshConfig struct { + Enabled bool `json:"enabled"` + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + AuthorityRequired bool `json:"authority_required"` + ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"` + AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"` + AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"` + ConfigVersion string `json:"config_version,omitempty"` + PeerDirectoryVersion string `json:"peer_directory_version,omitempty"` + PolicyVersion string `json:"policy_version,omitempty"` + PeerEndpoints map[string]string `json:"peer_endpoints"` + PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"` + PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"` + RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"` + RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"` + RendezvousRelayPolicy *RendezvousRelayPolicyReport `json:"rendezvous_relay_policy,omitempty"` + RoutePathDecisions 
*RoutePathDecisionReport `json:"route_path_decisions,omitempty"` + Routes []SyntheticMeshRouteConfig `json:"routes"` + ProductionForwarding bool `json:"production_forwarding"` +} + +type ClusterAuthorityDescriptor struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + AuthorityState string `json:"authority_state"` + KeyAlgorithm string `json:"key_algorithm"` + PublicKey string `json:"public_key"` + PublicKeyFingerprint string `json:"public_key_fingerprint"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type ClusterSignature struct { + SchemaVersion string `json:"schema_version"` + Algorithm string `json:"algorithm"` + KeyFingerprint string `json:"key_fingerprint"` + Signature string `json:"signature"` + SignedAt time.Time `json:"signed_at"` +} + +type PeerDirectoryEntry struct { + NodeID string `json:"node_id"` + RouteIDs []string `json:"route_ids,omitempty"` + EndpointCount int `json:"endpoint_count"` + CandidateCount int `json:"candidate_count"` + ConnectivityModes []string `json:"connectivity_modes,omitempty"` + RecoverySeed bool `json:"recovery_seed"` +} + +type PeerRecoverySeed struct { + NodeID string `json:"node_id"` + Endpoint string `json:"endpoint"` + Transport string `json:"transport"` + ConnectivityMode string `json:"connectivity_mode,omitempty"` + Region string `json:"region,omitempty"` + Priority int `json:"priority"` + LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +type PeerRendezvousLease struct { + LeaseID string `json:"lease_id"` + PeerNodeID string `json:"peer_node_id"` + RelayNodeID string `json:"relay_node_id"` + RelayEndpoint string `json:"relay_endpoint"` + Transport string `json:"transport"` + ConnectivityMode string `json:"connectivity_mode,omitempty"` + RouteIDs []string `json:"route_ids,omitempty"` + AllowedChannels []string `json:"allowed_channels,omitempty"` + Priority int 
`json:"priority"` + ControlPlaneOnly bool `json:"control_plane_only"` + IssuedAt time.Time `json:"issued_at"` + ExpiresAt time.Time `json:"expires_at"` + Reason string `json:"reason,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +type RendezvousRelayPolicyDecision struct { + RouteID string `json:"route_id,omitempty"` + PeerNodeID string `json:"peer_node_id"` + WithdrawnLeaseID string `json:"withdrawn_lease_id,omitempty"` + StaleRelayNodeID string `json:"stale_relay_node_id,omitempty"` + SelectedRelayID string `json:"selected_relay_id,omitempty"` + SelectedEndpoint string `json:"selected_endpoint,omitempty"` + Score int `json:"score,omitempty"` + Reason string `json:"reason"` + ScoreReasons []string `json:"score_reasons,omitempty"` + ReporterNodeID string `json:"reporter_node_id,omitempty"` +} + +type RendezvousRelayPolicyReport struct { + SchemaVersion string `json:"schema_version"` + ScoringMode string `json:"scoring_mode"` + FeedbackMaxAgeSeconds int `json:"feedback_max_age_seconds"` + StaleRelayCount int `json:"stale_relay_count"` + WithdrawnLeaseCount int `json:"withdrawn_lease_count"` + ReplacementLeaseCount int `json:"replacement_lease_count"` + Decisions []RendezvousRelayPolicyDecision `json:"decisions,omitempty"` +} + +type RoutePathDecision struct { + DecisionID string `json:"decision_id"` + RouteID string `json:"route_id"` + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + SourceNodeID string `json:"source_node_id"` + DestinationNodeID string `json:"destination_node_id"` + OriginalHops []string `json:"original_hops"` + EffectiveHops []string `json:"effective_hops"` + PreviousHopID string `json:"previous_hop_id,omitempty"` + NextHopID string `json:"next_hop_id,omitempty"` + LocalRole string `json:"local_role"` + SelectedRelayID string `json:"selected_relay_id,omitempty"` + SelectedRelayEndpoint string `json:"selected_relay_endpoint,omitempty"` + StaleRelayNodeID string 
`json:"stale_relay_node_id,omitempty"` + RendezvousPeerNodeID string `json:"rendezvous_peer_node_id,omitempty"` + RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"` + RendezvousLeaseReason string `json:"rendezvous_lease_reason,omitempty"` + DecisionSource string `json:"decision_source"` + Generation string `json:"generation"` + PathScore int `json:"path_score,omitempty"` + ScoreReasons []string `json:"score_reasons,omitempty"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` + ExpiresAt time.Time `json:"expires_at"` +} + +type RoutePathDecisionReport struct { + SchemaVersion string `json:"schema_version"` + DecisionMode string `json:"decision_mode"` + Generation string `json:"generation"` + DecisionCount int `json:"decision_count"` + ReplacementDecisionCount int `json:"replacement_decision_count"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` + Decisions []RoutePathDecision `json:"decisions,omitempty"` +} + +type PeerEndpointCandidate struct { + EndpointID string `json:"endpoint_id"` + NodeID string `json:"node_id"` + Transport string `json:"transport"` + Address string `json:"address"` + AddressFamily string `json:"address_family,omitempty"` + Reachability string `json:"reachability"` + NATType string `json:"nat_type,omitempty"` + ConnectivityMode string `json:"connectivity_mode"` + Region string `json:"region,omitempty"` + Priority int `json:"priority"` + PolicyTags []string `json:"policy_tags,omitempty"` + LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +func New(baseURL string) *Client { + return &Client{ + baseURL: baseURL, + httpClient: &http.Client{ + Timeout: 15 * time.Second, + }, + } +} + +func (c *Client) Enroll(ctx context.Context, request EnrollRequest) (EnrollResponse, error) { + var response EnrollResponse + if err := c.postJSON(ctx, 
"/node-agents/enroll", request, &response); err != nil { + return EnrollResponse{}, err + } + return response, nil +} + +func (c *Client) BootstrapEnrollment(ctx context.Context, joinRequestID string, request EnrollmentBootstrapRequest) (EnrollmentBootstrapResponse, error) { + var response EnrollmentBootstrapResponse + path := fmt.Sprintf("/node-agents/enrollments/%s/bootstrap", joinRequestID) + if err := c.postJSON(ctx, path, request, &response); err != nil { + return EnrollmentBootstrapResponse{}, err + } + return response, nil +} + +func (c *Client) Heartbeat(ctx context.Context, clusterID, nodeID string, request HeartbeatRequest) (HeartbeatResponse, error) { + var response HeartbeatResponse + path := fmt.Sprintf("/clusters/%s/nodes/%s/heartbeats", clusterID, nodeID) + if err := c.postJSON(ctx, path, request, &response); err != nil { + return HeartbeatResponse{}, err + } + return response, nil +} + +func (c *Client) DesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]DesiredWorkload, error) { + var response struct { + DesiredWorkloads []DesiredWorkload `json:"desired_workloads"` + } + path := fmt.Sprintf("/clusters/%s/nodes/%s/workloads/desired", clusterID, nodeID) + if err := c.getJSON(ctx, path, &response); err != nil { + return nil, err + } + return response.DesiredWorkloads, nil +} + +func (c *Client) ReportWorkloadStatus(ctx context.Context, clusterID, nodeID, serviceType string, request WorkloadStatusRequest) error { + path := fmt.Sprintf("/clusters/%s/nodes/%s/workloads/%s/status", clusterID, nodeID, serviceType) + return c.postJSON(ctx, path, request, nil) +} + +func (c *Client) ReportMeshLink(ctx context.Context, clusterID string, request MeshLinkObservationRequest) error { + path := fmt.Sprintf("/clusters/%s/mesh/links", clusterID) + return c.postJSON(ctx, path, request, nil) +} + +func (c *Client) ReportTelemetry(ctx context.Context, clusterID, nodeID string, request TelemetryRequest) error { + path := 
fmt.Sprintf("/clusters/%s/nodes/%s/telemetry", clusterID, nodeID) + return c.postJSON(ctx, path, request, nil) +} + +func (c *Client) SyntheticMeshConfig(ctx context.Context, clusterID, nodeID string) (SyntheticMeshConfig, error) { + var response struct { + Config SyntheticMeshConfig `json:"synthetic_mesh_config"` + } + path := fmt.Sprintf("/clusters/%s/nodes/%s/mesh/synthetic-config", clusterID, nodeID) + if err := c.getJSON(ctx, path, &response); err != nil { + return SyntheticMeshConfig{}, err + } + return response.Config, nil +} + +func (c *Client) getJSON(ctx context.Context, path string, response any) error { + httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil) + if err != nil { + return err + } + httpResp, err := c.httpClient.Do(httpReq) + if err != nil { + return err + } + defer httpResp.Body.Close() + if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { + return fmt.Errorf("backend returned status %d", httpResp.StatusCode) + } + if response == nil { + return nil + } + return json.NewDecoder(httpResp.Body).Decode(response) +} + +func (c *Client) postJSON(ctx context.Context, path string, request any, response any) error { + payload, err := json.Marshal(request) + if err != nil { + return err + } + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(payload)) + if err != nil { + return err + } + httpReq.Header.Set("Content-Type", "application/json") + httpResp, err := c.httpClient.Do(httpReq) + if err != nil { + return err + } + defer httpResp.Body.Close() + if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 { + return fmt.Errorf("backend returned status %d", httpResp.StatusCode) + } + if response == nil { + return nil + } + return json.NewDecoder(httpResp.Body).Decode(response) +} diff --git a/agents/rap-node-agent/internal/config/config.go b/agents/rap-node-agent/internal/config/config.go new file mode 100644 index 0000000..0af0625 --- /dev/null +++ 
b/agents/rap-node-agent/internal/config/config.go @@ -0,0 +1,183 @@ +package config + +import ( + "errors" + "flag" + "os" + "path/filepath" + "strconv" + "strings" + "time" +) + +const MaxMeshProductionObservationSinkCapacity = 10000 + +type Config struct { + BackendURL string + ClusterID string + ClusterAuthorityPublicKey string + ClusterAuthorityFingerprint string + JoinToken string + NodeName string + StateDir string + WorkloadSupervisionEnabled bool + HeartbeatInterval time.Duration + EnrollmentPollInterval time.Duration + EnrollmentPollTimeout time.Duration + MeshSyntheticRuntimeEnabled bool + MeshProductionForwardingEnabled bool + MeshProductionObservationSinkCapacity int + MeshListenAddr string + MeshAdvertiseEndpoint string + MeshAdvertiseEndpointsJSON string + MeshAdvertiseTransport string + MeshConnectivityMode string + MeshNATType string + MeshRegion string + MeshSyntheticConfigPath string + MeshPeerEndpointsJSON string + MeshSyntheticRoutesJSON string +} + +func Load(args []string, env map[string]string) (Config, error) { + if env == nil { + env = readEnv() + } + defaultStateDir := filepath.Join(".", ".rap-node-agent") + fs := flag.NewFlagSet("rap-node-agent", flag.ContinueOnError) + cfg := Config{} + fs.StringVar(&cfg.BackendURL, "backend-url", getEnv(env, "RAP_BACKEND_URL", "http://127.0.0.1:8080/api/v1"), "Backend API base URL.") + fs.StringVar(&cfg.ClusterID, "cluster-id", getEnv(env, "RAP_CLUSTER_ID", ""), "Cluster ID.") + fs.StringVar(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", getEnv(env, "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned cluster authority Ed25519 public key.") + fs.StringVar(&cfg.ClusterAuthorityFingerprint, "cluster-authority-fingerprint", getEnv(env, "RAP_CLUSTER_AUTHORITY_FINGERPRINT", ""), "Pinned cluster authority key fingerprint.") + fs.StringVar(&cfg.JoinToken, "join-token", getEnv(env, "RAP_JOIN_TOKEN", ""), "Short-lived node join token.") + fs.StringVar(&cfg.NodeName, "node-name", getEnv(env, 
"RAP_NODE_NAME", hostnameOrDefault()), "Node display name.") + fs.StringVar(&cfg.StateDir, "state-dir", getEnv(env, "RAP_NODE_STATE_DIR", defaultStateDir), "Local node-agent state directory.") + fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getEnvBool(env, "RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable desired workload polling and status reporting. Disabled by default while service runtime is not implemented.") + fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.") + fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.") + fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.") + fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.") + fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. 
Empty disables endpoint reporting.") + fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.") + fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls"), "Transport label for the advertised mesh endpoint.") + fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.") + fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.") + fs.StringVar(&cfg.MeshRegion, "mesh-region", getEnv(env, "RAP_MESH_REGION", ""), "Optional region/site hint for the advertised mesh endpoint.") + fs.StringVar(&cfg.MeshSyntheticConfigPath, "mesh-synthetic-config", getEnv(env, "RAP_MESH_SYNTHETIC_CONFIG", ""), "Path to scoped synthetic mesh config snapshot. 
Preferred over debug JSON env.") + fs.StringVar(&cfg.MeshPeerEndpointsJSON, "mesh-peer-endpoints-json", getEnv(env, "RAP_MESH_PEER_ENDPOINTS_JSON", ""), "JSON object mapping peer node_id to synthetic mesh endpoint URL.") + fs.StringVar(&cfg.MeshSyntheticRoutesJSON, "mesh-synthetic-routes-json", getEnv(env, "RAP_MESH_SYNTHETIC_ROUTES_JSON", ""), "JSON array of synthetic mesh routes for test-only runtime.") + heartbeatSeconds := getEnvInt(env, "RAP_HEARTBEAT_INTERVAL_SECONDS", 15) + fs.DurationVar(&cfg.HeartbeatInterval, "heartbeat-interval", time.Duration(heartbeatSeconds)*time.Second, "Heartbeat interval.") + enrollmentPollIntervalSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5) + enrollmentPollTimeoutSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600) + fs.DurationVar(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", time.Duration(enrollmentPollIntervalSeconds)*time.Second, "Enrollment approval polling interval.") + fs.DurationVar(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", time.Duration(enrollmentPollTimeoutSeconds)*time.Second, "Enrollment approval polling timeout.") + if err := fs.Parse(args); err != nil { + return Config{}, err + } + cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/") + cfg.ClusterID = strings.TrimSpace(cfg.ClusterID) + cfg.ClusterAuthorityPublicKey = strings.TrimSpace(cfg.ClusterAuthorityPublicKey) + cfg.ClusterAuthorityFingerprint = strings.TrimSpace(cfg.ClusterAuthorityFingerprint) + cfg.JoinToken = strings.TrimSpace(cfg.JoinToken) + cfg.NodeName = strings.TrimSpace(cfg.NodeName) + cfg.StateDir = strings.TrimSpace(cfg.StateDir) + cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr) + cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/") + cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON) + cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport) + 
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode) + cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType) + cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion) + cfg.MeshSyntheticConfigPath = strings.TrimSpace(cfg.MeshSyntheticConfigPath) + cfg.MeshPeerEndpointsJSON = strings.TrimSpace(cfg.MeshPeerEndpointsJSON) + cfg.MeshSyntheticRoutesJSON = strings.TrimSpace(cfg.MeshSyntheticRoutesJSON) + if cfg.BackendURL == "" { + return Config{}, errors.New("backend URL is required") + } + if cfg.NodeName == "" { + return Config{}, errors.New("node name is required") + } + if cfg.StateDir == "" { + return Config{}, errors.New("state dir is required") + } + if cfg.HeartbeatInterval <= 0 { + return Config{}, errors.New("heartbeat interval must be positive") + } + if cfg.EnrollmentPollInterval <= 0 { + return Config{}, errors.New("enrollment poll interval must be positive") + } + if cfg.EnrollmentPollTimeout < 0 { + return Config{}, errors.New("enrollment poll timeout must not be negative") + } + if cfg.MeshProductionObservationSinkCapacity < 0 { + return Config{}, errors.New("mesh production observation sink capacity must not be negative") + } + if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity { + return Config{}, errors.New("mesh production observation sink capacity exceeds maximum") + } + return cfg, nil +} + +func readEnv() map[string]string { + out := map[string]string{} + for _, pair := range os.Environ() { + key, value, ok := strings.Cut(pair, "=") + if ok { + out[key] = value + } + } + return out +} + +func getEnv(env map[string]string, key, fallback string) string { + if value := strings.TrimSpace(env[key]); value != "" { + return value + } + return fallback +} + +func getEnvInt(env map[string]string, key string, fallback int) int { + value := strings.TrimSpace(env[key]) + if value == "" { + return fallback + } + parsed, err := strconv.Atoi(value) + if err != nil || parsed <= 0 { + return fallback + } + return 
parsed +} + +func getEnvSignedInt(env map[string]string, key string, fallback int) int { + value := strings.TrimSpace(env[key]) + if value == "" { + return fallback + } + parsed, err := strconv.Atoi(value) + if err != nil { + return fallback + } + return parsed +} + +func getEnvBool(env map[string]string, key string, fallback bool) bool { + value := strings.ToLower(strings.TrimSpace(env[key])) + switch value { + case "1", "true", "yes", "y", "on": + return true + case "0", "false", "no", "n", "off": + return false + default: + return fallback + } +} + +func hostnameOrDefault() string { + host, err := os.Hostname() + if err != nil || strings.TrimSpace(host) == "" { + return "rap-node" + } + return host +} diff --git a/agents/rap-node-agent/internal/config/config_test.go b/agents/rap-node-agent/internal/config/config_test.go new file mode 100644 index 0000000..19d8fba --- /dev/null +++ b/agents/rap-node-agent/internal/config/config_test.go @@ -0,0 +1,104 @@ +package config + +import ( + "testing" + "time" +) + +func TestLoadConfigFromEnvAndArgs(t *testing.T) { + cfg, err := Load([]string{"-node-name", "node-b"}, map[string]string{ + "RAP_BACKEND_URL": "http://backend/api/v1/", + "RAP_CLUSTER_ID": "cluster-1", + "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY": "public-key-b64", + "RAP_CLUSTER_AUTHORITY_FINGERPRINT": "rap-ca-ed25519-test", + "RAP_JOIN_TOKEN": "join-token", + "RAP_NODE_NAME": "node-a", + "RAP_NODE_STATE_DIR": "/tmp/rap-node", + "RAP_WORKLOAD_SUPERVISION_ENABLED": "true", + "RAP_HEARTBEAT_INTERVAL_SECONDS": "7", + "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS": "3", + "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30", + "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true", + "RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true", + "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5", + "RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001", + "RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/", + "RAP_MESH_ADVERTISE_ENDPOINTS_JSON": 
`[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`, + "RAP_MESH_ADVERTISE_TRANSPORT": "wss", + "RAP_MESH_CONNECTIVITY_MODE": "outbound_only", + "RAP_MESH_NAT_TYPE": "symmetric", + "RAP_MESH_REGION": "eu", + "RAP_MESH_SYNTHETIC_CONFIG": "/tmp/rap-node/mesh-synthetic.json", + "RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"http://127.0.0.1:19002"}`, + "RAP_MESH_SYNTHETIC_ROUTES_JSON": `[{"route_id":"route-1"}]`, + }) + if err != nil { + t.Fatalf("load config: %v", err) + } + if cfg.BackendURL != "http://backend/api/v1" { + t.Fatalf("BackendURL = %q", cfg.BackendURL) + } + if cfg.NodeName != "node-b" { + t.Fatalf("NodeName = %q", cfg.NodeName) + } + if cfg.ClusterAuthorityPublicKey != "public-key-b64" || cfg.ClusterAuthorityFingerprint != "rap-ca-ed25519-test" { + t.Fatalf("unexpected cluster authority pin config: %+v", cfg) + } + if cfg.HeartbeatInterval != 7*time.Second { + t.Fatalf("HeartbeatInterval = %s", cfg.HeartbeatInterval) + } + if cfg.EnrollmentPollInterval != 3*time.Second || cfg.EnrollmentPollTimeout != 30*time.Second { + t.Fatalf("unexpected enrollment polling config: %+v", cfg) + } + if !cfg.WorkloadSupervisionEnabled { + t.Fatal("WorkloadSupervisionEnabled = false, want true") + } + if !cfg.MeshSyntheticRuntimeEnabled { + t.Fatal("MeshSyntheticRuntimeEnabled = false, want true") + } + if !cfg.MeshProductionForwardingEnabled { + t.Fatal("MeshProductionForwardingEnabled = false, want true") + } + if cfg.MeshProductionObservationSinkCapacity != 5 { + t.Fatalf("MeshProductionObservationSinkCapacity = %d, want 5", cfg.MeshProductionObservationSinkCapacity) + } + if cfg.MeshListenAddr != "127.0.0.1:19001" { + t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr) + } + if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" || + cfg.MeshAdvertiseEndpointsJSON == "" || + cfg.MeshAdvertiseTransport != "wss" || + cfg.MeshConnectivityMode != "outbound_only" || + cfg.MeshNATType != "symmetric" || + cfg.MeshRegion != "eu" { + t.Fatalf("unexpected 
mesh advertise config: %+v", cfg) + } + if cfg.MeshSyntheticConfigPath != "/tmp/rap-node/mesh-synthetic.json" { + t.Fatalf("MeshSyntheticConfigPath = %q", cfg.MeshSyntheticConfigPath) + } + if cfg.MeshPeerEndpointsJSON == "" || cfg.MeshSyntheticRoutesJSON == "" { + t.Fatalf("mesh live synthetic config was not loaded: %+v", cfg) + } +} + +func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) { + _, err := Load(nil, map[string]string{ + "RAP_BACKEND_URL": "http://backend/api/v1", + "RAP_NODE_NAME": "node-a", + "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "-1", + }) + if err == nil { + t.Fatal("Load returned nil error for negative sink capacity") + } +} + +func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T) { + _, err := Load(nil, map[string]string{ + "RAP_BACKEND_URL": "http://backend/api/v1", + "RAP_NODE_NAME": "node-a", + "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "10001", + }) + if err == nil { + t.Fatal("Load returned nil error for too-large sink capacity") + } +} diff --git a/agents/rap-node-agent/internal/mesh/client.go b/agents/rap-node-agent/internal/mesh/client.go new file mode 100644 index 0000000..b389c61 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/client.go @@ -0,0 +1,111 @@ +package mesh + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "time" +) + +type Client struct { + BaseURL string + HTTPClient *http.Client +} + +func NewClient(baseURL string) Client { + return Client{ + BaseURL: baseURL, + HTTPClient: &http.Client{ + Timeout: 5 * time.Second, + }, + } +} + +func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) { + payload, err := json.Marshal(message) + if err != nil { + return HealthAck{}, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/health", bytes.NewReader(payload)) + if err != nil { + return HealthAck{}, err + } + req.Header.Set("Content-Type", 
"application/json") + httpClient := c.HTTPClient + if httpClient == nil { + httpClient = http.DefaultClient + } + resp, err := httpClient.Do(req) + if err != nil { + return HealthAck{}, err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return HealthAck{}, fmt.Errorf("mesh health rejected with status %d", resp.StatusCode) + } + var ack HealthAck + if err := json.NewDecoder(resp.Body).Decode(&ack); err != nil { + return HealthAck{}, err + } + return ack, nil +} + +func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) { + payload, err := json.Marshal(envelope) + if err != nil { + return SyntheticEnvelope{}, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/synthetic/probe", bytes.NewReader(payload)) + if err != nil { + return SyntheticEnvelope{}, err + } + req.Header.Set("Content-Type", "application/json") + httpClient := c.HTTPClient + if httpClient == nil { + httpClient = http.DefaultClient + } + resp, err := httpClient.Do(req) + if err != nil { + return SyntheticEnvelope{}, err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return SyntheticEnvelope{}, fmt.Errorf("mesh synthetic probe rejected with status %d", resp.StatusCode) + } + var ack SyntheticEnvelope + if err := json.NewDecoder(resp.Body).Decode(&ack); err != nil { + return SyntheticEnvelope{}, err + } + return ack, nil +} + +func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) { + payload, err := json.Marshal(envelope) + if err != nil { + return ProductionForwardResult{}, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/forward", bytes.NewReader(payload)) + if err != nil { + return ProductionForwardResult{}, err + } + req.Header.Set("Content-Type", "application/json") + httpClient := c.HTTPClient + if httpClient == nil { + httpClient = 
http.DefaultClient + } + resp, err := httpClient.Do(req) + if err != nil { + return ProductionForwardResult{}, err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return ProductionForwardResult{}, fmt.Errorf("mesh production forward rejected with status %d", resp.StatusCode) + } + var result ProductionForwardResult + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return ProductionForwardResult{}, err + } + return result, nil +} diff --git a/agents/rap-node-agent/internal/mesh/contracts.go b/agents/rap-node-agent/internal/mesh/contracts.go new file mode 100644 index 0000000..dd610ec --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/contracts.go @@ -0,0 +1,288 @@ +package mesh + +import ( + "encoding/json" + "errors" + "time" +) + +const ProtocolVersion = "mesh-control-v1" + +var ( + ErrClusterMismatch = errors.New("mesh peer cluster mismatch") + ErrNodeMismatch = errors.New("mesh peer node mismatch") + ErrForwardDisabled = errors.New("production payload forwarding is disabled by mesh production gate") + ErrForwardRuntimeUnavailable = errors.New("production mesh forwarding runtime is unavailable for this route or stage") + ErrForwardPeerUnavailable = errors.New("production mesh next peer is unavailable") + ErrForwardEnvelopeInvalid = errors.New("production mesh envelope is invalid") + ErrForwardObservationFailed = errors.New("production mesh envelope observation failed") + ErrMeshRuntimeDisabled = errors.New("mesh synthetic runtime is disabled") + ErrUnsupportedSyntheticMessage = errors.New("unsupported synthetic mesh message") + ErrRouteIDRequired = errors.New("mesh synthetic route id is required") + ErrRouteNotFound = errors.New("mesh synthetic route not found") + ErrInvalidRoutePath = errors.New("mesh synthetic route path is invalid") + ErrRouteExpired = errors.New("mesh synthetic route is expired") + ErrTTLExhausted = errors.New("mesh synthetic route ttl exhausted") + ErrLoopDetected = 
// String and limit constants shared by the synthetic and production mesh
// contracts declared in this file.
const (
	// Values for SyntheticEnvelope.MessageType. The "_ack" variants name the
	// reply to the corresponding request message.
	SyntheticMessageProbe          = "fabric.probe"
	SyntheticMessageProbeAck       = "fabric.probe_ack"
	SyntheticMessageRouteHealth    = "fabric.route_health"
	SyntheticMessageRouteHealthAck = "fabric.route_health_ack"
	SyntheticMessageTelemetry      = "fabric.telemetry"
	SyntheticMessageTestService    = "fabric.test_service"
	SyntheticMessageTestServiceAck = "fabric.test_service_ack"

	// Defaults for the synthetic test service.
	// NOTE(review): presumably the service type, organization and payload
	// limit applied to SyntheticTestServiceRequest — confirm against the runtime.
	SyntheticTestServiceType            = "synthetic.echo"
	SyntheticDefaultTestOrganizationID  = "org-test"
	SyntheticDefaultMaxTestPayloadBytes = 4096

	// Channel names used in SyntheticEnvelope.Channel and
	// SyntheticRoute.AllowedChannels.
	SyntheticChannelFabricControl = "fabric_control"
	SyntheticChannelRouteControl  = "route_control"
	SyntheticChannelTelemetry     = "telemetry"

	// Route health states.
	// NOTE(review): presumably the values of SyntheticRouteObservation.State — confirm.
	SyntheticRouteStateUnknown  = "unknown"
	SyntheticRouteStateHealthy  = "healthy"
	SyntheticRouteStateDegraded = "degraded"
	SyntheticRouteStateFailed   = "failed"

	// Production envelope channel class, message type and limits. The
	// production channel string is the same as SyntheticChannelFabricControl.
	// NOTE(review): the payload and future-skew limits are presumably enforced
	// by envelope validation elsewhere — confirm.
	ProductionChannelFabricControl    = "fabric_control"
	ProductionMessageFabricControl    = "fabric.control"
	MaxProductionEnvelopePayloadBytes = 4096
	MaxProductionEnvelopeFutureSkew   = time.Minute
)
// SyntheticEnvelope is the JSON message exchanged on the synthetic mesh path:
// Client.SendSynthetic posts one to /mesh/v1/synthetic/probe and decodes the
// peer's reply into the same type.
type SyntheticEnvelope struct {
	ProtocolVersion string `json:"protocol_version"`
	// RouteID names the SyntheticRoute the envelope travels on.
	RouteID   string       `json:"route_id"`
	ClusterID string       `json:"cluster_id"`
	From      PeerIdentity `json:"from"`
	To        PeerIdentity `json:"to"`
	// Channel is one of the SyntheticChannel* names.
	Channel string `json:"channel"`
	// MessageType is one of the SyntheticMessage* names.
	MessageType string `json:"message_type"`
	// NOTE(review): TTL, HopCount and Visited presumably back the
	// ErrTTLExhausted / ErrLoopDetected checks — confirm in the runtime.
	TTL      int       `json:"ttl"`
	HopCount int       `json:"hop_count"`
	Visited  []string  `json:"visited"`
	Sequence uint64    `json:"sequence"`
	SentAt   time.Time `json:"sent_at"`
	// Payload is the message-type-specific body, kept as raw JSON and omitted
	// from the encoding when empty.
	Payload json.RawMessage `json:"payload,omitempty"`
}
// SyntheticTestServiceRequest is the JSON request body of the synthetic test
// service. SyntheticTestServiceResponse carries the same RequestID,
// OrganizationID and ServiceType fields plus an EchoPayload.
type SyntheticTestServiceRequest struct {
	RequestID      string `json:"request_id"`
	OrganizationID string `json:"organization_id"`
	// NOTE(review): presumably must equal SyntheticTestServiceType
	// ("synthetic.echo"); see ErrUnsupportedSyntheticService — confirm.
	ServiceType string `json:"service_type"`
	// NOTE(review): presumably bounded by SyntheticDefaultMaxTestPayloadBytes;
	// see ErrSyntheticPayloadTooLarge — confirm.
	Payload string    `json:"payload"`
	SentAt  time.Time `json:"sent_at"`
}
// ProductionEnvelope is the JSON message that Client.SendProduction posts to a
// peer's /mesh/v1/forward endpoint.
type ProductionEnvelope struct {
	FabricProtocolVersion string `json:"fabric_protocol_version"`
	MessageID             string `json:"message_id"`
	RouteID               string `json:"route_id"`
	ClusterID             string `json:"cluster_id"`
	SourceNodeID          string `json:"source_node_id"`
	DestinationNodeID     string `json:"destination_node_id"`
	CurrentHopNodeID      string `json:"current_hop_node_id"`
	NextHopNodeID         string `json:"next_hop_node_id"`
	// RoutePath and VisitedNodeIDs are omitted from the encoding when empty.
	RoutePath      []string `json:"route_path,omitempty"`
	VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
	// NOTE(review): presumably ProductionChannelFabricControl and
	// ProductionMessageFabricControl are the accepted values — confirm.
	ChannelClass string    `json:"channel_class"`
	MessageType  string    `json:"message_type"`
	TTL          int       `json:"ttl"`
	HopCount     int       `json:"hop_count"`
	CreatedAt    time.Time `json:"created_at"`
	ExpiresAt    time.Time `json:"expires_at"`
	// NOTE(review): PayloadLength and PayloadHash presumably describe Payload
	// and are checked by envelope validation — confirm the hash algorithm and
	// encoding with the validator.
	PayloadLength int    `json:"payload_length"`
	PayloadHash   string `json:"payload_hash"`
	// Payload is kept as raw JSON and omitted from the encoding when empty.
	Payload json.RawMessage `json:"payload,omitempty"`
}
`json:"visited_node_ids,omitempty"` + ChannelClass string `json:"channel_class"` + MessageType string `json:"message_type"` + TTL int `json:"ttl"` + HopCount int `json:"hop_count"` + PayloadLength int `json:"payload_length"` + PayloadHash string `json:"payload_hash"` + ObservedAt time.Time `json:"observed_at"` +} + +type ProductionForwardResult struct { + Accepted bool `json:"accepted"` + Delivered bool `json:"delivered"` + Forwarded bool `json:"forwarded"` + By PeerIdentity `json:"by"` + MessageID string `json:"message_id"` + RouteID string `json:"route_id"` + NextNodeID string `json:"next_node_id,omitempty"` +} + +type ProductionForwardLogEntry struct { + Event string `json:"event"` + RouteID string `json:"route_id,omitempty"` + MessageID string `json:"message_id,omitempty"` + ClusterID string `json:"cluster_id,omitempty"` + LocalNodeID string `json:"local_node_id,omitempty"` + SourceNodeID string `json:"source_node_id,omitempty"` + DestinationNodeID string `json:"destination_node_id,omitempty"` + CurrentHopNodeID string `json:"current_hop_node_id,omitempty"` + NextHopNodeID string `json:"next_hop_node_id,omitempty"` + ChannelClass string `json:"channel_class,omitempty"` + MessageType string `json:"message_type,omitempty"` + Reason string `json:"reason,omitempty"` + StatusCode int `json:"status_code,omitempty"` + TTL int `json:"ttl,omitempty"` + HopCount int `json:"hop_count,omitempty"` + RoutePathLength int `json:"route_path_length,omitempty"` + VisitedCount int `json:"visited_count,omitempty"` + PayloadLength int `json:"payload_length,omitempty"` + OccurredAt time.Time `json:"occurred_at"` +} + +func ValidatePeer(local PeerIdentity, remote PeerIdentity) error { + if local.ClusterID == "" || remote.ClusterID == "" || local.ClusterID != remote.ClusterID { + return ErrClusterMismatch + } + if remote.NodeID == "" { + return ErrNodeMismatch + } + return nil +} diff --git a/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring.go 
b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring.go new file mode 100644 index 0000000..a80b76b --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring.go @@ -0,0 +1,258 @@ +package mesh + +import ( + "sort" + "strings" + "time" +) + +type EndpointCandidateScoreOptions struct { + ChannelClass string + PreferredRegion string + Now time.Time + MaxVerificationAge time.Duration + Observations map[string]EndpointCandidateHealthObservation + MaxObservationAge time.Duration +} + +type EndpointCandidateHealthObservation struct { + EndpointID string `json:"endpoint_id"` + LastLatencyMs int64 `json:"last_latency_ms,omitempty"` + SuccessCount uint64 `json:"success_count,omitempty"` + FailureCount uint64 `json:"failure_count,omitempty"` + LastFailureReason string `json:"last_failure_reason,omitempty"` + ReliabilityScore int `json:"reliability_score,omitempty"` + ObservedAt time.Time `json:"observed_at,omitempty"` +} + +type ScoredPeerEndpointCandidate struct { + Candidate PeerEndpointCandidate `json:"candidate"` + Score int `json:"score"` + Reasons []string `json:"reasons,omitempty"` +} + +func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts EndpointCandidateScoreOptions) []ScoredPeerEndpointCandidate { + if len(candidates) == 0 { + return nil + } + out := make([]ScoredPeerEndpointCandidate, 0, len(candidates)) + for _, candidate := range candidates { + out = append(out, scorePeerEndpointCandidate(candidate, opts)) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Score != out[j].Score { + return out[i].Score > out[j].Score + } + if out[i].Candidate.Priority != out[j].Candidate.Priority { + return out[i].Candidate.Priority < out[j].Candidate.Priority + } + if out[i].Candidate.NodeID != out[j].Candidate.NodeID { + return out[i].Candidate.NodeID < out[j].Candidate.NodeID + } + return out[i].Candidate.EndpointID < out[j].Candidate.EndpointID + }) + return out +} + +func scorePeerEndpointCandidate(candidate 
PeerEndpointCandidate, opts EndpointCandidateScoreOptions) ScoredPeerEndpointCandidate { + score := 100 + reasons := []string{"base"} + + switch candidate.Transport { + case "direct_tcp_tls": + score += 35 + reasons = append(reasons, "transport:direct_tcp_tls") + case "wss": + score += 25 + reasons = append(reasons, "transport:wss") + case "outbound_reverse": + score += 10 + reasons = append(reasons, "transport:outbound_reverse") + case "relay": + score += 5 + reasons = append(reasons, "transport:relay") + default: + score -= 100 + reasons = append(reasons, "transport:unknown") + } + + switch candidate.Reachability { + case "public": + score += 30 + reasons = append(reasons, "reachability:public") + case "private": + score += 15 + reasons = append(reasons, "reachability:private") + case "relay": + score += 5 + reasons = append(reasons, "reachability:relay") + case "outbound_only": + score -= 5 + reasons = append(reasons, "reachability:outbound_only") + default: + score -= 15 + reasons = append(reasons, "reachability:unknown") + } + + switch candidate.ConnectivityMode { + case "direct": + score += 30 + reasons = append(reasons, "connectivity:direct") + case "outbound_only": + score += 5 + reasons = append(reasons, "connectivity:outbound_only") + case "relay_required": + score -= 5 + reasons = append(reasons, "connectivity:relay_required") + default: + score -= 10 + reasons = append(reasons, "connectivity:unknown") + } + + switch candidate.NATType { + case "", "none": + score += 15 + reasons = append(reasons, "nat:none") + case "full_cone": + score += 10 + reasons = append(reasons, "nat:full_cone") + case "restricted", "port_restricted": + score += 3 + reasons = append(reasons, "nat:restricted") + case "symmetric": + score -= 20 + reasons = append(reasons, "nat:symmetric") + case "blocked": + score -= 60 + reasons = append(reasons, "nat:blocked") + default: + score -= 8 + reasons = append(reasons, "nat:unknown") + } + + if candidate.Priority > 0 { + score -= 
candidate.Priority + reasons = append(reasons, "priority") + } + if opts.PreferredRegion != "" && candidate.Region != "" { + if strings.EqualFold(candidate.Region, opts.PreferredRegion) { + score += 12 + reasons = append(reasons, "region:preferred") + } else { + score -= 4 + reasons = append(reasons, "region:remote") + } + } + if hasPolicyTag(candidate.PolicyTags, "fast-path") { + score += 10 + reasons = append(reasons, "policy:fast-path") + } + if hasPolicyTag(candidate.PolicyTags, "private-lan") || hasPolicyTag(candidate.PolicyTags, "corp-lan") || hasPolicyTag(candidate.PolicyTags, "same-site") { + score += 18 + reasons = append(reasons, "policy:private-lan") + } + if hasPolicyTag(candidate.PolicyTags, "costly") { + score -= 10 + reasons = append(reasons, "policy:costly") + } + if opts.ChannelClass == SyntheticChannelFabricControl || opts.ChannelClass == SyntheticChannelRouteControl { + if candidate.ConnectivityMode == "direct" { + score += 8 + reasons = append(reasons, "channel:control-direct") + } + if candidate.Transport == "relay" { + score -= 8 + reasons = append(reasons, "channel:control-relay-penalty") + } + } + if !opts.Now.IsZero() && candidate.LastVerifiedAt != nil && opts.MaxVerificationAge > 0 { + age := opts.Now.Sub(candidate.LastVerifiedAt.UTC()) + if age >= 0 && age <= opts.MaxVerificationAge { + score += 8 + reasons = append(reasons, "verified:fresh") + } else { + score -= 12 + reasons = append(reasons, "verified:stale") + } + } + if observation, ok := opts.Observations[candidate.EndpointID]; ok { + observationScore, observationReasons := scoreEndpointCandidateObservation(observation, opts) + score += observationScore + reasons = append(reasons, observationReasons...) 
+ } + + return ScoredPeerEndpointCandidate{ + Candidate: candidate, + Score: score, + Reasons: reasons, + } +} + +func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) { + score := 0 + reasons := []string{"observation:present"} + if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 { + age := opts.Now.Sub(observation.ObservedAt.UTC()) + if age < 0 || age > opts.MaxObservationAge { + return -12, []string{"observation:stale"} + } + score += 6 + reasons = append(reasons, "observation:fresh") + } + switch { + case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50: + score += 18 + reasons = append(reasons, "latency:low") + case observation.LastLatencyMs <= 150: + score += 8 + reasons = append(reasons, "latency:moderate") + case observation.LastLatencyMs > 0: + score -= 10 + reasons = append(reasons, "latency:high") + } + if observation.ReliabilityScore > 0 { + switch { + case observation.ReliabilityScore >= 90: + score += 15 + reasons = append(reasons, "reliability:high") + case observation.ReliabilityScore >= 70: + score += 5 + reasons = append(reasons, "reliability:moderate") + default: + score -= 12 + reasons = append(reasons, "reliability:low") + } + } + if observation.SuccessCount > 0 { + score += boundedInt(int(observation.SuccessCount), 1, 10) + reasons = append(reasons, "history:success") + } + if observation.FailureCount > 0 { + score -= boundedInt(int(observation.FailureCount)*6, 6, 30) + reasons = append(reasons, "history:failure") + } + if strings.TrimSpace(observation.LastFailureReason) != "" { + score -= 8 + reasons = append(reasons, "failure:recent") + } + return score, reasons +} + +func hasPolicyTag(tags []string, needle string) bool { + for _, tag := range tags { + if strings.EqualFold(strings.TrimSpace(tag), needle) { + return true + } + } + return false +} + +func boundedInt(value, minValue, maxValue int) int { + if 
value < minValue { + return minValue + } + if value > maxValue { + return maxValue + } + return value +} diff --git a/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring_test.go b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring_test.go new file mode 100644 index 0000000..4ec8d1c --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/endpoint_candidate_scoring_test.go @@ -0,0 +1,278 @@ +package mesh + +import ( + "testing" + "time" +) + +func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + fresh := now.Add(-time.Minute) + stale := now.Add(-2 * time.Hour) + candidates := []PeerEndpointCandidate{ + { + EndpointID: "node-b-relay", + NodeID: "node-b", + Transport: "relay", + Address: "relay.example.test/node-b", + Reachability: "relay", + NATType: "symmetric", + ConnectivityMode: "relay_required", + Region: "us", + Priority: 1, + LastVerifiedAt: &fresh, + }, + { + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Region: "eu", + Priority: 10, + PolicyTags: []string{"fast-path"}, + LastVerifiedAt: &fresh, + }, + { + EndpointID: "node-b-private-stale", + NodeID: "node-b", + Transport: "wss", + Address: "10.0.0.5:443", + Reachability: "private", + NATType: "restricted", + ConnectivityMode: "direct", + Region: "eu", + Priority: 5, + LastVerifiedAt: &stale, + }, + } + + ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{ + ChannelClass: SyntheticChannelFabricControl, + PreferredRegion: "eu", + Now: now, + MaxVerificationAge: time.Hour, + }) + if len(ranked) != 3 { + t.Fatalf("ranked length = %d, want 3", len(ranked)) + } + if ranked[0].Candidate.EndpointID != "node-b-public" { + t.Fatalf("top endpoint = %q, want node-b-public: %+v", ranked[0].Candidate.EndpointID, ranked) + } + if ranked[0].Score <= 
ranked[1].Score { + t.Fatalf("top score = %d, second = %d", ranked[0].Score, ranked[1].Score) + } + if !containsReason(ranked[0].Reasons, "policy:fast-path") || !containsReason(ranked[0].Reasons, "verified:fresh") { + t.Fatalf("top reasons missing expected hints: %+v", ranked[0].Reasons) + } +} + +func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) { + candidates := []PeerEndpointCandidate{ + { + EndpointID: "endpoint-b", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.21:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Priority: 10, + }, + { + EndpointID: "endpoint-a", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Priority: 10, + }, + } + + ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{}) + if ranked[0].Candidate.EndpointID != "endpoint-a" { + t.Fatalf("tie top endpoint = %q, want endpoint-a", ranked[0].Candidate.EndpointID) + } +} + +func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + candidates := []PeerEndpointCandidate{ + { + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Region: "corp-eu", + Priority: 10, + }, + { + EndpointID: "node-b-corp-lan", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "10.24.10.20:19001", + Reachability: "private", + NATType: "none", + ConnectivityMode: "direct", + Region: "corp-eu", + Priority: 1, + PolicyTags: []string{"corp-lan", "same-site"}, + }, + } + + ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{ + ChannelClass: SyntheticChannelFabricControl, + PreferredRegion: "corp-eu", + Now: now, + }) + if ranked[0].Candidate.EndpointID != 
"node-b-corp-lan" { + t.Fatalf("top endpoint = %q, want node-b-corp-lan: %+v", ranked[0].Candidate.EndpointID, ranked) + } + if !containsReason(ranked[0].Reasons, "policy:private-lan") || !containsReason(ranked[0].Reasons, "region:preferred") { + t.Fatalf("corp LAN reasons missing: %+v", ranked[0].Reasons) + } +} + +func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T) { + candidates := []PeerEndpointCandidate{ + { + EndpointID: "node-b-outbound", + NodeID: "node-b", + Transport: "outbound_reverse", + Address: "node-b.reverse.local", + Reachability: "outbound_only", + NATType: "symmetric", + ConnectivityMode: "outbound_only", + Priority: 20, + }, + { + EndpointID: "node-b-relay", + NodeID: "node-b", + Transport: "relay", + Address: "relay.example.test/node-b", + Reachability: "relay", + NATType: "blocked", + ConnectivityMode: "relay_required", + Priority: 30, + }, + } + + ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{ + ChannelClass: SyntheticChannelRouteControl, + }) + if len(ranked) != 2 { + t.Fatalf("ranked length = %d, want 2", len(ranked)) + } + for _, item := range ranked { + if item.Candidate.EndpointID == "" { + t.Fatalf("ranked candidate lost identity: %+v", item) + } + } +} + +func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) { + now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC) + candidates := []PeerEndpointCandidate{ + { + EndpointID: "node-b-direct", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Priority: 10, + }, + { + EndpointID: "node-b-wss", + NodeID: "node-b", + Transport: "wss", + Address: "node-b.example.test", + Reachability: "public", + NATType: "restricted", + ConnectivityMode: "direct", + Priority: 10, + }, + } + + ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{ + Now: now, + MaxObservationAge: 5 * 
time.Minute, + Observations: map[string]EndpointCandidateHealthObservation{ + "node-b-direct": { + EndpointID: "node-b-direct", + LastLatencyMs: 240, + FailureCount: 3, + LastFailureReason: "connect_timeout", + ReliabilityScore: 50, + ObservedAt: now.Add(-time.Minute), + }, + "node-b-wss": { + EndpointID: "node-b-wss", + LastLatencyMs: 35, + SuccessCount: 8, + ReliabilityScore: 95, + ObservedAt: now.Add(-time.Minute), + }, + }, + }) + if ranked[0].Candidate.EndpointID != "node-b-wss" { + t.Fatalf("top endpoint = %q, want node-b-wss: %+v", ranked[0].Candidate.EndpointID, ranked) + } + if !containsReason(ranked[0].Reasons, "latency:low") || !containsReason(ranked[0].Reasons, "reliability:high") { + t.Fatalf("top reasons missing health hints: %+v", ranked[0].Reasons) + } + if !containsReason(ranked[1].Reasons, "history:failure") || !containsReason(ranked[1].Reasons, "failure:recent") { + t.Fatalf("failed endpoint reasons missing failure hints: %+v", ranked[1].Reasons) + } +} + +func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T) { + now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC) + candidates := []PeerEndpointCandidate{ + { + EndpointID: "node-b-direct", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Priority: 10, + }, + } + + ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{ + Now: now, + MaxObservationAge: 5 * time.Minute, + Observations: map[string]EndpointCandidateHealthObservation{ + "node-b-direct": { + EndpointID: "node-b-direct", + LastLatencyMs: 20, + ObservedAt: now.Add(-time.Hour), + }, + }, + }) + if !containsReason(ranked[0].Reasons, "observation:stale") { + t.Fatalf("reasons missing stale observation: %+v", ranked[0].Reasons) + } + if containsReason(ranked[0].Reasons, "latency:low") { + t.Fatalf("stale observation should not contribute latency: %+v", ranked[0].Reasons) + } +} + 
+func containsReason(reasons []string, reason string) bool { + for _, item := range reasons { + if item == reason { + return true + } + } + return false +} diff --git a/agents/rap-node-agent/internal/mesh/http_transport.go b/agents/rap-node-agent/internal/mesh/http_transport.go new file mode 100644 index 0000000..19d7aa1 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/http_transport.go @@ -0,0 +1,42 @@ +package mesh + +import ( + "context" + "net/http" + "strings" +) + +// HTTPPeerTransport sends synthetic mesh envelopes to explicitly configured +// peer endpoints. It is intentionally narrow: production forwarding remains +// disabled and only SyntheticRuntime messages use this transport. +type HTTPPeerTransport struct { + PeerURLs map[string]string + HTTPClient *http.Client +} + +func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport { + normalized := make(map[string]string, len(peerURLs)) + for nodeID, baseURL := range peerURLs { + nodeID = strings.TrimSpace(nodeID) + baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/") + if nodeID != "" && baseURL != "" { + normalized[nodeID] = baseURL + } + } + return &HTTPPeerTransport{PeerURLs: normalized} +} + +func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) { + if t == nil { + return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable + } + baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/") + if baseURL == "" { + return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable + } + client := NewClient(baseURL) + if t.HTTPClient != nil { + client.HTTPClient = t.HTTPClient + } + return client.SendSynthetic(ctx, envelope) +} diff --git a/agents/rap-node-agent/internal/mesh/http_transport_test.go b/agents/rap-node-agent/internal/mesh/http_transport_test.go new file mode 100644 index 0000000..b8a7107 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/http_transport_test.go @@ -0,0 +1,130 
@@ +package mesh + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) { + nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + defer nodeA.Close() + nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}) + defer nodeB.Close() + + route := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"}) + routes := []SyntheticRoute{route} + nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-b": nodeB.URL}) + nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{}) + + ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-direct") + if err != nil { + t.Fatalf("send live direct probe: %v", err) + } + if ack.MessageType != SyntheticMessageProbeAck { + t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck) + } + payload := decodeAckPayload(t, ack) + if got, want := payload.Path, []string{"node-a", "node-b"}; !sameStrings(got, want) { + t.Fatalf("path = %v, want %v", got, want) + } +} + +func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) { + nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + defer nodeA.Close() + nodeR := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}) + defer nodeR.Close() + nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}) + defer nodeB.Close() + + route := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"}) + routes := []SyntheticRoute{route} + nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-r": nodeR.URL}) + nodeR.Runtime = newLiveRuntime(nodeR.Local, routes, map[string]string{"node-b": nodeB.URL}) + nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{}) + + 
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-relay") + if err != nil { + t.Fatalf("send live relay probe: %v", err) + } + if ack.MessageType != SyntheticMessageProbeAck { + t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck) + } + payload := decodeAckPayload(t, ack) + if got, want := payload.Path, []string{"node-a", "node-r", "node-b"}; !sameStrings(got, want) { + t.Fatalf("path = %v, want %v", got, want) + } +} + +func TestHTTPPeerTransportMissingPeer(t *testing.T) { + transport := NewHTTPPeerTransport(map[string]string{}) + _, err := transport.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{}) + if !errors.Is(err, ErrSyntheticPeerUnavailable) { + t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err) + } +} + +type liveSyntheticNode struct { + Local PeerIdentity + Runtime *SyntheticRuntime + URL string + server *httptest.Server +} + +func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode { + t.Helper() + node := &liveSyntheticNode{Local: local} + node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r) + })) + node.URL = node.server.URL + return node +} + +func (n *liveSyntheticNode) Close() { + if n.server != nil { + n.server.Close() + } +} + +func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime { + return NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: true, + Local: local, + Routes: routes, + Transport: NewHTTPPeerTransport(peers), + }) +} + +func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute { + return SyntheticRoute{ + RouteID: routeID, + ClusterID: "cluster-1", + SourceNodeID: hops[0], + DestinationNodeID: hops[len(hops)-1], + Hops: hops, + AllowedChannels: []string{SyntheticChannelFabricControl}, + 
MaxTTL: 8, + MaxHops: 8, + ExpiresAt: time.Now().UTC().Add(time.Hour), + RouteVersion: "route-v1", + PolicyVersion: "policy-v1", + PeerDirectoryVersion: "peers-v1", + } +} + +func sameStrings(left, right []string) bool { + if len(left) != len(right) { + return false + } + for i := range left { + if left[i] != right[i] { + return false + } + } + return true +} diff --git a/agents/rap-node-agent/internal/mesh/peer_cache.go b/agents/rap-node-agent/internal/mesh/peer_cache.go new file mode 100644 index 0000000..1f97f38 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_cache.go @@ -0,0 +1,374 @@ +package mesh + +import ( + "sort" + "strings" + "time" +) + +const DefaultWarmPeerLimit = 8 + +type PeerCacheConfig struct { + Local PeerIdentity + PeerEndpoints map[string]string + PeerEndpointCandidates map[string][]PeerEndpointCandidate + PeerDirectory []PeerDirectoryEntry + RecoverySeeds []PeerRecoverySeed + RendezvousLeases []PeerRendezvousLease + Routes []SyntheticRoute + WarmPeerLimit int + PreferredRegion string + Now time.Time +} + +type PeerCache struct { + snapshot PeerCacheSnapshot +} + +type PeerCacheSnapshot struct { + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + PeerCount int `json:"peer_count"` + WarmPeerCount int `json:"warm_peer_count"` + RecoverySeedCount int `json:"recovery_seed_count"` + RendezvousLeaseCount int `json:"rendezvous_lease_count"` + BuiltAt time.Time `json:"built_at"` + Entries []PeerCacheEntry `json:"entries"` +} + +type PeerCacheEntry struct { + NodeID string `json:"node_id"` + RouteIDs []string `json:"route_ids,omitempty"` + Endpoint string `json:"endpoint,omitempty"` + EndpointCount int `json:"endpoint_count"` + CandidateCount int `json:"candidate_count"` + ConnectivityModes []string `json:"connectivity_modes,omitempty"` + RecoverySeed bool `json:"recovery_seed"` + Warm bool `json:"warm"` + WarmReason string `json:"warm_reason,omitempty"` + BestCandidateID string 
`json:"best_candidate_id,omitempty"` + BestCandidateAddr string `json:"best_candidate_addr,omitempty"` + BestTransport string `json:"best_transport,omitempty"` + BestReachability string `json:"best_reachability,omitempty"` + BestConnectivity string `json:"best_connectivity,omitempty"` + BestNATType string `json:"best_nat_type,omitempty"` + BestPolicyTags []string `json:"best_policy_tags,omitempty"` + BestCandidateScore int `json:"best_candidate_score,omitempty"` + RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"` + RelayNodeID string `json:"relay_node_id,omitempty"` + RelayEndpoint string `json:"relay_endpoint,omitempty"` + RelayControl bool `json:"relay_control"` +} + +type peerCacheBuildEntry struct { + PeerCacheEntry + adjacentRoutePeer bool + bestScore int +} + +func NewPeerCache(cfg PeerCacheConfig) *PeerCache { + now := cfg.Now.UTC() + if now.IsZero() { + now = time.Now().UTC() + } + limit := cfg.WarmPeerLimit + if limit <= 0 { + limit = DefaultWarmPeerLimit + } + entries := map[string]*peerCacheBuildEntry{} + for _, item := range cfg.PeerDirectory { + nodeID := strings.TrimSpace(item.NodeID) + if nodeID == "" || nodeID == cfg.Local.NodeID { + continue + } + entry := peerCacheEntry(entries, nodeID) + entry.RouteIDs = mergeStrings(entry.RouteIDs, item.RouteIDs) + entry.EndpointCount = maxInt(entry.EndpointCount, item.EndpointCount) + entry.CandidateCount = maxInt(entry.CandidateCount, item.CandidateCount) + entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, item.ConnectivityModes) + entry.RecoverySeed = entry.RecoverySeed || item.RecoverySeed + } + for nodeID, endpoint := range cfg.PeerEndpoints { + nodeID = strings.TrimSpace(nodeID) + endpoint = strings.TrimSpace(endpoint) + if nodeID == "" || nodeID == cfg.Local.NodeID || endpoint == "" { + continue + } + entry := peerCacheEntry(entries, nodeID) + entry.Endpoint = endpoint + entry.EndpointCount = maxInt(entry.EndpointCount, 1) + } + for nodeID, candidates := range 
cfg.PeerEndpointCandidates { + nodeID = strings.TrimSpace(nodeID) + if nodeID == "" || nodeID == cfg.Local.NodeID || len(candidates) == 0 { + continue + } + entry := peerCacheEntry(entries, nodeID) + entry.CandidateCount = maxInt(entry.CandidateCount, len(candidates)) + for _, candidate := range candidates { + if strings.TrimSpace(candidate.ConnectivityMode) != "" { + entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{candidate.ConnectivityMode}) + } + } + scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{ + ChannelClass: SyntheticChannelFabricControl, + PreferredRegion: cfg.PreferredRegion, + Now: now, + MaxVerificationAge: time.Hour, + }) + if len(scored) > 0 { + entry.BestCandidateID = scored[0].Candidate.EndpointID + entry.BestCandidateAddr = scored[0].Candidate.Address + entry.BestTransport = scored[0].Candidate.Transport + entry.BestReachability = scored[0].Candidate.Reachability + entry.BestConnectivity = scored[0].Candidate.ConnectivityMode + entry.BestNATType = scored[0].Candidate.NATType + entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...) 
+ entry.BestCandidateScore = scored[0].Score + entry.bestScore = scored[0].Score + if strings.TrimSpace(scored[0].Candidate.Address) != "" { + entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address) + } + } + } + for _, route := range cfg.Routes { + path := routePath(route) + localIndex := indexOf(path, cfg.Local.NodeID) + if localIndex < 0 { + continue + } + for _, nodeID := range path { + if nodeID == "" || nodeID == cfg.Local.NodeID { + continue + } + entry := peerCacheEntry(entries, nodeID) + entry.RouteIDs = mergeStrings(entry.RouteIDs, []string{route.RouteID}) + } + for _, adjacentIndex := range []int{localIndex - 1, localIndex + 1} { + if adjacentIndex < 0 || adjacentIndex >= len(path) { + continue + } + nodeID := path[adjacentIndex] + if nodeID == "" || nodeID == cfg.Local.NodeID { + continue + } + peerCacheEntry(entries, nodeID).adjacentRoutePeer = true + } + } + for _, seed := range cfg.RecoverySeeds { + nodeID := strings.TrimSpace(seed.NodeID) + if nodeID == "" || nodeID == cfg.Local.NodeID { + continue + } + entry := peerCacheEntry(entries, nodeID) + entry.RecoverySeed = true + if entry.Endpoint == "" { + entry.Endpoint = strings.TrimSpace(seed.Endpoint) + } + if strings.TrimSpace(seed.ConnectivityMode) != "" { + entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{seed.ConnectivityMode}) + } + } + rendezvousLeases := 0 + for _, lease := range cfg.RendezvousLeases { + if !leaseUsableForPeerCache(lease, cfg.Local.NodeID, now) { + continue + } + rendezvousLeases++ + if lease.PeerNodeID != cfg.Local.NodeID { + entry := peerCacheEntry(entries, lease.PeerNodeID) + useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry) + entry.RendezvousLeaseID = lease.LeaseID + entry.RelayNodeID = lease.RelayNodeID + entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/") + entry.RelayControl = true + entry.CandidateCount = maxInt(entry.CandidateCount, 1) + entry.ConnectivityModes = 
mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"}) + if useLeaseEndpoint { + entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control") + entry.BestReachability = "relay" + entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required") + entry.Endpoint = entry.RelayEndpoint + entry.BestCandidateID = lease.LeaseID + entry.BestCandidateAddr = entry.RelayEndpoint + entry.bestScore = maxInt(entry.bestScore, 500) + } + } + if lease.PeerNodeID == cfg.Local.NodeID && lease.RelayNodeID != "" && lease.RelayNodeID != cfg.Local.NodeID { + entry := peerCacheEntry(entries, lease.RelayNodeID) + if entry.Endpoint == "" { + entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/") + } + entry.EndpointCount = maxInt(entry.EndpointCount, 1) + entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"}) + } + } + out := make([]peerCacheBuildEntry, 0, len(entries)) + recoverySeeds := 0 + for _, entry := range entries { + sort.Strings(entry.RouteIDs) + sort.Strings(entry.ConnectivityModes) + if entry.RecoverySeed { + recoverySeeds++ + } + out = append(out, *entry) + } + sort.SliceStable(out, func(i, j int) bool { + left := warmPeerPriority(out[i]) + right := warmPeerPriority(out[j]) + if left != right { + return left > right + } + return out[i].NodeID < out[j].NodeID + }) + warm := 0 + for i := range out { + if warm >= limit { + break + } + if warmPeerPriority(out[i]) <= 0 { + continue + } + out[i].Warm = true + out[i].WarmReason = warmPeerReason(out[i]) + warm++ + } + sort.SliceStable(out, func(i, j int) bool { + return out[i].NodeID < out[j].NodeID + }) + snapshotEntries := make([]PeerCacheEntry, 0, len(out)) + for _, entry := range out { + snapshotEntries = append(snapshotEntries, entry.PeerCacheEntry) + } + return &PeerCache{snapshot: PeerCacheSnapshot{ + ClusterID: cfg.Local.ClusterID, + LocalNodeID: cfg.Local.NodeID, + PeerCount: 
len(snapshotEntries), + WarmPeerCount: warm, + RecoverySeedCount: recoverySeeds, + RendezvousLeaseCount: rendezvousLeases, + BuiltAt: now, + Entries: snapshotEntries, + }} +} + +func (c *PeerCache) Snapshot() PeerCacheSnapshot { + if c == nil { + return PeerCacheSnapshot{} + } + snapshot := c.snapshot + snapshot.Entries = append([]PeerCacheEntry{}, c.snapshot.Entries...) + return snapshot +} + +func (c *PeerCache) WarmPeerIDs() []string { + snapshot := c.Snapshot() + out := make([]string, 0, snapshot.WarmPeerCount) + for _, entry := range snapshot.Entries { + if entry.Warm { + out = append(out, entry.NodeID) + } + } + return out +} + +func peerCacheEntry(entries map[string]*peerCacheBuildEntry, nodeID string) *peerCacheBuildEntry { + if entry, ok := entries[nodeID]; ok { + return entry + } + entry := &peerCacheBuildEntry{PeerCacheEntry: PeerCacheEntry{NodeID: nodeID}} + entries[nodeID] = entry + return entry +} + +func warmPeerPriority(entry peerCacheBuildEntry) int { + score := 0 + if entry.adjacentRoutePeer { + score += 1000 + } + if entry.RecoverySeed { + score += 500 + } + if entry.Endpoint != "" { + score += 100 + } + if entry.bestScore > 0 { + score += entry.bestScore + } + if entry.RelayControl { + score += 300 + } + score += entry.CandidateCount + return score +} + +func warmPeerReason(entry peerCacheBuildEntry) string { + if entry.adjacentRoutePeer { + return "route_adjacent" + } + if entry.RecoverySeed { + return "recovery_seed" + } + if entry.RelayControl { + return "rendezvous_lease" + } + if entry.BestCandidateID != "" { + return "endpoint_candidate" + } + if entry.Endpoint != "" { + return "peer_endpoint" + } + return "scoped_peer" +} + +func leaseUsableForPeerCache(lease PeerRendezvousLease, localNodeID string, now time.Time) bool { + if strings.TrimSpace(lease.LeaseID) == "" || + strings.TrimSpace(lease.PeerNodeID) == "" || + strings.TrimSpace(lease.RelayNodeID) == "" || + strings.TrimSpace(lease.RelayEndpoint) == "" || + lease.ExpiresAt.IsZero() || 
+ !lease.ExpiresAt.After(now) || + !lease.ControlPlaneOnly { + return false + } + return lease.PeerNodeID != localNodeID || lease.RelayNodeID != localNodeID +} + +func shouldUseRendezvousEndpoint(entry peerCacheBuildEntry) bool { + if strings.TrimSpace(entry.Endpoint) == "" { + return true + } + transport := strings.ToLower(strings.TrimSpace(entry.BestTransport)) + reachability := strings.ToLower(strings.TrimSpace(entry.BestReachability)) + connectivity := strings.ToLower(strings.TrimSpace(entry.BestConnectivity)) + return strings.Contains(transport, "relay") || + strings.Contains(transport, "outbound") || + reachability == "relay" || + reachability == "outbound_only" || + connectivity == "relay_required" || + connectivity == "outbound_only" +} + +func mergeStrings(existing []string, incoming []string) []string { + seen := map[string]struct{}{} + out := make([]string, 0, len(existing)+len(incoming)) + for _, value := range append(existing, incoming...) { + value = strings.TrimSpace(value) + if value == "" { + continue + } + if _, ok := seen[value]; ok { + continue + } + seen[value] = struct{}{} + out = append(out, value) + } + return out +} + +func maxInt(left, right int) int { + if left > right { + return left + } + return right +} diff --git a/agents/rap-node-agent/internal/mesh/peer_cache_test.go b/agents/rap-node-agent/internal/mesh/peer_cache_test.go new file mode 100644 index 0000000..a7d7f51 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_cache_test.go @@ -0,0 +1,170 @@ +package mesh + +import ( + "testing" + "time" +) + +func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + cache := NewPeerCache(PeerCacheConfig{ + Local: local, + PeerEndpoints: map[string]string{ + "node-a": "http://node-a:19000", + "node-r": "http://node-r:19000", + "node-c": "http://node-c:19000", + }, + Routes: []SyntheticRoute{ + peerCacheRoute("route-1", []string{"node-a", local.NodeID, 
"node-r", "node-c"}), + }, + RecoverySeeds: []PeerRecoverySeed{ + {NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10}, + }, + WarmPeerLimit: 2, + Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC), + }) + + warm := cache.WarmPeerIDs() + if len(warm) != 2 || warm[0] != "node-a" || warm[1] != "node-r" { + t.Fatalf("warm peers = %+v, want adjacent node-a/node-r", warm) + } + snapshot := cache.Snapshot() + if snapshot.PeerCount != 4 || snapshot.RecoverySeedCount != 1 { + t.Fatalf("unexpected snapshot counts: %+v", snapshot) + } +} + +func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + cache := NewPeerCache(PeerCacheConfig{ + Local: local, + Routes: []SyntheticRoute{ + peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r"}), + }, + RecoverySeeds: []PeerRecoverySeed{ + {NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1}, + }, + WarmPeerLimit: 3, + Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC), + }) + + warm := cache.WarmPeerIDs() + if len(warm) != 3 || warm[0] != "node-a" || warm[1] != "node-r" || warm[2] != "node-seed" { + t.Fatalf("warm peers = %+v, want adjacent peers then seed", warm) + } + seed, ok := peerCacheEntryByID(cache.Snapshot(), "node-seed") + if !ok || !seed.RecoverySeed || seed.WarmReason != "recovery_seed" { + t.Fatalf("unexpected seed entry: %+v", seed) + } +} + +func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + cache := NewPeerCache(PeerCacheConfig{ + Local: local, + PeerEndpointCandidates: map[string][]PeerEndpointCandidate{ + "node-b": { + { + EndpointID: "node-b-relay", + NodeID: "node-b", + Transport: "relay", + Address: "relay.example.test", + Reachability: "relay", + ConnectivityMode: 
"relay_required", + Priority: 20, + }, + { + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Priority: 1, + LastVerifiedAt: &now, + }, + }, + }, + WarmPeerLimit: 1, + Now: now, + }) + + entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b") + if !ok { + t.Fatal("node-b missing from cache") + } + if entry.BestCandidateID != "node-b-public" || !entry.Warm { + t.Fatalf("unexpected candidate selection: %+v", entry) + } +} + +func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} + cache := NewPeerCache(PeerCacheConfig{ + Local: local, + PeerEndpoints: map[string]string{ + "node-b": "https://node-b.public.example.test:443", + }, + PeerEndpointCandidates: map[string][]PeerEndpointCandidate{ + "node-b": { + { + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "https://node-b.public.example.test:443", + Reachability: "public", + NATType: "none", + ConnectivityMode: "direct", + Region: "corp-eu", + Priority: 10, + }, + { + EndpointID: "node-b-corp-lan", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "http://10.24.10.20:19001", + Reachability: "private", + NATType: "none", + ConnectivityMode: "direct", + Region: "corp-eu", + Priority: 1, + PolicyTags: []string{"corp-lan"}, + }, + }, + }, + PreferredRegion: "corp-eu", + WarmPeerLimit: 1, + Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC), + }) + + entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b") + if !ok { + t.Fatal("node-b missing from peer cache") + } + if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" { + t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry) + } +} + +func peerCacheRoute(routeID string, hops []string) SyntheticRoute { + return SyntheticRoute{ + RouteID: 
routeID, + ClusterID: "cluster-1", + SourceNodeID: hops[0], + DestinationNodeID: hops[len(hops)-1], + Hops: append([]string{}, hops...), + AllowedChannels: []string{SyntheticChannelFabricControl}, + ExpiresAt: time.Now().UTC().Add(time.Hour), + } +} + +func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEntry, bool) { + for _, entry := range snapshot.Entries { + if entry.NodeID == nodeID { + return entry, true + } + } + return PeerCacheEntry{}, false +} diff --git a/agents/rap-node-agent/internal/mesh/peer_connection_intent.go b/agents/rap-node-agent/internal/mesh/peer_connection_intent.go new file mode 100644 index 0000000..1f4c167 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_connection_intent.go @@ -0,0 +1,303 @@ +package mesh + +import ( + "net" + "net/netip" + "net/url" + "sort" + "strings" + "time" +) + +const ( + PeerConnectionIntentMaintain = "maintain" + PeerConnectionIntentProbe = "probe" + PeerConnectionIntentRecover = "recover" +) + +const ( + PeerTransportModeDirect = "direct" + PeerTransportModePrivateLAN = "private_lan" + PeerTransportModeCorporateLAN = "corporate_lan" + PeerTransportModeOutboundOnly = "outbound_only" + PeerTransportModeRelayRequired = "relay_required" + PeerTransportModeRelayControl = "relay_control" + PeerTransportModeUnknown = "unknown" +) + +type PeerConnectionIntentPlanConfig struct { + PeerCache PeerCacheSnapshot + RecoveryPlan PeerRecoveryPlan + RendezvousLeases []PeerRendezvousLease + Now time.Time +} + +type PeerConnectionIntentPlan struct { + Mode string `json:"mode"` + IntentCount int `json:"intent_count"` + MaintainCount int `json:"maintain_count"` + ProbeCount int `json:"probe_count"` + RecoverCount int `json:"recover_count"` + DirectCount int `json:"direct_count"` + PrivateLANCount int `json:"private_lan_count"` + CorporateLANCount int `json:"corporate_lan_count"` + OutboundOnlyCount int `json:"outbound_only_count"` + RelayRequiredCount int `json:"relay_required_count"` + 
RelayControlCount int `json:"relay_control_count"` + RendezvousRequiredCount int `json:"rendezvous_required_count"` + RendezvousResolvedCount int `json:"rendezvous_resolved_count"` + RendezvousLeaseCount int `json:"rendezvous_lease_count"` + GeneratedAt time.Time `json:"generated_at"` + Intents []PeerConnectionIntent `json:"intents,omitempty"` +} + +type PeerConnectionIntent struct { + NodeID string `json:"node_id"` + Action string `json:"action"` + Reason string `json:"reason"` + Endpoint string `json:"endpoint,omitempty"` + ConnectionState string `json:"connection_state"` + Transport string `json:"transport,omitempty"` + TransportMode string `json:"transport_mode"` + Reachability string `json:"reachability,omitempty"` + ConnectivityMode string `json:"connectivity_mode,omitempty"` + NATType string `json:"nat_type,omitempty"` + PolicyTags []string `json:"policy_tags,omitempty"` + RequiresRendezvous bool `json:"requires_rendezvous"` + RendezvousResolved bool `json:"rendezvous_resolved"` + DirectCandidate bool `json:"direct_candidate"` + RelayCandidate bool `json:"relay_candidate"` + BestCandidateID string `json:"best_candidate_id,omitempty"` + RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"` + RelayNodeID string `json:"relay_node_id,omitempty"` + RelayEndpoint string `json:"relay_endpoint,omitempty"` + ControlPlaneOnly bool `json:"control_plane_only"` + RecoverySeed bool `json:"recovery_seed"` + Priority int `json:"priority"` + GeneratedAt time.Time `json:"generated_at"` +} + +func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectionIntentPlan { + now := normalizedNow(cfg.Now) + entryByNode := map[string]PeerCacheEntry{} + for _, entry := range cfg.PeerCache.Entries { + if strings.TrimSpace(entry.NodeID) == "" { + continue + } + entryByNode[entry.NodeID] = entry + } + + intents := make([]PeerConnectionIntent, 0, len(cfg.RecoveryPlan.Candidates)) + for _, candidate := range cfg.RecoveryPlan.Candidates { + if 
strings.TrimSpace(candidate.NodeID) == "" { + continue + } + entry := entryByNode[candidate.NodeID] + intent := PeerConnectionIntent{ + NodeID: candidate.NodeID, + Action: connectionIntentAction(candidate), + Reason: candidate.Reason, + Endpoint: candidate.Endpoint, + ConnectionState: candidate.ConnectionState, + Transport: firstNonEmpty(candidate.BestTransport, entry.BestTransport), + Reachability: entry.BestReachability, + ConnectivityMode: entry.BestConnectivity, + NATType: entry.BestNATType, + PolicyTags: append([]string{}, entry.BestPolicyTags...), + BestCandidateID: firstNonEmpty(candidate.BestCandidateID, entry.BestCandidateID), + RendezvousLeaseID: entry.RendezvousLeaseID, + RelayNodeID: entry.RelayNodeID, + RelayEndpoint: entry.RelayEndpoint, + RelayCandidate: entry.RelayControl, + ControlPlaneOnly: entry.RelayControl, + RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed, + Priority: candidate.Priority, + GeneratedAt: now, + } + mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent) + intent.TransportMode = mode + intent.RequiresRendezvous = requiresRendezvous + intent.DirectCandidate = directCandidate + if intent.RequiresRendezvous { + if lease, ok := rendezvousLeaseForPeer(cfg.RendezvousLeases, intent.NodeID, now); ok { + applyRendezvousLease(&intent, lease) + } + } + intents = append(intents, intent) + } + sort.SliceStable(intents, func(i, j int) bool { + if intents[i].Priority != intents[j].Priority { + return intents[i].Priority > intents[j].Priority + } + return intents[i].NodeID < intents[j].NodeID + }) + + plan := PeerConnectionIntentPlan{ + Mode: cfg.RecoveryPlan.Mode, + IntentCount: len(intents), + GeneratedAt: now, + Intents: intents, + } + for _, intent := range intents { + switch intent.Action { + case PeerConnectionIntentMaintain: + plan.MaintainCount++ + case PeerConnectionIntentProbe: + plan.ProbeCount++ + case PeerConnectionIntentRecover: + plan.RecoverCount++ + } + switch intent.TransportMode { + case 
PeerTransportModeDirect: + plan.DirectCount++ + case PeerTransportModePrivateLAN: + plan.PrivateLANCount++ + case PeerTransportModeCorporateLAN: + plan.CorporateLANCount++ + case PeerTransportModeOutboundOnly: + plan.OutboundOnlyCount++ + case PeerTransportModeRelayRequired: + plan.RelayRequiredCount++ + case PeerTransportModeRelayControl: + plan.RelayControlCount++ + } + if intent.RequiresRendezvous { + plan.RendezvousRequiredCount++ + } + if intent.RendezvousResolved { + plan.RendezvousResolvedCount++ + } + if intent.RendezvousLeaseID != "" { + plan.RendezvousLeaseCount++ + } + } + return plan +} + +func connectionIntentAction(candidate PeerRecoveryCandidate) string { + switch candidate.Reason { + case "maintain_ready": + return PeerConnectionIntentMaintain + case "recover_degraded", "recover_seed", "recover_warm", "recover_peer": + return PeerConnectionIntentRecover + default: + return PeerConnectionIntentProbe + } +} + +func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) { + transport := strings.ToLower(strings.TrimSpace(intent.Transport)) + connectivity := strings.ToLower(strings.TrimSpace(intent.ConnectivityMode)) + reachability := strings.ToLower(strings.TrimSpace(intent.Reachability)) + tags := lowerStringSet(intent.PolicyTags) + + if strings.Contains(transport, "relay") || connectivity == "relay_required" || reachability == "relay" { + return PeerTransportModeRelayRequired, true, false + } + if connectivity == "outbound_only" || reachability == "outbound_only" { + return PeerTransportModeOutboundOnly, true, false + } + if tags["corp-lan"] || tags["same-site"] { + return PeerTransportModeCorporateLAN, false, true + } + if tags["private-lan"] || reachability == "private" || endpointHasPrivateHost(intent.Endpoint) { + return PeerTransportModePrivateLAN, false, true + } + if strings.Contains(transport, "direct") || reachability == "public" || connectivity == "direct" { + return PeerTransportModeDirect, false, true + } + return 
PeerTransportModeUnknown, false, false +} + +func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now time.Time) (PeerRendezvousLease, bool) { + now = normalizedNow(now) + candidates := make([]PeerRendezvousLease, 0, len(leases)) + for _, lease := range leases { + if strings.TrimSpace(lease.PeerNodeID) != peerNodeID || + strings.TrimSpace(lease.RelayEndpoint) == "" || + strings.TrimSpace(lease.RelayNodeID) == "" || + !lease.ControlPlaneOnly || + lease.ExpiresAt.IsZero() || + !lease.ExpiresAt.After(now) { + continue + } + candidates = append(candidates, lease) + } + if len(candidates) == 0 { + return PeerRendezvousLease{}, false + } + sort.SliceStable(candidates, func(i, j int) bool { + leftPriority := candidates[i].Priority + rightPriority := candidates[j].Priority + if leftPriority <= 0 { + leftPriority = 100 + } + if rightPriority <= 0 { + rightPriority = 100 + } + if leftPriority != rightPriority { + return leftPriority < rightPriority + } + if !candidates[i].ExpiresAt.Equal(candidates[j].ExpiresAt) { + return candidates[i].ExpiresAt.After(candidates[j].ExpiresAt) + } + return candidates[i].LeaseID < candidates[j].LeaseID + }) + return candidates[0], true +} + +func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) { + intent.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/") + intent.Transport = firstNonEmpty(lease.Transport, "relay_control") + intent.TransportMode = PeerTransportModeRelayControl + intent.RequiresRendezvous = false + intent.RendezvousResolved = true + intent.DirectCandidate = false + intent.RelayCandidate = true + intent.RendezvousLeaseID = lease.LeaseID + intent.RelayNodeID = lease.RelayNodeID + intent.RelayEndpoint = intent.Endpoint + intent.ControlPlaneOnly = true + if lease.ConnectivityMode != "" { + intent.ConnectivityMode = lease.ConnectivityMode + } +} + +func endpointHasPrivateHost(rawEndpoint string) bool { + rawEndpoint = strings.TrimSpace(rawEndpoint) + if 
rawEndpoint == "" { + return false + } + host := rawEndpoint + if parsed, err := url.Parse(rawEndpoint); err == nil && parsed.Host != "" { + host = parsed.Host + } + if splitHost, _, err := net.SplitHostPort(host); err == nil { + host = splitHost + } + addr, err := netip.ParseAddr(strings.Trim(host, "[]")) + if err != nil { + return false + } + return addr.IsPrivate() || addr.IsLoopback() || addr.IsLinkLocalUnicast() +} + +func lowerStringSet(values []string) map[string]bool { + out := map[string]bool{} + for _, value := range values { + value = strings.ToLower(strings.TrimSpace(value)) + if value != "" { + out[value] = true + } + } + return out +} + +func firstNonEmpty(values ...string) string { + for _, value := range values { + if strings.TrimSpace(value) != "" { + return strings.TrimSpace(value) + } + } + return "" +} diff --git a/agents/rap-node-agent/internal/mesh/peer_connection_intent_test.go b/agents/rap-node-agent/internal/mesh/peer_connection_intent_test.go new file mode 100644 index 0000000..87fce03 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_connection_intent_test.go @@ -0,0 +1,234 @@ +package mesh + +import ( + "testing" + "time" +) + +func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{ + PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{ + { + NodeID: "node-b", + Endpoint: "http://10.24.10.20:19001", + BestTransport: "direct_tcp_tls", + BestReachability: "private", + BestConnectivity: "direct", + BestPolicyTags: []string{"corp-lan", "same-site"}, + }, + }}, + RecoveryPlan: PeerRecoveryPlan{ + Mode: PeerRecoveryModeSteady, + Candidates: []PeerRecoveryCandidate{ + { + NodeID: "node-b", + Endpoint: "http://10.24.10.20:19001", + ConnectionState: PeerConnectionReady, + Reason: "maintain_ready", + Priority: 100, + }, + }, + }, + Now: now, + }) + + if plan.IntentCount != 1 || plan.MaintainCount != 
1 || plan.CorporateLANCount != 1 { + t.Fatalf("unexpected plan counts: %+v", plan) + } + intent := plan.Intents[0] + if intent.Action != PeerConnectionIntentMaintain || intent.TransportMode != PeerTransportModeCorporateLAN || intent.RequiresRendezvous { + t.Fatalf("unexpected corporate intent: %+v", intent) + } +} + +func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{ + PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{ + { + NodeID: "node-b", + Endpoint: "https://node-b.example.test:443", + BestTransport: "direct_tcp_tls", + BestReachability: "outbound_only", + BestConnectivity: "outbound_only", + }, + { + NodeID: "node-c", + Endpoint: "relay://fabric-relay/node-c", + BestTransport: "relay", + BestReachability: "relay", + BestConnectivity: "relay_required", + }, + }}, + RecoveryPlan: PeerRecoveryPlan{ + Mode: PeerRecoveryModeRecovery, + Candidates: []PeerRecoveryCandidate{ + { + NodeID: "node-b", + Endpoint: "https://node-b.example.test:443", + ConnectionState: PeerConnectionDisconnected, + Reason: "recover_warm", + Priority: 90, + }, + { + NodeID: "node-c", + Endpoint: "relay://fabric-relay/node-c", + ConnectionState: PeerConnectionDisconnected, + Reason: "recover_seed", + Priority: 80, + }, + }, + }, + Now: now, + }) + + if plan.RecoverCount != 2 || plan.OutboundOnlyCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 2 { + t.Fatalf("unexpected rendezvous counts: %+v", plan) + } + if plan.Intents[0].Action != PeerConnectionIntentRecover || plan.Intents[1].Action != PeerConnectionIntentRecover { + t.Fatalf("unexpected actions: %+v", plan.Intents) + } +} + +func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{ + PeerCache: 
PeerCacheSnapshot{Entries: []PeerCacheEntry{ + { + NodeID: "node-b", + Endpoint: "relay://fabric/node-b", + BestTransport: "relay", + BestReachability: "relay", + BestConnectivity: "relay_required", + }, + }}, + RecoveryPlan: PeerRecoveryPlan{ + Mode: PeerRecoveryModeRecovery, + Candidates: []PeerRecoveryCandidate{ + { + NodeID: "node-b", + Endpoint: "relay://fabric/node-b", + ConnectionState: PeerConnectionDisconnected, + Reason: "recover_warm", + Priority: 100, + }, + }, + }, + RendezvousLeases: []PeerRendezvousLease{ + { + LeaseID: "lease-node-b-via-node-r", + PeerNodeID: "node-b", + RelayNodeID: "node-r", + RelayEndpoint: "http://node-r:19000", + Transport: "relay_control", + ConnectivityMode: "relay_required", + Priority: 10, + ControlPlaneOnly: true, + IssuedAt: now.Add(-time.Minute), + ExpiresAt: now.Add(time.Minute), + }, + }, + Now: now, + }) + + if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 { + t.Fatalf("unexpected relay-control plan counts: %+v", plan) + } + intent := plan.Intents[0] + if intent.TransportMode != PeerTransportModeRelayControl || + intent.Endpoint != "http://node-r:19000" || + intent.RelayNodeID != "node-r" || + intent.RendezvousLeaseID != "lease-node-b-via-node-r" || + !intent.RelayCandidate || + !intent.RendezvousResolved || + intent.RequiresRendezvous { + t.Fatalf("unexpected resolved rendezvous intent: %+v", intent) + } +} + +func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{ + PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{ + { + NodeID: "node-b", + Endpoint: "relay://fabric/node-b", + BestTransport: "relay", + BestReachability: "relay", + BestConnectivity: "relay_required", + }, + }}, + RecoveryPlan: PeerRecoveryPlan{ + Mode: PeerRecoveryModeRecovery, + Candidates: []PeerRecoveryCandidate{ 
+ { + NodeID: "node-b", + Endpoint: "relay://fabric/node-b", + ConnectionState: PeerConnectionWaiting, + Reason: "recover_warm", + Priority: 100, + }, + }, + }, + RendezvousLeases: []PeerRendezvousLease{ + { + LeaseID: "lease-expired-preferred", + PeerNodeID: "node-b", + RelayNodeID: "node-r-old", + RelayEndpoint: "http://node-r-old:19000", + Transport: "relay_control", + ConnectivityMode: "relay_required", + Priority: 1, + ControlPlaneOnly: true, + IssuedAt: now.Add(-10 * time.Minute), + ExpiresAt: now.Add(-time.Second), + }, + { + LeaseID: "lease-active-reselected", + PeerNodeID: "node-b", + RelayNodeID: "node-r-new", + RelayEndpoint: "http://node-r-new:19000", + Transport: "relay_control", + ConnectivityMode: "relay_required", + Priority: 20, + ControlPlaneOnly: true, + IssuedAt: now.Add(-time.Minute), + ExpiresAt: now.Add(time.Minute), + }, + }, + Now: now, + }) + + if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 { + t.Fatalf("unexpected reselected plan counts: %+v", plan) + } + intent := plan.Intents[0] + if intent.RendezvousLeaseID != "lease-active-reselected" || + intent.RelayNodeID != "node-r-new" || + intent.Endpoint != "http://node-r-new:19000" { + t.Fatalf("expired lease was not skipped: %+v", intent) + } +} + +func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) { + plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{ + PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{ + {NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"}, + }}, + RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{ + { + NodeID: "node-b", + Endpoint: "http://192.168.10.20:19001", + ConnectionState: PeerConnectionDisconnected, + Reason: "recover_peer", + Priority: 10, + }, + }}, + Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC), + }) + + if plan.PrivateLANCount != 1 || plan.Intents[0].TransportMode != PeerTransportModePrivateLAN || 
!plan.Intents[0].DirectCandidate { + t.Fatalf("unexpected private endpoint classification: %+v", plan) + } +} diff --git a/agents/rap-node-agent/internal/mesh/peer_connection_manager.go b/agents/rap-node-agent/internal/mesh/peer_connection_manager.go new file mode 100644 index 0000000..053ae2a --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_connection_manager.go @@ -0,0 +1,304 @@ +package mesh + +import ( + "context" + "net/http" + "strings" + "sync" + "time" +) + +const ( + PeerConnectionProbeReachable = "reachable" + PeerConnectionProbeUnreachable = "unreachable" + PeerConnectionProbeDeferred = "deferred" + PeerConnectionProbeSkipped = "skipped" +) + +const ( + DefaultPeerConnectionProbeTimeout = 2 * time.Second +) + +type PeerConnectionManagerConfig struct { + Local PeerIdentity + PeerCache *PeerCache + Tracker *PeerConnectionTracker + RendezvousLeases []PeerRendezvousLease + HTTPClient *http.Client + ProbeTimeout time.Duration + Now func() time.Time +} + +type PeerConnectionManager struct { + local PeerIdentity + peerCache *PeerCache + tracker *PeerConnectionTracker + rendezvousLeases []PeerRendezvousLease + httpClient *http.Client + probeTimeout time.Duration + now func() time.Time + + mu sync.Mutex + lastCycle PeerConnectionManagerCycle +} + +type PeerConnectionManagerCycle struct { + Mode string `json:"mode"` + StartedAt time.Time `json:"started_at"` + CompletedAt time.Time `json:"completed_at"` + ProbeTimeoutMs int `json:"probe_timeout_ms"` + IntentCount int `json:"intent_count"` + Attempted int `json:"attempted"` + Succeeded int `json:"succeeded"` + Failed int `json:"failed"` + Deferred int `json:"deferred"` + Skipped int `json:"skipped"` + RendezvousRequiredCount int `json:"rendezvous_required_count"` + RendezvousResolvedCount int `json:"rendezvous_resolved_count"` + RelayControlCount int `json:"relay_control_count"` + RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"` + IntentPlan PeerConnectionIntentPlan `json:"intent_plan"` + Results 
[]PeerConnectionProbeResult `json:"results,omitempty"` +} + +type PeerConnectionManagerSnapshot struct { + LastCycle PeerConnectionManagerCycle `json:"last_cycle"` +} + +type PeerConnectionProbeResult struct { + NodeID string `json:"node_id"` + LinkStatus string `json:"link_status"` + Action string `json:"action"` + Reason string `json:"reason"` + Endpoint string `json:"endpoint,omitempty"` + ConnectionState PeerConnectionState `json:"connection_state"` + TransportMode string `json:"transport_mode"` + RequiresRendezvous bool `json:"requires_rendezvous"` + RendezvousResolved bool `json:"rendezvous_resolved"` + DirectCandidate bool `json:"direct_candidate"` + RelayCandidate bool `json:"relay_candidate"` + RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"` + RelayNodeID string `json:"relay_node_id,omitempty"` + RelayEndpoint string `json:"relay_endpoint,omitempty"` + LatencyMs int `json:"latency_ms,omitempty"` + FailureReason string `json:"failure_reason,omitempty"` + StartedAt time.Time `json:"started_at"` + CompletedAt time.Time `json:"completed_at"` +} + +func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager { + probeTimeout := cfg.ProbeTimeout + if probeTimeout <= 0 { + probeTimeout = DefaultPeerConnectionProbeTimeout + } + httpClient := cfg.HTTPClient + if httpClient == nil { + httpClient = &http.Client{ + Transport: &http.Transport{ + MaxIdleConns: 64, + MaxIdleConnsPerHost: 8, + IdleConnTimeout: 90 * time.Second, + }, + Timeout: probeTimeout + time.Second, + } + } + now := cfg.Now + if now == nil { + now = func() time.Time { return time.Now().UTC() } + } + return &PeerConnectionManager{ + local: cfg.Local, + peerCache: cfg.PeerCache, + tracker: cfg.Tracker, + rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...), + httpClient: httpClient, + probeTimeout: probeTimeout, + now: now, + } +} + +func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionManagerCycle { + peerCache, 
rendezvousLeases := m.peerConfigSnapshot() + if m == nil || peerCache == nil || m.tracker == nil { + return PeerConnectionManagerCycle{} + } + startedAt := normalizedNow(m.now()) + peerSnapshot := peerCache.Snapshot() + recoveryPlan := PlanPeerRecovery(PeerRecoveryPlanConfig{ + PeerCache: peerSnapshot, + Connections: m.tracker.Snapshot(), + TargetReadyPeers: DefaultStablePeerTarget, + MaxProbeCandidates: DefaultRecoveryProbeLimit, + Now: startedAt, + }) + intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{ + PeerCache: peerSnapshot, + RecoveryPlan: recoveryPlan, + RendezvousLeases: rendezvousLeases, + Now: startedAt, + }) + cycle := PeerConnectionManagerCycle{ + Mode: recoveryPlan.Mode, + StartedAt: startedAt, + ProbeTimeoutMs: int(m.probeTimeout.Milliseconds()), + IntentCount: intentPlan.IntentCount, + RendezvousRequiredCount: intentPlan.RendezvousRequiredCount, + RendezvousResolvedCount: intentPlan.RendezvousResolvedCount, + RelayControlCount: intentPlan.RelayControlCount, + RecoveryPlan: recoveryPlan, + IntentPlan: intentPlan, + Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)), + } + for _, intent := range intentPlan.Intents { + result := m.probeIntent(ctx, intent) + cycle.Results = append(cycle.Results, result) + switch result.LinkStatus { + case PeerConnectionProbeReachable: + cycle.Attempted++ + cycle.Succeeded++ + case PeerConnectionProbeUnreachable: + cycle.Attempted++ + cycle.Failed++ + case PeerConnectionProbeDeferred: + cycle.Deferred++ + case PeerConnectionProbeSkipped: + cycle.Skipped++ + } + } + cycle.CompletedAt = normalizedNow(m.now()) + m.mu.Lock() + m.lastCycle = cycle + m.mu.Unlock() + return cycle +} + +func (m *PeerConnectionManager) Snapshot() PeerConnectionManagerSnapshot { + if m == nil { + return PeerConnectionManagerSnapshot{} + } + m.mu.Lock() + defer m.mu.Unlock() + return PeerConnectionManagerSnapshot{LastCycle: m.lastCycle} +} + +func (m *PeerConnectionManager) UpdatePeerConfig(peerCache 
*PeerCache, rendezvousLeases []PeerRendezvousLease) { + if m == nil { + return + } + m.mu.Lock() + defer m.mu.Unlock() + m.peerCache = peerCache + m.rendezvousLeases = append([]PeerRendezvousLease{}, rendezvousLeases...) +} + +func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) { + if m == nil { + return nil, nil + } + m.mu.Lock() + defer m.mu.Unlock() + return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...) +} + +func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult { + startedAt := normalizedNow(m.now()) + result := PeerConnectionProbeResult{ + NodeID: intent.NodeID, + Action: intent.Action, + Reason: intent.Reason, + Endpoint: intent.Endpoint, + TransportMode: intent.TransportMode, + RequiresRendezvous: intent.RequiresRendezvous, + RendezvousResolved: intent.RendezvousResolved, + DirectCandidate: intent.DirectCandidate, + RelayCandidate: intent.RelayCandidate, + RendezvousLeaseID: intent.RendezvousLeaseID, + RelayNodeID: intent.RelayNodeID, + RelayEndpoint: intent.RelayEndpoint, + StartedAt: startedAt, + } + peer := PeerCacheEntry{ + NodeID: intent.NodeID, + Endpoint: intent.Endpoint, + Warm: true, + WarmReason: intent.Reason, + RecoverySeed: intent.RecoverySeed, + BestCandidateID: intent.BestCandidateID, + BestTransport: intent.Transport, + RendezvousLeaseID: intent.RendezvousLeaseID, + RelayNodeID: intent.RelayNodeID, + RelayEndpoint: intent.RelayEndpoint, + RelayControl: intent.RelayCandidate, + } + if intent.RequiresRendezvous { + result.LinkStatus = PeerConnectionProbeDeferred + result.FailureReason = "rendezvous_required" + result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt) + result.CompletedAt = normalizedNow(m.now()) + return result + } + if strings.TrimSpace(intent.Endpoint) == "" || (!intent.DirectCandidate && !intent.RelayCandidate) { + result.LinkStatus = PeerConnectionProbeDeferred + 
result.FailureReason = "direct_candidate_unavailable" + if intent.RelayCandidate { + result.FailureReason = "relay_candidate_unavailable" + } + result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt) + result.CompletedAt = normalizedNow(m.now()) + return result + } + if !m.tracker.ShouldProbe(intent.NodeID, startedAt) { + result.LinkStatus = PeerConnectionProbeSkipped + result.FailureReason = "backoff_active" + result.ConnectionState = m.connectionState(intent.NodeID) + result.CompletedAt = normalizedNow(m.now()) + return result + } + m.tracker.BeginProbe(peer, startedAt) + probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout) + defer cancel() + target := PeerIdentity{ + ClusterID: m.local.ClusterID, + NodeID: intent.NodeID, + } + if intent.RelayCandidate && intent.RelayNodeID != "" { + target.NodeID = intent.RelayNodeID + } + _, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target)) + completedAt := normalizedNow(m.now()) + if err != nil { + result.LinkStatus = PeerConnectionProbeUnreachable + result.FailureReason = err.Error() + result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt) + result.CompletedAt = completedAt + return result + } + latency := int(completedAt.Sub(startedAt).Milliseconds()) + if latency < 0 { + latency = 0 + } + result.LinkStatus = PeerConnectionProbeReachable + result.LatencyMs = latency + if intent.RelayCandidate { + result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt) + } else { + result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt) + } + result.CompletedAt = completedAt + return result +} + +func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState { + snapshot := m.tracker.Snapshot() + for _, entry := range snapshot.Entries { + if entry.NodeID == nodeID { + return entry + } + } + return 
PeerConnectionState{NodeID: nodeID, State: PeerConnectionDisconnected} +} + +func (c Client) withHTTPClient(httpClient *http.Client) Client { + c.HTTPClient = httpClient + return c +} diff --git a/agents/rap-node-agent/internal/mesh/peer_connection_manager_test.go b/agents/rap-node-agent/internal/mesh/peer_connection_manager_test.go new file mode 100644 index 0000000..5d64f3a --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_connection_manager_test.go @@ -0,0 +1,190 @@ +package mesh + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + current := now + server := httptest.NewServer(Server{ + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}, + }.Handler()) + defer server.Close() + + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} + cache := NewPeerCache(PeerCacheConfig{ + Local: local, + PeerEndpointCandidates: map[string][]PeerEndpointCandidate{ + "node-b": { + { + EndpointID: "node-b-direct", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: server.URL, + Reachability: "private", + ConnectivityMode: "direct", + PolicyTags: []string{"corp-lan", "same-site"}, + Priority: 1, + }, + }, + "node-c": { + { + EndpointID: "node-c-relay", + NodeID: "node-c", + Transport: "relay", + Address: "relay://fabric/node-c", + Reachability: "relay", + ConnectivityMode: "relay_required", + Priority: 1, + }, + }, + }, + WarmPeerLimit: 2, + Now: now, + }) + tracker := NewPeerConnectionTracker(cache.Snapshot(), now) + manager := NewPeerConnectionManager(PeerConnectionManagerConfig{ + Local: local, + PeerCache: cache, + Tracker: tracker, + ProbeTimeout: time.Second, + Now: func() time.Time { + current = current.Add(10 * time.Millisecond) + return current + }, + }) + + cycle := manager.ProbeOnce(context.Background()) + if cycle.Attempted != 1 || cycle.Succeeded != 1 || 
cycle.Deferred != 1 || cycle.RendezvousRequiredCount != 1 { + t.Fatalf("unexpected cycle: %+v", cycle) + } + snapshot := tracker.Snapshot() + if snapshot.Ready != 1 || snapshot.Waiting != 1 { + t.Fatalf("unexpected tracker snapshot: %+v", snapshot) + } + if cycle.Results[0].NodeID != "node-b" || cycle.Results[0].LinkStatus != PeerConnectionProbeReachable { + t.Fatalf("direct peer was not probed first: %+v", cycle.Results) + } + if cycle.Results[1].NodeID != "node-c" || cycle.Results[1].LinkStatus != PeerConnectionProbeDeferred { + t.Fatalf("relay peer was not deferred: %+v", cycle.Results) + } +} + +func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + current := now + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} + cache := NewPeerCache(PeerCacheConfig{ + Local: local, + PeerEndpoints: map[string]string{ + "node-b": "http://127.0.0.1:1", + }, + WarmPeerLimit: 1, + Now: now, + }) + tracker := NewPeerConnectionTracker(cache.Snapshot(), now) + manager := NewPeerConnectionManager(PeerConnectionManagerConfig{ + Local: local, + PeerCache: cache, + Tracker: tracker, + HTTPClient: &http.Client{Timeout: 20 * time.Millisecond}, + ProbeTimeout: 20 * time.Millisecond, + Now: func() time.Time { + current = current.Add(10 * time.Millisecond) + return current + }, + }) + + for i := 0; i < 3; i++ { + manager.ProbeOnce(context.Background()) + } + backoff := tracker.Snapshot() + if backoff.Backoff != 1 { + t.Fatalf("expected backoff after repeated failures: %+v", backoff) + } + cycle := manager.ProbeOnce(context.Background()) + if cycle.Attempted != 0 || len(cycle.Results) != 0 { + t.Fatalf("active backoff peer should not be attempted: %+v", cycle) + } +} + +func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + current := now + server := httptest.NewServer(Server{ + Local: PeerIdentity{ClusterID: 
"cluster-1", NodeID: "node-r"}, + }.Handler()) + defer server.Close() + + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"} + leases := []PeerRendezvousLease{ + { + LeaseID: "lease-node-b-via-node-r", + PeerNodeID: "node-b", + RelayNodeID: "node-r", + RelayEndpoint: server.URL, + Transport: "relay_control", + ConnectivityMode: "relay_required", + Priority: 10, + ControlPlaneOnly: true, + IssuedAt: now.Add(-time.Minute), + ExpiresAt: now.Add(time.Minute), + }, + } + cache := NewPeerCache(PeerCacheConfig{ + Local: local, + PeerEndpointCandidates: map[string][]PeerEndpointCandidate{ + "node-b": { + { + EndpointID: "node-b-relay", + NodeID: "node-b", + Transport: "relay", + Address: "relay://fabric/node-b", + Reachability: "relay", + ConnectivityMode: "relay_required", + Priority: 10, + }, + }, + }, + RendezvousLeases: leases, + WarmPeerLimit: 1, + Now: now, + }) + tracker := NewPeerConnectionTracker(cache.Snapshot(), now) + manager := NewPeerConnectionManager(PeerConnectionManagerConfig{ + Local: local, + PeerCache: cache, + Tracker: tracker, + RendezvousLeases: leases, + ProbeTimeout: time.Second, + Now: func() time.Time { + current = current.Add(10 * time.Millisecond) + return current + }, + }) + + cycle := manager.ProbeOnce(context.Background()) + if cycle.Attempted != 1 || + cycle.Succeeded != 1 || + cycle.Deferred != 0 || + cycle.RelayControlCount != 1 || + cycle.RendezvousResolvedCount != 1 || + cycle.RendezvousRequiredCount != 0 { + t.Fatalf("unexpected relay-control cycle: %+v", cycle) + } + if len(cycle.Results) != 1 || + cycle.Results[0].NodeID != "node-b" || + cycle.Results[0].RelayNodeID != "node-r" || + cycle.Results[0].ConnectionState.State != PeerConnectionRelayReady { + t.Fatalf("unexpected relay-control result: %+v", cycle.Results) + } + snapshot := tracker.Snapshot() + if snapshot.RelayReady != 1 || snapshot.Waiting != 0 { + t.Fatalf("unexpected tracker snapshot: %+v", snapshot) + } +} diff --git 
a/agents/rap-node-agent/internal/mesh/peer_connection_state.go b/agents/rap-node-agent/internal/mesh/peer_connection_state.go new file mode 100644 index 0000000..92d5952 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_connection_state.go @@ -0,0 +1,284 @@ +package mesh + +import ( + "sort" + "sync" + "time" +) + +const ( + PeerConnectionDisconnected = "disconnected" + PeerConnectionConnecting = "connecting" + PeerConnectionReady = "ready" + PeerConnectionRelayReady = "relay_ready" + PeerConnectionDegraded = "degraded" + PeerConnectionBackoff = "backoff" + PeerConnectionWaiting = "waiting_rendezvous" +) + +const ( + peerConnectionBackoffBase = 5 * time.Second + peerConnectionBackoffMax = time.Minute +) + +type PeerConnectionTracker struct { + mu sync.Mutex + entries map[string]PeerConnectionState +} + +type PeerConnectionState struct { + NodeID string `json:"node_id"` + State string `json:"state"` + Warm bool `json:"warm"` + WarmReason string `json:"warm_reason,omitempty"` + Endpoint string `json:"endpoint,omitempty"` + BestCandidateID string `json:"best_candidate_id,omitempty"` + RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"` + RelayNodeID string `json:"relay_node_id,omitempty"` + RelayEndpoint string `json:"relay_endpoint,omitempty"` + RelayControl bool `json:"relay_control"` + ConsecutiveSuccesses int `json:"consecutive_successes"` + ConsecutiveFailures int `json:"consecutive_failures"` + LastLatencyMs int `json:"last_latency_ms,omitempty"` + LastFailureReason string `json:"last_failure_reason,omitempty"` + LastTransitionAt time.Time `json:"last_transition_at"` + LastProbeAt time.Time `json:"last_probe_at,omitempty"` + BackoffUntil time.Time `json:"backoff_until,omitempty"` +} + +type PeerConnectionSnapshot struct { + Total int `json:"total"` + Ready int `json:"ready"` + RelayReady int `json:"relay_ready"` + Degraded int `json:"degraded"` + Backoff int `json:"backoff"` + Waiting int `json:"waiting_rendezvous"` + Connecting int 
`json:"connecting"` + Disconnected int `json:"disconnected"` + StateCounts map[string]int `json:"state_counts"` + Entries []PeerConnectionState `json:"entries"` + LastTransitionAt time.Time `json:"last_transition_at,omitempty"` +} + +func NewPeerConnectionTracker(peerSnapshot PeerCacheSnapshot, now time.Time) *PeerConnectionTracker { + now = normalizedNow(now) + tracker := &PeerConnectionTracker{entries: map[string]PeerConnectionState{}} + for _, peer := range peerSnapshot.Entries { + if !peer.Warm || peer.NodeID == "" { + continue + } + tracker.entries[peer.NodeID] = PeerConnectionState{ + NodeID: peer.NodeID, + State: PeerConnectionDisconnected, + Warm: peer.Warm, + WarmReason: peer.WarmReason, + Endpoint: peer.Endpoint, + BestCandidateID: peer.BestCandidateID, + LastTransitionAt: now, + } + } + return tracker +} + +func (t *PeerConnectionTracker) ShouldProbe(nodeID string, now time.Time) bool { + if t == nil { + return true + } + t.mu.Lock() + defer t.mu.Unlock() + entry, ok := t.entries[nodeID] + if !ok { + return true + } + now = normalizedNow(now) + return entry.State != PeerConnectionBackoff || entry.BackoffUntil.IsZero() || !entry.BackoffUntil.After(now) +} + +func (t *PeerConnectionTracker) BeginProbe(peer PeerCacheEntry, now time.Time) PeerConnectionState { + if t == nil { + return PeerConnectionState{} + } + t.mu.Lock() + defer t.mu.Unlock() + now = normalizedNow(now) + entry := t.entry(peer, now) + if entry.State != PeerConnectionReady && entry.State != PeerConnectionDegraded { + entry.State = PeerConnectionConnecting + entry.LastTransitionAt = now + } + entry.LastProbeAt = now + t.entries[peer.NodeID] = entry + return entry +} + +func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now time.Time) PeerConnectionState { + if t == nil { + return PeerConnectionState{} + } + t.mu.Lock() + defer t.mu.Unlock() + now = normalizedNow(now) + entry := t.entries[nodeID] + entry.NodeID = nodeID + entry.ConsecutiveSuccesses++ + 
entry.ConsecutiveFailures = 0 + entry.LastLatencyMs = latencyMs + entry.LastFailureReason = "" + entry.LastProbeAt = now + entry.BackoffUntil = time.Time{} + nextState := PeerConnectionReady + if latencyMs >= 500 { + nextState = PeerConnectionDegraded + } + if entry.State != nextState { + entry.State = nextState + entry.LastTransitionAt = now + } + t.entries[nodeID] = entry + return entry +} + +func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState { + if t == nil { + return PeerConnectionState{} + } + t.mu.Lock() + defer t.mu.Unlock() + now = normalizedNow(now) + entry := t.entry(peer, now) + entry.ConsecutiveSuccesses++ + entry.ConsecutiveFailures = 0 + entry.LastLatencyMs = latencyMs + entry.LastFailureReason = "" + entry.LastProbeAt = now + entry.BackoffUntil = time.Time{} + if entry.State != PeerConnectionRelayReady { + entry.State = PeerConnectionRelayReady + entry.LastTransitionAt = now + } + t.entries[peer.NodeID] = entry + return entry +} + +func (t *PeerConnectionTracker) RecordFailure(nodeID string, reason string, now time.Time) PeerConnectionState { + if t == nil { + return PeerConnectionState{} + } + t.mu.Lock() + defer t.mu.Unlock() + now = normalizedNow(now) + entry := t.entries[nodeID] + entry.NodeID = nodeID + entry.ConsecutiveFailures++ + entry.ConsecutiveSuccesses = 0 + entry.LastFailureReason = reason + entry.LastProbeAt = now + nextState := PeerConnectionDegraded + if entry.ConsecutiveFailures >= 3 { + nextState = PeerConnectionBackoff + entry.BackoffUntil = now.Add(peerConnectionBackoffDuration(entry.ConsecutiveFailures)) + } + if entry.State != nextState { + entry.State = nextState + entry.LastTransitionAt = now + } + t.entries[nodeID] = entry + return entry +} + +func (t *PeerConnectionTracker) RecordDeferred(peer PeerCacheEntry, reason string, now time.Time) PeerConnectionState { + if t == nil { + return PeerConnectionState{} + } + t.mu.Lock() + defer t.mu.Unlock() + now = 
normalizedNow(now) + entry := t.entry(peer, now) + entry.State = PeerConnectionWaiting + entry.LastFailureReason = reason + entry.LastProbeAt = time.Time{} + entry.LastTransitionAt = now + entry.BackoffUntil = time.Time{} + t.entries[peer.NodeID] = entry + return entry +} + +func (t *PeerConnectionTracker) Snapshot() PeerConnectionSnapshot { + if t == nil { + return PeerConnectionSnapshot{StateCounts: map[string]int{}} + } + t.mu.Lock() + defer t.mu.Unlock() + entries := make([]PeerConnectionState, 0, len(t.entries)) + counts := map[string]int{ + PeerConnectionDisconnected: 0, + PeerConnectionConnecting: 0, + PeerConnectionReady: 0, + PeerConnectionRelayReady: 0, + PeerConnectionDegraded: 0, + PeerConnectionBackoff: 0, + PeerConnectionWaiting: 0, + } + var lastTransition time.Time + for _, entry := range t.entries { + entries = append(entries, entry) + counts[entry.State]++ + if entry.LastTransitionAt.After(lastTransition) { + lastTransition = entry.LastTransitionAt + } + } + sort.SliceStable(entries, func(i, j int) bool { + return entries[i].NodeID < entries[j].NodeID + }) + return PeerConnectionSnapshot{ + Total: len(entries), + Ready: counts[PeerConnectionReady], + RelayReady: counts[PeerConnectionRelayReady], + Degraded: counts[PeerConnectionDegraded], + Backoff: counts[PeerConnectionBackoff], + Waiting: counts[PeerConnectionWaiting], + Connecting: counts[PeerConnectionConnecting], + Disconnected: counts[PeerConnectionDisconnected], + StateCounts: counts, + Entries: entries, + LastTransitionAt: lastTransition, + } +} + +func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerConnectionState { + entry, ok := t.entries[peer.NodeID] + if !ok { + entry = PeerConnectionState{ + NodeID: peer.NodeID, + State: PeerConnectionDisconnected, + LastTransitionAt: now, + } + } + entry.Warm = peer.Warm + entry.WarmReason = peer.WarmReason + entry.Endpoint = peer.Endpoint + entry.BestCandidateID = peer.BestCandidateID + entry.RendezvousLeaseID = 
peer.RendezvousLeaseID + entry.RelayNodeID = peer.RelayNodeID + entry.RelayEndpoint = peer.RelayEndpoint + entry.RelayControl = peer.RelayControl + return entry +} + +func peerConnectionBackoffDuration(failures int) time.Duration { + if failures < 3 { + return 0 + } + backoff := peerConnectionBackoffBase * time.Duration(failures-2) + if backoff > peerConnectionBackoffMax { + return peerConnectionBackoffMax + } + return backoff +} + +func normalizedNow(now time.Time) time.Time { + if now.IsZero() { + return time.Now().UTC() + } + return now.UTC() +} diff --git a/agents/rap-node-agent/internal/mesh/peer_connection_state_test.go b/agents/rap-node-agent/internal/mesh/peer_connection_state_test.go new file mode 100644 index 0000000..5a6fca1 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_connection_state_test.go @@ -0,0 +1,76 @@ +package mesh + +import ( + "testing" + "time" +) + +func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + tracker := NewPeerConnectionTracker(PeerCacheSnapshot{ + Entries: []PeerCacheEntry{ + {NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"}, + }, + }, now) + + begin := tracker.BeginProbe(PeerCacheEntry{NodeID: "node-b", Warm: true}, now.Add(time.Second)) + if begin.State != PeerConnectionConnecting { + t.Fatalf("begin state = %q, want connecting", begin.State) + } + ready := tracker.RecordSuccess("node-b", 42, now.Add(2*time.Second)) + if ready.State != PeerConnectionReady || ready.ConsecutiveSuccesses != 1 || ready.ConsecutiveFailures != 0 { + t.Fatalf("ready state unexpected: %+v", ready) + } + degraded := tracker.RecordSuccess("node-b", 800, now.Add(3*time.Second)) + if degraded.State != PeerConnectionDegraded || degraded.LastLatencyMs != 800 { + t.Fatalf("degraded state unexpected: %+v", degraded) + } +} + +func TestPeerConnectionTrackerBackoffAfterRepeatedFailures(t *testing.T) { + now := time.Date(2026, 4, 
28, 12, 0, 0, 0, time.UTC) + tracker := NewPeerConnectionTracker(PeerCacheSnapshot{ + Entries: []PeerCacheEntry{{NodeID: "node-b", Warm: true}}, + }, now) + + first := tracker.RecordFailure("node-b", "timeout", now.Add(time.Second)) + if first.State != PeerConnectionDegraded { + t.Fatalf("first failure state = %q, want degraded", first.State) + } + _ = tracker.RecordFailure("node-b", "timeout", now.Add(2*time.Second)) + third := tracker.RecordFailure("node-b", "timeout", now.Add(3*time.Second)) + if third.State != PeerConnectionBackoff || third.BackoffUntil.IsZero() { + t.Fatalf("third failure did not enter backoff: %+v", third) + } + if tracker.ShouldProbe("node-b", now.Add(4*time.Second)) { + t.Fatal("ShouldProbe returned true during backoff") + } + if !tracker.ShouldProbe("node-b", third.BackoffUntil.Add(time.Millisecond)) { + t.Fatal("ShouldProbe returned false after backoff") + } + recovered := tracker.RecordSuccess("node-b", 12, third.BackoffUntil.Add(time.Second)) + if recovered.State != PeerConnectionReady || recovered.ConsecutiveFailures != 0 || !recovered.BackoffUntil.IsZero() { + t.Fatalf("success did not recover from backoff: %+v", recovered) + } +} + +func TestPeerConnectionTrackerSnapshotCountsStates(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + tracker := NewPeerConnectionTracker(PeerCacheSnapshot{ + Entries: []PeerCacheEntry{ + {NodeID: "node-a", Warm: true}, + {NodeID: "node-b", Warm: true}, + {NodeID: "node-c", Warm: true}, + }, + }, now) + tracker.RecordSuccess("node-a", 25, now.Add(time.Second)) + tracker.RecordFailure("node-b", "timeout", now.Add(time.Second)) + tracker.RecordFailure("node-c", "timeout", now.Add(time.Second)) + tracker.RecordFailure("node-c", "timeout", now.Add(2*time.Second)) + tracker.RecordFailure("node-c", "timeout", now.Add(3*time.Second)) + + snapshot := tracker.Snapshot() + if snapshot.Total != 3 || snapshot.Ready != 1 || snapshot.Degraded != 1 || snapshot.Backoff != 1 { + t.Fatalf("unexpected 
snapshot: %+v", snapshot) + } +} diff --git a/agents/rap-node-agent/internal/mesh/peer_recovery_plan.go b/agents/rap-node-agent/internal/mesh/peer_recovery_plan.go new file mode 100644 index 0000000..0defcd6 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_recovery_plan.go @@ -0,0 +1,276 @@ +package mesh + +import ( + "sort" + "strings" + "time" +) + +const ( + PeerRecoveryModeSteady = "steady" + PeerRecoveryModeRecovery = "recovery" +) + +const ( + DefaultStablePeerTarget = 3 + DefaultRecoveryProbeLimit = 6 +) + +type PeerRecoveryPlanConfig struct { + PeerCache PeerCacheSnapshot + Connections PeerConnectionSnapshot + TargetReadyPeers int + MaxProbeCandidates int + Now time.Time +} + +type PeerRecoveryPlan struct { + Mode string `json:"mode"` + Healthy bool `json:"healthy"` + TargetReadyPeers int `json:"target_ready_peers"` + ReadyPeerCount int `json:"ready_peer_count"` + DegradedPeerCount int `json:"degraded_peer_count"` + BackoffPeerCount int `json:"backoff_peer_count"` + ConnectablePeerCount int `json:"connectable_peer_count"` + Deficit int `json:"deficit"` + ProbeCandidateCount int `json:"probe_candidate_count"` + RecoverySeedCandidateCount int `json:"recovery_seed_candidate_count"` + GeneratedAt time.Time `json:"generated_at"` + Candidates []PeerRecoveryCandidate `json:"candidates,omitempty"` +} + +type PeerRecoveryCandidate struct { + NodeID string `json:"node_id"` + Endpoint string `json:"endpoint,omitempty"` + Warm bool `json:"warm"` + WarmReason string `json:"warm_reason,omitempty"` + RecoverySeed bool `json:"recovery_seed"` + BestCandidateID string `json:"best_candidate_id,omitempty"` + BestTransport string `json:"best_transport,omitempty"` + ConnectionState string `json:"connection_state"` + ConsecutiveFailures int `json:"consecutive_failures,omitempty"` + LastLatencyMs int `json:"last_latency_ms,omitempty"` + BackoffUntil time.Time `json:"backoff_until,omitempty"` + Reason string `json:"reason"` + Priority int `json:"priority"` +} + +type 
peerRecoveryCandidateBuild struct { + PeerRecoveryCandidate +} + +func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan { + now := normalizedNow(cfg.Now) + target := cfg.TargetReadyPeers + if target <= 0 { + target = DefaultStablePeerTarget + } + limit := cfg.MaxProbeCandidates + if limit <= 0 { + limit = DefaultRecoveryProbeLimit + } + connectable := connectablePeerCount(cfg.PeerCache) + if target > connectable { + target = connectable + } + if limit < target { + limit = target + } + + connectionByNode := map[string]PeerConnectionState{} + for _, connection := range cfg.Connections.Entries { + if strings.TrimSpace(connection.NodeID) == "" { + continue + } + connectionByNode[connection.NodeID] = connection + } + + entryByNode := map[string]PeerCacheEntry{} + for _, entry := range cfg.PeerCache.Entries { + if strings.TrimSpace(entry.NodeID) == "" { + continue + } + entryByNode[entry.NodeID] = entry + } + + ready := 0 + degraded := 0 + backoff := 0 + for nodeID, connection := range connectionByNode { + entry, ok := entryByNode[nodeID] + if !ok || strings.TrimSpace(entry.Endpoint) == "" { + continue + } + switch connection.State { + case PeerConnectionReady, PeerConnectionRelayReady: + ready++ + case PeerConnectionDegraded: + degraded++ + case PeerConnectionBackoff: + backoff++ + } + } + + deficit := target - ready + if deficit < 0 { + deficit = 0 + } + mode := PeerRecoveryModeSteady + if deficit > 0 { + mode = PeerRecoveryModeRecovery + } + if mode == PeerRecoveryModeSteady { + limit = target + } + + candidates := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries)) + for _, entry := range cfg.PeerCache.Entries { + if strings.TrimSpace(entry.NodeID) == "" || strings.TrimSpace(entry.Endpoint) == "" { + continue + } + connection := connectionByNode[entry.NodeID] + if connection.State == "" { + connection.State = PeerConnectionDisconnected + } + if connection.State == PeerConnectionBackoff && connection.BackoffUntil.After(now) { + continue + } 
+ reason, ok := peerRecoveryCandidateReason(mode, entry, connection) + if !ok { + continue + } + candidate := PeerRecoveryCandidate{ + NodeID: entry.NodeID, + Endpoint: strings.TrimSpace(entry.Endpoint), + Warm: entry.Warm, + WarmReason: entry.WarmReason, + RecoverySeed: entry.RecoverySeed, + BestCandidateID: entry.BestCandidateID, + BestTransport: entry.BestTransport, + ConnectionState: connection.State, + ConsecutiveFailures: connection.ConsecutiveFailures, + LastLatencyMs: connection.LastLatencyMs, + BackoffUntil: connection.BackoffUntil, + Reason: reason, + Priority: peerRecoveryCandidatePriority(entry, connection, reason), + } + candidates = append(candidates, peerRecoveryCandidateBuild{PeerRecoveryCandidate: candidate}) + } + sort.SliceStable(candidates, func(i, j int) bool { + if candidates[i].Priority != candidates[j].Priority { + return candidates[i].Priority > candidates[j].Priority + } + return candidates[i].NodeID < candidates[j].NodeID + }) + if len(candidates) > limit { + candidates = candidates[:limit] + } + + outCandidates := make([]PeerRecoveryCandidate, 0, len(candidates)) + recoverySeedCandidates := 0 + for _, candidate := range candidates { + outCandidates = append(outCandidates, candidate.PeerRecoveryCandidate) + if candidate.RecoverySeed { + recoverySeedCandidates++ + } + } + + return PeerRecoveryPlan{ + Mode: mode, + Healthy: deficit == 0, + TargetReadyPeers: target, + ReadyPeerCount: ready, + DegradedPeerCount: degraded, + BackoffPeerCount: backoff, + ConnectablePeerCount: connectable, + Deficit: deficit, + ProbeCandidateCount: len(outCandidates), + RecoverySeedCandidateCount: recoverySeedCandidates, + GeneratedAt: now, + Candidates: outCandidates, + } +} + +func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) { + if mode == PeerRecoveryModeSteady { + if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady { + return "maintain_ready", true + } + 
return "", false + } + if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady { + return "maintain_ready", true + } + if connection.State == PeerConnectionDegraded { + return "recover_degraded", true + } + if entry.Warm { + return "recover_warm", true + } + if entry.RecoverySeed { + return "recover_seed", true + } + return "recover_peer", true +} + +func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int { + score := 0 + if entry.Warm { + score += 1000 + } + switch entry.WarmReason { + case "route_adjacent": + score += 500 + case "recovery_seed": + score += 350 + case "endpoint_candidate": + score += 200 + case "peer_endpoint": + score += 100 + } + if entry.RecoverySeed { + score += 250 + } + if entry.BestCandidateID != "" { + score += 150 + } + score += entry.BestCandidateScore / 10 + switch connection.State { + case PeerConnectionReady, PeerConnectionRelayReady: + score += 600 + case PeerConnectionDegraded: + score += 350 + case PeerConnectionConnecting: + score += 200 + case PeerConnectionDisconnected: + score += 100 + } + switch reason { + case "maintain_ready": + score += 500 + case "recover_degraded": + score += 300 + case "recover_seed": + score += 250 + case "recover_warm": + score += 150 + } + if connection.LastLatencyMs > 0 { + score -= connection.LastLatencyMs / 10 + } + if score < 0 { + return 0 + } + return score +} + +func connectablePeerCount(snapshot PeerCacheSnapshot) int { + count := 0 + for _, entry := range snapshot.Entries { + if strings.TrimSpace(entry.NodeID) == "" || strings.TrimSpace(entry.Endpoint) == "" { + continue + } + count++ + } + return count +} diff --git a/agents/rap-node-agent/internal/mesh/peer_recovery_plan_test.go b/agents/rap-node-agent/internal/mesh/peer_recovery_plan_test.go new file mode 100644 index 0000000..97655d0 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/peer_recovery_plan_test.go @@ -0,0 +1,139 @@ +package mesh + 
+import ( + "testing" + "time" +) + +func TestPeerRecoveryPlanMaintainsBoundedReadyPeers(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerRecovery(PeerRecoveryPlanConfig{ + PeerCache: PeerCacheSnapshot{ + Entries: []PeerCacheEntry{ + recoveryPlanPeer("node-a", true, false, "route_adjacent"), + recoveryPlanPeer("node-b", true, false, "route_adjacent"), + recoveryPlanPeer("node-c", true, false, "peer_endpoint"), + recoveryPlanPeer("node-d", true, false, "peer_endpoint"), + }, + }, + Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{ + {NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 40}, + {NodeID: "node-b", State: PeerConnectionReady, LastLatencyMs: 20}, + {NodeID: "node-c", State: PeerConnectionReady, LastLatencyMs: 30}, + {NodeID: "node-d", State: PeerConnectionReady, LastLatencyMs: 10}, + }}, + Now: now, + }) + + if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy { + t.Fatalf("unexpected plan health: %+v", plan) + } + if plan.TargetReadyPeers != DefaultStablePeerTarget || len(plan.Candidates) != DefaultStablePeerTarget { + t.Fatalf("unexpected bounded candidates: %+v", plan) + } + for _, candidate := range plan.Candidates { + if candidate.Reason != "maintain_ready" { + t.Fatalf("unexpected candidate reason: %+v", candidate) + } + } +} + +func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerRecovery(PeerRecoveryPlanConfig{ + PeerCache: PeerCacheSnapshot{ + Entries: []PeerCacheEntry{ + recoveryPlanPeer("node-a", true, false, "route_adjacent"), + recoveryPlanPeer("node-b", true, false, "route_adjacent"), + recoveryPlanPeer("node-seed", false, true, ""), + }, + }, + Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{ + {NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 20}, + {NodeID: "node-b", State: PeerConnectionBackoff, BackoffUntil: now.Add(time.Minute)}, + }}, + Now: 
now, + }) + + if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy { + t.Fatalf("unexpected recovery mode: %+v", plan) + } + if plan.Deficit != 2 || plan.RecoverySeedCandidateCount != 1 { + t.Fatalf("unexpected deficit/seed count: %+v", plan) + } + if !recoveryPlanHasCandidate(plan, "node-seed", "recover_seed") { + t.Fatalf("recovery seed was not selected: %+v", plan.Candidates) + } + if recoveryPlanHasCandidate(plan, "node-b", "") { + t.Fatalf("active backoff peer should not be selected: %+v", plan.Candidates) + } +} + +func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerRecovery(PeerRecoveryPlanConfig{ + PeerCache: PeerCacheSnapshot{ + Entries: []PeerCacheEntry{ + { + NodeID: "node-c", + Endpoint: "http://relay:19001", + Warm: true, + WarmReason: "rendezvous_lease", + RendezvousLeaseID: "lease-1", + RelayNodeID: "node-r", + RelayEndpoint: "http://relay:19001", + RelayControl: true, + }, + }, + }, + Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{ + {NodeID: "node-c", State: PeerConnectionRelayReady, LastLatencyMs: 15}, + }}, + Now: now, + }) + + if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy { + t.Fatalf("unexpected steady plan: %+v", plan) + } + if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") { + t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates) + } +} + +func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + plan := PlanPeerRecovery(PeerRecoveryPlanConfig{ + PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{ + {NodeID: "node-a", Warm: true, WarmReason: "route_adjacent"}, + recoveryPlanPeer("node-b", true, false, "route_adjacent"), + }}, + Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{ + {NodeID: "node-b", State: PeerConnectionReady}, + }}, + Now: now, + }) + + if plan.TargetReadyPeers != 1 || 
!plan.Healthy { + t.Fatalf("target should be capped by connectable peers: %+v", plan) + } +} + +func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry { + return PeerCacheEntry{ + NodeID: nodeID, + Endpoint: "http://" + nodeID + ":19001", + Warm: warm, + WarmReason: warmReason, + RecoverySeed: recoverySeed, + } +} + +func recoveryPlanHasCandidate(plan PeerRecoveryPlan, nodeID string, reason string) bool { + for _, candidate := range plan.Candidates { + if candidate.NodeID != nodeID { + continue + } + return reason == "" || candidate.Reason == reason + } + return false +} diff --git a/agents/rap-node-agent/internal/mesh/production_envelope.go b/agents/rap-node-agent/internal/mesh/production_envelope.go new file mode 100644 index 0000000..7cd323d --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/production_envelope.go @@ -0,0 +1,149 @@ +package mesh + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "time" +) + +func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope, now time.Time) error { + if envelope.FabricProtocolVersion != ProtocolVersion { + return fmt.Errorf("%w: unsupported fabric_protocol_version", ErrForwardEnvelopeInvalid) + } + if envelope.MessageID == "" { + return fmt.Errorf("%w: message_id is required", ErrForwardEnvelopeInvalid) + } + if envelope.RouteID == "" { + return fmt.Errorf("%w: route_id is required", ErrForwardEnvelopeInvalid) + } + if envelope.ClusterID == "" || envelope.ClusterID != local.ClusterID { + return ErrClusterMismatch + } + if envelope.SourceNodeID == "" || envelope.DestinationNodeID == "" { + return fmt.Errorf("%w: source_node_id and destination_node_id are required", ErrForwardEnvelopeInvalid) + } + if envelope.CurrentHopNodeID != local.NodeID { + return ErrNodeMismatch + } + if envelope.NextHopNodeID == "" { + return fmt.Errorf("%w: next_hop_node_id is required", ErrForwardEnvelopeInvalid) + } + if len(envelope.RoutePath) > 0 { + if err := 
validateProductionRoutePath(local, envelope); err != nil { + return err + } + } + if envelope.ChannelClass != ProductionChannelFabricControl { + return ErrUnauthorizedChannel + } + if envelope.MessageType != ProductionMessageFabricControl { + return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid) + } + if envelope.TTL <= 0 { + return ErrTTLExhausted + } + if envelope.HopCount < 0 { + return fmt.Errorf("%w: hop_count must not be negative", ErrForwardEnvelopeInvalid) + } + if envelope.CreatedAt.IsZero() || envelope.ExpiresAt.IsZero() { + return fmt.Errorf("%w: created_at and expires_at are required", ErrForwardEnvelopeInvalid) + } + if envelope.CreatedAt.After(now.UTC().Add(MaxProductionEnvelopeFutureSkew)) { + return fmt.Errorf("%w: created_at exceeds allowed future skew", ErrForwardEnvelopeInvalid) + } + if !envelope.ExpiresAt.After(now.UTC()) { + return ErrRouteExpired + } + if envelope.PayloadLength != len(envelope.Payload) { + return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid) + } + if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes { + return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid) + } + if envelope.PayloadHash == "" { + return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid) + } + sum := sha256.Sum256(envelope.Payload) + if envelope.PayloadHash != hex.EncodeToString(sum[:]) { + return fmt.Errorf("%w: payload_hash mismatch", ErrForwardEnvelopeInvalid) + } + return nil +} + +func validateProductionRoutePath(local PeerIdentity, envelope ProductionEnvelope) error { + if len(envelope.RoutePath) < 2 { + return ErrInvalidRoutePath + } + if envelope.RoutePath[0] != envelope.SourceNodeID || envelope.RoutePath[len(envelope.RoutePath)-1] != envelope.DestinationNodeID { + return ErrInvalidRoutePath + } + currentIndex := -1 + seen := map[string]struct{}{} + for index, nodeID := range envelope.RoutePath { + if nodeID == "" { + return ErrInvalidRoutePath + 
} + if _, duplicate := seen[nodeID]; duplicate { + return ErrLoopDetected + } + seen[nodeID] = struct{}{} + if nodeID == local.NodeID { + currentIndex = index + } + } + if currentIndex < 0 || envelope.CurrentHopNodeID != local.NodeID { + return ErrNodeMismatch + } + if containsProductionNodeID(envelope.VisitedNodeIDs, local.NodeID) { + return ErrLoopDetected + } + for _, visitedNodeID := range envelope.VisitedNodeIDs { + if visitedNodeID == "" || !containsProductionNodeID(envelope.RoutePath, visitedNodeID) { + return ErrInvalidRoutePath + } + } + if envelope.DestinationNodeID == local.NodeID { + if envelope.NextHopNodeID != local.NodeID { + return ErrInvalidRoutePath + } + return nil + } + if currentIndex >= len(envelope.RoutePath)-1 { + return ErrInvalidRoutePath + } + if envelope.NextHopNodeID != envelope.RoutePath[currentIndex+1] { + return ErrInvalidRoutePath + } + return nil +} + +func containsProductionNodeID(values []string, needle string) bool { + for _, value := range values { + if value == needle { + return true + } + } + return false +} + +func NewProductionEnvelopeObservation(envelope ProductionEnvelope, observedAt time.Time) ProductionEnvelopeObservation { + return ProductionEnvelopeObservation{ + MessageID: envelope.MessageID, + RouteID: envelope.RouteID, + ClusterID: envelope.ClusterID, + SourceNodeID: envelope.SourceNodeID, + DestinationNodeID: envelope.DestinationNodeID, + CurrentHopNodeID: envelope.CurrentHopNodeID, + NextHopNodeID: envelope.NextHopNodeID, + RoutePath: append([]string{}, envelope.RoutePath...), + VisitedNodeIDs: append([]string{}, envelope.VisitedNodeIDs...), + ChannelClass: envelope.ChannelClass, + MessageType: envelope.MessageType, + TTL: envelope.TTL, + HopCount: envelope.HopCount, + PayloadLength: envelope.PayloadLength, + PayloadHash: envelope.PayloadHash, + ObservedAt: observedAt.UTC(), + } +} diff --git a/agents/rap-node-agent/internal/mesh/production_observation_sink.go 
b/agents/rap-node-agent/internal/mesh/production_observation_sink.go new file mode 100644 index 0000000..ecbdfbb --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/production_observation_sink.go @@ -0,0 +1,81 @@ +package mesh + +import ( + "context" + "sync" +) + +type ProductionEnvelopeObservationSink struct { + mu sync.Mutex + capacity int + items []ProductionEnvelopeObservation + accepted uint64 + dropped uint64 +} + +type ProductionEnvelopeObservationSinkMetrics struct { + Capacity int `json:"capacity"` + CurrentDepth int `json:"current_depth"` + AcceptedTotal uint64 `json:"accepted_total"` + DroppedOldest uint64 `json:"dropped_oldest"` +} + +func NewProductionEnvelopeObservationSink(capacity int) *ProductionEnvelopeObservationSink { + if capacity < 1 { + capacity = 1 + } + return &ProductionEnvelopeObservationSink{ + capacity: capacity, + items: make([]ProductionEnvelopeObservation, 0, capacity), + } +} + +func (s *ProductionEnvelopeObservationSink) Observe(_ context.Context, observation ProductionEnvelopeObservation) error { + s.mu.Lock() + defer s.mu.Unlock() + + s.accepted++ + if len(s.items) == s.capacity { + copy(s.items, s.items[1:]) + s.items[len(s.items)-1] = observation + s.dropped++ + return nil + } + s.items = append(s.items, observation) + return nil +} + +func (s *ProductionEnvelopeObservationSink) Snapshot() []ProductionEnvelopeObservation { + s.mu.Lock() + defer s.mu.Unlock() + + out := make([]ProductionEnvelopeObservation, len(s.items)) + copy(out, s.items) + return out +} + +func (s *ProductionEnvelopeObservationSink) Len() int { + s.mu.Lock() + defer s.mu.Unlock() + + return len(s.items) +} + +func (s *ProductionEnvelopeObservationSink) Capacity() int { + s.mu.Lock() + defer s.mu.Unlock() + + return s.capacity +} + +func (s *ProductionEnvelopeObservationSink) Metrics() ProductionEnvelopeObservationSinkMetrics { + s.mu.Lock() + defer s.mu.Unlock() + + return ProductionEnvelopeObservationSinkMetrics{ + Capacity: s.capacity, + CurrentDepth: 
len(s.items), + AcceptedTotal: s.accepted, + DroppedOldest: s.dropped, + } +} diff --git a/agents/rap-node-agent/internal/mesh/production_route_config.go b/agents/rap-node-agent/internal/mesh/production_route_config.go new file mode 100644 index 0000000..3a4c9f0 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/production_route_config.go @@ -0,0 +1,80 @@ +package mesh + +import ( + "fmt" + "time" +) + +func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope ProductionEnvelope, routes []SyntheticRoute, now time.Time) error { + if len(routes) == 0 { + return nil + } + route, ok := productionRouteByID(routes, envelope.RouteID) + if !ok { + return ErrRouteNotFound + } + if route.ClusterID != envelope.ClusterID || route.ClusterID != local.ClusterID { + return ErrClusterMismatch + } + if route.SourceNodeID != envelope.SourceNodeID || route.DestinationNodeID != envelope.DestinationNodeID { + return ErrInvalidRoutePath + } + if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) { + return ErrRouteExpired + } + if !contains(route.AllowedChannels, ProductionChannelFabricControl) { + return ErrUnauthorizedChannel + } + path := routePath(route) + if len(path) < 2 || path[0] != route.SourceNodeID || path[len(path)-1] != route.DestinationNodeID { + return ErrInvalidRoutePath + } + if len(envelope.RoutePath) > 0 && !sameNodePath(envelope.RoutePath, path) { + return ErrInvalidRoutePath + } + if len(path) > 2 && len(envelope.RoutePath) == 0 { + return ErrInvalidRoutePath + } + currentIndex := indexOf(path, local.NodeID) + if currentIndex < 0 || envelope.CurrentHopNodeID != local.NodeID { + return ErrNodeMismatch + } + expectedNextHop := local.NodeID + if local.NodeID != envelope.DestinationNodeID { + if currentIndex >= len(path)-1 { + return ErrInvalidRoutePath + } + expectedNextHop = path[currentIndex+1] + } + if envelope.NextHopNodeID != expectedNextHop { + return ErrInvalidRoutePath + } + if 
route.MaxTTL > 0 && envelope.TTL > route.MaxTTL { + return fmt.Errorf("%w: ttl exceeds configured route max_ttl", ErrForwardEnvelopeInvalid) + } + if route.MaxHops > 0 && envelope.HopCount > route.MaxHops { + return fmt.Errorf("%w: hop_count exceeds configured route max_hops", ErrForwardEnvelopeInvalid) + } + return nil +} + +func productionRouteByID(routes []SyntheticRoute, routeID string) (SyntheticRoute, bool) { + for _, route := range routes { + if route.RouteID == routeID { + return route, true + } + } + return SyntheticRoute{}, false +} + +func sameNodePath(a []string, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} diff --git a/agents/rap-node-agent/internal/mesh/production_transport.go b/agents/rap-node-agent/internal/mesh/production_transport.go new file mode 100644 index 0000000..5c0ba2a --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/production_transport.go @@ -0,0 +1,43 @@ +package mesh + +import ( + "context" + "net/http" + "strings" +) + +type ProductionForwardTransport interface { + SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) +} + +type HTTPProductionForwardTransport struct { + PeerURLs map[string]string + HTTPClient *http.Client +} + +func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport { + normalized := make(map[string]string, len(peerURLs)) + for nodeID, baseURL := range peerURLs { + nodeID = strings.TrimSpace(nodeID) + baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/") + if nodeID != "" && baseURL != "" { + normalized[nodeID] = baseURL + } + } + return &HTTPProductionForwardTransport{PeerURLs: normalized} +} + +func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) { + if t == nil { + return ProductionForwardResult{}, 
ErrForwardPeerUnavailable + } + baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/") + if baseURL == "" { + return ProductionForwardResult{}, ErrForwardPeerUnavailable + } + client := NewClient(baseURL) + if t.HTTPClient != nil { + client.HTTPClient = t.HTTPClient + } + return client.SendProduction(ctx, envelope) +} diff --git a/agents/rap-node-agent/internal/mesh/scoped_config.go b/agents/rap-node-agent/internal/mesh/scoped_config.go new file mode 100644 index 0000000..f446403 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/scoped_config.go @@ -0,0 +1,241 @@ +package mesh + +import ( + "encoding/json" + "fmt" + "os" + "strings" + "time" +) + +type ScopedSyntheticConfig struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + ConfigVersion string `json:"config_version,omitempty"` + PeerDirectoryVersion string `json:"peer_directory_version,omitempty"` + PolicyVersion string `json:"policy_version,omitempty"` + PeerEndpoints map[string]string `json:"peer_endpoints"` + PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"` + PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"` + RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"` + RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"` + Routes []SyntheticRoute `json:"routes"` +} + +type PeerDirectoryEntry struct { + NodeID string `json:"node_id"` + RouteIDs []string `json:"route_ids,omitempty"` + EndpointCount int `json:"endpoint_count"` + CandidateCount int `json:"candidate_count"` + ConnectivityModes []string `json:"connectivity_modes,omitempty"` + RecoverySeed bool `json:"recovery_seed"` +} + +type PeerRecoverySeed struct { + NodeID string `json:"node_id"` + Endpoint string `json:"endpoint"` + Transport string `json:"transport"` + ConnectivityMode string `json:"connectivity_mode,omitempty"` + Region 
string `json:"region,omitempty"` + Priority int `json:"priority"` + LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +type PeerRendezvousLease struct { + LeaseID string `json:"lease_id"` + PeerNodeID string `json:"peer_node_id"` + RelayNodeID string `json:"relay_node_id"` + RelayEndpoint string `json:"relay_endpoint"` + Transport string `json:"transport"` + ConnectivityMode string `json:"connectivity_mode,omitempty"` + RouteIDs []string `json:"route_ids,omitempty"` + AllowedChannels []string `json:"allowed_channels,omitempty"` + Priority int `json:"priority"` + ControlPlaneOnly bool `json:"control_plane_only"` + IssuedAt time.Time `json:"issued_at"` + ExpiresAt time.Time `json:"expires_at"` + Reason string `json:"reason,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +type PeerEndpointCandidate struct { + EndpointID string `json:"endpoint_id"` + NodeID string `json:"node_id"` + Transport string `json:"transport"` + Address string `json:"address"` + AddressFamily string `json:"address_family,omitempty"` + Reachability string `json:"reachability"` + NATType string `json:"nat_type,omitempty"` + ConnectivityMode string `json:"connectivity_mode"` + Region string `json:"region,omitempty"` + Priority int `json:"priority"` + PolicyTags []string `json:"policy_tags,omitempty"` + LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +func LoadScopedSyntheticConfig(path string, local PeerIdentity) (ScopedSyntheticConfig, error) { + payload, err := os.ReadFile(path) + if err != nil { + return ScopedSyntheticConfig{}, err + } + var cfg ScopedSyntheticConfig + if err := json.Unmarshal(payload, &cfg); err != nil { + return ScopedSyntheticConfig{}, fmt.Errorf("parse scoped synthetic mesh config: %w", err) + } + if err := cfg.Validate(local); err != nil { + return ScopedSyntheticConfig{}, err + } + return cfg, nil +} + 
+func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error { + if cfg.SchemaVersion == "" { + return fmt.Errorf("scoped synthetic mesh config schema_version is required") + } + if cfg.ClusterID == "" || cfg.ClusterID != local.ClusterID { + return ErrClusterMismatch + } + if cfg.LocalNodeID == "" || cfg.LocalNodeID != local.NodeID { + return ErrNodeMismatch + } + for nodeID, endpoint := range cfg.PeerEndpoints { + if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" { + return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint") + } + } + for nodeID, candidates := range cfg.PeerEndpointCandidates { + if strings.TrimSpace(nodeID) == "" { + return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint candidate node") + } + for _, candidate := range candidates { + if strings.TrimSpace(candidate.EndpointID) == "" || + strings.TrimSpace(candidate.NodeID) == "" || + candidate.NodeID != nodeID || + strings.TrimSpace(candidate.Transport) == "" || + strings.TrimSpace(candidate.Address) == "" || + strings.TrimSpace(candidate.Reachability) == "" || + strings.TrimSpace(candidate.ConnectivityMode) == "" { + return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate") + } + } + } + if err := validatePeerDirectory(cfg.PeerDirectory, cfg.LocalNodeID); err != nil { + return err + } + if err := validateRecoverySeeds(cfg.RecoverySeeds); err != nil { + return err + } + if err := validateRendezvousLeases(cfg.RendezvousLeases, cfg.Routes, cfg.LocalNodeID); err != nil { + return err + } + for _, route := range cfg.Routes { + if route.ClusterID != cfg.ClusterID { + return ErrClusterMismatch + } + path := routePath(route) + if len(path) < 2 { + return ErrInvalidRoutePath + } + if !contains(path, cfg.LocalNodeID) { + return ErrNodeMismatch + } + if route.ExpiresAt.IsZero() { + return fmt.Errorf("scoped synthetic route %q expires_at is required", route.RouteID) + } + if 
!route.ExpiresAt.After(time.Now().UTC()) { + return ErrRouteExpired + } + } + return nil +} + +func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) error { + seen := map[string]struct{}{} + for _, entry := range entries { + nodeID := strings.TrimSpace(entry.NodeID) + if nodeID == "" || nodeID == localNodeID { + return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory entry") + } + if _, duplicate := seen[nodeID]; duplicate { + return fmt.Errorf("scoped synthetic mesh config contains duplicate peer directory entry") + } + seen[nodeID] = struct{}{} + if entry.EndpointCount < 0 || entry.CandidateCount < 0 { + return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory count") + } + } + return nil +} + +func validateRecoverySeeds(seeds []PeerRecoverySeed) error { + if len(seeds) > 20 { + return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds") + } + seen := map[string]struct{}{} + for _, seed := range seeds { + key := strings.TrimSpace(seed.NodeID) + "\x00" + strings.TrimSpace(seed.Endpoint) + if strings.TrimSpace(seed.NodeID) == "" || + strings.TrimSpace(seed.Endpoint) == "" || + strings.TrimSpace(seed.Transport) == "" { + return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed") + } + if _, duplicate := seen[key]; duplicate { + return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed") + } + seen[key] = struct{}{} + } + return nil +} + +func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRoute, localNodeID string) error { + if len(leases) > 20 { + return fmt.Errorf("scoped synthetic mesh config contains too many rendezvous leases") + } + routesByID := map[string]SyntheticRoute{} + for _, route := range routes { + if strings.TrimSpace(route.RouteID) != "" { + routesByID[route.RouteID] = route + } + } + seen := map[string]struct{}{} + now := time.Now().UTC() + for _, lease := range leases { + if 
strings.TrimSpace(lease.LeaseID) == "" || + strings.TrimSpace(lease.PeerNodeID) == "" || + strings.TrimSpace(lease.RelayNodeID) == "" || + strings.TrimSpace(lease.RelayEndpoint) == "" || + strings.TrimSpace(lease.Transport) == "" || + lease.PeerNodeID == lease.RelayNodeID || + !lease.ControlPlaneOnly || + lease.ExpiresAt.IsZero() || + !lease.ExpiresAt.After(now) || + (len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) { + return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease") + } + if _, duplicate := seen[lease.LeaseID]; duplicate { + return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease") + } + seen[lease.LeaseID] = struct{}{} + if len(lease.RouteIDs) == 0 { + continue + } + visible := false + for _, routeID := range lease.RouteIDs { + route, ok := routesByID[routeID] + if !ok { + return fmt.Errorf("scoped synthetic mesh config contains rendezvous lease for unknown route") + } + path := routePath(route) + if contains(path, localNodeID) && contains(path, lease.PeerNodeID) && contains(path, lease.RelayNodeID) { + visible = true + } + } + if !visible { + return fmt.Errorf("scoped synthetic mesh config contains out-of-scope rendezvous lease") + } + } + return nil +} diff --git a/agents/rap-node-agent/internal/mesh/scoped_config_test.go b/agents/rap-node-agent/internal/mesh/scoped_config_test.go new file mode 100644 index 0000000..6809ec8 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/scoped_config_test.go @@ -0,0 +1,235 @@ +package mesh + +import ( + "encoding/json" + "errors" + "os" + "path/filepath" + "testing" + "time" +) + +func TestLoadScopedSyntheticConfig(t *testing.T) { + expiresAt := time.Now().UTC().Add(time.Hour) + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + ConfigVersion: "config-v1", + PeerDirectoryVersion: "peers-v1", + PolicyVersion: "policy-v1", + PeerEndpoints: 
map[string]string{"node-b": "http://127.0.0.1:19002"}, + PeerEndpointCandidates: map[string][]PeerEndpointCandidate{ + "node-b": { + { + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "restricted", + ConnectivityMode: "direct", + Priority: 10, + }, + }, + }, + PeerDirectory: []PeerDirectoryEntry{ + { + NodeID: "node-b", + RouteIDs: []string{"route-a-b"}, + EndpointCount: 1, + CandidateCount: 1, + ConnectivityModes: []string{"direct"}, + RecoverySeed: true, + }, + }, + RecoverySeeds: []PeerRecoverySeed{ + { + NodeID: "node-b", + Endpoint: "https://node-b.example.test:443", + Transport: "direct_tcp_tls", + ConnectivityMode: "direct", + Priority: 10, + }, + }, + RendezvousLeases: []PeerRendezvousLease{ + { + LeaseID: "lease-node-b-via-node-r", + PeerNodeID: "node-b", + RelayNodeID: "node-r", + RelayEndpoint: "http://node-r:19000", + Transport: "relay_control", + ConnectivityMode: "relay_required", + RouteIDs: []string{"route-a-b"}, + AllowedChannels: []string{"fabric_control", "route_control"}, + Priority: 10, + ControlPlaneOnly: true, + IssuedAt: expiresAt.Add(-time.Minute), + ExpiresAt: expiresAt, + }, + }, + Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})}, + }) + + cfg, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if err != nil { + t.Fatalf("load scoped config: %v", err) + } + if cfg.ConfigVersion != "config-v1" || cfg.PeerEndpoints["node-b"] == "" || len(cfg.Routes) != 1 { + t.Fatalf("unexpected config: %+v", cfg) + } + if got := cfg.PeerEndpointCandidates["node-b"]; len(got) != 1 || got[0].EndpointID != "node-b-public" { + t.Fatalf("unexpected endpoint candidates: %+v", cfg.PeerEndpointCandidates) + } + if len(cfg.PeerDirectory) != 1 || cfg.PeerDirectory[0].NodeID != "node-b" || !cfg.PeerDirectory[0].RecoverySeed { + t.Fatalf("unexpected peer directory: %+v", 
cfg.PeerDirectory) + } + if len(cfg.RecoverySeeds) != 1 || cfg.RecoverySeeds[0].NodeID != "node-b" { + t.Fatalf("unexpected recovery seeds: %+v", cfg.RecoverySeeds) + } + if len(cfg.RendezvousLeases) != 1 || cfg.RendezvousLeases[0].RelayNodeID != "node-r" { + t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases) + } +} + +func TestLoadScopedSyntheticConfigRejectsWrongCluster(t *testing.T) { + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-2", + LocalNodeID: "node-a", + Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})}, + }) + + _, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if !errors.Is(err, ErrClusterMismatch) { + t.Fatalf("err = %v, want ErrClusterMismatch", err) + } +} + +func TestLoadScopedSyntheticConfigRejectsWrongNode(t *testing.T) { + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-x", + Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})}, + }) + + _, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if !errors.Is(err, ErrNodeMismatch) { + t.Fatalf("err = %v, want ErrNodeMismatch", err) + } +} + +func TestLoadScopedSyntheticConfigRejectsExpiredRoute(t *testing.T) { + route := liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"}) + route.ExpiresAt = time.Now().UTC().Add(-time.Minute) + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + Routes: []SyntheticRoute{route}, + }) + + _, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if !errors.Is(err, ErrRouteExpired) { + t.Fatalf("err = %v, want ErrRouteExpired", err) + } +} + +func 
TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing.T) { + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + PeerEndpointCandidates: map[string][]PeerEndpointCandidate{ + "node-b": { + { + EndpointID: "node-b-public", + NodeID: "node-c", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + ConnectivityMode: "direct", + }, + }, + }, + Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})}, + }) + + _, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if err == nil { + t.Fatal("expected invalid peer endpoint candidate error") + } +} + +func TestLoadScopedSyntheticConfigRejectsInvalidPeerDirectory(t *testing.T) { + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + PeerDirectory: []PeerDirectoryEntry{ + {NodeID: "node-a"}, + }, + Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})}, + }) + + _, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if err == nil { + t.Fatal("expected invalid peer directory error") + } +} + +func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) { + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17f.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + RecoverySeeds: []PeerRecoverySeed{ + {NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"}, + }, + Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})}, + }) + + _, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if err == nil { + t.Fatal("expected invalid recovery seed error") + } +} + +func 
TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) { + path := writeScopedConfig(t, ScopedSyntheticConfig{ + SchemaVersion: "c17z12.synthetic.v1", + ClusterID: "cluster-1", + LocalNodeID: "node-a", + RendezvousLeases: []PeerRendezvousLease{ + { + LeaseID: "lease-node-b-via-node-r", + PeerNodeID: "node-b", + RelayNodeID: "node-r", + RelayEndpoint: "http://node-r:19000", + Transport: "relay_control", + RouteIDs: []string{"route-a-b"}, + ExpiresAt: time.Now().UTC().Add(time.Hour), + }, + }, + Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})}, + }) + + _, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}) + if err == nil { + t.Fatal("expected invalid rendezvous lease error") + } +} + +func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string { + t.Helper() + payload, err := json.Marshal(cfg) + if err != nil { + t.Fatalf("marshal config: %v", err) + } + path := filepath.Join(t.TempDir(), "mesh-config.json") + if err := os.WriteFile(path, payload, 0o600); err != nil { + t.Fatalf("write config: %v", err) + } + return path +} diff --git a/agents/rap-node-agent/internal/mesh/server.go b/agents/rap-node-agent/internal/mesh/server.go new file mode 100644 index 0000000..b0f8b45 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/server.go @@ -0,0 +1,291 @@ +package mesh + +import ( + "context" + "encoding/json" + "net/http" + "time" +) + +type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error +type ProductionForwardLogger func(ProductionForwardLogEntry) + +type Server struct { + Local PeerIdentity + SyntheticRuntime *SyntheticRuntime + ProductionForwardingEnabled bool + ProductionEnvelopeObserver ProductionEnvelopeObserver + ProductionForwardTransport ProductionForwardTransport + ProductionForwardLogger ProductionForwardLogger + ProductionRoutes []SyntheticRoute +} + +func (s Server) Handler() http.Handler { + 
mux := http.NewServeMux() + mux.HandleFunc("/mesh/v1/health", s.handleHealth) + mux.HandleFunc("/mesh/v1/forward", s.handleForward) + mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe) + return mux +} + +func (s Server) handleHealth(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + var message HealthMessage + if err := json.NewDecoder(r.Body).Decode(&message); err != nil { + http.Error(w, "invalid health message", http.StatusBadRequest) + return + } + if message.ProtocolVersion != ProtocolVersion { + http.Error(w, "unsupported mesh protocol version", http.StatusBadRequest) + return + } + if err := ValidatePeer(s.Local, message.From); err != nil { + http.Error(w, err.Error(), http.StatusForbidden) + return + } + if message.To.NodeID != "" && message.To.NodeID != s.Local.NodeID { + http.Error(w, ErrNodeMismatch.Error(), http.StatusForbidden) + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(HealthAck{ + ProtocolVersion: ProtocolVersion, + Accepted: true, + By: s.Local, + }) +} + +func (s Server) handleForward(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + if !s.ProductionForwardingEnabled { + s.logProductionForward(ProductionForwardLogEntry{ + Event: "production_forward_rejected", + ClusterID: s.Local.ClusterID, + LocalNodeID: s.Local.NodeID, + Reason: ErrForwardDisabled.Error(), + StatusCode: http.StatusNotImplemented, + OccurredAt: time.Now().UTC(), + }) + http.Error(w, ErrForwardDisabled.Error(), http.StatusNotImplemented) + return + } + var envelope ProductionEnvelope + if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil { + s.logProductionForward(ProductionForwardLogEntry{ + Event: "production_forward_rejected", + ClusterID: s.Local.ClusterID, + LocalNodeID: s.Local.NodeID, + Reason: "invalid production mesh 
envelope", + StatusCode: http.StatusBadRequest, + OccurredAt: time.Now().UTC(), + }) + http.Error(w, "invalid production mesh envelope", http.StatusBadRequest) + return + } + if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil { + s.rejectProductionForward(w, envelope, err, forwardStatusCode(err)) + return + } + if err := ValidateProductionEnvelopeRouteConfig(s.Local, envelope, s.ProductionRoutes, time.Now().UTC()); err != nil { + s.rejectProductionForward(w, envelope, err, forwardStatusCode(err)) + return + } + s.logProductionForward(productionForwardLogEntry("production_forward_accepted", s.Local, envelope, "", 0)) + if s.ProductionEnvelopeObserver != nil { + observation := NewProductionEnvelopeObservation(envelope, time.Now().UTC()) + if err := observeProductionEnvelope(r.Context(), s.ProductionEnvelopeObserver, observation); err != nil { + s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardObservationFailed.Error(), http.StatusInternalServerError)) + http.Error(w, ErrForwardObservationFailed.Error(), http.StatusInternalServerError) + return + } + } + if envelope.DestinationNodeID == s.Local.NodeID { + s.logProductionForward(productionForwardLogEntry("production_forward_delivered", s.Local, envelope, "", http.StatusOK)) + writeProductionForwardResult(w, ProductionForwardResult{ + Accepted: true, + Delivered: true, + By: s.Local, + MessageID: envelope.MessageID, + RouteID: envelope.RouteID, + }) + return + } + if envelope.NextHopNodeID == s.Local.NodeID { + s.rejectProductionForward(w, envelope, ErrLoopDetected, forwardStatusCode(ErrLoopDetected)) + return + } + if len(envelope.RoutePath) == 0 && envelope.NextHopNodeID != envelope.DestinationNodeID { + s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented) + return + } + if s.ProductionForwardTransport == nil { + s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, 
http.StatusNotImplemented) + return + } + if envelope.TTL <= 1 { + s.rejectProductionForward(w, envelope, ErrTTLExhausted, forwardStatusCode(ErrTTLExhausted)) + return + } + forwarded := envelope + forwarded.CurrentHopNodeID = envelope.NextHopNodeID + forwarded.NextHopNodeID = nextProductionHopAfter(envelope.RoutePath, envelope.NextHopNodeID, envelope.DestinationNodeID) + forwarded.TTL = envelope.TTL - 1 + forwarded.HopCount = envelope.HopCount + 1 + forwarded.VisitedNodeIDs = append(append([]string{}, envelope.VisitedNodeIDs...), s.Local.NodeID) + result, err := s.ProductionForwardTransport.SendProduction(r.Context(), envelope.NextHopNodeID, forwarded) + if err != nil { + s.rejectProductionForward(w, envelope, err, forwardStatusCode(err)) + return + } + s.logProductionForward(productionForwardLogEntry("production_forward_forwarded", s.Local, envelope, "", http.StatusOK)) + result.Accepted = true + result.Forwarded = true + result.By = s.Local + result.MessageID = envelope.MessageID + result.RouteID = envelope.RouteID + result.NextNodeID = envelope.NextHopNodeID + writeProductionForwardResult(w, result) +} + +func (s Server) rejectProductionForward(w http.ResponseWriter, envelope ProductionEnvelope, err error, statusCode int) { + s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, err.Error(), statusCode)) + http.Error(w, err.Error(), statusCode) +} + +func (s Server) logProductionForward(entry ProductionForwardLogEntry) { + if s.ProductionForwardLogger == nil { + return + } + if entry.OccurredAt.IsZero() { + entry.OccurredAt = time.Now().UTC() + } + s.ProductionForwardLogger(entry) +} + +func productionForwardLogEntry(event string, local PeerIdentity, envelope ProductionEnvelope, reason string, statusCode int) ProductionForwardLogEntry { + return ProductionForwardLogEntry{ + Event: event, + RouteID: envelope.RouteID, + MessageID: envelope.MessageID, + ClusterID: envelope.ClusterID, + LocalNodeID: local.NodeID, + 
SourceNodeID: envelope.SourceNodeID, + DestinationNodeID: envelope.DestinationNodeID, + CurrentHopNodeID: envelope.CurrentHopNodeID, + NextHopNodeID: envelope.NextHopNodeID, + ChannelClass: envelope.ChannelClass, + MessageType: envelope.MessageType, + Reason: reason, + StatusCode: statusCode, + TTL: envelope.TTL, + HopCount: envelope.HopCount, + RoutePathLength: len(envelope.RoutePath), + VisitedCount: len(envelope.VisitedNodeIDs), + PayloadLength: envelope.PayloadLength, + OccurredAt: time.Now().UTC(), + } +} + +func nextProductionHopAfter(routePath []string, currentNodeID string, destinationNodeID string) string { + if len(routePath) == 0 { + return destinationNodeID + } + for index, nodeID := range routePath { + if nodeID == currentNodeID { + if index >= len(routePath)-1 { + return currentNodeID + } + return routePath[index+1] + } + } + return destinationNodeID +} + +func writeProductionForwardResult(w http.ResponseWriter, result ProductionForwardResult) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(result) +} + +func observeProductionEnvelope(ctx context.Context, observer ProductionEnvelopeObserver, observation ProductionEnvelopeObservation) (err error) { + if observer == nil { + return nil + } + defer func() { + if recover() != nil { + err = ErrForwardObservationFailed + } + }() + return observer(ctx, observation) +} + +func (s Server) handleSyntheticProbe(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + w.WriteHeader(http.StatusMethodNotAllowed) + return + } + if s.SyntheticRuntime == nil { + http.Error(w, ErrMeshRuntimeDisabled.Error(), http.StatusServiceUnavailable) + return + } + var envelope SyntheticEnvelope + if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil { + http.Error(w, "invalid synthetic mesh envelope", http.StatusBadRequest) + return + } + ack, err := s.SyntheticRuntime.Receive(r.Context(), envelope) + if err != nil { + http.Error(w, err.Error(), 
syntheticStatusCode(err)) + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(ack) +} + +func NewHealthMessage(from, to PeerIdentity) HealthMessage { + status := "reachable" + return HealthMessage{ + ProtocolVersion: ProtocolVersion, + From: from, + To: to, + ObservedAt: time.Now().UTC(), + LinkStatus: status, + } +} + +func syntheticStatusCode(err error) int { + switch err { + case ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected: + return http.StatusForbidden + case ErrMeshRuntimeDisabled: + return http.StatusServiceUnavailable + case ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrUnsupportedSyntheticMessage, ErrRouteIDRequired: + return http.StatusBadRequest + case ErrRouteNotFound, ErrSyntheticPeerUnavailable: + return http.StatusNotFound + default: + return http.StatusBadRequest + } +} + +func forwardStatusCode(err error) int { + switch err { + case ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected: + return http.StatusForbidden + case ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrRouteIDRequired: + return http.StatusBadRequest + case ErrForwardRuntimeUnavailable: + return http.StatusNotImplemented + case ErrRouteNotFound: + return http.StatusNotFound + case ErrForwardPeerUnavailable: + return http.StatusBadGateway + default: + return http.StatusBadRequest + } +} diff --git a/agents/rap-node-agent/internal/mesh/server_test.go b/agents/rap-node-agent/internal/mesh/server_test.go new file mode 100644 index 0000000..2f2e663 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/server_test.go @@ -0,0 +1,802 @@ +package mesh + +import ( + "bytes" + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestMeshHealthAcceptsSameCluster(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := 
httptest.NewServer(Server{Local: local}.Handler()) + defer server.Close() + + client := NewClient(server.URL) + ack, err := client.SendHealth(context.Background(), NewHealthMessage( + PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}, + local, + )) + if err != nil { + t.Fatalf("send health: %v", err) + } + if !ack.Accepted || ack.By.NodeID != "node-b" { + t.Fatalf("unexpected ack: %+v", ack) + } +} + +func TestMeshHealthRejectsClusterMismatch(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{Local: local}.Handler()) + defer server.Close() + + message := NewHealthMessage(PeerIdentity{ClusterID: "cluster-2", NodeID: "node-a"}, local) + payload, err := json.Marshal(message) + if err != nil { + t.Fatalf("marshal message: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/health", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post health: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden) + } +} + +func TestMeshForwardingDisabled(t *testing.T) { + server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler()) + defer server.Close() + + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/octet-stream", bytes.NewReader([]byte("payload"))) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusNotImplemented { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented) + } +} + +func TestMeshForwardingGateEnabledStillHasNoProductionRuntime(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + }.Handler()) + defer server.Close() + + payload, err := json.Marshal(validProductionEnvelope(local)) 
+ if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusNotImplemented { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented) + } +} + +func TestMeshForwardingGateDeliversFabricControlAtDestination(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"} + var events []ProductionForwardLogEntry + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionForwardLogger: func(entry ProductionForwardLogEntry) { + events = append(events, entry) + }, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.SourceNodeID = "node-a" + envelope.DestinationNodeID = local.NodeID + envelope.CurrentHopNodeID = local.NodeID + envelope.NextHopNodeID = local.NodeID + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK) + } + var result ProductionForwardResult + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + t.Fatalf("decode result: %v", err) + } + if !result.Accepted || !result.Delivered || result.Forwarded || result.By.NodeID != local.NodeID { + t.Fatalf("unexpected result: %+v", result) + } + if !hasProductionForwardEvent(events, "production_forward_accepted") || !hasProductionForwardEvent(events, "production_forward_delivered") { + t.Fatalf("missing production forward events: %+v", events) + } +} + +func TestMeshForwardingGateForwardsDirectFabricControlToNextHop(t 
*testing.T) { + nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"} + var deliveredObservation ProductionEnvelopeObservation + serverC := httptest.NewServer(Server{ + Local: nodeC, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error { + deliveredObservation = observation + return nil + }, + }.Handler()) + defer serverC.Close() + + nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + serverB := httptest.NewServer(Server{ + Local: nodeB, + ProductionForwardingEnabled: true, + ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{ + nodeC.NodeID: serverC.URL, + }), + }.Handler()) + defer serverB.Close() + + envelope := validProductionEnvelope(nodeB) + envelope.SourceNodeID = "node-a" + envelope.DestinationNodeID = nodeC.NodeID + envelope.CurrentHopNodeID = nodeB.NodeID + envelope.NextHopNodeID = nodeC.NodeID + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK) + } + var result ProductionForwardResult + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + t.Fatalf("decode result: %v", err) + } + if !result.Accepted || !result.Forwarded || !result.Delivered || result.NextNodeID != nodeC.NodeID || result.By.NodeID != nodeB.NodeID { + t.Fatalf("unexpected forward result: %+v", result) + } + if deliveredObservation.CurrentHopNodeID != nodeC.NodeID || deliveredObservation.MessageID != envelope.MessageID { + t.Fatalf("destination did not observe forwarded envelope: %+v", deliveredObservation) + } +} + +func TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath(t *testing.T) { 
+ nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"} + var deliveredObservation ProductionEnvelopeObservation + var nodeREvents []ProductionForwardLogEntry + var nodeBEvents []ProductionForwardLogEntry + serverC := httptest.NewServer(Server{ + Local: nodeC, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error { + deliveredObservation = observation + return nil + }, + }.Handler()) + defer serverC.Close() + + nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"} + serverR := httptest.NewServer(Server{ + Local: nodeR, + ProductionForwardingEnabled: true, + ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{ + nodeC.NodeID: serverC.URL, + }), + ProductionForwardLogger: func(entry ProductionForwardLogEntry) { + nodeREvents = append(nodeREvents, entry) + }, + }.Handler()) + defer serverR.Close() + + nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + serverB := httptest.NewServer(Server{ + Local: nodeB, + ProductionForwardingEnabled: true, + ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{ + nodeR.NodeID: serverR.URL, + }), + ProductionForwardLogger: func(entry ProductionForwardLogEntry) { + nodeBEvents = append(nodeBEvents, entry) + }, + }.Handler()) + defer serverB.Close() + + envelope := validProductionEnvelope(nodeB) + envelope.SourceNodeID = "node-a" + envelope.DestinationNodeID = nodeC.NodeID + envelope.CurrentHopNodeID = nodeB.NodeID + envelope.NextHopNodeID = nodeR.NodeID + envelope.RoutePath = []string{"node-a", nodeB.NodeID, nodeR.NodeID, nodeC.NodeID} + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + 
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK) + } + var result ProductionForwardResult + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + t.Fatalf("decode result: %v", err) + } + if !result.Accepted || !result.Forwarded || !result.Delivered || result.NextNodeID != nodeR.NodeID || result.By.NodeID != nodeB.NodeID { + t.Fatalf("unexpected multi-hop result: %+v", result) + } + if deliveredObservation.CurrentHopNodeID != nodeC.NodeID || deliveredObservation.NextHopNodeID != nodeC.NodeID { + t.Fatalf("destination did not observe final hop: %+v", deliveredObservation) + } + if len(deliveredObservation.VisitedNodeIDs) != 2 || deliveredObservation.VisitedNodeIDs[0] != nodeB.NodeID || deliveredObservation.VisitedNodeIDs[1] != nodeR.NodeID { + t.Fatalf("visited path not propagated: %+v", deliveredObservation.VisitedNodeIDs) + } + if !hasProductionForwardEvent(nodeBEvents, "production_forward_forwarded") || !hasProductionForwardEvent(nodeREvents, "production_forward_forwarded") { + t.Fatalf("missing relay forward events: nodeB=%+v nodeR=%+v", nodeBEvents, nodeREvents) + } +} + +func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) { + nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"} + route := configuredProductionRoute("route-1", []string{"node-a", "node-b", "node-r", nodeC.NodeID}) + var deliveredObservation ProductionEnvelopeObservation + serverC := httptest.NewServer(Server{ + Local: nodeC, + ProductionForwardingEnabled: true, + ProductionRoutes: []SyntheticRoute{route}, + ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error { + deliveredObservation = observation + return nil + }, + }.Handler()) + defer serverC.Close() + + nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"} + serverR := httptest.NewServer(Server{ + Local: nodeR, + ProductionForwardingEnabled: true, + ProductionRoutes: []SyntheticRoute{route}, + ProductionForwardTransport: 
NewHTTPProductionForwardTransport(map[string]string{ + nodeC.NodeID: serverC.URL, + }), + }.Handler()) + defer serverR.Close() + + nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + serverB := httptest.NewServer(Server{ + Local: nodeB, + ProductionForwardingEnabled: true, + ProductionRoutes: []SyntheticRoute{route}, + ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{ + nodeR.NodeID: serverR.URL, + }), + }.Handler()) + defer serverB.Close() + + envelope := validProductionEnvelope(nodeB) + envelope.SourceNodeID = "node-a" + envelope.DestinationNodeID = nodeC.NodeID + envelope.CurrentHopNodeID = nodeB.NodeID + envelope.NextHopNodeID = nodeR.NodeID + envelope.RoutePath = route.Hops + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK) + } + if deliveredObservation.RouteID != route.RouteID || deliveredObservation.CurrentHopNodeID != nodeC.NodeID { + t.Fatalf("configured route was not delivered: %+v", deliveredObservation) + } +} + +func TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionRoutes: []SyntheticRoute{ + configuredProductionRoute("route-other", []string{"node-a", local.NodeID, "node-c"}), + }, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if 
err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusNotFound { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotFound) + } +} + +func TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + route := configuredProductionRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"}) + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionRoutes: []SyntheticRoute{route}, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.SourceNodeID = "node-a" + envelope.DestinationNodeID = "node-c" + envelope.CurrentHopNodeID = local.NodeID + envelope.NextHopNodeID = "node-c" + envelope.RoutePath = route.Hops + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest) + } +} + +func TestMeshForwardingGateRejectsRoutePathWrongNextHop(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + var events []ProductionForwardLogEntry + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionForwardLogger: func(entry ProductionForwardLogEntry) { + events = append(events, entry) + }, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.SourceNodeID = "node-a" + envelope.DestinationNodeID = "node-c" + envelope.CurrentHopNodeID = local.NodeID + envelope.NextHopNodeID = "node-x" + envelope.RoutePath = []string{"node-a", local.NodeID, "node-r", "node-c"} + payload, 
err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest) + } + if !hasProductionForwardEvent(events, "production_forward_rejected") { + t.Fatalf("missing reject event: %+v", events) + } +} + +func TestMeshForwardingGateRejectsRoutePathLoop(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.SourceNodeID = "node-a" + envelope.DestinationNodeID = "node-c" + envelope.CurrentHopNodeID = local.NodeID + envelope.NextHopNodeID = "node-r" + envelope.RoutePath = []string{"node-a", local.NodeID, "node-r", local.NodeID, "node-c"} + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden) + } +} + +func TestMeshForwardingGateRejectsInvalidProductionEnvelope(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.PayloadHash = "bad-hash" + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := 
http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest) + } +} + +func TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + observed := false + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error { + observed = true + return nil + }, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.Payload = json.RawMessage(`"` + string(bytes.Repeat([]byte("a"), MaxProductionEnvelopePayloadBytes+1)) + `"`) + sum := sha256.Sum256(envelope.Payload) + envelope.PayloadLength = len(envelope.Payload) + envelope.PayloadHash = hex.EncodeToString(sum[:]) + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest) + } + if observed { + t.Fatal("observer called for oversized envelope") + } +} + +func TestMeshForwardingGateRejectsFutureCreatedAt(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + observed := false + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error { + observed = true + return nil + }, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + 
envelope.CreatedAt = time.Now().UTC().Add(MaxProductionEnvelopeFutureSkew + time.Second) + envelope.ExpiresAt = envelope.CreatedAt.Add(time.Minute) + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest) + } + if observed { + t.Fatal("observer called for future-created envelope") + } +} + +func TestMeshForwardingGateObservesValidEnvelopeWithoutPayload(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + var observed ProductionEnvelopeObservation + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error { + observed = observation + return nil + }, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusNotImplemented { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented) + } + if observed.MessageID != envelope.MessageID || observed.RouteID != envelope.RouteID { + t.Fatalf("unexpected observation: %+v", observed) + } + if observed.PayloadHash != envelope.PayloadHash || observed.PayloadLength != envelope.PayloadLength { + t.Fatalf("payload metadata missing from observation: %+v", observed) + } +} + +func TestMeshForwardingGateDoesNotObserveRejectedEnvelope(t *testing.T) { + local := 
PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + observed := false + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error { + observed = true + return nil + }, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.ClusterID = "wrong-cluster" + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden) + } + if observed { + t.Fatal("observer called for rejected envelope") + } +} + +func TestMeshForwardingGateFailsClosedWhenObservationFails(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error { + return errors.New("observer down") + }, + }.Handler()) + defer server.Close() + + payload, err := json.Marshal(validProductionEnvelope(local)) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusInternalServerError { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusInternalServerError) + } +} + +func TestMeshForwardingGateFailsClosedWhenObservationPanics(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{ + Local: local, + 
ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error { + panic("observer panic") + }, + }.Handler()) + defer server.Close() + + payload, err := json.Marshal(validProductionEnvelope(local)) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusInternalServerError { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusInternalServerError) + } +} + +func TestObserveProductionEnvelopeAllowsNilObserver(t *testing.T) { + if err := observeProductionEnvelope(context.Background(), nil, ProductionEnvelopeObservation{}); err != nil { + t.Fatalf("observeProductionEnvelope nil observer err = %v", err) + } +} + +func TestProductionEnvelopeObservationSinkKeepsBoundedMetadata(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + sink := NewProductionEnvelopeObservationSink(2) + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + ProductionEnvelopeObserver: sink.Observe, + }.Handler()) + defer server.Close() + + for i := 1; i <= 3; i++ { + envelope := validProductionEnvelope(local) + envelope.MessageID = "message-" + string(rune('0'+i)) + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusNotImplemented { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented) + } + } + + observations := sink.Snapshot() + if len(observations) != 2 { + t.Fatalf("observation count = %d, want 2", len(observations)) + } + if 
observations[0].MessageID != "message-2" || observations[1].MessageID != "message-3" { + t.Fatalf("unexpected bounded observations: %+v", observations) + } + if observations[0].PayloadHash == "" || observations[0].PayloadLength == 0 { + t.Fatalf("payload metadata missing from bounded observation: %+v", observations[0]) + } + metrics := sink.Metrics() + if metrics.Capacity != 2 || metrics.CurrentDepth != 2 || metrics.AcceptedTotal != 3 || metrics.DroppedOldest != 1 { + t.Fatalf("unexpected sink metrics: %+v", metrics) + } +} + +func TestProductionEnvelopeObservationSinkMetricsStartEmpty(t *testing.T) { + sink := NewProductionEnvelopeObservationSink(3) + metrics := sink.Metrics() + if metrics.Capacity != 3 || metrics.CurrentDepth != 0 || metrics.AcceptedTotal != 0 || metrics.DroppedOldest != 0 { + t.Fatalf("unexpected empty metrics: %+v", metrics) + } +} + +func TestMeshForwardingGateRejectsServiceChannel(t *testing.T) { + local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"} + server := httptest.NewServer(Server{ + Local: local, + ProductionForwardingEnabled: true, + }.Handler()) + defer server.Close() + + envelope := validProductionEnvelope(local) + envelope.ChannelClass = "render" + payload, err := json.Marshal(envelope) + if err != nil { + t.Fatalf("marshal envelope: %v", err) + } + resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload)) + if err != nil { + t.Fatalf("post forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusForbidden { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden) + } +} + +func TestMeshForwardingRequiresPost(t *testing.T) { + server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler()) + defer server.Close() + + resp, err := http.Get(server.URL + "/mesh/v1/forward") + if err != nil { + t.Fatalf("get forward: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != 
http.StatusMethodNotAllowed { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusMethodNotAllowed) + } +} + +func validProductionEnvelope(local PeerIdentity) ProductionEnvelope { + payload := json.RawMessage(`{"kind":"control"}`) + sum := sha256.Sum256(payload) + now := time.Now().UTC() + return ProductionEnvelope{ + FabricProtocolVersion: ProtocolVersion, + MessageID: "message-1", + RouteID: "route-1", + ClusterID: local.ClusterID, + SourceNodeID: "node-a", + DestinationNodeID: "node-c", + CurrentHopNodeID: local.NodeID, + NextHopNodeID: "node-c", + ChannelClass: ProductionChannelFabricControl, + MessageType: ProductionMessageFabricControl, + TTL: 4, + HopCount: 1, + CreatedAt: now, + ExpiresAt: now.Add(time.Minute), + PayloadLength: len(payload), + PayloadHash: hex.EncodeToString(sum[:]), + Payload: payload, + } +} + +func configuredProductionRoute(routeID string, hops []string) SyntheticRoute { + return SyntheticRoute{ + RouteID: routeID, + ClusterID: "cluster-1", + SourceNodeID: hops[0], + DestinationNodeID: hops[len(hops)-1], + Hops: append([]string{}, hops...), + AllowedChannels: []string{ProductionChannelFabricControl}, + ExpiresAt: time.Now().UTC().Add(time.Hour), + MaxTTL: 8, + MaxHops: 8, + } +} + +func hasProductionForwardEvent(events []ProductionForwardLogEntry, event string) bool { + for _, item := range events { + if item.Event == event { + return true + } + } + return false +} + +func TestSyntheticEndpointDisabledByDefault(t *testing.T) { + server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler()) + defer server.Close() + + resp, err := http.Post(server.URL+"/mesh/v1/synthetic/probe", "application/json", bytes.NewReader([]byte(`{}`))) + if err != nil { + t.Fatalf("post synthetic probe: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusServiceUnavailable { + t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusServiceUnavailable) + } +} diff --git 
a/agents/rap-node-agent/internal/mesh/synthetic_relay.go b/agents/rap-node-agent/internal/mesh/synthetic_relay.go
new file mode 100644
index 0000000..7624860
--- /dev/null
+++ b/agents/rap-node-agent/internal/mesh/synthetic_relay.go
@@ -0,0 +1,337 @@
+package mesh
+
+import (
+	"sync"
+	"time"
+)
+
+// SyntheticRelaySchedulerConfig carries the settings consumed by
+// NewSyntheticRelayScheduler.
+type SyntheticRelaySchedulerConfig struct {
+	// Enabled gates the scheduler; when false, Enqueue and Dequeue return
+	// ErrMeshRuntimeDisabled.
+	Enabled bool
+	// Local is the identity that accepted envelopes must be addressed to.
+	Local PeerIdentity
+	// QueuePolicies defines one queue per channel; slice order is the
+	// dequeue priority order. When empty, built-in defaults are used.
+	QueuePolicies []SyntheticRelayQueuePolicy
+	// AllowedChannels extends the channel allow-list. A channel still needs
+	// a queue policy before envelopes on it are accepted.
+	AllowedChannels []string
+	// AllowedMessageTypes replaces the default message-type allow-list.
+	AllowedMessageTypes []string
+	// Now supplies log timestamps; defaults to time.Now in UTC.
+	Now func() time.Time
+	// Logger receives scheduler events; nil disables logging.
+	Logger func(SyntheticLogEntry)
+}
+
+// SyntheticRelayScheduler holds per-channel FIFO queues of synthetic
+// envelopes and hands them back in channel priority order. A nil scheduler
+// behaves as a disabled one.
+type SyntheticRelayScheduler struct {
+	enabled             bool
+	local               PeerIdentity
+	policies            map[string]SyntheticRelayQueuePolicy
+	allowedChannels     map[string]struct{}
+	allowedMessageTypes map[string]struct{}
+	priorityOrder       []string
+	now                 func() time.Time
+	logger              func(SyntheticLogEntry)
+
+	// mu guards queues and metrics.
+	mu      sync.Mutex
+	queues  map[string][]SyntheticEnvelope
+	metrics SyntheticRelayQueueMetrics
+}
+
+// NewSyntheticRelayScheduler builds a scheduler from cfg. With no queue
+// policies it falls back to fabric-control and route-control queues
+// (capacity 64, not droppable) and a telemetry queue (capacity 16,
+// droppable). Policies without a channel name are ignored and
+// non-positive capacities are raised to 1.
+func NewSyntheticRelayScheduler(cfg SyntheticRelaySchedulerConfig) *SyntheticRelayScheduler {
+	policies := cfg.QueuePolicies
+	if len(policies) == 0 {
+		policies = []SyntheticRelayQueuePolicy{
+			{Channel: SyntheticChannelFabricControl, Capacity: 64, Droppable: false},
+			{Channel: SyntheticChannelRouteControl, Capacity: 64, Droppable: false},
+			{Channel: SyntheticChannelTelemetry, Capacity: 16, Droppable: true},
+		}
+	}
+	policyMap := make(map[string]SyntheticRelayQueuePolicy, len(policies))
+	allowedChannels := make(map[string]struct{}, len(policies)+len(cfg.AllowedChannels))
+	priorityOrder := make([]string, 0, len(policies))
+	for _, policy := range policies {
+		if policy.Channel == "" {
+			continue
+		}
+		if policy.Capacity <= 0 {
+			policy.Capacity = 1
+		}
+		// A channel listed twice keeps its first priority slot; the later
+		// policy replaces the earlier one.
+		if _, seen := policyMap[policy.Channel]; !seen {
+			priorityOrder = append(priorityOrder, policy.Channel)
+		}
+		policyMap[policy.Channel] = policy
+		allowedChannels[policy.Channel] = struct{}{}
+	}
+	for _, channel := range cfg.AllowedChannels {
+		if channel != "" {
+			allowedChannels[channel] = struct{}{}
+		}
+	}
+	messageTypes := cfg.AllowedMessageTypes
+	if len(messageTypes) == 0 {
+		messageTypes = []string{
+			SyntheticMessageProbe,
+			SyntheticMessageProbeAck,
+			SyntheticMessageRouteHealth,
+			SyntheticMessageRouteHealthAck,
+			SyntheticMessageTelemetry,
+			SyntheticMessageTestService,
+			SyntheticMessageTestServiceAck,
+		}
+	}
+	allowedMessageTypes := make(map[string]struct{}, len(messageTypes))
+	for _, messageType := range messageTypes {
+		if messageType != "" {
+			allowedMessageTypes[messageType] = struct{}{}
+		}
+	}
+	now := cfg.Now
+	if now == nil {
+		now = func() time.Time { return time.Now().UTC() }
+	}
+	return &SyntheticRelayScheduler{
+		enabled:             cfg.Enabled,
+		local:               cfg.Local,
+		policies:            policyMap,
+		allowedChannels:     allowedChannels,
+		allowedMessageTypes: allowedMessageTypes,
+		priorityOrder:       priorityOrder,
+		now:                 now,
+		logger:              cfg.Logger,
+		queues:              map[string][]SyntheticEnvelope{},
+		metrics: SyntheticRelayQueueMetrics{
+			QueueDepths: map[string]int{},
+		},
+	}
+}
+
+// Enqueue validates envelope and appends it to the queue of its channel.
+// Validation failures are counted, logged and returned. When the queue is
+// full, a droppable channel evicts its oldest entry (reported through
+// Dropped and DroppedSequence), while any other channel rejects the
+// envelope with ErrSyntheticRelayQueueFull.
+func (s *SyntheticRelayScheduler) Enqueue(envelope SyntheticEnvelope) (SyntheticRelayEnqueueResult, error) {
+	if err := s.validateEnvelope(envelope); err != nil {
+		s.reject(envelope, err)
+		return SyntheticRelayEnqueueResult{}, err
+	}
+	policy := s.policies[envelope.Channel]
+	result := SyntheticRelayEnqueueResult{
+		Channel:          envelope.Channel,
+		QueueCapacity:    policy.Capacity,
+		AcceptedSequence: envelope.Sequence,
+	}
+	s.mu.Lock()
+	queue := s.queues[envelope.Channel]
+	if len(queue) >= policy.Capacity {
+		if !policy.Droppable {
+			s.metrics.Rejected++
+			s.metrics.LastRejectReason = ErrSyntheticRelayQueueFull.Error()
+			s.mu.Unlock()
+			s.log(SyntheticLogEntry{
+				Event:         "fabric_relay_rejected",
+				RouteID:       envelope.RouteID,
+				ClusterID:     envelope.ClusterID,
+				LocalNodeID:   s.local.NodeID,
+				Channel:       envelope.Channel,
+				MessageType:   envelope.MessageType,
+				Reason:        ErrSyntheticRelayQueueFull.Error(),
+				QueueDepth:    len(queue),
+				QueueCapacity: policy.Capacity,
+				OccurredAt:    s.now(),
+			})
+			return SyntheticRelayEnqueueResult{}, ErrSyntheticRelayQueueFull
+		}
+		result.Dropped = true
+		result.DroppedSequence = queue[0].Sequence
+		// Clear the evicted slot so the backing array stops referencing
+		// the dropped envelope.
+		queue[0] = SyntheticEnvelope{}
+		queue = queue[1:]
+		s.metrics.Dropped++
+	}
+	queue = append(queue, envelope)
+	s.queues[envelope.Channel] = queue
+	result.QueueDepth = len(queue)
+	s.metrics.Enqueued++
+	s.metrics.QueueDepths[envelope.Channel] = len(queue)
+	s.mu.Unlock()
+	s.log(SyntheticLogEntry{
+		Event:           "fabric_relay_enqueued",
+		RouteID:         envelope.RouteID,
+		ClusterID:       envelope.ClusterID,
+		LocalNodeID:     s.local.NodeID,
+		Channel:         envelope.Channel,
+		MessageType:     envelope.MessageType,
+		QueueDepth:      result.QueueDepth,
+		QueueCapacity:   result.QueueCapacity,
+		Dropped:         result.Dropped,
+		DroppedSequence: result.DroppedSequence,
+		OccurredAt:      s.now(),
+	})
+	return result, nil
+}
+
+// Dequeue removes and returns the oldest envelope from the first non-empty
+// queue in priority order. It returns ErrMeshRuntimeDisabled when the
+// scheduler is nil or disabled, and ErrSyntheticRelayQueueEmpty when every
+// queue is empty.
+func (s *SyntheticRelayScheduler) Dequeue() (SyntheticEnvelope, error) {
+	// Like validateEnvelope, treat a nil scheduler as disabled rather than
+	// panicking on the field access.
+	if s == nil || !s.enabled {
+		return SyntheticEnvelope{}, ErrMeshRuntimeDisabled
+	}
+	s.mu.Lock()
+	for _, channel := range s.priorityOrder {
+		queue := s.queues[channel]
+		if len(queue) == 0 {
+			continue
+		}
+		envelope := queue[0]
+		// Clear the slot so the backing array stops referencing the
+		// envelope that is handed to the caller.
+		queue[0] = SyntheticEnvelope{}
+		queue = queue[1:]
+		s.queues[channel] = queue
+		s.metrics.Dequeued++
+		s.metrics.QueueDepths[channel] = len(queue)
+		s.mu.Unlock()
+		s.log(SyntheticLogEntry{
+			Event:         "fabric_relay_dequeued",
+			RouteID:       envelope.RouteID,
+			ClusterID:     envelope.ClusterID,
+			LocalNodeID:   s.local.NodeID,
+			Channel:       envelope.Channel,
+			MessageType:   envelope.MessageType,
+			QueueDepth:    len(queue),
+			QueueCapacity: s.policies[channel].Capacity,
+			OccurredAt:    s.now(),
+		})
+		return envelope, nil
+	}
+	s.mu.Unlock()
+	return SyntheticEnvelope{}, ErrSyntheticRelayQueueEmpty
+}
+
+// SnapshotQueueMetrics returns a copy of the counters together with the
+// current depth of every queue. The returned map is not shared with the
+// scheduler. A nil scheduler yields zero counters and an empty map.
+func (s *SyntheticRelayScheduler) SnapshotQueueMetrics() SyntheticRelayQueueMetrics {
+	if s == nil {
+		return SyntheticRelayQueueMetrics{QueueDepths: map[string]int{}}
+	}
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	depths := make(map[string]int, len(s.metrics.QueueDepths))
+	for channel, depth := range s.metrics.QueueDepths {
+		depths[channel] = depth
+	}
+	// Live queue lengths take precedence over the recorded depths.
+	for channel, queue := range s.queues {
+		depths[channel] = len(queue)
+	}
+	return SyntheticRelayQueueMetrics{
+		Enqueued:         s.metrics.Enqueued,
+		Dequeued:         s.metrics.Dequeued,
+		Dropped:          s.metrics.Dropped,
+		Rejected:         s.metrics.Rejected,
+		LastRejectReason: s.metrics.LastRejectReason,
+		QueueDepths:      depths,
+	}
+}
+
+// validateEnvelope checks envelope against the local identity and the
+// allow-lists and returns the first violation found, or nil when the
+// envelope may be queued. A nil or disabled scheduler rejects everything
+// with ErrMeshRuntimeDisabled.
+func (s *SyntheticRelayScheduler) validateEnvelope(envelope SyntheticEnvelope) error {
+	if s == nil || !s.enabled {
+		return ErrMeshRuntimeDisabled
+	}
+	if envelope.ProtocolVersion != ProtocolVersion {
+		return ErrUnsupportedSyntheticMessage
+	}
+	if envelope.RouteID == "" {
+		return ErrRouteIDRequired
+	}
+	if envelope.ClusterID == "" || envelope.ClusterID != s.local.ClusterID {
+		return ErrClusterMismatch
+	}
+	if envelope.From.ClusterID != s.local.ClusterID || envelope.From.NodeID == "" {
+		return ErrNodeMismatch
+	}
+	if envelope.To.ClusterID != s.local.ClusterID || envelope.To.NodeID != s.local.NodeID {
+		return ErrNodeMismatch
+	}
+	if envelope.TTL <= 0 {
+		return ErrTTLExhausted
+	}
+	if envelope.HopCount <= 0 {
+		return ErrInvalidRoutePath
+	}
+	if contains(envelope.Visited, s.local.NodeID) {
+		return ErrLoopDetected
+	}
+	if _, ok := s.allowedChannels[envelope.Channel]; !ok {
+		return ErrUnauthorizedChannel
+	}
+	// Being on the allow-list is not enough: the channel also needs a queue.
+	if _, ok := s.policies[envelope.Channel]; !ok {
+		return ErrUnauthorizedChannel
+	}
+	if _, ok := s.allowedMessageTypes[envelope.MessageType]; !ok {
+		return ErrUnsupportedSyntheticMessage
+	}
+	return nil
+}
+
+// reject records a rejected envelope in the metrics and emits a
+// "fabric_relay_rejected" log entry. It does nothing on a nil scheduler.
+func (s *SyntheticRelayScheduler) reject(envelope SyntheticEnvelope, err error) {
+	if s == nil {
+		return
+	}
+	reason := ""
+	if err != nil {
+		reason = err.Error()
+	}
+	s.mu.Lock()
+	s.metrics.Rejected++
+	s.metrics.LastRejectReason = reason
+	s.mu.Unlock()
+	s.log(SyntheticLogEntry{
+		Event:       "fabric_relay_rejected",
+		RouteID:     envelope.RouteID,
+		ClusterID:   envelope.ClusterID,
+		LocalNodeID: s.local.NodeID,
+		Channel:     envelope.Channel,
+		MessageType: envelope.MessageType,
+		Reason:      reason,
+		OccurredAt:  s.now(),
+	})
+}
+
+// log forwards entry to the configured logger, if there is one.
+func (s *SyntheticRelayScheduler) log(entry SyntheticLogEntry) {
+	if s == nil || s.logger == nil {
+		return
+	}
+	s.logger(entry)
+}
diff --git a/agents/rap-node-agent/internal/mesh/synthetic_relay_test.go
b/agents/rap-node-agent/internal/mesh/synthetic_relay_test.go new file mode 100644 index 0000000..75138ff --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/synthetic_relay_test.go @@ -0,0 +1,213 @@ +package mesh + +import ( + "errors" + "testing" +) + +func TestSyntheticRelaySchedulerDequeuesByQoSPriority(t *testing.T) { + scheduler := testRelayScheduler() + telemetry := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1) + routeControl := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2) + fabricControl := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 3) + + if _, err := scheduler.Enqueue(telemetry); err != nil { + t.Fatalf("enqueue telemetry: %v", err) + } + if _, err := scheduler.Enqueue(routeControl); err != nil { + t.Fatalf("enqueue route control: %v", err) + } + if _, err := scheduler.Enqueue(fabricControl); err != nil { + t.Fatalf("enqueue fabric control: %v", err) + } + + first, err := scheduler.Dequeue() + if err != nil { + t.Fatalf("dequeue first: %v", err) + } + second, err := scheduler.Dequeue() + if err != nil { + t.Fatalf("dequeue second: %v", err) + } + third, err := scheduler.Dequeue() + if err != nil { + t.Fatalf("dequeue third: %v", err) + } + if first.Channel != SyntheticChannelFabricControl { + t.Fatalf("first channel = %q, want fabric_control", first.Channel) + } + if second.Channel != SyntheticChannelRouteControl { + t.Fatalf("second channel = %q, want route_control", second.Channel) + } + if third.Channel != SyntheticChannelTelemetry { + t.Fatalf("third channel = %q, want telemetry", third.Channel) + } +} + +func TestSyntheticRelaySchedulerDropsOldestTelemetryOnly(t *testing.T) { + scheduler := testRelayScheduler() + first := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1) + second := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 2) + + if result, err := scheduler.Enqueue(first); err != nil || 
result.Dropped { + t.Fatalf("enqueue first result=%+v err=%v", result, err) + } + result, err := scheduler.Enqueue(second) + if err != nil { + t.Fatalf("enqueue second: %v", err) + } + if !result.Dropped || result.DroppedSequence != 1 { + t.Fatalf("result = %+v, want dropped sequence 1", result) + } + dequeued, err := scheduler.Dequeue() + if err != nil { + t.Fatalf("dequeue: %v", err) + } + if dequeued.Sequence != 2 { + t.Fatalf("dequeued sequence = %d, want 2", dequeued.Sequence) + } + metrics := scheduler.SnapshotQueueMetrics() + if metrics.Dropped != 1 || metrics.Enqueued != 2 { + t.Fatalf("metrics = %+v, want one drop and two enqueues", metrics) + } +} + +func TestSyntheticRelaySchedulerRejectsFullReliableQueue(t *testing.T) { + scheduler := testRelayScheduler() + first := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1) + second := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 2) + + if _, err := scheduler.Enqueue(first); err != nil { + t.Fatalf("enqueue first: %v", err) + } + _, err := scheduler.Enqueue(second) + if !errors.Is(err, ErrSyntheticRelayQueueFull) { + t.Fatalf("err = %v, want ErrSyntheticRelayQueueFull", err) + } + dequeued, err := scheduler.Dequeue() + if err != nil { + t.Fatalf("dequeue: %v", err) + } + if dequeued.Sequence != 1 { + t.Fatalf("dequeued sequence = %d, want 1", dequeued.Sequence) + } + metrics := scheduler.SnapshotQueueMetrics() + if metrics.Dropped != 0 || metrics.Rejected != 1 { + t.Fatalf("metrics = %+v, want no drop and one rejection", metrics) + } +} + +func TestSyntheticRelaySchedulerRejectsInvalidEnvelopes(t *testing.T) { + tests := []struct { + name string + mutate func(*SyntheticEnvelope) + want error + }{ + { + name: "wrong cluster", + mutate: func(envelope *SyntheticEnvelope) { + envelope.ClusterID = "cluster-2" + }, + want: ErrClusterMismatch, + }, + { + name: "wrong node", + mutate: func(envelope *SyntheticEnvelope) { + envelope.To.NodeID = "node-x" + }, + want: 
ErrNodeMismatch, + }, + { + name: "unauthorized channel", + mutate: func(envelope *SyntheticEnvelope) { + envelope.Channel = "rdp_render" + }, + want: ErrUnauthorizedChannel, + }, + { + name: "unsupported message", + mutate: func(envelope *SyntheticEnvelope) { + envelope.MessageType = "rdp.input" + }, + want: ErrUnsupportedSyntheticMessage, + }, + { + name: "ttl exhausted", + mutate: func(envelope *SyntheticEnvelope) { + envelope.TTL = 0 + }, + want: ErrTTLExhausted, + }, + { + name: "loop detected", + mutate: func(envelope *SyntheticEnvelope) { + envelope.Visited = append(envelope.Visited, "node-r") + }, + want: ErrLoopDetected, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + scheduler := testRelayScheduler() + envelope := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1) + tt.mutate(&envelope) + _, err := scheduler.Enqueue(envelope) + if !errors.Is(err, tt.want) { + t.Fatalf("err = %v, want %v", err, tt.want) + } + }) + } +} + +func TestSyntheticRelaySchedulerDisabledRejects(t *testing.T) { + scheduler := NewSyntheticRelayScheduler(SyntheticRelaySchedulerConfig{ + Enabled: false, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}, + }) + _, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)) + if !errors.Is(err, ErrMeshRuntimeDisabled) { + t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err) + } + if _, err := scheduler.Dequeue(); !errors.Is(err, ErrMeshRuntimeDisabled) { + t.Fatalf("dequeue err = %v, want ErrMeshRuntimeDisabled", err) + } +} + +func TestSyntheticRelaySchedulerQueueDepthSnapshot(t *testing.T) { + scheduler := testRelayScheduler() + if _, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)); err != nil { + t.Fatalf("enqueue fabric control: %v", err) + } + if _, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)); err != nil { + 
t.Fatalf("enqueue route control: %v", err) + } + metrics := scheduler.SnapshotQueueMetrics() + if metrics.QueueDepths[SyntheticChannelFabricControl] != 1 { + t.Fatalf("fabric_control depth = %d, want 1", metrics.QueueDepths[SyntheticChannelFabricControl]) + } + if metrics.QueueDepths[SyntheticChannelRouteControl] != 1 { + t.Fatalf("route_control depth = %d, want 1", metrics.QueueDepths[SyntheticChannelRouteControl]) + } +} + +func testRelayScheduler() *SyntheticRelayScheduler { + return NewSyntheticRelayScheduler(SyntheticRelaySchedulerConfig{ + Enabled: true, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}, + QueuePolicies: []SyntheticRelayQueuePolicy{ + {Channel: SyntheticChannelFabricControl, Capacity: 1, Droppable: false}, + {Channel: SyntheticChannelRouteControl, Capacity: 1, Droppable: false}, + {Channel: SyntheticChannelTelemetry, Capacity: 1, Droppable: true}, + }, + }) +} + +func testRelayEnvelope(channel string, messageType string, sequence uint64) SyntheticEnvelope { + route := testRoute("route-relay-scheduler", []string{"node-a", "node-r", "node-b"}) + envelope := testEnvelope(route, "node-a", "node-r") + envelope.Channel = channel + envelope.MessageType = messageType + envelope.Sequence = sequence + return envelope +} diff --git a/agents/rap-node-agent/internal/mesh/synthetic_runtime.go b/agents/rap-node-agent/internal/mesh/synthetic_runtime.go new file mode 100644 index 0000000..f7b7fde --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/synthetic_runtime.go @@ -0,0 +1,1037 @@ +package mesh + +import ( + "context" + "encoding/json" + "sync" + "time" +) + +type SyntheticTransport interface { + SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) +} + +type SyntheticLogEntry struct { + Event string `json:"event"` + RouteID string `json:"route_id,omitempty"` + ClusterID string `json:"cluster_id,omitempty"` + LocalNodeID string `json:"local_node_id,omitempty"` + NextNodeID string 
`json:"next_node_id,omitempty"` + Channel string `json:"channel,omitempty"` + MessageType string `json:"message_type,omitempty"` + Reason string `json:"reason,omitempty"` + TTL int `json:"ttl,omitempty"` + HopCount int `json:"hop_count,omitempty"` + QueueDepth int `json:"queue_depth,omitempty"` + QueueCapacity int `json:"queue_capacity,omitempty"` + Dropped bool `json:"dropped,omitempty"` + DroppedSequence uint64 `json:"dropped_sequence,omitempty"` + OccurredAt time.Time `json:"occurred_at"` +} + +type SyntheticMetrics struct { + ProbesSent uint64 + ProbesReceived uint64 + ProbesForwarded uint64 + ProbeAcksCreated uint64 + RouteHealthProbesSent uint64 + TestServiceRequestsSent uint64 + TestServiceDeliveriesSucceeded uint64 + TestServiceDeliveriesFailed uint64 + TestServiceFallbacksUsed uint64 + RouteDeliveriesSucceeded uint64 + RouteDeliveriesFailed uint64 + FallbackRoutesUsed uint64 + WarmRoutesPromoted uint64 + RouteCacheInvalidations uint64 + Rejected uint64 + LastRejectReason string +} + +type SyntheticRuntimeConfig struct { + Enabled bool + Local PeerIdentity + Routes []SyntheticRoute + RouteHealthRoutes []SyntheticRoute + AllowedChannels []string + MaxTTL int + MaxHops int + TestOrganizationID string + MaxTestPayloadBytes int + Transport SyntheticTransport + Now func() time.Time + Logger func(SyntheticLogEntry) +} + +type SyntheticRuntime struct { + enabled bool + local PeerIdentity + routes map[string]SyntheticRoute + routeHealthRoutes map[string]SyntheticRoute + allowedChannels map[string]struct{} + maxTTL int + maxHops int + testOrganizationID string + maxTestPayloadBytes int + transport SyntheticTransport + now func() time.Time + logger func(SyntheticLogEntry) + + mu sync.Mutex + metrics SyntheticMetrics + seq uint64 + routeHealth map[string]SyntheticRouteObservation +} + +func NewSyntheticRuntime(cfg SyntheticRuntimeConfig) *SyntheticRuntime { + allowed := cfg.AllowedChannels + if len(allowed) == 0 { + allowed = []string{ + SyntheticChannelFabricControl, 
+ SyntheticChannelRouteControl, + SyntheticChannelTelemetry, + } + } + allowedSet := make(map[string]struct{}, len(allowed)) + for _, channel := range allowed { + if channel != "" { + allowedSet[channel] = struct{}{} + } + } + routes := make(map[string]SyntheticRoute, len(cfg.Routes)) + for _, route := range cfg.Routes { + if route.RouteID != "" { + routes[route.RouteID] = route + } + } + routeHealthRoutes := routes + if len(cfg.RouteHealthRoutes) > 0 { + routeHealthRoutes = make(map[string]SyntheticRoute, len(cfg.RouteHealthRoutes)) + for _, route := range cfg.RouteHealthRoutes { + if route.RouteID != "" { + routeHealthRoutes[route.RouteID] = route + } + } + } + maxTTL := cfg.MaxTTL + if maxTTL <= 0 { + maxTTL = 8 + } + maxHops := cfg.MaxHops + if maxHops <= 0 { + maxHops = 8 + } + testOrganizationID := cfg.TestOrganizationID + if testOrganizationID == "" { + testOrganizationID = SyntheticDefaultTestOrganizationID + } + maxTestPayloadBytes := cfg.MaxTestPayloadBytes + if maxTestPayloadBytes <= 0 { + maxTestPayloadBytes = SyntheticDefaultMaxTestPayloadBytes + } + now := cfg.Now + if now == nil { + now = func() time.Time { return time.Now().UTC() } + } + return &SyntheticRuntime{ + enabled: cfg.Enabled, + local: cfg.Local, + routes: routes, + routeHealthRoutes: routeHealthRoutes, + allowedChannels: allowedSet, + maxTTL: maxTTL, + maxHops: maxHops, + testOrganizationID: testOrganizationID, + maxTestPayloadBytes: maxTestPayloadBytes, + transport: cfg.Transport, + now: now, + logger: cfg.Logger, + routeHealth: map[string]SyntheticRouteObservation{}, + } +} + +func (r *SyntheticRuntime) UpdateConfig(routes []SyntheticRoute, transport SyntheticTransport) { + if r == nil { + return + } + updatedRoutes := make(map[string]SyntheticRoute, len(routes)) + for _, route := range routes { + if route.RouteID != "" { + updatedRoutes[route.RouteID] = route + } + } + r.mu.Lock() + defer r.mu.Unlock() + r.routes = updatedRoutes + r.routeHealthRoutes = updatedRoutes + if transport != 
nil { + r.transport = transport + } +} + +func (r *SyntheticRuntime) UpdateRouteHealthConfig(routes []SyntheticRoute) { + if r == nil { + return + } + updatedRoutes := make(map[string]SyntheticRoute, len(routes)) + for _, route := range routes { + if route.RouteID != "" { + updatedRoutes[route.RouteID] = route + } + } + r.mu.Lock() + defer r.mu.Unlock() + r.routeHealthRoutes = updatedRoutes +} + +func (r *SyntheticRuntime) SendProbe(ctx context.Context, routeID string, channel string, probeID string) (SyntheticEnvelope, error) { + return r.sendProbeMessage(ctx, routeID, channel, probeID, SyntheticMessageProbe) +} + +func (r *SyntheticRuntime) SendRouteHealthProbe(ctx context.Context, routeID string, channel string, probeID string) (SyntheticRouteHealthResult, error) { + startedAt := r.now() + ack, err := r.sendProbeMessage(ctx, routeID, channel, probeID, SyntheticMessageRouteHealth) + if err != nil { + r.recordRouteFailure(routeID, startedAt, err) + return SyntheticRouteHealthResult{}, err + } + observation := r.recordRouteSuccess(routeID, startedAt) + return SyntheticRouteHealthResult{ + RequestedRouteID: routeID, + SelectedRouteID: routeID, + FallbackUsed: false, + Ack: ack, + Observation: observation, + }, nil +} + +func (r *SyntheticRuntime) SendRouteHealthProbeWithFallback(ctx context.Context, preferredRouteID string, fallbackRouteIDs []string, channel string, probeID string) (SyntheticRouteHealthResult, error) { + if err := r.ensureEnabled(); err != nil { + r.reject(preferredRouteID, channel, SyntheticMessageRouteHealth, err) + return SyntheticRouteHealthResult{}, err + } + candidates := append([]string{preferredRouteID}, fallbackRouteIDs...) 
+ var lastErr error + for i, routeID := range candidates { + if routeID == "" { + continue + } + if i > 0 { + r.log(SyntheticLogEntry{ + Event: "fabric_fallback_route_selected", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: channel, + MessageType: SyntheticMessageRouteHealth, + Reason: "preferred_route_unavailable", + OccurredAt: r.now(), + }) + } + result, err := r.SendRouteHealthProbe(ctx, routeID, channel, probeID) + if err == nil { + if i > 0 { + result.RequestedRouteID = preferredRouteID + result.FallbackUsed = true + r.increment(func(m *SyntheticMetrics) { + m.FallbackRoutesUsed++ + m.WarmRoutesPromoted++ + }) + r.log(SyntheticLogEntry{ + Event: "fabric_warm_route_promoted", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: channel, + MessageType: SyntheticMessageRouteHealth, + OccurredAt: r.now(), + }) + } + return result, nil + } + lastErr = err + r.log(SyntheticLogEntry{ + Event: "fabric_route_delivery_failed", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: channel, + MessageType: SyntheticMessageRouteHealth, + Reason: err.Error(), + OccurredAt: r.now(), + }) + } + if lastErr == nil { + lastErr = ErrNoHealthySyntheticRoute + } + r.reject(preferredRouteID, channel, SyntheticMessageRouteHealth, lastErr) + return SyntheticRouteHealthResult{}, lastErr +} + +func (r *SyntheticRuntime) SendTestService(ctx context.Context, routeID string, channel string, request SyntheticTestServiceRequest) (SyntheticTestServiceResult, error) { + startedAt := r.now() + ack, err := r.sendTestServiceMessage(ctx, routeID, channel, request) + if err != nil { + r.recordTestServiceFailure(routeID, startedAt, err) + return SyntheticTestServiceResult{}, err + } + response, err := decodeTestServiceResponse(ack.Payload) + if err != nil { + r.recordTestServiceFailure(routeID, startedAt, err) + return SyntheticTestServiceResult{}, err + } + observation := 
r.recordTestServiceSuccess(routeID, startedAt) + return SyntheticTestServiceResult{ + RequestedRouteID: routeID, + SelectedRouteID: routeID, + FallbackUsed: false, + Ack: ack, + Response: response, + Observation: observation, + }, nil +} + +func (r *SyntheticRuntime) SendTestServiceWithFallback(ctx context.Context, preferredRouteID string, fallbackRouteIDs []string, channel string, request SyntheticTestServiceRequest) (SyntheticTestServiceResult, error) { + if err := r.ensureEnabled(); err != nil { + r.reject(preferredRouteID, channel, SyntheticMessageTestService, err) + return SyntheticTestServiceResult{}, err + } + candidates := append([]string{preferredRouteID}, fallbackRouteIDs...) + var lastErr error + for i, routeID := range candidates { + if routeID == "" { + continue + } + if i > 0 { + r.log(SyntheticLogEntry{ + Event: "fabric_test_service_fallback_selected", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: channel, + MessageType: SyntheticMessageTestService, + Reason: "preferred_route_unavailable", + OccurredAt: r.now(), + }) + } + result, err := r.SendTestService(ctx, routeID, channel, request) + if err == nil { + if i > 0 { + result.RequestedRouteID = preferredRouteID + result.FallbackUsed = true + r.increment(func(m *SyntheticMetrics) { + m.TestServiceFallbacksUsed++ + }) + r.log(SyntheticLogEntry{ + Event: "fabric_test_service_fallback_used", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: channel, + MessageType: SyntheticMessageTestService, + OccurredAt: r.now(), + }) + } + return result, nil + } + lastErr = err + } + if lastErr == nil { + lastErr = ErrNoHealthySyntheticRoute + } + r.reject(preferredRouteID, channel, SyntheticMessageTestService, lastErr) + return SyntheticTestServiceResult{}, lastErr +} + +func (r *SyntheticRuntime) SnapshotRouteObservation(routeID string) (SyntheticRouteObservation, bool) { + r.mu.Lock() + defer r.mu.Unlock() + observation, ok 
:= r.routeHealth[routeID] + return observation, ok +} + +func (r *SyntheticRuntime) InvalidateRouteCache(reason string, version SyntheticRouteCacheVersion) int { + r.mu.Lock() + invalidated := 0 + for routeID, observation := range r.routeHealth { + if shouldInvalidateObservation(observation, version) { + delete(r.routeHealth, routeID) + invalidated++ + } + } + r.metrics.RouteCacheInvalidations += uint64(invalidated) + r.mu.Unlock() + r.log(SyntheticLogEntry{ + Event: "fabric_route_cache_invalidated", + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + Reason: reason, + OccurredAt: r.now(), + }) + return invalidated +} + +func (r *SyntheticRuntime) sendProbeMessage(ctx context.Context, routeID string, channel string, probeID string, messageType string) (SyntheticEnvelope, error) { + if err := r.ensureEnabled(); err != nil { + r.reject(routeID, channel, messageType, err) + return SyntheticEnvelope{}, err + } + route, path, err := r.routeForMessage(routeID, messageType) + if err != nil { + r.reject(routeID, channel, messageType, err) + return SyntheticEnvelope{}, err + } + if err := r.validateRouteForSend(route, path, channel); err != nil { + r.reject(routeID, channel, messageType, err) + return SyntheticEnvelope{}, err + } + if len(path) < 2 { + r.reject(routeID, channel, messageType, ErrInvalidRoutePath) + return SyntheticEnvelope{}, ErrInvalidRoutePath + } + r.log(SyntheticLogEntry{ + Event: "fabric_route_selected", + RouteID: route.RouteID, + ClusterID: route.ClusterID, + LocalNodeID: r.local.NodeID, + NextNodeID: path[1], + Channel: channel, + MessageType: messageType, + TTL: routeTTL(route, r.maxTTL), + OccurredAt: r.now(), + }) + payload, err := json.Marshal(SyntheticProbePayload{ + ProbeID: probeID, + SentAt: r.now(), + }) + if err != nil { + return SyntheticEnvelope{}, err + } + envelope := SyntheticEnvelope{ + ProtocolVersion: ProtocolVersion, + RouteID: route.RouteID, + ClusterID: route.ClusterID, + From: r.local, + To: PeerIdentity{ClusterID: 
route.ClusterID, NodeID: path[1]}, + Channel: channel, + MessageType: messageType, + TTL: routeTTL(route, r.maxTTL), + HopCount: 1, + Visited: []string{r.local.NodeID}, + Sequence: r.nextSequence(), + SentAt: r.now(), + Payload: payload, + } + r.increment(func(m *SyntheticMetrics) { + if messageType == SyntheticMessageRouteHealth { + m.RouteHealthProbesSent++ + } else { + m.ProbesSent++ + } + }) + r.log(SyntheticLogEntry{ + Event: "fabric_probe_sent", + RouteID: envelope.RouteID, + ClusterID: envelope.ClusterID, + LocalNodeID: r.local.NodeID, + NextNodeID: envelope.To.NodeID, + Channel: envelope.Channel, + MessageType: envelope.MessageType, + TTL: envelope.TTL, + HopCount: envelope.HopCount, + OccurredAt: r.now(), + }) + return r.send(ctx, envelope.To.NodeID, envelope) +} + +func (r *SyntheticRuntime) sendTestServiceMessage(ctx context.Context, routeID string, channel string, request SyntheticTestServiceRequest) (SyntheticEnvelope, error) { + if err := r.ensureEnabled(); err != nil { + r.reject(routeID, channel, SyntheticMessageTestService, err) + return SyntheticEnvelope{}, err + } + if err := r.validateTestServiceRequest(request); err != nil { + r.reject(routeID, channel, SyntheticMessageTestService, err) + return SyntheticEnvelope{}, err + } + route, path, err := r.route(routeID) + if err != nil { + r.reject(routeID, channel, SyntheticMessageTestService, err) + return SyntheticEnvelope{}, err + } + if err := r.validateRouteForSend(route, path, channel); err != nil { + r.reject(routeID, channel, SyntheticMessageTestService, err) + return SyntheticEnvelope{}, err + } + if len(path) < 2 { + r.reject(routeID, channel, SyntheticMessageTestService, ErrInvalidRoutePath) + return SyntheticEnvelope{}, ErrInvalidRoutePath + } + if request.SentAt.IsZero() { + request.SentAt = r.now() + } + payload, err := json.Marshal(request) + if err != nil { + return SyntheticEnvelope{}, err + } + envelope := SyntheticEnvelope{ + ProtocolVersion: ProtocolVersion, + RouteID: 
route.RouteID, + ClusterID: route.ClusterID, + From: r.local, + To: PeerIdentity{ClusterID: route.ClusterID, NodeID: path[1]}, + Channel: channel, + MessageType: SyntheticMessageTestService, + TTL: routeTTL(route, r.maxTTL), + HopCount: 1, + Visited: []string{r.local.NodeID}, + Sequence: r.nextSequence(), + SentAt: r.now(), + Payload: payload, + } + r.increment(func(m *SyntheticMetrics) { + m.TestServiceRequestsSent++ + }) + r.log(SyntheticLogEntry{ + Event: "fabric_test_service_sent", + RouteID: envelope.RouteID, + ClusterID: envelope.ClusterID, + LocalNodeID: r.local.NodeID, + NextNodeID: envelope.To.NodeID, + Channel: envelope.Channel, + MessageType: envelope.MessageType, + TTL: envelope.TTL, + HopCount: envelope.HopCount, + OccurredAt: r.now(), + }) + return r.send(ctx, envelope.To.NodeID, envelope) +} + +func (r *SyntheticRuntime) Receive(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) { + if err := r.ensureEnabled(); err != nil { + r.reject(envelope.RouteID, envelope.Channel, envelope.MessageType, err) + return SyntheticEnvelope{}, err + } + route, path, err := r.validateInbound(envelope) + if err != nil { + r.reject(envelope.RouteID, envelope.Channel, envelope.MessageType, err) + return SyntheticEnvelope{}, err + } + if envelope.MessageType == SyntheticMessageTestService { + request, err := decodeTestServiceRequest(envelope.Payload) + if err != nil { + r.reject(envelope.RouteID, envelope.Channel, envelope.MessageType, err) + return SyntheticEnvelope{}, err + } + if err := r.validateTestServiceRequest(request); err != nil { + r.reject(envelope.RouteID, envelope.Channel, envelope.MessageType, err) + return SyntheticEnvelope{}, err + } + } + r.increment(func(m *SyntheticMetrics) { m.ProbesReceived++ }) + r.log(SyntheticLogEntry{ + Event: "fabric_probe_received", + RouteID: envelope.RouteID, + ClusterID: envelope.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: envelope.Channel, + MessageType: envelope.MessageType, + TTL: 
envelope.TTL, + HopCount: envelope.HopCount, + OccurredAt: r.now(), + }) + visited := append(append([]string{}, envelope.Visited...), r.local.NodeID) + if r.local.NodeID == route.DestinationNodeID { + return r.ack(route, envelope, visited) + } + if envelope.TTL <= 1 { + r.reject(envelope.RouteID, envelope.Channel, envelope.MessageType, ErrTTLExhausted) + return SyntheticEnvelope{}, ErrTTLExhausted + } + index := indexOf(path, r.local.NodeID) + if index < 0 || index+1 >= len(path) { + r.reject(envelope.RouteID, envelope.Channel, envelope.MessageType, ErrInvalidRoutePath) + return SyntheticEnvelope{}, ErrInvalidRoutePath + } + nextNodeID := path[index+1] + forward := envelope + forward.From = r.local + forward.To = PeerIdentity{ClusterID: route.ClusterID, NodeID: nextNodeID} + forward.TTL = envelope.TTL - 1 + forward.HopCount = envelope.HopCount + 1 + forward.Visited = visited + forward.SentAt = r.now() + r.increment(func(m *SyntheticMetrics) { m.ProbesForwarded++ }) + r.log(SyntheticLogEntry{ + Event: "fabric_probe_forwarded", + RouteID: forward.RouteID, + ClusterID: forward.ClusterID, + LocalNodeID: r.local.NodeID, + NextNodeID: nextNodeID, + Channel: forward.Channel, + MessageType: forward.MessageType, + TTL: forward.TTL, + HopCount: forward.HopCount, + OccurredAt: r.now(), + }) + return r.send(ctx, nextNodeID, forward) +} + +func (r *SyntheticRuntime) SnapshotMetrics() SyntheticMetrics { + r.mu.Lock() + defer r.mu.Unlock() + return r.metrics +} + +func (r *SyntheticRuntime) ensureEnabled() error { + if r == nil || !r.enabled { + return ErrMeshRuntimeDisabled + } + if r.local.ClusterID == "" || r.local.NodeID == "" { + return ErrNodeMismatch + } + return nil +} + +func (r *SyntheticRuntime) route(routeID string) (SyntheticRoute, []string, error) { + return r.routeFrom(routeID, r.routes) +} + +func (r *SyntheticRuntime) routeForMessage(routeID string, messageType string) (SyntheticRoute, []string, error) { + if messageType == SyntheticMessageRouteHealth { + return 
r.routeFrom(routeID, r.routeHealthRoutes) + } + return r.route(routeID) +} + +func (r *SyntheticRuntime) routeFrom(routeID string, routes map[string]SyntheticRoute) (SyntheticRoute, []string, error) { + if routeID == "" { + return SyntheticRoute{}, nil, ErrRouteIDRequired + } + r.mu.Lock() + route, ok := routes[routeID] + r.mu.Unlock() + if !ok { + return SyntheticRoute{}, nil, ErrRouteNotFound + } + path := routePath(route) + if len(path) < 2 || path[0] != route.SourceNodeID || path[len(path)-1] != route.DestinationNodeID { + return SyntheticRoute{}, nil, ErrInvalidRoutePath + } + if route.ClusterID == "" || route.ClusterID != r.local.ClusterID { + return SyntheticRoute{}, nil, ErrClusterMismatch + } + if !route.ExpiresAt.IsZero() && !r.now().Before(route.ExpiresAt) { + return SyntheticRoute{}, nil, ErrRouteExpired + } + return route, path, nil +} + +func (r *SyntheticRuntime) validateRouteForSend(route SyntheticRoute, path []string, channel string) error { + if route.SourceNodeID != r.local.NodeID { + return ErrNodeMismatch + } + if routeTTL(route, r.maxTTL) <= 0 { + return ErrTTLExhausted + } + if routeMaxHops(route, r.maxHops) < len(path)-1 { + return ErrInvalidRoutePath + } + return r.validateChannel(route, channel) +} + +func (r *SyntheticRuntime) validateInbound(envelope SyntheticEnvelope) (SyntheticRoute, []string, error) { + if envelope.ProtocolVersion != ProtocolVersion { + return SyntheticRoute{}, nil, ErrUnsupportedSyntheticMessage + } + if envelope.MessageType != SyntheticMessageProbe && envelope.MessageType != SyntheticMessageRouteHealth && envelope.MessageType != SyntheticMessageTestService { + return SyntheticRoute{}, nil, ErrUnsupportedSyntheticMessage + } + if envelope.ClusterID == "" || envelope.ClusterID != r.local.ClusterID { + return SyntheticRoute{}, nil, ErrClusterMismatch + } + if envelope.To.ClusterID != r.local.ClusterID || envelope.To.NodeID != r.local.NodeID { + return SyntheticRoute{}, nil, ErrNodeMismatch + } + if envelope.TTL <= 0 { 
+ return SyntheticRoute{}, nil, ErrTTLExhausted + } + route, path, err := r.routeForMessage(envelope.RouteID, envelope.MessageType) + if err != nil { + return SyntheticRoute{}, nil, err + } + if envelope.ClusterID != route.ClusterID { + return SyntheticRoute{}, nil, ErrClusterMismatch + } + if envelope.HopCount <= 0 || envelope.HopCount > routeMaxHops(route, r.maxHops) { + return SyntheticRoute{}, nil, ErrInvalidRoutePath + } + index := indexOf(path, r.local.NodeID) + if index < 0 { + return SyntheticRoute{}, nil, ErrNodeMismatch + } + if envelope.HopCount != index { + return SyntheticRoute{}, nil, ErrInvalidRoutePath + } + if contains(envelope.Visited, r.local.NodeID) { + return SyntheticRoute{}, nil, ErrLoopDetected + } + if index > 0 && len(envelope.Visited) > 0 && envelope.Visited[len(envelope.Visited)-1] != path[index-1] { + return SyntheticRoute{}, nil, ErrInvalidRoutePath + } + if err := r.validateChannel(route, envelope.Channel); err != nil { + return SyntheticRoute{}, nil, err + } + return route, path, nil +} + +func (r *SyntheticRuntime) validateChannel(route SyntheticRoute, channel string) error { + if channel == "" { + return ErrUnauthorizedChannel + } + if _, ok := r.allowedChannels[channel]; !ok { + return ErrUnauthorizedChannel + } + if len(route.AllowedChannels) == 0 { + return nil + } + for _, allowed := range route.AllowedChannels { + if allowed == channel { + return nil + } + } + return ErrUnauthorizedChannel +} + +func (r *SyntheticRuntime) ack(route SyntheticRoute, envelope SyntheticEnvelope, visited []string) (SyntheticEnvelope, error) { + if envelope.MessageType == SyntheticMessageTestService { + return r.ackTestService(route, envelope, visited) + } + var probe SyntheticProbePayload + _ = json.Unmarshal(envelope.Payload, &probe) + payload, err := json.Marshal(SyntheticProbeAckPayload{ + ProbeID: probe.ProbeID, + Path: visited, + AcceptedAt: r.now(), + }) + if err != nil { + return SyntheticEnvelope{}, err + } + ack := SyntheticEnvelope{ + 
ProtocolVersion: ProtocolVersion, + RouteID: route.RouteID, + ClusterID: route.ClusterID, + From: r.local, + To: PeerIdentity{ClusterID: route.ClusterID, NodeID: route.SourceNodeID}, + Channel: envelope.Channel, + MessageType: syntheticAckMessageType(envelope.MessageType), + TTL: envelope.TTL, + HopCount: envelope.HopCount, + Visited: visited, + Sequence: envelope.Sequence, + SentAt: r.now(), + Payload: payload, + } + r.increment(func(m *SyntheticMetrics) { m.ProbeAcksCreated++ }) + r.log(SyntheticLogEntry{ + Event: "fabric_probe_ack_created", + RouteID: ack.RouteID, + ClusterID: ack.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: ack.Channel, + MessageType: ack.MessageType, + TTL: ack.TTL, + HopCount: ack.HopCount, + OccurredAt: r.now(), + }) + return ack, nil +} + +func (r *SyntheticRuntime) ackTestService(route SyntheticRoute, envelope SyntheticEnvelope, visited []string) (SyntheticEnvelope, error) { + request, err := decodeTestServiceRequest(envelope.Payload) + if err != nil { + return SyntheticEnvelope{}, err + } + if err := r.validateTestServiceRequest(request); err != nil { + return SyntheticEnvelope{}, err + } + response := SyntheticTestServiceResponse{ + RequestID: request.RequestID, + OrganizationID: request.OrganizationID, + ServiceType: request.ServiceType, + EchoPayload: request.Payload, + Path: visited, + AcceptedAt: r.now(), + } + payload, err := json.Marshal(response) + if err != nil { + return SyntheticEnvelope{}, err + } + ack := SyntheticEnvelope{ + ProtocolVersion: ProtocolVersion, + RouteID: route.RouteID, + ClusterID: route.ClusterID, + From: r.local, + To: PeerIdentity{ClusterID: route.ClusterID, NodeID: route.SourceNodeID}, + Channel: envelope.Channel, + MessageType: SyntheticMessageTestServiceAck, + TTL: envelope.TTL, + HopCount: envelope.HopCount, + Visited: visited, + Sequence: envelope.Sequence, + SentAt: r.now(), + Payload: payload, + } + r.increment(func(m *SyntheticMetrics) { m.ProbeAcksCreated++ }) + r.log(SyntheticLogEntry{ + 
Event: "fabric_test_service_ack_created", + RouteID: ack.RouteID, + ClusterID: ack.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: ack.Channel, + MessageType: ack.MessageType, + TTL: ack.TTL, + HopCount: ack.HopCount, + OccurredAt: r.now(), + }) + return ack, nil +} + +func (r *SyntheticRuntime) recordRouteSuccess(routeID string, startedAt time.Time) SyntheticRouteObservation { + return r.recordRouteSuccessForMessage(routeID, startedAt, SyntheticMessageRouteHealth) +} + +func (r *SyntheticRuntime) recordRouteSuccessForMessage(routeID string, startedAt time.Time, messageType string) SyntheticRouteObservation { + route, _, _ := r.routeForMessage(routeID, messageType) + now := r.now() + observation := SyntheticRouteObservation{} + r.mu.Lock() + observation = r.routeHealth[routeID] + observation.RouteID = routeID + observation.State = SyntheticRouteStateHealthy + observation.LastSuccessAt = now + observation.LastFailureReason = "" + observation.SuccessCount++ + observation.LastLatencyMs = now.Sub(startedAt).Milliseconds() + observation.RouteVersion = route.RouteVersion + observation.PolicyVersion = route.PolicyVersion + observation.PeerDirectoryVersion = route.PeerDirectoryVersion + r.routeHealth[routeID] = observation + r.metrics.RouteDeliveriesSucceeded++ + r.mu.Unlock() + r.log(SyntheticLogEntry{ + Event: "fabric_route_delivery_succeeded", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + MessageType: messageType, + OccurredAt: now, + }) + return observation +} + +func (r *SyntheticRuntime) recordRouteFailure(routeID string, startedAt time.Time, err error) SyntheticRouteObservation { + return r.recordRouteFailureForMessage(routeID, startedAt, err, SyntheticMessageRouteHealth) +} + +func (r *SyntheticRuntime) recordRouteFailureForMessage(routeID string, startedAt time.Time, err error, messageType string) SyntheticRouteObservation { + route, _, _ := r.routeForMessage(routeID, messageType) + now := r.now() + reason := "" + if err 
!= nil { + reason = err.Error() + } + observation := SyntheticRouteObservation{} + r.mu.Lock() + observation = r.routeHealth[routeID] + observation.RouteID = routeID + observation.State = SyntheticRouteStateFailed + observation.LastFailureAt = now + observation.LastFailureReason = reason + observation.FailureCount++ + observation.LastLatencyMs = now.Sub(startedAt).Milliseconds() + observation.RouteVersion = route.RouteVersion + observation.PolicyVersion = route.PolicyVersion + observation.PeerDirectoryVersion = route.PeerDirectoryVersion + r.routeHealth[routeID] = observation + r.metrics.RouteDeliveriesFailed++ + r.mu.Unlock() + r.log(SyntheticLogEntry{ + Event: "fabric_route_marked_failed", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + MessageType: messageType, + Reason: reason, + OccurredAt: now, + }) + return observation +} + +func (r *SyntheticRuntime) recordTestServiceSuccess(routeID string, startedAt time.Time) SyntheticRouteObservation { + observation := r.recordRouteSuccessForMessage(routeID, startedAt, SyntheticMessageTestService) + r.increment(func(m *SyntheticMetrics) { + m.TestServiceDeliveriesSucceeded++ + }) + r.log(SyntheticLogEntry{ + Event: "fabric_test_service_delivery_succeeded", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + MessageType: SyntheticMessageTestService, + OccurredAt: r.now(), + }) + return observation +} + +func (r *SyntheticRuntime) recordTestServiceFailure(routeID string, startedAt time.Time, err error) SyntheticRouteObservation { + observation := r.recordRouteFailureForMessage(routeID, startedAt, err, SyntheticMessageTestService) + r.increment(func(m *SyntheticMetrics) { + m.TestServiceDeliveriesFailed++ + }) + return observation +} + +func (r *SyntheticRuntime) send(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) { + r.mu.Lock() + transport := r.transport + r.mu.Unlock() + if transport == nil { + return 
SyntheticEnvelope{}, ErrSyntheticPeerUnavailable + } + return transport.SendSynthetic(ctx, nextNodeID, envelope) +} + +func (r *SyntheticRuntime) nextSequence() uint64 { + r.mu.Lock() + defer r.mu.Unlock() + r.seq++ + return r.seq +} + +func (r *SyntheticRuntime) increment(fn func(*SyntheticMetrics)) { + r.mu.Lock() + defer r.mu.Unlock() + fn(&r.metrics) +} + +func (r *SyntheticRuntime) reject(routeID string, channel string, messageType string, err error) { + reason := "" + if err != nil { + reason = err.Error() + } + r.increment(func(m *SyntheticMetrics) { + m.Rejected++ + m.LastRejectReason = reason + }) + r.log(SyntheticLogEntry{ + Event: "fabric_probe_rejected", + RouteID: routeID, + ClusterID: r.local.ClusterID, + LocalNodeID: r.local.NodeID, + Channel: channel, + MessageType: messageType, + Reason: reason, + OccurredAt: r.now(), + }) +} + +func (r *SyntheticRuntime) log(entry SyntheticLogEntry) { + if r.logger != nil { + r.logger(entry) + } +} + +func routePath(route SyntheticRoute) []string { + if len(route.Hops) > 0 { + return append([]string{}, route.Hops...) 
+ } + return []string{route.SourceNodeID, route.DestinationNodeID} +} + +func routeTTL(route SyntheticRoute, fallback int) int { + if route.MaxTTL > 0 { + return route.MaxTTL + } + return fallback +} + +func routeMaxHops(route SyntheticRoute, fallback int) int { + if route.MaxHops > 0 { + return route.MaxHops + } + return fallback +} + +func indexOf(values []string, needle string) int { + for i, value := range values { + if value == needle { + return i + } + } + return -1 +} + +func contains(values []string, needle string) bool { + return indexOf(values, needle) >= 0 +} + +func syntheticAckMessageType(messageType string) string { + if messageType == SyntheticMessageTestService { + return SyntheticMessageTestServiceAck + } + if messageType == SyntheticMessageRouteHealth { + return SyntheticMessageRouteHealthAck + } + return SyntheticMessageProbeAck +} + +func (r *SyntheticRuntime) validateTestServiceRequest(request SyntheticTestServiceRequest) error { + if request.RequestID == "" { + return ErrSyntheticRequestInvalid + } + if request.OrganizationID != r.testOrganizationID { + return ErrSyntheticOrganizationMismatch + } + if request.ServiceType != SyntheticTestServiceType { + return ErrUnsupportedSyntheticService + } + if len([]byte(request.Payload)) > r.maxTestPayloadBytes { + return ErrSyntheticPayloadTooLarge + } + return nil +} + +func decodeTestServiceRequest(payload json.RawMessage) (SyntheticTestServiceRequest, error) { + var request SyntheticTestServiceRequest + if len(payload) == 0 { + return request, ErrSyntheticRequestInvalid + } + if err := json.Unmarshal(payload, &request); err != nil { + return request, err + } + return request, nil +} + +func decodeTestServiceResponse(payload json.RawMessage) (SyntheticTestServiceResponse, error) { + var response SyntheticTestServiceResponse + if len(payload) == 0 { + return response, ErrSyntheticRequestInvalid + } + if err := json.Unmarshal(payload, &response); err != nil { + return response, err + } + return 
response, nil +} + +func shouldInvalidateObservation(observation SyntheticRouteObservation, version SyntheticRouteCacheVersion) bool { + if version.RouteVersion != "" && observation.RouteVersion != version.RouteVersion { + return true + } + if version.PolicyVersion != "" && observation.PolicyVersion != version.PolicyVersion { + return true + } + if version.PeerDirectoryVersion != "" && observation.PeerDirectoryVersion != version.PeerDirectoryVersion { + return true + } + return false +} diff --git a/agents/rap-node-agent/internal/mesh/synthetic_runtime_test.go b/agents/rap-node-agent/internal/mesh/synthetic_runtime_test.go new file mode 100644 index 0000000..30159e5 --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/synthetic_runtime_test.go @@ -0,0 +1,432 @@ +package mesh + +import ( + "context" + "encoding/json" + "errors" + "testing" + "time" +) + +type syntheticTestTransport struct { + nodes map[string]*SyntheticRuntime +} + +func (t syntheticTestTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) { + next := t.nodes[nextNodeID] + if next == nil { + return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable + } + return next.Receive(ctx, envelope) +} + +func TestSyntheticRuntimeDirectProbe(t *testing.T) { + route := testRoute("route-direct", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, route) + nodeB := testRuntime("node-b", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + ack, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-direct") + if err != nil { + t.Fatalf("send probe: %v", err) + } + if ack.MessageType != SyntheticMessageProbeAck { + t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck) + } + if ack.From.NodeID != "node-b" || ack.To.NodeID != "node-a" { + 
t.Fatalf("unexpected ack peers: from=%+v to=%+v", ack.From, ack.To) + } + payload := decodeAckPayload(t, ack) + if len(payload.Path) != 2 || payload.Path[0] != "node-a" || payload.Path[1] != "node-b" { + t.Fatalf("Path = %#v, want node-a -> node-b", payload.Path) + } + if nodeB.SnapshotMetrics().ProbeAcksCreated != 1 { + t.Fatalf("ProbeAcksCreated = %d, want 1", nodeB.SnapshotMetrics().ProbeAcksCreated) + } +} + +func TestSyntheticRuntimeSingleRelayProbe(t *testing.T) { + route := testRoute("route-relay", []string{"node-a", "node-r", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, route) + nodeR := testRuntime("node-r", transport, route) + nodeB := testRuntime("node-b", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-r"] = nodeR + transport.nodes["node-b"] = nodeB + + ack, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-relay") + if err != nil { + t.Fatalf("send probe: %v", err) + } + payload := decodeAckPayload(t, ack) + if len(payload.Path) != 3 || payload.Path[0] != "node-a" || payload.Path[1] != "node-r" || payload.Path[2] != "node-b" { + t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", payload.Path) + } + if nodeR.SnapshotMetrics().ProbesForwarded != 1 { + t.Fatalf("ProbesForwarded = %d, want 1", nodeR.SnapshotMetrics().ProbesForwarded) + } +} + +func TestSyntheticRuntimeDisabledRejectsProbe(t *testing.T) { + route := testRoute("route-disabled", []string{"node-a", "node-b"}) + nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: false, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}, + Routes: []SyntheticRoute{route}, + }) + _, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-disabled") + if !errors.Is(err, ErrMeshRuntimeDisabled) { + t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err) + } +} + +func 
TestSyntheticRuntimeRejectsWrongCluster(t *testing.T) { + route := testRoute("route-wrong-cluster", []string{"node-a", "node-b"}) + nodeB := testRuntime("node-b", syntheticTestTransport{}, route) + envelope := testEnvelope(route, "node-a", "node-b") + envelope.ClusterID = "cluster-2" + + _, err := nodeB.Receive(context.Background(), envelope) + if !errors.Is(err, ErrClusterMismatch) { + t.Fatalf("err = %v, want ErrClusterMismatch", err) + } +} + +func TestSyntheticRuntimeRejectsWrongNode(t *testing.T) { + route := testRoute("route-wrong-node", []string{"node-a", "node-b"}) + nodeB := testRuntime("node-b", syntheticTestTransport{}, route) + envelope := testEnvelope(route, "node-a", "node-c") + + _, err := nodeB.Receive(context.Background(), envelope) + if !errors.Is(err, ErrNodeMismatch) { + t.Fatalf("err = %v, want ErrNodeMismatch", err) + } +} + +func TestSyntheticRuntimeRejectsUnauthorizedChannel(t *testing.T) { + route := testRoute("route-unauthorized", []string{"node-a", "node-b"}) + nodeA := testRuntime("node-a", syntheticTestTransport{}, route) + + _, err := nodeA.SendProbe(context.Background(), route.RouteID, "rdp_render", "probe-unauthorized") + if !errors.Is(err, ErrUnauthorizedChannel) { + t.Fatalf("err = %v, want ErrUnauthorizedChannel", err) + } +} + +func TestSyntheticRuntimeRejectsExpiredRoute(t *testing.T) { + route := testRoute("route-expired", []string{"node-a", "node-b"}) + route.ExpiresAt = time.Now().UTC().Add(-time.Minute) + nodeA := testRuntime("node-a", syntheticTestTransport{}, route) + + _, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-expired") + if !errors.Is(err, ErrRouteExpired) { + t.Fatalf("err = %v, want ErrRouteExpired", err) + } +} + +func TestSyntheticRuntimeRejectsTTLExhaustion(t *testing.T) { + route := testRoute("route-ttl", []string{"node-a", "node-r", "node-b"}) + route.MaxTTL = 1 + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := 
testRuntime("node-a", transport, route) + nodeR := testRuntime("node-r", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-r"] = nodeR + + _, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-ttl") + if !errors.Is(err, ErrTTLExhausted) { + t.Fatalf("err = %v, want ErrTTLExhausted", err) + } +} + +func TestSyntheticRuntimeRejectsLoop(t *testing.T) { + route := testRoute("route-loop", []string{"node-a", "node-b"}) + nodeB := testRuntime("node-b", syntheticTestTransport{}, route) + envelope := testEnvelope(route, "node-a", "node-b") + envelope.Visited = []string{"node-a", "node-b"} + + _, err := nodeB.Receive(context.Background(), envelope) + if !errors.Is(err, ErrLoopDetected) { + t.Fatalf("err = %v, want ErrLoopDetected", err) + } +} + +func TestSyntheticRuntimeRejectsUnavailablePeer(t *testing.T) { + route := testRoute("route-missing-peer", []string{"node-a", "node-b"}) + nodeA := testRuntime("node-a", syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}, route) + + _, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-missing-peer") + if !errors.Is(err, ErrSyntheticPeerUnavailable) { + t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err) + } +} + +func TestSyntheticRuntimeRouteHealthProbeRecordsSuccess(t *testing.T) { + route := testRoute("route-health", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, route) + nodeB := testRuntime("node-b", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + result, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-health") + if err != nil { + t.Fatalf("send route health probe: %v", err) + } + if result.Ack.MessageType != SyntheticMessageRouteHealthAck { + t.Fatalf("MessageType = %q, want %q", 
result.Ack.MessageType, SyntheticMessageRouteHealthAck) + } + if result.FallbackUsed { + t.Fatal("FallbackUsed = true, want false") + } + observation, ok := nodeA.SnapshotRouteObservation(route.RouteID) + if !ok { + t.Fatal("route observation missing") + } + if observation.State != SyntheticRouteStateHealthy || observation.SuccessCount != 1 { + t.Fatalf("observation = %+v, want healthy success", observation) + } + if observation.PolicyVersion != "policy-v1" || observation.PeerDirectoryVersion != "peers-v1" || observation.RouteVersion != "route-v1" { + t.Fatalf("observation versions = %+v", observation) + } + metrics := nodeA.SnapshotMetrics() + if metrics.RouteHealthProbesSent != 1 || metrics.RouteDeliveriesSucceeded != 1 { + t.Fatalf("metrics = %+v, want health probe success", metrics) + } +} + +func TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig(t *testing.T) { + base := testRoute("route-effective-health", []string{"node-a", "node-old", "node-b"}) + effective := testRoute("route-effective-health", []string{"node-a", "node-new", "node-b"}) + effective.RouteVersion = "decision-v1" + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntimeWithRouteHealth("node-a", transport, []SyntheticRoute{base}, []SyntheticRoute{effective}) + nodeOld := testRuntimeWithRouteHealth("node-old", transport, []SyntheticRoute{base}, []SyntheticRoute{effective}) + nodeNew := testRuntimeWithRouteHealth("node-new", transport, []SyntheticRoute{base}, []SyntheticRoute{effective}) + nodeB := testRuntimeWithRouteHealth("node-b", transport, []SyntheticRoute{base}, []SyntheticRoute{effective}) + transport.nodes["node-a"] = nodeA + transport.nodes["node-old"] = nodeOld + transport.nodes["node-new"] = nodeNew + transport.nodes["node-b"] = nodeB + + health, err := nodeA.SendRouteHealthProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-health-effective") + if err != nil { + t.Fatalf("send route health probe: %v", err) + } 
+ healthPayload := decodeAckPayload(t, health.Ack) + if got, want := healthPayload.Path, []string{"node-a", "node-new", "node-b"}; !sameStrings(got, want) { + t.Fatalf("route health path = %v, want %v", got, want) + } + if nodeNew.SnapshotMetrics().ProbesForwarded != 1 { + t.Fatalf("node-new forwarded = %d, want 1", nodeNew.SnapshotMetrics().ProbesForwarded) + } + if nodeOld.SnapshotMetrics().ProbesForwarded != 0 { + t.Fatalf("node-old forwarded = %d, want 0 before regular probe", nodeOld.SnapshotMetrics().ProbesForwarded) + } + observation, ok := nodeA.SnapshotRouteObservation(base.RouteID) + if !ok || observation.RouteVersion != "decision-v1" { + t.Fatalf("route health observation = %+v, want decision route version", observation) + } + + probe, err := nodeA.SendProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-regular") + if err != nil { + t.Fatalf("send regular probe: %v", err) + } + probePayload := decodeAckPayload(t, probe) + if got, want := probePayload.Path, []string{"node-a", "node-old", "node-b"}; !sameStrings(got, want) { + t.Fatalf("regular probe path = %v, want %v", got, want) + } + if nodeOld.SnapshotMetrics().ProbesForwarded != 1 { + t.Fatalf("node-old forwarded = %d, want 1 after regular probe", nodeOld.SnapshotMetrics().ProbesForwarded) + } +} + +func TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable(t *testing.T) { + preferred := testRoute("route-preferred", []string{"node-a", "node-r", "node-b"}) + fallback := testRoute("route-fallback", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, preferred, fallback) + nodeB := testRuntime("node-b", transport, preferred, fallback) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + result, err := nodeA.SendRouteHealthProbeWithFallback( + context.Background(), + preferred.RouteID, + []string{fallback.RouteID}, + 
SyntheticChannelFabricControl, + "probe-fallback", + ) + if err != nil { + t.Fatalf("send route health probe with fallback: %v", err) + } + if !result.FallbackUsed { + t.Fatal("FallbackUsed = false, want true") + } + if result.SelectedRouteID != fallback.RouteID { + t.Fatalf("SelectedRouteID = %q, want %q", result.SelectedRouteID, fallback.RouteID) + } + preferredObservation, ok := nodeA.SnapshotRouteObservation(preferred.RouteID) + if !ok { + t.Fatal("preferred route observation missing") + } + if preferredObservation.State != SyntheticRouteStateFailed || preferredObservation.FailureCount != 1 { + t.Fatalf("preferred observation = %+v, want failed", preferredObservation) + } + fallbackObservation, ok := nodeA.SnapshotRouteObservation(fallback.RouteID) + if !ok { + t.Fatal("fallback route observation missing") + } + if fallbackObservation.State != SyntheticRouteStateHealthy || fallbackObservation.SuccessCount != 1 { + t.Fatalf("fallback observation = %+v, want healthy", fallbackObservation) + } + metrics := nodeA.SnapshotMetrics() + if metrics.FallbackRoutesUsed != 1 || metrics.WarmRoutesPromoted != 1 || metrics.RouteDeliveriesFailed != 1 { + t.Fatalf("metrics = %+v, want fallback promotion and one failed delivery", metrics) + } +} + +func TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange(t *testing.T) { + route := testRoute("route-cache", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, route) + nodeB := testRuntime("node-b", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + if _, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache"); err != nil { + t.Fatalf("send route health probe: %v", err) + } + if _, ok := nodeA.SnapshotRouteObservation(route.RouteID); !ok { + t.Fatal("route observation missing before invalidation") + } + + invalidated := 
nodeA.InvalidateRouteCache("policy_changed", SyntheticRouteCacheVersion{PolicyVersion: "policy-v2"}) + if invalidated != 1 { + t.Fatalf("invalidated = %d, want 1", invalidated) + } + if _, ok := nodeA.SnapshotRouteObservation(route.RouteID); ok { + t.Fatal("route observation still present after invalidation") + } + if nodeA.SnapshotMetrics().RouteCacheInvalidations != 1 { + t.Fatalf("RouteCacheInvalidations = %d, want 1", nodeA.SnapshotMetrics().RouteCacheInvalidations) + } +} + +func TestSyntheticRuntimeRouteCacheKeepsCurrentVersion(t *testing.T) { + route := testRoute("route-cache-current", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, route) + nodeB := testRuntime("node-b", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + if _, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache-current"); err != nil { + t.Fatalf("send route health probe: %v", err) + } + invalidated := nodeA.InvalidateRouteCache("same_versions", SyntheticRouteCacheVersion{ + RouteVersion: "route-v1", + PolicyVersion: "policy-v1", + PeerDirectoryVersion: "peers-v1", + }) + if invalidated != 0 { + t.Fatalf("invalidated = %d, want 0", invalidated) + } + if _, ok := nodeA.SnapshotRouteObservation(route.RouteID); !ok { + t.Fatal("route observation missing after same-version invalidation") + } +} + +func TestSyntheticRuntimeRouteHealthDisabledRejects(t *testing.T) { + route := testRoute("route-health-disabled", []string{"node-a", "node-b"}) + nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: false, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}, + Routes: []SyntheticRoute{route}, + }) + + _, err := nodeA.SendRouteHealthProbeWithFallback( + context.Background(), + route.RouteID, + []string{"route-fallback"}, + SyntheticChannelFabricControl, + 
"probe-disabled-health", + ) + if !errors.Is(err, ErrMeshRuntimeDisabled) { + t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err) + } +} + +func testRuntime(nodeID string, transport SyntheticTransport, routes ...SyntheticRoute) *SyntheticRuntime { + return NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: true, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID}, + Routes: routes, + Transport: transport, + MaxTTL: 8, + MaxHops: 8, + }) +} + +func testRuntimeWithRouteHealth(nodeID string, transport SyntheticTransport, routes []SyntheticRoute, routeHealthRoutes []SyntheticRoute) *SyntheticRuntime { + return NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: true, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID}, + Routes: routes, + RouteHealthRoutes: routeHealthRoutes, + Transport: transport, + MaxTTL: 8, + MaxHops: 8, + }) +} + +func testRoute(routeID string, hops []string) SyntheticRoute { + return SyntheticRoute{ + RouteID: routeID, + ClusterID: "cluster-1", + SourceNodeID: hops[0], + DestinationNodeID: hops[len(hops)-1], + Hops: hops, + AllowedChannels: []string{SyntheticChannelFabricControl}, + ExpiresAt: time.Now().UTC().Add(time.Hour), + MaxTTL: 8, + MaxHops: 8, + RouteVersion: "route-v1", + PolicyVersion: "policy-v1", + PeerDirectoryVersion: "peers-v1", + } +} + +func testEnvelope(route SyntheticRoute, fromNodeID string, toNodeID string) SyntheticEnvelope { + payload, _ := json.Marshal(SyntheticProbePayload{ + ProbeID: "probe-test", + SentAt: time.Now().UTC(), + }) + return SyntheticEnvelope{ + ProtocolVersion: ProtocolVersion, + RouteID: route.RouteID, + ClusterID: route.ClusterID, + From: PeerIdentity{ClusterID: route.ClusterID, NodeID: fromNodeID}, + To: PeerIdentity{ClusterID: route.ClusterID, NodeID: toNodeID}, + Channel: SyntheticChannelFabricControl, + MessageType: SyntheticMessageProbe, + TTL: 8, + HopCount: 1, + Visited: []string{fromNodeID}, + Sequence: 1, + SentAt: time.Now().UTC(), + Payload: payload, + } +} + 
+func decodeAckPayload(t *testing.T, envelope SyntheticEnvelope) SyntheticProbeAckPayload { + t.Helper() + var payload SyntheticProbeAckPayload + if err := json.Unmarshal(envelope.Payload, &payload); err != nil { + t.Fatalf("decode ack payload: %v", err) + } + return payload +} diff --git a/agents/rap-node-agent/internal/mesh/synthetic_test_service_test.go b/agents/rap-node-agent/internal/mesh/synthetic_test_service_test.go new file mode 100644 index 0000000..05095cb --- /dev/null +++ b/agents/rap-node-agent/internal/mesh/synthetic_test_service_test.go @@ -0,0 +1,235 @@ +package mesh + +import ( + "context" + "encoding/json" + "errors" + "strings" + "testing" +) + +func TestSyntheticRuntimeTestServiceDirectRoute(t *testing.T) { + route := testServiceRoute("route-test-service-direct", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, route) + nodeB := testRuntime("node-b", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + result, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-direct", "hello")) + if err != nil { + t.Fatalf("send test service: %v", err) + } + if result.Ack.MessageType != SyntheticMessageTestServiceAck { + t.Fatalf("MessageType = %q, want %q", result.Ack.MessageType, SyntheticMessageTestServiceAck) + } + if result.Response.EchoPayload != "hello" { + t.Fatalf("EchoPayload = %q, want hello", result.Response.EchoPayload) + } + if len(result.Response.Path) != 2 || result.Response.Path[0] != "node-a" || result.Response.Path[1] != "node-b" { + t.Fatalf("Path = %#v, want node-a -> node-b", result.Response.Path) + } + metrics := nodeA.SnapshotMetrics() + if metrics.TestServiceRequestsSent != 1 || metrics.TestServiceDeliveriesSucceeded != 1 { + t.Fatalf("metrics = %+v, want one test service success", metrics) + } +} + +func 
TestSyntheticRuntimeTestServiceSingleRelayRoute(t *testing.T) { + route := testServiceRoute("route-test-service-relay", []string{"node-a", "node-r", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, route) + nodeR := testRuntime("node-r", transport, route) + nodeB := testRuntime("node-b", transport, route) + transport.nodes["node-a"] = nodeA + transport.nodes["node-r"] = nodeR + transport.nodes["node-b"] = nodeB + + result, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-relay", "relay")) + if err != nil { + t.Fatalf("send test service: %v", err) + } + if len(result.Response.Path) != 3 || result.Response.Path[0] != "node-a" || result.Response.Path[1] != "node-r" || result.Response.Path[2] != "node-b" { + t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", result.Response.Path) + } + if nodeR.SnapshotMetrics().ProbesForwarded != 1 { + t.Fatalf("ProbesForwarded = %d, want 1", nodeR.SnapshotMetrics().ProbesForwarded) + } +} + +func TestSyntheticRuntimeTestServiceUsesForcedFallback(t *testing.T) { + preferred := testServiceRoute("route-test-service-preferred", []string{"node-a", "node-r", "node-b"}) + fallback := testServiceRoute("route-test-service-fallback", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := testRuntime("node-a", transport, preferred, fallback) + nodeB := testRuntime("node-b", transport, preferred, fallback) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + result, err := nodeA.SendTestServiceWithFallback( + context.Background(), + preferred.RouteID, + []string{fallback.RouteID}, + SyntheticChannelRouteControl, + testServiceRequest("request-fallback", "fallback"), + ) + if err != nil { + t.Fatalf("send test service with fallback: %v", err) + } + if !result.FallbackUsed { + t.Fatal("FallbackUsed = 
false, want true") + } + if result.SelectedRouteID != fallback.RouteID { + t.Fatalf("SelectedRouteID = %q, want %q", result.SelectedRouteID, fallback.RouteID) + } + if result.Response.EchoPayload != "fallback" { + t.Fatalf("EchoPayload = %q, want fallback", result.Response.EchoPayload) + } + metrics := nodeA.SnapshotMetrics() + if metrics.TestServiceFallbacksUsed != 1 || metrics.TestServiceDeliveriesFailed != 1 || metrics.TestServiceDeliveriesSucceeded != 1 { + t.Fatalf("metrics = %+v, want fallback success with one preferred failure", metrics) + } +} + +func TestSyntheticRuntimeTestServiceRejectsWrongOrganization(t *testing.T) { + route := testServiceRoute("route-test-service-wrong-org", []string{"node-a", "node-b"}) + nodeA := testRuntime("node-a", syntheticTestTransport{}, route) + request := testServiceRequest("request-wrong-org", "hello") + request.OrganizationID = "org-other" + + _, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request) + if !errors.Is(err, ErrSyntheticOrganizationMismatch) { + t.Fatalf("err = %v, want ErrSyntheticOrganizationMismatch", err) + } +} + +func TestSyntheticRuntimeTestServiceRejectsUnsupportedService(t *testing.T) { + route := testServiceRoute("route-test-service-unsupported", []string{"node-a", "node-b"}) + nodeA := testRuntime("node-a", syntheticTestTransport{}, route) + request := testServiceRequest("request-unsupported", "hello") + request.ServiceType = "rdp" + + _, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request) + if !errors.Is(err, ErrUnsupportedSyntheticService) { + t.Fatalf("err = %v, want ErrUnsupportedSyntheticService", err) + } +} + +func TestSyntheticRuntimeTestServiceRejectsOversizedPayload(t *testing.T) { + route := testServiceRoute("route-test-service-oversized", []string{"node-a", "node-b"}) + nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: true, + Local: PeerIdentity{ClusterID: "cluster-1", 
NodeID: "node-a"}, + Routes: []SyntheticRoute{route}, + MaxTestPayloadBytes: 4, + }) + + _, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-oversized", "12345")) + if !errors.Is(err, ErrSyntheticPayloadTooLarge) { + t.Fatalf("err = %v, want ErrSyntheticPayloadTooLarge", err) + } +} + +func TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel(t *testing.T) { + route := testServiceRoute("route-test-service-channel", []string{"node-a", "node-b"}) + nodeA := testRuntime("node-a", syntheticTestTransport{}, route) + + _, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelFabricControl, testServiceRequest("request-channel", "hello")) + if !errors.Is(err, ErrUnauthorizedChannel) { + t.Fatalf("err = %v, want ErrUnauthorizedChannel", err) + } +} + +func TestSyntheticRuntimeTestServiceDisabledRejects(t *testing.T) { + route := testServiceRoute("route-test-service-disabled", []string{"node-a", "node-b"}) + nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: false, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}, + Routes: []SyntheticRoute{route}, + }) + + _, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-disabled", "hello")) + if !errors.Is(err, ErrMeshRuntimeDisabled) { + t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err) + } +} + +func TestSyntheticRelaySchedulerAcceptsTestServiceMessage(t *testing.T) { + scheduler := testRelayScheduler() + envelope := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageTestService, 42) + envelope.Payload = mustMarshalTestServiceRequest(testServiceRequest("request-relay-scheduler", "hello")) + + if _, err := scheduler.Enqueue(envelope); err != nil { + t.Fatalf("enqueue test service: %v", err) + } + dequeued, err := scheduler.Dequeue() + if err != nil { + t.Fatalf("dequeue test service: %v", err) + } + if 
dequeued.MessageType != SyntheticMessageTestService { + t.Fatalf("MessageType = %q, want %q", dequeued.MessageType, SyntheticMessageTestService) + } +} + +func testServiceRoute(routeID string, hops []string) SyntheticRoute { + route := testRoute(routeID, hops) + route.AllowedChannels = []string{SyntheticChannelRouteControl} + return route +} + +func testServiceRequest(requestID string, payload string) SyntheticTestServiceRequest { + return SyntheticTestServiceRequest{ + RequestID: requestID, + OrganizationID: SyntheticDefaultTestOrganizationID, + ServiceType: SyntheticTestServiceType, + Payload: payload, + } +} + +func mustMarshalTestServiceRequest(request SyntheticTestServiceRequest) []byte { + payload, err := json.Marshal(request) + if err != nil { + panic(err) + } + return payload +} + +func TestSyntheticRuntimeTestServiceRejectsMissingRequestID(t *testing.T) { + route := testServiceRoute("route-test-service-missing-request", []string{"node-a", "node-b"}) + nodeA := testRuntime("node-a", syntheticTestTransport{}, route) + request := testServiceRequest("", "hello") + + _, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request) + if !errors.Is(err, ErrSyntheticRequestInvalid) { + t.Fatalf("err = %v, want ErrSyntheticRequestInvalid", err) + } +} + +func TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary(t *testing.T) { + route := testServiceRoute("route-test-service-max", []string{"node-a", "node-b"}) + transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}} + nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: true, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}, + Routes: []SyntheticRoute{route}, + Transport: transport, + MaxTestPayloadBytes: 8, + }) + nodeB := NewSyntheticRuntime(SyntheticRuntimeConfig{ + Enabled: true, + Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}, + Routes: []SyntheticRoute{route}, + Transport: transport, + 
MaxTestPayloadBytes: 8, + }) + transport.nodes["node-a"] = nodeA + transport.nodes["node-b"] = nodeB + + result, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-max", strings.Repeat("a", 8))) + if err != nil { + t.Fatalf("send test service: %v", err) + } + if result.Response.EchoPayload != strings.Repeat("a", 8) { + t.Fatalf("EchoPayload = %q", result.Response.EchoPayload) + } +} diff --git a/agents/rap-node-agent/internal/relay/relay.go b/agents/rap-node-agent/internal/relay/relay.go new file mode 100644 index 0000000..a566e61 --- /dev/null +++ b/agents/rap-node-agent/internal/relay/relay.go @@ -0,0 +1,18 @@ +package relay + +import "errors" + +var ErrProductionForwardingDisabled = errors.New("relay skeleton does not forward production payloads before an approved production mesh stage") + +type Skeleton struct { + ClusterID string + NodeID string +} + +func (s Skeleton) AcceptControlConnection() bool { + return s.ClusterID != "" && s.NodeID != "" +} + +func (s Skeleton) ForwardProductionPayload([]byte) error { + return ErrProductionForwardingDisabled +} diff --git a/agents/rap-node-agent/internal/relay/relay_test.go b/agents/rap-node-agent/internal/relay/relay_test.go new file mode 100644 index 0000000..ab16b18 --- /dev/null +++ b/agents/rap-node-agent/internal/relay/relay_test.go @@ -0,0 +1,16 @@ +package relay + +import ( + "errors" + "testing" +) + +func TestRelaySkeletonAcceptsControlButNotPayloadForwarding(t *testing.T) { + relay := Skeleton{ClusterID: "cluster-1", NodeID: "node-relay"} + if !relay.AcceptControlConnection() { + t.Fatal("relay skeleton should accept control connection metadata") + } + if err := relay.ForwardProductionPayload([]byte("rdp")); !errors.Is(err, ErrProductionForwardingDisabled) { + t.Fatalf("err = %v, want ErrProductionForwardingDisabled", err) + } +} diff --git a/agents/rap-node-agent/internal/state/state.go b/agents/rap-node-agent/internal/state/state.go 
new file mode 100644 index 0000000..9f92054 --- /dev/null +++ b/agents/rap-node-agent/internal/state/state.go @@ -0,0 +1,129 @@ +package state + +import ( + "crypto/rand" + "encoding/base64" + "encoding/json" + "errors" + "os" + "path/filepath" + "time" +) + +const FileName = "identity.json" + +type Identity struct { + NodeID string `json:"node_id"` + ClusterID string `json:"cluster_id"` + NodeName string `json:"node_name"` + NodeFingerprint string `json:"node_fingerprint"` + PublicKey string `json:"public_key"` + IdentityStatus string `json:"identity_status"` + PendingJoinRequestID string `json:"pending_join_request_id,omitempty"` + ClusterAuthorityPublicKey string `json:"cluster_authority_public_key,omitempty"` + ClusterAuthorityFingerprint string `json:"cluster_authority_fingerprint,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +func LoadOrCreate(dir, clusterID, nodeName string) (Identity, error) { + path := filepath.Join(dir, FileName) + existing, err := Load(path) + if err == nil { + return existing, nil + } + if !errors.Is(err, os.ErrNotExist) { + return Identity{}, err + } + now := time.Now().UTC() + fingerprint, err := randomToken("rap-node-fp") + if err != nil { + return Identity{}, err + } + publicKey, err := randomToken("rap-node-pub") + if err != nil { + return Identity{}, err + } + identity := Identity{ + ClusterID: clusterID, + NodeName: nodeName, + NodeFingerprint: fingerprint, + PublicKey: publicKey, + IdentityStatus: "new", + CreatedAt: now, + UpdatedAt: now, + } + if err := Save(path, identity); err != nil { + return Identity{}, err + } + return identity, nil +} + +func Load(path string) (Identity, error) { + payload, err := os.ReadFile(path) + if err != nil { + return Identity{}, err + } + var identity Identity + if err := json.Unmarshal(payload, &identity); err != nil { + return Identity{}, err + } + return identity, nil +} + +func Save(path string, identity Identity) error { + if err := 
os.MkdirAll(filepath.Dir(path), 0o700); err != nil { + return err + } + identity.UpdatedAt = time.Now().UTC() + payload, err := json.MarshalIndent(identity, "", " ") + if err != nil { + return err + } + return os.WriteFile(path, payload, 0o600) +} + +func MarkEnrollmentSubmitted(dir, clusterID, joinRequestID string) (Identity, error) { + path := filepath.Join(dir, FileName) + identity, err := Load(path) + if err != nil { + return Identity{}, err + } + identity.ClusterID = clusterID + identity.PendingJoinRequestID = joinRequestID + identity.IdentityStatus = "pending_approval" + if err := Save(path, identity); err != nil { + return Identity{}, err + } + return identity, nil +} + +func MarkApproved(dir string, nodeID, clusterID, status string) (Identity, error) { + return MarkApprovedWithAuthority(dir, nodeID, clusterID, status, "", "") +} + +func MarkApprovedWithAuthority(dir string, nodeID, clusterID, status, authorityPublicKey, authorityFingerprint string) (Identity, error) { + path := filepath.Join(dir, FileName) + identity, err := Load(path) + if err != nil { + return Identity{}, err + } + identity.NodeID = nodeID + identity.ClusterID = clusterID + identity.IdentityStatus = status + identity.PendingJoinRequestID = "" + identity.ClusterAuthorityPublicKey = authorityPublicKey + identity.ClusterAuthorityFingerprint = authorityFingerprint + if err := Save(path, identity); err != nil { + return Identity{}, err + } + return identity, nil +} + +func randomToken(prefix string) (string, error) { + var random [32]byte + if _, err := rand.Read(random[:]); err != nil { + return "", err + } + return prefix + "_" + base64.RawURLEncoding.EncodeToString(random[:]), nil +} diff --git a/agents/rap-node-agent/internal/state/state_test.go b/agents/rap-node-agent/internal/state/state_test.go new file mode 100644 index 0000000..a9814ce --- /dev/null +++ b/agents/rap-node-agent/internal/state/state_test.go @@ -0,0 +1,55 @@ +package state + +import ( + "path/filepath" + "testing" +) + 
+func TestLoadOrCreatePersistsIdentity(t *testing.T) { + dir := t.TempDir() + identity, err := LoadOrCreate(dir, "cluster-1", "node-a") + if err != nil { + t.Fatalf("load or create: %v", err) + } + if identity.NodeFingerprint == "" || identity.PublicKey == "" { + t.Fatalf("identity missing generated fields: %+v", identity) + } + loaded, err := Load(filepath.Join(dir, FileName)) + if err != nil { + t.Fatalf("load identity: %v", err) + } + if loaded.NodeFingerprint != identity.NodeFingerprint { + t.Fatal("identity fingerprint was not persisted") + } +} + +func TestMarkApprovedUpdatesIdentity(t *testing.T) { + dir := t.TempDir() + if _, err := LoadOrCreate(dir, "cluster-1", "node-a"); err != nil { + t.Fatalf("load or create: %v", err) + } + if _, err := MarkEnrollmentSubmitted(dir, "cluster-1", "join-request-1"); err != nil { + t.Fatalf("mark enrollment submitted: %v", err) + } + approved, err := MarkApproved(dir, "node-1", "cluster-1", "active") + if err != nil { + t.Fatalf("mark approved: %v", err) + } + if approved.NodeID != "node-1" || approved.IdentityStatus != "active" || approved.PendingJoinRequestID != "" { + t.Fatalf("unexpected approved identity: %+v", approved) + } +} + +func TestMarkApprovedWithAuthorityPinsClusterAuthority(t *testing.T) { + dir := t.TempDir() + if _, err := LoadOrCreate(dir, "cluster-1", "node-a"); err != nil { + t.Fatalf("load or create: %v", err) + } + approved, err := MarkApprovedWithAuthority(dir, "node-1", "cluster-1", "active", "public-key-b64", "rap-ca-ed25519-test") + if err != nil { + t.Fatalf("mark approved with authority: %v", err) + } + if approved.ClusterAuthorityPublicKey != "public-key-b64" || approved.ClusterAuthorityFingerprint != "rap-ca-ed25519-test" { + t.Fatalf("authority pin was not persisted: %+v", approved) + } +} diff --git a/agents/rap-node-agent/internal/supervisor/supervisor.go b/agents/rap-node-agent/internal/supervisor/supervisor.go new file mode 100644 index 0000000..1bc6863 --- /dev/null +++ 
b/agents/rap-node-agent/internal/supervisor/supervisor.go @@ -0,0 +1,40 @@ +package supervisor + +import ( + "context" + + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/client" +) + +type Supervisor interface { + Apply(ctx context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) +} + +type StubSupervisor struct { + Version string +} + +func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) { + statuses := make([]client.WorkloadStatusRequest, 0, len(desired)) + for _, workload := range desired { + state := "degraded" + if workload.DesiredState == "disabled" { + state = "stopped" + } + version := workload.Version + if version == "" { + version = s.Version + } + statuses = append(statuses, client.WorkloadStatusRequest{ + ReportedState: state, + RuntimeMode: workload.RuntimeMode, + Version: version, + StatusPayload: map[string]any{ + "supervisor": "stub", + "desired_state": workload.DesiredState, + "service_type": workload.ServiceType, + }, + }) + } + return statuses, nil +} diff --git a/agents/rap-node-agent/internal/supervisor/supervisor_test.go b/agents/rap-node-agent/internal/supervisor/supervisor_test.go new file mode 100644 index 0000000..9ffed26 --- /dev/null +++ b/agents/rap-node-agent/internal/supervisor/supervisor_test.go @@ -0,0 +1,35 @@ +package supervisor + +import ( + "context" + "testing" + + "github.com/example/remote-access-platform/agents/rap-node-agent/internal/client" +) + +func TestStubSupervisorReportsDegradedForEnabledWorkload(t *testing.T) { + statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{ + {ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"}, + }) + if err != nil { + t.Fatalf("apply desired workload: %v", err) + } + if len(statuses) != 1 { + t.Fatalf("statuses length = %d", len(statuses)) + } + if statuses[0].ReportedState != 
"degraded" { + t.Fatalf("ReportedState = %q", statuses[0].ReportedState) + } +} + +func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) { + statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{ + {ServiceType: "relay-node", DesiredState: "disabled", RuntimeMode: "container"}, + }) + if err != nil { + t.Fatalf("apply desired workload: %v", err) + } + if statuses[0].ReportedState != "stopped" { + t.Fatalf("ReportedState = %q", statuses[0].ReportedState) + } +} diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..90fa09a --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,19 @@ +FROM golang:1.23-bookworm AS build + +WORKDIR /src + +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . +RUN CGO_ENABLED=0 GOOS=linux go build -o /out/rap-api ./cmd/api + +FROM debian:bookworm-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=build /out/rap-api /usr/local/bin/rap-api + +ENTRYPOINT ["/usr/local/bin/rap-api"] diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 0000000..ab858f6 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,270 @@ +# Backend Foundation + +Production-oriented Go backend skeleton for the remote access platform. 
+ +## Scope included + +- configuration loading from environment +- HTTP server bootstrap with graceful shutdown +- PostgreSQL and Redis connectivity wiring +- migrations scaffold +- auth foundation with access/refresh tokens, hashed refresh rotation, trusted devices, and persisted auth sessions +- persistent session storage foundation for remote sessions, attachments, resource policies, and audit events +- session broker orchestration for start, attach, detach, takeover, terminate, failure, and detached-session recovery +- Redis-backed live session state, controller binding, attach tokens, heartbeat keys, worker routing, and reconnect support +- Redis-backed worker registration, lease lifecycle, heartbeat tracking, stale lease recovery, and routing queues +- worker assignment queueing and worker event ingestion for the minimal real RDP worker runtime +- websocket live plane with attach handshake, ping/pong heartbeat, state messages, takeover detection, and transport reconnect flow +- module boundaries for auth, resources, session broker, and websocket gateway +- worker registry scaffold to prepare later RDP worker integration +- per-resource certificate verification policy for RDP connections with `strict` default and explicit `ignore` override +- platform-core v2 foundations for organizations, memberships, identity sources, nodes, and node-agent control plane +- Data Plane v1 contract scaffolding for optional session response candidates/tokens, with current backend gateway behavior preserved as fallback +- production resource secret-readiness guard for rejecting plaintext credential-like metadata and requiring `secret_ref` for RDP/VNC/SSH resources in production mode +- encrypted resource secret storage/resolver MVP for production `secret_ref` usage + +## Entry point + +Run the API from `cmd/api`. 
+ +## Local dev + +- backend: `pwsh -File scripts/smoke/run-backend.ps1` +- infra: `pwsh -File scripts/smoke/start-infra.ps1` +- migrations: `pwsh -File scripts/smoke/apply-migrations.ps1` +- worker image build: `docker build --tag rap-rdp-worker:dev --file workers/rdp-worker/Dockerfile workers/rdp-worker` +- end-to-end smoke path: [scripts/smoke/README.md](../scripts/smoke/README.md) + +## Configuration + +Use `configs/api.example.env` as the starting point for local environment variables. + +Resource secret-readiness is controlled by `APP_ENV`: + +- in `APP_ENV=production` or `APP_ENV=prod`, RDP/VNC/SSH resources must carry + `secret_ref` and must not include plaintext credential-like fields in + `metadata` +- in development and smoke environments, plaintext metadata remains allowed + until the encrypted secret resolver is implemented +- the production guard is enforced both on resource create/update and on + session start, so legacy plaintext resources cannot be started in production + accidentally +- `SECRET_ENCRYPTION_KEY_B64` or `SECRET_ENCRYPTION_KEY_FILE` supplies the + AES-256-GCM master key for the MVP encrypted store; production mode refuses + to start without one +- `SECRET_ENCRYPTION_KEY_ID` labels the active key version in stored records +- `PUT /api/v1/resources/{resourceID}/secret` creates or rotates a resource + secret and updates `resources.secret_ref`; plaintext is never returned by the + API +- session assignment keeps PostgreSQL metadata safe: `remote_sessions.metadata` + stores `secret_ref`, while resolved credentials are merged only into the + transient worker assignment after session/worker/lease checks + +See `docs/architecture/SECURITY_SECRETS_READINESS.md` for the target +secret-reference model and remaining resolver/PKI gaps.
+ +Data Plane v1 contract scaffolding is controlled by: + +- `DATA_PLANE_TOKEN_TTL`, default `1m` +- `DATA_PLANE_TOKEN_PRIVATE_KEY_FILE`, optional path to an RSA private key PEM used to sign RS256 data-plane tokens +- `DATA_PLANE_TOKEN_PRIVATE_KEY_PEM`, optional inline RSA private key PEM; used when file path is not configured +- `DATA_PLANE_BACKEND_GATEWAY_URL`, default `/api/v1/gateway/ws` +- `DATA_PLANE_DIRECT_WORKER_WSS_URL_TEMPLATE`, optional; supports `{worker_id}` replacement +- `DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME`, default `false`; advertises + `runtime_transport=json_v1` only after the worker direct JSON bridge is + deployed and verified +- `DATA_PLANE_DIRECT_WORKER_BINARY_RENDER`, default `false`; when the direct + JSON runtime is enabled, advertises `render_transport=binary_v1` so DP-2 + clients can request binary render frames over direct worker WSS. Binary + render candidates also advertise `supported_color_modes=["full_color","grayscale"]` + and `default_color_mode="full_color"` for the DP-3A grayscale foundation. +- `DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE`, default `smoke_insecure`; allowed + values are `smoke_insecure`, `public_ca`, and `platform_ca`. +- `DATA_PLANE_DIRECT_WORKER_TLS_CA_REF`, optional label for the platform CA or + trust bundle version advertised to clients. + +Data-plane tokens are RS256-signed. The backend must hold only the private key; +workers receive only the matching public key for validation. If no private key +is configured, the backend omits the optional `data_plane` offer and the +backend gateway fallback remains unchanged. + +If no direct worker WSS URL template is configured, session responses still include the backend gateway fallback candidate only. +If the URL template is configured but `DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME` +is `false`, the direct candidate is still present for contract visibility but is +not marked data-capable; DP-1D Windows clients will skip it and use the backend +gateway fallback. 
+If `DATA_PLANE_DIRECT_WORKER_BINARY_RENDER` is `false`, direct worker WSS +remains JSON/base64 for render. If it is `true`, only direct worker WSS render +is binary; backend gateway fallback remains JSON/base64. +In production, the backend does not advertise direct worker WSS when +`DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure`; it keeps the backend +gateway fallback instead. Trusted direct candidates include `tls_trust_mode`, +`production_trusted`, `smoke_only`, and optional `tls_ca_ref` metadata. See +`docs/architecture/DIRECT_WORKER_TLS_PKI.md`. + +## Module layout + +- `internal/platform` shared runtime, config, infra, and bootstrap concerns +- `internal/modules/auth` auth and trusted-device boundary +- `internal/modules/organization` organization model, org roles, and memberships +- `internal/modules/identitysource` local/LDAP/OIDC identity source model and future mapping foundations +- `internal/modules/resource` remote resource inventory boundary +- `internal/modules/sessionbroker` persistent session lifecycle, orchestration, audit, and Redis live-state boundary +- `internal/modules/sessiongateway` websocket attach/reconnect/takeover transport boundary +- `internal/modules/worker` worker registration, lease coordination, and control-plane routing boundary for future C++ RDP workers +- `internal/modules/node` node inventory, capabilities, enabled services, update policy, and partition state +- `internal/modules/nodeagent` node-agent registration, health, service status, and update/rollback control interface +- `pkg/contracts` cross-module contracts for sessions and worker control + +## Backend responsibilities + +- PostgreSQL remains the source of truth for auth sessions, devices, remote sessions, attachments, resource policies, and audit events +- Redis is used only for live routing and coordination: attach tokens, controller bindings, live session cache, worker registration, worker leases, heartbeats, and routing queues +- `worker:control:` 
carries worker assignments, `worker:queue:` carries live control/input envelopes, and `worker:events` carries worker-reported lifecycle events back into broker processing +- Session broker owns state transitions and orchestration rules; websocket handlers call broker services instead of talking to postgres repositories directly +- Worker runtime stays behind interfaces and Redis coordination so the backend remains isolated from FreeRDP implementation details while the minimal real RDP worker plugs into the control plane +- RDP certificate verification is configured per resource through `certificate_verification_mode` +- resources are now org-scoped in PostgreSQL and remote sessions persist their owning organization without changing the proven worker/session runtime contracts +- session start/attach/takeover responses may include optional `data_plane` candidates and a short-lived signed data-plane token for DP-1 direct worker WSS migration; existing clients continue to use the current gateway path, and direct realtime use remains gated by explicit candidate metadata + +## Authorization model + +- `platform_admin` and `platform_recovery_admin` have global access across organizations, resources, and sessions +- in `INSTALLATION_AUTHORITY_MODE=strict`, platform-admin power is effective only + when the user also has a valid signed row in `platform_role_grants`; changing + `users.platform_role` in PostgreSQL alone no longer grants owner access +- first-owner bootstrap is available at + `POST /api/v1/installation/bootstrap-owner` and requires a Product Root + Ed25519 signature over an activation manifest in strict mode +- production (`APP_ENV=production` or `prod`) requires strict installation + authority plus `INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_B64` or + `INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_FILE` +- legacy/dev installs can keep database-role behavior, and insecure first-owner + bootstrap is available only when + `INSTALLATION_INSECURE_BOOTSTRAP_ENABLED=true` +- 
`org_owner` and `org_admin` can create and update resources inside their organization and can manage any remote session inside that organization +- active non-admin memberships such as `org_operator`, `org_member`, and `org_viewer` are deny-by-default for admin actions; they can only access org-scoped reads and operate on their own session flows where the session broker explicitly allows it +- session start always authorizes the actor against the resource organization before worker reservation +- attach, detach, takeover, and terminate authorize against the owning remote session organization before any state transition is written +- worker-facing events do not bypass this model for user-originated commands; internal worker failure and heartbeat paths remain broker-internal control-plane operations + +## Migration safety + +- `000005_platform_core_v2` bootstraps a single `default` organization and backfills existing `resources.organization_id` and `remote_sessions.organization_id` into that organization before setting `NOT NULL` +- `000006_default_org_memberships_backfill` safely restores access continuity by inserting missing active memberships for existing users into the `default` organization +- the backfill is idempotent because it only inserts rows missing under the `(organization_id, user_id)` uniqueness constraint +- platform administrators are backfilled as `org_owner` in the default organization, while other existing users are backfilled as `org_member` +- if `000005` fails before the `NOT NULL` step, PostgreSQL rolls back the transaction and leaves pre-v2 rows untouched; if `000006` is rerun, it skips already-created memberships rather than duplicating them + +## Platform-Core V2 Notes + +- `organizations`, `organization_memberships`, and `organization_roles` establish multi-tenant ownership and basic org-scoped authorization boundaries +- `identity_sources` and `identity_mappings` are foundation-only in this phase; full LDAP/OIDC sync and claim/group 
ingestion are intentionally deferred +- `nodes`, `node_capabilities`, `node_services`, `node_update_policies`, `node_partition_states`, and `node_agent_update_runs` provide the first control-plane model for node and node-agent lifecycle +- current proven RDP session lifecycle remains preserved: the session broker still orchestrates the same worker/session behavior, but it now records organization ownership via org-scoped resources +- PostgreSQL remains the source of truth for organizations, memberships, org-scoped resources, identity sources, nodes, node-agent state, and session lifecycle state + +## Resource Certificate Verification + +- `strict` is the default and keeps normal certificate validation enabled in the worker runtime +- `ignore` must be explicitly stored on the resource and allows that one RDP connection to skip certificate validation +- the backend passes this policy through session assignment data; it is not a global backend toggle + +## Messaging Model + +- HTTP errors now use a structured envelope: + - `error.code` + - `error.message_key` + - `error.fallback_message` + - `error.details` + - `error.trace_id` +- `internal/platform/httpx` owns error normalization and trace-id generation so handlers can keep calling `WriteError(...)` without changing business logic. +- For `5xx` responses, user-facing payloads are normalized to an English generic fallback message while logs and diagnostics can still keep raw internal details elsewhere. +- For `4xx` responses, stable `code` and `message_key` are derived from the current fallback message, so clients can localize without depending on raw English text as the primary contract. + +## WebSocket Messaging + +- Session gateway envelopes keep the existing `type` and `payload` contract. 
+- User-facing websocket events now also include `event` with: + - `code` + - `message_key` + - `fallback_message` + - `details` + - `trace_id` +- `session.taken_over`, terminal `session.state`, `transport.closed`, and protocol-level errors now carry this structured event object. +- Existing payload semantics remain intact for compatibility with the already proven session lifecycle. + +## Message Rules + +- Keep English as the only development language for `fallback_message`, logs, and diagnostics. +- New HTTP handlers should prefer `httpx.WriteError(...)` for user-facing failures instead of hand-building `"error": "..."` JSON. +- New websocket user-facing notifications should populate `TransportEnvelope.Event` with a stable `code` and `message_key`. +- Do not use raw human-readable English text as the primary client contract; it should only remain as fallback text. +- This messaging layer is now runtime-proven against the live Windows smoke flow for invalid-login errors, websocket takeover delivery, websocket state fallback rendering, and worker-death failure handling. +## Clipboard Policy + +RDP text clipboard is controlled per resource through `resource_policies.clipboard_mode`. +Allowed values are `disabled`, `client_to_server`, `server_to_client`, and +`bidirectional`; the default is `disabled`. The legacy `clipboard_enabled` +column is retained only for compatibility and migration/backfill, while new +runtime decisions use `clipboard_mode`. + +Clipboard enforcement happens in the real data path: + +- `sessionbroker.ResourcePolicy.ClipboardMode` is loaded from PostgreSQL and + embedded into the session assignment metadata sent to the worker. +- `sessiongateway.Module.handleEnvelope` blocks client-to-server clipboard + envelopes unless the session is `active` and the policy allows that direction. 
+- `worker.EventProcessor` sends worker-originated clipboard text through + `sessionbroker.Service.UpdateWorkerClipboardText`, which applies the same + active-state and server-to-client policy checks before updating live state. +- Clipboard messages carry `sequence_id`, `origin`, and `content_hash` so + clients and workers can avoid feedback loops across reattach/takeover paths. +- Redis stores clipboard text only as transient live state for routing to the + active controller; PostgreSQL remains authoritative for policy/session state. + +## File Upload Policy + +Stage 5.1 introduces client-to-server file upload as a policy-gated RDP +feature. The authoritative policy field is +`resource_policies.file_transfer_mode`; allowed values are `disabled`, +`client_to_server`, `server_to_client`, and `bidirectional`, but only +`client_to_server` behavior is implemented in this stage. The default is +`disabled`. The legacy `file_transfer_enabled` column is retained only as a +derived compatibility flag and must not be treated as the primary policy. + +Enforcement is deliberately duplicated in the real data path: + +- `resource.Module` exposes `file_transfer_mode` in resource create, update, + list, and read payloads. +- `sessionbroker.Service.StartRemoteSession` embeds `file_transfer_mode` into + assignment metadata and requests the worker `file-transfer` capability only + when client-to-server upload is allowed. +- `sessiongateway.Module.handleFileUploadStart` and + `handleFileUploadChunk` require an active session, current controller, + allowed policy mode, valid UUID `transfer_id`, safe file name, 25 MiB max + file size, and 256 KiB max chunk size before routing chunks to the worker. +- Redis is used only to route bounded upload envelopes to the worker. The file + itself is written by the worker to controlled worker storage; PostgreSQL + remains authoritative for policy and session state. 
+ +## File Download Policy + +Stage 5.2 adds a runtime-proven server-to-client download path for RDP. The +policy field remains `resource_policies.file_transfer_mode`; `server_to_client` +and `bidirectional` allow download, while `disabled` and `client_to_server` +block it. The default remains `disabled`. + +The v1 download model uses only the restricted `RAP_Transfers\ToClient` +drop-zone inside the existing per-session visible transfer directory. Backend +gateway accepts only `file_download.start`, `file_download.ack`, and +`file_download.cancel` from the current controller of an active session and +routes them to the worker after policy validation. Worker-origin +`file_download.*` events are stored only as transient live state for +backend-gateway fallback delivery; PostgreSQL remains authoritative for +session/resource/policy state and must not store file contents. + +The direct worker WSS path is also lifecycle-gated: detach returns +`file_download.blocked`, old-controller takeover returns `session.taken_over`, +and worker failure closes the direct transport after PostgreSQL transitions the +session to `failed`. 
diff --git a/backend/cmd/api/main.go b/backend/cmd/api/main.go new file mode 100644 index 0000000..0945b62 --- /dev/null +++ b/backend/cmd/api/main.go @@ -0,0 +1,24 @@ +package main + +import ( + "context" + "log" + "os/signal" + "syscall" + + "github.com/example/remote-access-platform/backend/internal/platform/runtime" +) + +func main() { + ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer stop() + + app, err := runtime.NewApp(ctx) + if err != nil { + log.Fatalf("bootstrap app: %v", err) + } + + if err := app.Run(ctx); err != nil { + log.Fatalf("run app: %v", err) + } +} diff --git a/backend/cmd/ws-smoke-client/main.go b/backend/cmd/ws-smoke-client/main.go new file mode 100644 index 0000000..f074beb --- /dev/null +++ b/backend/cmd/ws-smoke-client/main.go @@ -0,0 +1,94 @@ +package main + +import ( + "context" + "flag" + "fmt" + "log" + "net/url" + "os" + "os/signal" + "strings" + "time" + + "github.com/gorilla/websocket" +) + +func main() { + var ( + gatewayURL = flag.String("gateway-url", "ws://127.0.0.1:8080/api/v1/gateway/ws", "websocket gateway url") + attachToken = flag.String("attach-token", "", "short-lived attach token") + duration = flag.Duration("duration", 90*time.Second, "maximum time to keep the client attached") + heartbeatInterval = flag.Duration("heartbeat-interval", 10*time.Second, "client heartbeat interval") + exitOnTakenOver = flag.Bool("exit-on-taken-over", true, "exit successfully after receiving session.taken_over") + ) + flag.Parse() + + if *attachToken == "" { + log.Fatal("attach-token is required") + } + + ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) + defer cancel() + if *duration > 0 { + var timeoutCancel context.CancelFunc + ctx, timeoutCancel = context.WithTimeout(ctx, *duration) + defer timeoutCancel() + } + + targetURL, err := url.Parse(*gatewayURL) + if err != nil { + log.Fatalf("parse gateway url: %v", err) + } + query := targetURL.Query() + 
query.Set("attach_token", *attachToken) + targetURL.RawQuery = query.Encode() + + conn, _, err := websocket.DefaultDialer.DialContext(ctx, targetURL.String(), nil) + if err != nil { + log.Fatalf("dial websocket: %v", err) + } + defer conn.Close() + + done := make(chan error, 1) + go func() { + for { + _, payload, err := conn.ReadMessage() + if err != nil { + done <- err + return + } + fmt.Println(string(payload)) + if *exitOnTakenOver && containsTakenOver(payload) { + done <- nil + return + } + } + }() + + heartbeatTicker := time.NewTicker(*heartbeatInterval) + defer heartbeatTicker.Stop() + + for { + select { + case err := <-done: + if err != nil { + log.Fatalf("websocket read: %v", err) + } + return + case <-heartbeatTicker.C: + if err := conn.WriteJSON(map[string]any{"type": "heartbeat"}); err != nil { + log.Fatalf("heartbeat write: %v", err) + } + case <-ctx.Done(): + if err := ctx.Err(); err != nil && err != context.Canceled && err != context.DeadlineExceeded { + log.Fatalf("context ended: %v", err) + } + return + } + } +} + +func containsTakenOver(payload []byte) bool { + return strings.Contains(string(payload), `"type":"session.taken_over"`) +} diff --git a/backend/configs/api.example.env b/backend/configs/api.example.env new file mode 100644 index 0000000..7e6b995 --- /dev/null +++ b/backend/configs/api.example.env @@ -0,0 +1,44 @@ +APP_NAME=rap-api +APP_ENV=development +HTTP_HOST=0.0.0.0 +HTTP_PORT=8080 +HTTP_READ_TIMEOUT=15s +HTTP_WRITE_TIMEOUT=15s +HTTP_IDLE_TIMEOUT=60s +HTTP_SHUTDOWN_TIMEOUT=10s +POSTGRES_DSN=postgres://rap_user:rap_password@localhost:5432/remote_access_platform?sslmode=disable +POSTGRES_MAX_CONNS=20 +POSTGRES_MIN_CONNS=2 +POSTGRES_CONNECT_TIMEOUT=5s +REDIS_ADDR=localhost:6379 +REDIS_PASSWORD= +REDIS_DB=0 +REDIS_DIAL_TIMEOUT=5s +AUTH_ACCESS_TOKEN_TTL=15m +AUTH_REFRESH_TOKEN_TTL=720h +AUTH_ISSUER=rap-api +AUTH_ACCESS_TOKEN_SECRET=change-me-access-secret +AUTH_REFRESH_HASH_SECRET=change-me-refresh-hash-secret +DATA_PLANE_TOKEN_TTL=1m 
+DATA_PLANE_TOKEN_PRIVATE_KEY_FILE= +DATA_PLANE_TOKEN_PRIVATE_KEY_PEM= +DATA_PLANE_BACKEND_GATEWAY_URL=/api/v1/gateway/ws +DATA_PLANE_DIRECT_WORKER_WSS_URL_TEMPLATE= +DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME=false +DATA_PLANE_DIRECT_WORKER_BINARY_RENDER=false +DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure +DATA_PLANE_DIRECT_WORKER_TLS_CA_REF= +SECRET_ENCRYPTION_KEY_B64= +SECRET_ENCRYPTION_KEY_FILE= +SECRET_ENCRYPTION_KEY_ID=local-v1 +SESSION_HEARTBEAT_TTL=90s +SESSION_DETACH_GRACE_PERIOD=30m +SESSION_ATTACH_TOKEN_TTL=2m +SESSION_LIVE_STATE_TTL=2m +SESSION_RECOVERY_BATCH_SIZE=100 +WORKER_LEASE_TTL=45s +WORKER_HEARTBEAT_TTL=15s +WORKER_STALE_LEASE_GRACE_PERIOD=30s +WEBSOCKET_WRITE_TIMEOUT=10s +WEBSOCKET_PING_INTERVAL=20s +WEBSOCKET_PONG_WAIT=40s diff --git a/backend/go.mod b/backend/go.mod new file mode 100644 index 0000000..28998cc --- /dev/null +++ b/backend/go.mod @@ -0,0 +1,23 @@ +module github.com/example/remote-access-platform/backend + +go 1.23.2 + +require ( + github.com/go-chi/chi/v5 v5.2.1 + github.com/golang-jwt/jwt/v5 v5.2.2 + github.com/google/uuid v1.6.0 + github.com/gorilla/websocket v1.5.3 + github.com/jackc/pgx/v5 v5.7.4 + github.com/redis/go-redis/v9 v9.8.0 + golang.org/x/crypto v0.37.0 +) + +require ( + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + github.com/jackc/pgpassfile v1.0.0 // indirect + github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect + github.com/jackc/puddle/v2 v2.2.2 // indirect + golang.org/x/sync v0.13.0 // indirect + golang.org/x/text v0.24.0 // indirect +) diff --git a/backend/go.sum b/backend/go.sum new file mode 100644 index 0000000..1942278 --- /dev/null +++ b/backend/go.sum @@ -0,0 +1,46 @@ +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 
h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/go-chi/chi/v5 v5.2.1 h1:KOIHODQj58PmL80G2Eak4WdvUzjSJSm0vG72crDCqb8= +github.com/go-chi/chi/v5 v5.2.1/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops= +github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8= +github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= +github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= +github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= +github.com/jackc/pgx/v5 v5.7.4 h1:9wKznZrhWa2QiHL+NjTSPP6yjl3451BX3imWDnokYlg= 
+github.com/jackc/pgx/v5 v5.7.4/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ= +github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= +github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/redis/go-redis/v9 v9.8.0 h1:q3nRvjrlge/6UD7eTu/DSg2uYiU2mCL0G/uzBWqhicI= +github.com/redis/go-redis/v9 v9.8.0/go.mod h1:huWgSWd8mW6+m0VPhJjSSQ+d6Nh1VICQ6Q5lHuCH/Iw= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= +golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= +golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= +golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/backend/internal/modules/auth/errors.go 
b/backend/internal/modules/auth/errors.go new file mode 100644 index 0000000..19a4e3f --- /dev/null +++ b/backend/internal/modules/auth/errors.go @@ -0,0 +1,19 @@ +package auth + +import "errors" + +var ( + ErrInvalidCredentials = errors.New("invalid credentials") + ErrInvalidRefreshToken = errors.New("invalid refresh token") + ErrAuthSessionRevoked = errors.New("auth session revoked") + ErrDeviceRevoked = errors.New("device revoked") + ErrDeviceNotTrusted = errors.New("device not trusted") + ErrAuthSessionNotFound = errors.New("auth session not found") + ErrTrustedDeviceMissing = errors.New("trusted device not found") + + ErrInstallationAlreadyBootstrapped = errors.New("installation is already bootstrapped") + ErrInstallationActivationRequired = errors.New("signed installation activation is required") + ErrInvalidInstallationActivation = errors.New("invalid installation activation") + ErrInsecureBootstrapDisabled = errors.New("insecure installation bootstrap is disabled") + ErrInvalidBootstrapOwner = errors.New("invalid bootstrap owner") +) diff --git a/backend/internal/modules/auth/models.go b/backend/internal/modules/auth/models.go new file mode 100644 index 0000000..0410689 --- /dev/null +++ b/backend/internal/modules/auth/models.go @@ -0,0 +1,114 @@ +package auth + +import ( + "encoding/json" + "time" +) + +type DeviceTrustStatus string + +const ( + DeviceTrustStatusPending DeviceTrustStatus = "pending" + DeviceTrustStatusTrusted DeviceTrustStatus = "trusted" + DeviceTrustStatusRevoked DeviceTrustStatus = "revoked" +) + +type User struct { + ID string + Email string + PasswordHash string + MFAEnabled bool + CreatedAt time.Time + UpdatedAt time.Time +} + +type Device struct { + ID string + UserID string + Fingerprint string + Label string + TrustStatus DeviceTrustStatus + TrustedAt *time.Time + LastSeenAt *time.Time + RevokedAt *time.Time + RevokedReason *string + CreatedAt time.Time + UpdatedAt time.Time +} + +type AuthSession struct { + ID string + UserID 
string + DeviceID string + RefreshTokenHash string + RefreshExpiresAt time.Time + LastSeenAt *time.Time + LastRotatedAt *time.Time + RevokedAt *time.Time + RevokedReason *string + CreatedAt time.Time + UpdatedAt time.Time +} + +type LoginCommand struct { + Email string `json:"email"` + Password string `json:"password"` + DeviceFingerprint string `json:"device_fingerprint"` + DeviceLabel string `json:"device_label"` + TrustDevice bool `json:"trust_device"` +} + +type RefreshCommand struct { + RefreshToken string `json:"refresh_token"` +} + +type BootstrapOwnerCommand struct { + Email string `json:"email"` + Password string `json:"password"` + ActivationPayload json.RawMessage `json:"activation_payload"` + ActivationSignature string `json:"activation_signature"` +} + +type RevokeAuthSessionCommand struct { + UserID string `json:"user_id"` + AuthSessionID string `json:"auth_session_id"` + Reason string `json:"reason"` +} + +type RevokeDeviceCommand struct { + UserID string `json:"user_id"` + DeviceID string `json:"device_id"` + Reason string `json:"reason"` +} + +type TokenPair struct { + AccessToken string `json:"access_token"` + AccessTokenExpiresAt time.Time `json:"access_token_expires_at"` + RefreshToken string `json:"refresh_token"` + RefreshTokenExpiresAt time.Time `json:"refresh_token_expires_at"` +} + +type AuthResult struct { + User User `json:"user"` + Device Device `json:"device"` + AuthSession AuthSession `json:"auth_session"` + Tokens TokenPair `json:"tokens"` +} + +type InstallationStatus struct { + Bootstrapped bool `json:"bootstrapped"` + AuthorityState string `json:"authority_state"` + InstallID string `json:"install_id,omitempty"` + BootstrappedOwnerEmail string `json:"bootstrapped_owner_email,omitempty"` + BootstrappedAt *time.Time `json:"bootstrapped_at,omitempty"` + AuthorityMode string `json:"authority_mode"` + StrictAuthority bool `json:"strict_authority"` + RootFingerprint string `json:"root_fingerprint,omitempty"` + InsecureBootstrapAllowed 
bool `json:"insecure_bootstrap_allowed"` +} + +type BootstrapOwnerResult struct { + Installation InstallationStatus `json:"installation"` + User User `json:"user"` + PlatformRole string `json:"platform_role"` +} diff --git a/backend/internal/modules/auth/module.go b/backend/internal/modules/auth/module.go new file mode 100644 index 0000000..8af90e3 --- /dev/null +++ b/backend/internal/modules/auth/module.go @@ -0,0 +1,173 @@ +package auth + +import ( + "encoding/json" + "net/http" + + "github.com/go-chi/chi/v5" + + "github.com/example/remote-access-platform/backend/internal/platform/httpx" + "github.com/example/remote-access-platform/backend/internal/platform/module" +) + +type Module struct { + service *Service +} + +func NewModule(deps module.Dependencies, service *Service) *Module { + return &Module{service: service} +} + +func (m *Module) Name() string { + return "auth" +} + +func (m *Module) RegisterRoutes(router chi.Router) { + router.Route("/installation", func(r chi.Router) { + r.Get("/status", m.handleInstallationStatus) + r.Post("/bootstrap-owner", m.handleBootstrapOwner) + }) + router.Route("/auth", func(r chi.Router) { + r.Post("/login", m.handleLogin) + r.Post("/refresh", m.handleRefresh) + r.Post("/sessions/revoke", m.handleRevokeAuthSession) + r.Get("/devices", m.handleTrustedDevices) + r.Post("/devices/{deviceID}/revoke", m.handleRevokeTrustedDevice) + }) +} + +func (m *Module) handleInstallationStatus(w http.ResponseWriter, r *http.Request) { + status, err := m.service.InstallationStatus(r.Context()) + if err != nil { + statusCode, message := m.service.MapError(err) + httpx.WriteError(w, statusCode, message) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"installation": status}) +} + +func (m *Module) handleBootstrapOwner(w http.ResponseWriter, r *http.Request) { + var cmd BootstrapOwnerCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid installation bootstrap 
payload") + return + } + result, err := m.service.BootstrapOwner(r.Context(), cmd) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusCreated, result) +} + +func (m *Module) handleLogin(w http.ResponseWriter, r *http.Request) { + var cmd LoginCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid login payload") + return + } + + result, err := m.service.Login(r.Context(), cmd) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + + httpx.WriteJSON(w, http.StatusOK, result) +} + +func (m *Module) handleRefresh(w http.ResponseWriter, r *http.Request) { + var cmd RefreshCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid refresh payload") + return + } + + result, err := m.service.Refresh(r.Context(), cmd) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + + httpx.WriteJSON(w, http.StatusOK, result) +} + +func (m *Module) handleRevokeAuthSession(w http.ResponseWriter, r *http.Request) { + var cmd RevokeAuthSessionCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid auth session revoke payload") + return + } + + if err := m.service.RevokeAuthSession(r.Context(), cmd); err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{ + "status": "revoked", + "message": httpx.NewMessage( + "auth.session.revoked", + "status.auth.session.revoked", + "Auth session revoked.", + nil, + "", + ), + }) +} + +func (m *Module) handleTrustedDevices(w http.ResponseWriter, r *http.Request) { + userID := r.URL.Query().Get("user_id") + if userID == "" { 
+ httpx.WriteError(w, http.StatusBadRequest, "user_id is required") + return + } + + devices, err := m.service.ListTrustedDevices(r.Context(), userID) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + + httpx.WriteJSON(w, http.StatusOK, map[string]any{ + "devices": devices, + }) +} + +func (m *Module) handleRevokeTrustedDevice(w http.ResponseWriter, r *http.Request) { + var payload struct { + UserID string `json:"user_id"` + Reason string `json:"reason"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid device revoke payload") + return + } + + err := m.service.RevokeTrustedDevice(r.Context(), RevokeDeviceCommand{ + UserID: payload.UserID, + DeviceID: chi.URLParam(r, "deviceID"), + Reason: payload.Reason, + }) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{ + "status": "revoked", + "message": httpx.NewMessage( + "auth.device.revoked", + "status.auth.device.revoked", + "Trusted device revoked.", + nil, + "", + ), + }) +} diff --git a/backend/internal/modules/auth/postgres_store.go b/backend/internal/modules/auth/postgres_store.go new file mode 100644 index 0000000..78b976c --- /dev/null +++ b/backend/internal/modules/auth/postgres_store.go @@ -0,0 +1,525 @@ +package auth + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strings" + "time" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres" +) + +type postgresStore struct { + db postgresplatform.DBTX +} + +type PostgresTransactor struct { + pool *pgxpool.Pool +} + +func NewPostgresStore(pool *pgxpool.Pool) Store { + return &postgresStore{db: pool} +} + +func NewPostgresTransactor(pool *pgxpool.Pool) *PostgresTransactor { + return 
&PostgresTransactor{pool: pool} +} + +func (t *PostgresTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error { + return postgresplatform.WithTransaction(ctx, t.pool, func(tx pgx.Tx) error { + return fn(&postgresStore{db: tx}) + }) +} + +func (s *postgresStore) Users() UserRepository { + return &postgresUserRepository{db: s.db} +} + +func (s *postgresStore) Devices() DeviceRepository { + return &postgresDeviceRepository{db: s.db} +} + +func (s *postgresStore) AuthSessions() AuthSessionRepository { + return &postgresAuthSessionRepository{db: s.db} +} + +func (s *postgresStore) Installation() InstallationRepository { + return &postgresInstallationRepository{db: s.db} +} + +type postgresUserRepository struct { + db postgresplatform.DBTX +} + +type postgresDeviceRepository struct { + db postgresplatform.DBTX +} + +type postgresAuthSessionRepository struct { + db postgresplatform.DBTX +} + +type postgresInstallationRepository struct { + db postgresplatform.DBTX +} + +func (r *postgresUserRepository) GetByEmail(ctx context.Context, email string) (*User, error) { + const query = ` +SELECT id::text, email, password_hash, mfa_enabled, created_at, updated_at +FROM users +WHERE email = $1 +` + return scanOptionalUser(r.db.QueryRow(ctx, query, email)) +} + +func (r *postgresUserRepository) GetByID(ctx context.Context, userID string) (*User, error) { + const query = ` +SELECT id::text, email, password_hash, mfa_enabled, created_at, updated_at +FROM users +WHERE id = $1::uuid +` + return scanOptionalUser(r.db.QueryRow(ctx, query, userID)) +} + +func (r *postgresDeviceRepository) Upsert(ctx context.Context, params UpsertDeviceParams) (*Device, error) { + const query = ` +INSERT INTO devices ( + user_id, + device_fingerprint, + device_label, + trust_status, + trusted_at, + last_seen_at, + created_at, + updated_at +) VALUES ( + $1::uuid, + $2, + $3, + CASE WHEN $4 THEN 'trusted' ELSE 'pending' END, + CASE WHEN $4 THEN $5::timestamptz ELSE 
NULL::timestamptz END, + $5::timestamptz, + $5::timestamptz, + $5::timestamptz +) +ON CONFLICT (user_id, device_fingerprint) DO UPDATE SET + device_label = EXCLUDED.device_label, + last_seen_at = EXCLUDED.last_seen_at, + updated_at = EXCLUDED.updated_at, + trust_status = CASE + WHEN devices.trust_status = 'revoked' THEN devices.trust_status + WHEN devices.trust_status = 'trusted' THEN devices.trust_status + WHEN EXCLUDED.trust_status = 'trusted' THEN 'trusted' + ELSE devices.trust_status + END, + trusted_at = CASE + WHEN devices.trust_status = 'trusted' THEN devices.trusted_at + WHEN EXCLUDED.trust_status = 'trusted' THEN EXCLUDED.trusted_at + ELSE devices.trusted_at + END +RETURNING + id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''), + trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at +` + return scanDevice(r.db.QueryRow(ctx, query, + params.UserID, + params.Fingerprint, + params.Label, + params.TrustRequested, + params.SeenAt, + )) +} + +func (r *postgresDeviceRepository) GetByIDForUser(ctx context.Context, userID, deviceID string) (*Device, error) { + const query = ` +SELECT id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''), + trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at +FROM devices +WHERE id = $1::uuid AND user_id = $2::uuid +` + device, err := scanDevice(r.db.QueryRow(ctx, query, deviceID, userID)) + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return device, err +} + +func (r *postgresDeviceRepository) ListTrustedByUser(ctx context.Context, userID string) ([]Device, error) { + const query = ` +SELECT id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''), + trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at +FROM devices +WHERE user_id = $1::uuid AND trust_status = 'trusted' AND revoked_at IS NULL +ORDER BY created_at DESC +` + rows, err := r.db.Query(ctx, 
query, userID) + if err != nil { + return nil, fmt.Errorf("query trusted devices: %w", err) + } + defer rows.Close() + + var devices []Device + for rows.Next() { + device, err := scanDevice(rows) + if err != nil { + return nil, err + } + devices = append(devices, *device) + } + return devices, rows.Err() +} + +func (r *postgresDeviceRepository) Revoke(ctx context.Context, params RevokeDeviceParams) error { + const query = ` +UPDATE devices +SET trust_status = 'revoked', + revoked_at = $3, + revoked_reason = $4, + updated_at = $3 +WHERE id = $1::uuid AND user_id = $2::uuid +` + if _, err := r.db.Exec(ctx, query, params.DeviceID, params.UserID, params.RevokedAt, params.Reason); err != nil { + return fmt.Errorf("revoke device: %w", err) + } + return nil +} + +func (r *postgresAuthSessionRepository) Create(ctx context.Context, session AuthSession) error { + const query = ` +INSERT INTO auth_sessions ( + id, + user_id, + device_id, + refresh_token_hash, + refresh_expires_at, + last_seen_at, + created_at, + updated_at +) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8) +` + if _, err := r.db.Exec(ctx, query, + session.ID, + session.UserID, + session.DeviceID, + session.RefreshTokenHash, + session.RefreshExpiresAt, + session.LastSeenAt, + session.CreatedAt, + session.UpdatedAt, + ); err != nil { + return fmt.Errorf("create auth session: %w", err) + } + return nil +} + +func (r *postgresAuthSessionRepository) GetByID(ctx context.Context, authSessionID string) (*AuthSession, error) { + return r.getByID(ctx, authSessionID, "") +} + +func (r *postgresAuthSessionRepository) GetByIDForUpdate(ctx context.Context, authSessionID string) (*AuthSession, error) { + return r.getByID(ctx, authSessionID, " FOR UPDATE") +} + +func (r *postgresAuthSessionRepository) getByID(ctx context.Context, authSessionID string, suffix string) (*AuthSession, error) { + query := ` +SELECT id::text, user_id::text, device_id::text, refresh_token_hash, refresh_expires_at, + last_seen_at, 
last_rotated_at, revoked_at, revoked_reason, created_at, updated_at +FROM auth_sessions +WHERE id = $1::uuid` + suffix + session, err := scanAuthSession(r.db.QueryRow(ctx, query, authSessionID)) + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return session, err +} + +func (r *postgresAuthSessionRepository) Rotate(ctx context.Context, params RotateAuthSessionParams) error { + const query = ` +UPDATE auth_sessions +SET refresh_token_hash = $2, + refresh_expires_at = $3, + last_seen_at = $4, + last_rotated_at = $5, + updated_at = $5 +WHERE id = $1::uuid AND revoked_at IS NULL +` + if _, err := r.db.Exec(ctx, query, + params.AuthSessionID, + params.RefreshTokenHash, + params.RefreshExpiresAt, + params.LastSeenAt, + params.LastRotatedAt, + ); err != nil { + return fmt.Errorf("rotate auth session: %w", err) + } + return nil +} + +func (r *postgresAuthSessionRepository) Touch(ctx context.Context, authSessionID string, seenAt time.Time) error { + const query = ` +UPDATE auth_sessions +SET last_seen_at = $2, updated_at = $2 +WHERE id = $1::uuid AND revoked_at IS NULL +` + if _, err := r.db.Exec(ctx, query, authSessionID, seenAt); err != nil { + return fmt.Errorf("touch auth session: %w", err) + } + return nil +} + +func (r *postgresAuthSessionRepository) Revoke(ctx context.Context, params RevokeAuthSessionParams) error { + const query = ` +UPDATE auth_sessions +SET revoked_at = $3, + revoked_reason = $4, + updated_at = $3 +WHERE id = $1::uuid AND user_id = $2::uuid AND revoked_at IS NULL +` + if _, err := r.db.Exec(ctx, query, params.AuthSessionID, params.UserID, params.RevokedAt, params.Reason); err != nil { + return fmt.Errorf("revoke auth session: %w", err) + } + return nil +} + +func (r *postgresAuthSessionRepository) RevokeByDevice(ctx context.Context, userID, deviceID, reason string, revokedAt time.Time) error { + const query = ` +UPDATE auth_sessions +SET revoked_at = $3, + revoked_reason = $4, + updated_at = $3 +WHERE user_id = $1::uuid AND device_id = 
$2::uuid AND revoked_at IS NULL +` + if _, err := r.db.Exec(ctx, query, userID, deviceID, revokedAt, reason); err != nil { + return fmt.Errorf("revoke auth sessions by device: %w", err) + } + return nil +} + +func (r *postgresInstallationRepository) GetStatus(ctx context.Context) (*InstallationAuthorityState, error) { + const query = ` +SELECT install_id, authority_state, product_root_key_fingerprint, bootstrapped_owner_email, bootstrapped_at +FROM installation_authority +WHERE id = 1 +` + status := &InstallationAuthorityState{} + if err := r.db.QueryRow(ctx, query).Scan( + &status.InstallID, + &status.AuthorityState, + &status.ProductRootFingerprint, + &status.BootstrappedOwnerEmail, + &status.BootstrappedAt, + ); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return &InstallationAuthorityState{ + Bootstrapped: false, + AuthorityState: "unbootstrapped", + }, nil + } + return nil, fmt.Errorf("get installation status: %w", err) + } + status.Bootstrapped = true + return status, nil +} + +func (r *postgresInstallationRepository) BootstrapOwner(ctx context.Context, params BootstrapOwnerParams) (*User, error) { + var existingInstallID string + if err := r.db.QueryRow(ctx, ` +SELECT install_id +FROM installation_authority +WHERE id = 1 +FOR UPDATE +`).Scan(&existingInstallID); err != nil && !errors.Is(err, pgx.ErrNoRows) { + return nil, fmt.Errorf("lock installation authority: %w", err) + } else if err == nil { + return nil, ErrInstallationAlreadyBootstrapped + } + + email := strings.ToLower(strings.TrimSpace(params.Email)) + now := params.Now.UTC() + user, err := scanOptionalUser(r.db.QueryRow(ctx, ` +INSERT INTO users (email, password_hash, mfa_enabled, platform_role, created_at, updated_at) +VALUES ($1, $2, FALSE, $3, $4, $4) +ON CONFLICT (email) DO UPDATE SET + password_hash = EXCLUDED.password_hash, + platform_role = EXCLUDED.platform_role, + updated_at = EXCLUDED.updated_at +RETURNING id::text, email, password_hash, mfa_enabled, created_at, updated_at +`, 
email, params.PasswordHash, params.Role, now)) + if err != nil { + return nil, fmt.Errorf("upsert bootstrap owner: %w", err) + } + if user == nil { + return nil, fmt.Errorf("upsert bootstrap owner returned no user") + } + + payload := json.RawMessage(`{}`) + if len(params.ActivationPayload) > 0 { + payload = params.ActivationPayload + } + if _, err := r.db.Exec(ctx, ` +INSERT INTO installation_authority ( + id, + install_id, + authority_state, + product_root_key_fingerprint, + activation_payload, + activation_signature, + bootstrapped_owner_email, + bootstrapped_at, + created_at, + updated_at +) VALUES ( + 1, + $1, + 'active', + $2, + $3::jsonb, + $4, + $5, + $6, + $6, + $6 +) +`, params.InstallID, params.ProductRootKeyFingerprint, []byte(payload), params.ActivationSignature, email, now); err != nil { + return nil, fmt.Errorf("insert installation authority: %w", err) + } + + if _, err := r.db.Exec(ctx, ` +UPDATE platform_role_grants +SET revoked_at = $4 +WHERE user_id = $1::uuid + AND role = $2 + AND install_id = $3 + AND revoked_at IS NULL +`, user.ID, params.Role, params.InstallID, now); err != nil { + return nil, fmt.Errorf("revoke superseded platform role grants: %w", err) + } + if _, err := r.db.Exec(ctx, ` +INSERT INTO platform_role_grants ( + user_id, + role, + install_id, + grant_payload, + grant_signature, + grant_source, + granted_at, + expires_at, + metadata +) VALUES ( + $1::uuid, + $2, + $3, + $4::jsonb, + $5, + $6, + $7, + $8, + '{"bootstrap_owner":true}'::jsonb +) +`, user.ID, params.Role, params.InstallID, []byte(payload), params.ActivationSignature, params.GrantSource, now, params.ExpiresAt); err != nil { + return nil, fmt.Errorf("insert platform role grant: %w", err) + } + + if _, err := r.db.Exec(ctx, ` +INSERT INTO organization_memberships ( + organization_id, + user_id, + role_id, + status, + invited_by_user_id, + created_at, + updated_at +) +SELECT id, $1::uuid, 'org_owner', 'active', $1::uuid, $2, $2 +FROM organizations +WHERE slug = 
'default' +ON CONFLICT (organization_id, user_id) DO UPDATE SET + role_id = 'org_owner', + status = 'active', + invited_by_user_id = EXCLUDED.invited_by_user_id, + updated_at = EXCLUDED.updated_at +`, user.ID, now); err != nil { + return nil, fmt.Errorf("upsert default organization owner membership: %w", err) + } + + return user, nil +} + +type scanner interface { + Scan(dest ...any) error +} + +func scanOptionalUser(row scanner) (*User, error) { + user := &User{} + if err := row.Scan( + &user.ID, + &user.Email, + &user.PasswordHash, + &user.MFAEnabled, + &user.CreatedAt, + &user.UpdatedAt, + ); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return nil, fmt.Errorf("scan user: %w", err) + } + return user, nil +} + +func scanDevice(row scanner) (*Device, error) { + device := &Device{} + var trustedAt, lastSeenAt, revokedAt *time.Time + var revokedReason *string + if err := row.Scan( + &device.ID, + &device.UserID, + &device.Fingerprint, + &device.Label, + &device.TrustStatus, + &trustedAt, + &lastSeenAt, + &revokedAt, + &revokedReason, + &device.CreatedAt, + &device.UpdatedAt, + ); err != nil { + return nil, fmt.Errorf("scan device: %w", err) + } + device.TrustedAt = trustedAt + device.LastSeenAt = lastSeenAt + device.RevokedAt = revokedAt + device.RevokedReason = revokedReason + return device, nil +} + +func scanAuthSession(row scanner) (*AuthSession, error) { + session := &AuthSession{} + var lastSeenAt, lastRotatedAt, revokedAt *time.Time + var revokedReason *string + if err := row.Scan( + &session.ID, + &session.UserID, + &session.DeviceID, + &session.RefreshTokenHash, + &session.RefreshExpiresAt, + &lastSeenAt, + &lastRotatedAt, + &revokedAt, + &revokedReason, + &session.CreatedAt, + &session.UpdatedAt, + ); err != nil { + return nil, fmt.Errorf("scan auth session: %w", err) + } + session.LastSeenAt = lastSeenAt + session.LastRotatedAt = lastRotatedAt + session.RevokedAt = revokedAt + session.RevokedReason = revokedReason + return 
session, nil +} diff --git a/backend/internal/modules/auth/repository.go b/backend/internal/modules/auth/repository.go new file mode 100644 index 0000000..0f31164 --- /dev/null +++ b/backend/internal/modules/auth/repository.go @@ -0,0 +1,97 @@ +package auth + +import ( + "context" + "encoding/json" + "time" +) + +type UserRepository interface { + GetByEmail(ctx context.Context, email string) (*User, error) + GetByID(ctx context.Context, userID string) (*User, error) +} + +type DeviceRepository interface { + Upsert(ctx context.Context, params UpsertDeviceParams) (*Device, error) + GetByIDForUser(ctx context.Context, userID, deviceID string) (*Device, error) + ListTrustedByUser(ctx context.Context, userID string) ([]Device, error) + Revoke(ctx context.Context, params RevokeDeviceParams) error +} + +type AuthSessionRepository interface { + Create(ctx context.Context, session AuthSession) error + GetByID(ctx context.Context, authSessionID string) (*AuthSession, error) + GetByIDForUpdate(ctx context.Context, authSessionID string) (*AuthSession, error) + Rotate(ctx context.Context, params RotateAuthSessionParams) error + Touch(ctx context.Context, authSessionID string, seenAt time.Time) error + Revoke(ctx context.Context, params RevokeAuthSessionParams) error + RevokeByDevice(ctx context.Context, userID, deviceID, reason string, revokedAt time.Time) error +} + +type InstallationRepository interface { + GetStatus(ctx context.Context) (*InstallationAuthorityState, error) + BootstrapOwner(ctx context.Context, params BootstrapOwnerParams) (*User, error) +} + +type Store interface { + Users() UserRepository + Devices() DeviceRepository + AuthSessions() AuthSessionRepository + Installation() InstallationRepository +} + +type Transactor interface { + WithinTransaction(ctx context.Context, fn func(store Store) error) error +} + +type UpsertDeviceParams struct { + UserID string + Fingerprint string + Label string + TrustRequested bool + SeenAt time.Time +} + +type 
RotateAuthSessionParams struct {
+	AuthSessionID    string
+	RefreshTokenHash string
+	RefreshExpiresAt time.Time
+	LastSeenAt       time.Time
+	LastRotatedAt    time.Time
+}
+
+type RevokeAuthSessionParams struct {
+	AuthSessionID string
+	UserID        string
+	Reason        string
+	RevokedAt     time.Time
+}
+
+type RevokeDeviceParams struct {
+	UserID    string
+	DeviceID  string
+	Reason    string
+	RevokedAt time.Time
+}
+
+type InstallationAuthorityState struct {
+	Bootstrapped           bool
+	AuthorityState         string
+	InstallID              string
+	ProductRootFingerprint string
+	BootstrappedOwnerEmail string
+	BootstrappedAt         *time.Time
+}
+
+type BootstrapOwnerParams struct {
+	Email                     string
+	PasswordHash              string
+	Role                      string
+	InstallID                 string
+	ProductRootKeyFingerprint string
+	ActivationPayload         json.RawMessage
+	ActivationSignature       string
+	GrantSource               string
+	ExpiresAt                 *time.Time
+	Now                       time.Time
+}
diff --git a/backend/internal/modules/auth/service.go b/backend/internal/modules/auth/service.go
new file mode 100644
index 0000000..7c1b1f7
--- /dev/null
+++ b/backend/internal/modules/auth/service.go
@@ -0,0 +1,481 @@
+package auth
+
+import (
+	"context"
+	"crypto/subtle"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/google/uuid"
+	"golang.org/x/crypto/bcrypt"
+
+	"github.com/example/remote-access-platform/backend/internal/platform/authority"
+	"github.com/example/remote-access-platform/backend/internal/platform/module"
+)
+
+// Service implements login, refresh-token rotation, session/device revocation
+// and first-owner installation bootstrap on top of a Store and a Transactor.
+type Service struct {
+	cfg          module.Config
+	store        Store
+	transactor   Transactor
+	tokenManager *TokenManager
+	authority    *authority.Verifier
+	now          func() time.Time
+}
+
+// NewService wires a Service from deps. The first entry of verifiers, when given,
+// is used as the authority verifier; otherwise one is built from
+// deps.Config.Installation, and a construction error leaves the verifier nil.
+func NewService(deps module.Dependencies, store Store, transactor Transactor, verifiers ...*authority.Verifier) *Service {
+	var authorityVerifier *authority.Verifier
+	if len(verifiers) > 0 {
+		authorityVerifier = verifiers[0]
+	} else if verifier, err := authority.NewVerifier(deps.Config.Installation); err == nil {
+		authorityVerifier = verifier
+	}
+	return &Service{
+		cfg:        deps.Config,
+		store:      store,
+		transactor: transactor,
+		tokenManager: NewTokenManager(TokenConfig{
+			Issuer:            deps.Config.Auth.Issuer,
+			AccessTokenSecret: deps.Config.Auth.AccessTokenSecret,
+			RefreshHashSecret: deps.Config.Auth.RefreshHashSecret,
+			AccessTokenTTL:    deps.Config.Auth.AccessTokenTTL,
+			RefreshTokenTTL:   deps.Config.Auth.RefreshTokenTTL,
+		}),
+		authority: authorityVerifier,
+		now:       time.Now,
+	}
+}
+
+// Login checks the password for cmd.Email, upserts the device, and opens a new
+// auth session, returning freshly issued access and refresh tokens.
+func (s *Service) Login(ctx context.Context, cmd LoginCommand) (*AuthResult, error) {
+	user, err := s.store.Users().GetByEmail(ctx, cmd.Email)
+	if err != nil {
+		return nil, err
+	}
+	if user == nil {
+		return nil, ErrInvalidCredentials
+	}
+	if err := bcrypt.CompareHashAndPassword([]byte(user.PasswordHash), []byte(cmd.Password)); err != nil {
+		return nil, ErrInvalidCredentials
+	}
+
+	var result AuthResult
+	now := s.now().UTC()
+
+	if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
+		device, err := store.Devices().Upsert(ctx, UpsertDeviceParams{
+			UserID:         user.ID,
+			Fingerprint:    cmd.DeviceFingerprint,
+			Label:          cmd.DeviceLabel,
+			TrustRequested: cmd.TrustDevice,
+			SeenAt:         now,
+		})
+		if err != nil {
+			return err
+		}
+		if device.TrustStatus == DeviceTrustStatusRevoked {
+			return ErrDeviceRevoked
+		}
+
+		authSessionID := uuid.NewString()
+		refreshToken, refreshHash, refreshExpiresAt, err := s.tokenManager.IssueRefreshToken(authSessionID, now)
+		if err != nil {
+			return err
+		}
+		accessToken, accessExpiresAt, err := s.tokenManager.IssueAccessToken(user.ID, authSessionID, device.ID, now)
+		if err != nil {
+			return err
+		}
+
+		session := AuthSession{
+			ID:               authSessionID,
+			UserID:           user.ID,
+			DeviceID:         device.ID,
+			RefreshTokenHash: refreshHash,
+			RefreshExpiresAt: refreshExpiresAt,
+			CreatedAt:        now,
+			UpdatedAt:        now,
+			LastSeenAt:       &now,
+		}
+		if err := store.AuthSessions().Create(ctx, session); err != nil {
+			return err
+		}
+
+		result = AuthResult{
+			User:        *user,
+			Device:      *device,
+			AuthSession: session,
+			Tokens: TokenPair{
+				AccessToken:           accessToken,
+				AccessTokenExpiresAt:  accessExpiresAt,
+				RefreshToken:          refreshToken,
+				RefreshTokenExpiresAt: refreshExpiresAt,
+			},
+		}
+		return nil
+	}); err != nil {
+		return nil, err
+	}
+
+	return &result, nil
+}
+
+// Refresh rotates the refresh token of an existing auth session and issues a new
+// token pair. An expired token, or one whose hash does not match the stored hash,
+// revokes the session and yields ErrInvalidRefreshToken.
+func (s *Service) Refresh(ctx context.Context, cmd RefreshCommand) (*AuthResult, error) {
+	authSessionID, err := s.tokenManager.ParseRefreshToken(cmd.RefreshToken)
+	if err != nil {
+		return nil, err
+	}
+
+	var result AuthResult
+	// rejection holds a refusal that must be reported only after the transaction
+	// callback has returned nil: returning an error from the callback would
+	// presumably roll the revocation back (NOTE(review): confirm WithTransaction).
+	var rejection error
+	now := s.now().UTC()
+
+	if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
+		session, err := store.AuthSessions().GetByIDForUpdate(ctx, authSessionID)
+		if err != nil {
+			return err
+		}
+		if session == nil {
+			return ErrInvalidRefreshToken
+		}
+		if session.RevokedAt != nil {
+			return ErrAuthSessionRevoked
+		}
+		if now.After(session.RefreshExpiresAt) {
+			if revokeErr := store.AuthSessions().Revoke(ctx, RevokeAuthSessionParams{
+				AuthSessionID: session.ID,
+				UserID:        session.UserID,
+				Reason:        "refresh_token_expired",
+				RevokedAt:     now,
+			}); revokeErr != nil {
+				return revokeErr
+			}
+			rejection = ErrInvalidRefreshToken
+			return nil
+		}
+
+		expectedHash := s.tokenManager.HashRefreshToken(cmd.RefreshToken)
+		// Constant-time comparison so the check does not leak how many leading bytes matched.
+		if subtle.ConstantTimeCompare([]byte(expectedHash), []byte(session.RefreshTokenHash)) != 1 {
+			if revokeErr := store.AuthSessions().Revoke(ctx, RevokeAuthSessionParams{
+				AuthSessionID: session.ID,
+				UserID:        session.UserID,
+				Reason:        "refresh_rotation_reuse_detected",
+				RevokedAt:     now,
+			}); revokeErr != nil {
+				return revokeErr
+			}
+			rejection = ErrInvalidRefreshToken
+			return nil
+		}
+
+		user, err := store.Users().GetByID(ctx, session.UserID)
+		if err != nil {
+			return err
+		}
+		if user == nil {
+			return ErrInvalidCredentials
+		}
+
+		device, err := store.Devices().GetByIDForUser(ctx, session.UserID, session.DeviceID)
+		if err != nil {
+			return err
+		}
+		if device == nil {
+			return ErrTrustedDeviceMissing
+		}
+		if device.TrustStatus == DeviceTrustStatusRevoked {
+			return ErrDeviceRevoked
+		}
+
+		refreshToken, refreshHash, refreshExpiresAt, err := s.tokenManager.IssueRefreshToken(session.ID, now)
+		if err != nil {
+			return err
+		}
+		accessToken, accessExpiresAt, err := s.tokenManager.IssueAccessToken(user.ID, session.ID, device.ID, now)
+		if err != nil {
+			return err
+		}
+
+		if err := store.AuthSessions().Rotate(ctx, RotateAuthSessionParams{
+			AuthSessionID:    session.ID,
+			RefreshTokenHash: refreshHash,
+			RefreshExpiresAt: refreshExpiresAt,
+			LastSeenAt:       now,
+			LastRotatedAt:    now,
+		}); err != nil {
+			return err
+		}
+
+		result = AuthResult{
+			User:   *user,
+			Device: *device,
+			AuthSession: AuthSession{
+				ID:               session.ID,
+				UserID:           session.UserID,
+				DeviceID:         session.DeviceID,
+				RefreshTokenHash: refreshHash,
+				RefreshExpiresAt: refreshExpiresAt,
+				LastSeenAt:       &now,
+				LastRotatedAt:    &now,
+				CreatedAt:        session.CreatedAt,
+				UpdatedAt:        now,
+			},
+			Tokens: TokenPair{
+				AccessToken:           accessToken,
+				AccessTokenExpiresAt:  accessExpiresAt,
+				RefreshToken:          refreshToken,
+				RefreshTokenExpiresAt: refreshExpiresAt,
+			},
+		}
+		return nil
+	}); err != nil {
+		return nil, err
+	}
+	if rejection != nil {
+		return nil, rejection
+	}
+
+	return &result, nil
+}
+
+// InstallationStatus reports the stored authority record merged with the verifier's flags.
+func (s *Service) InstallationStatus(ctx context.Context) (*InstallationStatus, error) {
+	record, err := s.store.Installation().GetStatus(ctx)
+	if err != nil {
+		return nil, err
+	}
+	return s.installationStatusFromRecord(record), nil
+}
+
+// BootstrapOwner creates the first owner of the installation. With a strict
+// authority verifier a signed activation naming the same owner email is required;
+// otherwise it proceeds only when the verifier allows insecure bootstrap.
+func (s *Service) BootstrapOwner(ctx context.Context, cmd BootstrapOwnerCommand) (*BootstrapOwnerResult, error) {
+	email := strings.ToLower(strings.TrimSpace(cmd.Email))
+	// Hash the password exactly as supplied: Login compares the raw request
+	// password, so a trimmed hash would never match one typed with outer spaces.
+	password := cmd.Password
+	if email == "" || !strings.Contains(email, "@") || len(strings.TrimSpace(password)) < 12 {
+		return nil, ErrInvalidBootstrapOwner
+	}
+
+	now := s.now().UTC()
+	role := authority.PlatformRoleAdmin
+	installID := ""
+	grantSource := "installation_activation"
+	rootFingerprint := ""
+	activationPayload := cmd.ActivationPayload
+	activationSignature := strings.TrimSpace(cmd.ActivationSignature)
+	var expiresAt *time.Time
+
+	if s.strictAuthority() {
+		if len(activationPayload) == 0 || activationSignature == "" {
+			return nil, ErrInstallationActivationRequired
+		}
+		activation, err := s.authority.VerifyActivation(activationPayload, activationSignature)
+		if err != nil {
+			return nil, fmt.Errorf("%w: %v", ErrInvalidInstallationActivation, err)
+		}
+		if !strings.EqualFold(activation.OwnerEmail, email) {
+			return nil, ErrInvalidInstallationActivation
+		}
+		role = activation.PlatformRole
+		installID = activation.InstallID
+		expiresAt = activation.ExpiresAt
+		rootFingerprint = s.authority.RootFingerprint()
+	} else {
+		if s.authority == nil || !s.authority.AllowInsecureBootstrap() {
+			return nil, ErrInsecureBootstrapDisabled
+		}
+		installID = uuid.NewString()
+		grantSource = "dev_insecure"
+		rootFingerprint = "dev-insecure"
+		devPayload, err := json.Marshal(authority.ActivationPayload{
+			SchemaVersion: authority.ActivationSchemaVersion,
+			InstallID:     installID,
+			OwnerEmail:    email,
+			PlatformRole:  role,
+			IssuedAt:      now,
+			Environment:   s.cfg.App.Env,
+		})
+		if err != nil {
+			return nil, err
+		}
+		activationPayload = json.RawMessage(devPayload)
+		activationSignature = "dev-insecure"
+	}
+
+	passwordHash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
+	if err != nil {
+		return nil, fmt.Errorf("hash bootstrap owner password: %w", err)
+	}
+
+	var user *User
+	if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
+		created, err := store.Installation().BootstrapOwner(ctx, BootstrapOwnerParams{
+			Email:                     email,
+			PasswordHash:              string(passwordHash),
+			Role:                      role,
+			InstallID:                 installID,
+			ProductRootKeyFingerprint: rootFingerprint,
+			ActivationPayload:         activationPayload,
+			ActivationSignature:       activationSignature,
+			GrantSource:               grantSource,
+			ExpiresAt:                 expiresAt,
+			Now:                       now,
+		})
+		if err != nil {
+			return err
+		}
+		user = created
+		return nil
+	}); err != nil {
+		return nil, err
+	}
+
+	status, err := s.InstallationStatus(ctx)
+	if err != nil {
+		return nil, err
+	}
+	return &BootstrapOwnerResult{
+		Installation: *status,
+		User:         *user,
+		PlatformRole: role,
+	}, nil
+}
+
+// RevokeAuthSession revokes the auth session named by cmd, returning
+// ErrAuthSessionNotFound when it is missing or belongs to another user.
+func (s *Service) RevokeAuthSession(ctx context.Context, cmd RevokeAuthSessionCommand) error {
+	return s.transactor.WithinTransaction(ctx, func(store Store) error {
+		session, err := store.AuthSessions().GetByIDForUpdate(ctx, cmd.AuthSessionID)
+		if err != nil {
+			return err
+		}
+		if session == nil || session.UserID != cmd.UserID {
+			return ErrAuthSessionNotFound
+		}
+		return store.AuthSessions().Revoke(ctx, RevokeAuthSessionParams{
+			AuthSessionID: cmd.AuthSessionID,
+			UserID:        cmd.UserID,
+			Reason:        cmd.Reason,
+			RevokedAt:     s.now().UTC(),
+		})
+	})
+}
+
+// RevokeTrustedDevice revokes a trusted device and, in the same transaction,
+// every not-yet-revoked auth session of that user on that device.
+func (s *Service) RevokeTrustedDevice(ctx context.Context, cmd RevokeDeviceCommand) error {
+	return s.transactor.WithinTransaction(ctx, func(store Store) error {
+		device, err := store.Devices().GetByIDForUser(ctx, cmd.UserID, cmd.DeviceID)
+		if err != nil {
+			return err
+		}
+		if device == nil {
+			return ErrTrustedDeviceMissing
+		}
+		if device.TrustStatus != DeviceTrustStatusTrusted {
+			return ErrDeviceNotTrusted
+		}
+
+		now := s.now().UTC()
+		if err := store.Devices().Revoke(ctx, RevokeDeviceParams{
+			UserID:    cmd.UserID,
+			DeviceID:  cmd.DeviceID,
+			Reason:    cmd.Reason,
+			RevokedAt: now,
+		}); err != nil {
+			return err
+		}
+		return store.AuthSessions().RevokeByDevice(ctx, cmd.UserID, cmd.DeviceID, "device_revoked:"+cmd.Reason, now)
+	})
+}
+
+// ListTrustedDevices returns the trusted devices of userID as reported by the store.
+func (s *Service) ListTrustedDevices(ctx context.Context, userID string) ([]Device, error) {
+	return s.store.Devices().ListTrustedByUser(ctx, userID)
+}
+
+// MapError translates a service error into an HTTP status code and a
+// client-facing message. A nil error maps to (0, "").
+func (s *Service) MapError(err error) (int, string) {
+	switch {
+	case err == nil:
+		return 0, ""
+	case errors.Is(err, ErrInvalidCredentials):
+		return 401, "invalid credentials"
+	case errors.Is(err, ErrInvalidRefreshToken):
+		return 401, "invalid refresh token"
+	case errors.Is(err, ErrAuthSessionRevoked):
+		return 401, "auth session revoked"
+	case errors.Is(err, ErrDeviceRevoked):
+		return 403, "device revoked"
+	case errors.Is(err, ErrDeviceNotTrusted):
+		return 409, "device is not trusted"
+	case errors.Is(err, ErrAuthSessionNotFound), errors.Is(err, ErrTrustedDeviceMissing):
+		return 404, err.Error()
+	case errors.Is(err, ErrInstallationActivationRequired), errors.Is(err, ErrInvalidInstallationActivation), errors.Is(err, ErrInvalidBootstrapOwner):
+		return 400, err.Error()
+	case errors.Is(err, ErrInsecureBootstrapDisabled):
+		return 403, err.Error()
+	case errors.Is(err, ErrInstallationAlreadyBootstrapped):
+		return 409, err.Error()
+	default:
+		// Do not echo err: unexpected errors (for example wrapped database
+		// errors) may carry internal details that must not reach API clients.
+		return 500, "internal error"
+	}
+}
+
+// installationStatusFromRecord merges the stored authority record with the
+// verifier's mode flags; a stored root fingerprint takes precedence.
+func (s *Service) installationStatusFromRecord(record *InstallationAuthorityState) *InstallationStatus {
+	if record == nil {
+		record = &InstallationAuthorityState{AuthorityState: "unbootstrapped"}
+	}
+	mode := authority.ModeLegacy
+	strict := false
+	rootFingerprint := ""
+	insecureAllowed := false
+	if s.authority != nil {
+		mode = s.authority.Mode()
+		strict = s.authority.Strict()
+		rootFingerprint = s.authority.RootFingerprint()
+		insecureAllowed = s.authority.AllowInsecureBootstrap()
+	}
+	if record.ProductRootFingerprint != "" {
+		rootFingerprint = record.ProductRootFingerprint
+	}
+	return &InstallationStatus{
+		Bootstrapped:             record.Bootstrapped,
+		AuthorityState:           record.AuthorityState,
+		InstallID:                record.InstallID,
+		BootstrappedOwnerEmail:   record.BootstrappedOwnerEmail,
+		BootstrappedAt:           record.BootstrappedAt,
+		AuthorityMode:            mode,
+		StrictAuthority:          strict,
+		RootFingerprint:          rootFingerprint,
+		InsecureBootstrapAllowed: insecureAllowed,
+	}
+}
+
+// strictAuthority reports whether a verifier is configured and in strict mode.
+func (s *Service) strictAuthority() bool {
+	return s.authority != nil && s.authority.Strict()
+}
diff --git a/backend/internal/modules/auth/token.go b/backend/internal/modules/auth/token.go
new file mode 100644
index 0000000..2bfa56c
--- /dev/null
+++ b/backend/internal/modules/auth/token.go
@@ -0,0 +1,95 @@
+package auth
+
+import (
+	"crypto/hmac"
+	"crypto/rand"
+	"crypto/sha256"
+	"encoding/base64"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/golang-jwt/jwt/v5"
+)
+
+type TokenManager struct {
+	issuer            string
+	accessSecret      []byte +
refreshHashSecret []byte + accessTTL time.Duration + refreshTTL time.Duration +} + +type AccessClaims struct { + AuthSessionID string `json:"sid"` + DeviceID string `json:"did"` + jwt.RegisteredClaims +} + +func NewTokenManager(cfg TokenConfig) *TokenManager { + return &TokenManager{ + issuer: cfg.Issuer, + accessSecret: []byte(cfg.AccessTokenSecret), + refreshHashSecret: []byte(cfg.RefreshHashSecret), + accessTTL: cfg.AccessTokenTTL, + refreshTTL: cfg.RefreshTokenTTL, + } +} + +type TokenConfig struct { + Issuer string + AccessTokenSecret string + RefreshHashSecret string + AccessTokenTTL time.Duration + RefreshTokenTTL time.Duration +} + +func (m *TokenManager) IssueAccessToken(userID, authSessionID, deviceID string, now time.Time) (string, time.Time, error) { + expiresAt := now.Add(m.accessTTL) + claims := AccessClaims{ + AuthSessionID: authSessionID, + DeviceID: deviceID, + RegisteredClaims: jwt.RegisteredClaims{ + Issuer: m.issuer, + Subject: userID, + ExpiresAt: jwt.NewNumericDate(expiresAt), + IssuedAt: jwt.NewNumericDate(now), + NotBefore: jwt.NewNumericDate(now), + }, + } + + token := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) + signed, err := token.SignedString(m.accessSecret) + if err != nil { + return "", time.Time{}, fmt.Errorf("sign access token: %w", err) + } + + return signed, expiresAt, nil +} + +func (m *TokenManager) IssueRefreshToken(authSessionID string, now time.Time) (raw string, hash string, expiresAt time.Time, err error) { + secret := make([]byte, 32) + if _, err = rand.Read(secret); err != nil { + return "", "", time.Time{}, fmt.Errorf("read random refresh secret: %w", err) + } + + encodedSecret := base64.RawURLEncoding.EncodeToString(secret) + raw = authSessionID + "." 
+ encodedSecret + hash = m.HashRefreshToken(raw) + expiresAt = now.Add(m.refreshTTL) + return raw, hash, expiresAt, nil +} + +func (m *TokenManager) HashRefreshToken(token string) string { + mac := hmac.New(sha256.New, m.refreshHashSecret) + _, _ = mac.Write([]byte(token)) + return base64.RawURLEncoding.EncodeToString(mac.Sum(nil)) +} + +func (m *TokenManager) ParseRefreshToken(token string) (string, error) { + sessionID, _, ok := strings.Cut(token, ".") + if !ok || sessionID == "" { + return "", ErrInvalidRefreshToken + } + return sessionID, nil +} diff --git a/backend/internal/modules/cluster/models.go b/backend/internal/modules/cluster/models.go new file mode 100644 index 0000000..f23c6c9 --- /dev/null +++ b/backend/internal/modules/cluster/models.go @@ -0,0 +1,1017 @@ +package cluster + +import ( + "encoding/json" + "time" + + "github.com/example/remote-access-platform/backend/internal/platform/clusterauth" +) + +const ( + PlatformRoleAdmin = "platform_admin" + PlatformRoleRecoveryAdmin = "platform_recovery_admin" + + ClusterStatusActive = "active" + ClusterStatusDisabled = "disabled" + + JoinRequestStatusPending = "pending" + JoinRequestStatusApproved = "approved" + JoinRequestStatusRejected = "rejected" + + NodeRegistrationPending = "pending" + NodeRegistrationActive = "active" + + DefaultClusterSlug = "default" + + VPNConnectionModeSingleActive = "single_active" + + VPNConnectionDesiredEnabled = "enabled" + VPNConnectionDesiredDisabled = "disabled" + + VPNConnectionStatusDisabled = "disabled" + VPNConnectionStatusEnabled = "enabled" + VPNConnectionStatusConnecting = "connecting" + VPNConnectionStatusActive = "active" + VPNConnectionStatusDegraded = "degraded" + VPNConnectionStatusFailed = "failed" + + VPNLeaseStatusActive = "active" + VPNLeaseStatusReleased = "released" + VPNLeaseStatusExpired = "expired" + VPNLeaseStatusFenced = "fenced" + + VPNAssignmentStatusNotStarted = "not_started" + VPNAssignmentStatusAssigned = "assigned" + 
VPNAssignmentStatusLeaseRequired = "lease_required" + VPNAssignmentStatusBlocked = "blocked" + VPNAssignmentStatusUnknown = "unknown" +) + +var allowedNodeRoles = map[string]struct{}{ + "entry-node": {}, + "relay-node": {}, + "core-mesh": {}, + "rdp-worker": {}, + "vnc-worker": {}, + "vpn-exit": {}, + "vpn-connector": {}, + "file-storage-cache": {}, + "update-cache": {}, + "video-relay": {}, +} + +type Cluster struct { + ID string `json:"id"` + Slug string `json:"slug"` + Name string `json:"name"` + Status string `json:"status"` + Region *string `json:"region,omitempty"` + Metadata json.RawMessage `json:"metadata"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type ClusterNode struct { + ID string `json:"id"` + OwnerOrganizationID *string `json:"owner_organization_id,omitempty"` + NodeKey string `json:"node_key"` + Name string `json:"name"` + OwnershipType string `json:"ownership_type"` + RegistrationStatus string `json:"registration_status"` + HealthStatus string `json:"health_status"` + VersionState string `json:"version_state"` + PartitionState string `json:"partition_state"` + ReportedVersion *string `json:"reported_version,omitempty"` + LastSeenAt *time.Time `json:"last_seen_at,omitempty"` + MembershipStatus string `json:"membership_status"` + MembershipMetadata json.RawMessage `json:"membership_metadata"` + NodeGroupID *string `json:"node_group_id,omitempty"` + NodeGroupName *string `json:"node_group_name,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type ClusterNodeGroup struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + ParentGroupID *string `json:"parent_group_id,omitempty"` + Name string `json:"name"` + Description *string `json:"description,omitempty"` + SortOrder int `json:"sort_order"` + Metadata json.RawMessage `json:"metadata"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + CreatedAt time.Time 
`json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type NodeJoinToken struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + Scope json.RawMessage `json:"scope"` + ExpiresAt time.Time `json:"expires_at"` + MaxUses int `json:"max_uses"` + UsedCount int `json:"used_count"` + Status string `json:"status"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + RevokedAt *time.Time `json:"revoked_at,omitempty"` + AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"` + AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"` +} + +type CreatedJoinToken struct { + NodeJoinToken + Token string `json:"token"` +} + +type NodeBootstrap struct { + NodeID string `json:"node_id"` + ClusterID string `json:"cluster_id"` + IdentityStatus string `json:"identity_status"` + Certificate map[string]any `json:"certificate"` + HeartbeatEndpoint string `json:"heartbeat_endpoint"` + ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"` + AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"` + AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"` +} + +type NodeJoinRequest struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + JoinTokenID *string `json:"join_token_id,omitempty"` + NodeName string `json:"node_name"` + NodeFingerprint string `json:"node_fingerprint"` + PublicKey string `json:"public_key"` + ReportedCapabilities json.RawMessage `json:"reported_capabilities"` + ReportedFacts json.RawMessage `json:"reported_facts"` + RequestedRoles json.RawMessage `json:"requested_roles"` + Status string `json:"status"` + ReviewedByUserID *string `json:"reviewed_by_user_id,omitempty"` + ReviewedAt *time.Time `json:"reviewed_at,omitempty"` + ApprovedNodeID *string `json:"approved_node_id,omitempty"` + RejectionReason *string `json:"rejection_reason,omitempty"` + CreatedAt time.Time 
`json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` + ApprovalPayload json.RawMessage `json:"approval_payload,omitempty"` + ApprovalSignature json.RawMessage `json:"approval_signature,omitempty"` +} + +type NodeRoleAssignment struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + OrganizationID *string `json:"organization_id,omitempty"` + Role string `json:"role"` + Status string `json:"status"` + Policy json.RawMessage `json:"policy"` + AssignedByUserID *string `json:"assigned_by_user_id,omitempty"` + AssignedAt time.Time `json:"assigned_at"` + RevokedAt *time.Time `json:"revoked_at,omitempty"` +} + +type NodeHeartbeat struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + HealthStatus string `json:"health_status"` + ReportedVersion *string `json:"reported_version,omitempty"` + Capabilities json.RawMessage `json:"capabilities"` + ServiceStates json.RawMessage `json:"service_states"` + Metadata json.RawMessage `json:"metadata"` + ObservedAt time.Time `json:"observed_at"` +} + +type NodeWorkloadDesiredState struct { + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + ServiceType string `json:"service_type"` + DesiredState string `json:"desired_state"` + Version *string `json:"version,omitempty"` + RuntimeMode string `json:"runtime_mode"` + ArtifactRef *string `json:"artifact_ref,omitempty"` + Config json.RawMessage `json:"config"` + Environment json.RawMessage `json:"environment"` + UpdatedByUserID *string `json:"updated_by_user_id,omitempty"` + UpdatedAt time.Time `json:"updated_at"` +} + +type NodeWorkloadStatus struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + ServiceType string `json:"service_type"` + ReportedState string `json:"reported_state"` + RuntimeMode string `json:"runtime_mode"` + Version *string `json:"version,omitempty"` + StatusPayload json.RawMessage 
`json:"status_payload"` + ObservedAt time.Time `json:"observed_at"` +} + +type MeshLinkObservation struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + SourceNodeID string `json:"source_node_id"` + TargetNodeID string `json:"target_node_id"` + LinkStatus string `json:"link_status"` + LatencyMs *int `json:"latency_ms,omitempty"` + QualityScore *int `json:"quality_score,omitempty"` + Metadata json.RawMessage `json:"metadata"` + ObservedAt time.Time `json:"observed_at"` +} + +type MeshRouteIntent struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + SourceSelector json.RawMessage `json:"source_selector"` + DestinationSelector json.RawMessage `json:"destination_selector"` + ServiceClass string `json:"service_class"` + Priority int `json:"priority"` + Status string `json:"status"` + Policy json.RawMessage `json:"policy"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type SyntheticMeshRouteConfig struct { + RouteID string `json:"route_id"` + ClusterID string `json:"cluster_id"` + SourceNodeID string `json:"source_node_id"` + DestinationNodeID string `json:"destination_node_id"` + Hops []string `json:"hops"` + AllowedChannels []string `json:"allowed_channels"` + ExpiresAt time.Time `json:"expires_at"` + MaxTTL int `json:"max_ttl"` + MaxHops int `json:"max_hops"` + RouteVersion string `json:"route_version,omitempty"` + PolicyVersion string `json:"policy_version,omitempty"` + PeerDirectoryVersion string `json:"peer_directory_version,omitempty"` +} + +type PeerEndpointCandidate struct { + EndpointID string `json:"endpoint_id"` + NodeID string `json:"node_id"` + Transport string `json:"transport"` + Address string `json:"address"` + AddressFamily string `json:"address_family,omitempty"` + Reachability string `json:"reachability"` + NATType string `json:"nat_type,omitempty"` + ConnectivityMode string `json:"connectivity_mode"` + 
Region string `json:"region,omitempty"` + Priority int `json:"priority"` + PolicyTags []string `json:"policy_tags,omitempty"` + LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +type PeerDirectoryEntry struct { + NodeID string `json:"node_id"` + RouteIDs []string `json:"route_ids,omitempty"` + EndpointCount int `json:"endpoint_count"` + CandidateCount int `json:"candidate_count"` + ConnectivityModes []string `json:"connectivity_modes,omitempty"` + RecoverySeed bool `json:"recovery_seed"` +} + +type PeerRecoverySeed struct { + NodeID string `json:"node_id"` + Endpoint string `json:"endpoint"` + Transport string `json:"transport"` + ConnectivityMode string `json:"connectivity_mode,omitempty"` + Region string `json:"region,omitempty"` + Priority int `json:"priority"` + LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +type PeerRendezvousLease struct { + LeaseID string `json:"lease_id"` + PeerNodeID string `json:"peer_node_id"` + RelayNodeID string `json:"relay_node_id"` + RelayEndpoint string `json:"relay_endpoint"` + Transport string `json:"transport"` + ConnectivityMode string `json:"connectivity_mode,omitempty"` + RouteIDs []string `json:"route_ids,omitempty"` + AllowedChannels []string `json:"allowed_channels,omitempty"` + Priority int `json:"priority"` + ControlPlaneOnly bool `json:"control_plane_only"` + IssuedAt time.Time `json:"issued_at"` + ExpiresAt time.Time `json:"expires_at"` + Reason string `json:"reason,omitempty"` + Metadata json.RawMessage `json:"metadata,omitempty"` +} + +type RendezvousRelayPolicyDecision struct { + RouteID string `json:"route_id,omitempty"` + PeerNodeID string `json:"peer_node_id"` + WithdrawnLeaseID string `json:"withdrawn_lease_id,omitempty"` + StaleRelayNodeID string `json:"stale_relay_node_id,omitempty"` + SelectedRelayID string `json:"selected_relay_id,omitempty"` + SelectedEndpoint 
string `json:"selected_endpoint,omitempty"` + Score int `json:"score,omitempty"` + Reason string `json:"reason"` + ScoreReasons []string `json:"score_reasons,omitempty"` + ReporterNodeID string `json:"reporter_node_id,omitempty"` +} + +type RendezvousRelayPolicyReport struct { + SchemaVersion string `json:"schema_version"` + ScoringMode string `json:"scoring_mode"` + FeedbackMaxAgeSeconds int `json:"feedback_max_age_seconds"` + StaleRelayCount int `json:"stale_relay_count"` + WithdrawnLeaseCount int `json:"withdrawn_lease_count"` + ReplacementLeaseCount int `json:"replacement_lease_count"` + Decisions []RendezvousRelayPolicyDecision `json:"decisions,omitempty"` +} + +type RoutePathDecision struct { + DecisionID string `json:"decision_id"` + RouteID string `json:"route_id"` + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + SourceNodeID string `json:"source_node_id"` + DestinationNodeID string `json:"destination_node_id"` + OriginalHops []string `json:"original_hops"` + EffectiveHops []string `json:"effective_hops"` + PreviousHopID string `json:"previous_hop_id,omitempty"` + NextHopID string `json:"next_hop_id,omitempty"` + LocalRole string `json:"local_role"` + SelectedRelayID string `json:"selected_relay_id,omitempty"` + SelectedRelayEndpoint string `json:"selected_relay_endpoint,omitempty"` + StaleRelayNodeID string `json:"stale_relay_node_id,omitempty"` + RendezvousPeerNodeID string `json:"rendezvous_peer_node_id,omitempty"` + RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"` + RendezvousLeaseReason string `json:"rendezvous_lease_reason,omitempty"` + DecisionSource string `json:"decision_source"` + Generation string `json:"generation"` + PathScore int `json:"path_score,omitempty"` + ScoreReasons []string `json:"score_reasons,omitempty"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` + ExpiresAt time.Time `json:"expires_at"` +} + +type 
RoutePathDecisionReport struct { + SchemaVersion string `json:"schema_version"` + DecisionMode string `json:"decision_mode"` + Generation string `json:"generation"` + DecisionCount int `json:"decision_count"` + ReplacementDecisionCount int `json:"replacement_decision_count"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` + Decisions []RoutePathDecision `json:"decisions,omitempty"` +} + +type NodeSyntheticMeshConfig struct { + Enabled bool `json:"enabled"` + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + AuthorityRequired bool `json:"authority_required"` + ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"` + AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"` + AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"` + ConfigVersion string `json:"config_version,omitempty"` + PeerDirectoryVersion string `json:"peer_directory_version,omitempty"` + PolicyVersion string `json:"policy_version,omitempty"` + PeerEndpoints map[string]string `json:"peer_endpoints"` + PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"` + PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"` + RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"` + RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"` + RendezvousRelayPolicy *RendezvousRelayPolicyReport `json:"rendezvous_relay_policy,omitempty"` + RoutePathDecisions *RoutePathDecisionReport `json:"route_path_decisions,omitempty"` + Routes []SyntheticMeshRouteConfig `json:"routes"` + ProductionForwarding bool `json:"production_forwarding"` +} + +type MeshQoSPolicy struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + ServiceClass string `json:"service_class"` + Priority int `json:"priority"` + ReliabilityMode string 
`json:"reliability_mode"` + DropPolicy string `json:"drop_policy"` + BandwidthPolicy json.RawMessage `json:"bandwidth_policy"` + Metadata json.RawMessage `json:"metadata"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type FabricEntryPoint struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + Name string `json:"name"` + Status string `json:"status"` + EndpointType string `json:"endpoint_type"` + PublicEndpoint *string `json:"public_endpoint,omitempty"` + Policy json.RawMessage `json:"policy"` + Metadata json.RawMessage `json:"metadata"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type FabricEntryPointNode struct { + EntryPointID string `json:"entry_point_id"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + Status string `json:"status"` + Priority int `json:"priority"` + Metadata json.RawMessage `json:"metadata"` + AddedByUserID *string `json:"added_by_user_id,omitempty"` + AddedAt time.Time `json:"added_at"` +} + +type FabricEgressPool struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + Name string `json:"name"` + Status string `json:"status"` + Description *string `json:"description,omitempty"` + RouteScope json.RawMessage `json:"route_scope"` + Policy json.RawMessage `json:"policy"` + Metadata json.RawMessage `json:"metadata"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type FabricEgressPoolNode struct { + EgressPoolID string `json:"egress_pool_id"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + Status string `json:"status"` + Priority int `json:"priority"` + Metadata json.RawMessage `json:"metadata"` + AddedByUserID *string `json:"added_by_user_id,omitempty"` + AddedAt time.Time `json:"added_at"` +} + +type 
ClusterAuthorityState struct { + ClusterID string `json:"cluster_id"` + AuthorityState string `json:"authority_state"` + MutationMode string `json:"mutation_mode"` + Term int64 `json:"term"` + Notes *string `json:"notes,omitempty"` + UpdatedByUserID *string `json:"updated_by_user_id,omitempty"` + UpdatedAt time.Time `json:"updated_at"` +} + +type ClusterSignature = clusterauth.Signature + +type ClusterAuthorityDescriptor struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + AuthorityState string `json:"authority_state"` + KeyAlgorithm string `json:"key_algorithm"` + PublicKey string `json:"public_key"` + PublicKeyFingerprint string `json:"public_key_fingerprint"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type ClusterAuthorityKey struct { + ClusterAuthorityDescriptor + PrivateKey string `json:"-"` +} + +type ClusterAdminSummary struct { + ClusterID string `json:"cluster_id"` + Slug string `json:"slug"` + Name string `json:"name"` + Status string `json:"status"` + Region *string `json:"region,omitempty"` + AuthorityState string `json:"authority_state"` + MutationMode string `json:"mutation_mode"` + ClusterKeyAlgorithm *string `json:"cluster_key_algorithm,omitempty"` + ClusterKeyFingerprint *string `json:"cluster_key_fingerprint,omitempty"` + NodeCount int64 `json:"node_count"` + HealthyNodeCount int64 `json:"healthy_node_count"` + PendingJoinCount int64 `json:"pending_join_count"` + ActiveRoleAssignmentCount int64 `json:"active_role_assignment_count"` + LastNodeSeenAt *time.Time `json:"last_node_seen_at,omitempty"` +} + +type ClusterAuditEvent struct { + ID string `json:"id"` + ClusterID *string `json:"cluster_id,omitempty"` + ActorUserID *string `json:"actor_user_id,omitempty"` + EventType string `json:"event_type"` + TargetType string `json:"target_type"` + TargetID *string `json:"target_id,omitempty"` + Payload json.RawMessage `json:"payload"` + CreatedAt time.Time 
`json:"created_at"` +} + +type FabricTestingFlag struct { + ID string `json:"id"` + ScopeType string `json:"scope_type"` + ScopeID *string `json:"scope_id,omitempty"` + ClusterID *string `json:"cluster_id,omitempty"` + Enabled bool `json:"enabled"` + TelemetryEnabled bool `json:"telemetry_enabled"` + SyntheticLinksEnabled bool `json:"synthetic_links_enabled"` + HistoryRetentionHours int `json:"history_retention_hours"` + Metadata json.RawMessage `json:"metadata"` + UpdatedByUserID *string `json:"updated_by_user_id,omitempty"` + UpdatedAt time.Time `json:"updated_at"` +} + +type EffectiveNodeTestingFlags struct { + Enabled bool `json:"enabled"` + TelemetryEnabled bool `json:"telemetry_enabled"` + SyntheticLinksEnabled bool `json:"synthetic_links_enabled"` + HistoryRetentionHours int `json:"history_retention_hours"` + AppliedScopes []string `json:"applied_scopes"` + Metadata json.RawMessage `json:"metadata"` +} + +type NodeTelemetryObservation struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + CPUPercent *float64 `json:"cpu_percent,omitempty"` + MemoryUsedBytes *int64 `json:"memory_used_bytes,omitempty"` + MemoryTotalBytes *int64 `json:"memory_total_bytes,omitempty"` + DiskUsedBytes *int64 `json:"disk_used_bytes,omitempty"` + DiskTotalBytes *int64 `json:"disk_total_bytes,omitempty"` + NetworkRxBytes *int64 `json:"network_rx_bytes,omitempty"` + NetworkTxBytes *int64 `json:"network_tx_bytes,omitempty"` + ProcessCount *int `json:"process_count,omitempty"` + Payload json.RawMessage `json:"payload"` + ObservedAt time.Time `json:"observed_at"` +} + +type VPNConnection struct { + ID string `json:"id"` + ClusterID string `json:"cluster_id"` + OrganizationID string `json:"organization_id"` + Name string `json:"name"` + TargetEndpoint json.RawMessage `json:"target_endpoint"` + ProtocolFamily string `json:"protocol_family"` + CredentialRef *string `json:"credential_ref,omitempty"` + Mode string `json:"mode"` + DesiredState 
string `json:"desired_state"` + AllowedNodePolicy json.RawMessage `json:"allowed_node_policy"` + RoutingUsage json.RawMessage `json:"routing_usage"` + RoutePolicy json.RawMessage `json:"route_policy"` + QoSPolicy json.RawMessage `json:"qos_policy"` + PlacementPolicy json.RawMessage `json:"placement_policy"` + Status string `json:"status"` + Metadata json.RawMessage `json:"metadata"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + UpdatedByUserID *string `json:"updated_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type VPNConnectionAllowedNode struct { + VPNConnectionID string `json:"vpn_connection_id"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + RolePreference string `json:"role_preference"` + Status string `json:"status"` + Metadata json.RawMessage `json:"metadata"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` +} + +type VPNConnectionRoutePolicy struct { + ID string `json:"id"` + VPNConnectionID string `json:"vpn_connection_id"` + ClusterID string `json:"cluster_id"` + OrganizationID string `json:"organization_id"` + RouteType string `json:"route_type"` + Destination string `json:"destination"` + Action string `json:"action"` + ServiceType *string `json:"service_type,omitempty"` + Priority int `json:"priority"` + Policy json.RawMessage `json:"policy"` + Status string `json:"status"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type VPNConnectionLease struct { + ID string `json:"id"` + VPNConnectionID string `json:"vpn_connection_id"` + ClusterID string `json:"cluster_id"` + OwnerNodeID string `json:"owner_node_id"` + LeaseGeneration int64 `json:"lease_generation"` + FencingToken string `json:"fencing_token"` + Status string `json:"status"` + AcquiredAt time.Time `json:"acquired_at"` + 
RenewedAt time.Time `json:"renewed_at"` + ExpiresAt time.Time `json:"expires_at"` + ReleasedAt *time.Time `json:"released_at,omitempty"` + FencedAt *time.Time `json:"fenced_at,omitempty"` + Metadata json.RawMessage `json:"metadata"` +} + +type VPNLeaseOwnerEligibility struct { + VPNConnectionID string `json:"vpn_connection_id"` + ClusterID string `json:"cluster_id"` + OrganizationID string `json:"organization_id"` + OwnerNodeID string `json:"owner_node_id"` + MembershipStatus string `json:"membership_status"` + NodeRegistrationStatus string `json:"node_registration_status"` + AllowedByPolicy bool `json:"allowed_by_policy"` + HasAuthorizedRole bool `json:"has_authorized_role"` +} + +type NodeVPNAssignmentLease struct { + LeaseID string `json:"lease_id"` + OwnerNodeID string `json:"owner_node_id"` + LeaseGeneration int64 `json:"lease_generation"` + Status string `json:"status"` + RenewedAt time.Time `json:"renewed_at"` + ExpiresAt time.Time `json:"expires_at"` +} + +type NodeVPNAssignment struct { + VPNConnectionID string `json:"vpn_connection_id"` + ClusterID string `json:"cluster_id"` + OrganizationID string `json:"organization_id"` + Name string `json:"name"` + TargetEndpoint json.RawMessage `json:"target_endpoint"` + ProtocolFamily string `json:"protocol_family"` + Mode string `json:"mode"` + DesiredState string `json:"desired_state"` + RoutingUsage json.RawMessage `json:"routing_usage"` + RoutePolicy json.RawMessage `json:"route_policy"` + QoSPolicy json.RawMessage `json:"qos_policy"` + PlacementPolicy json.RawMessage `json:"placement_policy"` + Status string `json:"status"` + HasCredentialRef bool `json:"has_credential_ref"` + AssignmentReason string `json:"assignment_reason"` + ActiveLease *NodeVPNAssignmentLease `json:"active_lease,omitempty"` + UpdatedAt time.Time `json:"updated_at"` +} + +type NodeVPNAssignmentStatus struct { + ID string `json:"id"` + VPNConnectionID string `json:"vpn_connection_id"` + ClusterID string `json:"cluster_id"` + NodeID string 
`json:"node_id"` + ObservedStatus string `json:"observed_status"` + StatusPayload json.RawMessage `json:"status_payload"` + ObservedAt time.Time `json:"observed_at"` +} + +type CreateClusterInput struct { + ActorUserID string + Slug string + Name string + Region *string + Metadata json.RawMessage +} + +type UpdateClusterInput struct { + ActorUserID string + ClusterID string + Name string + Status string + Region *string + Metadata json.RawMessage +} + +type CreateJoinTokenInput struct { + ActorUserID string + ClusterID string + Scope json.RawMessage + ExpiresAt time.Time + MaxUses int +} + +type CreateJoinRequestInput struct { + ClusterID string + JoinToken string + NodeName string + NodeFingerprint string + PublicKey string + ReportedCapabilities json.RawMessage + ReportedFacts json.RawMessage + RequestedRoles json.RawMessage +} + +type GetJoinRequestBootstrapInput struct { + ClusterID string + JoinRequestID string + NodeFingerprint string + PublicKey string +} + +type ApproveJoinRequestInput struct { + ActorUserID string + ClusterID string + JoinRequestID string + NodeKey string + OwnershipType string + OwnerOrganizationID *string +} + +type ApprovedJoinRequest struct { + JoinRequest NodeJoinRequest `json:"join_request"` + Bootstrap NodeBootstrap `json:"node_bootstrap"` +} + +type JoinRequestBootstrapResult struct { + Status string `json:"status"` + JoinRequest NodeJoinRequest `json:"join_request"` + Bootstrap *NodeBootstrap `json:"node_bootstrap,omitempty"` +} + +type RejectJoinRequestInput struct { + ActorUserID string + ClusterID string + JoinRequestID string + Reason string +} + +type AssignNodeRoleInput struct { + ActorUserID string + ClusterID string + NodeID string + OrganizationID *string + Role string + Status string + Policy json.RawMessage +} + +type AttachExistingNodeInput struct { + ActorUserID string + ClusterID string + NodeID string + Roles []string +} + +type CreateNodeGroupInput struct { + ActorUserID string + ClusterID string + ParentGroupID 
*string + Name string + Description *string + SortOrder int + Metadata json.RawMessage +} + +type AssignNodeGroupInput struct { + ActorUserID string + ClusterID string + NodeID string + GroupID *string +} + +type RevokeJoinTokenInput struct { + ActorUserID string + ClusterID string + TokenID string +} + +type RevokeNodeIdentityInput struct { + ActorUserID string + ClusterID string + NodeID string + Reason string +} + +type DisableMembershipInput struct { + ActorUserID string + ClusterID string + NodeID string + Reason string +} + +type RecordHeartbeatInput struct { + ClusterID string + NodeID string + HealthStatus string + ReportedVersion *string + Capabilities json.RawMessage + ServiceStates json.RawMessage + Metadata json.RawMessage +} + +type UpsertFabricTestingFlagInput struct { + ActorUserID string + ScopeType string + ScopeID *string + ClusterID *string + Enabled bool + TelemetryEnabled bool + SyntheticLinksEnabled bool + HistoryRetentionHours int + Metadata json.RawMessage +} + +type RecordNodeTelemetryInput struct { + ClusterID string + NodeID string + CPUPercent *float64 + MemoryUsedBytes *int64 + MemoryTotalBytes *int64 + DiskUsedBytes *int64 + DiskTotalBytes *int64 + NetworkRxBytes *int64 + NetworkTxBytes *int64 + ProcessCount *int + Payload json.RawMessage + ObservedAt time.Time +} + +type SetDesiredWorkloadInput struct { + ActorUserID string + ClusterID string + NodeID string + ServiceType string + DesiredState string + Version *string + RuntimeMode string + ArtifactRef *string + Config json.RawMessage + Environment json.RawMessage +} + +type ReportWorkloadStatusInput struct { + ClusterID string + NodeID string + ServiceType string + ReportedState string + RuntimeMode string + Version *string + StatusPayload json.RawMessage +} + +type ReportMeshLinkInput struct { + ClusterID string + SourceNodeID string + TargetNodeID string + LinkStatus string + LatencyMs *int + QualityScore *int + Metadata json.RawMessage +} + +type GetNodeSyntheticMeshConfigInput 
struct { + ClusterID string + NodeID string +} + +type CreateRouteIntentInput struct { + ActorUserID string + ClusterID string + SourceSelector json.RawMessage + DestinationSelector json.RawMessage + ServiceClass string + Priority int + Policy json.RawMessage +} + +type CreateFabricEntryPointInput struct { + ActorUserID string + ClusterID string + Name string + Status string + EndpointType string + PublicEndpoint *string + Policy json.RawMessage + Metadata json.RawMessage +} + +type SetFabricEntryPointNodeInput struct { + ActorUserID string + ClusterID string + EntryPointID string + NodeID string + Status string + Priority int + Metadata json.RawMessage +} + +type CreateFabricEgressPoolInput struct { + ActorUserID string + ClusterID string + Name string + Status string + Description *string + RouteScope json.RawMessage + Policy json.RawMessage + Metadata json.RawMessage +} + +type SetFabricEgressPoolNodeInput struct { + ActorUserID string + ClusterID string + EgressPoolID string + NodeID string + Status string + Priority int + Metadata json.RawMessage +} + +type UpdateClusterAuthorityInput struct { + ActorUserID string + ClusterID string + AuthorityState string + MutationMode string + Notes *string +} + +type CreateVPNConnectionInput struct { + ActorUserID string + ClusterID string + OrganizationID string + Name string + TargetEndpoint json.RawMessage + ProtocolFamily string + CredentialRef *string + Mode string + DesiredState string + AllowedNodePolicy json.RawMessage + RoutingUsage json.RawMessage + RoutePolicy json.RawMessage + QoSPolicy json.RawMessage + PlacementPolicy json.RawMessage + Metadata json.RawMessage +} + +type UpdateVPNConnectionDesiredStateInput struct { + ActorUserID string + ClusterID string + VPNConnectionID string + DesiredState string +} + +type UpsertVPNConnectionRoutePolicyInput struct { + ActorUserID string + ClusterID string + VPNConnectionID string + RouteType string + Destination string + Action string + ServiceType *string + Priority 
int + Policy json.RawMessage + Status string +} + +type SetVPNConnectionAllowedNodesInput struct { + ActorUserID string + ClusterID string + VPNConnectionID string + NodeIDs []string + RolePreference string + Metadata json.RawMessage +} + +type AcquireVPNConnectionLeaseInput struct { + ActorUserID string + ClusterID string + VPNConnectionID string + OwnerNodeID string + TTL time.Duration + Metadata json.RawMessage +} + +type RenewVPNConnectionLeaseInput struct { + ActorUserID string + ClusterID string + VPNConnectionID string + LeaseID string + OwnerNodeID string + FencingToken string + TTL time.Duration +} + +type ReleaseVPNConnectionLeaseInput struct { + ActorUserID string + ClusterID string + VPNConnectionID string + LeaseID string + OwnerNodeID string + FencingToken string +} + +type FenceVPNConnectionLeaseInput struct { + ActorUserID string + ClusterID string + VPNConnectionID string + LeaseID string + Reason string +} + +type ExpireStaleVPNConnectionLeasesInput struct { + ActorUserID string + ClusterID string +} + +type ReportNodeVPNAssignmentStatusInput struct { + ClusterID string + NodeID string + VPNConnectionID string + ObservedStatus string + StatusPayload json.RawMessage + ObservedAt time.Time +} diff --git a/backend/internal/modules/cluster/module.go b/backend/internal/modules/cluster/module.go new file mode 100644 index 0000000..619e3db --- /dev/null +++ b/backend/internal/modules/cluster/module.go @@ -0,0 +1,1268 @@ +package cluster + +import ( + "encoding/json" + "errors" + "net/http" + "strconv" + "time" + + "github.com/go-chi/chi/v5" + "github.com/jackc/pgx/v5" + + "github.com/example/remote-access-platform/backend/internal/platform/authority" + "github.com/example/remote-access-platform/backend/internal/platform/httpx" + "github.com/example/remote-access-platform/backend/internal/platform/module" + "github.com/example/remote-access-platform/backend/internal/platform/secrets" +) + +type Module struct { + service *Service +} + +func NewModule(deps 
module.Dependencies, verifiers ...*authority.Verifier) *Module { + store := NewPostgresStore(deps.Infra.DB, verifiers...) + if deps.Config.Secret.EncryptionKeyBase64 != "" { + if encryptor, err := secrets.NewEncryptor(deps.Config.Secret.EncryptionKeyBase64, deps.Config.Secret.EncryptionKeyID); err == nil { + store.WithClusterKeyEncryptor(encryptor) + } + } + return &Module{service: NewService(store)} +} + +func (m *Module) Name() string { + return "cluster" +} + +func (m *Module) RegisterRoutes(router chi.Router) { + router.Route("/clusters", func(r chi.Router) { + r.Get("/", m.listClusters) + r.Post("/", m.createCluster) + r.Get("/{clusterID}", m.getCluster) + r.Put("/{clusterID}", m.updateCluster) + r.Get("/{clusterID}/nodes", m.listClusterNodes) + r.Get("/{clusterID}/node-groups", m.listNodeGroups) + r.Post("/{clusterID}/node-groups", m.createNodeGroup) + r.Get("/{clusterID}/join-requests", m.listJoinRequests) + r.Post("/{clusterID}/join-requests", m.createJoinRequest) + r.Post("/{clusterID}/join-requests/{requestID}/approve", m.approveJoinRequest) + r.Post("/{clusterID}/join-requests/{requestID}/reject", m.rejectJoinRequest) + r.Post("/{clusterID}/join-tokens", m.createJoinToken) + r.Post("/{clusterID}/join-tokens/{tokenID}/revoke", m.revokeJoinToken) + r.Get("/{clusterID}/nodes/{nodeID}/roles", m.listNodeRoles) + r.Post("/{clusterID}/nodes/{nodeID}/roles", m.assignNodeRole) + r.Post("/{clusterID}/nodes/{nodeID}/heartbeats", m.recordHeartbeat) + r.Get("/{clusterID}/nodes/{nodeID}/heartbeats", m.listNodeHeartbeats) + r.Get("/{clusterID}/nodes/{nodeID}/testing-flags", m.getEffectiveNodeTestingFlags) + r.Get("/{clusterID}/nodes/{nodeID}/mesh/synthetic-config", m.getNodeSyntheticMeshConfig) + r.Post("/{clusterID}/nodes/{nodeID}/telemetry", m.recordNodeTelemetry) + r.Get("/{clusterID}/nodes/{nodeID}/telemetry", m.listNodeTelemetry) + r.Post("/{clusterID}/nodes/{nodeID}/membership/attach", m.attachExistingNodeToCluster) + r.Put("/{clusterID}/nodes/{nodeID}/group", 
m.assignNodeGroup) + r.Post("/{clusterID}/nodes/{nodeID}/identity/revoke", m.revokeNodeIdentity) + r.Post("/{clusterID}/nodes/{nodeID}/membership/disable", m.disableMembership) + r.Get("/{clusterID}/nodes/{nodeID}/workloads/desired", m.listDesiredWorkloads) + r.Put("/{clusterID}/nodes/{nodeID}/workloads/{serviceType}/desired", m.setDesiredWorkload) + r.Post("/{clusterID}/nodes/{nodeID}/workloads/{serviceType}/status", m.reportWorkloadStatus) + r.Get("/{clusterID}/nodes/{nodeID}/workloads/status", m.listWorkloadStatuses) + r.Get("/{clusterID}/mesh/links", m.listMeshLinks) + r.Post("/{clusterID}/mesh/links", m.reportMeshLink) + r.Get("/{clusterID}/mesh/route-intents", m.listRouteIntents) + r.Post("/{clusterID}/mesh/route-intents", m.createRouteIntent) + r.Get("/{clusterID}/mesh/qos-policies", m.listQoSPolicies) + r.Get("/{clusterID}/fabric/entry-points", m.listFabricEntryPoints) + r.Post("/{clusterID}/fabric/entry-points", m.createFabricEntryPoint) + r.Get("/{clusterID}/fabric/entry-points/{entryPointID}/nodes", m.listFabricEntryPointNodes) + r.Put("/{clusterID}/fabric/entry-points/{entryPointID}/nodes/{nodeID}", m.setFabricEntryPointNode) + r.Get("/{clusterID}/fabric/egress-pools", m.listFabricEgressPools) + r.Post("/{clusterID}/fabric/egress-pools", m.createFabricEgressPool) + r.Get("/{clusterID}/fabric/egress-pools/{egressPoolID}/nodes", m.listFabricEgressPoolNodes) + r.Put("/{clusterID}/fabric/egress-pools/{egressPoolID}/nodes/{nodeID}", m.setFabricEgressPoolNode) + r.Post("/{clusterID}/vpn-connection-leases/expire-stale", m.expireStaleVPNConnectionLeases) + r.Get("/{clusterID}/vpn-connections", m.listVPNConnections) + r.Post("/{clusterID}/vpn-connections", m.createVPNConnection) + r.Get("/{clusterID}/vpn-connections/{vpnConnectionID}", m.getVPNConnection) + r.Put("/{clusterID}/vpn-connections/{vpnConnectionID}/desired-state", m.updateVPNConnectionDesiredState) + r.Get("/{clusterID}/vpn-connections/{vpnConnectionID}/allowed-nodes", 
m.listVPNConnectionAllowedNodes) + r.Put("/{clusterID}/vpn-connections/{vpnConnectionID}/allowed-nodes", m.setVPNConnectionAllowedNodes) + r.Get("/{clusterID}/vpn-connections/{vpnConnectionID}/route-policies", m.listVPNConnectionRoutePolicies) + r.Post("/{clusterID}/vpn-connections/{vpnConnectionID}/route-policies", m.upsertVPNConnectionRoutePolicy) + r.Get("/{clusterID}/vpn-connections/{vpnConnectionID}/leases/active", m.getActiveVPNConnectionLease) + r.Post("/{clusterID}/vpn-connections/{vpnConnectionID}/leases/acquire", m.acquireVPNConnectionLease) + r.Post("/{clusterID}/vpn-connections/{vpnConnectionID}/leases/{leaseID}/renew", m.renewVPNConnectionLease) + r.Post("/{clusterID}/vpn-connections/{vpnConnectionID}/leases/{leaseID}/release", m.releaseVPNConnectionLease) + r.Post("/{clusterID}/vpn-connections/{vpnConnectionID}/leases/{leaseID}/fence", m.fenceVPNConnectionLease) + r.Get("/{clusterID}/authority", m.getClusterAuthority) + r.Put("/{clusterID}/authority", m.updateClusterAuthority) + r.Get("/{clusterID}/audit", m.listAuditEvents) + }) + router.Get("/cluster-admin-summaries", m.listClusterAdminSummaries) + router.Get("/fabric/testing-flags", m.listFabricTestingFlags) + router.Put("/fabric/testing-flags", m.upsertFabricTestingFlag) +} + +func (m *Module) listClusters(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListClusters(r.Context(), r.URL.Query().Get("actor_user_id")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"clusters": items}) +} + +func (m *Module) getCluster(w http.ResponseWriter, r *http.Request) { + item, err := m.service.GetCluster(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"cluster": item}) +} + +func (m *Module) createCluster(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + 
Slug string `json:"slug"` + Name string `json:"name"` + Region *string `json:"region"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid cluster payload") + return + } + item, err := m.service.CreateCluster(r.Context(), CreateClusterInput{ + ActorUserID: payload.ActorUserID, + Slug: payload.Slug, + Name: payload.Name, + Region: payload.Region, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"cluster": item}) +} + +func (m *Module) updateCluster(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Name string `json:"name"` + Status string `json:"status"` + Region *string `json:"region"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid cluster payload") + return + } + item, err := m.service.UpdateCluster(r.Context(), UpdateClusterInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + Name: payload.Name, + Status: payload.Status, + Region: payload.Region, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"cluster": item}) +} + +func (m *Module) listClusterNodes(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListClusterNodes(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"nodes": items}) +} + +func (m *Module) listNodeGroups(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListNodeGroups(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } 
+ httpx.WriteJSON(w, http.StatusOK, map[string]any{"node_groups": items}) +}
+
+// createNodeGroup decodes the JSON body and creates a node group in the
+// {clusterID} cluster, answering 201 with {"node_group": ...}. An undecodable
+// body yields 400 "invalid node group payload".
+func (m *Module) createNodeGroup(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		ActorUserID   string          `json:"actor_user_id"`
+		ParentGroupID *string         `json:"parent_group_id"`
+		Name          string          `json:"name"`
+		Description   *string         `json:"description"`
+		SortOrder     int             `json:"sort_order"`
+		Metadata      json.RawMessage `json:"metadata"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid node group payload")
+		return
+	}
+	input := CreateNodeGroupInput{
+		ActorUserID:   body.ActorUserID,
+		ClusterID:     chi.URLParam(r, "clusterID"),
+		ParentGroupID: body.ParentGroupID,
+		Name:          body.Name,
+		Description:   body.Description,
+		SortOrder:     body.SortOrder,
+		Metadata:      body.Metadata,
+	}
+	group, err := m.service.CreateNodeGroup(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusCreated, map[string]any{"node_group": group})
+}
+
+// createJoinToken decodes the JSON body and issues a join token for the
+// {clusterID} cluster, answering 201 with {"join_token": ...}. When
+// expires_at is omitted the zero time.Time is handed to the service.
+func (m *Module) createJoinToken(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		ActorUserID string          `json:"actor_user_id"`
+		Scope       json.RawMessage `json:"scope"`
+		ExpiresAt   *time.Time      `json:"expires_at"`
+		MaxUses     int             `json:"max_uses"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid join token payload")
+		return
+	}
+	var expiry time.Time
+	if body.ExpiresAt != nil {
+		expiry = *body.ExpiresAt
+	}
+	input := CreateJoinTokenInput{
+		ActorUserID: body.ActorUserID,
+		ClusterID:   chi.URLParam(r, "clusterID"),
+		Scope:       body.Scope,
+		ExpiresAt:   expiry,
+		MaxUses:     body.MaxUses,
+	}
+	token, err := m.service.CreateJoinToken(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusCreated, map[string]any{"join_token": token})
+}
+
+// createJoinRequest decodes the JSON body and files a join request against
+// the {clusterID} cluster, answering 201 with {"join_request": ...}. The
+// payload carries no actor_user_id; it carries a join_token instead.
+func (m *Module) createJoinRequest(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		JoinToken            string          `json:"join_token"`
+		NodeName             string          `json:"node_name"`
+		NodeFingerprint      string          `json:"node_fingerprint"`
+		PublicKey            string          `json:"public_key"`
+		ReportedCapabilities json.RawMessage `json:"reported_capabilities"`
+		ReportedFacts        json.RawMessage `json:"reported_facts"`
+		RequestedRoles       json.RawMessage `json:"requested_roles"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid join request payload")
+		return
+	}
+	input := CreateJoinRequestInput{
+		ClusterID:            chi.URLParam(r, "clusterID"),
+		JoinToken:            body.JoinToken,
+		NodeName:             body.NodeName,
+		NodeFingerprint:      body.NodeFingerprint,
+		PublicKey:            body.PublicKey,
+		ReportedCapabilities: body.ReportedCapabilities,
+		ReportedFacts:        body.ReportedFacts,
+		RequestedRoles:       body.RequestedRoles,
+	}
+	request, err := m.service.CreateJoinRequest(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusCreated, map[string]any{"join_request": request})
+}
+
+// listJoinRequests answers 200 with {"join_requests": ...} for the
+// {clusterID} cluster on behalf of the actor_user_id query parameter.
+func (m *Module) listJoinRequests(w http.ResponseWriter, r *http.Request) {
+	actorID := r.URL.Query().Get("actor_user_id")
+	clusterID := chi.URLParam(r, "clusterID")
+	requests, err := m.service.ListJoinRequests(r.Context(), actorID, clusterID)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"join_requests": requests})
+}
+
+func (m *Module) approveJoinRequest(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + NodeKey string `json:"node_key"` + OwnershipType string `json:"ownership_type"` + OwnerOrganizationID *string `json:"owner_organization_id"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid join request approval payload") + return + } + item, err := m.service.ApproveJoinRequest(r.Context(), ApproveJoinRequestInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), +
JoinRequestID: chi.URLParam(r, "requestID"), + NodeKey: payload.NodeKey, + OwnershipType: payload.OwnershipType, + OwnerOrganizationID: payload.OwnerOrganizationID, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, item) +}
+
+// rejectJoinRequest decodes the JSON body and rejects the {requestID} join
+// request of the {clusterID} cluster, answering 200 with {"join_request": ...}.
+func (m *Module) rejectJoinRequest(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		ActorUserID string `json:"actor_user_id"`
+		Reason      string `json:"reason"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid join request rejection payload")
+		return
+	}
+	input := RejectJoinRequestInput{
+		ActorUserID:   body.ActorUserID,
+		ClusterID:     chi.URLParam(r, "clusterID"),
+		JoinRequestID: chi.URLParam(r, "requestID"),
+		Reason:        body.Reason,
+	}
+	rejected, err := m.service.RejectJoinRequest(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"join_request": rejected})
+}
+
+// revokeJoinToken decodes the JSON body (only actor_user_id is read) and
+// revokes the {tokenID} join token, answering 200 with {"join_token": ...}.
+func (m *Module) revokeJoinToken(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		ActorUserID string `json:"actor_user_id"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid join token revoke payload")
+		return
+	}
+	input := RevokeJoinTokenInput{
+		ActorUserID: body.ActorUserID,
+		ClusterID:   chi.URLParam(r, "clusterID"),
+		TokenID:     chi.URLParam(r, "tokenID"),
+	}
+	revoked, err := m.service.RevokeJoinToken(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"join_token": revoked})
+}
+
+// assignNodeRole decodes the JSON body and assigns a role to the {nodeID}
+// node of the {clusterID} cluster, answering 201 with {"role_assignment": ...}.
+func (m *Module) assignNodeRole(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		ActorUserID    string          `json:"actor_user_id"`
+		OrganizationID *string         `json:"organization_id"`
+		Role           string          `json:"role"`
+		Status         string          `json:"status"`
+		Policy         json.RawMessage `json:"policy"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid node role payload")
+		return
+	}
+	input := AssignNodeRoleInput{
+		ActorUserID:    body.ActorUserID,
+		ClusterID:      chi.URLParam(r, "clusterID"),
+		NodeID:         chi.URLParam(r, "nodeID"),
+		OrganizationID: body.OrganizationID,
+		Role:           body.Role,
+		Status:         body.Status,
+		Policy:         body.Policy,
+	}
+	assignment, err := m.service.AssignNodeRole(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusCreated, map[string]any{"role_assignment": assignment})
+}
+
+// listNodeRoles answers 200 with {"role_assignments": ...} for the {nodeID}
+// node of the {clusterID} cluster.
+func (m *Module) listNodeRoles(w http.ResponseWriter, r *http.Request) {
+	actorID := r.URL.Query().Get("actor_user_id")
+	clusterID := chi.URLParam(r, "clusterID")
+	nodeID := chi.URLParam(r, "nodeID")
+	assignments, err := m.service.ListNodeRoleAssignments(r.Context(), actorID, clusterID, nodeID)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"role_assignments": assignments})
+}
+
+func (m *Module) recordHeartbeat(w http.ResponseWriter, r *http.Request) { + var payload struct { + HealthStatus string `json:"health_status"` + ReportedVersion *string `json:"reported_version"` + Capabilities json.RawMessage `json:"capabilities"` + ServiceStates json.RawMessage `json:"service_states"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid node heartbeat payload") + return + } + item, err := m.service.RecordHeartbeat(r.Context(), RecordHeartbeatInput{ + ClusterID: chi.URLParam(r, "clusterID"), + NodeID: chi.URLParam(r, "nodeID"), + HealthStatus: payload.HealthStatus, + ReportedVersion: payload.ReportedVersion, + Capabilities: payload.Capabilities, + ServiceStates: payload.ServiceStates, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + flags, _ := m.service.GetEffectiveNodeTestingFlags(r.Context(), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID")) + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"heartbeat": item, "testing_flags":
flags}) +}
+
+// listNodeHeartbeats answers 200 with {"heartbeats": ...} for the {nodeID}
+// node of the {clusterID} cluster. The optional limit query parameter is
+// passed to the service as an int; when absent, 0 is passed. A limit that is
+// present but not a valid integer is rejected with 400 instead of being
+// silently treated as 0.
+func (m *Module) listNodeHeartbeats(w http.ResponseWriter, r *http.Request) {
+	query := r.URL.Query()
+	limit := 0
+	if raw := query.Get("limit"); raw != "" {
+		parsed, parseErr := strconv.Atoi(raw)
+		if parseErr != nil {
+			httpx.WriteError(w, http.StatusBadRequest, "invalid limit query parameter")
+			return
+		}
+		limit = parsed
+	}
+	heartbeats, err := m.service.ListNodeHeartbeats(r.Context(), query.Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID"), limit)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"heartbeats": heartbeats})
+}
+
+// getEffectiveNodeTestingFlags answers 200 with {"testing_flags": ...} for
+// the {nodeID} node of the {clusterID} cluster. No actor_user_id is read.
+func (m *Module) getEffectiveNodeTestingFlags(w http.ResponseWriter, r *http.Request) {
+	clusterID := chi.URLParam(r, "clusterID")
+	nodeID := chi.URLParam(r, "nodeID")
+	flags, err := m.service.GetEffectiveNodeTestingFlags(r.Context(), clusterID, nodeID)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"testing_flags": flags})
+}
+
+// getNodeSyntheticMeshConfig answers 200 with {"synthetic_mesh_config": ...}
+// for the {nodeID} node of the {clusterID} cluster.
+func (m *Module) getNodeSyntheticMeshConfig(w http.ResponseWriter, r *http.Request) {
+	input := GetNodeSyntheticMeshConfigInput{
+		ClusterID: chi.URLParam(r, "clusterID"),
+		NodeID:    chi.URLParam(r, "nodeID"),
+	}
+	config, err := m.service.GetNodeSyntheticMeshConfig(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"synthetic_mesh_config": config})
+}
+
+// recordNodeTelemetry decodes a telemetry sample from the JSON body and
+// records it for the {nodeID} node, answering 202 with {"telemetry": ...}.
+// When observed_at is omitted the zero time.Time is handed to the service.
+func (m *Module) recordNodeTelemetry(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		CPUPercent       *float64        `json:"cpu_percent"`
+		MemoryUsedBytes  *int64          `json:"memory_used_bytes"`
+		MemoryTotalBytes *int64          `json:"memory_total_bytes"`
+		DiskUsedBytes    *int64          `json:"disk_used_bytes"`
+		DiskTotalBytes   *int64          `json:"disk_total_bytes"`
+		NetworkRxBytes   *int64          `json:"network_rx_bytes"`
+		NetworkTxBytes   *int64          `json:"network_tx_bytes"`
+		ProcessCount     *int            `json:"process_count"`
+		Payload          json.RawMessage `json:"payload"`
+		ObservedAt       *time.Time      `json:"observed_at"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid node telemetry payload")
+		return
+	}
+	var observed time.Time
+	if body.ObservedAt != nil {
+		observed = *body.ObservedAt
+	}
+	input := RecordNodeTelemetryInput{
+		ClusterID:        chi.URLParam(r, "clusterID"),
+		NodeID:           chi.URLParam(r, "nodeID"),
+		CPUPercent:       body.CPUPercent,
+		MemoryUsedBytes:  body.MemoryUsedBytes,
+		MemoryTotalBytes: body.MemoryTotalBytes,
+		DiskUsedBytes:    body.DiskUsedBytes,
+		DiskTotalBytes:   body.DiskTotalBytes,
+		NetworkRxBytes:   body.NetworkRxBytes,
+		NetworkTxBytes:   body.NetworkTxBytes,
+		ProcessCount:     body.ProcessCount,
+		Payload:          body.Payload,
+		ObservedAt:       observed,
+	}
+	sample, err := m.service.RecordNodeTelemetry(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"telemetry": sample})
+}
+
+// listNodeTelemetry answers 200 with {"telemetry": ...} for the {nodeID}
+// node of the {clusterID} cluster. The optional limit query parameter is
+// passed to the service as an int; when absent, 0 is passed. A limit that is
+// present but not a valid integer is rejected with 400 instead of being
+// silently treated as 0.
+func (m *Module) listNodeTelemetry(w http.ResponseWriter, r *http.Request) {
+	query := r.URL.Query()
+	limit := 0
+	if raw := query.Get("limit"); raw != "" {
+		parsed, parseErr := strconv.Atoi(raw)
+		if parseErr != nil {
+			httpx.WriteError(w, http.StatusBadRequest, "invalid limit query parameter")
+			return
+		}
+		limit = parsed
+	}
+	samples, err := m.service.ListNodeTelemetry(r.Context(), query.Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID"), limit)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"telemetry": samples})
+}
+
+// attachExistingNodeToCluster decodes the JSON body and attaches the {nodeID}
+// node to the {clusterID} cluster with the requested roles, answering 200
+// with {"node": ...}.
+func (m *Module) attachExistingNodeToCluster(w http.ResponseWriter, r *http.Request) {
+	var body struct {
+		ActorUserID string   `json:"actor_user_id"`
+		Roles       []string `json:"roles"`
+	}
+	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
+		httpx.WriteError(w, http.StatusBadRequest, "invalid membership attach payload")
+		return
+	}
+	input := AttachExistingNodeInput{
+		ActorUserID: body.ActorUserID,
+		ClusterID:   chi.URLParam(r, "clusterID"),
+		NodeID:      chi.URLParam(r, "nodeID"),
+		Roles:       body.Roles,
+	}
+	node, err := m.service.AttachExistingNodeToCluster(r.Context(), input)
+	if writeServiceError(w, err) {
+		return
+	}
+	httpx.WriteJSON(w, http.StatusOK, map[string]any{"node": node})
+}
+
+func (m *Module) assignNodeGroup(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + GroupID *string
`json:"group_id"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid node group assignment payload") + return + } + item, err := m.service.AssignNodeToGroup(r.Context(), AssignNodeGroupInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + NodeID: chi.URLParam(r, "nodeID"), + GroupID: payload.GroupID, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"node": item}) +} + +func (m *Module) revokeNodeIdentity(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Reason string `json:"reason"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid node identity revoke payload") + return + } + err := m.service.RevokeNodeIdentity(r.Context(), RevokeNodeIdentityInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + NodeID: chi.URLParam(r, "nodeID"), + Reason: payload.Reason, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"}) +} + +func (m *Module) disableMembership(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Reason string `json:"reason"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid membership disable payload") + return + } + err := m.service.DisableClusterMembership(r.Context(), DisableMembershipInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + NodeID: chi.URLParam(r, "nodeID"), + Reason: payload.Reason, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"}) +} + +func (m *Module) setDesiredWorkload(w http.ResponseWriter, r 
*http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + DesiredState string `json:"desired_state"` + Version *string `json:"version"` + RuntimeMode string `json:"runtime_mode"` + ArtifactRef *string `json:"artifact_ref"` + Config json.RawMessage `json:"config"` + Environment json.RawMessage `json:"environment"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid desired workload payload") + return + } + item, err := m.service.SetDesiredWorkload(r.Context(), SetDesiredWorkloadInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + NodeID: chi.URLParam(r, "nodeID"), + ServiceType: chi.URLParam(r, "serviceType"), + DesiredState: payload.DesiredState, + Version: payload.Version, + RuntimeMode: payload.RuntimeMode, + ArtifactRef: payload.ArtifactRef, + Config: payload.Config, + Environment: payload.Environment, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"desired_workload": item}) +} + +func (m *Module) listDesiredWorkloads(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListDesiredWorkloads(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"desired_workloads": items}) +} + +func (m *Module) reportWorkloadStatus(w http.ResponseWriter, r *http.Request) { + var payload struct { + ReportedState string `json:"reported_state"` + RuntimeMode string `json:"runtime_mode"` + Version *string `json:"version"` + StatusPayload json.RawMessage `json:"status_payload"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid workload status payload") + return + } + item, err := m.service.ReportWorkloadStatus(r.Context(), ReportWorkloadStatusInput{ + ClusterID: 
chi.URLParam(r, "clusterID"), + NodeID: chi.URLParam(r, "nodeID"), + ServiceType: chi.URLParam(r, "serviceType"), + ReportedState: payload.ReportedState, + RuntimeMode: payload.RuntimeMode, + Version: payload.Version, + StatusPayload: payload.StatusPayload, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"workload_status": item}) +} + +func (m *Module) listWorkloadStatuses(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListLatestWorkloadStatuses(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"workload_statuses": items}) +} + +func (m *Module) reportMeshLink(w http.ResponseWriter, r *http.Request) { + var payload struct { + SourceNodeID string `json:"source_node_id"` + TargetNodeID string `json:"target_node_id"` + LinkStatus string `json:"link_status"` + LatencyMs *int `json:"latency_ms"` + QualityScore *int `json:"quality_score"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid mesh link payload") + return + } + item, err := m.service.ReportMeshLink(r.Context(), ReportMeshLinkInput{ + ClusterID: chi.URLParam(r, "clusterID"), + SourceNodeID: payload.SourceNodeID, + TargetNodeID: payload.TargetNodeID, + LinkStatus: payload.LinkStatus, + LatencyMs: payload.LatencyMs, + QualityScore: payload.QualityScore, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"mesh_link": item}) +} + +func (m *Module) listMeshLinks(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListMeshLinks(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + 
httpx.WriteJSON(w, http.StatusOK, map[string]any{"mesh_links": items}) +} + +func (m *Module) createRouteIntent(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + SourceSelector json.RawMessage `json:"source_selector"` + DestinationSelector json.RawMessage `json:"destination_selector"` + ServiceClass string `json:"service_class"` + Priority int `json:"priority"` + Policy json.RawMessage `json:"policy"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid route intent payload") + return + } + item, err := m.service.CreateRouteIntent(r.Context(), CreateRouteIntentInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + SourceSelector: payload.SourceSelector, + DestinationSelector: payload.DestinationSelector, + ServiceClass: payload.ServiceClass, + Priority: payload.Priority, + Policy: payload.Policy, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"route_intent": item}) +} + +func (m *Module) listRouteIntents(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListRouteIntents(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"route_intents": items}) +} + +func (m *Module) listQoSPolicies(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListQoSPolicies(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"qos_policies": items}) +} + +func (m *Module) listFabricEntryPoints(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListFabricEntryPoints(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { 
+ return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"entry_points": items}) +} + +func (m *Module) createFabricEntryPoint(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Name string `json:"name"` + Status string `json:"status"` + EndpointType string `json:"endpoint_type"` + PublicEndpoint *string `json:"public_endpoint"` + Policy json.RawMessage `json:"policy"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid fabric entry point payload") + return + } + item, err := m.service.CreateFabricEntryPoint(r.Context(), CreateFabricEntryPointInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + Name: payload.Name, + Status: payload.Status, + EndpointType: payload.EndpointType, + PublicEndpoint: payload.PublicEndpoint, + Policy: payload.Policy, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"entry_point": item}) +} + +func (m *Module) setFabricEntryPointNode(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Status string `json:"status"` + Priority int `json:"priority"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid fabric entry point node payload") + return + } + item, err := m.service.SetFabricEntryPointNode(r.Context(), SetFabricEntryPointNodeInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + EntryPointID: chi.URLParam(r, "entryPointID"), + NodeID: chi.URLParam(r, "nodeID"), + Status: payload.Status, + Priority: payload.Priority, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, 
http.StatusOK, map[string]any{"entry_point_node": item}) +} + +func (m *Module) listFabricEntryPointNodes(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListFabricEntryPointNodes( + r.Context(), + r.URL.Query().Get("actor_user_id"), + chi.URLParam(r, "clusterID"), + chi.URLParam(r, "entryPointID"), + ) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"entry_point_nodes": items}) +} + +func (m *Module) listFabricEgressPools(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListFabricEgressPools(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"egress_pools": items}) +} + +func (m *Module) createFabricEgressPool(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Name string `json:"name"` + Status string `json:"status"` + Description *string `json:"description"` + RouteScope json.RawMessage `json:"route_scope"` + Policy json.RawMessage `json:"policy"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid fabric egress pool payload") + return + } + item, err := m.service.CreateFabricEgressPool(r.Context(), CreateFabricEgressPoolInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + Name: payload.Name, + Status: payload.Status, + Description: payload.Description, + RouteScope: payload.RouteScope, + Policy: payload.Policy, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"egress_pool": item}) +} + +func (m *Module) setFabricEgressPoolNode(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Status string 
`json:"status"` + Priority int `json:"priority"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid fabric egress pool node payload") + return + } + item, err := m.service.SetFabricEgressPoolNode(r.Context(), SetFabricEgressPoolNodeInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + EgressPoolID: chi.URLParam(r, "egressPoolID"), + NodeID: chi.URLParam(r, "nodeID"), + Status: payload.Status, + Priority: payload.Priority, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"egress_pool_node": item}) +} + +func (m *Module) listFabricEgressPoolNodes(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListFabricEgressPoolNodes( + r.Context(), + r.URL.Query().Get("actor_user_id"), + chi.URLParam(r, "clusterID"), + chi.URLParam(r, "egressPoolID"), + ) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"egress_pool_nodes": items}) +} + +func (m *Module) createVPNConnection(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + OrganizationID string `json:"organization_id"` + Name string `json:"name"` + TargetEndpoint json.RawMessage `json:"target_endpoint"` + ProtocolFamily string `json:"protocol_family"` + CredentialRef *string `json:"credential_ref"` + Mode string `json:"mode"` + DesiredState string `json:"desired_state"` + AllowedNodePolicy json.RawMessage `json:"allowed_node_policy"` + RoutingUsage json.RawMessage `json:"routing_usage"` + RoutePolicy json.RawMessage `json:"route_policy"` + QoSPolicy json.RawMessage `json:"qos_policy"` + PlacementPolicy json.RawMessage `json:"placement_policy"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + 
httpx.WriteError(w, http.StatusBadRequest, "invalid vpn connection payload") + return + } + item, err := m.service.CreateVPNConnection(r.Context(), CreateVPNConnectionInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + OrganizationID: payload.OrganizationID, + Name: payload.Name, + TargetEndpoint: payload.TargetEndpoint, + ProtocolFamily: payload.ProtocolFamily, + CredentialRef: payload.CredentialRef, + Mode: payload.Mode, + DesiredState: payload.DesiredState, + AllowedNodePolicy: payload.AllowedNodePolicy, + RoutingUsage: payload.RoutingUsage, + RoutePolicy: payload.RoutePolicy, + QoSPolicy: payload.QoSPolicy, + PlacementPolicy: payload.PlacementPolicy, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"vpn_connection": item}) +} + +func (m *Module) listVPNConnections(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListVPNConnections(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"vpn_connections": items}) +} + +func (m *Module) getVPNConnection(w http.ResponseWriter, r *http.Request) { + item, err := m.service.GetVPNConnection(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "vpnConnectionID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"vpn_connection": item}) +} + +func (m *Module) updateVPNConnectionDesiredState(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + DesiredState string `json:"desired_state"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn connection desired state payload") + return + } + item, err := 
m.service.UpdateVPNConnectionDesiredState(r.Context(), UpdateVPNConnectionDesiredStateInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + DesiredState: payload.DesiredState, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"vpn_connection": item}) +} + +func (m *Module) upsertVPNConnectionRoutePolicy(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + RouteType string `json:"route_type"` + Destination string `json:"destination"` + Action string `json:"action"` + ServiceType *string `json:"service_type"` + Priority int `json:"priority"` + Policy json.RawMessage `json:"policy"` + Status string `json:"status"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn route policy payload") + return + } + item, err := m.service.UpsertVPNConnectionRoutePolicy(r.Context(), UpsertVPNConnectionRoutePolicyInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + RouteType: payload.RouteType, + Destination: payload.Destination, + Action: payload.Action, + ServiceType: payload.ServiceType, + Priority: payload.Priority, + Policy: payload.Policy, + Status: payload.Status, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"route_policy": item}) +} + +func (m *Module) listVPNConnectionRoutePolicies(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListVPNConnectionRoutePolicies(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "vpnConnectionID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"route_policies": items}) +} + +func (m *Module) 
setVPNConnectionAllowedNodes(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + NodeIDs []string `json:"node_ids"` + RolePreference string `json:"role_preference"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn allowed nodes payload") + return + } + items, err := m.service.SetVPNConnectionAllowedNodes(r.Context(), SetVPNConnectionAllowedNodesInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + NodeIDs: payload.NodeIDs, + RolePreference: payload.RolePreference, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"allowed_nodes": items}) +} + +func (m *Module) listVPNConnectionAllowedNodes(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListVPNConnectionAllowedNodes(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "vpnConnectionID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"allowed_nodes": items}) +} + +func (m *Module) acquireVPNConnectionLease(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + OwnerNodeID string `json:"owner_node_id"` + TTLSeconds int `json:"ttl_seconds"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn lease acquire payload") + return + } + item, err := m.service.AcquireVPNConnectionLease(r.Context(), AcquireVPNConnectionLeaseInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + OwnerNodeID: 
payload.OwnerNodeID, + TTL: time.Duration(payload.TTLSeconds) * time.Second, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"lease": item}) +} + +func (m *Module) renewVPNConnectionLease(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + OwnerNodeID string `json:"owner_node_id"` + FencingToken string `json:"fencing_token"` + TTLSeconds int `json:"ttl_seconds"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn lease renew payload") + return + } + item, err := m.service.RenewVPNConnectionLease(r.Context(), RenewVPNConnectionLeaseInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + LeaseID: chi.URLParam(r, "leaseID"), + OwnerNodeID: payload.OwnerNodeID, + FencingToken: payload.FencingToken, + TTL: time.Duration(payload.TTLSeconds) * time.Second, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"lease": item}) +} + +func (m *Module) releaseVPNConnectionLease(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + OwnerNodeID string `json:"owner_node_id"` + FencingToken string `json:"fencing_token"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn lease release payload") + return + } + item, err := m.service.ReleaseVPNConnectionLease(r.Context(), ReleaseVPNConnectionLeaseInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + LeaseID: chi.URLParam(r, "leaseID"), + OwnerNodeID: payload.OwnerNodeID, + FencingToken: payload.FencingToken, + }) + if writeServiceError(w, err) { + return + } + 
httpx.WriteJSON(w, http.StatusOK, map[string]any{"lease": item}) +} + +func (m *Module) fenceVPNConnectionLease(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + Reason string `json:"reason"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn lease fence payload") + return + } + item, err := m.service.FenceVPNConnectionLease(r.Context(), FenceVPNConnectionLeaseInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + LeaseID: chi.URLParam(r, "leaseID"), + Reason: payload.Reason, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"lease": item}) +} + +func (m *Module) getActiveVPNConnectionLease(w http.ResponseWriter, r *http.Request) { + item, err := m.service.GetActiveVPNConnectionLease(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID"), chi.URLParam(r, "vpnConnectionID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"lease": item}) +} + +func (m *Module) expireStaleVPNConnectionLeases(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn stale lease payload") + return + } + items, err := m.service.ExpireStaleVPNConnectionLeases(r.Context(), ExpireStaleVPNConnectionLeasesInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"expired_leases": items}) +} + +func (m *Module) getClusterAuthority(w http.ResponseWriter, r *http.Request) { + item, err := m.service.GetClusterAuthorityState(r.Context(), 
r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"authority_state": item}) +} + +func (m *Module) updateClusterAuthority(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + AuthorityState string `json:"authority_state"` + MutationMode string `json:"mutation_mode"` + Notes *string `json:"notes"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid cluster authority payload") + return + } + item, err := m.service.UpdateClusterAuthorityState(r.Context(), UpdateClusterAuthorityInput{ + ActorUserID: payload.ActorUserID, + ClusterID: chi.URLParam(r, "clusterID"), + AuthorityState: payload.AuthorityState, + MutationMode: payload.MutationMode, + Notes: payload.Notes, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"authority_state": item}) +} + +func (m *Module) listClusterAdminSummaries(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListClusterAdminSummaries(r.Context(), r.URL.Query().Get("actor_user_id")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"cluster_summaries": items}) +} + +func (m *Module) listFabricTestingFlags(w http.ResponseWriter, r *http.Request) { + items, err := m.service.ListFabricTestingFlags(r.Context(), r.URL.Query().Get("actor_user_id")) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"testing_flags": items}) +} + +func (m *Module) upsertFabricTestingFlag(w http.ResponseWriter, r *http.Request) { + var payload struct { + ActorUserID string `json:"actor_user_id"` + ScopeType string `json:"scope_type"` + ScopeID *string `json:"scope_id"` + ClusterID *string `json:"cluster_id"` + Enabled bool `json:"enabled"` + TelemetryEnabled bool 
`json:"telemetry_enabled"` + SyntheticLinksEnabled bool `json:"synthetic_links_enabled"` + HistoryRetentionHours int `json:"history_retention_hours"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid testing flag payload") + return + } + item, err := m.service.UpsertFabricTestingFlag(r.Context(), UpsertFabricTestingFlagInput{ + ActorUserID: payload.ActorUserID, + ScopeType: payload.ScopeType, + ScopeID: payload.ScopeID, + ClusterID: payload.ClusterID, + Enabled: payload.Enabled, + TelemetryEnabled: payload.TelemetryEnabled, + SyntheticLinksEnabled: payload.SyntheticLinksEnabled, + HistoryRetentionHours: payload.HistoryRetentionHours, + Metadata: payload.Metadata, + }) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"testing_flag": item}) +} + +func (m *Module) listAuditEvents(w http.ResponseWriter, r *http.Request) { + limit, _ := strconv.Atoi(r.URL.Query().Get("limit")) + items, err := m.service.ListAuditEvents(r.Context(), r.URL.Query().Get("actor_user_id"), chi.URLParam(r, "clusterID"), limit) + if writeServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"audit_events": items}) +} + +func writeServiceError(w http.ResponseWriter, err error) bool { + if err == nil { + return false + } + switch { + case errors.Is(err, ErrAccessDenied): + httpx.WriteError(w, http.StatusForbidden, err.Error()) + case errors.Is(err, ErrVPNLeaseOwnerNotAllowed), errors.Is(err, ErrVPNLeaseOwnerRoleRequired): + httpx.WriteError(w, http.StatusForbidden, err.Error()) + case errors.Is(err, ErrClusterReadOnly): + httpx.WriteError(w, http.StatusConflict, err.Error()) + case errors.Is(err, ErrVPNLeaseAlreadyActive): + httpx.WriteError(w, http.StatusConflict, err.Error()) + case errors.Is(err, ErrInvalidPayload), errors.Is(err, ErrInvalidJoinToken), errors.Is(err, ErrInvalidNodeRole): + 
httpx.WriteError(w, http.StatusBadRequest, err.Error()) + case errors.Is(err, ErrInvalidCluster), errors.Is(err, ErrInvalidJoinRequest), errors.Is(err, ErrInvalidVPNConnection), errors.Is(err, ErrInvalidVPNLease), errors.Is(err, pgx.ErrNoRows): + httpx.WriteError(w, http.StatusNotFound, err.Error()) + default: + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + } + return true +} diff --git a/backend/internal/modules/cluster/postgres_store.go b/backend/internal/modules/cluster/postgres_store.go new file mode 100644 index 0000000..a968eab --- /dev/null +++ b/backend/internal/modules/cluster/postgres_store.go @@ -0,0 +1,2994 @@ +package cluster + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strings" + "time" + + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/example/remote-access-platform/backend/internal/platform/authority" + "github.com/example/remote-access-platform/backend/internal/platform/clusterauth" + "github.com/example/remote-access-platform/backend/internal/platform/secrets" +) + +type PostgresStore struct { + db *pgxpool.Pool + authority *authority.Verifier + clusterKeyEncryptor *secrets.Encryptor +} + +const encryptedClusterAuthorityKeyPrefix = "enc:v1:" + +func NewPostgresStore(db *pgxpool.Pool, verifiers ...*authority.Verifier) *PostgresStore { + var authorityVerifier *authority.Verifier + if len(verifiers) > 0 { + authorityVerifier = verifiers[0] + } + return &PostgresStore{db: db, authority: authorityVerifier} +} + +func (s *PostgresStore) WithClusterKeyEncryptor(encryptor *secrets.Encryptor) *PostgresStore { + if s != nil { + s.clusterKeyEncryptor = encryptor + } + return s +} + +func (s *PostgresStore) encodeClusterAuthorityPrivateKey(clusterID, privateKey string) (string, error) { + if s == nil || s.clusterKeyEncryptor == nil { + return privateKey, nil + } + encrypted, err := 
// clusterAuthorityPrivateKeyAAD builds the additional authenticated data that
// is passed to the encryptor when a cluster authority private key is
// encrypted or decrypted.
//
// The result is the fixed prefix "rap-cluster-authority-v1|" followed by
// clusterID with surrounding whitespace trimmed.
func clusterAuthorityPrivateKeyAAD(clusterID string) []byte {
	const domain = "rap-cluster-authority-v1|"

	trimmedID := strings.TrimSpace(clusterID)
	aad := make([]byte, 0, len(domain)+len(trimmedID))
	aad = append(aad, domain...)
	aad = append(aad, trimmedID...)
	return aad
}
clusterID string) (Cluster, error) { + row := s.db.QueryRow(ctx, ` + SELECT id::text, slug, name, status, region, metadata, created_at, updated_at + FROM clusters + WHERE id = $1::uuid + `, clusterID) + return scanCluster(row) +} + +func (s *PostgresStore) CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error) { + id := uuid.NewString() + now := time.Now().UTC() + keys, err := clusterauth.GenerateKeyPair() + if err != nil { + return Cluster{}, err + } + storedPrivateKey, err := s.encodeClusterAuthorityPrivateKey(id, keys.PrivateKeyB64) + if err != nil { + return Cluster{}, err + } + tx, err := s.db.Begin(ctx) + if err != nil { + return Cluster{}, err + } + defer func() { _ = tx.Rollback(ctx) }() + + row := tx.QueryRow(ctx, ` + INSERT INTO clusters (id, slug, name, status, region, metadata, created_at, updated_at) + VALUES ($1::uuid, $2, $3, 'active', $4, $5::jsonb, $6, $6) + RETURNING id::text, slug, name, status, region, metadata, created_at, updated_at + `, id, input.Slug, input.Name, input.Region, []byte(input.Metadata), now) + item, err := scanCluster(row) + if err != nil { + return Cluster{}, err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO cluster_authority_states (cluster_id, authority_state, mutation_mode, term, notes, updated_by_user_id, updated_at) + VALUES ($1::uuid, 'authoritative', 'normal', 1, 'cluster created with authority key', $2::uuid, $3) + ON CONFLICT (cluster_id) DO NOTHING + `, id, input.ActorUserID, now); err != nil { + return Cluster{}, err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO cluster_authorities ( + cluster_id, authority_state, key_algorithm, public_key, public_key_fingerprint, + private_key, created_by_user_id, created_at, updated_at, metadata + ) VALUES ($1::uuid, 'active', 'ed25519', $2, $3, $4, $5::uuid, $6, $6, $7::jsonb) + `, id, keys.PublicKeyB64, keys.Fingerprint, storedPrivateKey, input.ActorUserID, now, 
[]byte(`{"storage":"database_signer","production_target":"external_cluster_signer_or_hsm"}`)); err != nil { + return Cluster{}, err + } + if err := tx.Commit(ctx); err != nil { + return Cluster{}, err + } + return item, nil +} + +func (s *PostgresStore) UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error) { + now := time.Now().UTC() + row := s.db.QueryRow(ctx, ` + UPDATE clusters + SET name = $2, + status = $3, + region = $4, + metadata = $5::jsonb, + updated_at = $6 + WHERE id = $1::uuid + RETURNING id::text, slug, name, status, region, metadata, created_at, updated_at + `, input.ClusterID, input.Name, input.Status, input.Region, []byte(input.Metadata), now) + return scanCluster(row) +} + +func (s *PostgresStore) GetClusterAuthority(ctx context.Context, clusterID string) (ClusterAuthorityKey, error) { + row := s.db.QueryRow(ctx, ` + SELECT cluster_id::text, authority_state, key_algorithm, public_key, + public_key_fingerprint, private_key, created_at, updated_at + FROM cluster_authorities + WHERE cluster_id = $1::uuid + `, clusterID) + item, err := scanClusterAuthority(row) + if err != nil { + return ClusterAuthorityKey{}, err + } + privateKey, err := s.decodeClusterAuthorityPrivateKey(item.ClusterID, item.PrivateKey) + if err != nil { + return ClusterAuthorityKey{}, err + } + item.PrivateKey = privateKey + return item, nil +} + +func (s *PostgresStore) EnsureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error) { + keys, err := clusterauth.GenerateKeyPair() + if err != nil { + return ClusterAuthorityKey{}, err + } + storedPrivateKey, err := s.encodeClusterAuthorityPrivateKey(clusterID, keys.PrivateKeyB64) + if err != nil { + return ClusterAuthorityKey{}, err + } + now := time.Now().UTC() + _, err = s.db.Exec(ctx, ` + INSERT INTO cluster_authorities ( + cluster_id, authority_state, key_algorithm, public_key, public_key_fingerprint, + private_key, created_by_user_id, created_at, updated_at, 
metadata + ) + SELECT c.id, 'active', 'ed25519', $2, $3, $4, $5::uuid, $6, $6, + '{"storage":"database_signer","created_by":"ensure_cluster_authority"}'::jsonb + FROM clusters c + WHERE c.id = $1::uuid + ON CONFLICT (cluster_id) DO NOTHING + `, clusterID, keys.PublicKeyB64, keys.Fingerprint, storedPrivateKey, actorUserID, now) + if err != nil { + return ClusterAuthorityKey{}, err + } + _, _ = s.db.Exec(ctx, ` + INSERT INTO cluster_authority_states (cluster_id, authority_state, mutation_mode, term, notes, updated_by_user_id, updated_at) + VALUES ($1::uuid, 'authoritative', 'normal', 1, 'authority key ensured', $2::uuid, $3) + ON CONFLICT (cluster_id) DO NOTHING + `, clusterID, actorUserID, now) + return s.GetClusterAuthority(ctx, clusterID) +} + +func (s *PostgresStore) ListClusterNodes(ctx context.Context, clusterID string) ([]ClusterNode, error) { + rows, err := s.db.Query(ctx, ` + SELECT n.id::text, n.owner_organization_id::text, n.node_key, n.name, n.ownership_type, + n.registration_status, n.health_status, n.version_state, n.partition_state, + n.reported_version, n.last_seen_at, cm.membership_status, cm.metadata, + ng.id::text, ng.name, + n.created_at, n.updated_at + FROM cluster_memberships cm + JOIN nodes n ON n.id = cm.node_id + LEFT JOIN cluster_node_group_memberships ngm ON ngm.cluster_id = cm.cluster_id AND ngm.node_id = cm.node_id + LEFT JOIN cluster_node_groups ng ON ng.cluster_id = ngm.cluster_id AND ng.id = ngm.group_id + WHERE cm.cluster_id = $1::uuid + ORDER BY n.created_at DESC + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []ClusterNode + for rows.Next() { + item, err := scanClusterNode(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) ListNodeGroups(ctx context.Context, clusterID string) ([]ClusterNodeGroup, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, parent_group_id::text, name, description, 
+ sort_order, metadata, created_by_user_id::text, created_at, updated_at + FROM cluster_node_groups + WHERE cluster_id = $1::uuid + ORDER BY sort_order, name + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []ClusterNodeGroup + for rows.Next() { + item, err := scanNodeGroup(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + if out == nil { + out = []ClusterNodeGroup{} + } + return out, rows.Err() +} + +func (s *PostgresStore) CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error) { + id := uuid.NewString() + row := s.db.QueryRow(ctx, ` + WITH parent_ok AS ( + SELECT $3::uuid AS parent_group_id + WHERE $3::uuid IS NULL + OR EXISTS ( + SELECT 1 + FROM cluster_node_groups parent + WHERE parent.cluster_id = $2::uuid + AND parent.id = $3::uuid + ) + ) + INSERT INTO cluster_node_groups ( + id, cluster_id, parent_group_id, name, description, sort_order, metadata, created_by_user_id, created_at, updated_at + ) + SELECT $1::uuid, $2::uuid, parent_group_id, $4, $5, $6, $7::jsonb, $8::uuid, NOW(), NOW() + FROM parent_ok + RETURNING id::text, cluster_id::text, parent_group_id::text, name, description, + sort_order, metadata, created_by_user_id::text, created_at, updated_at + `, id, input.ClusterID, input.ParentGroupID, input.Name, input.Description, input.SortOrder, []byte(input.Metadata), input.ActorUserID) + item, err := scanNodeGroup(row) + if err != nil { + return ClusterNodeGroup{}, err + } + _ = s.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "cluster_node_group.created", + TargetType: "cluster_node_group", + TargetID: &item.ID, + Payload: json.RawMessage(`{"hierarchical":true}`), + CreatedAt: time.Now().UTC(), + }) + return item, nil +} + +func (s *PostgresStore) CreateJoinToken(ctx context.Context, input CreateJoinTokenInput, tokenHash string) (NodeJoinToken, error) { + id := uuid.NewString() + row := 
s.db.QueryRow(ctx, ` + INSERT INTO node_join_tokens ( + id, cluster_id, token_hash, scope, expires_at, max_uses, used_count, status, created_by_user_id, created_at + ) VALUES ($1::uuid, $2::uuid, $3, $4::jsonb, $5, $6, 0, 'active', $7::uuid, NOW()) + RETURNING id::text, cluster_id::text, scope, expires_at, max_uses, used_count, status, + created_by_user_id::text, created_at, revoked_at, authority_payload, authority_signature + `, id, input.ClusterID, tokenHash, []byte(input.Scope), input.ExpiresAt, input.MaxUses, input.ActorUserID) + return scanJoinToken(row) +} + +func (s *PostgresStore) SetJoinTokenAuthority(ctx context.Context, clusterID, tokenID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinToken, error) { + signatureJSON, err := json.Marshal(signature) + if err != nil { + return NodeJoinToken{}, err + } + row := s.db.QueryRow(ctx, ` + UPDATE node_join_tokens + SET authority_payload = $3::jsonb, + authority_signature = $4::jsonb + WHERE cluster_id = $1::uuid + AND id = $2::uuid + RETURNING id::text, cluster_id::text, scope, expires_at, max_uses, used_count, status, + created_by_user_id::text, created_at, revoked_at, authority_payload, authority_signature + `, clusterID, tokenID, []byte(payload), signatureJSON) + return scanJoinToken(row) +} + +func (s *PostgresStore) GetValidJoinTokenByHash(ctx context.Context, clusterID, tokenHash string) (NodeJoinToken, error) { + row := s.db.QueryRow(ctx, ` + SELECT id::text, cluster_id::text, scope, expires_at, max_uses, used_count, status, + created_by_user_id::text, created_at, revoked_at, authority_payload, authority_signature + FROM node_join_tokens + WHERE cluster_id = $1::uuid + AND token_hash = $2 + AND status = 'active' + AND expires_at > NOW() + AND used_count < max_uses + `, clusterID, tokenHash) + return scanJoinToken(row) +} + +func (s *PostgresStore) RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error) { + row := s.db.QueryRow(ctx, ` + UPDATE 
node_join_tokens + SET status = 'revoked', + revoked_at = NOW() + WHERE id = $1::uuid + AND cluster_id = $2::uuid + AND status = 'active' + RETURNING id::text, cluster_id::text, scope, expires_at, max_uses, used_count, status, + created_by_user_id::text, created_at, revoked_at, authority_payload, authority_signature + `, input.TokenID, input.ClusterID) + return scanJoinToken(row) +} + +func (s *PostgresStore) ExpireJoinTokens(ctx context.Context, clusterID string) error { + _, err := s.db.Exec(ctx, ` + UPDATE node_join_tokens + SET status = 'expired' + WHERE cluster_id = $1::uuid + AND status = 'active' + AND expires_at <= NOW() + `, clusterID) + return err +} + +func (s *PostgresStore) CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput, joinTokenID string) (NodeJoinRequest, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return NodeJoinRequest{}, err + } + defer tx.Rollback(ctx) + + tag, err := tx.Exec(ctx, ` + UPDATE node_join_tokens + SET used_count = used_count + 1 + WHERE id = $1::uuid + AND cluster_id = $2::uuid + AND status = 'active' + AND expires_at > NOW() + AND used_count < max_uses + `, joinTokenID, input.ClusterID) + if err != nil { + return NodeJoinRequest{}, err + } + if tag.RowsAffected() != 1 { + return NodeJoinRequest{}, pgx.ErrNoRows + } + + id := uuid.NewString() + row := tx.QueryRow(ctx, ` + INSERT INTO node_join_requests ( + id, cluster_id, join_token_id, node_name, node_fingerprint, public_key, + reported_capabilities, reported_facts, requested_roles, status, created_at, updated_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7::jsonb, $8::jsonb, $9::jsonb, 'pending', NOW(), NOW()) + RETURNING id::text, cluster_id::text, join_token_id::text, node_name, node_fingerprint, public_key, + reported_capabilities, reported_facts, requested_roles, status, reviewed_by_user_id::text, + reviewed_at, approved_node_id::text, rejection_reason, created_at, updated_at, approval_payload, approval_signature + `, id, 
input.ClusterID, joinTokenID, input.NodeName, input.NodeFingerprint, input.PublicKey, []byte(input.ReportedCapabilities), []byte(input.ReportedFacts), []byte(input.RequestedRoles)) + item, err := scanJoinRequest(row) + if err != nil { + return NodeJoinRequest{}, err + } + if err := tx.Commit(ctx); err != nil { + return NodeJoinRequest{}, err + } + return item, nil +} + +func (s *PostgresStore) GetJoinRequestForBootstrap(ctx context.Context, input GetJoinRequestBootstrapInput) (NodeJoinRequest, error) { + row := s.db.QueryRow(ctx, ` + SELECT id::text, cluster_id::text, join_token_id::text, node_name, node_fingerprint, public_key, + reported_capabilities, reported_facts, requested_roles, status, reviewed_by_user_id::text, + reviewed_at, approved_node_id::text, rejection_reason, created_at, updated_at, approval_payload, approval_signature + FROM node_join_requests + WHERE cluster_id = $1::uuid + AND id = $2::uuid + AND node_fingerprint = $3 + AND public_key = $4 + `, input.ClusterID, input.JoinRequestID, input.NodeFingerprint, input.PublicKey) + return scanJoinRequest(row) +} + +func (s *PostgresStore) ListJoinRequests(ctx context.Context, clusterID string) ([]NodeJoinRequest, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, join_token_id::text, node_name, node_fingerprint, public_key, + reported_capabilities, reported_facts, requested_roles, status, reviewed_by_user_id::text, + reviewed_at, approved_node_id::text, rejection_reason, created_at, updated_at, approval_payload, approval_signature + FROM node_join_requests + WHERE cluster_id = $1::uuid + ORDER BY created_at DESC + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeJoinRequest + for rows.Next() { + item, err := scanJoinRequest(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) 
(ApprovedJoinRequest, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return ApprovedJoinRequest{}, err + } + defer tx.Rollback(ctx) + + req, err := getJoinRequestForUpdate(ctx, tx, input.ClusterID, input.JoinRequestID) + if err != nil { + return ApprovedJoinRequest{}, err + } + if req.Status != JoinRequestStatusPending { + return ApprovedJoinRequest{}, errors.New("join request is not pending") + } + + now := time.Now().UTC() + nodeID := uuid.NewString() + nodeKey := input.NodeKey + if nodeKey == "" { + nodeKey = req.NodeFingerprint + } + ownershipType := input.OwnershipType + if ownershipType == "" { + ownershipType = "platform_managed" + } + + if _, err := tx.Exec(ctx, ` + INSERT INTO nodes ( + id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status, + version_state, partition_state, metadata, created_at, updated_at + ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, 'active', 'unknown', 'unknown', 'healthy', $6::jsonb, $7, $7) + `, nodeID, input.OwnerOrganizationID, nodeKey, req.NodeName, ownershipType, []byte(`{"created_from_join_request":true}`), now); err != nil { + return ApprovedJoinRequest{}, err + } + + if _, err := tx.Exec(ctx, ` + INSERT INTO cluster_memberships (cluster_id, node_id, membership_status, joined_at, metadata) + VALUES ($1::uuid, $2::uuid, 'active', $3, $4::jsonb) + `, input.ClusterID, nodeID, now, []byte(`{"created_from_join_request":true}`)); err != nil { + return ApprovedJoinRequest{}, err + } + + if _, err := tx.Exec(ctx, ` + INSERT INTO node_identities (node_id, public_key, identity_status, metadata, created_at, updated_at) + VALUES ($1::uuid, $2, 'active', $3::jsonb, $4, $4) + `, nodeID, req.PublicKey, []byte(`{"source":"join_request"}`), now); err != nil { + return ApprovedJoinRequest{}, err + } + + row := tx.QueryRow(ctx, ` + UPDATE node_join_requests + SET status = 'approved', + reviewed_by_user_id = $3::uuid, + reviewed_at = $4, + approved_node_id = $5::uuid, + updated_at = $4 + WHERE 
cluster_id = $1::uuid + AND id = $2::uuid + RETURNING id::text, cluster_id::text, join_token_id::text, node_name, node_fingerprint, public_key, + reported_capabilities, reported_facts, requested_roles, status, reviewed_by_user_id::text, + reviewed_at, approved_node_id::text, rejection_reason, created_at, updated_at, approval_payload, approval_signature + `, input.ClusterID, input.JoinRequestID, input.ActorUserID, now, nodeID) + updated, err := scanJoinRequest(row) + if err != nil { + return ApprovedJoinRequest{}, err + } + + if _, err := tx.Exec(ctx, ` + INSERT INTO cluster_audit_events (cluster_id, actor_user_id, event_type, target_type, target_id, payload, created_at) + VALUES ($1::uuid, $2::uuid, 'node_join_request.approved', 'node_join_request', $3, $4::jsonb, $5) + `, input.ClusterID, input.ActorUserID, input.JoinRequestID, []byte(fmt.Sprintf(`{"node_id":%q}`, nodeID)), now); err != nil { + return ApprovedJoinRequest{}, err + } + + if err := tx.Commit(ctx); err != nil { + return ApprovedJoinRequest{}, err + } + return ApprovedJoinRequest{ + JoinRequest: updated, + Bootstrap: NodeBootstrap{ + NodeID: nodeID, + ClusterID: input.ClusterID, + IdentityStatus: "active", + Certificate: map[string]any{ + "status": "pending_issuer_integration", + }, + HeartbeatEndpoint: fmt.Sprintf("/api/v1/clusters/%s/nodes/%s/heartbeats", input.ClusterID, nodeID), + }, + }, nil +} + +func (s *PostgresStore) SetJoinRequestApprovalAuthority(ctx context.Context, clusterID, joinRequestID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinRequest, error) { + signatureJSON, err := json.Marshal(signature) + if err != nil { + return NodeJoinRequest{}, err + } + row := s.db.QueryRow(ctx, ` + UPDATE node_join_requests + SET approval_payload = $3::jsonb, + approval_signature = $4::jsonb, + updated_at = NOW() + WHERE cluster_id = $1::uuid + AND id = $2::uuid + RETURNING id::text, cluster_id::text, join_token_id::text, node_name, node_fingerprint, public_key, + 
reported_capabilities, reported_facts, requested_roles, status, reviewed_by_user_id::text,
		          reviewed_at, approved_node_id::text, rejection_reason, created_at, updated_at, approval_payload, approval_signature
	`, clusterID, joinRequestID, []byte(payload), signatureJSON)
	return scanJoinRequest(row)
}

// RejectJoinRequest marks a pending join request as rejected and returns the
// updated row.
//
// The reviewer (input.ActorUserID), the review time and input.Reason are
// stored on the request. Only rows whose status is still 'pending' are
// matched, so an already reviewed request is left untouched; in that case the
// error is whatever scanJoinRequest reports for an empty result
// (presumably pgx.ErrNoRows - confirm against the helper).
func (s *PostgresStore) RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error) {
	now := time.Now().UTC()
	row := s.db.QueryRow(ctx, `
		UPDATE node_join_requests
		SET status = 'rejected',
		    reviewed_by_user_id = $3::uuid,
		    reviewed_at = $4,
		    rejection_reason = $5,
		    updated_at = $4
		WHERE cluster_id = $1::uuid
		  AND id = $2::uuid
		  AND status = 'pending'
		RETURNING id::text, cluster_id::text, join_token_id::text, node_name, node_fingerprint, public_key,
		          reported_capabilities, reported_facts, requested_roles, status, reviewed_by_user_id::text,
		          reviewed_at, approved_node_id::text, rejection_reason, created_at, updated_at, approval_payload, approval_signature
	`, input.ClusterID, input.JoinRequestID, input.ActorUserID, now, input.Reason)
	return scanJoinRequest(row)
}

// AssignNodeRole creates, refreshes or deactivates a role assignment for a
// node within a cluster and returns the resulting row.
//
// An empty input.Status is treated as "active".
//
//   - status == "active": a new assignment is inserted. If an active
//     assignment already exists for the same (cluster, node, role,
//     organization) key, only its policy and assigning user are refreshed.
//   - any other status: the currently active assignment with that key is
//     moved to the requested status; revoked_at is stamped when the status is
//     'revoked'. Policy and assigning user are overwritten with the input
//     values as well.
//
// A NULL organization is compared through the all-zero UUID sentinel in both
// paths. When the non-active path matches no active row, the error is whatever
// scanRoleAssignment reports for an empty result.
func (s *PostgresStore) AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error) {
	status := input.Status
	if status == "" {
		status = "active"
	}
	if status != "active" {
		// The UPDATE never needs a freshly generated id. Every placeholder up
		// to the highest one used must be referenced in the statement,
		// otherwise PostgreSQL cannot infer its type and rejects the prepared
		// statement, so only the seven values actually used are bound here.
		row := s.db.QueryRow(ctx, `
			UPDATE node_role_assignments
			SET status = $5,
			    revoked_at = CASE WHEN $5 = 'revoked' THEN NOW() ELSE revoked_at END,
			    policy = $6::jsonb,
			    assigned_by_user_id = $7::uuid
			WHERE cluster_id = $1::uuid
			  AND node_id = $2::uuid
			  AND role = $4
			  AND COALESCE(organization_id, '00000000-0000-0000-0000-000000000000'::uuid) =
			      COALESCE($3::uuid, '00000000-0000-0000-0000-000000000000'::uuid)
			  AND status = 'active'
			RETURNING id::text, cluster_id::text, node_id::text, organization_id::text, role, status,
			          policy, assigned_by_user_id::text, assigned_at, revoked_at
		`, input.ClusterID, input.NodeID, input.OrganizationID, input.Role, status, []byte(input.Policy), input.ActorUserID)
		return scanRoleAssignment(row)
	}

	// Only the insert path needs a new primary key.
	id := uuid.NewString()
	row := s.db.QueryRow(ctx, `
		INSERT INTO node_role_assignments (
			id, cluster_id, node_id, organization_id, role, status, policy, assigned_by_user_id, assigned_at
		) VALUES ($1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6, $7::jsonb, $8::uuid, NOW())
		ON CONFLICT (cluster_id, node_id, role, COALESCE(organization_id, '00000000-0000-0000-0000-000000000000'::uuid))
		WHERE status = 'active'
		DO UPDATE SET
			policy = EXCLUDED.policy,
			assigned_by_user_id = EXCLUDED.assigned_by_user_id
		RETURNING id::text, cluster_id::text, node_id::text, organization_id::text, role, status,
		          policy, assigned_by_user_id::text, assigned_at, revoked_at
	`, id, input.ClusterID, input.NodeID, input.OrganizationID, input.Role, status, []byte(input.Policy), input.ActorUserID)
	return scanRoleAssignment(row)
}

// ListNodeRoleAssignments returns every role assignment recorded for the node
// in the cluster, regardless of status, newest assignment first.
// The result is a nil slice when there are no rows.
func (s *PostgresStore) ListNodeRoleAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeRoleAssignment, error) {
	rows, err := s.db.Query(ctx, `
		SELECT id::text, cluster_id::text, node_id::text, organization_id::text, role, status,
		       policy, assigned_by_user_id::text, assigned_at, revoked_at
		FROM node_role_assignments
		WHERE cluster_id = $1::uuid
		  AND node_id = $2::uuid
		ORDER BY assigned_at DESC
	`, clusterID, nodeID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var out []NodeRoleAssignment
	for rows.Next() {
		item, err := scanRoleAssignment(rows)
		if err != nil {
			return nil, err
		}
		out = append(out, item)
	}
	return out, rows.Err()
}

func (s *PostgresStore) AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error) {
	tx, err := s.db.Begin(ctx)
	if err != nil {
		return ClusterNode{}, err
	}
	defer tx.Rollback(ctx)

	now := time.Now().UTC()
	membershipMetadata, err := json.Marshal(map[string]any{
"attached_from_existing_node": true, + "attached_at": now.Format(time.RFC3339Nano), + }) + if err != nil { + return ClusterNode{}, err + } + tag, err := tx.Exec(ctx, ` + WITH eligible_node AS ( + SELECT id + FROM nodes + WHERE id = $2::uuid + AND registration_status = 'active' + ) + INSERT INTO cluster_memberships (cluster_id, node_id, membership_status, joined_at, metadata) + SELECT $1::uuid, id, 'active', $3, $4::jsonb + FROM eligible_node + ON CONFLICT (cluster_id, node_id) DO UPDATE SET + membership_status = 'active', + metadata = cluster_memberships.metadata || EXCLUDED.metadata + WHERE cluster_memberships.membership_status <> 'revoked' + `, input.ClusterID, input.NodeID, now, membershipMetadata) + if err != nil { + return ClusterNode{}, err + } + if tag.RowsAffected() != 1 { + return ClusterNode{}, pgx.ErrNoRows + } + + for _, role := range input.Roles { + _, err := tx.Exec(ctx, ` + INSERT INTO node_role_assignments ( + id, cluster_id, node_id, role, status, policy, assigned_by_user_id, assigned_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, 'active', '{}'::jsonb, $5::uuid, $6) + ON CONFLICT DO NOTHING + `, uuid.NewString(), input.ClusterID, input.NodeID, role, input.ActorUserID, now) + if err != nil { + return ClusterNode{}, err + } + } + + auditPayload, err := json.Marshal(map[string]any{ + "attached_from_existing_node": true, + "roles": input.Roles, + }) + if err != nil { + return ClusterNode{}, err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO cluster_audit_events (cluster_id, actor_user_id, event_type, target_type, target_id, payload, created_at) + VALUES ($1::uuid, $2::uuid, 'cluster_membership.attached_existing_node', 'node', $3, $4::jsonb, $5) + `, input.ClusterID, input.ActorUserID, input.NodeID, auditPayload, now); err != nil { + return ClusterNode{}, err + } + + row := tx.QueryRow(ctx, ` + SELECT n.id::text, n.owner_organization_id::text, n.node_key, n.name, n.ownership_type, + n.registration_status, n.health_status, n.version_state, 
n.partition_state, + n.reported_version, n.last_seen_at, cm.membership_status, cm.metadata, + ng.id::text, ng.name, + n.created_at, n.updated_at + FROM cluster_memberships cm + JOIN nodes n ON n.id = cm.node_id + LEFT JOIN cluster_node_group_memberships ngm ON ngm.cluster_id = cm.cluster_id AND ngm.node_id = cm.node_id + LEFT JOIN cluster_node_groups ng ON ng.cluster_id = ngm.cluster_id AND ng.id = ngm.group_id + WHERE cm.cluster_id = $1::uuid + AND cm.node_id = $2::uuid + `, input.ClusterID, input.NodeID) + item, err := scanClusterNode(row) + if err != nil { + return ClusterNode{}, err + } + if err := tx.Commit(ctx); err != nil { + return ClusterNode{}, err + } + return item, nil +} + +func (s *PostgresStore) AssignNodeToGroup(ctx context.Context, input AssignNodeGroupInput) (ClusterNode, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return ClusterNode{}, err + } + defer tx.Rollback(ctx) + + now := time.Now().UTC() + if input.GroupID == nil { + tag, err := tx.Exec(ctx, ` + DELETE FROM cluster_node_group_memberships + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + AND EXISTS ( + SELECT 1 + FROM cluster_memberships cm + WHERE cm.cluster_id = $1::uuid + AND cm.node_id = $2::uuid + AND cm.membership_status <> 'revoked' + ) + `, input.ClusterID, input.NodeID) + if err != nil { + return ClusterNode{}, err + } + if tag.RowsAffected() == 0 { + var exists bool + if err := tx.QueryRow(ctx, ` + SELECT EXISTS ( + SELECT 1 + FROM cluster_memberships + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + AND membership_status <> 'revoked' + ) + `, input.ClusterID, input.NodeID).Scan(&exists); err != nil { + return ClusterNode{}, err + } + if !exists { + return ClusterNode{}, pgx.ErrNoRows + } + } + } else { + tag, err := tx.Exec(ctx, ` + INSERT INTO cluster_node_group_memberships ( + cluster_id, node_id, group_id, assigned_by_user_id, assigned_at, metadata + ) + SELECT $1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, '{}'::jsonb + WHERE EXISTS ( + SELECT 1 + 
FROM cluster_memberships cm + WHERE cm.cluster_id = $1::uuid + AND cm.node_id = $2::uuid + AND cm.membership_status <> 'revoked' + ) + AND EXISTS ( + SELECT 1 + FROM cluster_node_groups ng + WHERE ng.cluster_id = $1::uuid + AND ng.id = $3::uuid + ) + ON CONFLICT (cluster_id, node_id) DO UPDATE SET + group_id = EXCLUDED.group_id, + assigned_by_user_id = EXCLUDED.assigned_by_user_id, + assigned_at = EXCLUDED.assigned_at + `, input.ClusterID, input.NodeID, input.GroupID, input.ActorUserID, now) + if err != nil { + return ClusterNode{}, err + } + if tag.RowsAffected() != 1 { + return ClusterNode{}, pgx.ErrNoRows + } + } + + auditPayload := json.RawMessage(`{"group_id":null}`) + if input.GroupID != nil { + auditPayload = json.RawMessage(fmt.Sprintf(`{"group_id":%q}`, *input.GroupID)) + } + if _, err := tx.Exec(ctx, ` + INSERT INTO cluster_audit_events (cluster_id, actor_user_id, event_type, target_type, target_id, payload, created_at) + VALUES ($1::uuid, $2::uuid, 'cluster_node_group.assigned', 'node', $3, $4::jsonb, $5) + `, input.ClusterID, input.ActorUserID, input.NodeID, auditPayload, now); err != nil { + return ClusterNode{}, err + } + + row := tx.QueryRow(ctx, ` + SELECT n.id::text, n.owner_organization_id::text, n.node_key, n.name, n.ownership_type, + n.registration_status, n.health_status, n.version_state, n.partition_state, + n.reported_version, n.last_seen_at, cm.membership_status, cm.metadata, + ng.id::text, ng.name, + n.created_at, n.updated_at + FROM cluster_memberships cm + JOIN nodes n ON n.id = cm.node_id + LEFT JOIN cluster_node_group_memberships ngm ON ngm.cluster_id = cm.cluster_id AND ngm.node_id = cm.node_id + LEFT JOIN cluster_node_groups ng ON ng.cluster_id = ngm.cluster_id AND ng.id = ngm.group_id + WHERE cm.cluster_id = $1::uuid + AND cm.node_id = $2::uuid + `, input.ClusterID, input.NodeID) + item, err := scanClusterNode(row) + if err != nil { + return ClusterNode{}, err + } + if err := tx.Commit(ctx); err != nil { + return ClusterNode{}, err + 
} + return item, nil +} + +func (s *PostgresStore) RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return NodeHeartbeat{}, err + } + defer tx.Rollback(ctx) + + id := uuid.NewString() + now := time.Now().UTC() + row := tx.QueryRow(ctx, ` + INSERT INTO node_heartbeats ( + id, cluster_id, node_id, health_status, reported_version, capabilities, service_states, metadata, observed_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6::jsonb, $7::jsonb, $8::jsonb, $9) + RETURNING id::text, cluster_id::text, node_id::text, health_status, reported_version, + capabilities, service_states, metadata, observed_at + `, id, input.ClusterID, input.NodeID, input.HealthStatus, input.ReportedVersion, []byte(input.Capabilities), []byte(input.ServiceStates), []byte(input.Metadata), now) + heartbeat, err := scanHeartbeat(row) + if err != nil { + return NodeHeartbeat{}, err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO node_latest_heartbeats ( + cluster_id, node_id, heartbeat_id, health_status, reported_version, capabilities, service_states, metadata, observed_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6::jsonb, $7::jsonb, $8::jsonb, $9) + ON CONFLICT (cluster_id, node_id) DO UPDATE SET + heartbeat_id = EXCLUDED.heartbeat_id, + health_status = EXCLUDED.health_status, + reported_version = EXCLUDED.reported_version, + capabilities = EXCLUDED.capabilities, + service_states = EXCLUDED.service_states, + metadata = EXCLUDED.metadata, + observed_at = EXCLUDED.observed_at + `, input.ClusterID, input.NodeID, heartbeat.ID, heartbeat.HealthStatus, heartbeat.ReportedVersion, []byte(heartbeat.Capabilities), []byte(heartbeat.ServiceStates), []byte(heartbeat.Metadata), heartbeat.ObservedAt); err != nil { + return NodeHeartbeat{}, err + } + if _, err := tx.Exec(ctx, ` + UPDATE nodes + SET health_status = $2, + reported_version = COALESCE($3, reported_version), + last_seen_at = $4, + updated_at = $4 
+ WHERE id = $1::uuid + `, input.NodeID, input.HealthStatus, input.ReportedVersion, heartbeat.ObservedAt); err != nil { + return NodeHeartbeat{}, err + } + if _, err := tx.Exec(ctx, ` + UPDATE cluster_memberships + SET last_seen_at = $3 + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + `, input.ClusterID, input.NodeID, heartbeat.ObservedAt); err != nil { + return NodeHeartbeat{}, err + } + if err := tx.Commit(ctx); err != nil { + return NodeHeartbeat{}, err + } + return heartbeat, nil +} + +func (s *PostgresStore) ListNodeHeartbeats(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error) { + if limit <= 0 || limit > 500 { + limit = 100 + } + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, node_id::text, health_status, reported_version, + capabilities, service_states, metadata, observed_at + FROM node_heartbeats + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + ORDER BY observed_at DESC + LIMIT $3 + `, clusterID, nodeID, limit) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeHeartbeat + for rows.Next() { + item, err := scanHeartbeat(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error { + tx, err := s.db.Begin(ctx) + if err != nil { + return err + } + defer tx.Rollback(ctx) + now := time.Now().UTC() + tag, err := tx.Exec(ctx, ` + UPDATE node_identities + SET identity_status = 'revoked', + revoked_at = $3, + updated_at = $3, + metadata = metadata || $4::jsonb + WHERE node_id = $1::uuid + AND EXISTS ( + SELECT 1 FROM cluster_memberships cm + WHERE cm.cluster_id = $2::uuid + AND cm.node_id = node_identities.node_id + ) + `, input.NodeID, input.ClusterID, now, []byte(fmt.Sprintf(`{"revocation_reason":%q}`, input.Reason))) + if err != nil { + return err + } + if tag.RowsAffected() != 1 { + return pgx.ErrNoRows + } + if _, err := 
tx.Exec(ctx, ` + UPDATE nodes + SET registration_status = 'revoked', + updated_at = $2 + WHERE id = $1::uuid + `, input.NodeID, now); err != nil { + return err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO cluster_audit_events (cluster_id, actor_user_id, event_type, target_type, target_id, payload, created_at) + VALUES ($1::uuid, $2::uuid, 'node_identity.revoked', 'node', $3, $4::jsonb, $5) + `, input.ClusterID, input.ActorUserID, input.NodeID, []byte(fmt.Sprintf(`{"reason":%q}`, input.Reason)), now); err != nil { + return err + } + return tx.Commit(ctx) +} + +func (s *PostgresStore) DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error { + now := time.Now().UTC() + tag, err := s.db.Exec(ctx, ` + UPDATE cluster_memberships + SET membership_status = 'disabled', + metadata = metadata || $4::jsonb + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + AND membership_status <> 'revoked' + `, input.ClusterID, input.NodeID, now, []byte(fmt.Sprintf(`{"disabled_reason":%q,"disabled_at":%q}`, input.Reason, now.Format(time.RFC3339Nano)))) + if err != nil { + return err + } + if tag.RowsAffected() != 1 { + return pgx.ErrNoRows + } + return s.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "cluster_membership.disabled", + TargetType: "node", + TargetID: &input.NodeID, + Payload: json.RawMessage(fmt.Sprintf(`{"reason":%q}`, input.Reason)), + CreatedAt: now, + }) +} + +func (s *PostgresStore) UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error) { + if input.HistoryRetentionHours <= 0 { + input.HistoryRetentionHours = 24 + } + if len(input.Metadata) == 0 { + input.Metadata = json.RawMessage(`{}`) + } + row := s.db.QueryRow(ctx, ` + UPDATE fabric_testing_flags + SET enabled = $4, + telemetry_enabled = $5, + synthetic_links_enabled = $6, + history_retention_hours = $7, + metadata = $8::jsonb, + updated_by_user_id = $9::uuid, + 
updated_at = NOW() + WHERE scope_type = $1 + AND COALESCE(scope_id, '00000000-0000-0000-0000-000000000000'::uuid) = COALESCE($2::uuid, '00000000-0000-0000-0000-000000000000'::uuid) + AND COALESCE(cluster_id, '00000000-0000-0000-0000-000000000000'::uuid) = COALESCE($3::uuid, '00000000-0000-0000-0000-000000000000'::uuid) + RETURNING id::text, scope_type, scope_id::text, cluster_id::text, enabled, telemetry_enabled, + synthetic_links_enabled, history_retention_hours, metadata, updated_by_user_id::text, updated_at + `, input.ScopeType, input.ScopeID, input.ClusterID, input.Enabled, input.TelemetryEnabled, input.SyntheticLinksEnabled, input.HistoryRetentionHours, []byte(input.Metadata), input.ActorUserID) + item, err := scanFabricTestingFlag(row) + if err == nil { + return item, nil + } + if !errors.Is(err, pgx.ErrNoRows) { + return FabricTestingFlag{}, err + } + row = s.db.QueryRow(ctx, ` + INSERT INTO fabric_testing_flags ( + id, scope_type, scope_id, cluster_id, enabled, telemetry_enabled, synthetic_links_enabled, + history_retention_hours, metadata, updated_by_user_id, updated_at + ) VALUES ($1::uuid, $2, $3::uuid, $4::uuid, $5, $6, $7, $8, $9::jsonb, $10::uuid, NOW()) + RETURNING id::text, scope_type, scope_id::text, cluster_id::text, enabled, telemetry_enabled, + synthetic_links_enabled, history_retention_hours, metadata, updated_by_user_id::text, updated_at + `, uuid.NewString(), input.ScopeType, input.ScopeID, input.ClusterID, input.Enabled, input.TelemetryEnabled, input.SyntheticLinksEnabled, input.HistoryRetentionHours, []byte(input.Metadata), input.ActorUserID) + return scanFabricTestingFlag(row) +} + +func (s *PostgresStore) ListFabricTestingFlags(ctx context.Context) ([]FabricTestingFlag, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, scope_type, scope_id::text, cluster_id::text, enabled, telemetry_enabled, + synthetic_links_enabled, history_retention_hours, metadata, updated_by_user_id::text, updated_at + FROM fabric_testing_flags + ORDER BY 
scope_type, updated_at DESC + `) + if err != nil { + return nil, err + } + defer rows.Close() + var out []FabricTestingFlag + for rows.Next() { + item, err := scanFabricTestingFlag(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) (EffectiveNodeTestingFlags, error) { + rows, err := s.db.Query(ctx, ` + WITH node_scope AS ( + SELECT n.owner_organization_id + FROM nodes n + JOIN cluster_memberships cm ON cm.node_id = n.id AND cm.cluster_id = $1::uuid + WHERE n.id = $2::uuid + ) + SELECT f.scope_type, f.enabled, f.telemetry_enabled, f.synthetic_links_enabled, + f.history_retention_hours, f.metadata + FROM fabric_testing_flags f + LEFT JOIN node_scope ns ON TRUE + WHERE ( + f.scope_type = 'platform' + OR (f.scope_type = 'organization' AND f.scope_id = ns.owner_organization_id) + OR (f.scope_type = 'node' AND f.scope_id = $2::uuid) + ) + AND (f.cluster_id IS NULL OR f.cluster_id = $1::uuid) + ORDER BY CASE f.scope_type + WHEN 'platform' THEN 1 + WHEN 'organization' THEN 2 + WHEN 'node' THEN 3 + ELSE 4 + END + `, clusterID, nodeID) + if err != nil { + return EffectiveNodeTestingFlags{}, err + } + defer rows.Close() + out := EffectiveNodeTestingFlags{HistoryRetentionHours: 24, Metadata: json.RawMessage(`{}`)} + for rows.Next() { + var scope string + var metadata json.RawMessage + var retention int + var enabled, telemetry, links bool + if err := rows.Scan(&scope, &enabled, &telemetry, &links, &retention, &metadata); err != nil { + return EffectiveNodeTestingFlags{}, err + } + if enabled { + out.Enabled = true + } + if telemetry { + out.TelemetryEnabled = true + } + if links { + out.SyntheticLinksEnabled = true + } + if retention > 0 { + out.HistoryRetentionHours = retention + } + out.AppliedScopes = append(out.AppliedScopes, scope) + if len(metadata) > 0 && string(metadata) != "{}" { + out.Metadata = metadata + } + } + 
return out, rows.Err() +} + +func (s *PostgresStore) RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error) { + if input.ObservedAt.IsZero() { + input.ObservedAt = time.Now().UTC() + } + if len(input.Payload) == 0 { + input.Payload = json.RawMessage(`{}`) + } + row := s.db.QueryRow(ctx, ` + INSERT INTO node_telemetry_observations ( + id, cluster_id, node_id, cpu_percent, memory_used_bytes, memory_total_bytes, + disk_used_bytes, disk_total_bytes, network_rx_bytes, network_tx_bytes, + process_count, payload, observed_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb, $13) + RETURNING id::text, cluster_id::text, node_id::text, cpu_percent, memory_used_bytes, + memory_total_bytes, disk_used_bytes, disk_total_bytes, network_rx_bytes, + network_tx_bytes, process_count, payload, observed_at + `, uuid.NewString(), input.ClusterID, input.NodeID, input.CPUPercent, input.MemoryUsedBytes, input.MemoryTotalBytes, input.DiskUsedBytes, input.DiskTotalBytes, input.NetworkRxBytes, input.NetworkTxBytes, input.ProcessCount, []byte(input.Payload), input.ObservedAt) + return scanNodeTelemetry(row) +} + +func (s *PostgresStore) ListNodeTelemetry(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error) { + if limit <= 0 || limit > 1000 { + limit = 240 + } + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, node_id::text, cpu_percent, memory_used_bytes, + memory_total_bytes, disk_used_bytes, disk_total_bytes, network_rx_bytes, + network_tx_bytes, process_count, payload, observed_at + FROM node_telemetry_observations + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + ORDER BY observed_at DESC + LIMIT $3 + `, clusterID, nodeID, limit) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeTelemetryObservation + for rows.Next() { + item, err := scanNodeTelemetry(rows) + if err != nil { + return nil, err + } + out = 
append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error) { + row := s.db.QueryRow(ctx, ` + INSERT INTO node_workload_desired_states ( + cluster_id, node_id, service_type, desired_state, version, runtime_mode, + artifact_ref, config, environment, updated_by_user_id, updated_at + ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7, $8::jsonb, $9::jsonb, $10::uuid, NOW()) + ON CONFLICT (cluster_id, node_id, service_type) DO UPDATE SET + desired_state = EXCLUDED.desired_state, + version = EXCLUDED.version, + runtime_mode = EXCLUDED.runtime_mode, + artifact_ref = EXCLUDED.artifact_ref, + config = EXCLUDED.config, + environment = EXCLUDED.environment, + updated_by_user_id = EXCLUDED.updated_by_user_id, + updated_at = EXCLUDED.updated_at + RETURNING cluster_id::text, node_id::text, service_type, desired_state, version, runtime_mode, + artifact_ref, config, environment, updated_by_user_id::text, updated_at + `, input.ClusterID, input.NodeID, input.ServiceType, input.DesiredState, input.Version, input.RuntimeMode, input.ArtifactRef, []byte(input.Config), []byte(input.Environment), input.ActorUserID) + return scanDesiredWorkload(row) +} + +func (s *PostgresStore) ListDesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error) { + rows, err := s.db.Query(ctx, ` + SELECT cluster_id::text, node_id::text, service_type, desired_state, version, runtime_mode, + artifact_ref, config, environment, updated_by_user_id::text, updated_at + FROM node_workload_desired_states + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + ORDER BY service_type + `, clusterID, nodeID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeWorkloadDesiredState + for rows.Next() { + item, err := scanDesiredWorkload(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + 
+func (s *PostgresStore) ReportWorkloadStatus(ctx context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return NodeWorkloadStatus{}, err + } + defer tx.Rollback(ctx) + + id := uuid.NewString() + row := tx.QueryRow(ctx, ` + INSERT INTO node_workload_status_reports ( + id, cluster_id, node_id, service_type, reported_state, runtime_mode, version, status_payload, observed_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8::jsonb, NOW()) + RETURNING id::text, cluster_id::text, node_id::text, service_type, reported_state, + runtime_mode, version, status_payload, observed_at + `, id, input.ClusterID, input.NodeID, input.ServiceType, input.ReportedState, input.RuntimeMode, input.Version, []byte(input.StatusPayload)) + status, err := scanWorkloadStatus(row) + if err != nil { + return NodeWorkloadStatus{}, err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO node_workload_latest_statuses ( + cluster_id, node_id, service_type, status_report_id, reported_state, runtime_mode, version, status_payload, observed_at + ) VALUES ($1::uuid, $2::uuid, $3, $4::uuid, $5, $6, $7, $8::jsonb, $9) + ON CONFLICT (cluster_id, node_id, service_type) DO UPDATE SET + status_report_id = EXCLUDED.status_report_id, + reported_state = EXCLUDED.reported_state, + runtime_mode = EXCLUDED.runtime_mode, + version = EXCLUDED.version, + status_payload = EXCLUDED.status_payload, + observed_at = EXCLUDED.observed_at + `, status.ClusterID, status.NodeID, status.ServiceType, status.ID, status.ReportedState, status.RuntimeMode, status.Version, []byte(status.StatusPayload), status.ObservedAt); err != nil { + return NodeWorkloadStatus{}, err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO node_services (node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at) + VALUES ($1::uuid, $2, FALSE, 'disabled', $3, $4, $5::jsonb, $4) + ON CONFLICT (node_id, service_type) DO UPDATE SET + 
reported_state = EXCLUDED.reported_state, + last_reported_at = EXCLUDED.last_reported_at, + metadata = EXCLUDED.metadata, + updated_at = EXCLUDED.updated_at + `, status.NodeID, status.ServiceType, status.ReportedState, status.ObservedAt, []byte(status.StatusPayload)); err != nil { + return NodeWorkloadStatus{}, err + } + if err := tx.Commit(ctx); err != nil { + return NodeWorkloadStatus{}, err + } + return status, nil +} + +func (s *PostgresStore) ListLatestWorkloadStatuses(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadStatus, error) { + rows, err := s.db.Query(ctx, ` + SELECT COALESCE(status_report_id::text, '00000000-0000-0000-0000-000000000000'), cluster_id::text, node_id::text, + service_type, reported_state, runtime_mode, version, status_payload, observed_at + FROM node_workload_latest_statuses + WHERE cluster_id = $1::uuid + AND node_id = $2::uuid + ORDER BY service_type + `, clusterID, nodeID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeWorkloadStatus + for rows.Next() { + item, err := scanWorkloadStatus(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) ReportMeshLink(ctx context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return MeshLinkObservation{}, err + } + defer tx.Rollback(ctx) + + id := uuid.NewString() + row := tx.QueryRow(ctx, ` + INSERT INTO mesh_link_observations ( + id, cluster_id, source_node_id, target_node_id, link_status, latency_ms, quality_score, metadata, observed_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6, $7, $8::jsonb, NOW()) + RETURNING id::text, cluster_id::text, source_node_id::text, target_node_id::text, link_status, + latency_ms, quality_score, metadata, observed_at + `, id, input.ClusterID, input.SourceNodeID, input.TargetNodeID, input.LinkStatus, input.LatencyMs, input.QualityScore, []byte(input.Metadata)) + 
observation, err := scanMeshLink(row) + if err != nil { + return MeshLinkObservation{}, err + } + observationKey := meshLatestObservationKey(observation.Metadata) + if _, err := tx.Exec(ctx, ` + INSERT INTO mesh_latest_links ( + cluster_id, source_node_id, target_node_id, observation_id, link_status, latency_ms, quality_score, metadata, observed_at, observation_key + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6, $7, $8::jsonb, $9, $10) + ON CONFLICT (cluster_id, source_node_id, target_node_id, observation_key) DO UPDATE SET + observation_id = EXCLUDED.observation_id, + link_status = EXCLUDED.link_status, + latency_ms = EXCLUDED.latency_ms, + quality_score = EXCLUDED.quality_score, + metadata = EXCLUDED.metadata, + observed_at = EXCLUDED.observed_at + `, observation.ClusterID, observation.SourceNodeID, observation.TargetNodeID, observation.ID, observation.LinkStatus, observation.LatencyMs, observation.QualityScore, []byte(observation.Metadata), observation.ObservedAt, observationKey); err != nil { + return MeshLinkObservation{}, err + } + if err := tx.Commit(ctx); err != nil { + return MeshLinkObservation{}, err + } + return observation, nil +} + +func meshLatestObservationKey(metadata json.RawMessage) string { + var values map[string]any + if err := json.Unmarshal(metadata, &values); err != nil { + return "default" + } + observationType := meshMetadataString(values, "observation_type") + if observationType == "" { + observationType = "default" + } + switch observationType { + case "synthetic_route_health": + if routeID := meshMetadataString(values, "route_id"); routeID != "" { + return observationType + ":" + routeID + } + case "peer_connection_manager": + transportMode := meshMetadataString(values, "transport_mode") + relayNodeID := meshMetadataString(values, "relay_node_id") + if transportMode != "" || relayNodeID != "" { + return observationType + ":" + transportMode + ":" + relayNodeID + } + } + return observationType +} + +func 
meshMetadataString(values map[string]any, key string) string { + value, ok := values[key].(string) + if !ok { + return "" + } + return value +} + +func (s *PostgresStore) ListMeshLinks(ctx context.Context, clusterID string) ([]MeshLinkObservation, error) { + rows, err := s.db.Query(ctx, ` + SELECT COALESCE(observation_id::text, '00000000-0000-0000-0000-000000000000'), cluster_id::text, + source_node_id::text, target_node_id::text, link_status, latency_ms, quality_score, metadata, observed_at + FROM mesh_latest_links + WHERE cluster_id = $1::uuid + ORDER BY observed_at DESC + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []MeshLinkObservation + for rows.Next() { + item, err := scanMeshLink(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error) { + id := uuid.NewString() + row := s.db.QueryRow(ctx, ` + INSERT INTO mesh_route_intents ( + id, cluster_id, source_selector, destination_selector, service_class, + priority, status, policy, created_by_user_id, created_at, updated_at + ) VALUES ($1::uuid, $2::uuid, $3::jsonb, $4::jsonb, $5, $6, 'active', $7::jsonb, $8::uuid, NOW(), NOW()) + RETURNING id::text, cluster_id::text, source_selector, destination_selector, service_class, + priority, status, policy, created_by_user_id::text, created_at, updated_at + `, id, input.ClusterID, []byte(input.SourceSelector), []byte(input.DestinationSelector), input.ServiceClass, input.Priority, []byte(input.Policy), input.ActorUserID) + return scanRouteIntent(row) +} + +func (s *PostgresStore) ListRouteIntents(ctx context.Context, clusterID string) ([]MeshRouteIntent, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, source_selector, destination_selector, service_class, + priority, status, policy, created_by_user_id::text, created_at, updated_at + FROM 
mesh_route_intents + WHERE cluster_id = $1::uuid + ORDER BY priority ASC, created_at DESC + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []MeshRouteIntent + for rows.Next() { + item, err := scanRouteIntent(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) ListQoSPolicies(ctx context.Context, clusterID string) ([]MeshQoSPolicy, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, service_class, priority, reliability_mode, + drop_policy, bandwidth_policy, metadata, created_at, updated_at + FROM mesh_qos_policies + WHERE cluster_id = $1::uuid + ORDER BY priority ASC + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []MeshQoSPolicy + for rows.Next() { + item, err := scanQoSPolicy(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) ListFabricEntryPoints(ctx context.Context, clusterID string) ([]FabricEntryPoint, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, name, status, endpoint_type, public_endpoint, + policy, metadata, created_by_user_id::text, created_at, updated_at + FROM fabric_entry_points + WHERE cluster_id = $1::uuid + ORDER BY name + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []FabricEntryPoint + for rows.Next() { + item, err := scanFabricEntryPoint(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + if out == nil { + out = []FabricEntryPoint{} + } + return out, rows.Err() +} + +func (s *PostgresStore) CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error) { + id := uuid.NewString() + row := s.db.QueryRow(ctx, ` + INSERT INTO fabric_entry_points ( + id, cluster_id, name, status, endpoint_type, public_endpoint, + policy, metadata, created_by_user_id, 
created_at, updated_at + ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7::jsonb, $8::jsonb, $9::uuid, NOW(), NOW()) + RETURNING id::text, cluster_id::text, name, status, endpoint_type, public_endpoint, + policy, metadata, created_by_user_id::text, created_at, updated_at + `, id, input.ClusterID, input.Name, input.Status, input.EndpointType, input.PublicEndpoint, []byte(input.Policy), []byte(input.Metadata), input.ActorUserID) + return scanFabricEntryPoint(row) +} + +func (s *PostgresStore) SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error) { + row := s.db.QueryRow(ctx, ` + WITH endpoint_ok AS ( + SELECT id + FROM fabric_entry_points + WHERE id = $2::uuid + AND cluster_id = $1::uuid + ), membership_ok AS ( + SELECT node_id + FROM cluster_memberships + WHERE cluster_id = $1::uuid + AND node_id = $3::uuid + AND membership_status = 'active' + ) + INSERT INTO fabric_entry_point_nodes ( + entry_point_id, cluster_id, node_id, status, priority, metadata, added_by_user_id, added_at + ) + SELECT endpoint_ok.id, $1::uuid, membership_ok.node_id, $4, $5, $6::jsonb, $7::uuid, NOW() + FROM endpoint_ok + CROSS JOIN membership_ok + ON CONFLICT (entry_point_id, node_id) DO UPDATE SET + status = EXCLUDED.status, + priority = EXCLUDED.priority, + metadata = EXCLUDED.metadata + RETURNING entry_point_id::text, cluster_id::text, node_id::text, status, priority, + metadata, added_by_user_id::text, added_at + `, input.ClusterID, input.EntryPointID, input.NodeID, input.Status, input.Priority, []byte(input.Metadata), input.ActorUserID) + return scanFabricEntryPointNode(row) +} + +func (s *PostgresStore) ListFabricEntryPointNodes(ctx context.Context, clusterID, entryPointID string) ([]FabricEntryPointNode, error) { + rows, err := s.db.Query(ctx, ` + SELECT entry_point_id::text, cluster_id::text, node_id::text, status, priority, + metadata, added_by_user_id::text, added_at + FROM fabric_entry_point_nodes + WHERE cluster_id = $1::uuid 
+ AND entry_point_id = $2::uuid + ORDER BY priority, added_at + `, clusterID, entryPointID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []FabricEntryPointNode + for rows.Next() { + item, err := scanFabricEntryPointNode(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + if out == nil { + out = []FabricEntryPointNode{} + } + return out, rows.Err() +} + +func (s *PostgresStore) ListFabricEgressPools(ctx context.Context, clusterID string) ([]FabricEgressPool, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, name, status, description, route_scope, + policy, metadata, created_by_user_id::text, created_at, updated_at + FROM fabric_egress_pools + WHERE cluster_id = $1::uuid + ORDER BY name + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []FabricEgressPool + for rows.Next() { + item, err := scanFabricEgressPool(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + if out == nil { + out = []FabricEgressPool{} + } + return out, rows.Err() +} + +func (s *PostgresStore) CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error) { + id := uuid.NewString() + row := s.db.QueryRow(ctx, ` + INSERT INTO fabric_egress_pools ( + id, cluster_id, name, status, description, route_scope, + policy, metadata, created_by_user_id, created_at, updated_at + ) VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6::jsonb, $7::jsonb, $8::jsonb, $9::uuid, NOW(), NOW()) + RETURNING id::text, cluster_id::text, name, status, description, route_scope, + policy, metadata, created_by_user_id::text, created_at, updated_at + `, id, input.ClusterID, input.Name, input.Status, input.Description, []byte(input.RouteScope), []byte(input.Policy), []byte(input.Metadata), input.ActorUserID) + return scanFabricEgressPool(row) +} + +func (s *PostgresStore) SetFabricEgressPoolNode(ctx context.Context, input SetFabricEgressPoolNodeInput) 
(FabricEgressPoolNode, error) { + row := s.db.QueryRow(ctx, ` + WITH pool_ok AS ( + SELECT id + FROM fabric_egress_pools + WHERE id = $2::uuid + AND cluster_id = $1::uuid + ), membership_ok AS ( + SELECT node_id + FROM cluster_memberships + WHERE cluster_id = $1::uuid + AND node_id = $3::uuid + AND membership_status = 'active' + ) + INSERT INTO fabric_egress_pool_nodes ( + egress_pool_id, cluster_id, node_id, status, priority, metadata, added_by_user_id, added_at + ) + SELECT pool_ok.id, $1::uuid, membership_ok.node_id, $4, $5, $6::jsonb, $7::uuid, NOW() + FROM pool_ok + CROSS JOIN membership_ok + ON CONFLICT (egress_pool_id, node_id) DO UPDATE SET + status = EXCLUDED.status, + priority = EXCLUDED.priority, + metadata = EXCLUDED.metadata + RETURNING egress_pool_id::text, cluster_id::text, node_id::text, status, priority, + metadata, added_by_user_id::text, added_at + `, input.ClusterID, input.EgressPoolID, input.NodeID, input.Status, input.Priority, []byte(input.Metadata), input.ActorUserID) + return scanFabricEgressPoolNode(row) +} + +func (s *PostgresStore) ListFabricEgressPoolNodes(ctx context.Context, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error) { + rows, err := s.db.Query(ctx, ` + SELECT egress_pool_id::text, cluster_id::text, node_id::text, status, priority, + metadata, added_by_user_id::text, added_at + FROM fabric_egress_pool_nodes + WHERE cluster_id = $1::uuid + AND egress_pool_id = $2::uuid + ORDER BY priority, added_at + `, clusterID, egressPoolID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []FabricEgressPoolNode + for rows.Next() { + item, err := scanFabricEgressPoolNode(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + if out == nil { + out = []FabricEgressPoolNode{} + } + return out, rows.Err() +} + +func (s *PostgresStore) GetClusterAuthorityState(ctx context.Context, clusterID string) (ClusterAuthorityState, error) { + row := s.db.QueryRow(ctx, ` + SELECT cluster_id::text, 
authority_state, mutation_mode, term, notes, updated_by_user_id::text, updated_at + FROM cluster_authority_states + WHERE cluster_id = $1::uuid + `, clusterID) + return scanAuthorityState(row) +} + +func (s *PostgresStore) UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error) { + row := s.db.QueryRow(ctx, ` + INSERT INTO cluster_authority_states ( + cluster_id, authority_state, mutation_mode, term, notes, updated_by_user_id, updated_at + ) VALUES ($1::uuid, $2, $3, 1, $4, $5::uuid, NOW()) + ON CONFLICT (cluster_id) DO UPDATE SET + authority_state = EXCLUDED.authority_state, + mutation_mode = EXCLUDED.mutation_mode, + term = cluster_authority_states.term + 1, + notes = EXCLUDED.notes, + updated_by_user_id = EXCLUDED.updated_by_user_id, + updated_at = EXCLUDED.updated_at + RETURNING cluster_id::text, authority_state, mutation_mode, term, notes, updated_by_user_id::text, updated_at + `, input.ClusterID, input.AuthorityState, input.MutationMode, input.Notes, input.ActorUserID) + return scanAuthorityState(row) +} + +func (s *PostgresStore) ListClusterAdminSummaries(ctx context.Context) ([]ClusterAdminSummary, error) { + rows, err := s.db.Query(ctx, ` + SELECT cluster_id::text, slug, name, status, region, authority_state, mutation_mode, + cluster_key_algorithm, cluster_key_fingerprint, + node_count, healthy_node_count, pending_join_count, active_role_assignment_count, last_node_seen_at + FROM cluster_admin_summaries + ORDER BY name + `) + if err != nil { + return nil, err + } + defer rows.Close() + var out []ClusterAdminSummary + for rows.Next() { + var item ClusterAdminSummary + if err := rows.Scan( + &item.ClusterID, + &item.Slug, + &item.Name, + &item.Status, + &item.Region, + &item.AuthorityState, + &item.MutationMode, + &item.ClusterKeyAlgorithm, + &item.ClusterKeyFingerprint, + &item.NodeCount, + &item.HealthyNodeCount, + &item.PendingJoinCount, + &item.ActiveRoleAssignmentCount, + &item.LastNodeSeenAt, + 
); err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error) { + id := uuid.NewString() + status := VPNConnectionStatusDisabled + if input.DesiredState == VPNConnectionDesiredEnabled { + status = VPNConnectionStatusEnabled + } + row := s.db.QueryRow(ctx, ` + INSERT INTO vpn_connections ( + id, cluster_id, organization_id, name, target_endpoint, protocol_family, + credential_ref, mode, desired_state, allowed_node_policy, routing_usage, + route_policy, qos_policy, placement_policy, status, metadata, + created_by_user_id, updated_by_user_id, created_at, updated_at + ) VALUES ( + $1::uuid, $2::uuid, $3::uuid, $4, $5::jsonb, $6, + $7, $8, $9, $10::jsonb, $11::jsonb, + $12::jsonb, $13::jsonb, $14::jsonb, $15, $16::jsonb, + $17::uuid, $17::uuid, NOW(), NOW() + ) + RETURNING id::text, cluster_id::text, organization_id::text, name, target_endpoint, + protocol_family, credential_ref, mode, desired_state, allowed_node_policy, + routing_usage, route_policy, qos_policy, placement_policy, status, metadata, + created_by_user_id::text, updated_by_user_id::text, created_at, updated_at + `, id, input.ClusterID, input.OrganizationID, input.Name, []byte(input.TargetEndpoint), input.ProtocolFamily, + input.CredentialRef, input.Mode, input.DesiredState, []byte(input.AllowedNodePolicy), []byte(input.RoutingUsage), + []byte(input.RoutePolicy), []byte(input.QoSPolicy), []byte(input.PlacementPolicy), status, []byte(input.Metadata), + input.ActorUserID) + return scanVPNConnection(row) +} + +func (s *PostgresStore) ListVPNConnections(ctx context.Context, clusterID string) ([]VPNConnection, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, organization_id::text, name, target_endpoint, + protocol_family, credential_ref, mode, desired_state, allowed_node_policy, + routing_usage, route_policy, qos_policy, 
placement_policy, status, metadata, + created_by_user_id::text, updated_by_user_id::text, created_at, updated_at + FROM vpn_connections + WHERE cluster_id = $1::uuid + ORDER BY created_at DESC + `, clusterID) + if err != nil { + return nil, err + } + defer rows.Close() + return scanVPNConnections(rows) +} + +func (s *PostgresStore) GetVPNConnection(ctx context.Context, clusterID, vpnConnectionID string) (VPNConnection, error) { + row := s.db.QueryRow(ctx, ` + SELECT id::text, cluster_id::text, organization_id::text, name, target_endpoint, + protocol_family, credential_ref, mode, desired_state, allowed_node_policy, + routing_usage, route_policy, qos_policy, placement_policy, status, metadata, + created_by_user_id::text, updated_by_user_id::text, created_at, updated_at + FROM vpn_connections + WHERE cluster_id = $1::uuid + AND id = $2::uuid + `, clusterID, vpnConnectionID) + return scanVPNConnection(row) +} + +func (s *PostgresStore) UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error) { + status := VPNConnectionStatusDisabled + if input.DesiredState == VPNConnectionDesiredEnabled { + status = VPNConnectionStatusEnabled + } + row := s.db.QueryRow(ctx, ` + UPDATE vpn_connections + SET desired_state = $3, + status = $4, + updated_by_user_id = $5::uuid, + updated_at = NOW() + WHERE cluster_id = $1::uuid + AND id = $2::uuid + RETURNING id::text, cluster_id::text, organization_id::text, name, target_endpoint, + protocol_family, credential_ref, mode, desired_state, allowed_node_policy, + routing_usage, route_policy, qos_policy, placement_policy, status, metadata, + created_by_user_id::text, updated_by_user_id::text, created_at, updated_at + `, input.ClusterID, input.VPNConnectionID, input.DesiredState, status, input.ActorUserID) + return scanVPNConnection(row) +} + +func (s *PostgresStore) UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) 
(VPNConnectionRoutePolicy, error) { + id := uuid.NewString() + row := s.db.QueryRow(ctx, ` + INSERT INTO vpn_connection_route_policies ( + id, vpn_connection_id, cluster_id, organization_id, route_type, destination, + action, service_type, priority, policy, status, created_by_user_id, created_at, updated_at + ) + SELECT $1::uuid, vc.id, vc.cluster_id, vc.organization_id, $4, $5, + $6, $7, $8, $9::jsonb, $10, $11::uuid, NOW(), NOW() + FROM vpn_connections vc + WHERE vc.cluster_id = $2::uuid + AND vc.id = $3::uuid + RETURNING id::text, vpn_connection_id::text, cluster_id::text, organization_id::text, + route_type, destination, action, service_type, priority, policy, status, + created_by_user_id::text, created_at, updated_at + `, id, input.ClusterID, input.VPNConnectionID, input.RouteType, input.Destination, input.Action, + input.ServiceType, input.Priority, []byte(input.Policy), input.Status, input.ActorUserID) + return scanVPNRoutePolicy(row) +} + +func (s *PostgresStore) ListVPNConnectionRoutePolicies(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error) { + rows, err := s.db.Query(ctx, ` + SELECT id::text, vpn_connection_id::text, cluster_id::text, organization_id::text, + route_type, destination, action, service_type, priority, policy, status, + created_by_user_id::text, created_at, updated_at + FROM vpn_connection_route_policies + WHERE cluster_id = $1::uuid + AND vpn_connection_id = $2::uuid + ORDER BY priority ASC, created_at DESC + `, clusterID, vpnConnectionID) + if err != nil { + return nil, err + } + defer rows.Close() + return scanVPNRoutePolicies(rows) +} + +func (s *PostgresStore) SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return nil, err + } + defer tx.Rollback(ctx) + + var exists string + if err := tx.QueryRow(ctx, ` + SELECT id::text + FROM vpn_connections + WHERE cluster_id = 
$1::uuid + AND id = $2::uuid + FOR UPDATE + `, input.ClusterID, input.VPNConnectionID).Scan(&exists); err != nil { + return nil, err + } + if _, err := tx.Exec(ctx, ` + DELETE FROM vpn_connection_allowed_nodes + WHERE cluster_id = $1::uuid + AND vpn_connection_id = $2::uuid + `, input.ClusterID, input.VPNConnectionID); err != nil { + return nil, err + } + for _, nodeID := range input.NodeIDs { + if _, err := tx.Exec(ctx, ` + INSERT INTO vpn_connection_allowed_nodes ( + vpn_connection_id, cluster_id, node_id, role_preference, status, + metadata, created_by_user_id, created_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, 'active', $5::jsonb, $6::uuid, NOW()) + `, input.VPNConnectionID, input.ClusterID, nodeID, input.RolePreference, []byte(input.Metadata), input.ActorUserID); err != nil { + return nil, err + } + } + items, err := listVPNConnectionAllowedNodes(ctx, tx, input.ClusterID, input.VPNConnectionID) + if err != nil { + return nil, err + } + if err := tx.Commit(ctx); err != nil { + return nil, err + } + return items, nil +} + +func (s *PostgresStore) ListVPNConnectionAllowedNodes(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error) { + return listVPNConnectionAllowedNodes(ctx, s.db, clusterID, vpnConnectionID) +} + +func (s *PostgresStore) AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput, expiresAt time.Time, fencingToken string) (VPNConnectionLease, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return VPNConnectionLease{}, err + } + defer tx.Rollback(ctx) + + if _, err := tx.Exec(ctx, ` + UPDATE vpn_connection_leases + SET status = 'expired' + WHERE vpn_connection_id = $1::uuid + AND cluster_id = $2::uuid + AND status = 'active' + AND expires_at <= NOW() + `, input.VPNConnectionID, input.ClusterID); err != nil { + return VPNConnectionLease{}, err + } + + existingRow := tx.QueryRow(ctx, ` + SELECT id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + 
lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + FROM vpn_connection_leases + WHERE vpn_connection_id = $1::uuid + AND cluster_id = $2::uuid + AND status = 'active' + AND expires_at > NOW() + FOR UPDATE + `, input.VPNConnectionID, input.ClusterID) + existing, err := scanVPNLease(existingRow) + if err == nil { + if existing.OwnerNodeID == input.OwnerNodeID { + if err := tx.Commit(ctx); err != nil { + return VPNConnectionLease{}, err + } + return existing, nil + } + return VPNConnectionLease{}, ErrVPNLeaseAlreadyActive + } + if !errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, err + } + + id := uuid.NewString() + row := tx.QueryRow(ctx, ` + WITH next_generation AS ( + SELECT COALESCE(MAX(lease_generation), 0) + 1 AS value + FROM vpn_connection_leases + WHERE vpn_connection_id = $2::uuid + ) + INSERT INTO vpn_connection_leases ( + id, vpn_connection_id, cluster_id, owner_node_id, lease_generation, + fencing_token, status, acquired_at, renewed_at, expires_at, metadata + ) + SELECT $1::uuid, vc.id, vc.cluster_id, $4::uuid, next_generation.value, + $5, 'active', NOW(), NOW(), $6, $7::jsonb + FROM vpn_connections vc, next_generation + WHERE vc.cluster_id = $3::uuid + AND vc.id = $2::uuid + RETURNING id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + `, id, input.VPNConnectionID, input.ClusterID, input.OwnerNodeID, fencingToken, expiresAt, []byte(input.Metadata)) + item, err := scanVPNLease(row) + if err != nil { + if isUniqueViolation(err) { + return VPNConnectionLease{}, ErrVPNLeaseAlreadyActive + } + return VPNConnectionLease{}, err + } + if err := tx.Commit(ctx); err != nil { + return VPNConnectionLease{}, err + } + return item, nil +} + +func (s *PostgresStore) RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput, expiresAt 
time.Time) (VPNConnectionLease, error) { + row := s.db.QueryRow(ctx, ` + UPDATE vpn_connection_leases + SET renewed_at = NOW(), + expires_at = $6 + WHERE id = $1::uuid + AND vpn_connection_id = $2::uuid + AND cluster_id = $3::uuid + AND owner_node_id = $4::uuid + AND fencing_token = $5 + AND status = 'active' + AND expires_at > NOW() + RETURNING id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + `, input.LeaseID, input.VPNConnectionID, input.ClusterID, input.OwnerNodeID, input.FencingToken, expiresAt) + return scanVPNLease(row) +} + +func (s *PostgresStore) ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return VPNConnectionLease{}, err + } + defer tx.Rollback(ctx) + + row := tx.QueryRow(ctx, ` + UPDATE vpn_connection_leases + SET status = 'released', + released_at = NOW() + WHERE id = $1::uuid + AND vpn_connection_id = $2::uuid + AND cluster_id = $3::uuid + AND owner_node_id = $4::uuid + AND fencing_token = $5 + AND status = 'active' + RETURNING id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + `, input.LeaseID, input.VPNConnectionID, input.ClusterID, input.OwnerNodeID, input.FencingToken) + item, err := scanVPNLease(row) + if err == nil { + if err := tx.Commit(ctx); err != nil { + return VPNConnectionLease{}, err + } + return item, nil + } + if !errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, err + } + row = tx.QueryRow(ctx, ` + SELECT id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + FROM vpn_connection_leases + WHERE 
id = $1::uuid + AND vpn_connection_id = $2::uuid + AND cluster_id = $3::uuid + AND owner_node_id = $4::uuid + AND fencing_token = $5 + AND status = 'released' + `, input.LeaseID, input.VPNConnectionID, input.ClusterID, input.OwnerNodeID, input.FencingToken) + item, err = scanVPNLease(row) + if err != nil { + return VPNConnectionLease{}, err + } + if err := tx.Commit(ctx); err != nil { + return VPNConnectionLease{}, err + } + return item, nil +} + +func (s *PostgresStore) FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return VPNConnectionLease{}, err + } + defer tx.Rollback(ctx) + + row := tx.QueryRow(ctx, ` + UPDATE vpn_connection_leases + SET status = 'fenced', + fenced_at = NOW(), + metadata = metadata || $4::jsonb + WHERE id = $1::uuid + AND vpn_connection_id = $2::uuid + AND cluster_id = $3::uuid + AND status = 'active' + RETURNING id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + `, input.LeaseID, input.VPNConnectionID, input.ClusterID, []byte(fmt.Sprintf(`{"fence_reason":%q}`, input.Reason))) + item, err := scanVPNLease(row) + if err == nil { + if err := tx.Commit(ctx); err != nil { + return VPNConnectionLease{}, err + } + return item, nil + } + if !errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, err + } + row = tx.QueryRow(ctx, ` + SELECT id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + FROM vpn_connection_leases + WHERE id = $1::uuid + AND vpn_connection_id = $2::uuid + AND cluster_id = $3::uuid + AND status = 'fenced' + `, input.LeaseID, input.VPNConnectionID, input.ClusterID) + item, err = scanVPNLease(row) + if err != nil { + return 
VPNConnectionLease{}, err + } + if err := tx.Commit(ctx); err != nil { + return VPNConnectionLease{}, err + } + return item, nil +} + +func (s *PostgresStore) GetActiveVPNConnectionLease(ctx context.Context, clusterID, vpnConnectionID string) (VPNConnectionLease, error) { + row := s.db.QueryRow(ctx, ` + SELECT id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + FROM vpn_connection_leases + WHERE cluster_id = $1::uuid + AND vpn_connection_id = $2::uuid + AND status = 'active' + AND expires_at > NOW() + `, clusterID, vpnConnectionID) + return scanVPNLease(row) +} + +func (s *PostgresStore) CheckVPNLeaseOwnerEligibility(ctx context.Context, clusterID, vpnConnectionID, ownerNodeID string) (VPNLeaseOwnerEligibility, error) { + row := s.db.QueryRow(ctx, ` + SELECT vc.id::text, + vc.cluster_id::text, + vc.organization_id::text, + $3::text AS owner_node_id, + COALESCE(cm.membership_status, '') AS membership_status, + COALESCE(n.registration_status, '') AS node_registration_status, + ( + COALESCE(vc.allowed_node_policy->>'mode', 'explicit') = 'any_capable' + OR EXISTS ( + SELECT 1 + FROM vpn_connection_allowed_nodes van + WHERE van.vpn_connection_id = vc.id + AND van.cluster_id = vc.cluster_id + AND van.node_id = $3::uuid + AND van.status = 'active' + ) + OR ( + COALESCE(vc.allowed_node_policy->>'mode', 'explicit') = 'explicit' + AND COALESCE(vc.allowed_node_policy->'node_ids', '[]'::jsonb) ? 
$3::text + ) + ) AS allowed_by_policy, + EXISTS ( + SELECT 1 + FROM node_role_assignments nra + WHERE nra.cluster_id = vc.cluster_id + AND nra.node_id = $3::uuid + AND nra.status = 'active' + AND nra.role IN ('vpn-exit', 'vpn-connector') + AND (nra.organization_id IS NULL OR nra.organization_id = vc.organization_id) + ) AS has_authorized_role + FROM vpn_connections vc + LEFT JOIN cluster_memberships cm ON cm.cluster_id = vc.cluster_id AND cm.node_id = $3::uuid + LEFT JOIN nodes n ON n.id = $3::uuid + WHERE vc.cluster_id = $1::uuid + AND vc.id = $2::uuid + `, clusterID, vpnConnectionID, ownerNodeID) + var item VPNLeaseOwnerEligibility + if err := row.Scan( + &item.VPNConnectionID, + &item.ClusterID, + &item.OrganizationID, + &item.OwnerNodeID, + &item.MembershipStatus, + &item.NodeRegistrationStatus, + &item.AllowedByPolicy, + &item.HasAuthorizedRole, + ); err != nil { + return VPNLeaseOwnerEligibility{}, err + } + return item, nil +} + +func (s *PostgresStore) ExpireStaleVPNConnectionLeases(ctx context.Context, clusterID string, now time.Time) ([]VPNConnectionLease, error) { + rows, err := s.db.Query(ctx, ` + UPDATE vpn_connection_leases + SET status = 'expired' + WHERE cluster_id = $1::uuid + AND status = 'active' + AND expires_at <= $2 + RETURNING id::text, vpn_connection_id::text, cluster_id::text, owner_node_id::text, + lease_generation, fencing_token, status, acquired_at, renewed_at, expires_at, + released_at, fenced_at, metadata + `, clusterID, now) + if err != nil { + return nil, err + } + defer rows.Close() + var out []VPNConnectionLease + for rows.Next() { + item, err := scanVPNLease(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) ListNodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) { + rows, err := s.db.Query(ctx, ` + WITH active_node AS ( + SELECT 1 + FROM cluster_memberships cm + JOIN nodes n ON n.id = cm.node_id + WHERE 
cm.cluster_id = $1::uuid + AND cm.node_id = $2::uuid + AND cm.membership_status = 'active' + AND n.registration_status = 'active' + ), + visible AS ( + SELECT vc.*, + ( + COALESCE(vc.allowed_node_policy->>'mode', 'explicit') = 'any_capable' + OR EXISTS ( + SELECT 1 + FROM vpn_connection_allowed_nodes van + WHERE van.vpn_connection_id = vc.id + AND van.cluster_id = vc.cluster_id + AND van.node_id = $2::uuid + AND van.status = 'active' + ) + OR ( + COALESCE(vc.allowed_node_policy->>'mode', 'explicit') = 'explicit' + AND COALESCE(vc.allowed_node_policy->'node_ids', '[]'::jsonb) ? $2::text + ) + ) AS allowed_by_policy, + EXISTS ( + SELECT 1 + FROM node_role_assignments nra + WHERE nra.cluster_id = vc.cluster_id + AND nra.node_id = $2::uuid + AND nra.status = 'active' + AND nra.role IN ('vpn-exit', 'vpn-connector') + AND (nra.organization_id IS NULL OR nra.organization_id = vc.organization_id) + ) AS has_authorized_role, + EXISTS ( + SELECT 1 + FROM vpn_connection_leases active_owner + WHERE active_owner.cluster_id = vc.cluster_id + AND active_owner.vpn_connection_id = vc.id + AND active_owner.owner_node_id = $2::uuid + AND active_owner.status = 'active' + AND active_owner.expires_at > NOW() + ) AS is_active_owner + FROM vpn_connections vc + WHERE vc.cluster_id = $1::uuid + AND vc.desired_state = 'enabled' + ) + SELECT v.id::text, + v.cluster_id::text, + v.organization_id::text, + v.name, + v.target_endpoint, + v.protocol_family, + v.mode, + v.desired_state, + v.routing_usage, + v.route_policy, + v.qos_policy, + v.placement_policy, + v.status, + (v.credential_ref IS NOT NULL) AS has_credential_ref, + CASE WHEN v.is_active_owner THEN 'active_owner' ELSE 'eligible_candidate' END AS assignment_reason, + CASE WHEN l.id IS NULL THEN NULL ELSE jsonb_build_object( + 'lease_id', l.id::text, + 'owner_node_id', l.owner_node_id::text, + 'lease_generation', l.lease_generation, + 'status', l.status, + 'renewed_at', l.renewed_at, + 'expires_at', l.expires_at + ) END AS active_lease, 
+ v.updated_at + FROM visible v + JOIN active_node an ON TRUE + LEFT JOIN vpn_connection_leases l + ON l.cluster_id = v.cluster_id + AND l.vpn_connection_id = v.id + AND l.status = 'active' + AND l.expires_at > NOW() + WHERE (v.allowed_by_policy AND v.has_authorized_role) + OR v.is_active_owner + ORDER BY v.name ASC, v.id ASC + `, clusterID, nodeID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeVPNAssignment + for rows.Next() { + item, err := scanNodeVPNAssignment(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (s *PostgresStore) ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error) { + tx, err := s.db.Begin(ctx) + if err != nil { + return NodeVPNAssignmentStatus{}, err + } + defer func() { _ = tx.Rollback(ctx) }() + + id := uuid.NewString() + row := tx.QueryRow(ctx, ` + INSERT INTO vpn_connection_assignment_status_reports ( + id, vpn_connection_id, cluster_id, node_id, observed_status, status_payload, observed_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6::jsonb, $7) + RETURNING id::text, vpn_connection_id::text, cluster_id::text, node_id::text, + observed_status, status_payload, observed_at + `, id, input.VPNConnectionID, input.ClusterID, input.NodeID, input.ObservedStatus, []byte(input.StatusPayload), input.ObservedAt) + item, err := scanNodeVPNAssignmentStatus(row) + if err != nil { + return NodeVPNAssignmentStatus{}, err + } + + if _, err := tx.Exec(ctx, ` + INSERT INTO vpn_connection_assignment_latest_statuses ( + vpn_connection_id, cluster_id, node_id, report_id, observed_status, status_payload, observed_at, updated_at + ) VALUES ($1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6::jsonb, $7, NOW()) + ON CONFLICT (vpn_connection_id, node_id) DO UPDATE SET + report_id = EXCLUDED.report_id, + observed_status = EXCLUDED.observed_status, + status_payload = EXCLUDED.status_payload, + 
observed_at = EXCLUDED.observed_at, + updated_at = NOW() + `, input.VPNConnectionID, input.ClusterID, input.NodeID, item.ID, input.ObservedStatus, []byte(input.StatusPayload), input.ObservedAt); err != nil { + return NodeVPNAssignmentStatus{}, err + } + if err := tx.Commit(ctx); err != nil { + return NodeVPNAssignmentStatus{}, err + } + return item, nil +} + +func (s *PostgresStore) RecordAudit(ctx context.Context, event ClusterAuditEvent) error { + if len(event.Payload) == 0 { + event.Payload = json.RawMessage(`{}`) + } + _, err := s.db.Exec(ctx, ` + INSERT INTO cluster_audit_events (id, cluster_id, actor_user_id, event_type, target_type, target_id, payload, created_at) + VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7::jsonb, COALESCE($8, NOW())) + `, uuid.NewString(), event.ClusterID, event.ActorUserID, event.EventType, event.TargetType, event.TargetID, []byte(event.Payload), event.CreatedAt) + return err +} + +func (s *PostgresStore) ListAuditEvents(ctx context.Context, clusterID string, limit int) ([]ClusterAuditEvent, error) { + if limit <= 0 || limit > 200 { + limit = 100 + } + rows, err := s.db.Query(ctx, ` + SELECT id::text, cluster_id::text, actor_user_id::text, event_type, target_type, target_id, payload, created_at + FROM cluster_audit_events + WHERE cluster_id = $1::uuid + ORDER BY created_at DESC + LIMIT $2 + `, clusterID, limit) + if err != nil { + return nil, err + } + defer rows.Close() + var out []ClusterAuditEvent + for rows.Next() { + item, err := scanAuditEvent(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +type scanner interface { + Scan(dest ...any) error +} + +type rowQuerier interface { + Query(ctx context.Context, sql string, args ...any) (pgx.Rows, error) +} + +func scanCluster(row scanner) (Cluster, error) { + var item Cluster + if err := row.Scan(&item.ID, &item.Slug, &item.Name, &item.Status, &item.Region, &item.Metadata, &item.CreatedAt, &item.UpdatedAt); err != nil { + 
return Cluster{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanClusterAuthority(row scanner) (ClusterAuthorityKey, error) { + var item ClusterAuthorityKey + if err := row.Scan( + &item.ClusterID, + &item.AuthorityState, + &item.KeyAlgorithm, + &item.PublicKey, + &item.PublicKeyFingerprint, + &item.PrivateKey, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return ClusterAuthorityKey{}, err + } + item.SchemaVersion = clusterauth.AuthoritySchemaVersion + return item, nil +} + +func scanNodeGroup(row scanner) (ClusterNodeGroup, error) { + var item ClusterNodeGroup + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.ParentGroupID, + &item.Name, + &item.Description, + &item.SortOrder, + &item.Metadata, + &item.CreatedByUserID, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return ClusterNodeGroup{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanClusterNode(row scanner) (ClusterNode, error) { + var item ClusterNode + if err := row.Scan( + &item.ID, + &item.OwnerOrganizationID, + &item.NodeKey, + &item.Name, + &item.OwnershipType, + &item.RegistrationStatus, + &item.HealthStatus, + &item.VersionState, + &item.PartitionState, + &item.ReportedVersion, + &item.LastSeenAt, + &item.MembershipStatus, + &item.MembershipMetadata, + &item.NodeGroupID, + &item.NodeGroupName, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return ClusterNode{}, err + } + ensureRaw(&item.MembershipMetadata, `{}`) + return item, nil +} + +func scanJoinToken(row scanner) (NodeJoinToken, error) { + var item NodeJoinToken + var signatureRaw json.RawMessage + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.Scope, + &item.ExpiresAt, + &item.MaxUses, + &item.UsedCount, + &item.Status, + &item.CreatedByUserID, + &item.CreatedAt, + &item.RevokedAt, + &item.AuthorityPayload, + &signatureRaw, + ); err != nil { + return NodeJoinToken{}, err + } + ensureRaw(&item.Scope, `{}`) + 
ensureRaw(&item.AuthorityPayload, `{}`) + if len(signatureRaw) > 0 && string(signatureRaw) != "{}" { + var signature ClusterSignature + if err := json.Unmarshal(signatureRaw, &signature); err != nil { + return NodeJoinToken{}, err + } + item.AuthoritySignature = &signature + } + return item, nil +} + +func scanJoinRequest(row scanner) (NodeJoinRequest, error) { + var item NodeJoinRequest + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.JoinTokenID, + &item.NodeName, + &item.NodeFingerprint, + &item.PublicKey, + &item.ReportedCapabilities, + &item.ReportedFacts, + &item.RequestedRoles, + &item.Status, + &item.ReviewedByUserID, + &item.ReviewedAt, + &item.ApprovedNodeID, + &item.RejectionReason, + &item.CreatedAt, + &item.UpdatedAt, + &item.ApprovalPayload, + &item.ApprovalSignature, + ); err != nil { + return NodeJoinRequest{}, err + } + ensureRaw(&item.ReportedCapabilities, `{}`) + ensureRaw(&item.ReportedFacts, `{}`) + ensureRaw(&item.RequestedRoles, `[]`) + ensureRaw(&item.ApprovalPayload, `{}`) + ensureRaw(&item.ApprovalSignature, `{}`) + return item, nil +} + +func scanRoleAssignment(row scanner) (NodeRoleAssignment, error) { + var item NodeRoleAssignment + if err := row.Scan(&item.ID, &item.ClusterID, &item.NodeID, &item.OrganizationID, &item.Role, &item.Status, &item.Policy, &item.AssignedByUserID, &item.AssignedAt, &item.RevokedAt); err != nil { + return NodeRoleAssignment{}, err + } + ensureRaw(&item.Policy, `{}`) + return item, nil +} + +func scanHeartbeat(row scanner) (NodeHeartbeat, error) { + var item NodeHeartbeat + if err := row.Scan(&item.ID, &item.ClusterID, &item.NodeID, &item.HealthStatus, &item.ReportedVersion, &item.Capabilities, &item.ServiceStates, &item.Metadata, &item.ObservedAt); err != nil { + return NodeHeartbeat{}, err + } + ensureRaw(&item.Capabilities, `{}`) + ensureRaw(&item.ServiceStates, `{}`) + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanAuditEvent(row scanner) (ClusterAuditEvent, error) { + var 
item ClusterAuditEvent + if err := row.Scan(&item.ID, &item.ClusterID, &item.ActorUserID, &item.EventType, &item.TargetType, &item.TargetID, &item.Payload, &item.CreatedAt); err != nil { + return ClusterAuditEvent{}, err + } + ensureRaw(&item.Payload, `{}`) + return item, nil +} + +func scanFabricTestingFlag(row scanner) (FabricTestingFlag, error) { + var item FabricTestingFlag + if err := row.Scan( + &item.ID, + &item.ScopeType, + &item.ScopeID, + &item.ClusterID, + &item.Enabled, + &item.TelemetryEnabled, + &item.SyntheticLinksEnabled, + &item.HistoryRetentionHours, + &item.Metadata, + &item.UpdatedByUserID, + &item.UpdatedAt, + ); err != nil { + return FabricTestingFlag{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanNodeTelemetry(row scanner) (NodeTelemetryObservation, error) { + var item NodeTelemetryObservation + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.NodeID, + &item.CPUPercent, + &item.MemoryUsedBytes, + &item.MemoryTotalBytes, + &item.DiskUsedBytes, + &item.DiskTotalBytes, + &item.NetworkRxBytes, + &item.NetworkTxBytes, + &item.ProcessCount, + &item.Payload, + &item.ObservedAt, + ); err != nil { + return NodeTelemetryObservation{}, err + } + ensureRaw(&item.Payload, `{}`) + return item, nil +} + +func scanDesiredWorkload(row scanner) (NodeWorkloadDesiredState, error) { + var item NodeWorkloadDesiredState + if err := row.Scan( + &item.ClusterID, + &item.NodeID, + &item.ServiceType, + &item.DesiredState, + &item.Version, + &item.RuntimeMode, + &item.ArtifactRef, + &item.Config, + &item.Environment, + &item.UpdatedByUserID, + &item.UpdatedAt, + ); err != nil { + return NodeWorkloadDesiredState{}, err + } + ensureRaw(&item.Config, `{}`) + ensureRaw(&item.Environment, `{}`) + return item, nil +} + +func scanWorkloadStatus(row scanner) (NodeWorkloadStatus, error) { + var item NodeWorkloadStatus + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.NodeID, + &item.ServiceType, + &item.ReportedState, + 
&item.RuntimeMode, + &item.Version, + &item.StatusPayload, + &item.ObservedAt, + ); err != nil { + return NodeWorkloadStatus{}, err + } + ensureRaw(&item.StatusPayload, `{}`) + return item, nil +} + +func scanMeshLink(row scanner) (MeshLinkObservation, error) { + var item MeshLinkObservation + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.SourceNodeID, + &item.TargetNodeID, + &item.LinkStatus, + &item.LatencyMs, + &item.QualityScore, + &item.Metadata, + &item.ObservedAt, + ); err != nil { + return MeshLinkObservation{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanRouteIntent(row scanner) (MeshRouteIntent, error) { + var item MeshRouteIntent + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.SourceSelector, + &item.DestinationSelector, + &item.ServiceClass, + &item.Priority, + &item.Status, + &item.Policy, + &item.CreatedByUserID, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return MeshRouteIntent{}, err + } + ensureRaw(&item.SourceSelector, `{}`) + ensureRaw(&item.DestinationSelector, `{}`) + ensureRaw(&item.Policy, `{}`) + return item, nil +} + +func scanQoSPolicy(row scanner) (MeshQoSPolicy, error) { + var item MeshQoSPolicy + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.ServiceClass, + &item.Priority, + &item.ReliabilityMode, + &item.DropPolicy, + &item.BandwidthPolicy, + &item.Metadata, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return MeshQoSPolicy{}, err + } + ensureRaw(&item.BandwidthPolicy, `{}`) + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanFabricEntryPoint(row scanner) (FabricEntryPoint, error) { + var item FabricEntryPoint + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.Name, + &item.Status, + &item.EndpointType, + &item.PublicEndpoint, + &item.Policy, + &item.Metadata, + &item.CreatedByUserID, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return FabricEntryPoint{}, err + } + ensureRaw(&item.Policy, `{}`) + 
ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanFabricEntryPointNode(row scanner) (FabricEntryPointNode, error) { + var item FabricEntryPointNode + if err := row.Scan( + &item.EntryPointID, + &item.ClusterID, + &item.NodeID, + &item.Status, + &item.Priority, + &item.Metadata, + &item.AddedByUserID, + &item.AddedAt, + ); err != nil { + return FabricEntryPointNode{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanFabricEgressPool(row scanner) (FabricEgressPool, error) { + var item FabricEgressPool + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.Name, + &item.Status, + &item.Description, + &item.RouteScope, + &item.Policy, + &item.Metadata, + &item.CreatedByUserID, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return FabricEgressPool{}, err + } + ensureRaw(&item.RouteScope, `{}`) + ensureRaw(&item.Policy, `{}`) + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanFabricEgressPoolNode(row scanner) (FabricEgressPoolNode, error) { + var item FabricEgressPoolNode + if err := row.Scan( + &item.EgressPoolID, + &item.ClusterID, + &item.NodeID, + &item.Status, + &item.Priority, + &item.Metadata, + &item.AddedByUserID, + &item.AddedAt, + ); err != nil { + return FabricEgressPoolNode{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanAuthorityState(row scanner) (ClusterAuthorityState, error) { + var item ClusterAuthorityState + if err := row.Scan( + &item.ClusterID, + &item.AuthorityState, + &item.MutationMode, + &item.Term, + &item.Notes, + &item.UpdatedByUserID, + &item.UpdatedAt, + ); err != nil { + return ClusterAuthorityState{}, err + } + return item, nil +} + +func scanVPNConnection(row scanner) (VPNConnection, error) { + var item VPNConnection + if err := row.Scan( + &item.ID, + &item.ClusterID, + &item.OrganizationID, + &item.Name, + &item.TargetEndpoint, + &item.ProtocolFamily, + &item.CredentialRef, + &item.Mode, + &item.DesiredState, + 
&item.AllowedNodePolicy, + &item.RoutingUsage, + &item.RoutePolicy, + &item.QoSPolicy, + &item.PlacementPolicy, + &item.Status, + &item.Metadata, + &item.CreatedByUserID, + &item.UpdatedByUserID, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return VPNConnection{}, err + } + ensureRaw(&item.TargetEndpoint, `{}`) + ensureRaw(&item.AllowedNodePolicy, `{"mode":"explicit","node_ids":[]}`) + ensureRaw(&item.RoutingUsage, `[]`) + ensureRaw(&item.RoutePolicy, `{}`) + ensureRaw(&item.QoSPolicy, `{}`) + ensureRaw(&item.PlacementPolicy, `{}`) + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanVPNConnections(rows pgx.Rows) ([]VPNConnection, error) { + var out []VPNConnection + for rows.Next() { + item, err := scanVPNConnection(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func scanVPNAllowedNode(row scanner) (VPNConnectionAllowedNode, error) { + var item VPNConnectionAllowedNode + if err := row.Scan( + &item.VPNConnectionID, + &item.ClusterID, + &item.NodeID, + &item.RolePreference, + &item.Status, + &item.Metadata, + &item.CreatedByUserID, + &item.CreatedAt, + ); err != nil { + return VPNConnectionAllowedNode{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func listVPNConnectionAllowedNodes(ctx context.Context, q rowQuerier, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error) { + rows, err := q.Query(ctx, ` + SELECT vpn_connection_id::text, cluster_id::text, node_id::text, role_preference, + status, metadata, created_by_user_id::text, created_at + FROM vpn_connection_allowed_nodes + WHERE cluster_id = $1::uuid + AND vpn_connection_id = $2::uuid + ORDER BY role_preference, created_at DESC + `, clusterID, vpnConnectionID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []VPNConnectionAllowedNode + for rows.Next() { + item, err := scanVPNAllowedNode(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + 
return out, rows.Err() +} + +func scanVPNRoutePolicy(row scanner) (VPNConnectionRoutePolicy, error) { + var item VPNConnectionRoutePolicy + if err := row.Scan( + &item.ID, + &item.VPNConnectionID, + &item.ClusterID, + &item.OrganizationID, + &item.RouteType, + &item.Destination, + &item.Action, + &item.ServiceType, + &item.Priority, + &item.Policy, + &item.Status, + &item.CreatedByUserID, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return VPNConnectionRoutePolicy{}, err + } + ensureRaw(&item.Policy, `{}`) + return item, nil +} + +func scanVPNRoutePolicies(rows pgx.Rows) ([]VPNConnectionRoutePolicy, error) { + var out []VPNConnectionRoutePolicy + for rows.Next() { + item, err := scanVPNRoutePolicy(rows) + if err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func scanVPNLease(row scanner) (VPNConnectionLease, error) { + var item VPNConnectionLease + if err := row.Scan( + &item.ID, + &item.VPNConnectionID, + &item.ClusterID, + &item.OwnerNodeID, + &item.LeaseGeneration, + &item.FencingToken, + &item.Status, + &item.AcquiredAt, + &item.RenewedAt, + &item.ExpiresAt, + &item.ReleasedAt, + &item.FencedAt, + &item.Metadata, + ); err != nil { + return VPNConnectionLease{}, err + } + ensureRaw(&item.Metadata, `{}`) + return item, nil +} + +func scanNodeVPNAssignment(row scanner) (NodeVPNAssignment, error) { + var item NodeVPNAssignment + var activeLeaseRaw json.RawMessage + if err := row.Scan( + &item.VPNConnectionID, + &item.ClusterID, + &item.OrganizationID, + &item.Name, + &item.TargetEndpoint, + &item.ProtocolFamily, + &item.Mode, + &item.DesiredState, + &item.RoutingUsage, + &item.RoutePolicy, + &item.QoSPolicy, + &item.PlacementPolicy, + &item.Status, + &item.HasCredentialRef, + &item.AssignmentReason, + &activeLeaseRaw, + &item.UpdatedAt, + ); err != nil { + return NodeVPNAssignment{}, err + } + ensureRaw(&item.TargetEndpoint, `{}`) + ensureRaw(&item.RoutingUsage, `[]`) + ensureRaw(&item.RoutePolicy, `{}`) + 
ensureRaw(&item.QoSPolicy, `{}`) + ensureRaw(&item.PlacementPolicy, `{}`) + if len(activeLeaseRaw) > 0 && string(activeLeaseRaw) != "null" { + var lease NodeVPNAssignmentLease + if err := json.Unmarshal(activeLeaseRaw, &lease); err != nil { + return NodeVPNAssignment{}, err + } + item.ActiveLease = &lease + } + return item, nil +} + +func scanNodeVPNAssignmentStatus(row scanner) (NodeVPNAssignmentStatus, error) { + var item NodeVPNAssignmentStatus + if err := row.Scan( + &item.ID, + &item.VPNConnectionID, + &item.ClusterID, + &item.NodeID, + &item.ObservedStatus, + &item.StatusPayload, + &item.ObservedAt, + ); err != nil { + return NodeVPNAssignmentStatus{}, err + } + ensureRaw(&item.StatusPayload, `{}`) + return item, nil +} + +func isUniqueViolation(err error) bool { + var pgErr *pgconn.PgError + if errors.As(err, &pgErr) { + return pgErr.Code == "23505" + } + return false +} + +func getJoinRequestForUpdate(ctx context.Context, tx pgx.Tx, clusterID, joinRequestID string) (NodeJoinRequest, error) { + row := tx.QueryRow(ctx, ` + SELECT id::text, cluster_id::text, join_token_id::text, node_name, node_fingerprint, public_key, + reported_capabilities, reported_facts, requested_roles, status, reviewed_by_user_id::text, + reviewed_at, approved_node_id::text, rejection_reason, created_at, updated_at, approval_payload, approval_signature + FROM node_join_requests + WHERE cluster_id = $1::uuid + AND id = $2::uuid + FOR UPDATE + `, clusterID, joinRequestID) + return scanJoinRequest(row) +} + +func ensureRaw(raw *json.RawMessage, fallback string) { + if len(*raw) == 0 { + *raw = json.RawMessage(fallback) + } +} diff --git a/backend/internal/modules/cluster/postgres_store_test.go b/backend/internal/modules/cluster/postgres_store_test.go new file mode 100644 index 0000000..1e4f1de --- /dev/null +++ b/backend/internal/modules/cluster/postgres_store_test.go @@ -0,0 +1,34 @@ +package cluster + +import ( + "encoding/json" + "testing" +) + +func 
TestMeshLatestObservationKeySeparatesRouteHealthByRoute(t *testing.T) { + key := meshLatestObservationKey(json.RawMessage(`{ + "observation_type":"synthetic_route_health", + "route_id":"route-1" + }`)) + if key != "synthetic_route_health:route-1" { + t.Fatalf("key = %q", key) + } +} + +func TestMeshLatestObservationKeySeparatesConnectionManagerMode(t *testing.T) { + key := meshLatestObservationKey(json.RawMessage(`{ + "observation_type":"peer_connection_manager", + "transport_mode":"relay_control", + "relay_node_id":"node-r" + }`)) + if key != "peer_connection_manager:relay_control:node-r" { + t.Fatalf("key = %q", key) + } +} + +func TestMeshLatestObservationKeyDefaults(t *testing.T) { + key := meshLatestObservationKey(json.RawMessage(`{}`)) + if key != "default" { + t.Fatalf("key = %q", key) + } +} diff --git a/backend/internal/modules/cluster/repository.go b/backend/internal/modules/cluster/repository.go new file mode 100644 index 0000000..077443b --- /dev/null +++ b/backend/internal/modules/cluster/repository.go @@ -0,0 +1,91 @@ +package cluster + +import ( + "context" + "encoding/json" + "time" +) + +type Repository interface { + GetPlatformRole(ctx context.Context, userID string) (string, error) + + ListClusters(ctx context.Context) ([]Cluster, error) + GetCluster(ctx context.Context, clusterID string) (Cluster, error) + CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error) + UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error) + GetClusterAuthority(ctx context.Context, clusterID string) (ClusterAuthorityKey, error) + EnsureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error) + + ListClusterNodes(ctx context.Context, clusterID string) ([]ClusterNode, error) + ListNodeGroups(ctx context.Context, clusterID string) ([]ClusterNodeGroup, error) + CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error) + AssignNodeToGroup(ctx 
context.Context, input AssignNodeGroupInput) (ClusterNode, error) + + CreateJoinToken(ctx context.Context, input CreateJoinTokenInput, tokenHash string) (NodeJoinToken, error) + SetJoinTokenAuthority(ctx context.Context, clusterID, tokenID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinToken, error) + GetValidJoinTokenByHash(ctx context.Context, clusterID, tokenHash string) (NodeJoinToken, error) + RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error) + ExpireJoinTokens(ctx context.Context, clusterID string) error + + CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput, joinTokenID string) (NodeJoinRequest, error) + GetJoinRequestForBootstrap(ctx context.Context, input GetJoinRequestBootstrapInput) (NodeJoinRequest, error) + ListJoinRequests(ctx context.Context, clusterID string) ([]NodeJoinRequest, error) + ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error) + SetJoinRequestApprovalAuthority(ctx context.Context, clusterID, joinRequestID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinRequest, error) + RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error) + + AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error) + ListNodeRoleAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeRoleAssignment, error) + AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error) + + RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error) + ListNodeHeartbeats(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error) + RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error + DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error + UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) 
(FabricTestingFlag, error) + ListFabricTestingFlags(ctx context.Context) ([]FabricTestingFlag, error) + GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) (EffectiveNodeTestingFlags, error) + RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error) + ListNodeTelemetry(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error) + SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error) + ListDesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error) + ReportWorkloadStatus(ctx context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error) + ListLatestWorkloadStatuses(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadStatus, error) + ReportMeshLink(ctx context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error) + ListMeshLinks(ctx context.Context, clusterID string) ([]MeshLinkObservation, error) + CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error) + ListRouteIntents(ctx context.Context, clusterID string) ([]MeshRouteIntent, error) + ListQoSPolicies(ctx context.Context, clusterID string) ([]MeshQoSPolicy, error) + ListFabricEntryPoints(ctx context.Context, clusterID string) ([]FabricEntryPoint, error) + CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error) + SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error) + ListFabricEntryPointNodes(ctx context.Context, clusterID, entryPointID string) ([]FabricEntryPointNode, error) + ListFabricEgressPools(ctx context.Context, clusterID string) ([]FabricEgressPool, error) + CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error) + SetFabricEgressPoolNode(ctx context.Context, input 
SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error) + ListFabricEgressPoolNodes(ctx context.Context, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error) + GetClusterAuthorityState(ctx context.Context, clusterID string) (ClusterAuthorityState, error) + UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error) + ListClusterAdminSummaries(ctx context.Context) ([]ClusterAdminSummary, error) + + CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error) + ListVPNConnections(ctx context.Context, clusterID string) ([]VPNConnection, error) + GetVPNConnection(ctx context.Context, clusterID, vpnConnectionID string) (VPNConnection, error) + UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error) + UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error) + ListVPNConnectionRoutePolicies(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error) + SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error) + ListVPNConnectionAllowedNodes(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error) + AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput, expiresAt time.Time, fencingToken string) (VPNConnectionLease, error) + RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput, expiresAt time.Time) (VPNConnectionLease, error) + ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error) + FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error) + GetActiveVPNConnectionLease(ctx context.Context, clusterID, vpnConnectionID string) 
(VPNConnectionLease, error) + CheckVPNLeaseOwnerEligibility(ctx context.Context, clusterID, vpnConnectionID, ownerNodeID string) (VPNLeaseOwnerEligibility, error) + ExpireStaleVPNConnectionLeases(ctx context.Context, clusterID string, now time.Time) ([]VPNConnectionLease, error) + ListNodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) + ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error) + + RecordAudit(ctx context.Context, event ClusterAuditEvent) error + ListAuditEvents(ctx context.Context, clusterID string, limit int) ([]ClusterAuditEvent, error) +} diff --git a/backend/internal/modules/cluster/service.go b/backend/internal/modules/cluster/service.go new file mode 100644 index 0000000..d29ae9e --- /dev/null +++ b/backend/internal/modules/cluster/service.go @@ -0,0 +1,3700 @@ +package cluster + +import ( + "context" + "crypto/rand" + "encoding/hex" + "encoding/json" + "errors" + "sort" + "strings" + "time" + + "github.com/jackc/pgx/v5" + + "github.com/example/remote-access-platform/backend/internal/platform/clusterauth" +) + +var ( + ErrAccessDenied = errors.New("platform admin role is required") + ErrInvalidPayload = errors.New("invalid cluster payload") + ErrInvalidJoinToken = errors.New("invalid or expired join token") + ErrInvalidNodeRole = errors.New("invalid node role") + ErrInvalidCluster = errors.New("cluster not found") + ErrInvalidJoinRequest = errors.New("join request not found") + ErrClusterReadOnly = errors.New("cluster is not authoritative for policy mutation") + ErrInvalidVPNConnection = errors.New("vpn connection not found") + ErrInvalidVPNLease = errors.New("vpn connection lease not found") + ErrVPNLeaseAlreadyActive = errors.New("vpn connection already has an active lease") + ErrVPNLeaseOwnerNotAllowed = errors.New("vpn lease owner is not allowed") + ErrVPNLeaseOwnerRoleRequired = errors.New("vpn lease owner requires active 
vpn-exit or vpn-connector role") +) + +type Service struct { + store Repository + now func() time.Time +} + +func NewService(store Repository) *Service { + return &Service{store: store, now: func() time.Time { return time.Now().UTC() }} +} + +const ( + clusterJoinTokenAuthoritySchema = "rap.cluster.join_token.v1" + clusterNodeApprovalAuthoritySchema = "rap.cluster.node_approval.v1" + clusterMeshConfigAuthoritySchema = "rap.cluster.mesh_config_snapshot.v1" +) + +type clusterJoinTokenAuthorityPayload struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + TokenID string `json:"token_id"` + Scope json.RawMessage `json:"scope"` + ExpiresAt time.Time `json:"expires_at"` + MaxUses int `json:"max_uses"` + CreatedByUserID *string `json:"created_by_user_id,omitempty"` + IssuedAt time.Time `json:"issued_at"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` +} + +type clusterNodeApprovalAuthorityPayload struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + JoinRequestID string `json:"join_request_id"` + NodeID string `json:"node_id"` + NodeFingerprint string `json:"node_fingerprint"` + IdentityStatus string `json:"identity_status"` + HeartbeatEndpoint string `json:"heartbeat_endpoint"` + ApprovedByUserID string `json:"approved_by_user_id"` + IssuedAt time.Time `json:"issued_at"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool `json:"production_forwarding"` +} + +type clusterMeshConfigAuthorityPayload struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + LocalNodeID string `json:"local_node_id"` + ConfigVersion string `json:"config_version"` + ConfigSHA256 string `json:"config_sha256"` + IssuedAt time.Time `json:"issued_at"` + ExpiresAt time.Time `json:"expires_at"` + ControlPlaneOnly bool `json:"control_plane_only"` + ProductionForwarding bool 
`json:"production_forwarding"` +} + +func (s *Service) ListClusters(ctx context.Context, actorUserID string) ([]Cluster, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListClusters(ctx) +} + +func (s *Service) GetCluster(ctx context.Context, actorUserID, clusterID string) (Cluster, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return Cluster{}, err + } + item, err := s.store.GetCluster(ctx, clusterID) + if errors.Is(err, pgx.ErrNoRows) { + return Cluster{}, ErrInvalidCluster + } + return item, err +} + +func (s *Service) CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return Cluster{}, err + } + input.Slug = strings.TrimSpace(input.Slug) + input.Name = strings.TrimSpace(input.Name) + if input.Slug == "" || input.Name == "" { + return Cluster{}, ErrInvalidPayload + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return Cluster{}, errors.New("metadata must be valid json") + } + item, err := s.store.CreateCluster(ctx, input) + if err != nil { + return Cluster{}, err + } + auditPayload := json.RawMessage(`{}`) + if authorityKey, err := s.ensureClusterAuthority(ctx, item.ID, &input.ActorUserID); err == nil { + auditPayload, _ = json.Marshal(map[string]any{ + "cluster_authority": map[string]any{ + "key_algorithm": authorityKey.KeyAlgorithm, + "public_key_fingerprint": authorityKey.PublicKeyFingerprint, + }, + }) + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &item.ID, + ActorUserID: &input.ActorUserID, + EventType: "cluster.created", + TargetType: "cluster", + TargetID: &item.ID, + Payload: auditPayload, + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ensureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error) { + authorityKey, err := 
s.store.GetClusterAuthority(ctx, clusterID) + if errors.Is(err, pgx.ErrNoRows) { + return s.store.EnsureClusterAuthority(ctx, clusterID, actorUserID) + } + return authorityKey, err +} + +func authorityDescriptor(authorityKey ClusterAuthorityKey) *ClusterAuthorityDescriptor { + descriptor := authorityKey.ClusterAuthorityDescriptor + if descriptor.SchemaVersion == "" { + descriptor.SchemaVersion = clusterauth.AuthoritySchemaVersion + } + return &descriptor +} + +func (s *Service) UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return Cluster{}, err + } + if input.ClusterID == "" { + return Cluster{}, ErrInvalidCluster + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return Cluster{}, err + } + input.Name = strings.TrimSpace(input.Name) + input.Status = strings.TrimSpace(input.Status) + if input.Name == "" { + return Cluster{}, ErrInvalidPayload + } + if input.Status == "" { + input.Status = ClusterStatusActive + } + if input.Status != ClusterStatusActive && input.Status != ClusterStatusDisabled { + return Cluster{}, ErrInvalidPayload + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return Cluster{}, errors.New("metadata must be valid json") + } + item, err := s.store.UpdateCluster(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return Cluster{}, ErrInvalidCluster + } + if err != nil { + return Cluster{}, err + } + payload, _ := json.Marshal(map[string]any{ + "name": item.Name, + "status": item.Status, + "region": item.Region, + }) + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &item.ID, + ActorUserID: &input.ActorUserID, + EventType: "cluster.updated", + TargetType: "cluster", + TargetID: &item.ID, + Payload: payload, + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListClusterNodes(ctx context.Context, actorUserID, clusterID string) 
([]ClusterNode, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListClusterNodes(ctx, clusterID) +} + +func (s *Service) ListNodeGroups(ctx context.Context, actorUserID, clusterID string) ([]ClusterNodeGroup, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListNodeGroups(ctx, clusterID) +} + +func (s *Service) CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return ClusterNodeGroup{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return ClusterNodeGroup{}, err + } + input.Name = strings.TrimSpace(input.Name) + if input.ClusterID == "" || input.Name == "" { + return ClusterNodeGroup{}, ErrInvalidPayload + } + if input.Description != nil { + trimmed := strings.TrimSpace(*input.Description) + input.Description = &trimmed + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return ClusterNodeGroup{}, errors.New("node group metadata must be valid json") + } + item, err := s.store.CreateNodeGroup(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return ClusterNodeGroup{}, ErrInvalidPayload + } + return item, err +} + +func (s *Service) CreateJoinToken(ctx context.Context, input CreateJoinTokenInput) (CreatedJoinToken, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return CreatedJoinToken{}, err + } + if input.ClusterID == "" { + return CreatedJoinToken{}, ErrInvalidCluster + } + input.Scope = defaultJSON(input.Scope, `{}`) + if !json.Valid(input.Scope) { + return CreatedJoinToken{}, errors.New("scope must be valid json") + } + if input.ExpiresAt.IsZero() { + input.ExpiresAt = defaultJoinTokenExpiry(s.now()) + } + if input.ExpiresAt.Before(s.now()) { + return CreatedJoinToken{}, 
errors.New("expires_at must be in the future") + } + if input.MaxUses <= 0 { + input.MaxUses = 1 + } + rawToken, err := generateJoinToken() + if err != nil { + return CreatedJoinToken{}, err + } + tokenHash, err := hashJoinToken(rawToken) + if err != nil { + return CreatedJoinToken{}, err + } + item, err := s.store.CreateJoinToken(ctx, input, tokenHash) + if err != nil { + return CreatedJoinToken{}, err + } + item, err = s.signJoinToken(ctx, input, item) + if err != nil { + return CreatedJoinToken{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "node_join_token.created", + TargetType: "node_join_token", + TargetID: &item.ID, + Payload: json.RawMessage(`{"raw_token_returned_once":true}`), + CreatedAt: s.now(), + }) + return CreatedJoinToken{NodeJoinToken: item, Token: rawToken}, nil +} + +func (s *Service) signJoinToken(ctx context.Context, input CreateJoinTokenInput, item NodeJoinToken) (NodeJoinToken, error) { + authorityKey, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID) + if err != nil { + return NodeJoinToken{}, err + } + payload := clusterJoinTokenAuthorityPayload{ + SchemaVersion: clusterJoinTokenAuthoritySchema, + ClusterID: input.ClusterID, + TokenID: item.ID, + Scope: item.Scope, + ExpiresAt: item.ExpiresAt, + MaxUses: item.MaxUses, + CreatedByUserID: item.CreatedByUserID, + IssuedAt: item.CreatedAt, + ControlPlaneOnly: true, + ProductionForwarding: false, + } + rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now()) + if err != nil { + return NodeJoinToken{}, err + } + return s.store.SetJoinTokenAuthority(ctx, input.ClusterID, item.ID, rawPayload, signature) +} + +func (s *Service) CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput) (NodeJoinRequest, error) { + if input.ClusterID == "" { + return NodeJoinRequest{}, ErrInvalidCluster + } + if err := s.store.ExpireJoinTokens(ctx, 
input.ClusterID); err != nil { + return NodeJoinRequest{}, err + } + input.NodeName = strings.TrimSpace(input.NodeName) + input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint) + input.PublicKey = strings.TrimSpace(input.PublicKey) + if input.NodeName == "" || input.NodeFingerprint == "" || input.PublicKey == "" { + return NodeJoinRequest{}, ErrInvalidPayload + } + input.ReportedCapabilities = defaultJSON(input.ReportedCapabilities, `{}`) + input.ReportedFacts = defaultJSON(input.ReportedFacts, `{}`) + input.RequestedRoles = defaultJSON(input.RequestedRoles, `[]`) + if !json.Valid(input.ReportedCapabilities) || !json.Valid(input.ReportedFacts) || !json.Valid(input.RequestedRoles) { + return NodeJoinRequest{}, errors.New("reported_capabilities, reported_facts, and requested_roles must be valid json") + } + tokenHash, err := hashJoinToken(input.JoinToken) + if err != nil { + return NodeJoinRequest{}, ErrInvalidJoinToken + } + token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return NodeJoinRequest{}, ErrInvalidJoinToken + } + return NodeJoinRequest{}, err + } + item, err := s.store.CreateJoinRequest(ctx, input, token.ID) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return NodeJoinRequest{}, ErrInvalidJoinToken + } + return NodeJoinRequest{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + EventType: "node_join_request.created", + TargetType: "node_join_request", + TargetID: &item.ID, + Payload: json.RawMessage(`{"source":"node_agent"}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListJoinRequests(ctx context.Context, actorUserID, clusterID string) ([]NodeJoinRequest, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListJoinRequests(ctx, clusterID) +} + +func (s *Service) GetJoinRequestBootstrap(ctx context.Context, input 
GetJoinRequestBootstrapInput) (JoinRequestBootstrapResult, error) { + input.ClusterID = strings.TrimSpace(input.ClusterID) + input.JoinRequestID = strings.TrimSpace(input.JoinRequestID) + input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint) + input.PublicKey = strings.TrimSpace(input.PublicKey) + if input.ClusterID == "" || input.JoinRequestID == "" || input.NodeFingerprint == "" || input.PublicKey == "" { + return JoinRequestBootstrapResult{}, ErrInvalidJoinRequest + } + item, err := s.store.GetJoinRequestForBootstrap(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return JoinRequestBootstrapResult{}, ErrInvalidJoinRequest + } + if err != nil { + return JoinRequestBootstrapResult{}, err + } + result := JoinRequestBootstrapResult{Status: item.Status, JoinRequest: item} + if item.Status != JoinRequestStatusApproved { + return result, nil + } + bootstrap, updated, err := s.bootstrapForApprovedJoinRequest(ctx, item) + if err != nil { + return JoinRequestBootstrapResult{}, err + } + result.JoinRequest = updated + result.Bootstrap = &bootstrap + return result, nil +} + +func (s *Service) RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return NodeJoinToken{}, err + } + item, err := s.store.RevokeJoinToken(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return NodeJoinToken{}, ErrInvalidJoinToken + } + if err != nil { + return NodeJoinToken{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "node_join_token.revoked", + TargetType: "node_join_token", + TargetID: &input.TokenID, + Payload: json.RawMessage(`{}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + 
return ApprovedJoinRequest{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return ApprovedJoinRequest{}, err + } + if input.ClusterID == "" || input.JoinRequestID == "" { + return ApprovedJoinRequest{}, ErrInvalidJoinRequest + } + item, err := s.store.ApproveJoinRequest(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return ApprovedJoinRequest{}, ErrInvalidJoinRequest + } + if err != nil { + return ApprovedJoinRequest{}, err + } + item, err = s.signApprovedJoinRequest(ctx, input, item) + if err != nil { + return ApprovedJoinRequest{}, err + } + return item, nil +} + +func (s *Service) signApprovedJoinRequest(ctx context.Context, input ApproveJoinRequestInput, item ApprovedJoinRequest) (ApprovedJoinRequest, error) { + authorityKey, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID) + if err != nil { + return ApprovedJoinRequest{}, err + } + if item.Bootstrap.HeartbeatEndpoint == "" { + item.Bootstrap.HeartbeatEndpoint = nodeHeartbeatEndpoint(input.ClusterID, item.Bootstrap.NodeID) + } + payload := clusterNodeApprovalAuthorityPayload{ + SchemaVersion: clusterNodeApprovalAuthoritySchema, + ClusterID: input.ClusterID, + JoinRequestID: item.JoinRequest.ID, + NodeID: item.Bootstrap.NodeID, + NodeFingerprint: item.JoinRequest.NodeFingerprint, + IdentityStatus: item.Bootstrap.IdentityStatus, + HeartbeatEndpoint: item.Bootstrap.HeartbeatEndpoint, + ApprovedByUserID: input.ActorUserID, + IssuedAt: s.now(), + ControlPlaneOnly: true, + ProductionForwarding: false, + } + rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now()) + if err != nil { + return ApprovedJoinRequest{}, err + } + updated, err := s.store.SetJoinRequestApprovalAuthority(ctx, input.ClusterID, item.JoinRequest.ID, rawPayload, signature) + if err != nil { + return ApprovedJoinRequest{}, err + } + item.JoinRequest = updated + item.Bootstrap.ClusterAuthority = authorityDescriptor(authorityKey) 
+ item.Bootstrap.AuthorityPayload = rawPayload + item.Bootstrap.AuthoritySignature = &signature + return item, nil +} + +func (s *Service) bootstrapForApprovedJoinRequest(ctx context.Context, item NodeJoinRequest) (NodeBootstrap, NodeJoinRequest, error) { + if item.Status != JoinRequestStatusApproved || item.ApprovedNodeID == nil || strings.TrimSpace(*item.ApprovedNodeID) == "" { + return NodeBootstrap{}, NodeJoinRequest{}, ErrInvalidJoinRequest + } + authorityKey, err := s.ensureClusterAuthority(ctx, item.ClusterID, item.ReviewedByUserID) + if err != nil { + return NodeBootstrap{}, NodeJoinRequest{}, err + } + heartbeatEndpoint := nodeHeartbeatEndpoint(item.ClusterID, *item.ApprovedNodeID) + identityStatus := NodeRegistrationActive + if rawMessageEmpty(item.ApprovalPayload) || rawMessageEmpty(item.ApprovalSignature) { + approvedBy := "system" + if item.ReviewedByUserID != nil && strings.TrimSpace(*item.ReviewedByUserID) != "" { + approvedBy = strings.TrimSpace(*item.ReviewedByUserID) + } + payload := clusterNodeApprovalAuthorityPayload{ + SchemaVersion: clusterNodeApprovalAuthoritySchema, + ClusterID: item.ClusterID, + JoinRequestID: item.ID, + NodeID: *item.ApprovedNodeID, + NodeFingerprint: item.NodeFingerprint, + IdentityStatus: identityStatus, + HeartbeatEndpoint: heartbeatEndpoint, + ApprovedByUserID: approvedBy, + IssuedAt: s.now(), + ControlPlaneOnly: true, + ProductionForwarding: false, + } + rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now()) + if err != nil { + return NodeBootstrap{}, NodeJoinRequest{}, err + } + item, err = s.store.SetJoinRequestApprovalAuthority(ctx, item.ClusterID, item.ID, rawPayload, signature) + if err != nil { + return NodeBootstrap{}, NodeJoinRequest{}, err + } + } else { + var signature ClusterSignature + if err := json.Unmarshal(item.ApprovalSignature, &signature); err != nil { + return NodeBootstrap{}, NodeJoinRequest{}, err + } + if err := 
clusterauth.VerifyRaw(authorityKey.PublicKey, item.ApprovalPayload, signature); err != nil { + return NodeBootstrap{}, NodeJoinRequest{}, err + } + } + var signature ClusterSignature + if err := json.Unmarshal(item.ApprovalSignature, &signature); err != nil { + return NodeBootstrap{}, NodeJoinRequest{}, err + } + bootstrap := NodeBootstrap{ + NodeID: *item.ApprovedNodeID, + ClusterID: item.ClusterID, + IdentityStatus: identityStatus, + Certificate: map[string]any{ + "status": "pending_issuer_integration", + }, + HeartbeatEndpoint: heartbeatEndpoint, + ClusterAuthority: authorityDescriptor(authorityKey), + AuthorityPayload: item.ApprovalPayload, + AuthoritySignature: &signature, + } + return bootstrap, item, nil +} + +func nodeHeartbeatEndpoint(clusterID, nodeID string) string { + return "/api/v1/clusters/" + clusterID + "/nodes/" + nodeID + "/heartbeats" +} + +func rawMessageEmpty(raw json.RawMessage) bool { + value := strings.TrimSpace(string(raw)) + return value == "" || value == "{}" || value == "null" +} + +func (s *Service) RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return NodeJoinRequest{}, err + } + input.Reason = strings.TrimSpace(input.Reason) + if input.Reason == "" { + input.Reason = "Rejected by platform administrator." 
+ } + item, err := s.store.RejectJoinRequest(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return NodeJoinRequest{}, ErrInvalidJoinRequest + } + return item, err +} + +func (s *Service) AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return NodeRoleAssignment{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return NodeRoleAssignment{}, err + } + if !isAllowedNodeRole(input.Role) { + return NodeRoleAssignment{}, ErrInvalidNodeRole + } + if input.Status == "" { + input.Status = "active" + } + if input.Status != "active" && input.Status != "disabled" && input.Status != "revoked" { + return NodeRoleAssignment{}, ErrInvalidPayload + } + input.Policy = defaultJSON(input.Policy, `{}`) + if !json.Valid(input.Policy) { + return NodeRoleAssignment{}, errors.New("policy must be valid json") + } + item, err := s.store.AssignNodeRole(ctx, input) + if err != nil { + return NodeRoleAssignment{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "node_role." 
+ input.Status, + TargetType: "node", + TargetID: &input.NodeID, + Payload: json.RawMessage(`{"capability_is_not_permission":true}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListNodeRoleAssignments(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeRoleAssignment, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListNodeRoleAssignments(ctx, clusterID, nodeID) +} + +func (s *Service) AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return ClusterNode{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return ClusterNode{}, err + } + if input.ClusterID == "" || input.NodeID == "" { + return ClusterNode{}, ErrInvalidPayload + } + for _, role := range input.Roles { + if !isAllowedNodeRole(role) { + return ClusterNode{}, ErrInvalidNodeRole + } + } + item, err := s.store.AttachExistingNodeToCluster(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return ClusterNode{}, ErrInvalidPayload + } + return item, err +} + +func (s *Service) AssignNodeToGroup(ctx context.Context, input AssignNodeGroupInput) (ClusterNode, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return ClusterNode{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return ClusterNode{}, err + } + if input.ClusterID == "" || input.NodeID == "" { + return ClusterNode{}, ErrInvalidPayload + } + if input.GroupID != nil { + trimmed := strings.TrimSpace(*input.GroupID) + if trimmed == "" { + input.GroupID = nil + } else { + input.GroupID = &trimmed + } + } + item, err := s.store.AssignNodeToGroup(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return ClusterNode{}, ErrInvalidPayload + } + return item, err +} + +func (s *Service) 
RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return err + } + input.Reason = strings.TrimSpace(input.Reason) + if input.Reason == "" { + input.Reason = "revoked by platform administrator" + } + if err := s.store.RevokeNodeIdentity(ctx, input); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return ErrInvalidPayload + } + return err + } + return nil +} + +func (s *Service) DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return err + } + input.Reason = strings.TrimSpace(input.Reason) + if input.Reason == "" { + input.Reason = "disabled by platform administrator" + } + if err := s.store.DisableClusterMembership(ctx, input); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return ErrInvalidPayload + } + return err + } + return nil +} + +func (s *Service) RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error) { + if input.ClusterID == "" || input.NodeID == "" { + return NodeHeartbeat{}, ErrInvalidPayload + } + if input.HealthStatus == "" { + input.HealthStatus = "unknown" + } + input.Capabilities = defaultJSON(input.Capabilities, `{}`) + input.ServiceStates = defaultJSON(input.ServiceStates, `{}`) + input.Metadata = defaultJSON(input.Metadata, `{}`) + return s.store.RecordHeartbeat(ctx, input) +} + +func (s *Service) ListNodeHeartbeats(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, limit) +} + +func (s *Service) UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return FabricTestingFlag{}, 
err + } + input.ScopeType = strings.TrimSpace(input.ScopeType) + if input.ScopeType == "" { + return FabricTestingFlag{}, ErrInvalidPayload + } + switch input.ScopeType { + case "platform": + input.ScopeID = nil + case "organization", "node": + if input.ScopeID == nil || strings.TrimSpace(*input.ScopeID) == "" { + return FabricTestingFlag{}, ErrInvalidPayload + } + default: + return FabricTestingFlag{}, ErrInvalidPayload + } + if input.HistoryRetentionHours <= 0 { + input.HistoryRetentionHours = 24 + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return FabricTestingFlag{}, errors.New("testing flag metadata must be valid json") + } + item, err := s.store.UpsertFabricTestingFlag(ctx, input) + if err != nil { + return FabricTestingFlag{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "fabric.testing_flag.updated", + TargetType: input.ScopeType, + TargetID: input.ScopeID, + Payload: json.RawMessage(`{"runtime_mesh_enabled":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListFabricTestingFlags(ctx context.Context, actorUserID string) ([]FabricTestingFlag, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListFabricTestingFlags(ctx) +} + +func (s *Service) GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) (EffectiveNodeTestingFlags, error) { + if clusterID == "" || nodeID == "" { + return EffectiveNodeTestingFlags{}, ErrInvalidPayload + } + return s.store.GetEffectiveNodeTestingFlags(ctx, clusterID, nodeID) +} + +func (s *Service) GetNodeSyntheticMeshConfig(ctx context.Context, input GetNodeSyntheticMeshConfigInput) (NodeSyntheticMeshConfig, error) { + input.ClusterID = strings.TrimSpace(input.ClusterID) + input.NodeID = strings.TrimSpace(input.NodeID) + if input.ClusterID == "" || input.NodeID == "" { + return 
NodeSyntheticMeshConfig{}, ErrInvalidPayload + } + cfg := NodeSyntheticMeshConfig{ + Enabled: false, + SchemaVersion: "c17z18.synthetic.v1", + ClusterID: input.ClusterID, + LocalNodeID: input.NodeID, + AuthorityRequired: true, + ConfigVersion: "disabled", + PeerDirectoryVersion: "disabled", + PolicyVersion: "disabled", + PeerEndpoints: map[string]string{}, + PeerEndpointCandidates: map[string][]PeerEndpointCandidate{}, + PeerDirectory: []PeerDirectoryEntry{}, + RecoverySeeds: []PeerRecoverySeed{}, + RendezvousLeases: []PeerRendezvousLease{}, + Routes: []SyntheticMeshRouteConfig{}, + ProductionForwarding: false, + } + flags, err := s.store.GetEffectiveNodeTestingFlags(ctx, input.ClusterID, input.NodeID) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + if !flags.Enabled || !flags.SyntheticLinksEnabled { + return s.signSyntheticMeshConfig(ctx, cfg) + } + intents, err := s.store.ListRouteIntents(ctx, input.ClusterID) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + cfg.Enabled = true + cfg.ConfigVersion = "c17z18-" + s.now().UTC().Format("20060102T150405Z") + cfg.PeerDirectoryVersion = cfg.ConfigVersion + cfg.PolicyVersion = cfg.ConfigVersion + meshLinks, err := s.store.ListMeshLinks(ctx, input.ClusterID) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + relayPolicy := newRendezvousRelayPolicy(input.NodeID, meshLinks, s.now()) + peerDirectory := map[string]*PeerDirectoryEntry{} + recoverySeeds := map[string]PeerRecoverySeed{} + rendezvousLeases := map[string]PeerRendezvousLease{} + routePathDecisions := []RoutePathDecision{} + for _, intent := range intents { + route, peers, candidates, seeds, policyLeases, ok := s.syntheticRouteFromIntent(input, intent) + if !ok { + continue + } + reportedPeers, reportedCandidates, err := s.reportedEndpointConfig(ctx, input.ClusterID, input.NodeID, route.Hops) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + feedback, err := s.rendezvousRelayFeedback(ctx, input.ClusterID, 
route.Hops, s.now()) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + relayPolicy.addFeedback(feedback) + replacementHints, err := s.rendezvousRelayReplacementHints(ctx, input.ClusterID, route.Hops, s.now()) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + relayPolicy.addReplacementHints(replacementHints) + relayPolicy.addFeedback(replacementHintFeedback(replacementHints, s.now())) + relayPolicy.addFeedback(rendezvousRelayRouteHealthFeedback(input.NodeID, route, meshLinks, s.now())) + for nodeID, endpoint := range reportedPeers { + peers[nodeID] = endpoint + } + for nodeID, items := range reportedCandidates { + candidates[nodeID] = append(candidates[nodeID], items...) + } + routeLeases := scopedRendezvousLeases(policyLeases, route, input.NodeID, relayPolicy, s.now()) + routeLeases = append(routeLeases, derivedRendezvousLeases(route, peers, candidates, input.NodeID, relayPolicy, s.now())...) + cfg.Routes = append(cfg.Routes, route) + routePathDecisions = append(routePathDecisions, routePathDecisionForRoute(route, input.NodeID, routeLeases, relayPolicy, cfg.ConfigVersion)) + mergePeerDirectoryRoute(peerDirectory, route, input.NodeID) + for nodeID, endpoint := range peers { + if strings.TrimSpace(nodeID) != "" && strings.TrimSpace(endpoint) != "" { + cfg.PeerEndpoints[nodeID] = endpoint + peerDirectoryEntry(peerDirectory, nodeID).EndpointCount++ + } + } + for nodeID, nodeCandidates := range candidates { + if strings.TrimSpace(nodeID) == "" || len(nodeCandidates) == 0 { + continue + } + cfg.PeerEndpointCandidates[nodeID] = append(cfg.PeerEndpointCandidates[nodeID], nodeCandidates...) 
+ mergePeerDirectoryCandidates(peerDirectory, nodeID, nodeCandidates) + } + mergeRecoverySeeds(recoverySeeds, seeds) + mergeRendezvousLeases(rendezvousLeases, routeLeases) + } + cfg.RecoverySeeds = sortedRecoverySeeds(recoverySeeds, maxScopedRecoverySeeds) + cfg.RendezvousLeases = sortedRendezvousLeases(rendezvousLeases, maxScopedRendezvousLeases) + cfg.RendezvousRelayPolicy = relayPolicy.report() + cfg.RoutePathDecisions = routePathDecisionReport(cfg.ConfigVersion, routePathDecisions) + markPeerDirectoryRecoverySeeds(peerDirectory, cfg.RecoverySeeds) + markPeerDirectoryRendezvousLeases(peerDirectory, cfg.RendezvousLeases, input.NodeID) + cfg.PeerDirectory = sortedPeerDirectory(peerDirectory) + return s.signSyntheticMeshConfig(ctx, cfg) +} + +func (s *Service) signSyntheticMeshConfig(ctx context.Context, cfg NodeSyntheticMeshConfig) (NodeSyntheticMeshConfig, error) { + authorityKey, err := s.ensureClusterAuthority(ctx, cfg.ClusterID, nil) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + cfg.AuthorityRequired = true + cfg.ClusterAuthority = authorityDescriptor(authorityKey) + unsigned := cfg + unsigned.AuthorityPayload = nil + unsigned.AuthoritySignature = nil + rawConfig, err := json.Marshal(unsigned) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + configHash, err := clusterauth.HashRaw(rawConfig) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + issuedAt := s.now().UTC() + payload := clusterMeshConfigAuthorityPayload{ + SchemaVersion: clusterMeshConfigAuthoritySchema, + ClusterID: cfg.ClusterID, + LocalNodeID: cfg.LocalNodeID, + ConfigVersion: cfg.ConfigVersion, + ConfigSHA256: configHash, + IssuedAt: issuedAt, + ExpiresAt: issuedAt.Add(5 * time.Minute), + ControlPlaneOnly: true, + ProductionForwarding: false, + } + rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, issuedAt) + if err != nil { + return NodeSyntheticMeshConfig{}, err + } + cfg.AuthorityPayload = rawPayload + 
cfg.AuthoritySignature = &signature + return cfg, nil +} + +func (s *Service) RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error) { + if input.ClusterID == "" || input.NodeID == "" { + return NodeTelemetryObservation{}, ErrInvalidPayload + } + input.Payload = defaultJSON(input.Payload, `{}`) + if !json.Valid(input.Payload) { + return NodeTelemetryObservation{}, errors.New("telemetry payload must be valid json") + } + if input.ObservedAt.IsZero() { + input.ObservedAt = s.now() + } + return s.store.RecordNodeTelemetry(ctx, input) +} + +func (s *Service) ListNodeTelemetry(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListNodeTelemetry(ctx, clusterID, nodeID, limit) +} + +func (s *Service) SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return NodeWorkloadDesiredState{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return NodeWorkloadDesiredState{}, err + } + input.ServiceType = strings.TrimSpace(input.ServiceType) + if input.ClusterID == "" || input.NodeID == "" || input.ServiceType == "" { + return NodeWorkloadDesiredState{}, ErrInvalidPayload + } + if input.DesiredState == "" { + input.DesiredState = "disabled" + } + if input.RuntimeMode == "" { + input.RuntimeMode = "container" + } + input.Config = defaultJSON(input.Config, `{}`) + input.Environment = defaultJSON(input.Environment, `{}`) + if !json.Valid(input.Config) || !json.Valid(input.Environment) { + return NodeWorkloadDesiredState{}, errors.New("config and environment must be valid json") + } + item, err := s.store.SetDesiredWorkload(ctx, input) + if err != nil { + return NodeWorkloadDesiredState{}, err + 
} + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "node_workload.desired_state_set", + TargetType: "node", + TargetID: &input.NodeID, + Payload: json.RawMessage(`{"supervision_runtime":"stub_c5"}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListDesiredWorkloads(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + if clusterID == "" || nodeID == "" { + return nil, ErrInvalidPayload + } + return s.store.ListDesiredWorkloads(ctx, clusterID, nodeID) +} + +func (s *Service) ReportWorkloadStatus(ctx context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error) { + input.ServiceType = strings.TrimSpace(input.ServiceType) + if input.ClusterID == "" || input.NodeID == "" || input.ServiceType == "" { + return NodeWorkloadStatus{}, ErrInvalidPayload + } + if input.ReportedState == "" { + input.ReportedState = "unknown" + } + if input.RuntimeMode == "" { + input.RuntimeMode = "container" + } + input.StatusPayload = defaultJSON(input.StatusPayload, `{}`) + if !json.Valid(input.StatusPayload) { + return NodeWorkloadStatus{}, errors.New("status_payload must be valid json") + } + return s.store.ReportWorkloadStatus(ctx, input) +} + +func (s *Service) ListLatestWorkloadStatuses(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeWorkloadStatus, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListLatestWorkloadStatuses(ctx, clusterID, nodeID) +} + +func (s *Service) ReportMeshLink(ctx context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error) { + if input.ClusterID == "" || input.SourceNodeID == "" || input.TargetNodeID == "" { + return MeshLinkObservation{}, ErrInvalidPayload + } + if input.LinkStatus == "" { + input.LinkStatus = 
"unknown" + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return MeshLinkObservation{}, errors.New("metadata must be valid json") + } + return s.store.ReportMeshLink(ctx, input) +} + +func (s *Service) ListMeshLinks(ctx context.Context, actorUserID, clusterID string) ([]MeshLinkObservation, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListMeshLinks(ctx, clusterID) +} + +func (s *Service) CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return MeshRouteIntent{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return MeshRouteIntent{}, err + } + if input.ClusterID == "" || input.ServiceClass == "" { + return MeshRouteIntent{}, ErrInvalidPayload + } + if input.Priority == 0 { + input.Priority = 100 + } + input.SourceSelector = defaultJSON(input.SourceSelector, `{}`) + input.DestinationSelector = defaultJSON(input.DestinationSelector, `{}`) + input.Policy = defaultJSON(input.Policy, `{}`) + if !json.Valid(input.SourceSelector) || !json.Valid(input.DestinationSelector) || !json.Valid(input.Policy) { + return MeshRouteIntent{}, errors.New("source_selector, destination_selector, and policy must be valid json") + } + item, err := s.store.CreateRouteIntent(ctx, input) + if err != nil { + return MeshRouteIntent{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "mesh.route_intent.created", + TargetType: "mesh_route_intent", + TargetID: &item.ID, + Payload: json.RawMessage(`{"traffic_forwarding_enabled":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListRouteIntents(ctx context.Context, actorUserID, clusterID string) ([]MeshRouteIntent, error) { + if err := 
s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListRouteIntents(ctx, clusterID) +} + +func (s *Service) ListQoSPolicies(ctx context.Context, actorUserID, clusterID string) ([]MeshQoSPolicy, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListQoSPolicies(ctx, clusterID) +} + +func (s *Service) ListFabricEntryPoints(ctx context.Context, actorUserID, clusterID string) ([]FabricEntryPoint, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListFabricEntryPoints(ctx, clusterID) +} + +func (s *Service) CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return FabricEntryPoint{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return FabricEntryPoint{}, err + } + input.Name = strings.TrimSpace(input.Name) + input.Status = strings.TrimSpace(input.Status) + input.EndpointType = strings.TrimSpace(input.EndpointType) + if input.Status == "" { + input.Status = "active" + } + if input.EndpointType == "" { + input.EndpointType = "client_access" + } + if input.ClusterID == "" || input.Name == "" || !isFabricEndpointStatus(input.Status) || !isFabricEntryPointType(input.EndpointType) { + return FabricEntryPoint{}, ErrInvalidPayload + } + if input.PublicEndpoint != nil { + trimmed := strings.TrimSpace(*input.PublicEndpoint) + if trimmed == "" { + input.PublicEndpoint = nil + } else { + input.PublicEndpoint = &trimmed + } + } + input.Policy = defaultJSON(input.Policy, `{}`) + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Policy) || !json.Valid(input.Metadata) { + return FabricEntryPoint{}, errors.New("entry point policy and metadata must be valid json") + } + item, err := 
s.store.CreateFabricEntryPoint(ctx, input) + if err != nil { + return FabricEntryPoint{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "fabric.entry_point.created", + TargetType: "fabric_entry_point", + TargetID: &item.ID, + Payload: json.RawMessage(`{"runtime_routing_enabled":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return FabricEntryPointNode{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return FabricEntryPointNode{}, err + } + input.Status = strings.TrimSpace(input.Status) + if input.Status == "" { + input.Status = "active" + } + if input.Priority <= 0 { + input.Priority = 100 + } + if input.ClusterID == "" || input.EntryPointID == "" || input.NodeID == "" || !isFabricEndpointStatus(input.Status) { + return FabricEntryPointNode{}, ErrInvalidPayload + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return FabricEntryPointNode{}, errors.New("entry point node metadata must be valid json") + } + return s.store.SetFabricEntryPointNode(ctx, input) +} + +func (s *Service) ListFabricEntryPointNodes(ctx context.Context, actorUserID, clusterID, entryPointID string) ([]FabricEntryPointNode, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + if clusterID == "" || entryPointID == "" { + return nil, ErrInvalidPayload + } + return s.store.ListFabricEntryPointNodes(ctx, clusterID, entryPointID) +} + +func (s *Service) ListFabricEgressPools(ctx context.Context, actorUserID, clusterID string) ([]FabricEgressPool, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return 
s.store.ListFabricEgressPools(ctx, clusterID) +} + +func (s *Service) CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return FabricEgressPool{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return FabricEgressPool{}, err + } + input.Name = strings.TrimSpace(input.Name) + input.Status = strings.TrimSpace(input.Status) + if input.Status == "" { + input.Status = "active" + } + if input.ClusterID == "" || input.Name == "" || !isFabricEndpointStatus(input.Status) { + return FabricEgressPool{}, ErrInvalidPayload + } + if input.Description != nil { + trimmed := strings.TrimSpace(*input.Description) + if trimmed == "" { + input.Description = nil + } else { + input.Description = &trimmed + } + } + input.RouteScope = defaultJSON(input.RouteScope, `{}`) + input.Policy = defaultJSON(input.Policy, `{}`) + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.RouteScope) || !json.Valid(input.Policy) || !json.Valid(input.Metadata) { + return FabricEgressPool{}, errors.New("egress pool route_scope, policy, and metadata must be valid json") + } + item, err := s.store.CreateFabricEgressPool(ctx, input) + if err != nil { + return FabricEgressPool{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "fabric.egress_pool.created", + TargetType: "fabric_egress_pool", + TargetID: &item.ID, + Payload: json.RawMessage(`{"runtime_routing_enabled":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) SetFabricEgressPoolNode(ctx context.Context, input SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return FabricEgressPoolNode{}, err + } + if err := s.ensureClusterMutable(ctx, 
input.ActorUserID, input.ClusterID); err != nil { + return FabricEgressPoolNode{}, err + } + input.Status = strings.TrimSpace(input.Status) + if input.Status == "" { + input.Status = "active" + } + if input.Priority <= 0 { + input.Priority = 100 + } + if input.ClusterID == "" || input.EgressPoolID == "" || input.NodeID == "" || !isFabricEndpointStatus(input.Status) { + return FabricEgressPoolNode{}, ErrInvalidPayload + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return FabricEgressPoolNode{}, errors.New("egress pool node metadata must be valid json") + } + return s.store.SetFabricEgressPoolNode(ctx, input) +} + +func (s *Service) ListFabricEgressPoolNodes(ctx context.Context, actorUserID, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + if clusterID == "" || egressPoolID == "" { + return nil, ErrInvalidPayload + } + return s.store.ListFabricEgressPoolNodes(ctx, clusterID, egressPoolID) +} + +func (s *Service) GetClusterAuthorityState(ctx context.Context, actorUserID, clusterID string) (ClusterAuthorityState, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return ClusterAuthorityState{}, err + } + return s.store.GetClusterAuthorityState(ctx, clusterID) +} + +func (s *Service) UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error) { + role, err := s.store.GetPlatformRole(ctx, strings.TrimSpace(input.ActorUserID)) + if err != nil { + return ClusterAuthorityState{}, err + } + if !isPlatformAdminRole(role) { + return ClusterAuthorityState{}, ErrAccessDenied + } + if input.MutationMode == "recovery_override" && role != PlatformRoleRecoveryAdmin { + return ClusterAuthorityState{}, ErrAccessDenied + } + if input.AuthorityState == "" { + input.AuthorityState = "authoritative" + } + if input.MutationMode == "" { + input.MutationMode = 
"normal" + } + item, err := s.store.UpdateClusterAuthorityState(ctx, input) + if err != nil { + return ClusterAuthorityState{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "cluster_authority.updated", + TargetType: "cluster", + TargetID: &input.ClusterID, + Payload: json.RawMessage(`{"split_brain_guard":true}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListClusterAdminSummaries(ctx context.Context, actorUserID string) ([]ClusterAdminSummary, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListClusterAdminSummaries(ctx) +} + +func (s *Service) CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return VPNConnection{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return VPNConnection{}, err + } + input.Name = strings.TrimSpace(input.Name) + input.ProtocolFamily = strings.TrimSpace(input.ProtocolFamily) + if input.ProtocolFamily == "" { + input.ProtocolFamily = "generic" + } + input.Mode = strings.TrimSpace(input.Mode) + if input.Mode == "" { + input.Mode = VPNConnectionModeSingleActive + } + input.DesiredState = strings.TrimSpace(input.DesiredState) + if input.DesiredState == "" { + input.DesiredState = VPNConnectionDesiredDisabled + } + if input.ClusterID == "" || input.OrganizationID == "" || input.Name == "" { + return VPNConnection{}, ErrInvalidPayload + } + if input.Mode != VPNConnectionModeSingleActive { + return VPNConnection{}, errors.New("vpn connection mode must be single_active") + } + if !isAllowedVPNDesiredState(input.DesiredState) { + return VPNConnection{}, errors.New("vpn connection desired_state must be enabled or disabled") + } + input.TargetEndpoint = defaultJSON(input.TargetEndpoint, `{}`) + 
input.AllowedNodePolicy = defaultJSON(input.AllowedNodePolicy, `{"mode":"explicit","node_ids":[]}`) + input.RoutingUsage = defaultJSON(input.RoutingUsage, `[]`) + input.RoutePolicy = defaultJSON(input.RoutePolicy, `{}`) + input.QoSPolicy = defaultJSON(input.QoSPolicy, `{}`) + input.PlacementPolicy = defaultJSON(input.PlacementPolicy, `{}`) + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.TargetEndpoint) || + !json.Valid(input.AllowedNodePolicy) || + !json.Valid(input.RoutingUsage) || + !json.Valid(input.RoutePolicy) || + !json.Valid(input.QoSPolicy) || + !json.Valid(input.PlacementPolicy) || + !json.Valid(input.Metadata) { + return VPNConnection{}, errors.New("vpn connection json fields must be valid json") + } + item, err := s.store.CreateVPNConnection(ctx, input) + if err != nil { + return VPNConnection{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.created", + TargetType: "vpn_connection", + TargetID: &item.ID, + Payload: json.RawMessage(`{"runtime_created":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListVPNConnections(ctx context.Context, actorUserID, clusterID string) ([]VPNConnection, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListVPNConnections(ctx, clusterID) +} + +func (s *Service) GetVPNConnection(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnection, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return VPNConnection{}, err + } + item, err := s.store.GetVPNConnection(ctx, clusterID, vpnConnectionID) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnection{}, ErrInvalidVPNConnection + } + return item, err +} + +func (s *Service) UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error) { + if 
err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return VPNConnection{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return VPNConnection{}, err + } + input.DesiredState = strings.TrimSpace(input.DesiredState) + if !isAllowedVPNDesiredState(input.DesiredState) { + return VPNConnection{}, errors.New("vpn connection desired_state must be enabled or disabled") + } + item, err := s.store.UpdateVPNConnectionDesiredState(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnection{}, ErrInvalidVPNConnection + } + if err != nil { + return VPNConnection{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.desired_state_changed", + TargetType: "vpn_connection", + TargetID: &input.VPNConnectionID, + Payload: json.RawMessage(`{"runtime_executed":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return VPNConnectionRoutePolicy{}, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return VPNConnectionRoutePolicy{}, err + } + input.RouteType = strings.TrimSpace(input.RouteType) + input.Destination = strings.TrimSpace(input.Destination) + input.Action = strings.TrimSpace(input.Action) + input.Status = strings.TrimSpace(input.Status) + if input.Action == "" { + input.Action = "allow" + } + if input.Status == "" { + input.Status = "active" + } + if input.Priority == 0 { + input.Priority = 100 + } + if input.ClusterID == "" || input.VPNConnectionID == "" || input.RouteType == "" || input.Destination == "" { + return VPNConnectionRoutePolicy{}, ErrInvalidPayload + } + if !isAllowedVPNRouteType(input.RouteType) || 
!isAllowedVPNRouteAction(input.Action) || !isAllowedVPNPolicyStatus(input.Status) { + return VPNConnectionRoutePolicy{}, ErrInvalidPayload + } + input.Policy = defaultJSON(input.Policy, `{}`) + if !json.Valid(input.Policy) { + return VPNConnectionRoutePolicy{}, errors.New("vpn route policy json must be valid json") + } + item, err := s.store.UpsertVPNConnectionRoutePolicy(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionRoutePolicy{}, ErrInvalidVPNConnection + } + if err != nil { + return VPNConnectionRoutePolicy{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.route_policy_changed", + TargetType: "vpn_connection", + TargetID: &input.VPNConnectionID, + Payload: json.RawMessage(`{"routing_runtime_changed":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListVPNConnectionRoutePolicies(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListVPNConnectionRoutePolicies(ctx, clusterID, vpnConnectionID) +} + +func (s *Service) SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return nil, err + } + if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { + return nil, err + } + input.RolePreference = strings.TrimSpace(input.RolePreference) + if input.RolePreference == "" { + input.RolePreference = "candidate" + } + if input.ClusterID == "" || input.VPNConnectionID == "" { + return nil, ErrInvalidPayload + } + if !isAllowedVPNNodePreference(input.RolePreference) { + return nil, ErrInvalidPayload + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if 
!json.Valid(input.Metadata) { + return nil, errors.New("allowed node metadata must be valid json") + } + nodes := make([]string, 0, len(input.NodeIDs)) + seen := map[string]struct{}{} + for _, nodeID := range input.NodeIDs { + nodeID = strings.TrimSpace(nodeID) + if nodeID == "" { + continue + } + if _, ok := seen[nodeID]; ok { + continue + } + seen[nodeID] = struct{}{} + nodes = append(nodes, nodeID) + } + input.NodeIDs = nodes + items, err := s.store.SetVPNConnectionAllowedNodes(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return nil, ErrInvalidVPNConnection + } + if err != nil { + return nil, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.allowed_nodes_changed", + TargetType: "vpn_connection", + TargetID: &input.VPNConnectionID, + Payload: json.RawMessage(`{"node_runtime_changed":false}`), + CreatedAt: s.now(), + }) + return items, nil +} + +func (s *Service) ListVPNConnectionAllowedNodes(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListVPNConnectionAllowedNodes(ctx, clusterID, vpnConnectionID) +} + +func (s *Service) AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput) (VPNConnectionLease, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return VPNConnectionLease{}, err + } + if input.ClusterID == "" || input.VPNConnectionID == "" || input.OwnerNodeID == "" { + return VPNConnectionLease{}, ErrInvalidPayload + } + conn, err := s.store.GetVPNConnection(ctx, input.ClusterID, input.VPNConnectionID) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, ErrInvalidVPNConnection + } + if err != nil { + return VPNConnectionLease{}, err + } + if conn.Mode != VPNConnectionModeSingleActive || conn.DesiredState != 
VPNConnectionDesiredEnabled { + return VPNConnectionLease{}, errors.New("vpn connection must be enabled single_active before lease acquisition") + } + if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil { + return VPNConnectionLease{}, err + } + if input.TTL <= 0 { + input.TTL = 30 * time.Second + } + input.Metadata = defaultJSON(input.Metadata, `{}`) + if !json.Valid(input.Metadata) { + return VPNConnectionLease{}, errors.New("lease metadata must be valid json") + } + token, err := generateFencingToken() + if err != nil { + return VPNConnectionLease{}, err + } + item, err := s.store.AcquireVPNConnectionLease(ctx, input, s.now().Add(input.TTL), token) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, ErrInvalidVPNLease + } + if errors.Is(err, ErrVPNLeaseAlreadyActive) { + return VPNConnectionLease{}, ErrVPNLeaseAlreadyActive + } + if err != nil { + return VPNConnectionLease{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.lease_acquired", + TargetType: "vpn_connection", + TargetID: &input.VPNConnectionID, + Payload: json.RawMessage(`{"vpn_runtime_started":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput) (VPNConnectionLease, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return VPNConnectionLease{}, err + } + if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" || input.FencingToken == "" { + return VPNConnectionLease{}, ErrInvalidPayload + } + if input.TTL <= 0 { + input.TTL = 30 * time.Second + } + if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil { + return VPNConnectionLease{}, err + } + item, err := 
s.store.RenewVPNConnectionLease(ctx, input, s.now().Add(input.TTL)) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, ErrInvalidVPNLease + } + if err != nil { + return VPNConnectionLease{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.lease_renewed", + TargetType: "vpn_connection", + TargetID: &input.VPNConnectionID, + Payload: json.RawMessage(`{"vpn_runtime_changed":false}`), + CreatedAt: s.now(), + }) + return item, err +} + +func (s *Service) ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return VPNConnectionLease{}, err + } + if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" || input.FencingToken == "" { + return VPNConnectionLease{}, ErrInvalidPayload + } + item, err := s.store.ReleaseVPNConnectionLease(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, ErrInvalidVPNLease + } + if err != nil { + return VPNConnectionLease{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.lease_released", + TargetType: "vpn_connection", + TargetID: &input.VPNConnectionID, + Payload: json.RawMessage(`{"vpn_runtime_stopped":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error) { + if err := s.ensurePlatformRecoveryAdmin(ctx, input.ActorUserID); err != nil { + return VPNConnectionLease{}, err + } + input.Reason = strings.TrimSpace(input.Reason) + if input.Reason == "" { + input.Reason = "fenced by platform recovery administrator" + } + if input.ClusterID == "" || input.VPNConnectionID == "" || 
input.LeaseID == "" { + return VPNConnectionLease{}, ErrInvalidPayload + } + item, err := s.store.FenceVPNConnectionLease(ctx, input) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, ErrInvalidVPNLease + } + if err != nil { + return VPNConnectionLease{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.owner_fenced", + TargetType: "vpn_connection", + TargetID: &input.VPNConnectionID, + Payload: json.RawMessage(`{"split_brain_guard":true}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) GetActiveVPNConnectionLease(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnectionLease, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return VPNConnectionLease{}, err + } + item, err := s.store.GetActiveVPNConnectionLease(ctx, clusterID, vpnConnectionID) + if errors.Is(err, pgx.ErrNoRows) { + return VPNConnectionLease{}, ErrInvalidVPNLease + } + return item, err +} + +func (s *Service) ExpireStaleVPNConnectionLeases(ctx context.Context, input ExpireStaleVPNConnectionLeasesInput) ([]VPNConnectionLease, error) { + if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { + return nil, err + } + if input.ClusterID == "" { + return nil, ErrInvalidPayload + } + items, err := s.store.ExpireStaleVPNConnectionLeases(ctx, input.ClusterID, s.now()) + if err != nil { + return nil, err + } + for _, item := range items { + vpnConnectionID := item.VPNConnectionID + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + ActorUserID: &input.ActorUserID, + EventType: "vpn_connection.lease_expired", + TargetType: "vpn_connection", + TargetID: &vpnConnectionID, + Payload: json.RawMessage(`{"stale_reclamation":true,"vpn_runtime_changed":false}`), + CreatedAt: s.now(), + }) + } + return items, nil +} + +func (s *Service) ListNodeVPNAssignments(ctx 
context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) { + clusterID = strings.TrimSpace(clusterID) + nodeID = strings.TrimSpace(nodeID) + if clusterID == "" || nodeID == "" { + return nil, ErrInvalidPayload + } + return s.store.ListNodeVPNAssignments(ctx, clusterID, nodeID) +} + +func (s *Service) ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error) { + input.ClusterID = strings.TrimSpace(input.ClusterID) + input.NodeID = strings.TrimSpace(input.NodeID) + input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID) + input.ObservedStatus = strings.TrimSpace(input.ObservedStatus) + if input.ClusterID == "" || input.NodeID == "" || input.VPNConnectionID == "" { + return NodeVPNAssignmentStatus{}, ErrInvalidPayload + } + if input.ObservedStatus == "" { + input.ObservedStatus = VPNAssignmentStatusUnknown + } + if !isAllowedVPNAssignmentStatus(input.ObservedStatus) { + return NodeVPNAssignmentStatus{}, ErrInvalidPayload + } + input.StatusPayload = defaultJSON(input.StatusPayload, `{}`) + if !json.Valid(input.StatusPayload) { + return NodeVPNAssignmentStatus{}, errors.New("status_payload must be valid json") + } + if input.ObservedAt.IsZero() { + input.ObservedAt = s.now() + } + + assignments, err := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.NodeID) + if err != nil { + return NodeVPNAssignmentStatus{}, err + } + visible := false + for _, assignment := range assignments { + if assignment.VPNConnectionID == input.VPNConnectionID { + visible = true + break + } + } + if !visible { + return NodeVPNAssignmentStatus{}, ErrVPNLeaseOwnerNotAllowed + } + + item, err := s.store.ReportNodeVPNAssignmentStatus(ctx, input) + if err != nil { + return NodeVPNAssignmentStatus{}, err + } + _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ + ClusterID: &input.ClusterID, + EventType: "vpn_connection.assignment_status_reported", + TargetType: "vpn_connection", + TargetID: 
&input.VPNConnectionID, + Payload: json.RawMessage(`{"node_agent_runtime_executed":false}`), + CreatedAt: s.now(), + }) + return item, nil +} + +func (s *Service) ListAuditEvents(ctx context.Context, actorUserID, clusterID string, limit int) ([]ClusterAuditEvent, error) { + if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { + return nil, err + } + return s.store.ListAuditEvents(ctx, clusterID, limit) +} + +func (s *Service) ensurePlatformAdmin(ctx context.Context, userID string) error { + userID = strings.TrimSpace(userID) + if userID == "" { + return ErrAccessDenied + } + role, err := s.store.GetPlatformRole(ctx, userID) + if err != nil { + return err + } + if !isPlatformAdminRole(role) { + return ErrAccessDenied + } + return nil +} + +func (s *Service) ensurePlatformRecoveryAdmin(ctx context.Context, userID string) error { + userID = strings.TrimSpace(userID) + if userID == "" { + return ErrAccessDenied + } + role, err := s.store.GetPlatformRole(ctx, userID) + if err != nil { + return err + } + if role != PlatformRoleRecoveryAdmin { + return ErrAccessDenied + } + return nil +} + +func (s *Service) ensureClusterMutable(ctx context.Context, actorUserID, clusterID string) error { + role, err := s.store.GetPlatformRole(ctx, strings.TrimSpace(actorUserID)) + if err != nil { + return err + } + if role == PlatformRoleRecoveryAdmin { + return nil + } + state, err := s.store.GetClusterAuthorityState(ctx, clusterID) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return nil + } + return err + } + if state.AuthorityState != "authoritative" || state.MutationMode != "normal" { + return ErrClusterReadOnly + } + return nil +} + +func (s *Service) ensureVPNLeaseOwnerEligible(ctx context.Context, clusterID, vpnConnectionID, ownerNodeID string) error { + eligibility, err := s.store.CheckVPNLeaseOwnerEligibility(ctx, clusterID, vpnConnectionID, ownerNodeID) + if errors.Is(err, pgx.ErrNoRows) { + return ErrInvalidVPNConnection + } + if err != nil { + return err 
+ } + if eligibility.MembershipStatus != "active" || eligibility.NodeRegistrationStatus != NodeRegistrationActive { + return ErrVPNLeaseOwnerNotAllowed + } + if !eligibility.AllowedByPolicy { + return ErrVPNLeaseOwnerNotAllowed + } + if !eligibility.HasAuthorizedRole { + return ErrVPNLeaseOwnerRoleRequired + } + return nil +} + +func defaultJSON(raw json.RawMessage, fallback string) json.RawMessage { + if len(raw) == 0 { + return json.RawMessage(fallback) + } + return raw +} + +func isAllowedVPNDesiredState(state string) bool { + return state == VPNConnectionDesiredEnabled || state == VPNConnectionDesiredDisabled +} + +func isAllowedVPNRouteType(routeType string) bool { + switch routeType { + case "cidr", "dns_suffix", "service", "resource": + return true + default: + return false + } +} + +func isAllowedVPNRouteAction(action string) bool { + return action == "allow" || action == "deny" +} + +func isAllowedVPNPolicyStatus(status string) bool { + return status == "active" || status == "disabled" +} + +func isFabricEndpointStatus(status string) bool { + switch status { + case "active", "disabled", "maintenance": + return true + default: + return false + } +} + +func isFabricEntryPointType(endpointType string) bool { + switch endpointType { + case "client_access", "admin", "api", "other": + return true + default: + return false + } +} + +func isAllowedVPNNodePreference(preference string) bool { + switch preference { + case "candidate", "standby", "preferred": + return true + default: + return false + } +} + +func isAllowedVPNAssignmentStatus(status string) bool { + switch status { + case VPNAssignmentStatusNotStarted, + VPNAssignmentStatusAssigned, + VPNAssignmentStatusLeaseRequired, + VPNAssignmentStatusBlocked, + VPNAssignmentStatusUnknown: + return true + default: + return false + } +} + +type syntheticRoutePolicy struct { + SyntheticEnabled bool `json:"synthetic_enabled"` + PeerEndpoints map[string]string `json:"peer_endpoints"` + PeerEndpointCandidates 
map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates"` + RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds"` + RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases"` + Hops []string `json:"hops"` + AllowedChannels []string `json:"allowed_channels"` + MaxTTL int `json:"max_ttl"` + MaxHops int `json:"max_hops"` + ExpiresAt *time.Time `json:"expires_at"` + RouteVersion string `json:"route_version"` + PolicyVersion string `json:"policy_version"` + PeerDirectoryVersion string `json:"peer_directory_version"` +} + +type heartbeatMeshEndpointReport struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + PeerEndpoint string `json:"peer_endpoint"` + Transport string `json:"transport"` + ConnectivityMode string `json:"connectivity_mode"` + NATType string `json:"nat_type"` + Region string `json:"region"` + EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates"` + ObservedAt *time.Time `json:"observed_at"` +} + +type heartbeatRendezvousLeaseReport struct { + SchemaVersion string `json:"schema_version"` + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + ObservedAt string `json:"observed_at"` + Leases []heartbeatRendezvousLeaseDetails `json:"leases"` +} + +type heartbeatRendezvousLeaseDetails struct { + LeaseID string `json:"lease_id"` + PeerNodeID string `json:"peer_node_id"` + RelayNodeID string `json:"relay_node_id"` + RouteIDs []string `json:"route_ids"` + StaleRelay bool `json:"stale_relay"` + WithdrawalNeeded bool `json:"withdrawal_needed"` + ReselectionNeeded bool `json:"reselection_needed"` + ConnectionState string `json:"connection_state"` + Reason string `json:"reason"` +} + +type meshRouteHealthObservationMetadata struct { + ObservationType string `json:"observation_type"` + RouteID string `json:"route_id"` + RoutePathDecisionApplied bool `json:"route_path_decision_applied"` + RoutePathDecisionSelectedRelayID string 
// rendezvousRelayPolicy accumulates stale-relay feedback together with the
// lease withdrawals and relay replacements recorded against it, and can
// summarize them through report(). Its methods tolerate a nil receiver.
type rendezvousRelayPolicy struct {
	// localNodeID is stored trimmed by newRendezvousRelayPolicy.
	localNodeID string
	// now is the reference time in UTC; newRendezvousRelayPolicy substitutes
	// the current time when it is given a zero value.
	now time.Time
	// links is a private copy of the observations passed to the constructor.
	links []MeshLinkObservation
	// feedback is extended by addFeedback and scanned by staleForLease,
	// relayStale and hasStalePeer.
	feedback []rendezvousRelayFeedbackEntry
	// withdrawn is keyed by route ID, lease ID and relay node ID joined with
	// "\x00" (see recordWithdrawal).
	withdrawn map[string]RendezvousRelayPolicyDecision
	// replacements is keyed by rendezvousRelayReplacementKey.
	replacements map[string]RendezvousRelayPolicyDecision
}
*Service) syntheticRouteFromIntent(input GetNodeSyntheticMeshConfigInput, intent MeshRouteIntent) (SyntheticMeshRouteConfig, map[string]string, map[string][]PeerEndpointCandidate, []PeerRecoverySeed, []PeerRendezvousLease, bool) { + if intent.Status != "active" { + return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false + } + var policy syntheticRoutePolicy + if err := json.Unmarshal(intent.Policy, &policy); err != nil { + return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false + } + if !policy.SyntheticEnabled { + return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false + } + var source nodeSelector + var destination nodeSelector + _ = json.Unmarshal(intent.SourceSelector, &source) + _ = json.Unmarshal(intent.DestinationSelector, &destination) + sourceNodeID := firstNodeID(source) + destinationNodeID := firstNodeID(destination) + hops := append([]string{}, policy.Hops...) + if len(hops) == 0 && sourceNodeID != "" && destinationNodeID != "" { + hops = []string{sourceNodeID, destinationNodeID} + } + if len(hops) < 2 || !containsString(hops, input.NodeID) { + return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false + } + if err := validatePeerEndpointCandidates(policy.PeerEndpointCandidates, hops); err != nil { + return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false + } + if err := validatePeerRecoverySeeds(policy.RecoverySeeds); err != nil { + return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false + } + if err := validatePeerRendezvousLeases(policy.RendezvousLeases, hops, s.now()); err != nil { + return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false + } + if sourceNodeID == "" { + sourceNodeID = hops[0] + } + if destinationNodeID == "" { + destinationNodeID = hops[len(hops)-1] + } + expiresAt := s.now().UTC().Add(5 * time.Minute) + if policy.ExpiresAt != nil { + expiresAt = policy.ExpiresAt.UTC() + } + allowedChannels := policy.AllowedChannels + if len(allowedChannels) == 0 { + allowedChannels = []string{"fabric_control", 
"route_control"} + } + maxTTL := policy.MaxTTL + if maxTTL <= 0 { + maxTTL = 8 + } + maxHops := policy.MaxHops + if maxHops <= 0 { + maxHops = 8 + } + routeVersion := policy.RouteVersion + if routeVersion == "" { + routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339) + } + policyVersion := policy.PolicyVersion + if policyVersion == "" { + policyVersion = routeVersion + } + peerDirectoryVersion := policy.PeerDirectoryVersion + if peerDirectoryVersion == "" { + peerDirectoryVersion = routeVersion + } + route := SyntheticMeshRouteConfig{ + RouteID: intent.ID, + ClusterID: input.ClusterID, + SourceNodeID: sourceNodeID, + DestinationNodeID: destinationNodeID, + Hops: hops, + AllowedChannels: allowedChannels, + ExpiresAt: expiresAt, + MaxTTL: maxTTL, + MaxHops: maxHops, + RouteVersion: routeVersion, + PolicyVersion: policyVersion, + PeerDirectoryVersion: peerDirectoryVersion, + } + return route, + scopedPeerEndpoints(policy.PeerEndpoints, hops), + scopedPeerEndpointCandidates(policy.PeerEndpointCandidates, hops), + policy.RecoverySeeds, + normalizeRendezvousLeases(policy.RendezvousLeases, route, s.now()), + true +} + +func (s *Service) reportedEndpointConfig(ctx context.Context, clusterID string, localNodeID string, routePath []string) (map[string]string, map[string][]PeerEndpointCandidate, error) { + peers := map[string]string{} + candidates := map[string][]PeerEndpointCandidate{} + for _, nodeID := range routePath { + nodeID = strings.TrimSpace(nodeID) + if nodeID == "" || nodeID == localNodeID { + continue + } + heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1) + if err != nil { + return nil, nil, err + } + if len(heartbeats) == 0 { + continue + } + peerEndpoint, nodeCandidates, ok := endpointReportFromHeartbeat(heartbeats[0]) + if !ok { + continue + } + if peerEndpoint != "" { + peers[nodeID] = peerEndpoint + } + if len(nodeCandidates) > 0 { + candidates[nodeID] = append(candidates[nodeID], nodeCandidates...) 
+ } + } + return peers, candidates, nil +} + +func endpointReportFromHeartbeat(heartbeat NodeHeartbeat) (string, []PeerEndpointCandidate, bool) { + var metadata struct { + MeshEndpointReport heartbeatMeshEndpointReport `json:"mesh_endpoint_report"` + } + if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { + return "", nil, false + } + if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil { + return "", nil, false + } + report := metadata.MeshEndpointReport + if report.NodeID != "" && report.NodeID != heartbeat.NodeID { + return "", nil, false + } + if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID { + return "", nil, false + } + nodeID := heartbeat.NodeID + peerEndpoint := strings.TrimSpace(report.PeerEndpoint) + out := make([]PeerEndpointCandidate, 0, len(report.EndpointCandidates)) + for _, candidate := range report.EndpointCandidates { + if candidate.NodeID == "" { + candidate.NodeID = nodeID + } + if candidate.EndpointID == "" { + candidate.EndpointID = nodeID + "-reported" + } + if candidate.Address == "" { + candidate.Address = peerEndpoint + } + if candidate.Transport == "" { + candidate.Transport = report.Transport + } + if candidate.ConnectivityMode == "" { + candidate.ConnectivityMode = report.ConnectivityMode + } + if candidate.NATType == "" { + candidate.NATType = report.NATType + } + if candidate.Region == "" { + candidate.Region = report.Region + } + if candidate.Reachability == "" { + candidate.Reachability = reachabilityFromConnectivityMode(candidate.ConnectivityMode) + } + if candidate.Metadata == nil { + candidate.Metadata = json.RawMessage(`{}`) + } + if candidate.NodeID != nodeID { + return "", nil, false + } + out = append(out, candidate) + } + if len(out) > 0 { + if err := validatePeerEndpointCandidates(map[string][]PeerEndpointCandidate{nodeID: out}, []string{nodeID}); err != nil { + return "", nil, false + } + } + return peerEndpoint, out, peerEndpoint != "" || len(out) > 0 +} + +func (s 
*Service) rendezvousRelayFeedback(ctx context.Context, clusterID string, routePath []string, now time.Time) ([]rendezvousRelayFeedbackEntry, error) { + out := []rendezvousRelayFeedbackEntry{} + seenNodes := map[string]struct{}{} + for _, nodeID := range routePath { + nodeID = strings.TrimSpace(nodeID) + if nodeID == "" { + continue + } + if _, duplicate := seenNodes[nodeID]; duplicate { + continue + } + seenNodes[nodeID] = struct{}{} + heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1) + if err != nil { + return nil, err + } + if len(heartbeats) == 0 { + continue + } + out = append(out, rendezvousRelayFeedbackFromHeartbeat(heartbeats[0], now)...) + } + return out, nil +} + +func rendezvousRelayFeedbackFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) []rendezvousRelayFeedbackEntry { + if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { + return nil + } + if now.IsZero() { + now = time.Now().UTC() + } else { + now = now.UTC() + } + if heartbeat.ObservedAt.IsZero() || + heartbeat.ObservedAt.After(now.Add(time.Minute)) || + now.Sub(heartbeat.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge { + return nil + } + var metadata struct { + MeshRendezvousLeaseReport heartbeatRendezvousLeaseReport `json:"mesh_rendezvous_lease_report"` + } + if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil { + return nil + } + report := metadata.MeshRendezvousLeaseReport + if report.NodeID != "" && report.NodeID != heartbeat.NodeID { + return nil + } + if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID { + return nil + } + out := []rendezvousRelayFeedbackEntry{} + for _, lease := range report.Leases { + if !lease.StaleRelay && !lease.WithdrawalNeeded && !lease.ReselectionNeeded { + continue + } + if strings.TrimSpace(lease.PeerNodeID) == "" || strings.TrimSpace(lease.RelayNodeID) == "" { + continue + } + out = append(out, rendezvousRelayFeedbackEntry{ + ReporterNodeID: heartbeat.NodeID, + RouteIDs: 
append([]string{}, lease.RouteIDs...), + LeaseID: strings.TrimSpace(lease.LeaseID), + PeerNodeID: strings.TrimSpace(lease.PeerNodeID), + RelayNodeID: strings.TrimSpace(lease.RelayNodeID), + ConnectionState: strings.TrimSpace(lease.ConnectionState), + Reason: strings.TrimSpace(lease.Reason), + WithdrawalNeeded: lease.WithdrawalNeeded, + ReselectionNeeded: lease.ReselectionNeeded, + ObservedAt: heartbeat.ObservedAt.UTC(), + }) + } + return out +} + +func (s *Service) rendezvousRelayReplacementHints(ctx context.Context, clusterID string, routePath []string, now time.Time) ([]RendezvousRelayPolicyDecision, error) { + out := []RendezvousRelayPolicyDecision{} + seenNodes := map[string]struct{}{} + for _, nodeID := range routePath { + nodeID = strings.TrimSpace(nodeID) + if nodeID == "" { + continue + } + if _, duplicate := seenNodes[nodeID]; duplicate { + continue + } + seenNodes[nodeID] = struct{}{} + heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1) + if err != nil { + return nil, err + } + if len(heartbeats) == 0 { + continue + } + out = append(out, rendezvousRelayReplacementHintsFromHeartbeat(heartbeats[0], now)...) 
+ } + return out, nil +} + +func rendezvousRelayReplacementHintsFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) []RendezvousRelayPolicyDecision { + if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { + return nil + } + if now.IsZero() { + now = time.Now().UTC() + } else { + now = now.UTC() + } + if heartbeat.ObservedAt.IsZero() || + heartbeat.ObservedAt.After(now.Add(time.Minute)) || + now.Sub(heartbeat.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge { + return nil + } + var metadata struct { + MeshRoutePathDecisionReport struct { + ClusterID string `json:"cluster_id"` + NodeID string `json:"node_id"` + Decisions []RoutePathDecision `json:"decisions"` + } `json:"mesh_route_path_decision_report"` + } + if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil { + return nil + } + report := metadata.MeshRoutePathDecisionReport + if report.NodeID != "" && report.NodeID != heartbeat.NodeID { + return nil + } + if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID { + return nil + } + out := []RendezvousRelayPolicyDecision{} + for _, decision := range report.Decisions { + if strings.TrimSpace(decision.RouteID) == "" || + decision.DecisionSource != "stale_relay_replacement" || + strings.TrimSpace(decision.SelectedRelayID) == "" || + strings.TrimSpace(decision.StaleRelayNodeID) == "" || + decision.ProductionForwarding || + !decision.ControlPlaneOnly || + (!decision.ExpiresAt.IsZero() && !decision.ExpiresAt.After(now)) { + continue + } + peerNodeID := strings.TrimSpace(decision.RendezvousPeerNodeID) + if peerNodeID == "" { + peerNodeID = replacementPeerNodeIDFromDecision(decision) + } + if peerNodeID == "" { + continue + } + out = append(out, RendezvousRelayPolicyDecision{ + RouteID: strings.TrimSpace(decision.RouteID), + PeerNodeID: peerNodeID, + StaleRelayNodeID: strings.TrimSpace(decision.StaleRelayNodeID), + SelectedRelayID: strings.TrimSpace(decision.SelectedRelayID), + SelectedEndpoint: 
strings.TrimRight(strings.TrimSpace(decision.SelectedRelayEndpoint), "/"), + Score: decision.PathScore, + Reason: "stale_relay_replacement", + ScoreReasons: append([]string{}, decision.ScoreReasons...), + ReporterNodeID: heartbeat.NodeID, + }) + } + return out +} + +func replacementPeerNodeIDFromDecision(decision RoutePathDecision) string { + effectiveHops := cleanRouteNodePath(decision.EffectiveHops) + selectedRelayID := strings.TrimSpace(decision.SelectedRelayID) + for index, nodeID := range effectiveHops { + if nodeID == selectedRelayID && index+1 < len(effectiveHops) { + return effectiveHops[index+1] + } + } + return strings.TrimSpace(decision.DestinationNodeID) +} + +func replacementHintFeedback(hints []RendezvousRelayPolicyDecision, now time.Time) []rendezvousRelayFeedbackEntry { + if len(hints) == 0 { + return nil + } + if now.IsZero() { + now = time.Now().UTC() + } else { + now = now.UTC() + } + out := make([]rendezvousRelayFeedbackEntry, 0, len(hints)) + for _, hint := range hints { + if strings.TrimSpace(hint.RouteID) == "" || + strings.TrimSpace(hint.PeerNodeID) == "" || + strings.TrimSpace(hint.StaleRelayNodeID) == "" || + strings.TrimSpace(hint.SelectedRelayID) == "" { + continue + } + out = append(out, rendezvousRelayFeedbackEntry{ + ReporterNodeID: strings.TrimSpace(hint.ReporterNodeID), + RouteIDs: []string{strings.TrimSpace(hint.RouteID)}, + PeerNodeID: strings.TrimSpace(hint.PeerNodeID), + RelayNodeID: strings.TrimSpace(hint.StaleRelayNodeID), + ConnectionState: "replacement_hint", + Reason: "stale_relay_replacement_hint", + WithdrawalNeeded: true, + ReselectionNeeded: true, + ObservedAt: now, + }) + } + return out +} + +func rendezvousRelayRouteHealthFeedback(localNodeID string, route SyntheticMeshRouteConfig, links []MeshLinkObservation, now time.Time) []rendezvousRelayFeedbackEntry { + out := []rendezvousRelayFeedbackEntry{} + for _, link := range links { + item, ok := rendezvousRelayRouteHealthFeedbackFromLink(localNodeID, route, link, now) + 
if ok { + out = append(out, item) + } + } + return out +} + +func rendezvousRelayRouteHealthFeedbackFromLink(localNodeID string, route SyntheticMeshRouteConfig, link MeshLinkObservation, now time.Time) (rendezvousRelayFeedbackEntry, bool) { + localNodeID = strings.TrimSpace(localNodeID) + if localNodeID == "" || link.SourceNodeID != localNodeID || strings.TrimSpace(route.RouteID) == "" { + return rendezvousRelayFeedbackEntry{}, false + } + if !meshLinkObservationFresh(link, now) { + return rendezvousRelayFeedbackEntry{}, false + } + metadata, ok := routeHealthMetadataFromLink(link) + if !ok || + metadata.ObservationType != "synthetic_route_health" || + strings.TrimSpace(metadata.RouteID) != route.RouteID || + !metadata.RoutePathDecisionApplied || + metadata.ProductionForwarding || + metadata.ProductionPayloadForwarding || + metadata.RouteHealthProductionPayloadForwarding || + metadata.RouteHealthServicePayloadForwarding { + return rendezvousRelayFeedbackEntry{}, false + } + selectedRelayID := strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID) + if selectedRelayID == "" { + return rendezvousRelayFeedbackEntry{}, false + } + reason := "" + switch { + case metadata.RoutePathDriftDetected: + reason = "synthetic_route_health_drift" + case link.LinkStatus == "unreachable": + reason = "synthetic_route_health_unreachable" + case strings.TrimSpace(metadata.FailureReason) != "": + reason = "synthetic_route_health_failure" + default: + return rendezvousRelayFeedbackEntry{}, false + } + peerNodeID := routeHealthPeerNodeID(metadata, route, link.TargetNodeID) + if peerNodeID == "" { + return rendezvousRelayFeedbackEntry{}, false + } + return rendezvousRelayFeedbackEntry{ + ReporterNodeID: link.SourceNodeID, + RouteIDs: []string{route.RouteID}, + LeaseID: strings.TrimSpace(metadata.RoutePathDecisionRendezvousLeaseID), + PeerNodeID: peerNodeID, + RelayNodeID: selectedRelayID, + ConnectionState: reason, + Reason: reason, + WithdrawalNeeded: true, + ReselectionNeeded: 
// nodeAfterInPath returns the element of path that directly follows the first
// non-final occurrence of nodeID. nodeID is trimmed before comparison; path
// elements are compared as they are. The result is "" when nodeID is blank,
// absent from path, or present only as the last element.
func nodeAfterInPath(path []string, nodeID string) string {
	target := strings.TrimSpace(nodeID)
	if target == "" {
		return ""
	}
	// Stop one short of the end: the last element has no successor.
	for i := 0; i+1 < len(path); i++ {
		if path[i] == target {
			return path[i+1]
		}
	}
	return ""
}
time.Now().UTC() + } else { + now = now.UTC() + } + return &rendezvousRelayPolicy{ + localNodeID: strings.TrimSpace(localNodeID), + now: now, + links: append([]MeshLinkObservation{}, links...), + withdrawn: map[string]RendezvousRelayPolicyDecision{}, + replacements: map[string]RendezvousRelayPolicyDecision{}, + } +} + +func (p *rendezvousRelayPolicy) addFeedback(items []rendezvousRelayFeedbackEntry) { + if p == nil { + return + } + p.feedback = append(p.feedback, items...) +} + +func (p *rendezvousRelayPolicy) staleForLease(routeID string, lease PeerRendezvousLease) (rendezvousRelayFeedbackEntry, bool) { + if p == nil { + return rendezvousRelayFeedbackEntry{}, false + } + for _, item := range p.feedback { + if !rendezvousFeedbackAppliesToRoute(item, routeID) { + continue + } + if item.LeaseID != "" && lease.LeaseID != "" && item.LeaseID == lease.LeaseID { + return item, true + } + if item.PeerNodeID == lease.PeerNodeID && item.RelayNodeID == lease.RelayNodeID { + return item, true + } + } + return rendezvousRelayFeedbackEntry{}, false +} + +func (p *rendezvousRelayPolicy) relayStale(routeID string, peerNodeID string, relayNodeID string) (rendezvousRelayFeedbackEntry, bool) { + if p == nil { + return rendezvousRelayFeedbackEntry{}, false + } + for _, item := range p.feedback { + if item.PeerNodeID == peerNodeID && + item.RelayNodeID == relayNodeID && + rendezvousFeedbackAppliesToRoute(item, routeID) { + return item, true + } + } + return rendezvousRelayFeedbackEntry{}, false +} + +func (p *rendezvousRelayPolicy) hasStalePeer(routeID string, peerNodeID string) (rendezvousRelayFeedbackEntry, bool) { + if p == nil { + return rendezvousRelayFeedbackEntry{}, false + } + for _, item := range p.feedback { + if item.PeerNodeID == peerNodeID && rendezvousFeedbackAppliesToRoute(item, routeID) { + return item, true + } + } + return rendezvousRelayFeedbackEntry{}, false +} + +func (p *rendezvousRelayPolicy) recordWithdrawal(route SyntheticMeshRouteConfig, lease 
PeerRendezvousLease, feedback rendezvousRelayFeedbackEntry) { + if p == nil { + return + } + key := route.RouteID + "\x00" + lease.LeaseID + "\x00" + lease.RelayNodeID + p.withdrawn[key] = RendezvousRelayPolicyDecision{ + RouteID: route.RouteID, + PeerNodeID: lease.PeerNodeID, + WithdrawnLeaseID: lease.LeaseID, + StaleRelayNodeID: lease.RelayNodeID, + Reason: "stale_relay_withdrawn", + ReporterNodeID: feedback.ReporterNodeID, + } +} + +func (p *rendezvousRelayPolicy) recordReplacement(route SyntheticMeshRouteConfig, peerNodeID string, feedback rendezvousRelayFeedbackEntry, selection rendezvousRelaySelection) { + if p == nil || selection.RelayNodeID == "" { + return + } + key := rendezvousRelayReplacementKey(route.RouteID, peerNodeID, feedback.RelayNodeID, selection.RelayNodeID) + p.replacements[key] = RendezvousRelayPolicyDecision{ + RouteID: route.RouteID, + PeerNodeID: peerNodeID, + StaleRelayNodeID: feedback.RelayNodeID, + SelectedRelayID: selection.RelayNodeID, + SelectedEndpoint: selection.Endpoint, + Score: selection.Score, + Reason: "stale_relay_replacement", + ScoreReasons: append([]string{}, selection.Reasons...), + ReporterNodeID: feedback.ReporterNodeID, + } +} + +func (p *rendezvousRelayPolicy) addReplacementHints(hints []RendezvousRelayPolicyDecision) { + if p == nil { + return + } + for _, hint := range hints { + hint.RouteID = strings.TrimSpace(hint.RouteID) + hint.PeerNodeID = strings.TrimSpace(hint.PeerNodeID) + hint.StaleRelayNodeID = strings.TrimSpace(hint.StaleRelayNodeID) + hint.SelectedRelayID = strings.TrimSpace(hint.SelectedRelayID) + hint.SelectedEndpoint = strings.TrimRight(strings.TrimSpace(hint.SelectedEndpoint), "/") + if hint.RouteID == "" || hint.PeerNodeID == "" || hint.StaleRelayNodeID == "" || hint.SelectedRelayID == "" { + continue + } + if hint.Reason == "" { + hint.Reason = "stale_relay_replacement" + } + if len(hint.ScoreReasons) == 0 { + hint.ScoreReasons = []string{"route_path_decision_hint"} + } + key := 
rendezvousRelayReplacementKey(hint.RouteID, hint.PeerNodeID, hint.StaleRelayNodeID, hint.SelectedRelayID) + existing, exists := p.replacements[key] + if !exists || hint.Score > existing.Score { + p.replacements[key] = hint + } + } +} + +func (p *rendezvousRelayPolicy) report() *RendezvousRelayPolicyReport { + if p == nil || (len(p.feedback) == 0 && len(p.withdrawn) == 0 && len(p.replacements) == 0) { + return nil + } + decisions := make([]RendezvousRelayPolicyDecision, 0, len(p.withdrawn)+len(p.replacements)) + for _, decision := range p.withdrawn { + decisions = append(decisions, decision) + } + for _, decision := range p.replacements { + decisions = append(decisions, decision) + } + sort.SliceStable(decisions, func(i, j int) bool { + if decisions[i].RouteID != decisions[j].RouteID { + return decisions[i].RouteID < decisions[j].RouteID + } + if decisions[i].PeerNodeID != decisions[j].PeerNodeID { + return decisions[i].PeerNodeID < decisions[j].PeerNodeID + } + if decisions[i].Reason != decisions[j].Reason { + return decisions[i].Reason < decisions[j].Reason + } + return decisions[i].SelectedRelayID < decisions[j].SelectedRelayID + }) + return &RendezvousRelayPolicyReport{ + SchemaVersion: "c17z15.rendezvous_relay_policy.v1", + ScoringMode: "route_adjacency_endpoint_priority_mesh_link_health_synthetic_route_health_feedback", + FeedbackMaxAgeSeconds: int(rendezvousRelayFeedbackMaxAge / time.Second), + StaleRelayCount: len(p.feedback), + WithdrawnLeaseCount: len(p.withdrawn), + ReplacementLeaseCount: len(p.replacements), + Decisions: decisions, + } +} + +func (p *rendezvousRelayPolicy) replacementDecision(routeID string, peerNodeID string, selectedRelayID string) (RendezvousRelayPolicyDecision, bool) { + if p == nil { + return RendezvousRelayPolicyDecision{}, false + } + for _, decision := range p.replacements { + if decision.RouteID == routeID && + decision.PeerNodeID == peerNodeID && + decision.SelectedRelayID == selectedRelayID { + return decision, true + } + } + 
// rendezvousRelayReplacementKey builds the map key that identifies a relay
// replacement: the four identifiers, each trimmed of surrounding whitespace,
// joined with NUL ("\x00") separators in the order route, peer, stale relay,
// selected relay.
func rendezvousRelayReplacementKey(routeID string, peerNodeID string, staleRelayNodeID string, selectedRelayID string) string {
	parts := []string{
		strings.TrimSpace(routeID),
		strings.TrimSpace(peerNodeID),
		strings.TrimSpace(staleRelayNodeID),
		strings.TrimSpace(selectedRelayID),
	}
	return strings.Join(parts, "\x00")
}
replacementLease PeerRendezvousLease + var replacementDecision RendezvousRelayPolicyDecision + replacementFound := false + for _, lease := range leases { + if !containsString(lease.RouteIDs, route.RouteID) { + continue + } + relayDecision, ok := relayPolicy.replacementDecision(route.RouteID, lease.PeerNodeID, lease.RelayNodeID) + if !ok && lease.Reason != "stale_relay_replacement" { + continue + } + if !ok { + relayDecision = RendezvousRelayPolicyDecision{ + RouteID: route.RouteID, + PeerNodeID: lease.PeerNodeID, + SelectedRelayID: lease.RelayNodeID, + SelectedEndpoint: lease.RelayEndpoint, + Reason: "stale_relay_replacement", + } + } + if !replacementFound || relayDecision.Score > replacementDecision.Score { + replacementFound = true + replacementLease = lease + replacementDecision = relayDecision + } + } + if replacementFound { + decision.DecisionID = route.RouteID + "-path-" + localNodeID + "-via-" + replacementLease.RelayNodeID + decision.EffectiveHops = effectiveRoutePathWithReplacement(route.Hops, replacementLease.PeerNodeID, replacementDecision.StaleRelayNodeID, replacementLease.RelayNodeID) + decision.SelectedRelayID = replacementLease.RelayNodeID + decision.SelectedRelayEndpoint = replacementLease.RelayEndpoint + decision.StaleRelayNodeID = replacementDecision.StaleRelayNodeID + decision.RendezvousPeerNodeID = replacementLease.PeerNodeID + decision.RendezvousLeaseID = replacementLease.LeaseID + decision.RendezvousLeaseReason = replacementLease.Reason + decision.DecisionSource = "stale_relay_replacement" + decision.PathScore = replacementDecision.Score + if decision.PathScore == 0 { + decision.PathScore = 1000 + } + decision.ScoreReasons = append([]string{}, replacementDecision.ScoreReasons...) 
// routePathLocalPosition locates localNodeID on path and describes its place.
//
// It returns the previous hop, the next hop and a role string. When the node
// is not on the path both hops are "" and the role is "withdrawn_relay" if the
// node is the (non-empty) stale relay, otherwise "not_on_effective_path". When
// the node is on the path (first occurrence wins) the role is "entry" for the
// first position, "exit" for the last, "selected_relay" for an interior node
// equal to the non-empty selectedRelayID, and "transit" otherwise. A
// single-element path therefore yields "entry".
func routePathLocalPosition(path []string, localNodeID string, selectedRelayID string, staleRelayNodeID string) (string, string, string) {
	position := -1
	for i := range path {
		if path[i] == localNodeID {
			position = i
			break
		}
	}
	if position == -1 {
		role := "not_on_effective_path"
		if staleRelayNodeID != "" && staleRelayNodeID == localNodeID {
			role = "withdrawn_relay"
		}
		return "", "", role
	}

	last := len(path) - 1
	var previous, next string
	if position > 0 {
		previous = path[position-1]
	}
	if position < last {
		next = path[position+1]
	}

	if position == 0 {
		return previous, next, "entry"
	}
	if position == last {
		return previous, next, "exit"
	}
	if selectedRelayID != "" && selectedRelayID == localNodeID {
		return previous, next, "selected_relay"
	}
	return previous, next, "transit"
}
// reachabilityFromConnectivityMode maps a connectivity mode to a reachability
// label: "outbound_only" stays "outbound_only", "relay_required" becomes
// "relay", "direct" becomes "public", and any other value (including "")
// becomes "unknown". The comparison is exact.
func reachabilityFromConnectivityMode(connectivityMode string) string {
	if connectivityMode == "outbound_only" {
		return "outbound_only"
	}
	if connectivityMode == "relay_required" {
		return "relay"
	}
	if connectivityMode == "direct" {
		return "public"
	}
	return "unknown"
}
!json.Valid(lease.Metadata)) {
+			return ErrInvalidPayload
+		}
+		// Detect duplicates on the trimmed ID: mergeRendezvousLeases keys leases
+		// by strings.TrimSpace(LeaseID), so IDs that differ only in surrounding
+		// whitespace would pass here and then overwrite each other there.
+		leaseID := strings.TrimSpace(lease.LeaseID)
+		if leaseID == "" {
+			// Leases without an ID receive a derived one in normalizeRendezvousLeases.
+			continue
+		}
+		if _, duplicate := seen[leaseID]; duplicate {
+			return ErrInvalidPayload
+		}
+		seen[leaseID] = struct{}{}
+	}
+	return nil
+}
+
+// normalizeRendezvousLeases returns copies of leases with defaults filled in
+// for the given route, keeping only leases that are still valid at now.
+//
+// For every lease it trims PeerNodeID, RelayNodeID and LeaseID, trims
+// RelayEndpoint and strips trailing slashes, derives a LeaseID when none is
+// set, defaults Transport, ConnectivityMode, Priority, Reason and Metadata,
+// makes sure route.RouteID is listed in RouteIDs, restricts AllowedChannels to
+// the control-plane channels, forces ControlPlaneOnly, converts timestamps to
+// UTC and caps ExpiresAt at the route's expiry. Leases whose resulting
+// ExpiresAt is zero or not after now are dropped.
+func normalizeRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, now time.Time) []PeerRendezvousLease {
+	out := make([]PeerRendezvousLease, 0, len(leases))
+	now = now.UTC()
+	for _, lease := range leases {
+		lease.PeerNodeID = strings.TrimSpace(lease.PeerNodeID)
+		lease.RelayNodeID = strings.TrimSpace(lease.RelayNodeID)
+		lease.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
+		// Trim before the emptiness check so a whitespace-only ID (which
+		// validation treats as "no ID") also gets the derived ID.
+		lease.LeaseID = strings.TrimSpace(lease.LeaseID)
+		if lease.LeaseID == "" {
+			lease.LeaseID = route.RouteID + "-rv-" + lease.PeerNodeID + "-via-" + lease.RelayNodeID
+		}
+		if lease.Transport == "" {
+			lease.Transport = "relay_control"
+		}
+		if lease.ConnectivityMode == "" {
+			lease.ConnectivityMode = "relay_required"
+		}
+		if lease.Priority <= 0 {
+			lease.Priority = 100
+		}
+		if len(lease.RouteIDs) == 0 {
+			lease.RouteIDs = []string{route.RouteID}
+		} else if !containsString(lease.RouteIDs, route.RouteID) {
+			// Copy before appending so the caller's slice is never modified.
+			lease.RouteIDs = append(append([]string{}, lease.RouteIDs...), route.RouteID)
+		}
+		lease.AllowedChannels = controlPlaneAllowedChannels(firstNonEmptyStringSlice(lease.AllowedChannels, route.AllowedChannels))
+		if len(lease.AllowedChannels) == 0 {
+			lease.AllowedChannels = []string{"fabric_control", "route_control"}
+		}
+		lease.ControlPlaneOnly = true
+		if lease.IssuedAt.IsZero() {
+			lease.IssuedAt = now
+		} else {
+			lease.IssuedAt = lease.IssuedAt.UTC()
+		}
+		// A lease may not outlive its route; a lease without expiry inherits the route's.
+		if lease.ExpiresAt.IsZero() || (!route.ExpiresAt.IsZero() && lease.ExpiresAt.After(route.ExpiresAt)) {
+			lease.ExpiresAt = route.ExpiresAt.UTC()
+		} else {
+			lease.ExpiresAt = lease.ExpiresAt.UTC()
+		}
+		if lease.Reason == "" {
+			lease.Reason = "policy_rendezvous_lease"
+		}
+		if lease.Metadata == nil {
+			lease.Metadata = json.RawMessage(`{}`)
+		}
+		if
!lease.ExpiresAt.IsZero() && lease.ExpiresAt.After(now) { + out = append(out, lease) + } + } + return out +} + +func scopedRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, localNodeID string, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease { + if !containsString(route.Hops, localNodeID) { + return nil + } + normalized := normalizeRendezvousLeases(leases, route, now) + out := make([]PeerRendezvousLease, 0, len(normalized)) + for _, lease := range normalized { + if feedback, stale := relayPolicy.staleForLease(route.RouteID, lease); stale { + relayPolicy.recordWithdrawal(route, lease, feedback) + continue + } + if containsString(route.Hops, lease.PeerNodeID) && containsString(route.Hops, lease.RelayNodeID) { + out = append(out, lease) + } + } + return out +} + +func derivedRendezvousLeases(route SyntheticMeshRouteConfig, peers map[string]string, candidates map[string][]PeerEndpointCandidate, localNodeID string, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease { + if !containsString(route.Hops, localNodeID) { + return nil + } + out := []PeerRendezvousLease{} + for peerNodeID, items := range candidates { + peerNodeID = strings.TrimSpace(peerNodeID) + if peerNodeID == "" || !containsString(route.Hops, peerNodeID) || !peerEndpointCandidatesRequireRendezvous(items) { + continue + } + selection := selectRendezvousRelay(route, peerNodeID, localNodeID, peers, candidates, relayPolicy) + if selection.RelayNodeID == "" || selection.Endpoint == "" { + continue + } + _, replacement := relayPolicy.hasStalePeer(route.RouteID, peerNodeID) + reason := rendezvousLeaseReason(items) + if replacement { + reason = "stale_relay_replacement" + } + lease := PeerRendezvousLease{ + LeaseID: route.RouteID + "-rv-" + peerNodeID + "-via-" + selection.RelayNodeID, + PeerNodeID: peerNodeID, + RelayNodeID: selection.RelayNodeID, + RelayEndpoint: selection.Endpoint, + Transport: "relay_control", + ConnectivityMode: 
"relay_required", + RouteIDs: []string{route.RouteID}, + AllowedChannels: controlPlaneAllowedChannels(route.AllowedChannels), + Priority: rendezvousLeasePriority(items), + ControlPlaneOnly: true, + IssuedAt: now.UTC(), + ExpiresAt: route.ExpiresAt.UTC(), + Reason: reason, + Metadata: rendezvousRelayLeaseMetadata(selection, replacement), + } + if len(lease.AllowedChannels) == 0 { + lease.AllowedChannels = []string{"fabric_control", "route_control"} + } + if lease.Priority <= 0 { + lease.Priority = 100 + } + if lease.ExpiresAt.After(now.UTC()) { + out = append(out, lease) + if feedback, ok := relayPolicy.hasStalePeer(route.RouteID, peerNodeID); ok && feedback.RelayNodeID != selection.RelayNodeID { + relayPolicy.recordReplacement(route, peerNodeID, feedback, selection) + } + } + } + return out +} + +func selectRendezvousRelay(route SyntheticMeshRouteConfig, peerNodeID string, localNodeID string, peers map[string]string, candidates map[string][]PeerEndpointCandidate, relayPolicy *rendezvousRelayPolicy) rendezvousRelaySelection { + routePath := route.Hops + peerIndex := -1 + for index, nodeID := range routePath { + if nodeID == peerNodeID { + peerIndex = index + break + } + } + preferred := []string{} + if peerIndex > 0 { + preferred = append(preferred, routePath[peerIndex-1]) + } + if peerIndex >= 0 && peerIndex < len(routePath)-1 { + preferred = append(preferred, routePath[peerIndex+1]) + } + preferred = append(preferred, routePath...) 
+ seen := map[string]struct{}{} + relayCandidates := []rendezvousRelaySelection{} + for _, relayNodeID := range preferred { + relayNodeID = strings.TrimSpace(relayNodeID) + if relayNodeID == "" || relayNodeID == peerNodeID { + continue + } + if _, duplicate := seen[relayNodeID]; duplicate { + continue + } + seen[relayNodeID] = struct{}{} + if _, stale := relayPolicy.relayStale(route.RouteID, peerNodeID, relayNodeID); stale { + continue + } + endpoint, endpointScore, endpointReasons := relayControlEndpointForNode(relayNodeID, peers, candidates) + if endpoint == "" { + continue + } + score, scoreReasons := rendezvousRelayCandidateScore(route.RouteID, routePath, peerIndex, relayNodeID, localNodeID, endpointScore, endpointReasons, relayPolicy) + relayCandidates = append(relayCandidates, rendezvousRelaySelection{ + RelayNodeID: relayNodeID, + Endpoint: endpoint, + Score: score, + Reasons: scoreReasons, + }) + } + if len(relayCandidates) == 0 { + return rendezvousRelaySelection{} + } + sort.SliceStable(relayCandidates, func(i, j int) bool { + if relayCandidates[i].Score != relayCandidates[j].Score { + return relayCandidates[i].Score > relayCandidates[j].Score + } + return relayCandidates[i].RelayNodeID < relayCandidates[j].RelayNodeID + }) + return relayCandidates[0] +} + +func relayControlEndpointForNode(nodeID string, peers map[string]string, candidates map[string][]PeerEndpointCandidate) (string, int, []string) { + if endpoint := strings.TrimRight(strings.TrimSpace(peers[nodeID]), "/"); isHTTPControlEndpoint(endpoint) { + return endpoint, 80, []string{"reported_peer_endpoint"} + } + items := append([]PeerEndpointCandidate{}, candidates[nodeID]...) 
+ sort.SliceStable(items, func(i, j int) bool { + if items[i].Priority != items[j].Priority { + return items[i].Priority < items[j].Priority + } + return items[i].EndpointID < items[j].EndpointID + }) + for _, candidate := range items { + if endpointCandidateRequiresRendezvous(candidate) { + continue + } + endpoint := strings.TrimRight(strings.TrimSpace(candidate.Address), "/") + if isHTTPControlEndpoint(endpoint) { + score := 70 + reasons := []string{"endpoint_candidate"} + if candidate.Priority > 0 { + score += maxInt(0, 50-candidate.Priority) + } + if hasPolicyTag(candidate.PolicyTags, "fast-path") { + score += 25 + reasons = append(reasons, "fast_path") + } + if hasPolicyTag(candidate.PolicyTags, "same-site") || hasPolicyTag(candidate.PolicyTags, "corp-lan") || hasPolicyTag(candidate.PolicyTags, "private-lan") { + score += 20 + reasons = append(reasons, "same_site") + } + if strings.EqualFold(candidate.ConnectivityMode, "direct") { + score += 10 + reasons = append(reasons, "direct") + } + return endpoint, score, reasons + } + } + return "", 0, nil +} + +func rendezvousRelayCandidateScore(routeID string, routePath []string, peerIndex int, relayNodeID string, localNodeID string, endpointScore int, endpointReasons []string, relayPolicy *rendezvousRelayPolicy) (int, []string) { + score := 500 + endpointScore + reasons := append([]string{}, endpointReasons...) 
+ relayIndex := -1 + for index, nodeID := range routePath { + if nodeID == relayNodeID { + relayIndex = index + break + } + } + if peerIndex >= 0 && relayIndex >= 0 { + distance := absInt(peerIndex - relayIndex) + switch { + case distance == 1: + score += 180 + reasons = append(reasons, "adjacent_to_peer") + case distance == 2: + score += 120 + reasons = append(reasons, "near_peer") + default: + score += maxInt(0, 80-distance*10) + reasons = append(reasons, "route_path_candidate") + } + } + if relayIndex == 0 && len(routePath) > 2 { + score -= 120 + reasons = append(reasons, "entry_relay_fallback") + } + if relayNodeID == localNodeID { + score += 40 + reasons = append(reasons, "local_entry_relay") + } + linkScore, linkReasons := rendezvousRelayLinkScore(relayNodeID, relayPolicy) + score += linkScore + reasons = append(reasons, linkReasons...) + routeHealthScore, routeHealthReasons := rendezvousRelayRouteHealthScore(routeID, relayNodeID, relayPolicy) + score += routeHealthScore + reasons = append(reasons, routeHealthReasons...) 
+ return score, reasons +} + +func rendezvousRelayLinkScore(relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) { + if relayPolicy == nil || relayPolicy.localNodeID == "" { + return 0, nil + } + var latest *MeshLinkObservation + for i := range relayPolicy.links { + link := &relayPolicy.links[i] + if link.SourceNodeID != relayPolicy.localNodeID || link.TargetNodeID != relayNodeID { + continue + } + if !link.ObservedAt.IsZero() && relayPolicy.now.Sub(link.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge { + continue + } + if latest == nil || link.ObservedAt.After(latest.ObservedAt) { + latest = link + } + } + if latest == nil { + return 0, nil + } + switch latest.LinkStatus { + case "reachable": + score := 60 + reasons := []string{"mesh_link_reachable"} + if latest.QualityScore != nil { + score += *latest.QualityScore + reasons = append(reasons, "mesh_link_quality") + } + if latest.LatencyMs != nil { + score += maxInt(0, 80-*latest.LatencyMs) + reasons = append(reasons, "mesh_link_latency") + } + return score, reasons + case "unreachable": + return -250, []string{"mesh_link_unreachable"} + default: + return 0, nil + } +} + +func rendezvousRelayRouteHealthScore(routeID string, relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) { + if relayPolicy == nil || relayPolicy.localNodeID == "" { + return 0, nil + } + routeID = strings.TrimSpace(routeID) + relayNodeID = strings.TrimSpace(relayNodeID) + if routeID == "" || relayNodeID == "" { + return 0, nil + } + var latest *MeshLinkObservation + var latestMetadata meshRouteHealthObservationMetadata + for i := range relayPolicy.links { + link := &relayPolicy.links[i] + if link.SourceNodeID != relayPolicy.localNodeID || !meshLinkObservationFresh(*link, relayPolicy.now) { + continue + } + metadata, ok := routeHealthMetadataFromLink(*link) + if !ok || + metadata.ObservationType != "synthetic_route_health" || + strings.TrimSpace(metadata.RouteID) != routeID || + 
strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID) != relayNodeID || + metadata.ProductionForwarding || + metadata.ProductionPayloadForwarding || + metadata.RouteHealthProductionPayloadForwarding || + metadata.RouteHealthServicePayloadForwarding { + continue + } + if latest == nil || link.ObservedAt.After(latest.ObservedAt) { + latest = link + latestMetadata = metadata + } + } + if latest == nil { + return 0, nil + } + if latestMetadata.RoutePathDriftDetected { + return -360, []string{"route_health_drift"} + } + if latest.LinkStatus == "unreachable" || strings.TrimSpace(latestMetadata.FailureReason) != "" { + return -320, []string{"route_health_unreachable"} + } + if latest.LinkStatus != "reachable" { + return 0, nil + } + score := 90 + reasons := []string{"route_health_reachable", "route_health_no_drift"} + if latest.QualityScore != nil { + score += *latest.QualityScore + reasons = append(reasons, "route_health_quality") + } + if latest.LatencyMs != nil { + score += maxInt(0, 100-*latest.LatencyMs) + reasons = append(reasons, "route_health_latency") + } + return score, reasons +} + +func rendezvousRelayLeaseMetadata(selection rendezvousRelaySelection, replacement bool) json.RawMessage { + payload := map[string]any{ + "source": "control-plane", + "derived_from": "endpoint_candidate", + "lease_refresh_contract": "node_scoped_synthetic_config_get", + "relay_replacement_contract": "stale_relay_feedback_policy", + "relay_selection_score": selection.Score, + "relay_selection_score_reasons": selection.Reasons, + "production_payload_forwarding": false, + } + if replacement { + payload["replacement_for_stale_relay"] = true + } + raw, err := json.Marshal(payload) + if err != nil { + return json.RawMessage(`{"source":"control-plane","derived_from":"endpoint_candidate","lease_refresh_contract":"node_scoped_synthetic_config_get","relay_replacement_contract":"stale_relay_feedback_policy","production_payload_forwarding":false}`) + } + return raw +} + +func 
hasPolicyTag(tags []string, want string) bool {
+	// Tags are compared case-insensitively after trimming surrounding whitespace.
+	want = strings.ToLower(strings.TrimSpace(want))
+	for _, tag := range tags {
+		if strings.ToLower(strings.TrimSpace(tag)) == want {
+			return true
+		}
+	}
+	return false
+}
+
+// maxInt returns the larger of a and b.
+func maxInt(a int, b int) int {
+	if a > b {
+		return a
+	}
+	return b
+}
+
+// absInt returns the absolute value of value.
+func absInt(value int) int {
+	if value < 0 {
+		return -value
+	}
+	return value
+}
+
+// peerEndpointCandidatesRequireRendezvous reports whether at least one of the
+// candidates requires rendezvous (see endpointCandidateRequiresRendezvous).
+func peerEndpointCandidatesRequireRendezvous(candidates []PeerEndpointCandidate) bool {
+	for _, candidate := range candidates {
+		if endpointCandidateRequiresRendezvous(candidate) {
+			return true
+		}
+	}
+	return false
+}
+
+// endpointCandidateRequiresRendezvous reports whether the candidate cannot be
+// dialed directly. After trimming and lower-casing, that is the case when the
+// transport contains "relay" or "outbound", the reachability is "relay" or
+// "outbound_only", or the connectivity mode is "relay_required" or
+// "outbound_only".
+func endpointCandidateRequiresRendezvous(candidate PeerEndpointCandidate) bool {
+	transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
+	reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
+	connectivityMode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
+	return strings.Contains(transport, "relay") ||
+		strings.Contains(transport, "outbound") ||
+		reachability == "relay" ||
+		reachability == "outbound_only" ||
+		connectivityMode == "relay_required" ||
+		connectivityMode == "outbound_only"
+}
+
+// rendezvousLeasePriority returns the smallest positive Priority among the
+// candidates that require rendezvous. It returns 0 when no such candidate
+// carries a positive priority; derivedRendezvousLeases replaces non-positive
+// results with its default of 100.
+func rendezvousLeasePriority(candidates []PeerEndpointCandidate) int {
+	priority := 0
+	for _, candidate := range candidates {
+		// Unset (0) and negative priorities never win. Previously a negative
+		// value on the first rendezvous candidate was stored and then hid
+		// every valid positive priority that followed it.
+		if !endpointCandidateRequiresRendezvous(candidate) || candidate.Priority <= 0 {
+			continue
+		}
+		if priority == 0 || candidate.Priority < priority {
+			priority = candidate.Priority
+		}
+	}
+	return priority
+}
+
+// rendezvousLeaseReason returns the reason label for an automatically derived
+// lease. Candidates are inspected in order and the first one that is
+// outbound-only or relay-bound decides the label; within a single candidate
+// outbound-only is checked first. Without a match the result is
+// "auto_rendezvous_required".
+func rendezvousLeaseReason(candidates []PeerEndpointCandidate) string {
+	for _, candidate := range candidates {
+		connectivityMode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
+		reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
+		if connectivityMode == "outbound_only" || reachability == "outbound_only" {
+			return "auto_outbound_only"
+		}
+		if connectivityMode == "relay_required" || reachability == "relay" {
+			return
"auto_relay_required" + } + } + return "auto_rendezvous_required" +} + +func mergeRendezvousLeases(out map[string]PeerRendezvousLease, leases []PeerRendezvousLease) { + for _, lease := range leases { + if lease.Metadata == nil { + lease.Metadata = json.RawMessage(`{}`) + } + key := strings.TrimSpace(lease.LeaseID) + if key == "" { + key = lease.PeerNodeID + "\x00" + lease.RelayNodeID + "\x00" + lease.RelayEndpoint + } + existing, ok := out[key] + if !ok || lease.Priority < existing.Priority || existing.ExpiresAt.Before(lease.ExpiresAt) { + out[key] = lease + } + } +} + +func sortedRendezvousLeases(items map[string]PeerRendezvousLease, limit int) []PeerRendezvousLease { + out := make([]PeerRendezvousLease, 0, len(items)) + for _, item := range items { + out = append(out, item) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Priority != out[j].Priority { + return out[i].Priority < out[j].Priority + } + if out[i].PeerNodeID != out[j].PeerNodeID { + return out[i].PeerNodeID < out[j].PeerNodeID + } + if out[i].RelayNodeID != out[j].RelayNodeID { + return out[i].RelayNodeID < out[j].RelayNodeID + } + return out[i].LeaseID < out[j].LeaseID + }) + if len(out) > limit { + out = out[:limit] + } + return out +} + +func markPeerDirectoryRendezvousLeases(directory map[string]*PeerDirectoryEntry, leases []PeerRendezvousLease, localNodeID string) { + for _, lease := range leases { + if lease.PeerNodeID != "" && lease.PeerNodeID != localNodeID { + entry := peerDirectoryEntry(directory, lease.PeerNodeID) + entry.CandidateCount++ + if !containsString(entry.ConnectivityModes, "relay_required") { + entry.ConnectivityModes = append(entry.ConnectivityModes, "relay_required") + } + } + if lease.RelayNodeID != "" && lease.RelayNodeID != localNodeID { + entry := peerDirectoryEntry(directory, lease.RelayNodeID) + entry.EndpointCount++ + if !containsString(entry.ConnectivityModes, "relay_control") { + entry.ConnectivityModes = append(entry.ConnectivityModes, "relay_control") + 
} + } + } +} + +func mergePeerDirectoryRoute(directory map[string]*PeerDirectoryEntry, route SyntheticMeshRouteConfig, localNodeID string) { + for _, nodeID := range route.Hops { + nodeID = strings.TrimSpace(nodeID) + if nodeID == "" || nodeID == localNodeID { + continue + } + entry := peerDirectoryEntry(directory, nodeID) + if !containsString(entry.RouteIDs, route.RouteID) { + entry.RouteIDs = append(entry.RouteIDs, route.RouteID) + } + } +} + +func mergePeerDirectoryCandidates(directory map[string]*PeerDirectoryEntry, nodeID string, candidates []PeerEndpointCandidate) { + entry := peerDirectoryEntry(directory, nodeID) + entry.CandidateCount += len(candidates) + for _, candidate := range candidates { + if strings.TrimSpace(candidate.ConnectivityMode) != "" && !containsString(entry.ConnectivityModes, candidate.ConnectivityMode) { + entry.ConnectivityModes = append(entry.ConnectivityModes, candidate.ConnectivityMode) + } + } +} + +func peerDirectoryEntry(directory map[string]*PeerDirectoryEntry, nodeID string) *PeerDirectoryEntry { + if entry, ok := directory[nodeID]; ok { + return entry + } + entry := &PeerDirectoryEntry{NodeID: nodeID} + directory[nodeID] = entry + return entry +} + +func mergeRecoverySeeds(out map[string]PeerRecoverySeed, seeds []PeerRecoverySeed) { + for _, seed := range seeds { + if seed.Metadata == nil { + seed.Metadata = json.RawMessage(`{}`) + } + key := seed.NodeID + "\x00" + seed.Endpoint + existing, ok := out[key] + if !ok || seed.Priority < existing.Priority { + out[key] = seed + } + } +} + +func sortedRecoverySeeds(items map[string]PeerRecoverySeed, limit int) []PeerRecoverySeed { + out := make([]PeerRecoverySeed, 0, len(items)) + for _, item := range items { + out = append(out, item) + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Priority != out[j].Priority { + return out[i].Priority < out[j].Priority + } + if out[i].NodeID != out[j].NodeID { + return out[i].NodeID < out[j].NodeID + } + return out[i].Endpoint < 
out[j].Endpoint + }) + if len(out) > limit { + out = out[:limit] + } + return out +} + +func markPeerDirectoryRecoverySeeds(directory map[string]*PeerDirectoryEntry, seeds []PeerRecoverySeed) { + for _, seed := range seeds { + entry := peerDirectoryEntry(directory, seed.NodeID) + entry.RecoverySeed = true + if strings.TrimSpace(seed.ConnectivityMode) != "" && !containsString(entry.ConnectivityModes, seed.ConnectivityMode) { + entry.ConnectivityModes = append(entry.ConnectivityModes, seed.ConnectivityMode) + } + } +} + +func sortedPeerDirectory(items map[string]*PeerDirectoryEntry) []PeerDirectoryEntry { + out := make([]PeerDirectoryEntry, 0, len(items)) + for _, entry := range items { + sort.Strings(entry.RouteIDs) + sort.Strings(entry.ConnectivityModes) + if entry.NodeID != "" { + out = append(out, *entry) + } + } + sort.SliceStable(out, func(i, j int) bool { + return out[i].NodeID < out[j].NodeID + }) + return out +} + +func validatePeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) error { + if len(candidates) == 0 { + return nil + } + for nodeID, items := range candidates { + if strings.TrimSpace(nodeID) == "" || !containsString(routePath, nodeID) { + return ErrInvalidPayload + } + for _, candidate := range items { + if strings.TrimSpace(candidate.EndpointID) == "" || + strings.TrimSpace(candidate.NodeID) == "" || + candidate.NodeID != nodeID || + strings.TrimSpace(candidate.Address) == "" || + !isPeerEndpointTransport(candidate.Transport) || + !isPeerEndpointReachability(candidate.Reachability) || + !isPeerEndpointConnectivityMode(candidate.ConnectivityMode) || + (candidate.NATType != "" && !isPeerEndpointNATType(candidate.NATType)) { + return ErrInvalidPayload + } + if len(candidate.Metadata) > 0 && !json.Valid(candidate.Metadata) { + return ErrInvalidPayload + } + } + } + return nil +} + +func scopedPeerEndpoints(peers map[string]string, routePath []string) map[string]string { + out := map[string]string{} + for nodeID, 
endpoint := range peers { + if containsString(routePath, nodeID) && strings.TrimSpace(endpoint) != "" { + out[nodeID] = endpoint + } + } + return out +} + +func scopedPeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) map[string][]PeerEndpointCandidate { + out := map[string][]PeerEndpointCandidate{} + for nodeID, items := range candidates { + if !containsString(routePath, nodeID) { + continue + } + for _, candidate := range items { + if candidate.Metadata == nil { + candidate.Metadata = json.RawMessage(`{}`) + } + out[nodeID] = append(out[nodeID], candidate) + } + } + return out +} + +func isPeerEndpointTransport(value string) bool { + switch value { + case "direct_tcp_tls", "wss", "relay", "outbound_reverse": + return true + default: + return false + } +} + +func isPeerRendezvousTransport(value string) bool { + switch value { + case "relay_control", "relay", "wss", "direct_tcp_tls": + return true + default: + return false + } +} + +func isPeerEndpointReachability(value string) bool { + switch value { + case "public", "private", "relay", "outbound_only", "unknown": + return true + default: + return false + } +} + +func isPeerEndpointConnectivityMode(value string) bool { + switch value { + case "direct", "relay_required", "outbound_only", "unknown": + return true + default: + return false + } +} + +func isPeerEndpointNATType(value string) bool { + switch value { + case "unknown", "none", "full_cone", "restricted", "port_restricted", "symmetric", "blocked": + return true + default: + return false + } +} + +func controlPlaneAllowedChannels(channels []string) []string { + out := []string{} + for _, channel := range channels { + channel = strings.TrimSpace(channel) + switch channel { + case "fabric_control", "route_control": + if !containsString(out, channel) { + out = append(out, channel) + } + } + } + return out +} + +func firstNonEmptyStringSlice(values ...[]string) []string { + for _, value := range values { + if len(value) > 0 
{ + return value + } + } + return nil +} + +func isHTTPControlEndpoint(endpoint string) bool { + endpoint = strings.ToLower(strings.TrimSpace(endpoint)) + return strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") +} + +func firstNodeID(selector nodeSelector) string { + if strings.TrimSpace(selector.NodeID) != "" { + return strings.TrimSpace(selector.NodeID) + } + for _, nodeID := range selector.NodeIDs { + if strings.TrimSpace(nodeID) != "" { + return strings.TrimSpace(nodeID) + } + } + return "" +} + +func cleanRouteNodePath(values []string) []string { + out := make([]string, 0, len(values)) + for _, value := range values { + value = strings.TrimSpace(value) + if value != "" { + out = append(out, value) + } + } + return out +} + +func containsString(values []string, needle string) bool { + needle = strings.TrimSpace(needle) + if needle == "" { + return false + } + for _, value := range values { + if strings.TrimSpace(value) == needle { + return true + } + } + return false +} + +func generateFencingToken() (string, error) { + buf := make([]byte, 32) + if _, err := rand.Read(buf); err != nil { + return "", err + } + return "rap_vpn_fence_" + hex.EncodeToString(buf), nil +} diff --git a/backend/internal/modules/cluster/service_test.go b/backend/internal/modules/cluster/service_test.go new file mode 100644 index 0000000..1dd5bdd --- /dev/null +++ b/backend/internal/modules/cluster/service_test.go @@ -0,0 +1,2505 @@ +package cluster + +import ( + "context" + "encoding/json" + "errors" + "strings" + "testing" + "time" + + "github.com/example/remote-access-platform/backend/internal/platform/clusterauth" + "github.com/example/remote-access-platform/backend/internal/platform/secrets" + "github.com/jackc/pgx/v5" +) + +func TestHashJoinTokenDoesNotStoreRawToken(t *testing.T) { + raw := "rap_join_example" + hashed, err := hashJoinToken(raw) + if err != nil { + t.Fatalf("hash join token: %v", err) + } + if hashed == raw { + t.Fatal("hash must not 
equal raw token") + } + if got, wantPrefix := hashed[:len(joinTokenHashPrefix)], joinTokenHashPrefix; got != wantPrefix { + t.Fatalf("hash prefix = %q, want %q", got, wantPrefix) + } + hashedAgain, err := hashJoinToken(raw) + if err != nil { + t.Fatalf("hash join token again: %v", err) + } + if hashed != hashedAgain { + t.Fatal("hash must be deterministic") + } +} + +func TestClusterAuthorityPrivateKeyEncodingUsesSecretEncryptor(t *testing.T) { + encryptor, err := secrets.NewEncryptor("MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=", "test-key") + if err != nil { + t.Fatalf("NewEncryptor: %v", err) + } + store := (&PostgresStore{}).WithClusterKeyEncryptor(encryptor) + + encoded, err := store.encodeClusterAuthorityPrivateKey("cluster-1", "private-key") + if err != nil { + t.Fatalf("encodeClusterAuthorityPrivateKey: %v", err) + } + if encoded == "private-key" || !strings.HasPrefix(encoded, encryptedClusterAuthorityKeyPrefix) { + t.Fatalf("private key was not encrypted: %q", encoded) + } + decoded, err := store.decodeClusterAuthorityPrivateKey("cluster-1", encoded) + if err != nil { + t.Fatalf("decodeClusterAuthorityPrivateKey: %v", err) + } + if decoded != "private-key" { + t.Fatalf("decoded private key = %q", decoded) + } + if _, err := store.decodeClusterAuthorityPrivateKey("cluster-2", encoded); err == nil { + t.Fatal("expected wrong cluster AAD to fail") + } +} + +func TestCreateJoinTokenRequiresPlatformAdmin(t *testing.T) { + store := &fakeRepository{platformRole: "user"} + service := NewService(store) + + _, err := service.CreateJoinToken(context.Background(), CreateJoinTokenInput{ + ActorUserID: "user-1", + ClusterID: "cluster-1", + }) + if !errors.Is(err, ErrAccessDenied) { + t.Fatalf("err = %v, want ErrAccessDenied", err) + } +} + +func TestCreateJoinTokenStoresHashOnlyAndReturnsRawOnce(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + service.now = func() time.Time { return time.Date(2026, 4, 26, 12, 
0, 0, 0, time.UTC) } + + created, err := service.CreateJoinToken(context.Background(), CreateJoinTokenInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Scope: json.RawMessage(`{"roles":["rdp-worker"]}`), + MaxUses: 1, + }) + if err != nil { + t.Fatalf("create join token: %v", err) + } + if created.Token == "" { + t.Fatal("raw token must be returned to caller once") + } + if store.lastTokenHash == "" || store.lastTokenHash == created.Token { + t.Fatalf("stored token hash = %q, raw token = %q", store.lastTokenHash, created.Token) + } + if created.AuthoritySignature == nil || len(created.AuthorityPayload) == 0 { + t.Fatalf("created token missing authority signature: %+v", created.NodeJoinToken) + } + if err := clusterauth.VerifyRaw(store.clusterAuthority.PublicKey, created.AuthorityPayload, *created.AuthoritySignature); err != nil { + t.Fatalf("verify token authority signature: %v", err) + } +} + +func TestUpdateClusterRequiresMutableAuthority(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + authorityState: ClusterAuthorityState{ + ClusterID: "cluster-1", + AuthorityState: "minority", + MutationMode: "read_only", + }, + } + service := NewService(store) + + _, err := service.UpdateCluster(context.Background(), UpdateClusterInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: "Cluster One", + Status: ClusterStatusActive, + Metadata: json.RawMessage(`{}`), + }) + if !errors.Is(err, ErrClusterReadOnly) { + t.Fatalf("err = %v, want ErrClusterReadOnly", err) + } +} + +func TestUpdateClusterValidatesStatusAndMetadata(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + _, err := service.UpdateCluster(context.Background(), UpdateClusterInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: "Cluster One", + Status: "unknown", + Metadata: json.RawMessage(`{}`), + }) + if !errors.Is(err, ErrInvalidPayload) { + t.Fatalf("err = %v, want ErrInvalidPayload", 
err) + } + + _, err = service.UpdateCluster(context.Background(), UpdateClusterInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: "Cluster One", + Status: ClusterStatusActive, + Metadata: json.RawMessage(`{`), + }) + if err == nil || !strings.Contains(err.Error(), "metadata") { + t.Fatalf("err = %v, want metadata validation error", err) + } +} + +func TestCreateNodeGroupValidatesNameAndMetadata(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + _, err := service.CreateNodeGroup(context.Background(), CreateNodeGroupInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: " ", + }) + if !errors.Is(err, ErrInvalidPayload) { + t.Fatalf("err = %v, want ErrInvalidPayload", err) + } + + _, err = service.CreateNodeGroup(context.Background(), CreateNodeGroupInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: "DC-1", + Metadata: json.RawMessage(`{`), + }) + if err == nil || !strings.Contains(err.Error(), "metadata") { + t.Fatalf("err = %v, want metadata validation error", err) + } +} + +func TestAssignNodeToGroupPreservesConcreteMembership(t *testing.T) { + groupID := "group-1" + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + node, err := service.AssignNodeToGroup(context.Background(), AssignNodeGroupInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + NodeID: "node-1", + GroupID: &groupID, + }) + if err != nil { + t.Fatalf("assign node group: %v", err) + } + if node.ID != "node-1" || node.NodeGroupID == nil || *node.NodeGroupID != groupID { + t.Fatalf("unexpected node group assignment: %+v", node) + } + if store.lastAssignGroupInput.NodeID != "node-1" || store.lastAssignGroupInput.GroupID == nil { + t.Fatalf("assignment input not preserved: %+v", store.lastAssignGroupInput) + } +} + +func TestCreateFabricEntryPointValidatesControlPlanePayload(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + 
service := NewService(store) + + _, err := service.CreateFabricEntryPoint(context.Background(), CreateFabricEntryPointInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: " ", + EndpointType: "client_access", + }) + if !errors.Is(err, ErrInvalidPayload) { + t.Fatalf("err = %v, want ErrInvalidPayload", err) + } + + _, err = service.CreateFabricEntryPoint(context.Background(), CreateFabricEntryPointInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: "Main Entry", + EndpointType: "client_access", + Policy: json.RawMessage(`{`), + }) + if err == nil || !strings.Contains(err.Error(), "valid json") { + t.Fatalf("err = %v, want json validation error", err) + } +} + +func TestCreateFabricEgressPoolDefaultsAndAudits(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + item, err := service.CreateFabricEgressPool(context.Background(), CreateFabricEgressPoolInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: "Office Moscow", + }) + if err != nil { + t.Fatalf("create egress pool: %v", err) + } + if item.Status != "active" || string(item.RouteScope) != "{}" { + t.Fatalf("unexpected egress pool defaults: %+v", item) + } + if len(store.auditEvents) == 0 || store.auditEvents[len(store.auditEvents)-1].EventType != "fabric.egress_pool.created" { + t.Fatalf("missing egress pool audit event: %+v", store.auditEvents) + } +} + +func TestAssignNodeRoleRejectsUnknownRole(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + _, err := service.AssignNodeRole(context.Background(), AssignNodeRoleInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + NodeID: "node-1", + Role: "can_run_rdp_worker", + }) + if !errors.Is(err, ErrInvalidNodeRole) { + t.Fatalf("err = %v, want ErrInvalidNodeRole", err) + } +} + +func TestAttachExistingNodeRequiresPlatformAdmin(t *testing.T) { + store := &fakeRepository{platformRole: "user"} + service := 
NewService(store) + + _, err := service.AttachExistingNodeToCluster(context.Background(), AttachExistingNodeInput{ + ActorUserID: "user-1", + ClusterID: "cluster-1", + NodeID: "node-1", + }) + if !errors.Is(err, ErrAccessDenied) { + t.Fatalf("err = %v, want ErrAccessDenied", err) + } +} + +func TestAttachExistingNodeRejectsUnknownRole(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + _, err := service.AttachExistingNodeToCluster(context.Background(), AttachExistingNodeInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + NodeID: "node-1", + Roles: []string{"can_run_rdp_worker"}, + }) + if !errors.Is(err, ErrInvalidNodeRole) { + t.Fatalf("err = %v, want ErrInvalidNodeRole", err) + } +} + +func TestAttachExistingNodeUsesConcreteNodeAndRoles(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + node, err := service.AttachExistingNodeToCluster(context.Background(), AttachExistingNodeInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + NodeID: "node-1", + Roles: []string{"entry-node", "rdp-worker"}, + }) + if err != nil { + t.Fatalf("attach existing node: %v", err) + } + if node.ID != "node-1" || node.MembershipStatus != "active" { + t.Fatalf("unexpected node: %+v", node) + } + if store.lastAttachInput.NodeID != "node-1" || len(store.lastAttachInput.Roles) != 2 { + t.Fatalf("attach input not preserved: %+v", store.lastAttachInput) + } +} + +func TestCreateJoinRequestRejectsExpiredOrRevokedToken(t *testing.T) { + store := &fakeRepository{validTokenErr: ErrInvalidJoinToken} + service := NewService(store) + + _, err := service.CreateJoinRequest(context.Background(), CreateJoinRequestInput{ + ClusterID: "cluster-1", + JoinToken: "rap_join_invalid", + NodeName: "node-a", + NodeFingerprint: "fingerprint-a", + PublicKey: "public-key", + }) + if !errors.Is(err, ErrInvalidJoinToken) { + t.Fatalf("err = %v, want ErrInvalidJoinToken", err) + } +} + 
+func TestRevokeJoinTokenRequiresPlatformAdmin(t *testing.T) { + store := &fakeRepository{platformRole: "user"} + service := NewService(store) + + _, err := service.RevokeJoinToken(context.Background(), RevokeJoinTokenInput{ + ActorUserID: "user-1", + ClusterID: "cluster-1", + TokenID: "token-1", + }) + if !errors.Is(err, ErrAccessDenied) { + t.Fatalf("err = %v, want ErrAccessDenied", err) + } +} + +func TestApproveJoinRequestReturnsBootstrapContract(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + approved, err := service.ApproveJoinRequest(context.Background(), ApproveJoinRequestInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + JoinRequestID: "join-request-1", + NodeKey: "node-key-1", + }) + if err != nil { + t.Fatalf("approve join request: %v", err) + } + if approved.Bootstrap.ClusterID != "cluster-1" || approved.Bootstrap.IdentityStatus == "" { + t.Fatalf("unexpected bootstrap contract: %+v", approved.Bootstrap) + } + if approved.Bootstrap.ClusterAuthority == nil || approved.Bootstrap.AuthoritySignature == nil || len(approved.Bootstrap.AuthorityPayload) == 0 { + t.Fatalf("bootstrap missing authority contract: %+v", approved.Bootstrap) + } + if err := clusterauth.VerifyRaw(store.clusterAuthority.PublicKey, approved.Bootstrap.AuthorityPayload, *approved.Bootstrap.AuthoritySignature); err != nil { + t.Fatalf("verify approval authority signature: %v", err) + } +} + +func TestGetJoinRequestBootstrapReturnsSignedApproval(t *testing.T) { + nodeID := "node-1" + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + bootstrapJoinRequest: NodeJoinRequest{ + ID: "join-request-1", + ClusterID: "cluster-1", + NodeFingerprint: "node-fp", + PublicKey: "node-public-key", + Status: JoinRequestStatusApproved, + ApprovedNodeID: &nodeID, + }, + } + service := NewService(store) + + result, err := service.GetJoinRequestBootstrap(context.Background(), GetJoinRequestBootstrapInput{ + ClusterID: 
"cluster-1", + JoinRequestID: "join-request-1", + NodeFingerprint: "node-fp", + PublicKey: "node-public-key", + }) + if err != nil { + t.Fatalf("get join request bootstrap: %v", err) + } + if result.Bootstrap == nil || result.Bootstrap.NodeID != nodeID || result.Bootstrap.ClusterAuthority == nil { + t.Fatalf("unexpected bootstrap result: %+v", result) + } + if result.Bootstrap.AuthoritySignature == nil || len(result.Bootstrap.AuthorityPayload) == 0 { + t.Fatalf("bootstrap missing authority signature: %+v", result.Bootstrap) + } + if err := clusterauth.VerifyRaw(store.clusterAuthority.PublicKey, result.Bootstrap.AuthorityPayload, *result.Bootstrap.AuthoritySignature); err != nil { + t.Fatalf("verify bootstrap authority signature: %v", err) + } +} + +func TestSetDesiredWorkloadRequiresPlatformAdmin(t *testing.T) { + store := &fakeRepository{platformRole: "user"} + service := NewService(store) + + _, err := service.SetDesiredWorkload(context.Background(), SetDesiredWorkloadInput{ + ActorUserID: "user-1", + ClusterID: "cluster-1", + NodeID: "node-1", + ServiceType: "rdp-worker", + }) + if !errors.Is(err, ErrAccessDenied) { + t.Fatalf("err = %v, want ErrAccessDenied", err) + } +} + +func TestReportWorkloadStatusDefaultsToSafeStubState(t *testing.T) { + store := &fakeRepository{} + service := NewService(store) + + status, err := service.ReportWorkloadStatus(context.Background(), ReportWorkloadStatusInput{ + ClusterID: "cluster-1", + NodeID: "node-1", + ServiceType: "rdp-worker", + }) + if err != nil { + t.Fatalf("report workload status: %v", err) + } + if status.ReportedState != "unknown" || status.RuntimeMode != "container" { + t.Fatalf("unexpected status defaults: %+v", status) + } +} + +func TestReportMeshLinkDoesNotRequirePlatformAdmin(t *testing.T) { + store := &fakeRepository{} + service := NewService(store) + + link, err := service.ReportMeshLink(context.Background(), ReportMeshLinkInput{ + ClusterID: "cluster-1", + SourceNodeID: "node-a", + TargetNodeID: 
"node-b", + LinkStatus: "reachable", + }) + if err != nil { + t.Fatalf("report mesh link: %v", err) + } + if link.LinkStatus != "reachable" { + t.Fatalf("LinkStatus = %q", link.LinkStatus) + } +} + +func TestCreateRouteIntentRequiresPlatformAdmin(t *testing.T) { + store := &fakeRepository{platformRole: "user"} + service := NewService(store) + + _, err := service.CreateRouteIntent(context.Background(), CreateRouteIntentInput{ + ActorUserID: "user-1", + ClusterID: "cluster-1", + ServiceClass: "input", + }) + if !errors.Is(err, ErrAccessDenied) { + t.Fatalf("err = %v, want ErrAccessDenied", err) + } +} + +func TestGetNodeSyntheticMeshConfigRequiresTestingFlag(t *testing.T) { + service := NewService(&fakeRepository{}) + + cfg, err := service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: "cluster-1", + NodeID: "node-a", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if cfg.Enabled { + t.Fatal("config must be disabled when synthetic testing flag is off") + } + if len(cfg.Routes) != 0 || len(cfg.PeerEndpoints) != 0 { + t.Fatalf("disabled config must not leak topology: %+v", cfg) + } +} + +func TestGetNodeSyntheticMeshConfigIsNodeScoped(t *testing.T) { + now := time.Date(2026, 4, 27, 12, 0, 0, 0, time.UTC) + service := NewService(&fakeRepository{ + testingFlags: EffectiveNodeTestingFlags{ + Enabled: true, + SyntheticLinksEnabled: true, + }, + routeIntents: []MeshRouteIntent{ + { + ID: "route-a-b", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-a"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-b"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-a", "node-r", "node-b"], + "allowed_channels": ["fabric_control", "route_control"], + "peer_endpoints": { + "node-r": "http://node-r:19000", + "node-b": "http://node-b:19000", + "node-y": "http://node-y:19000" + }, + 
"peer_endpoint_candidates": { + "node-r": [ + { + "endpoint_id": "node-r-public", + "node_id": "node-r", + "transport": "direct_tcp_tls", + "address": "203.0.113.10:443", + "address_family": "ipv4", + "reachability": "public", + "nat_type": "none", + "connectivity_mode": "direct", + "region": "eu", + "priority": 10, + "policy_tags": ["fast-path"], + "metadata": {"source":"test"} + } + ], + "node-b": [ + { + "endpoint_id": "node-b-outbound", + "node_id": "node-b", + "transport": "outbound_reverse", + "address": "node-b.reverse.local", + "reachability": "outbound_only", + "nat_type": "symmetric", + "connectivity_mode": "outbound_only", + "priority": 20 + } + ] + }, + "recovery_seeds": [ + { + "node_id": "node-r", + "endpoint": "https://node-r.example.test:443", + "transport": "direct_tcp_tls", + "connectivity_mode": "direct", + "region": "eu", + "priority": 10, + "metadata": {"role":"stable-recovery"} + }, + { + "node_id": "node-seed", + "endpoint": "wss://seed.example.test/mesh", + "transport": "wss", + "connectivity_mode": "direct", + "priority": 20 + } + ], + "route_version": "route-v1", + "policy_version": "policy-v1", + "peer_directory_version": "peers-v1" + }`), + UpdatedAt: now, + }, + { + ID: "route-x-y", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-x"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-y"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-x", "node-y"], + "peer_endpoints": {"node-y": "http://node-y:19000"} + }`), + UpdatedAt: now, + }, + }, + }) + service.now = func() time.Time { return now } + + cfg, err := service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: "cluster-1", + NodeID: "node-a", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if !cfg.Enabled { + t.Fatal("config should be enabled") + } + if len(cfg.Routes) != 1 || cfg.Routes[0].RouteID != 
"route-a-b" { + t.Fatalf("routes = %+v", cfg.Routes) + } + if cfg.PeerEndpoints["node-r"] == "" || cfg.PeerEndpoints["node-b"] == "" { + t.Fatalf("peer endpoints missing: %+v", cfg.PeerEndpoints) + } + if _, leaked := cfg.PeerEndpoints["node-y"]; leaked { + t.Fatalf("unrelated topology leaked: %+v", cfg.PeerEndpoints) + } + nodeRCandidates := cfg.PeerEndpointCandidates["node-r"] + if len(nodeRCandidates) != 1 { + t.Fatalf("node-r candidates = %+v", cfg.PeerEndpointCandidates) + } + if got := nodeRCandidates[0]; got.EndpointID != "node-r-public" || + got.Transport != "direct_tcp_tls" || + got.Reachability != "public" || + got.NATType != "none" || + got.ConnectivityMode != "direct" || + got.Priority != 10 { + t.Fatalf("unexpected node-r candidate: %+v", got) + } + if _, leaked := cfg.PeerEndpointCandidates["node-y"]; leaked { + t.Fatalf("unrelated candidate topology leaked: %+v", cfg.PeerEndpointCandidates) + } + if len(cfg.RecoverySeeds) != 2 || cfg.RecoverySeeds[0].NodeID != "node-r" || cfg.RecoverySeeds[1].NodeID != "node-seed" { + t.Fatalf("unexpected recovery seeds: %+v", cfg.RecoverySeeds) + } + nodeRDirectory, ok := findPeerDirectoryEntry(cfg.PeerDirectory, "node-r") + if !ok || nodeRDirectory.CandidateCount != 1 || !nodeRDirectory.RecoverySeed { + t.Fatalf("node-r peer directory missing recovery/candidate metadata: %+v", cfg.PeerDirectory) + } + if _, ok := findPeerDirectoryEntry(cfg.PeerDirectory, "node-a"); ok { + t.Fatalf("local node leaked into peer directory: %+v", cfg.PeerDirectory) + } + if _, ok := findPeerDirectoryEntry(cfg.PeerDirectory, "node-y"); ok { + t.Fatalf("unrelated node leaked into peer directory: %+v", cfg.PeerDirectory) + } + if cfg.ProductionForwarding { + t.Fatal("production forwarding must remain false") + } +} + +func TestGetNodeSyntheticMeshConfigUsesReportedMeshEndpoint(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + service := NewService(&fakeRepository{ + testingFlags: EffectiveNodeTestingFlags{ + 
Enabled: true, + SyntheticLinksEnabled: true, + }, + routeIntents: []MeshRouteIntent{ + { + ID: "route-a-b", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-a"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-b"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-a", "node-b"] + }`), + UpdatedAt: now, + }, + }, + heartbeats: map[string][]NodeHeartbeat{ + "node-b": { + { + ClusterID: "cluster-1", + NodeID: "node-b", + Metadata: json.RawMessage(`{ + "mesh_endpoint_report": { + "schema_version": "c17z6.mesh_endpoint_report.v1", + "cluster_id": "cluster-1", + "node_id": "node-b", + "peer_endpoint": "https://node-b.dynamic.example.test:443", + "transport": "direct_tcp_tls", + "connectivity_mode": "direct", + "nat_type": "none", + "endpoint_candidates": [ + { + "endpoint_id": "node-b-dynamic", + "node_id": "node-b", + "transport": "direct_tcp_tls", + "address": "https://node-b.dynamic.example.test:443", + "reachability": "public", + "connectivity_mode": "direct", + "nat_type": "none", + "priority": 1, + "metadata": {"source":"heartbeat"} + } + ] + } + }`), + ObservedAt: now, + }, + }, + }, + }) + service.now = func() time.Time { return now } + + cfg, err := service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: "cluster-1", + NodeID: "node-a", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if cfg.PeerEndpoints["node-b"] != "https://node-b.dynamic.example.test:443" { + t.Fatalf("reported endpoint not projected: %+v", cfg.PeerEndpoints) + } + if got := cfg.PeerEndpointCandidates["node-b"]; len(got) != 1 || got[0].EndpointID != "node-b-dynamic" { + t.Fatalf("reported candidates not projected: %+v", cfg.PeerEndpointCandidates) + } + entry, ok := findPeerDirectoryEntry(cfg.PeerDirectory, "node-b") + if !ok || entry.EndpointCount != 1 || entry.CandidateCount != 1 { + t.Fatalf("peer 
directory did not include reported endpoint/candidate: %+v", cfg.PeerDirectory) + } +} + +func TestGetNodeSyntheticMeshConfigIssuesRendezvousRelayLeases(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) + service := NewService(&fakeRepository{ + testingFlags: EffectiveNodeTestingFlags{ + Enabled: true, + SyntheticLinksEnabled: true, + }, + routeIntents: []MeshRouteIntent{ + { + ID: "route-a-b", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-a"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-b"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-a", "node-r", "node-b"], + "allowed_channels": ["fabric_control", "route_control", "service_payload"], + "peer_endpoints": { + "node-r": "http://node-r:19000" + }, + "peer_endpoint_candidates": { + "node-b": [ + { + "endpoint_id": "node-b-outbound", + "node_id": "node-b", + "transport": "outbound_reverse", + "address": "node-b.reverse.local", + "reachability": "outbound_only", + "nat_type": "symmetric", + "connectivity_mode": "outbound_only", + "priority": 20 + } + ] + }, + "rendezvous_leases": [ + { + "peer_node_id": "node-b", + "relay_node_id": "node-r", + "relay_endpoint": "http://node-r:19000", + "priority": 5 + } + ] + }`), + UpdatedAt: now, + }, + { + ID: "route-x-y", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-x"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-y"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-x", "node-y"], + "peer_endpoints": {"node-x": "http://node-x:19000"}, + "rendezvous_leases": [ + { + "peer_node_id": "node-y", + "relay_node_id": "node-x", + "relay_endpoint": "http://node-x:19000" + } + ] + }`), + UpdatedAt: now, + }, + }, + }) + service.now = func() time.Time { return now } + + cfg, err := 
service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: "cluster-1", + NodeID: "node-a", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if cfg.SchemaVersion != "c17z18.synthetic.v1" { + t.Fatalf("schema version = %s, want c17z18.synthetic.v1", cfg.SchemaVersion) + } + if len(cfg.RendezvousLeases) != 1 { + t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases) + } + lease := cfg.RendezvousLeases[0] + if lease.LeaseID != "route-a-b-rv-node-b-via-node-r" || + lease.PeerNodeID != "node-b" || + lease.RelayNodeID != "node-r" || + lease.RelayEndpoint != "http://node-r:19000" || + lease.Transport != "relay_control" || + lease.Priority != 5 || + !lease.ControlPlaneOnly || + !containsString(lease.AllowedChannels, "fabric_control") || + containsString(lease.AllowedChannels, "service_payload") { + t.Fatalf("unexpected rendezvous lease contract: %+v", lease) + } + if _, ok := findPeerDirectoryEntry(cfg.PeerDirectory, "node-y"); ok { + t.Fatalf("unrelated rendezvous lease leaked into peer directory: %+v", cfg.PeerDirectory) + } + nodeB, ok := findPeerDirectoryEntry(cfg.PeerDirectory, "node-b") + if !ok || !containsString(nodeB.ConnectivityModes, "relay_required") { + t.Fatalf("peer directory missing rendezvous peer mode: %+v", cfg.PeerDirectory) + } + nodeR, ok := findPeerDirectoryEntry(cfg.PeerDirectory, "node-r") + if !ok || !containsString(nodeR.ConnectivityModes, "relay_control") { + t.Fatalf("peer directory missing relay control mode: %+v", cfg.PeerDirectory) + } +} + +func TestGetNodeSyntheticMeshConfigReplacesStaleRendezvousRelay(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 30, 0, 0, time.UTC) + staleHeartbeatMetadata, err := json.Marshal(map[string]any{ + "mesh_rendezvous_lease_report": map[string]any{ + "schema_version": "c17z18.mesh_rendezvous_lease_report.v1", + "cluster_id": "cluster-1", + "node_id": "node-a", + "observed_at": now.Format(time.RFC3339Nano), + "leases": 
[]map[string]any{ + { + "lease_id": "route-a-b-rv-node-b-via-node-r-old", + "peer_node_id": "node-b", + "relay_node_id": "node-r-old", + "route_ids": []string{"route-a-b"}, + "stale_relay": true, + "reselection_needed": true, + "connection_state": "degraded", + "reason": "auto_outbound_only", + }, + }, + }, + }) + if err != nil { + t.Fatalf("marshal heartbeat metadata: %v", err) + } + service := NewService(&fakeRepository{ + testingFlags: EffectiveNodeTestingFlags{ + Enabled: true, + SyntheticLinksEnabled: true, + }, + heartbeats: map[string][]NodeHeartbeat{ + "node-a": { + { + ClusterID: "cluster-1", + NodeID: "node-a", + Metadata: staleHeartbeatMetadata, + ObservedAt: now.Add(-10 * time.Second), + }, + }, + }, + routeIntents: []MeshRouteIntent{ + { + ID: "route-a-b", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-a"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-b"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-a", "node-r-old", "node-r-new", "node-b"], + "allowed_channels": ["fabric_control", "route_control"], + "peer_endpoints": { + "node-r-old": "http://node-r-old:19000", + "node-r-new": "http://node-r-new:19000" + }, + "peer_endpoint_candidates": { + "node-b": [ + { + "endpoint_id": "node-b-outbound", + "node_id": "node-b", + "transport": "outbound_reverse", + "address": "node-b.reverse.local", + "reachability": "outbound_only", + "nat_type": "symmetric", + "connectivity_mode": "outbound_only", + "priority": 5 + } + ] + }, + "rendezvous_leases": [ + { + "lease_id": "route-a-b-rv-node-b-via-node-r-old", + "peer_node_id": "node-b", + "relay_node_id": "node-r-old", + "relay_endpoint": "http://node-r-old:19000", + "priority": 4 + } + ] + }`), + UpdatedAt: now, + }, + }, + }) + service.now = func() time.Time { return now } + + cfg, err := service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: 
"cluster-1", + NodeID: "node-a", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if len(cfg.RendezvousLeases) != 1 { + t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases) + } + lease := cfg.RendezvousLeases[0] + if lease.RelayNodeID != "node-r-new" || + lease.LeaseID != "route-a-b-rv-node-b-via-node-r-new" || + lease.Reason != "stale_relay_replacement" { + t.Fatalf("stale relay was not replaced: %+v", lease) + } + var metadata map[string]any + if err := json.Unmarshal(lease.Metadata, &metadata); err != nil { + t.Fatalf("unmarshal lease metadata: %v", err) + } + if metadata["replacement_for_stale_relay"] != true || + metadata["relay_replacement_contract"] != "stale_relay_feedback_policy" { + t.Fatalf("replacement metadata missing: %+v", metadata) + } + if cfg.RendezvousRelayPolicy == nil || + cfg.RendezvousRelayPolicy.StaleRelayCount != 1 || + cfg.RendezvousRelayPolicy.WithdrawnLeaseCount != 1 || + cfg.RendezvousRelayPolicy.ReplacementLeaseCount != 1 { + t.Fatalf("unexpected relay policy report: %+v", cfg.RendezvousRelayPolicy) + } + var decision RendezvousRelayPolicyDecision + for _, item := range cfg.RendezvousRelayPolicy.Decisions { + if item.Reason == "stale_relay_replacement" { + decision = item + break + } + } + if decision.SelectedRelayID != "node-r-new" || decision.StaleRelayNodeID != "node-r-old" { + t.Fatalf("unexpected relay replacement decision: %+v", cfg.RendezvousRelayPolicy.Decisions) + } + if cfg.RoutePathDecisions == nil || + cfg.RoutePathDecisions.SchemaVersion != "c17z18.route_path_decisions.v1" || + cfg.RoutePathDecisions.DecisionCount != 1 || + cfg.RoutePathDecisions.ReplacementDecisionCount != 1 { + t.Fatalf("unexpected route path decisions: %+v", cfg.RoutePathDecisions) + } + pathDecision := cfg.RoutePathDecisions.Decisions[0] + if pathDecision.DecisionSource != "stale_relay_replacement" || + pathDecision.SelectedRelayID != "node-r-new" || + pathDecision.StaleRelayNodeID != "node-r-old" || + 
pathDecision.RendezvousPeerNodeID != "node-b" || + pathDecision.RendezvousLeaseID != "route-a-b-rv-node-b-via-node-r-new" || + pathDecision.NextHopID != "node-r-new" || + pathDecision.ProductionForwarding || + !pathDecision.ControlPlaneOnly || + strings.Join(pathDecision.EffectiveHops, ",") != "node-a,node-r-new,node-b" { + t.Fatalf("unexpected route path decision: %+v", pathDecision) + } +} + +func TestGetNodeSyntheticMeshConfigAppliesReplacementPathHintForExit(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 30, 0, 0, time.UTC) + hintMetadata, err := json.Marshal(map[string]any{ + "mesh_route_path_decision_report": map[string]any{ + "cluster_id": "cluster-1", + "node_id": "node-a", + "decisions": []map[string]any{ + { + "decision_id": "route-a-b-path-node-a-via-node-r-new", + "route_id": "route-a-b", + "cluster_id": "cluster-1", + "local_node_id": "node-a", + "source_node_id": "node-a", + "destination_node_id": "node-b", + "original_hops": []string{"node-a", "node-r-old", "node-r-new", "node-b"}, + "effective_hops": []string{"node-a", "node-r-new", "node-b"}, + "next_hop_id": "node-r-new", + "local_role": "entry", + "selected_relay_id": "node-r-new", + "selected_relay_endpoint": "http://node-r-new:19000", + "stale_relay_node_id": "node-r-old", + "rendezvous_peer_node_id": "node-b", + "rendezvous_lease_id": "route-a-b-rv-node-b-via-node-r-new", + "rendezvous_lease_reason": "stale_relay_replacement", + "decision_source": "stale_relay_replacement", + "generation": "hint-generation", + "path_score": 900, + "score_reasons": []string{"route_path_decision_hint"}, + "control_plane_only": true, + "production_forwarding": false, + "expires_at": now.Add(time.Hour).UTC().Format(time.RFC3339Nano), + }, + }, + }, + }) + if err != nil { + t.Fatalf("marshal hint metadata: %v", err) + } + service := NewService(&fakeRepository{ + testingFlags: EffectiveNodeTestingFlags{ + Enabled: true, + SyntheticLinksEnabled: true, + }, + heartbeats: map[string][]NodeHeartbeat{ + "node-a": { 
+ { + ClusterID: "cluster-1", + NodeID: "node-a", + Metadata: hintMetadata, + ObservedAt: now.Add(-10 * time.Second), + }, + }, + }, + routeIntents: []MeshRouteIntent{ + { + ID: "route-a-b", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-a"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-b"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-a", "node-r-old", "node-r-new", "node-b"], + "allowed_channels": ["fabric_control", "route_control"], + "peer_endpoints": { + "node-r-old": "http://node-r-old:19000", + "node-r-new": "http://node-r-new:19000" + }, + "peer_endpoint_candidates": { + "node-b": [ + { + "endpoint_id": "node-b-outbound", + "node_id": "node-b", + "transport": "outbound_reverse", + "address": "node-b.reverse.local", + "reachability": "outbound_only", + "nat_type": "symmetric", + "connectivity_mode": "outbound_only", + "priority": 5 + } + ] + }, + "rendezvous_leases": [ + { + "lease_id": "route-a-b-rv-node-b-via-node-r-old", + "peer_node_id": "node-b", + "relay_node_id": "node-r-old", + "relay_endpoint": "http://node-r-old:19000", + "priority": 4 + } + ] + }`), + UpdatedAt: now, + }, + }, + }) + service.now = func() time.Time { return now } + + cfg, err := service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: "cluster-1", + NodeID: "node-b", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if len(cfg.RendezvousLeases) != 1 || + cfg.RendezvousLeases[0].RelayNodeID != "node-r-new" || + cfg.RendezvousLeases[0].Reason != "stale_relay_replacement" { + t.Fatalf("replacement hint did not withdraw stale relay lease: %+v", cfg.RendezvousLeases) + } + if cfg.RoutePathDecisions == nil || + cfg.RoutePathDecisions.ReplacementDecisionCount != 1 || + len(cfg.RoutePathDecisions.Decisions) != 1 { + t.Fatalf("unexpected route path decisions: %+v", cfg.RoutePathDecisions) + } + 
decision := cfg.RoutePathDecisions.Decisions[0] + if decision.DecisionSource != "stale_relay_replacement" || + decision.LocalRole != "exit" || + decision.PreviousHopID != "node-r-new" || + decision.SelectedRelayID != "node-r-new" || + decision.StaleRelayNodeID != "node-r-old" || + decision.RendezvousPeerNodeID != "node-b" || + strings.Join(decision.EffectiveHops, ",") != "node-a,node-r-new,node-b" { + t.Fatalf("unexpected hinted route path decision: %+v", decision) + } +} + +func TestGetNodeSyntheticMeshConfigUsesRouteHealthDriftToReselectRelay(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 30, 0, 0, time.UTC) + routeHealthMetadata, err := json.Marshal(map[string]any{ + "observation_type": "synthetic_route_health", + "route_id": "route-a-b", + "route_path_decision_applied": true, + "route_path_decision_selected_relay_id": "node-s", + "route_path_decision_rendezvous_peer_node_id": "node-b", + "route_path_decision_rendezvous_lease_id": "route-a-b-rv-node-b-via-node-s", + "route_path_decision_rendezvous_lease_reason": "auto_rendezvous_required", + "expected_effective_hops": []string{"node-a", "node-s", "node-b"}, + "observed_ack_path": []string{"node-a", "node-t", "node-b"}, + "route_path_drift_detected": true, + "control_plane_only": true, + "production_forwarding": false, + "production_payload_forwarding": false, + "route_health_production_payload_forwarding": false, + "route_health_service_payload_forwarding": false, + "synthetic_route_health_route_path_runtime": true, + "production_route_path_forwarding_runtime": false, + "route_health_route_config_contract": "control_plane_route_path_decisions_to_synthetic_route_health", + }) + if err != nil { + t.Fatalf("marshal route health metadata: %v", err) + } + service := NewService(&fakeRepository{ + testingFlags: EffectiveNodeTestingFlags{ + Enabled: true, + SyntheticLinksEnabled: true, + }, + meshLinks: []MeshLinkObservation{ + { + ClusterID: "cluster-1", + SourceNodeID: "node-a", + TargetNodeID: "node-b", + 
LinkStatus: "reachable", + Metadata: routeHealthMetadata, + ObservedAt: now.Add(-10 * time.Second), + }, + }, + routeIntents: []MeshRouteIntent{ + { + ID: "route-a-b", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-a"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-b"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-a", "node-s", "node-t", "node-b"], + "allowed_channels": ["fabric_control", "route_control"], + "peer_endpoint_candidates": { + "node-b": [ + { + "endpoint_id": "node-b-outbound", + "node_id": "node-b", + "transport": "outbound_reverse", + "address": "node-b.reverse.local", + "reachability": "outbound_only", + "nat_type": "symmetric", + "connectivity_mode": "outbound_only", + "priority": 5 + } + ], + "node-s": [ + { + "endpoint_id": "node-s-public", + "node_id": "node-s", + "transport": "direct_tcp_tls", + "address": "http://node-s:19000", + "reachability": "public", + "nat_type": "none", + "connectivity_mode": "direct", + "priority": 1, + "policy_tags": ["fast-path"] + } + ], + "node-t": [ + { + "endpoint_id": "node-t-public", + "node_id": "node-t", + "transport": "direct_tcp_tls", + "address": "http://node-t:19000", + "reachability": "public", + "nat_type": "none", + "connectivity_mode": "direct", + "priority": 50 + } + ] + } + }`), + UpdatedAt: now, + }, + }, + }) + service.now = func() time.Time { return now } + + cfg, err := service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: "cluster-1", + NodeID: "node-a", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if len(cfg.RendezvousLeases) != 1 { + t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases) + } + lease := cfg.RendezvousLeases[0] + if lease.RelayNodeID != "node-t" || lease.Reason != "stale_relay_replacement" { + t.Fatalf("route health drift did not reselect relay: %+v", lease) + } + if 
cfg.RendezvousRelayPolicy == nil || + cfg.RendezvousRelayPolicy.StaleRelayCount != 1 || + cfg.RendezvousRelayPolicy.ReplacementLeaseCount != 1 || + cfg.RendezvousRelayPolicy.ScoringMode != "route_adjacency_endpoint_priority_mesh_link_health_synthetic_route_health_feedback" { + t.Fatalf("unexpected relay policy report: %+v", cfg.RendezvousRelayPolicy) + } + var policyDecision RendezvousRelayPolicyDecision + for _, item := range cfg.RendezvousRelayPolicy.Decisions { + if item.Reason == "stale_relay_replacement" { + policyDecision = item + break + } + } + if policyDecision.StaleRelayNodeID != "node-s" || policyDecision.SelectedRelayID != "node-t" || policyDecision.PeerNodeID != "node-b" { + t.Fatalf("unexpected route health replacement decision: %+v", cfg.RendezvousRelayPolicy.Decisions) + } + if cfg.RoutePathDecisions == nil || cfg.RoutePathDecisions.ReplacementDecisionCount != 1 { + t.Fatalf("expected replacement route path decision: %+v", cfg.RoutePathDecisions) + } + decision := cfg.RoutePathDecisions.Decisions[0] + if decision.SelectedRelayID != "node-t" || + decision.StaleRelayNodeID != "node-s" || + decision.RendezvousPeerNodeID != "node-b" || + strings.Join(decision.EffectiveHops, ",") != "node-a,node-t,node-b" || + decision.ProductionForwarding || + !decision.ControlPlaneOnly { + t.Fatalf("unexpected route path decision from route health feedback: %+v", decision) + } +} + +func TestGetNodeSyntheticMeshConfigUsesRouteHealthLatencyForRelayScore(t *testing.T) { + now := time.Date(2026, 4, 28, 12, 30, 0, 0, time.UTC) + routeHealthMetadata, err := json.Marshal(map[string]any{ + "observation_type": "synthetic_route_health", + "route_id": "route-a-b", + "route_path_decision_applied": true, + "route_path_decision_selected_relay_id": "node-t", + "route_path_decision_rendezvous_peer_node_id": "node-b", + "expected_effective_hops": []string{"node-a", "node-t", "node-b"}, + "observed_ack_path": []string{"node-a", "node-t", "node-b"}, + "route_path_drift_detected": false, 
+ "control_plane_only": true, + "production_forwarding": false, + "production_payload_forwarding": false, + "route_health_production_payload_forwarding": false, + "route_health_service_payload_forwarding": false, + }) + if err != nil { + t.Fatalf("marshal route health metadata: %v", err) + } + latency := 5 + quality := 99 + service := NewService(&fakeRepository{ + testingFlags: EffectiveNodeTestingFlags{ + Enabled: true, + SyntheticLinksEnabled: true, + }, + meshLinks: []MeshLinkObservation{ + { + ClusterID: "cluster-1", + SourceNodeID: "node-a", + TargetNodeID: "node-b", + LinkStatus: "reachable", + LatencyMs: &latency, + QualityScore: &quality, + Metadata: routeHealthMetadata, + ObservedAt: now.Add(-10 * time.Second), + }, + }, + routeIntents: []MeshRouteIntent{ + { + ID: "route-a-b", + ClusterID: "cluster-1", + SourceSelector: json.RawMessage(`{"node_id":"node-a"}`), + DestinationSelector: json.RawMessage(`{"node_id":"node-b"}`), + ServiceClass: "synthetic", + Status: "active", + Policy: json.RawMessage(`{ + "synthetic_enabled": true, + "hops": ["node-a", "node-s", "node-t", "node-b"], + "allowed_channels": ["fabric_control", "route_control"], + "peer_endpoint_candidates": { + "node-b": [ + { + "endpoint_id": "node-b-outbound", + "node_id": "node-b", + "transport": "outbound_reverse", + "address": "node-b.reverse.local", + "reachability": "outbound_only", + "nat_type": "symmetric", + "connectivity_mode": "outbound_only", + "priority": 5 + } + ], + "node-s": [ + { + "endpoint_id": "node-s-public", + "node_id": "node-s", + "transport": "direct_tcp_tls", + "address": "http://node-s:19000", + "reachability": "public", + "nat_type": "none", + "connectivity_mode": "direct", + "priority": 1, + "policy_tags": ["fast-path"] + } + ], + "node-t": [ + { + "endpoint_id": "node-t-public", + "node_id": "node-t", + "transport": "direct_tcp_tls", + "address": "http://node-t:19000", + "reachability": "public", + "nat_type": "none", + "connectivity_mode": "direct", + "priority": 
50 + } + ] + } + }`), + UpdatedAt: now, + }, + }, + }) + service.now = func() time.Time { return now } + + cfg, err := service.GetNodeSyntheticMeshConfig(context.Background(), GetNodeSyntheticMeshConfigInput{ + ClusterID: "cluster-1", + NodeID: "node-a", + }) + if err != nil { + t.Fatalf("get synthetic config: %v", err) + } + if len(cfg.RendezvousLeases) != 1 { + t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases) + } + lease := cfg.RendezvousLeases[0] + if lease.RelayNodeID != "node-t" || lease.Reason == "stale_relay_replacement" { + t.Fatalf("route health latency did not influence relay score: %+v", lease) + } + var metadata map[string]any + if err := json.Unmarshal(lease.Metadata, &metadata); err != nil { + t.Fatalf("unmarshal lease metadata: %v", err) + } + reasons, _ := metadata["relay_selection_score_reasons"].([]any) + if !anyString(reasons, "route_health_reachable") || + !anyString(reasons, "route_health_no_drift") || + !anyString(reasons, "route_health_latency") { + t.Fatalf("route health score reasons missing: %+v", metadata) + } +} + +func anyString(values []any, want string) bool { + for _, value := range values { + if text, ok := value.(string); ok && text == want { + return true + } + } + return false +} + +func findPeerDirectoryEntry(entries []PeerDirectoryEntry, nodeID string) (PeerDirectoryEntry, bool) { + for _, entry := range entries { + if entry.NodeID == nodeID { + return entry, true + } + } + return PeerDirectoryEntry{}, false +} + +func TestValidatePeerEndpointCandidates(t *testing.T) { + valid := map[string][]PeerEndpointCandidate{ + "node-b": { + { + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + AddressFamily: "ipv4", + Reachability: "public", + NATType: "restricted", + ConnectivityMode: "direct", + Priority: 10, + Metadata: json.RawMessage(`{"source":"test"}`), + }, + }, + } + if err := validatePeerEndpointCandidates(valid, []string{"node-a", "node-b"}); err 
!= nil { + t.Fatalf("validate valid candidates: %v", err) + } + + tests := []struct { + name string + candidates map[string][]PeerEndpointCandidate + }{ + { + name: "unknown transport", + candidates: map[string][]PeerEndpointCandidate{"node-b": {{ + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "udp-hole-punch", + Address: "203.0.113.20:443", + Reachability: "public", + ConnectivityMode: "direct", + }}}, + }, + { + name: "unknown nat", + candidates: map[string][]PeerEndpointCandidate{"node-b": {{ + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + NATType: "mystery_nat", + ConnectivityMode: "direct", + }}}, + }, + { + name: "node outside route path", + candidates: map[string][]PeerEndpointCandidate{"node-y": {{ + EndpointID: "node-y-public", + NodeID: "node-y", + Transport: "direct_tcp_tls", + Address: "203.0.113.30:443", + Reachability: "public", + ConnectivityMode: "direct", + }}}, + }, + { + name: "node mismatch", + candidates: map[string][]PeerEndpointCandidate{"node-b": {{ + EndpointID: "node-b-public", + NodeID: "node-c", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + ConnectivityMode: "direct", + }}}, + }, + { + name: "invalid metadata", + candidates: map[string][]PeerEndpointCandidate{"node-b": {{ + EndpointID: "node-b-public", + NodeID: "node-b", + Transport: "direct_tcp_tls", + Address: "203.0.113.20:443", + Reachability: "public", + ConnectivityMode: "direct", + Metadata: json.RawMessage(`{`), + }}}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validatePeerEndpointCandidates(tt.candidates, []string{"node-a", "node-b"}) + if !errors.Is(err, ErrInvalidPayload) { + t.Fatalf("err = %v, want ErrInvalidPayload", err) + } + }) + } +} + +func TestMinorityClusterBlocksPolicyMutation(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + authorityState: 
ClusterAuthorityState{ + ClusterID: "cluster-1", + AuthorityState: "minority", + MutationMode: "read_only", + }, + } + service := NewService(store) + + _, err := service.AssignNodeRole(context.Background(), AssignNodeRoleInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + NodeID: "node-1", + Role: "rdp-worker", + }) + if !errors.Is(err, ErrClusterReadOnly) { + t.Fatalf("err = %v, want ErrClusterReadOnly", err) + } +} + +func TestRecoveryAdminCanMutateReadOnlyCluster(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleRecoveryAdmin, + authorityState: ClusterAuthorityState{ + ClusterID: "cluster-1", + AuthorityState: "isolated", + MutationMode: "read_only", + }, + } + service := NewService(store) + + _, err := service.AssignNodeRole(context.Background(), AssignNodeRoleInput{ + ActorUserID: "recovery-1", + ClusterID: "cluster-1", + NodeID: "node-1", + Role: "rdp-worker", + }) + if err != nil { + t.Fatalf("recovery admin mutate: %v", err) + } +} + +func TestCreateVPNConnectionRequiresPlatformAdmin(t *testing.T) { + store := &fakeRepository{platformRole: "user"} + service := NewService(store) + + _, err := service.CreateVPNConnection(context.Background(), CreateVPNConnectionInput{ + ActorUserID: "user-1", + ClusterID: "cluster-1", + OrganizationID: "org-1", + Name: "office-a", + }) + if !errors.Is(err, ErrAccessDenied) { + t.Fatalf("err = %v, want ErrAccessDenied", err) + } +} + +func TestCreateVPNConnectionDefaultsToDisabledSingleActive(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + item, err := service.CreateVPNConnection(context.Background(), CreateVPNConnectionInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + OrganizationID: "org-1", + Name: "office-a", + }) + if err != nil { + t.Fatalf("create vpn connection: %v", err) + } + if item.Mode != VPNConnectionModeSingleActive || item.DesiredState != VPNConnectionDesiredDisabled { + t.Fatalf("unexpected defaults: %+v", 
item) + } + if string(store.lastVPNConnectionInput.AllowedNodePolicy) == "" || string(store.lastVPNConnectionInput.RoutingUsage) == "" { + t.Fatalf("expected default json policies, got %+v", store.lastVPNConnectionInput) + } +} + +func TestCreateVPNConnectionRequiresClusterAndOrganizationScope(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + _, err := service.CreateVPNConnection(context.Background(), CreateVPNConnectionInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + Name: "office-a", + }) + if !errors.Is(err, ErrInvalidPayload) { + t.Fatalf("err = %v, want ErrInvalidPayload", err) + } +} + +func TestCreateVPNConnectionBlockedInReadOnlyCluster(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + authorityState: ClusterAuthorityState{ + ClusterID: "cluster-1", + AuthorityState: "minority", + MutationMode: "read_only", + }, + } + service := NewService(store) + + _, err := service.CreateVPNConnection(context.Background(), CreateVPNConnectionInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + OrganizationID: "org-1", + Name: "office-a", + }) + if !errors.Is(err, ErrClusterReadOnly) { + t.Fatalf("err = %v, want ErrClusterReadOnly", err) + } +} + +func TestAcquireVPNLeaseRequiresEnabledConnection(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + vpnConnection: VPNConnection{ + ID: "vpn-1", + ClusterID: "cluster-1", + Mode: VPNConnectionModeSingleActive, + DesiredState: VPNConnectionDesiredDisabled, + }, + } + service := NewService(store) + + _, err := service.AcquireVPNConnectionLease(context.Background(), AcquireVPNConnectionLeaseInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + OwnerNodeID: "node-1", + }) + if err == nil || !strings.Contains(err.Error(), "enabled single_active") { + t.Fatalf("err = %v, want enabled single_active validation", err) + } +} + +func 
TestAcquireVPNLeaseRejectsSecondActiveOwner(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + vpnConnection: VPNConnection{ + ID: "vpn-1", + ClusterID: "cluster-1", + Mode: VPNConnectionModeSingleActive, + DesiredState: VPNConnectionDesiredEnabled, + }, + acquireVPNLeaseErr: ErrVPNLeaseAlreadyActive, + } + service := NewService(store) + + _, err := service.AcquireVPNConnectionLease(context.Background(), AcquireVPNConnectionLeaseInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + OwnerNodeID: "node-2", + }) + if !errors.Is(err, ErrVPNLeaseAlreadyActive) { + t.Fatalf("err = %v, want ErrVPNLeaseAlreadyActive", err) + } +} + +func TestAcquireVPNLeaseRejectsOwnerOutsideAllowedPolicy(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + vpnConnection: VPNConnection{ + ID: "vpn-1", + ClusterID: "cluster-1", + Mode: VPNConnectionModeSingleActive, + DesiredState: VPNConnectionDesiredEnabled, + }, + ownerEligibility: VPNLeaseOwnerEligibility{ + VPNConnectionID: "vpn-1", + ClusterID: "cluster-1", + OwnerNodeID: "node-1", + MembershipStatus: "active", + NodeRegistrationStatus: NodeRegistrationActive, + AllowedByPolicy: false, + HasAuthorizedRole: true, + }, + } + service := NewService(store) + + _, err := service.AcquireVPNConnectionLease(context.Background(), AcquireVPNConnectionLeaseInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + OwnerNodeID: "node-1", + }) + if !errors.Is(err, ErrVPNLeaseOwnerNotAllowed) { + t.Fatalf("err = %v, want ErrVPNLeaseOwnerNotAllowed", err) + } +} + +func TestAcquireVPNLeaseRejectsOwnerWithoutVPNRole(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + vpnConnection: VPNConnection{ + ID: "vpn-1", + ClusterID: "cluster-1", + Mode: VPNConnectionModeSingleActive, + DesiredState: VPNConnectionDesiredEnabled, + }, + ownerEligibility: VPNLeaseOwnerEligibility{ + VPNConnectionID: "vpn-1", + 
ClusterID: "cluster-1", + OwnerNodeID: "node-1", + MembershipStatus: "active", + NodeRegistrationStatus: NodeRegistrationActive, + AllowedByPolicy: true, + HasAuthorizedRole: false, + }, + } + service := NewService(store) + + _, err := service.AcquireVPNConnectionLease(context.Background(), AcquireVPNConnectionLeaseInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + OwnerNodeID: "node-1", + }) + if !errors.Is(err, ErrVPNLeaseOwnerRoleRequired) { + t.Fatalf("err = %v, want ErrVPNLeaseOwnerRoleRequired", err) + } +} + +func TestAcquireVPNLeaseRejectsWrongCluster(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + vpnConnection: VPNConnection{ + ID: "vpn-1", + ClusterID: "cluster-1", + Mode: VPNConnectionModeSingleActive, + DesiredState: VPNConnectionDesiredEnabled, + }, + ownerEligibilityErr: pgx.ErrNoRows, + } + service := NewService(store) + + _, err := service.AcquireVPNConnectionLease(context.Background(), AcquireVPNConnectionLeaseInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-other", + VPNConnectionID: "vpn-1", + OwnerNodeID: "node-1", + }) + if !errors.Is(err, ErrInvalidVPNConnection) { + t.Fatalf("err = %v, want ErrInvalidVPNConnection", err) + } +} + +func TestRenewVPNLeaseRejectsExpiredLease(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + ownerEligibility: VPNLeaseOwnerEligibility{ + VPNConnectionID: "vpn-1", + ClusterID: "cluster-1", + OwnerNodeID: "node-1", + MembershipStatus: "active", + NodeRegistrationStatus: NodeRegistrationActive, + AllowedByPolicy: true, + HasAuthorizedRole: true, + }, + renewVPNLeaseErr: pgx.ErrNoRows, + } + service := NewService(store) + + _, err := service.RenewVPNConnectionLease(context.Background(), RenewVPNConnectionLeaseInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + LeaseID: "lease-1", + OwnerNodeID: "node-1", + FencingToken: "token-1", + }) + if !errors.Is(err, 
ErrInvalidVPNLease) { + t.Fatalf("err = %v, want ErrInvalidVPNLease", err) + } +} + +func TestFenceVPNLeaseRequiresRecoveryAdmin(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + _, err := service.FenceVPNConnectionLease(context.Background(), FenceVPNConnectionLeaseInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + LeaseID: "lease-1", + }) + if !errors.Is(err, ErrAccessDenied) { + t.Fatalf("err = %v, want ErrAccessDenied", err) + } +} + +func TestExpireStaleVPNConnectionLeasesAuditsEachExpiredLease(t *testing.T) { + store := &fakeRepository{ + platformRole: PlatformRoleAdmin, + expiredVPNLeases: []VPNConnectionLease{ + {ID: "lease-1", ClusterID: "cluster-1", VPNConnectionID: "vpn-1", Status: VPNLeaseStatusExpired}, + {ID: "lease-2", ClusterID: "cluster-1", VPNConnectionID: "vpn-2", Status: VPNLeaseStatusExpired}, + }, + } + service := NewService(store) + + items, err := service.ExpireStaleVPNConnectionLeases(context.Background(), ExpireStaleVPNConnectionLeasesInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + }) + if err != nil { + t.Fatalf("expire stale vpn leases: %v", err) + } + if got, want := len(items), 2; got != want { + t.Fatalf("expired leases = %d, want %d", got, want) + } + var auditCount int + for _, event := range store.auditEvents { + if event.EventType == "vpn_connection.lease_expired" { + auditCount++ + } + } + if got, want := auditCount, 2; got != want { + t.Fatalf("lease_expired audit count = %d, want %d", got, want) + } +} + +func TestSetVPNConnectionAllowedNodesDeduplicatesScope(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + items, err := service.SetVPNConnectionAllowedNodes(context.Background(), SetVPNConnectionAllowedNodesInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + NodeIDs: []string{"node-1", "node-1", " ", "node-2"}, + }) + if 
err != nil { + t.Fatalf("set allowed nodes: %v", err) + } + if got, want := len(store.lastAllowedNodesInput.NodeIDs), 2; got != want { + t.Fatalf("deduped nodes = %d, want %d", got, want) + } + if got, want := len(items), 2; got != want { + t.Fatalf("allowed nodes returned = %d, want %d", got, want) + } +} + +func TestUpsertVPNRoutePolicyRejectsInvalidType(t *testing.T) { + store := &fakeRepository{platformRole: PlatformRoleAdmin} + service := NewService(store) + + _, err := service.UpsertVPNConnectionRoutePolicy(context.Background(), UpsertVPNConnectionRoutePolicyInput{ + ActorUserID: "admin-1", + ClusterID: "cluster-1", + VPNConnectionID: "vpn-1", + RouteType: "submarine", + Destination: "10.0.0.0/24", + }) + if !errors.Is(err, ErrInvalidPayload) { + t.Fatalf("err = %v, want ErrInvalidPayload", err) + } +} + +func TestListNodeVPNAssignmentsDoesNotRequirePlatformAdmin(t *testing.T) { + store := &fakeRepository{ + platformRole: "user", + nodeVPNAssignments: []NodeVPNAssignment{ + {VPNConnectionID: "vpn-1", ClusterID: "cluster-1", OrganizationID: "org-1", AssignmentReason: "eligible_candidate"}, + }, + } + service := NewService(store) + + items, err := service.ListNodeVPNAssignments(context.Background(), "cluster-1", "node-1") + if err != nil { + t.Fatalf("list node vpn assignments: %v", err) + } + if got, want := len(items), 1; got != want { + t.Fatalf("assignments = %d, want %d", got, want) + } +} + +func TestReportNodeVPNAssignmentStatusRejectsInvisibleAssignment(t *testing.T) { + store := &fakeRepository{} + service := NewService(store) + + _, err := service.ReportNodeVPNAssignmentStatus(context.Background(), ReportNodeVPNAssignmentStatusInput{ + ClusterID: "cluster-1", + NodeID: "node-1", + VPNConnectionID: "vpn-foreign", + ObservedStatus: VPNAssignmentStatusAssigned, + }) + if !errors.Is(err, ErrVPNLeaseOwnerNotAllowed) { + t.Fatalf("err = %v, want ErrVPNLeaseOwnerNotAllowed", err) + } +} + +func TestReportNodeVPNAssignmentStatusAcceptsExplicitStates(t 
*testing.T) { + store := &fakeRepository{ + nodeVPNAssignments: []NodeVPNAssignment{ + {VPNConnectionID: "vpn-1", ClusterID: "cluster-1", OrganizationID: "org-1"}, + }, + } + service := NewService(store) + + status, err := service.ReportNodeVPNAssignmentStatus(context.Background(), ReportNodeVPNAssignmentStatusInput{ + ClusterID: "cluster-1", + NodeID: "node-1", + VPNConnectionID: "vpn-1", + ObservedStatus: VPNAssignmentStatusLeaseRequired, + StatusPayload: json.RawMessage(`{"reason":"no_lease"}`), + }) + if err != nil { + t.Fatalf("report node vpn assignment status: %v", err) + } + if status.ObservedStatus != VPNAssignmentStatusLeaseRequired { + t.Fatalf("ObservedStatus = %q, want %q", status.ObservedStatus, VPNAssignmentStatusLeaseRequired) + } +} + +func TestReportNodeVPNAssignmentStatusRejectsInvalidStatus(t *testing.T) { + store := &fakeRepository{ + nodeVPNAssignments: []NodeVPNAssignment{ + {VPNConnectionID: "vpn-1", ClusterID: "cluster-1", OrganizationID: "org-1"}, + }, + } + service := NewService(store) + + _, err := service.ReportNodeVPNAssignmentStatus(context.Background(), ReportNodeVPNAssignmentStatusInput{ + ClusterID: "cluster-1", + NodeID: "node-1", + VPNConnectionID: "vpn-1", + ObservedStatus: "running_tunnel", + }) + if !errors.Is(err, ErrInvalidPayload) { + t.Fatalf("err = %v, want ErrInvalidPayload", err) + } +} + +type fakeRepository struct { + platformRole string + lastTokenHash string + validTokenErr error + createJoinRequestID string + bootstrapJoinRequest NodeJoinRequest + clusterAuthority ClusterAuthorityKey + lastTokenAuthority json.RawMessage + lastApprovalAuthority json.RawMessage + authorityState ClusterAuthorityState + vpnConnection VPNConnection + lastVPNConnectionInput CreateVPNConnectionInput + lastAllowedNodesInput SetVPNConnectionAllowedNodesInput + lastAttachInput AttachExistingNodeInput + lastNodeGroupInput CreateNodeGroupInput + lastAssignGroupInput AssignNodeGroupInput + lastEntryPointInput CreateFabricEntryPointInput + 
lastEgressPoolInput CreateFabricEgressPoolInput + acquireVPNLeaseErr error + ownerEligibility VPNLeaseOwnerEligibility + ownerEligibilityErr error + renewVPNLeaseErr error + expiredVPNLeases []VPNConnectionLease + nodeVPNAssignments []NodeVPNAssignment + testingFlags EffectiveNodeTestingFlags + routeIntents []MeshRouteIntent + meshLinks []MeshLinkObservation + heartbeats map[string][]NodeHeartbeat + auditEvents []ClusterAuditEvent +} + +func (f *fakeRepository) GetPlatformRole(context.Context, string) (string, error) { + return f.platformRole, nil +} + +func (f *fakeRepository) ListClusters(context.Context) ([]Cluster, error) { + return nil, nil +} + +func (f *fakeRepository) GetCluster(context.Context, string) (Cluster, error) { + return Cluster{}, nil +} + +func (f *fakeRepository) CreateCluster(context.Context, CreateClusterInput) (Cluster, error) { + return Cluster{}, nil +} + +func (f *fakeRepository) UpdateCluster(_ context.Context, input UpdateClusterInput) (Cluster, error) { + return Cluster{ + ID: input.ClusterID, + Slug: "cluster-1", + Name: input.Name, + Status: input.Status, + Region: input.Region, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) GetClusterAuthority(_ context.Context, clusterID string) (ClusterAuthorityKey, error) { + if f.clusterAuthority.PrivateKey == "" { + keys, err := clusterauth.GenerateKeyPair() + if err != nil { + return ClusterAuthorityKey{}, err + } + f.clusterAuthority = ClusterAuthorityKey{ + ClusterAuthorityDescriptor: ClusterAuthorityDescriptor{ + SchemaVersion: clusterauth.AuthoritySchemaVersion, + ClusterID: clusterID, + AuthorityState: "active", + KeyAlgorithm: clusterauth.AlgorithmEd25519, + PublicKey: keys.PublicKeyB64, + PublicKeyFingerprint: keys.Fingerprint, + CreatedAt: time.Now().UTC(), + UpdatedAt: time.Now().UTC(), + }, + PrivateKey: keys.PrivateKeyB64, + } + } + if f.clusterAuthority.ClusterID == "" { + f.clusterAuthority.ClusterID = clusterID + } + return f.clusterAuthority, nil +} + +func 
(f *fakeRepository) EnsureClusterAuthority(ctx context.Context, clusterID string, _ *string) (ClusterAuthorityKey, error) { + return f.GetClusterAuthority(ctx, clusterID) +} + +func (f *fakeRepository) ListClusterNodes(context.Context, string) ([]ClusterNode, error) { + return nil, nil +} + +func (f *fakeRepository) ListNodeGroups(context.Context, string) ([]ClusterNodeGroup, error) { + return nil, nil +} + +func (f *fakeRepository) CreateNodeGroup(_ context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error) { + f.lastNodeGroupInput = input + return ClusterNodeGroup{ + ID: "group-1", + ClusterID: input.ClusterID, + ParentGroupID: input.ParentGroupID, + Name: input.Name, + Description: input.Description, + SortOrder: input.SortOrder, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) AssignNodeToGroup(_ context.Context, input AssignNodeGroupInput) (ClusterNode, error) { + f.lastAssignGroupInput = input + return ClusterNode{ + ID: input.NodeID, + NodeKey: "node-key-1", + Name: "Node One", + RegistrationStatus: NodeRegistrationActive, + MembershipStatus: "active", + NodeGroupID: input.GroupID, + }, nil +} + +func (f *fakeRepository) CreateJoinToken(_ context.Context, input CreateJoinTokenInput, tokenHash string) (NodeJoinToken, error) { + f.lastTokenHash = tokenHash + return NodeJoinToken{ + ID: "token-1", + ClusterID: input.ClusterID, + Scope: input.Scope, + ExpiresAt: input.ExpiresAt, + MaxUses: input.MaxUses, + Status: "active", + CreatedByUserID: &input.ActorUserID, + CreatedAt: time.Now().UTC(), + }, nil +} + +func (f *fakeRepository) SetJoinTokenAuthority(_ context.Context, clusterID, tokenID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinToken, error) { + f.lastTokenAuthority = payload + return NodeJoinToken{ + ID: tokenID, + ClusterID: clusterID, + Scope: json.RawMessage(`{"roles":["rdp-worker"]}`), + ExpiresAt: time.Now().UTC().Add(time.Hour), + MaxUses: 1, + Status: "active", + AuthorityPayload: payload, + 
AuthoritySignature: &signature, + }, nil +} + +func (f *fakeRepository) GetValidJoinTokenByHash(context.Context, string, string) (NodeJoinToken, error) { + if f.validTokenErr != nil { + return NodeJoinToken{}, f.validTokenErr + } + return NodeJoinToken{ID: "token-1", Status: "active", ExpiresAt: time.Now().Add(time.Hour), MaxUses: 1}, nil +} + +func (f *fakeRepository) RevokeJoinToken(context.Context, RevokeJoinTokenInput) (NodeJoinToken, error) { + return NodeJoinToken{ID: "token-1", Status: "revoked"}, nil +} + +func (f *fakeRepository) ExpireJoinTokens(context.Context, string) error { + return nil +} + +func (f *fakeRepository) CreateJoinRequest(_ context.Context, input CreateJoinRequestInput, joinTokenID string) (NodeJoinRequest, error) { + id := f.createJoinRequestID + if id == "" { + id = "join-request-1" + } + return NodeJoinRequest{ + ID: id, + ClusterID: input.ClusterID, + JoinTokenID: &joinTokenID, + NodeName: input.NodeName, + NodeFingerprint: input.NodeFingerprint, + PublicKey: input.PublicKey, + ReportedCapabilities: input.ReportedCapabilities, + ReportedFacts: input.ReportedFacts, + RequestedRoles: input.RequestedRoles, + Status: JoinRequestStatusPending, + }, nil +} + +func (f *fakeRepository) GetJoinRequestForBootstrap(context.Context, GetJoinRequestBootstrapInput) (NodeJoinRequest, error) { + if f.bootstrapJoinRequest.ID != "" { + return f.bootstrapJoinRequest, nil + } + return NodeJoinRequest{ID: "join-request-1", ClusterID: "cluster-1", Status: JoinRequestStatusPending}, nil +} + +func (f *fakeRepository) ListJoinRequests(context.Context, string) ([]NodeJoinRequest, error) { + return nil, nil +} + +func (f *fakeRepository) ApproveJoinRequest(_ context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error) { + return ApprovedJoinRequest{ + JoinRequest: NodeJoinRequest{ID: input.JoinRequestID, ClusterID: input.ClusterID, Status: JoinRequestStatusApproved, ApprovedNodeID: &input.NodeKey}, + Bootstrap: NodeBootstrap{NodeID: 
input.NodeKey, ClusterID: input.ClusterID, IdentityStatus: "active"}, + }, nil +} + +func (f *fakeRepository) SetJoinRequestApprovalAuthority(_ context.Context, clusterID, joinRequestID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinRequest, error) { + f.lastApprovalAuthority = payload + signatureRaw, _ := json.Marshal(signature) + nodeID := "node-1" + return NodeJoinRequest{ + ID: joinRequestID, + ClusterID: clusterID, + Status: JoinRequestStatusApproved, + ApprovedNodeID: &nodeID, + ApprovalPayload: payload, + ApprovalSignature: signatureRaw, + }, nil +} + +func (f *fakeRepository) RejectJoinRequest(context.Context, RejectJoinRequestInput) (NodeJoinRequest, error) { + return NodeJoinRequest{}, nil +} + +func (f *fakeRepository) AssignNodeRole(_ context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error) { + return NodeRoleAssignment{ClusterID: input.ClusterID, NodeID: input.NodeID, Role: input.Role}, nil +} + +func (f *fakeRepository) ListNodeRoleAssignments(context.Context, string, string) ([]NodeRoleAssignment, error) { + return nil, nil +} + +func (f *fakeRepository) AttachExistingNodeToCluster(_ context.Context, input AttachExistingNodeInput) (ClusterNode, error) { + f.lastAttachInput = input + return ClusterNode{ + ID: input.NodeID, + NodeKey: "node-key-1", + Name: "Node One", + RegistrationStatus: NodeRegistrationActive, + MembershipStatus: "active", + }, nil +} + +func (f *fakeRepository) RecordHeartbeat(context.Context, RecordHeartbeatInput) (NodeHeartbeat, error) { + return NodeHeartbeat{}, nil +} + +func (f *fakeRepository) ListNodeHeartbeats(_ context.Context, _ string, nodeID string, _ int) ([]NodeHeartbeat, error) { + return f.heartbeats[nodeID], nil +} + +func (f *fakeRepository) RevokeNodeIdentity(context.Context, RevokeNodeIdentityInput) error { + return nil +} + +func (f *fakeRepository) DisableClusterMembership(context.Context, DisableMembershipInput) error { + return nil +} + +func (f *fakeRepository) 
UpsertFabricTestingFlag(_ context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error) { + return FabricTestingFlag{ + ScopeType: input.ScopeType, + ScopeID: input.ScopeID, + ClusterID: input.ClusterID, + Enabled: input.Enabled, + TelemetryEnabled: input.TelemetryEnabled, + SyntheticLinksEnabled: input.SyntheticLinksEnabled, + HistoryRetentionHours: input.HistoryRetentionHours, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) ListFabricTestingFlags(context.Context) ([]FabricTestingFlag, error) { + return nil, nil +} + +func (f *fakeRepository) GetEffectiveNodeTestingFlags(context.Context, string, string) (EffectiveNodeTestingFlags, error) { + return f.testingFlags, nil +} + +func (f *fakeRepository) RecordNodeTelemetry(_ context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error) { + return NodeTelemetryObservation{ + ClusterID: input.ClusterID, + NodeID: input.NodeID, + Payload: input.Payload, + }, nil +} + +func (f *fakeRepository) ListNodeTelemetry(context.Context, string, string, int) ([]NodeTelemetryObservation, error) { + return nil, nil +} + +func (f *fakeRepository) SetDesiredWorkload(_ context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error) { + return NodeWorkloadDesiredState{ + ClusterID: input.ClusterID, + NodeID: input.NodeID, + ServiceType: input.ServiceType, + DesiredState: input.DesiredState, + RuntimeMode: input.RuntimeMode, + Config: input.Config, + Environment: input.Environment, + }, nil +} + +func (f *fakeRepository) ListDesiredWorkloads(context.Context, string, string) ([]NodeWorkloadDesiredState, error) { + return nil, nil +} + +func (f *fakeRepository) ReportWorkloadStatus(_ context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error) { + return NodeWorkloadStatus{ + ClusterID: input.ClusterID, + NodeID: input.NodeID, + ServiceType: input.ServiceType, + ReportedState: input.ReportedState, + RuntimeMode: input.RuntimeMode, + StatusPayload: 
input.StatusPayload, + }, nil +} + +func (f *fakeRepository) ListLatestWorkloadStatuses(context.Context, string, string) ([]NodeWorkloadStatus, error) { + return nil, nil +} + +func (f *fakeRepository) ReportMeshLink(_ context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error) { + return MeshLinkObservation{ + ClusterID: input.ClusterID, + SourceNodeID: input.SourceNodeID, + TargetNodeID: input.TargetNodeID, + LinkStatus: input.LinkStatus, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) ListMeshLinks(context.Context, string) ([]MeshLinkObservation, error) { + return f.meshLinks, nil +} + +func (f *fakeRepository) CreateRouteIntent(_ context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error) { + return MeshRouteIntent{ + ClusterID: input.ClusterID, + SourceSelector: input.SourceSelector, + DestinationSelector: input.DestinationSelector, + ServiceClass: input.ServiceClass, + Priority: input.Priority, + Policy: input.Policy, + }, nil +} + +func (f *fakeRepository) ListRouteIntents(context.Context, string) ([]MeshRouteIntent, error) { + return f.routeIntents, nil +} + +func (f *fakeRepository) ListQoSPolicies(context.Context, string) ([]MeshQoSPolicy, error) { + return nil, nil +} + +func (f *fakeRepository) ListFabricEntryPoints(context.Context, string) ([]FabricEntryPoint, error) { + return nil, nil +} + +func (f *fakeRepository) CreateFabricEntryPoint(_ context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error) { + f.lastEntryPointInput = input + return FabricEntryPoint{ + ID: "entry-1", + ClusterID: input.ClusterID, + Name: input.Name, + Status: input.Status, + EndpointType: input.EndpointType, + PublicEndpoint: input.PublicEndpoint, + Policy: input.Policy, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) SetFabricEntryPointNode(_ context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error) { + return FabricEntryPointNode{ + EntryPointID: input.EntryPointID, + 
ClusterID: input.ClusterID, + NodeID: input.NodeID, + Status: input.Status, + Priority: input.Priority, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) ListFabricEntryPointNodes(context.Context, string, string) ([]FabricEntryPointNode, error) { + return []FabricEntryPointNode{}, nil +} + +func (f *fakeRepository) ListFabricEgressPools(context.Context, string) ([]FabricEgressPool, error) { + return nil, nil +} + +func (f *fakeRepository) CreateFabricEgressPool(_ context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error) { + f.lastEgressPoolInput = input + return FabricEgressPool{ + ID: "egress-1", + ClusterID: input.ClusterID, + Name: input.Name, + Status: input.Status, + Description: input.Description, + RouteScope: input.RouteScope, + Policy: input.Policy, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) SetFabricEgressPoolNode(_ context.Context, input SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error) { + return FabricEgressPoolNode{ + EgressPoolID: input.EgressPoolID, + ClusterID: input.ClusterID, + NodeID: input.NodeID, + Status: input.Status, + Priority: input.Priority, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) ListFabricEgressPoolNodes(context.Context, string, string) ([]FabricEgressPoolNode, error) { + return []FabricEgressPoolNode{}, nil +} + +func (f *fakeRepository) GetClusterAuthorityState(context.Context, string) (ClusterAuthorityState, error) { + if f.authorityState.ClusterID == "" { + return ClusterAuthorityState{ClusterID: "cluster-1", AuthorityState: "authoritative", MutationMode: "normal"}, nil + } + return f.authorityState, nil +} + +func (f *fakeRepository) UpdateClusterAuthorityState(_ context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error) { + return ClusterAuthorityState{ + ClusterID: input.ClusterID, + AuthorityState: input.AuthorityState, + MutationMode: input.MutationMode, + Notes: input.Notes, + }, nil +} + +func (f 
*fakeRepository) ListClusterAdminSummaries(context.Context) ([]ClusterAdminSummary, error) { + return nil, nil +} + +func (f *fakeRepository) CreateVPNConnection(_ context.Context, input CreateVPNConnectionInput) (VPNConnection, error) { + f.lastVPNConnectionInput = input + return VPNConnection{ + ID: "vpn-1", + ClusterID: input.ClusterID, + OrganizationID: input.OrganizationID, + Name: input.Name, + TargetEndpoint: input.TargetEndpoint, + ProtocolFamily: input.ProtocolFamily, + CredentialRef: input.CredentialRef, + Mode: input.Mode, + DesiredState: input.DesiredState, + AllowedNodePolicy: input.AllowedNodePolicy, + RoutingUsage: input.RoutingUsage, + RoutePolicy: input.RoutePolicy, + QoSPolicy: input.QoSPolicy, + PlacementPolicy: input.PlacementPolicy, + Status: VPNConnectionStatusDisabled, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) ListVPNConnections(context.Context, string) ([]VPNConnection, error) { + return nil, nil +} + +func (f *fakeRepository) GetVPNConnection(context.Context, string, string) (VPNConnection, error) { + if f.vpnConnection.ID != "" { + return f.vpnConnection, nil + } + return VPNConnection{ + ID: "vpn-1", + ClusterID: "cluster-1", + Mode: VPNConnectionModeSingleActive, + DesiredState: VPNConnectionDesiredEnabled, + }, nil +} + +func (f *fakeRepository) UpdateVPNConnectionDesiredState(_ context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error) { + return VPNConnection{ID: input.VPNConnectionID, ClusterID: input.ClusterID, DesiredState: input.DesiredState}, nil +} + +func (f *fakeRepository) UpsertVPNConnectionRoutePolicy(_ context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error) { + return VPNConnectionRoutePolicy{ + ID: "route-policy-1", + VPNConnectionID: input.VPNConnectionID, + ClusterID: input.ClusterID, + RouteType: input.RouteType, + Destination: input.Destination, + Action: input.Action, + ServiceType: input.ServiceType, + Priority: input.Priority, 
+ Policy: input.Policy, + Status: input.Status, + }, nil +} + +func (f *fakeRepository) ListVPNConnectionRoutePolicies(context.Context, string, string) ([]VPNConnectionRoutePolicy, error) { + return nil, nil +} + +func (f *fakeRepository) SetVPNConnectionAllowedNodes(_ context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error) { + f.lastAllowedNodesInput = input + items := make([]VPNConnectionAllowedNode, 0, len(input.NodeIDs)) + for _, nodeID := range input.NodeIDs { + items = append(items, VPNConnectionAllowedNode{ + VPNConnectionID: input.VPNConnectionID, + ClusterID: input.ClusterID, + NodeID: nodeID, + RolePreference: input.RolePreference, + Status: "active", + Metadata: input.Metadata, + }) + } + return items, nil +} + +func (f *fakeRepository) ListVPNConnectionAllowedNodes(context.Context, string, string) ([]VPNConnectionAllowedNode, error) { + return nil, nil +} + +func (f *fakeRepository) AcquireVPNConnectionLease(_ context.Context, input AcquireVPNConnectionLeaseInput, expiresAt time.Time, fencingToken string) (VPNConnectionLease, error) { + if f.acquireVPNLeaseErr != nil { + return VPNConnectionLease{}, f.acquireVPNLeaseErr + } + return VPNConnectionLease{ + ID: "lease-1", + VPNConnectionID: input.VPNConnectionID, + ClusterID: input.ClusterID, + OwnerNodeID: input.OwnerNodeID, + LeaseGeneration: 1, + FencingToken: fencingToken, + Status: VPNLeaseStatusActive, + ExpiresAt: expiresAt, + Metadata: input.Metadata, + }, nil +} + +func (f *fakeRepository) RenewVPNConnectionLease(_ context.Context, input RenewVPNConnectionLeaseInput, expiresAt time.Time) (VPNConnectionLease, error) { + if f.renewVPNLeaseErr != nil { + return VPNConnectionLease{}, f.renewVPNLeaseErr + } + return VPNConnectionLease{ID: input.LeaseID, VPNConnectionID: input.VPNConnectionID, ClusterID: input.ClusterID, OwnerNodeID: input.OwnerNodeID, FencingToken: input.FencingToken, Status: VPNLeaseStatusActive, ExpiresAt: expiresAt}, nil +} + +func (f 
*fakeRepository) ReleaseVPNConnectionLease(_ context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error) { + return VPNConnectionLease{ID: input.LeaseID, VPNConnectionID: input.VPNConnectionID, ClusterID: input.ClusterID, OwnerNodeID: input.OwnerNodeID, FencingToken: input.FencingToken, Status: VPNLeaseStatusReleased}, nil +} + +func (f *fakeRepository) FenceVPNConnectionLease(_ context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error) { + return VPNConnectionLease{ID: input.LeaseID, VPNConnectionID: input.VPNConnectionID, ClusterID: input.ClusterID, Status: VPNLeaseStatusFenced}, nil +} + +func (f *fakeRepository) GetActiveVPNConnectionLease(context.Context, string, string) (VPNConnectionLease, error) { + return VPNConnectionLease{ID: "lease-1", Status: VPNLeaseStatusActive}, nil +} + +func (f *fakeRepository) CheckVPNLeaseOwnerEligibility(context.Context, string, string, string) (VPNLeaseOwnerEligibility, error) { + if f.ownerEligibilityErr != nil { + return VPNLeaseOwnerEligibility{}, f.ownerEligibilityErr + } + if f.ownerEligibility.VPNConnectionID != "" { + return f.ownerEligibility, nil + } + return VPNLeaseOwnerEligibility{ + VPNConnectionID: "vpn-1", + ClusterID: "cluster-1", + OrganizationID: "org-1", + OwnerNodeID: "node-1", + MembershipStatus: "active", + NodeRegistrationStatus: NodeRegistrationActive, + AllowedByPolicy: true, + HasAuthorizedRole: true, + }, nil +} + +func (f *fakeRepository) ExpireStaleVPNConnectionLeases(context.Context, string, time.Time) ([]VPNConnectionLease, error) { + return f.expiredVPNLeases, nil +} + +func (f *fakeRepository) ListNodeVPNAssignments(context.Context, string, string) ([]NodeVPNAssignment, error) { + return f.nodeVPNAssignments, nil +} + +func (f *fakeRepository) ReportNodeVPNAssignmentStatus(_ context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error) { + return NodeVPNAssignmentStatus{ + ID: "status-1", + VPNConnectionID: 
input.VPNConnectionID, + ClusterID: input.ClusterID, + NodeID: input.NodeID, + ObservedStatus: input.ObservedStatus, + StatusPayload: input.StatusPayload, + ObservedAt: input.ObservedAt, + }, nil +} + +func (f *fakeRepository) RecordAudit(_ context.Context, event ClusterAuditEvent) error { + f.auditEvents = append(f.auditEvents, event) + return nil +} + +func (f *fakeRepository) ListAuditEvents(context.Context, string, int) ([]ClusterAuditEvent, error) { + return nil, nil +} diff --git a/backend/internal/modules/cluster/token.go b/backend/internal/modules/cluster/token.go new file mode 100644 index 0000000..9ce2e07 --- /dev/null +++ b/backend/internal/modules/cluster/token.go @@ -0,0 +1,43 @@ +package cluster + +import ( + "crypto/rand" + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "errors" + "strings" + "time" +) + +const joinTokenHashPrefix = "sha256:" + +func generateJoinToken() (string, error) { + var random [32]byte + if _, err := rand.Read(random[:]); err != nil { + return "", err + } + return "rap_join_" + base64.RawURLEncoding.EncodeToString(random[:]), nil +} + +func hashJoinToken(token string) (string, error) { + trimmed := strings.TrimSpace(token) + if trimmed == "" { + return "", errors.New("join token is required") + } + sum := sha256.Sum256([]byte(trimmed)) + return joinTokenHashPrefix + hex.EncodeToString(sum[:]), nil +} + +func isPlatformAdminRole(role string) bool { + return role == PlatformRoleAdmin || role == PlatformRoleRecoveryAdmin +} + +func isAllowedNodeRole(role string) bool { + _, ok := allowedNodeRoles[role] + return ok +} + +func defaultJoinTokenExpiry(now time.Time) time.Time { + return now.Add(30 * time.Minute) +} diff --git a/backend/internal/modules/identitysource/module.go b/backend/internal/modules/identitysource/module.go new file mode 100644 index 0000000..bf4e091 --- /dev/null +++ b/backend/internal/modules/identitysource/module.go @@ -0,0 +1,344 @@ +package identitysource + +import ( + "context" + "encoding/json" + 
"errors" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/example/remote-access-platform/backend/internal/platform/httpx" + "github.com/example/remote-access-platform/backend/internal/platform/module" +) + +type Module struct { + db *pgxpool.Pool +} + +type IdentitySource struct { + ID string `json:"id"` + OrganizationID string `json:"organization_id"` + Kind string `json:"kind"` + Name string `json:"name"` + Status string `json:"status"` + Config json.RawMessage `json:"config"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type IdentityMapping struct { + ID string `json:"id"` + IdentitySourceID string `json:"identity_source_id"` + MappingType string `json:"mapping_type"` + ExternalSelector json.RawMessage `json:"external_selector"` + InternalTarget json.RawMessage `json:"internal_target"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type upsertIdentitySourceRequest struct { + ActorUserID string `json:"actor_user_id"` + OrganizationID string `json:"organization_id"` + Kind string `json:"kind"` + Name string `json:"name"` + Status string `json:"status"` + Config json.RawMessage `json:"config"` + IdentityMappings []struct { + MappingType string `json:"mapping_type"` + ExternalSelector json.RawMessage `json:"external_selector"` + InternalTarget json.RawMessage `json:"internal_target"` + } `json:"identity_mappings"` +} + +func NewModule(deps module.Dependencies) *Module { + return &Module{db: deps.Infra.DB} +} + +func (m *Module) Name() string { + return "identitysource" +} + +func (m *Module) RegisterRoutes(router chi.Router) { + router.Route("/identity-sources", func(r chi.Router) { + r.Get("/", m.listIdentitySources) + r.Post("/", m.createIdentitySource) + r.Get("/{identitySourceID}", m.getIdentitySource) + r.Put("/{identitySourceID}", m.updateIdentitySource) + }) +} + 
+// listIdentitySources returns the identity sources of the organization named by the organization_id query parameter, newest first. +// It responds 400 when the parameter is missing and 500 when the query, a row scan, or the row iteration fails. +func (m *Module) listIdentitySources(w http.ResponseWriter, r *http.Request) { + orgID := r.URL.Query().Get("organization_id") + if orgID == "" { + httpx.WriteError(w, http.StatusBadRequest, "organization_id is required") + return + } + rows, err := m.db.Query(r.Context(), ` + SELECT id, organization_id, kind, name, status, config, created_at, updated_at + FROM identity_sources + WHERE organization_id = $1 + ORDER BY created_at DESC + `, orgID) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + defer rows.Close() + var items []IdentitySource + for rows.Next() { + item, err := scanIdentitySource(rows) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + items = append(items, item) + } + // rows.Next also returns false when iteration aborts on an error; check it so a failed read is not sent as a truncated 200 response. + if err := rows.Err(); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"identity_sources": items}) +} + +func (m *Module) getIdentitySource(w http.ResponseWriter, r *http.Request) { + id := chi.URLParam(r, "identitySourceID") + item, err := m.getByID(r.Context(), id) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + httpx.WriteError(w, http.StatusNotFound, "identity source not found") + return + } + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + mappings, err := m.listMappings(r.Context(), id) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{ + "identity_source": item, + "identity_mappings": mappings, + }) +} + +func (m *Module) createIdentitySource(w http.ResponseWriter, r *http.Request) { + req, err := decodeRequest(r) + if err != nil { + httpx.WriteError(w, http.StatusBadRequest, err.Error()) + return + } + now := time.Now().UTC() + item := IdentitySource{ + ID: uuid.NewString(), + OrganizationID: req.OrganizationID, + Kind: req.Kind, + Name: req.Name, + Status: req.Status, + Config: req.Config, + CreatedAt: now, + UpdatedAt: now, + } + tx, err := 
m.db.Begin(r.Context()) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + defer tx.Rollback(r.Context()) + if _, err := tx.Exec(r.Context(), ` + INSERT INTO identity_sources (id, organization_id, kind, name, status, config, created_at, updated_at) + VALUES ($1, $2, $3, $4, $5, $6::jsonb, $7, $8) + `, item.ID, item.OrganizationID, item.Kind, item.Name, item.Status, []byte(item.Config), item.CreatedAt, item.UpdatedAt); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + mappings, err := upsertMappings(r.Context(), tx, item.ID, req.IdentityMappings) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + if err := tx.Commit(r.Context()); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{ + "identity_source": item, + "identity_mappings": mappings, + }) +} + +func (m *Module) updateIdentitySource(w http.ResponseWriter, r *http.Request) { + req, err := decodeRequest(r) + if err != nil { + httpx.WriteError(w, http.StatusBadRequest, err.Error()) + return + } + id := chi.URLParam(r, "identitySourceID") + tx, err := m.db.Begin(r.Context()) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + defer tx.Rollback(r.Context()) + tag, err := tx.Exec(r.Context(), ` + UPDATE identity_sources + SET organization_id = $2, kind = $3, name = $4, status = $5, config = $6::jsonb, updated_at = $7 + WHERE id = $1 + `, id, req.OrganizationID, req.Kind, req.Name, req.Status, []byte(req.Config), time.Now().UTC()) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + if tag.RowsAffected() == 0 { + httpx.WriteError(w, http.StatusNotFound, "identity source not found") + return + } + if _, err := tx.Exec(r.Context(), `DELETE FROM identity_mappings WHERE identity_source_id = 
$1`, id); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + mappings, err := upsertMappings(r.Context(), tx, id, req.IdentityMappings) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + if err := tx.Commit(r.Context()); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + item, err := m.getByID(r.Context(), id) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{ + "identity_source": item, + "identity_mappings": mappings, + }) +} + +func decodeRequest(r *http.Request) (*upsertIdentitySourceRequest, error) { + var req upsertIdentitySourceRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + return nil, errors.New("invalid identity source payload") + } + if req.ActorUserID == "" || req.OrganizationID == "" || req.Kind == "" || req.Name == "" { + return nil, errors.New("actor_user_id, organization_id, kind, and name are required") + } + if req.Status == "" { + req.Status = "active" + } + if len(req.Config) == 0 { + req.Config = json.RawMessage(`{}`) + } + if !json.Valid(req.Config) { + return nil, errors.New("config must be valid json") + } + for _, mapping := range req.IdentityMappings { + if len(mapping.ExternalSelector) == 0 { + mapping.ExternalSelector = json.RawMessage(`{}`) + } + if len(mapping.InternalTarget) == 0 { + mapping.InternalTarget = json.RawMessage(`{}`) + } + } + return &req, nil +} + +func (m *Module) getByID(ctx context.Context, id string) (IdentitySource, error) { + row := m.db.QueryRow(ctx, ` + SELECT id, organization_id, kind, name, status, config, created_at, updated_at + FROM identity_sources + WHERE id = $1 + `, id) + return scanIdentitySource(row) +} + +func (m *Module) listMappings(ctx context.Context, sourceID string) ([]IdentityMapping, error) { + rows, err := m.db.Query(ctx, ` + SELECT id, 
identity_source_id, mapping_type, external_selector, internal_target, created_at, updated_at + FROM identity_mappings + WHERE identity_source_id = $1 + ORDER BY created_at ASC + `, sourceID) + if err != nil { + return nil, err + } + defer rows.Close() + var mappings []IdentityMapping + for rows.Next() { + item, err := scanIdentityMapping(rows) + if err != nil { + return nil, err + } + mappings = append(mappings, item) + } + return mappings, rows.Err() +} + +func upsertMappings(ctx context.Context, tx pgx.Tx, sourceID string, requested []struct { + MappingType string `json:"mapping_type"` + ExternalSelector json.RawMessage `json:"external_selector"` + InternalTarget json.RawMessage `json:"internal_target"` +}) ([]IdentityMapping, error) { + now := time.Now().UTC() + items := make([]IdentityMapping, 0, len(requested)) + for _, mapping := range requested { + external := mapping.ExternalSelector + if len(external) == 0 { + external = json.RawMessage(`{}`) + } + internal := mapping.InternalTarget + if len(internal) == 0 { + internal = json.RawMessage(`{}`) + } + item := IdentityMapping{ + ID: uuid.NewString(), + IdentitySourceID: sourceID, + MappingType: mapping.MappingType, + ExternalSelector: external, + InternalTarget: internal, + CreatedAt: now, + UpdatedAt: now, + } + if _, err := tx.Exec(ctx, ` + INSERT INTO identity_mappings ( + id, identity_source_id, mapping_type, external_selector, internal_target, created_at, updated_at + ) VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6, $7) + `, item.ID, item.IdentitySourceID, item.MappingType, []byte(item.ExternalSelector), []byte(item.InternalTarget), item.CreatedAt, item.UpdatedAt); err != nil { + return nil, err + } + items = append(items, item) + } + return items, nil +} + +type rowScanner interface { + Scan(dest ...any) error +} + +func scanIdentitySource(row rowScanner) (IdentitySource, error) { + var item IdentitySource + if err := row.Scan(&item.ID, &item.OrganizationID, &item.Kind, &item.Name, &item.Status, 
&item.Config, &item.CreatedAt, &item.UpdatedAt); err != nil { + return IdentitySource{}, err + } + if len(item.Config) == 0 { + item.Config = json.RawMessage(`{}`) + } + return item, nil +} + +func scanIdentityMapping(row rowScanner) (IdentityMapping, error) { + var item IdentityMapping + if err := row.Scan(&item.ID, &item.IdentitySourceID, &item.MappingType, &item.ExternalSelector, &item.InternalTarget, &item.CreatedAt, &item.UpdatedAt); err != nil { + return IdentityMapping{}, err + } + if len(item.ExternalSelector) == 0 { + item.ExternalSelector = json.RawMessage(`{}`) + } + if len(item.InternalTarget) == 0 { + item.InternalTarget = json.RawMessage(`{}`) + } + return item, nil +} diff --git a/backend/internal/modules/node/module.go b/backend/internal/modules/node/module.go new file mode 100644 index 0000000..4032516 --- /dev/null +++ b/backend/internal/modules/node/module.go @@ -0,0 +1,458 @@ +package node + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/example/remote-access-platform/backend/internal/platform/httpx" + "github.com/example/remote-access-platform/backend/internal/platform/module" +) + +type Module struct { + db *pgxpool.Pool +} + +type Node struct { + ID string `json:"id"` + OwnerOrganizationID *string `json:"owner_organization_id,omitempty"` + NodeKey string `json:"node_key"` + Name string `json:"name"` + OwnershipType string `json:"ownership_type"` + RegistrationStatus string `json:"registration_status"` + HealthStatus string `json:"health_status"` + VersionState string `json:"version_state"` + PartitionState string `json:"partition_state"` + DesiredVersion *string `json:"desired_version,omitempty"` + ReportedVersion *string `json:"reported_version,omitempty"` + LastSeenAt *time.Time `json:"last_seen_at,omitempty"` + Metadata json.RawMessage `json:"metadata"` + CreatedAt time.Time 
`json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type NodeCapability struct { + NodeID string `json:"node_id"` + Capability string `json:"capability"` + Value json.RawMessage `json:"value"` + UpdatedAt time.Time `json:"updated_at"` +} + +type NodeService struct { + NodeID string `json:"node_id"` + ServiceType string `json:"service_type"` + Enabled bool `json:"enabled"` + DesiredState string `json:"desired_state"` + ReportedState string `json:"reported_state"` + LastReportedAt *time.Time `json:"last_reported_at,omitempty"` + Metadata json.RawMessage `json:"metadata"` + UpdatedAt time.Time `json:"updated_at"` +} + +type NodeUpdatePolicy struct { + NodeID string `json:"node_id"` + Mode string `json:"mode"` + Channel string `json:"channel"` + MaintenanceWindow json.RawMessage `json:"maintenance_window"` + Canary bool `json:"canary"` + AutomaticRollout bool `json:"automatic_rollout"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type NodePartitionState struct { + NodeID string `json:"node_id"` + ClusterState string `json:"cluster_state"` + RecoveryMode string `json:"recovery_mode"` + Notes *string `json:"notes,omitempty"` + UpdatedAt time.Time `json:"updated_at"` +} + +type upsertNodeRequest struct { + ActorUserID string `json:"actor_user_id"` + OwnerOrganizationID *string `json:"owner_organization_id"` + NodeKey string `json:"node_key"` + Name string `json:"name"` + OwnershipType string `json:"ownership_type"` + DesiredVersion *string `json:"desired_version"` + Metadata json.RawMessage `json:"metadata"` + Capabilities []struct { + Capability string `json:"capability"` + Value json.RawMessage `json:"value"` + } `json:"capabilities"` + Services []struct { + ServiceType string `json:"service_type"` + Enabled bool `json:"enabled"` + DesiredState string `json:"desired_state"` + Metadata json.RawMessage `json:"metadata"` + } `json:"services"` + UpdatePolicy struct { + Mode string `json:"mode"` + Channel string 
`json:"channel"` + MaintenanceWindow json.RawMessage `json:"maintenance_window"` + Canary bool `json:"canary"` + AutomaticRollout bool `json:"automatic_rollout"` + } `json:"update_policy"` + PartitionState struct { + ClusterState string `json:"cluster_state"` + RecoveryMode string `json:"recovery_mode"` + Notes *string `json:"notes"` + } `json:"partition_state"` +} + +func NewModule(deps module.Dependencies) *Module { + return &Module{db: deps.Infra.DB} +} + +func (m *Module) Name() string { + return "node" +} + +func (m *Module) RegisterRoutes(router chi.Router) { + router.Route("/nodes", func(r chi.Router) { + r.Get("/", m.listNodes) + r.Post("/", m.createNode) + r.Get("/{nodeID}", m.getNode) + r.Put("/{nodeID}", m.updateNode) + }) +} + +// listNodes returns every row of the nodes table, newest first, under the "nodes" key. +// It responds 500 when the query, a row scan, or the row iteration fails. +func (m *Module) listNodes(w http.ResponseWriter, r *http.Request) { + rows, err := m.db.Query(r.Context(), ` + SELECT id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status, + version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at + FROM nodes + ORDER BY created_at DESC + `) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + defer rows.Close() + var items []Node + for rows.Next() { + item, err := scanNode(rows) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + items = append(items, item) + } + // rows.Next also returns false when iteration aborts on an error; check it so a failed read is not sent as a truncated 200 response. + if err := rows.Err(); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"nodes": items}) +} + +func (m *Module) getNode(w http.ResponseWriter, r *http.Request) { + nodeID := chi.URLParam(r, "nodeID") + item, err := m.getNodeByID(r.Context(), nodeID) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + httpx.WriteError(w, http.StatusNotFound, "node not found") + return + } + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + caps, _ := m.listCapabilities(r.Context(), nodeID) + services, _ := m.listServices(r.Context(), nodeID) + 
updatePolicy, _ := m.getUpdatePolicy(r.Context(), nodeID) + partitionState, _ := m.getPartitionState(r.Context(), nodeID) + httpx.WriteJSON(w, http.StatusOK, map[string]any{ + "node": item, + "capabilities": caps, + "services": services, + "update_policy": updatePolicy, + "partition_state": partitionState, + }) +} + +func (m *Module) createNode(w http.ResponseWriter, r *http.Request) { + req, err := decodeNodeRequest(r) + if err != nil { + httpx.WriteError(w, http.StatusBadRequest, err.Error()) + return + } + item := Node{ + ID: uuid.NewString(), + OwnerOrganizationID: req.OwnerOrganizationID, + NodeKey: req.NodeKey, + Name: req.Name, + OwnershipType: req.OwnershipType, + RegistrationStatus: "pending", + HealthStatus: "unknown", + VersionState: "unknown", + PartitionState: "healthy", + DesiredVersion: req.DesiredVersion, + Metadata: req.Metadata, + CreatedAt: time.Now().UTC(), + UpdatedAt: time.Now().UTC(), + } + if err := m.persistNode(r.Context(), item, req, true); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"node": item}) +} + +func (m *Module) updateNode(w http.ResponseWriter, r *http.Request) { + req, err := decodeNodeRequest(r) + if err != nil { + httpx.WriteError(w, http.StatusBadRequest, err.Error()) + return + } + nodeID := chi.URLParam(r, "nodeID") + item, err := m.getNodeByID(r.Context(), nodeID) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + httpx.WriteError(w, http.StatusNotFound, "node not found") + return + } + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + item.OwnerOrganizationID = req.OwnerOrganizationID + item.NodeKey = req.NodeKey + item.Name = req.Name + item.OwnershipType = req.OwnershipType + item.DesiredVersion = req.DesiredVersion + item.Metadata = req.Metadata + item.UpdatedAt = time.Now().UTC() + if err := m.persistNode(r.Context(), item, req, false); err != nil { + httpx.WriteError(w, 
http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"node": item}) +} + +func decodeNodeRequest(r *http.Request) (*upsertNodeRequest, error) { + var req upsertNodeRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + return nil, errors.New("invalid node payload") + } + if req.ActorUserID == "" || req.NodeKey == "" || req.Name == "" || req.OwnershipType == "" { + return nil, errors.New("actor_user_id, node_key, name, and ownership_type are required") + } + if len(req.Metadata) == 0 { + req.Metadata = json.RawMessage(`{}`) + } + if !json.Valid(req.Metadata) { + return nil, errors.New("metadata must be valid json") + } + if req.UpdatePolicy.Mode == "" { + req.UpdatePolicy.Mode = "manual" + } + if req.UpdatePolicy.Channel == "" { + req.UpdatePolicy.Channel = "stable" + } + if len(req.UpdatePolicy.MaintenanceWindow) == 0 { + req.UpdatePolicy.MaintenanceWindow = json.RawMessage(`{}`) + } + if req.PartitionState.ClusterState == "" { + req.PartitionState.ClusterState = "healthy" + } + if req.PartitionState.RecoveryMode == "" { + req.PartitionState.RecoveryMode = "normal" + } + for i := range req.Capabilities { + if len(req.Capabilities[i].Value) == 0 { + req.Capabilities[i].Value = json.RawMessage(`{}`) + } + } + for i := range req.Services { + if req.Services[i].DesiredState == "" { + req.Services[i].DesiredState = "disabled" + } + if len(req.Services[i].Metadata) == 0 { + req.Services[i].Metadata = json.RawMessage(`{}`) + } + } + return &req, nil +} + +func (m *Module) persistNode(ctx context.Context, item Node, req *upsertNodeRequest, create bool) error { + tx, err := m.db.Begin(ctx) + if err != nil { + return err + } + defer tx.Rollback(ctx) + if create { + _, err = tx.Exec(ctx, ` + INSERT INTO nodes ( + id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status, + version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, 
created_at, updated_at + ) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13::jsonb,$14,$15) + `, item.ID, item.OwnerOrganizationID, item.NodeKey, item.Name, item.OwnershipType, item.RegistrationStatus, item.HealthStatus, item.VersionState, item.PartitionState, item.DesiredVersion, item.ReportedVersion, item.LastSeenAt, []byte(item.Metadata), item.CreatedAt, item.UpdatedAt) + } else { + _, err = tx.Exec(ctx, ` + UPDATE nodes + SET owner_organization_id=$2, node_key=$3, name=$4, ownership_type=$5, desired_version=$6, metadata=$7::jsonb, updated_at=$8 + WHERE id=$1 + `, item.ID, item.OwnerOrganizationID, item.NodeKey, item.Name, item.OwnershipType, item.DesiredVersion, []byte(item.Metadata), item.UpdatedAt) + } + if err != nil { + return err + } + if _, err := tx.Exec(ctx, `DELETE FROM node_capabilities WHERE node_id = $1`, item.ID); err != nil { + return err + } + for _, capability := range req.Capabilities { + if _, err := tx.Exec(ctx, ` + INSERT INTO node_capabilities (node_id, capability, value, updated_at) + VALUES ($1, $2, $3::jsonb, $4) + `, item.ID, capability.Capability, []byte(capability.Value), time.Now().UTC()); err != nil { + return err + } + } + if _, err := tx.Exec(ctx, `DELETE FROM node_services WHERE node_id = $1`, item.ID); err != nil { + return err + } + for _, service := range req.Services { + if _, err := tx.Exec(ctx, ` + INSERT INTO node_services ( + node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at + ) VALUES ($1, $2, $3, $4, 'unknown', NULL, $5::jsonb, $6) + `, item.ID, service.ServiceType, service.Enabled, service.DesiredState, []byte(service.Metadata), time.Now().UTC()); err != nil { + return err + } + } + if _, err := tx.Exec(ctx, ` + INSERT INTO node_update_policies ( + node_id, mode, channel, maintenance_window, canary, automatic_rollout, created_at, updated_at + ) VALUES ($1,$2,$3,$4::jsonb,$5,$6,$7,$8) + ON CONFLICT (node_id) DO UPDATE SET + mode = EXCLUDED.mode, + channel = 
EXCLUDED.channel, + maintenance_window = EXCLUDED.maintenance_window, + canary = EXCLUDED.canary, + automatic_rollout = EXCLUDED.automatic_rollout, + updated_at = EXCLUDED.updated_at + `, item.ID, req.UpdatePolicy.Mode, req.UpdatePolicy.Channel, []byte(req.UpdatePolicy.MaintenanceWindow), req.UpdatePolicy.Canary, req.UpdatePolicy.AutomaticRollout, time.Now().UTC(), time.Now().UTC()); err != nil { + return err + } + if _, err := tx.Exec(ctx, ` + INSERT INTO node_partition_states (node_id, cluster_state, recovery_mode, notes, updated_at) + VALUES ($1,$2,$3,$4,$5) + ON CONFLICT (node_id) DO UPDATE SET + cluster_state = EXCLUDED.cluster_state, + recovery_mode = EXCLUDED.recovery_mode, + notes = EXCLUDED.notes, + updated_at = EXCLUDED.updated_at + `, item.ID, req.PartitionState.ClusterState, req.PartitionState.RecoveryMode, req.PartitionState.Notes, time.Now().UTC()); err != nil { + return err + } + return tx.Commit(ctx) +} + +func (m *Module) getNodeByID(ctx context.Context, nodeID string) (Node, error) { + row := m.db.QueryRow(ctx, ` + SELECT id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status, + version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at + FROM nodes + WHERE id = $1 + `, nodeID) + return scanNode(row) +} + +func (m *Module) listCapabilities(ctx context.Context, nodeID string) ([]NodeCapability, error) { + rows, err := m.db.Query(ctx, `SELECT node_id, capability, value, updated_at FROM node_capabilities WHERE node_id = $1 ORDER BY capability`, nodeID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeCapability + for rows.Next() { + var item NodeCapability + if err := rows.Scan(&item.NodeID, &item.Capability, &item.Value, &item.UpdatedAt); err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (m *Module) listServices(ctx context.Context, nodeID string) ([]NodeService, error) { + rows, 
err := m.db.Query(ctx, ` + SELECT node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at + FROM node_services WHERE node_id = $1 ORDER BY service_type + `, nodeID) + if err != nil { + return nil, err + } + defer rows.Close() + var out []NodeService + for rows.Next() { + var item NodeService + if err := rows.Scan(&item.NodeID, &item.ServiceType, &item.Enabled, &item.DesiredState, &item.ReportedState, &item.LastReportedAt, &item.Metadata, &item.UpdatedAt); err != nil { + return nil, err + } + out = append(out, item) + } + return out, rows.Err() +} + +func (m *Module) getUpdatePolicy(ctx context.Context, nodeID string) (*NodeUpdatePolicy, error) { + row := m.db.QueryRow(ctx, ` + SELECT node_id, mode, channel, maintenance_window, canary, automatic_rollout, created_at, updated_at + FROM node_update_policies WHERE node_id = $1 + `, nodeID) + var item NodeUpdatePolicy + if err := row.Scan(&item.NodeID, &item.Mode, &item.Channel, &item.MaintenanceWindow, &item.Canary, &item.AutomaticRollout, &item.CreatedAt, &item.UpdatedAt); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return nil, err + } + return &item, nil +} + +func (m *Module) getPartitionState(ctx context.Context, nodeID string) (*NodePartitionState, error) { + row := m.db.QueryRow(ctx, ` + SELECT node_id, cluster_state, recovery_mode, notes, updated_at + FROM node_partition_states WHERE node_id = $1 + `, nodeID) + var item NodePartitionState + if err := row.Scan(&item.NodeID, &item.ClusterState, &item.RecoveryMode, &item.Notes, &item.UpdatedAt); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return nil, err + } + return &item, nil +} + +type rowScanner interface { + Scan(dest ...any) error +} + +func scanNode(row rowScanner) (Node, error) { + var item Node + if err := row.Scan( + &item.ID, + &item.OwnerOrganizationID, + &item.NodeKey, + &item.Name, + &item.OwnershipType, + &item.RegistrationStatus, + 
&item.HealthStatus, + &item.VersionState, + &item.PartitionState, + &item.DesiredVersion, + &item.ReportedVersion, + &item.LastSeenAt, + &item.Metadata, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return Node{}, err + } + if len(item.Metadata) == 0 { + item.Metadata = json.RawMessage(`{}`) + } + return item, nil +} diff --git a/backend/internal/modules/nodeagent/module.go b/backend/internal/modules/nodeagent/module.go new file mode 100644 index 0000000..b9ca479 --- /dev/null +++ b/backend/internal/modules/nodeagent/module.go @@ -0,0 +1,356 @@ +package nodeagent + +import ( + "encoding/json" + "errors" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/google/uuid" + "github.com/jackc/pgx/v5/pgxpool" + + clustermodule "github.com/example/remote-access-platform/backend/internal/modules/cluster" + "github.com/example/remote-access-platform/backend/internal/platform/httpx" + "github.com/example/remote-access-platform/backend/internal/platform/module" + "github.com/example/remote-access-platform/backend/internal/platform/secrets" +) + +type Module struct { + db *pgxpool.Pool + cluster *clustermodule.Service +} + +func NewModule(deps module.Dependencies) *Module { + clusterStore := clustermodule.NewPostgresStore(deps.Infra.DB) + if deps.Config.Secret.EncryptionKeyBase64 != "" { + if encryptor, err := secrets.NewEncryptor(deps.Config.Secret.EncryptionKeyBase64, deps.Config.Secret.EncryptionKeyID); err == nil { + clusterStore.WithClusterKeyEncryptor(encryptor) + } + } + return &Module{ + db: deps.Infra.DB, + cluster: clustermodule.NewService(clusterStore), + } +} + +func (m *Module) Name() string { + return "nodeagent" +} + +func (m *Module) RegisterRoutes(router chi.Router) { + router.Route("/node-agents", func(r chi.Router) { + r.Post("/enroll", m.enrollAgent) + r.Post("/enrollments/{requestID}/bootstrap", m.bootstrapEnrollment) + r.Post("/register", m.registerAgent) + r.Post("/{nodeID}/health", m.reportHealth) + 
r.Post("/{nodeID}/services/status", m.reportServiceStatus) + r.Post("/{nodeID}/update-manifest/request", m.requestUpdateManifest) + r.Post("/{nodeID}/update-result", m.acknowledgeUpdateResult) + r.Post("/{nodeID}/rollback-result", m.reportRollbackResult) + r.Get("/{nodeID}/clusters/{clusterID}/vpn-assignments/desired", m.listVPNAssignments) + r.Post("/{nodeID}/clusters/{clusterID}/vpn-assignments/{vpnConnectionID}/status", m.reportVPNAssignmentStatus) + }) +} + +func (m *Module) enrollAgent(w http.ResponseWriter, r *http.Request) { + var payload struct { + ClusterID string `json:"cluster_id"` + JoinToken string `json:"join_token"` + NodeName string `json:"node_name"` + NodeFingerprint string `json:"node_fingerprint"` + PublicKey string `json:"public_key"` + ReportedCapabilities json.RawMessage `json:"reported_capabilities"` + ReportedFacts json.RawMessage `json:"reported_facts"` + RequestedRoles json.RawMessage `json:"requested_roles"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid agent enrollment payload") + return + } + joinRequest, err := m.cluster.CreateJoinRequest(r.Context(), clustermodule.CreateJoinRequestInput{ + ClusterID: payload.ClusterID, + JoinToken: payload.JoinToken, + NodeName: payload.NodeName, + NodeFingerprint: payload.NodeFingerprint, + PublicKey: payload.PublicKey, + ReportedCapabilities: payload.ReportedCapabilities, + ReportedFacts: payload.ReportedFacts, + RequestedRoles: payload.RequestedRoles, + }) + if err != nil { + httpx.WriteError(w, http.StatusBadRequest, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{ + "status": "pending_approval", + "join_request": joinRequest, + }) +} + +func (m *Module) bootstrapEnrollment(w http.ResponseWriter, r *http.Request) { + var payload struct { + ClusterID string `json:"cluster_id"` + NodeFingerprint string `json:"node_fingerprint"` + PublicKey string `json:"public_key"` + } + if err := 
json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid enrollment bootstrap payload") + return + } + result, err := m.cluster.GetJoinRequestBootstrap(r.Context(), clustermodule.GetJoinRequestBootstrapInput{ + ClusterID: payload.ClusterID, + JoinRequestID: chi.URLParam(r, "requestID"), + NodeFingerprint: payload.NodeFingerprint, + PublicKey: payload.PublicKey, + }) + if err != nil { + httpx.WriteError(w, http.StatusBadRequest, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, result) +} + +func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) { + var payload struct { + NodeKey string `json:"node_key"` + Name string `json:"name"` + OwnershipType string `json:"ownership_type"` + OwnerOrganizationID *string `json:"owner_organization_id"` + ReportedVersion *string `json:"reported_version"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid agent registration payload") + return + } + if payload.NodeKey == "" || payload.Name == "" || payload.OwnershipType == "" { + httpx.WriteError(w, http.StatusBadRequest, "node_key, name, and ownership_type are required") + return + } + if len(payload.Metadata) == 0 { + payload.Metadata = json.RawMessage(`{}`) + } + now := time.Now().UTC() + nodeID := uuid.NewString() + if err := m.db.QueryRow(r.Context(), ` + INSERT INTO nodes ( + id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status, + version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at + ) VALUES ($1, $2, $3, $4, $5, 'active', 'unknown', 'unknown', 'healthy', NULL, $6, $7, $8::jsonb, $9, $10) + ON CONFLICT (node_key) DO UPDATE SET + name = EXCLUDED.name, + ownership_type = EXCLUDED.ownership_type, + owner_organization_id = EXCLUDED.owner_organization_id, + registration_status = 
'active', + reported_version = EXCLUDED.reported_version, + last_seen_at = EXCLUDED.last_seen_at, + metadata = EXCLUDED.metadata, + updated_at = EXCLUDED.updated_at + RETURNING id + `, nodeID, payload.OwnerOrganizationID, payload.NodeKey, payload.Name, payload.OwnershipType, payload.ReportedVersion, now, []byte(payload.Metadata), now, now).Scan(&nodeID); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{ + "node_id": nodeID, + "status": "registered", + "legacy": true, + "warning": "direct node-agent registration is retained for compatibility; production enrollment must use /node-agents/enroll", + }) +} + +func (m *Module) reportHealth(w http.ResponseWriter, r *http.Request) { + var payload struct { + HealthStatus string `json:"health_status"` + ReportedVersion *string `json:"reported_version"` + Metadata json.RawMessage `json:"metadata"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid node health payload") + return + } + if payload.HealthStatus == "" { + payload.HealthStatus = "unknown" + } + if len(payload.Metadata) == 0 { + payload.Metadata = json.RawMessage(`{}`) + } + if _, err := m.db.Exec(r.Context(), ` + UPDATE nodes + SET health_status = $2, reported_version = COALESCE($3, reported_version), last_seen_at = $4, metadata = $5::jsonb, updated_at = $4 + WHERE id = $1 + `, chi.URLParam(r, "nodeID"), payload.HealthStatus, payload.ReportedVersion, time.Now().UTC(), []byte(payload.Metadata)); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"}) +} + +func (m *Module) reportServiceStatus(w http.ResponseWriter, r *http.Request) { + var payload struct { + Services []struct { + ServiceType string `json:"service_type"` + ReportedState string `json:"reported_state"` + Metadata 
json.RawMessage `json:"metadata"` + } `json:"services"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid node service status payload") + return + } + now := time.Now().UTC() + for _, service := range payload.Services { + if len(service.Metadata) == 0 { + service.Metadata = json.RawMessage(`{}`) + } + if _, err := m.db.Exec(r.Context(), ` + INSERT INTO node_services ( + node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at + ) VALUES ($1, $2, FALSE, 'disabled', $3, $4, $5::jsonb, $4) + ON CONFLICT (node_id, service_type) DO UPDATE SET + reported_state = EXCLUDED.reported_state, + last_reported_at = EXCLUDED.last_reported_at, + metadata = EXCLUDED.metadata, + updated_at = EXCLUDED.updated_at + `, chi.URLParam(r, "nodeID"), service.ServiceType, service.ReportedState, now, []byte(service.Metadata)); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"}) +} + +func (m *Module) listVPNAssignments(w http.ResponseWriter, r *http.Request) { + items, err := m.cluster.ListNodeVPNAssignments(r.Context(), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID")) + if writeClusterServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{ + "vpn_assignments": items, + "runtime_execution_enabled": false, + }) +} + +func (m *Module) reportVPNAssignmentStatus(w http.ResponseWriter, r *http.Request) { + var payload struct { + ObservedStatus string `json:"observed_status"` + StatusPayload json.RawMessage `json:"status_payload"` + ObservedAt *time.Time `json:"observed_at"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid vpn assignment status payload") + return + } + observedAt := time.Time{} + if payload.ObservedAt != nil { + observedAt = 
*payload.ObservedAt + } + item, err := m.cluster.ReportNodeVPNAssignmentStatus(r.Context(), clustermodule.ReportNodeVPNAssignmentStatusInput{ + ClusterID: chi.URLParam(r, "clusterID"), + NodeID: chi.URLParam(r, "nodeID"), + VPNConnectionID: chi.URLParam(r, "vpnConnectionID"), + ObservedStatus: payload.ObservedStatus, + StatusPayload: payload.StatusPayload, + ObservedAt: observedAt, + }) + if writeClusterServiceError(w, err) { + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{ + "vpn_assignment_status": item, + "runtime_execution_enabled": false, + }) +} + +func (m *Module) requestUpdateManifest(w http.ResponseWriter, r *http.Request) { + nodeID := chi.URLParam(r, "nodeID") + var mode, channel string + var canary, automatic bool + var desiredVersion *string + if err := m.db.QueryRow(r.Context(), ` + SELECT n.desired_version, p.mode, p.channel, p.canary, p.automatic_rollout + FROM nodes n + LEFT JOIN node_update_policies p ON p.node_id = n.id + WHERE n.id = $1 + `, nodeID).Scan(&desiredVersion, &mode, &channel, &canary, &automatic); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{ + "manifest": map[string]any{ + "node_id": nodeID, + "desired_version": desiredVersion, + "mode": mode, + "channel": channel, + "canary": canary, + "automatic_rollout": automatic, + }, + }) +} + +func (m *Module) acknowledgeUpdateResult(w http.ResponseWriter, r *http.Request) { + m.recordUpdateRun(w, r, "update") +} + +func (m *Module) reportRollbackResult(w http.ResponseWriter, r *http.Request) { + m.recordUpdateRun(w, r, "rollback") +} + +func (m *Module) recordUpdateRun(w http.ResponseWriter, r *http.Request, action string) { + var payload struct { + TargetVersion string `json:"target_version"` + Status string `json:"status"` + Payload json.RawMessage `json:"payload"` + } + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + httpx.WriteError(w, 
http.StatusBadRequest, "invalid update result payload") + return + } + if payload.Status == "" { + payload.Status = "acknowledged" + } + if len(payload.Payload) == 0 { + payload.Payload = json.RawMessage(`{}`) + } + now := time.Now().UTC() + runID := uuid.NewString() + if _, err := m.db.Exec(r.Context(), ` + INSERT INTO node_agent_update_runs ( + id, node_id, action, target_version, status, requested_at, acknowledged_at, completed_at, payload + ) VALUES ($1, $2, $3, $4, $5, $6, $6, CASE WHEN $5 IN ('succeeded', 'failed') THEN $6 ELSE NULL END, $7::jsonb) + `, runID, chi.URLParam(r, "nodeID"), action, payload.TargetVersion, payload.Status, now, []byte(payload.Payload)); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + if action == "update" && payload.Status == "succeeded" { + _, _ = m.db.Exec(r.Context(), ` + UPDATE nodes + SET reported_version = $2, version_state = 'current', updated_at = $3 + WHERE id = $1 + `, chi.URLParam(r, "nodeID"), payload.TargetVersion, now) + } + if action == "rollback" && payload.Status == "succeeded" { + _, _ = m.db.Exec(r.Context(), ` + UPDATE nodes + SET reported_version = $2, version_state = 'rollback', updated_at = $3 + WHERE id = $1 + `, chi.URLParam(r, "nodeID"), payload.TargetVersion, now) + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted", "run_id": runID}) +} + +func writeClusterServiceError(w http.ResponseWriter, err error) bool { + if err == nil { + return false + } + switch { + case errors.Is(err, clustermodule.ErrVPNLeaseOwnerNotAllowed), errors.Is(err, clustermodule.ErrVPNLeaseOwnerRoleRequired): + httpx.WriteError(w, http.StatusForbidden, err.Error()) + case errors.Is(err, clustermodule.ErrInvalidPayload): + httpx.WriteError(w, http.StatusBadRequest, err.Error()) + default: + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + } + return true +} diff --git a/backend/internal/modules/organization/admin_summary_test.go 
b/backend/internal/modules/organization/admin_summary_test.go new file mode 100644 index 0000000..bd0e4a1 --- /dev/null +++ b/backend/internal/modules/organization/admin_summary_test.go @@ -0,0 +1,21 @@ +package organization + +import "testing" + +func TestTenantSafeTopologyExposureDoesNotExposeCoreMesh(t *testing.T) { + value := tenantSafeTopologyExposure() + forbidden := []string{ + "core_node_id", + "mesh_route", + "cluster_private_topology", + "certificate_serial", + } + for _, token := range forbidden { + if value == token { + t.Fatalf("topology exposure leaked forbidden token %q", token) + } + } + if value != "tenant_safe_no_core_mesh_topology" { + t.Fatalf("unexpected topology exposure marker: %q", value) + } +} diff --git a/backend/internal/modules/organization/module.go b/backend/internal/modules/organization/module.go new file mode 100644 index 0000000..9be55e3 --- /dev/null +++ b/backend/internal/modules/organization/module.go @@ -0,0 +1,518 @@ +package organization + +import ( + "context" + "encoding/json" + "errors" + "net/http" + "strings" + "time" + + "github.com/go-chi/chi/v5" + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/example/remote-access-platform/backend/internal/platform/authority" + "github.com/example/remote-access-platform/backend/internal/platform/httpx" + "github.com/example/remote-access-platform/backend/internal/platform/module" +) + +const ( + RoleOrgOwner = "org_owner" + RoleOrgAdmin = "org_admin" + RoleOrgOperator = "org_operator" + RoleOrgMember = "org_member" + RoleOrgViewer = "org_viewer" +) + +type Module struct { + db *pgxpool.Pool + authority *authority.Verifier +} + +type Organization struct { + ID string `json:"id"` + Slug string `json:"slug"` + Name string `json:"name"` + Status string `json:"status"` + Metadata json.RawMessage `json:"metadata"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type Membership struct { + ID 
string `json:"id"` + OrganizationID string `json:"organization_id"` + UserID string `json:"user_id"` + RoleID string `json:"role_id"` + Status string `json:"status"` + InvitedByUser *string `json:"invited_by_user_id,omitempty"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` +} + +type AdminSummary struct { + OrganizationID string `json:"organization_id"` + ResourceCount int64 `json:"resource_count"` + ActiveSessionCount int64 `json:"active_session_count"` + ServiceEndpoints []ServiceSummary `json:"service_endpoints"` + ConnectorStatus map[string]any `json:"connector_status"` + RecentAudit []OrgAuditEvent `json:"recent_audit"` + TopologyExposure string `json:"topology_exposure"` +} + +type ServiceSummary struct { + Protocol string `json:"protocol"` + Count int64 `json:"count"` +} + +type OrgAuditEvent struct { + ID string `json:"id"` + EventType string `json:"event_type"` + TargetType string `json:"target_type"` + TargetID string `json:"target_id"` + Payload json.RawMessage `json:"payload"` + CreatedAt time.Time `json:"created_at"` +} + +type createOrganizationRequest struct { + ActorUserID string `json:"actor_user_id"` + Slug string `json:"slug"` + Name string `json:"name"` + Metadata json.RawMessage `json:"metadata"` +} + +type addMembershipRequest struct { + ActorUserID string `json:"actor_user_id"` + UserID string `json:"user_id"` + RoleID string `json:"role_id"` +} + +func NewModule(deps module.Dependencies) *Module { + authorityVerifier, _ := authority.NewVerifier(deps.Config.Installation) + return &Module{db: deps.Infra.DB, authority: authorityVerifier} +} + +func (m *Module) Name() string { + return "organization" +} + +func (m *Module) RegisterRoutes(router chi.Router) { + router.Route("/organizations", func(r chi.Router) { + r.Get("/", m.listOrganizations) + r.Post("/", m.createOrganization) + r.Get("/{organizationID}", m.getOrganization) + r.Get("/{organizationID}/admin-summary", m.getAdminSummary) + 
r.Get("/{organizationID}/memberships", m.listMemberships) + r.Post("/{organizationID}/memberships", m.addMembership) + }) +} + +func (m *Module) listOrganizations(w http.ResponseWriter, r *http.Request) { + userID := r.URL.Query().Get("user_id") + if userID == "" { + httpx.WriteError(w, http.StatusBadRequest, "user_id is required") + return + } + platformRole, err := m.getPlatformRole(r.Context(), userID) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + var rows pgx.Rows + if isPlatformAdmin(platformRole) { + rows, err = m.db.Query(r.Context(), ` + SELECT id, slug, name, status, metadata, created_at, updated_at + FROM organizations + ORDER BY created_at DESC + `) + } else { + rows, err = m.db.Query(r.Context(), ` + SELECT o.id, o.slug, o.name, o.status, o.metadata, o.created_at, o.updated_at + FROM organizations o + INNER JOIN organization_memberships om ON om.organization_id = o.id + WHERE om.user_id = $1 AND om.status = 'active' + ORDER BY o.created_at DESC + `, userID) + } + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + defer rows.Close() + var organizations []Organization + for rows.Next() { + org, err := scanOrganization(rows) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + organizations = append(organizations, org) + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"organizations": organizations}) +} + +func (m *Module) getOrganization(w http.ResponseWriter, r *http.Request) { + orgID := chi.URLParam(r, "organizationID") + userID := r.URL.Query().Get("user_id") + if userID == "" { + httpx.WriteError(w, http.StatusBadRequest, "user_id is required") + return + } + if err := m.ensureOrgAccess(r.Context(), orgID, userID, false); err != nil { + status := http.StatusInternalServerError + if errors.Is(err, pgx.ErrNoRows) || errors.Is(err, errForbidden) { + status = http.StatusForbidden + } + 
httpx.WriteError(w, status, err.Error()) + return + } + org, err := m.getOrganizationByID(r.Context(), orgID) + if err != nil { + if errors.Is(err, pgx.ErrNoRows) { + httpx.WriteError(w, http.StatusNotFound, "organization not found") + return + } + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"organization": org}) +} + +func (m *Module) getAdminSummary(w http.ResponseWriter, r *http.Request) { + orgID := chi.URLParam(r, "organizationID") + actorUserID := r.URL.Query().Get("actor_user_id") + if actorUserID == "" { + httpx.WriteError(w, http.StatusBadRequest, "actor_user_id is required") + return + } + if err := m.ensureOrgAccess(r.Context(), orgID, actorUserID, true); err != nil { + httpx.WriteError(w, http.StatusForbidden, err.Error()) + return + } + summary, err := m.loadAdminSummary(r.Context(), orgID) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"admin_summary": summary}) +} + +func (m *Module) loadAdminSummary(ctx context.Context, orgID string) (AdminSummary, error) { + var resourceCount int64 + if err := m.db.QueryRow(ctx, ` + SELECT COUNT(*) + FROM resources + WHERE organization_id = $1::uuid + `, orgID).Scan(&resourceCount); err != nil { + return AdminSummary{}, err + } + + var activeSessionCount int64 + if err := m.db.QueryRow(ctx, ` + SELECT COUNT(*) + FROM remote_sessions + WHERE organization_id = $1::uuid + AND state = 'active' + `, orgID).Scan(&activeSessionCount); err != nil { + return AdminSummary{}, err + } + + rows, err := m.db.Query(ctx, ` + SELECT protocol, COUNT(*) + FROM resources + WHERE organization_id = $1::uuid + GROUP BY protocol + ORDER BY protocol + `, orgID) + if err != nil { + return AdminSummary{}, err + } + defer rows.Close() + var services []ServiceSummary + for rows.Next() { + var item ServiceSummary + if err := rows.Scan(&item.Protocol, 
&item.Count); err != nil { + return AdminSummary{}, err + } + services = append(services, item) + } + if err := rows.Err(); err != nil { + return AdminSummary{}, err + } + + auditRows, err := m.db.Query(ctx, ` + SELECT ae.id::text, ae.event_type, ae.target_type, ae.target_id, ae.payload, ae.created_at + FROM audit_events ae + LEFT JOIN remote_sessions rs ON rs.id = ae.remote_session_id + WHERE rs.organization_id = $1::uuid + ORDER BY ae.created_at DESC + LIMIT 20 + `, orgID) + if err != nil { + return AdminSummary{}, err + } + defer auditRows.Close() + var audit []OrgAuditEvent + for auditRows.Next() { + var item OrgAuditEvent + if err := auditRows.Scan(&item.ID, &item.EventType, &item.TargetType, &item.TargetID, &item.Payload, &item.CreatedAt); err != nil { + return AdminSummary{}, err + } + audit = append(audit, item) + } + if err := auditRows.Err(); err != nil { + return AdminSummary{}, err + } + + return AdminSummary{ + OrganizationID: orgID, + ResourceCount: resourceCount, + ActiveSessionCount: activeSessionCount, + ServiceEndpoints: services, + ConnectorStatus: map[string]any{ + "vpn": "not_implemented", + "connector": "not_implemented", + }, + RecentAudit: audit, + TopologyExposure: tenantSafeTopologyExposure(), + }, nil +} + +func tenantSafeTopologyExposure() string { + return "tenant_safe_no_core_mesh_topology" +} + +func (m *Module) createOrganization(w http.ResponseWriter, r *http.Request) { + var req createOrganizationRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid organization payload") + return + } + if req.ActorUserID == "" || req.Name == "" || req.Slug == "" { + httpx.WriteError(w, http.StatusBadRequest, "actor_user_id, slug, and name are required") + return + } + role, err := m.getPlatformRole(r.Context(), req.ActorUserID) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + if !isPlatformAdmin(role) { + httpx.WriteError(w, 
http.StatusForbidden, "platform admin role is required") + return + } + if len(req.Metadata) == 0 { + req.Metadata = json.RawMessage(`{}`) + } + if !json.Valid(req.Metadata) { + httpx.WriteError(w, http.StatusBadRequest, "metadata must be valid json") + return + } + now := time.Now().UTC() + org := Organization{ + ID: uuid.NewString(), + Slug: normalizeSlug(req.Slug), + Name: req.Name, + Status: "active", + Metadata: req.Metadata, + CreatedAt: now, + UpdatedAt: now, + } + membership := Membership{ + ID: uuid.NewString(), + OrganizationID: org.ID, + UserID: req.ActorUserID, + RoleID: RoleOrgOwner, + Status: "active", + CreatedAt: now, + UpdatedAt: now, + } + tx, err := m.db.Begin(r.Context()) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + defer tx.Rollback(r.Context()) + if _, err := tx.Exec(r.Context(), ` + INSERT INTO organizations (id, slug, name, status, metadata, created_at, updated_at) + VALUES ($1, $2, $3, $4, $5::jsonb, $6, $7) + `, org.ID, org.Slug, org.Name, org.Status, []byte(org.Metadata), org.CreatedAt, org.UpdatedAt); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + if _, err := tx.Exec(r.Context(), ` + INSERT INTO organization_memberships ( + id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + `, membership.ID, membership.OrganizationID, membership.UserID, membership.RoleID, membership.Status, req.ActorUserID, membership.CreatedAt, membership.UpdatedAt); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + if err := tx.Commit(r.Context()); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{ + "organization": org, + "membership": membership, + }) +} + +func (m *Module) listMemberships(w http.ResponseWriter, r *http.Request) { + orgID 
:= chi.URLParam(r, "organizationID") + userID := r.URL.Query().Get("user_id") + if userID == "" { + httpx.WriteError(w, http.StatusBadRequest, "user_id is required") + return + } + if err := m.ensureOrgAccess(r.Context(), orgID, userID, true); err != nil { + httpx.WriteError(w, http.StatusForbidden, err.Error()) + return + } + rows, err := m.db.Query(r.Context(), ` + SELECT id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at + FROM organization_memberships + WHERE organization_id = $1 + ORDER BY created_at DESC + `, orgID) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + defer rows.Close() + var memberships []Membership + for rows.Next() { + membership, err := scanMembership(rows) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + memberships = append(memberships, membership) + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"memberships": memberships}) +} + +func (m *Module) addMembership(w http.ResponseWriter, r *http.Request) { + orgID := chi.URLParam(r, "organizationID") + var req addMembershipRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid membership payload") + return + } + if req.ActorUserID == "" || req.UserID == "" || req.RoleID == "" { + httpx.WriteError(w, http.StatusBadRequest, "actor_user_id, user_id, and role_id are required") + return + } + if err := m.ensureOrgAccess(r.Context(), orgID, req.ActorUserID, true); err != nil { + httpx.WriteError(w, http.StatusForbidden, err.Error()) + return + } + now := time.Now().UTC() + membership := Membership{ + ID: uuid.NewString(), + OrganizationID: orgID, + UserID: req.UserID, + RoleID: req.RoleID, + Status: "active", + InvitedByUser: &req.ActorUserID, + CreatedAt: now, + UpdatedAt: now, + } + if _, err := m.db.Exec(r.Context(), ` + INSERT INTO organization_memberships ( + id, organization_id, 
user_id, role_id, status, invited_by_user_id, created_at, updated_at + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + ON CONFLICT (organization_id, user_id) DO UPDATE SET + role_id = EXCLUDED.role_id, + status = 'active', + invited_by_user_id = EXCLUDED.invited_by_user_id, + updated_at = EXCLUDED.updated_at + `, membership.ID, membership.OrganizationID, membership.UserID, membership.RoleID, membership.Status, membership.InvitedByUser, membership.CreatedAt, membership.UpdatedAt); err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + httpx.WriteJSON(w, http.StatusCreated, map[string]any{"membership": membership}) +} + +var errForbidden = errors.New("forbidden") + +func (m *Module) ensureOrgAccess(ctx context.Context, orgID, userID string, adminRequired bool) error { + role, err := m.getPlatformRole(ctx, userID) + if err != nil { + return err + } + if isPlatformAdmin(role) { + return nil + } + query := ` + SELECT role_id + FROM organization_memberships + WHERE organization_id = $1 AND user_id = $2 AND status = 'active' + ` + var roleID string + if err := m.db.QueryRow(ctx, query, orgID, userID).Scan(&roleID); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return errForbidden + } + return err + } + if adminRequired && roleID != RoleOrgOwner && roleID != RoleOrgAdmin { + return errForbidden + } + return nil +} + +func (m *Module) getPlatformRole(ctx context.Context, userID string) (string, error) { + return authority.EffectivePlatformRole(ctx, m.db, m.authority, userID) +} + +func isPlatformAdmin(role string) bool { + return role == "platform_admin" || role == "platform_recovery_admin" +} + +func (m *Module) getOrganizationByID(ctx context.Context, orgID string) (Organization, error) { + row := m.db.QueryRow(ctx, ` + SELECT id, slug, name, status, metadata, created_at, updated_at + FROM organizations + WHERE id = $1 + `, orgID) + return scanOrganization(row) +} + +type rowScanner interface { + Scan(dest ...any) error +} + +func 
// normalizeSlug returns in with surrounding whitespace removed and all
// letters lower-cased.
func normalizeSlug(in string) string {
	trimmed := strings.TrimSpace(in)
	return strings.ToLower(trimmed)
}
// Resource is the JSON shape in which the resource endpoints return a stored
// resource (see the "resource" / "resources" response keys).
type Resource struct {
	ID             string `json:"id"`
	OrganizationID string `json:"organization_id"`
	Name           string `json:"name"`
	Address        string `json:"address"`
	Protocol       string `json:"protocol"`
	// SecretRef is optional; a nil value is omitted from the JSON output.
	SecretRef *string `json:"secret_ref,omitempty"`
	// NOTE(review): the four mode/profile fields below presumably hold one of
	// the CertificateVerificationMode*, RenderQualityProfile*, ClipboardMode*
	// and FileTransferMode* constants — confirm where that is enforced.
	CertificateVerificationMode string          `json:"certificate_verification_mode"`
	RenderQualityProfile        string          `json:"render_quality_profile"`
	ClipboardMode               string          `json:"clipboard_mode"`
	FileTransferMode            string          `json:"file_transfer_mode"`
	Metadata                    json.RawMessage `json:"metadata"`
	CreatedAt                   time.Time       `json:"created_at"`
	UpdatedAt                   time.Time       `json:"updated_at"`
}
authority.NewVerifier(deps.Config.Installation) + return &Module{db: deps.Infra.DB, appEnv: deps.Config.App.Env, secretStore: secretStore, authority: authorityVerifier} +} + +func (m *Module) Name() string { + return "resource" +} + +func (m *Module) RegisterRoutes(router chi.Router) { + router.Route("/resources", func(r chi.Router) { + r.Get("/", m.listResources) + r.Post("/", m.createResource) + r.Get("/{resourceID}", m.getResource) + r.Put("/{resourceID}", m.updateResource) + r.Put("/{resourceID}/secret", m.upsertResourceSecret) + }) +} + +func (m *Module) listResources(w http.ResponseWriter, r *http.Request) { + userID := r.URL.Query().Get("user_id") + orgID := r.URL.Query().Get("organization_id") + if userID == "" { + httpx.WriteError(w, http.StatusBadRequest, "user_id is required") + return + } + platformRole, err := m.getPlatformRole(r.Context(), userID) + if err != nil { + httpx.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + query := ` + SELECT r.id, r.organization_id, r.name, r.address, r.protocol, r.secret_ref, + r.certificate_verification_mode, r.metadata, r.created_at, r.updated_at, + COALESCE(rp.clipboard_mode, 'disabled') AS clipboard_mode, + COALESCE(rp.file_transfer_mode, 'disabled') AS file_transfer_mode + FROM resources r + LEFT JOIN resource_policies rp ON rp.resource_id = r.id + ` + args := make([]any, 0, 2) + if platformRole == "platform_admin" || platformRole == "platform_recovery_admin" { + if orgID != "" { + query += ` WHERE r.organization_id = $1` + args = append(args, orgID) + } + query += ` ORDER BY r.created_at DESC` + } else { + query += ` + INNER JOIN organization_memberships om ON om.organization_id = r.organization_id + WHERE om.user_id = $1 AND om.status = 'active' + ` + args = append(args, userID) + if orgID != "" { + query += ` AND r.organization_id = $2` + args = append(args, orgID) + } + query += ` ORDER BY r.created_at DESC` + } + rows, err := m.db.Query(r.Context(), query, args...) 
// createResource handles POST /resources. It decodes and normalizes the
// request, checks secret readiness, verifies that the actor has admin-level
// access to the target organization, and then inserts the resource row and its
// policy row inside a single transaction. On success it responds 201 with the
// resource as built from the request (it is not re-read from the database).
func (m *Module) createResource(w http.ResponseWriter, r *http.Request) {
	req, err := decodeUpsertRequest(r)
	if err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}
	if err := secrets.ValidateResourceSecretReadiness(req.Protocol, req.SecretRef, req.Metadata, m.appEnv); err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}

	// One timestamp is shared by created_at, updated_at and the policy row.
	now := time.Now().UTC()
	resource := Resource{
		ID:                          uuid.NewString(),
		OrganizationID:              req.OrganizationID,
		Name:                        req.Name,
		Address:                     req.Address,
		Protocol:                    req.Protocol,
		SecretRef:                   req.SecretRef,
		CertificateVerificationMode: req.CertificateVerificationMode,
		RenderQualityProfile:        req.RenderQualityProfile,
		ClipboardMode:               req.ClipboardMode,
		FileTransferMode:            req.FileTransferMode,
		Metadata:                    req.Metadata,
		CreatedAt:                   now,
		UpdatedAt:                   now,
	}
	// adminRequired=true: see ensureResourceAccess for which roles pass.
	if err := m.ensureResourceAccess(r.Context(), req.OrganizationID, req.ActorUserID, true); err != nil {
		httpx.WriteError(w, http.StatusForbidden, err.Error())
		return
	}
	tx, err := m.db.Begin(r.Context())
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// Covers every early return below.
	// NOTE(review): presumably a no-op after a successful Commit — confirm against pgx docs.
	defer tx.Rollback(r.Context())
	// The INSERT has no render_quality_profile column; the profile is carried
	// inside the metadata document produced by decodeUpsertRequest.
	if _, err := tx.Exec(r.Context(), `
		INSERT INTO resources (
			id, organization_id, name, address, protocol, secret_ref, certificate_verification_mode, metadata, created_at, updated_at
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9, $10)
	`, resource.ID, resource.OrganizationID, resource.Name, resource.Address, resource.Protocol, resource.SecretRef, resource.CertificateVerificationMode, []byte(resource.Metadata), resource.CreatedAt, resource.UpdatedAt); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// Clipboard and file-transfer modes live in resource_policies, not resources.
	if err := upsertResourcePolicy(r.Context(), tx, resource.ID, resource.ClipboardMode, resource.FileTransferMode, now); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	if err := tx.Commit(r.Context()); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	httpx.WriteJSON(w, http.StatusCreated, map[string]any{"resource": resource})
}
// upsertResourceSecret handles PUT /resources/{resourceID}/secret. It stores
// the submitted secret payload through the configured secret store, points the
// resource row at the resulting secret reference, and records a
// "resource_secret_rotated" audit event, all inside one transaction. The
// response carries the descriptor returned by the secret store.
func (m *Module) upsertResourceSecret(w http.ResponseWriter, r *http.Request) {
	// NewModule leaves secretStore nil when no store was supplied.
	if m.secretStore == nil {
		httpx.WriteError(w, http.StatusServiceUnavailable, "resource secret encryption is not configured")
		return
	}
	var req upsertResourceSecretRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid resource secret payload")
		return
	}
	if req.ActorUserID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "actor_user_id is required")
		return
	}
	resourceID := chi.URLParam(r, "resourceID")
	resource, err := m.getByID(r.Context(), resourceID)
	if err != nil {
		if errors.Is(err, pgx.ErrNoRows) {
			httpx.WriteError(w, http.StatusNotFound, "resource not found")
			return
		}
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// adminRequired=true: see ensureResourceAccess for which roles pass.
	if err := m.ensureResourceAccess(r.Context(), resource.OrganizationID, req.ActorUserID, true); err != nil {
		httpx.WriteError(w, http.StatusForbidden, err.Error())
		return
	}
	tx, err := m.db.Begin(r.Context())
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// Covers every early return below.
	// NOTE(review): presumably a no-op after a successful Commit — confirm against pgx docs.
	defer tx.Rollback(r.Context())
	// Bind the store to the transaction so the secret write commits or rolls
	// back together with the resource update and the audit event.
	secretStore := m.secretStore.WithDB(tx)
	secretRef := secrets.DefaultResourceSecretRef(resource.OrganizationID, resource.ID)
	descriptor, err := secretStore.Upsert(r.Context(), secrets.UpsertResourceSecretCommand{
		OrganizationID: resource.OrganizationID,
		ResourceID:     resource.ID,
		Protocol:       resource.Protocol,
		SecretRef:      secretRef,
		Payload:        req.Payload,
		Metadata:       req.Metadata,
		ActorUserID:    req.ActorUserID,
	})
	if err != nil {
		// NOTE(review): every Upsert failure is reported as 400, including any
		// storage-level error the store might return — confirm this is intended.
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}
	// Persist the reference reported by the store rather than the locally
	// computed secretRef.
	if _, err := tx.Exec(r.Context(), `
		UPDATE resources
		SET secret_ref = $2, updated_at = $3
		WHERE id = $1::uuid
	`, resource.ID, descriptor.SecretRef, time.Now().UTC()); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	if err := writeAuditEvent(r.Context(), tx, "resource_secret_rotated", req.ActorUserID, "resource_secret", descriptor.SecretRef, map[string]any{
		"resource_id":     resource.ID,
		"organization_id": resource.OrganizationID,
		"protocol":        resource.Protocol,
		"version":         descriptor.Version,
		"secret_ref":      descriptor.SecretRef,
	}); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	if err := tx.Commit(r.Context()); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"secret": descriptor})
}
+ return nil, err + } + req.Metadata = metadata + return &req, nil +} + +func normalizeCertificateVerificationMode(mode string) (string, error) { + switch mode { + case "", CertificateVerificationModeStrict: + return CertificateVerificationModeStrict, nil + case CertificateVerificationModeIgnore: + return CertificateVerificationModeIgnore, nil + default: + return "", errors.New("certificate_verification_mode must be one of: strict, ignore") + } +} + +func normalizeClipboardMode(mode string) (string, error) { + switch mode { + case "", ClipboardModeDisabled: + return ClipboardModeDisabled, nil + case ClipboardModeClientToServer, ClipboardModeServerToClient, ClipboardModeBidirectional: + return mode, nil + default: + return "", errors.New("clipboard_mode must be one of: disabled, client_to_server, server_to_client, bidirectional") + } +} + +func normalizeFileTransferMode(mode string) (string, error) { + switch mode { + case "", FileTransferModeDisabled: + return FileTransferModeDisabled, nil + case FileTransferModeClientToServer, FileTransferModeServerToClient, FileTransferModeBidirectional: + return mode, nil + default: + return "", errors.New("file_transfer_mode must be one of: disabled, client_to_server, server_to_client, bidirectional") + } +} + +func normalizeMetadata(raw json.RawMessage, certificateVerificationMode, renderQualityProfile string) (json.RawMessage, error) { + if len(raw) == 0 { + raw = json.RawMessage(`{}`) + } + if !json.Valid(raw) { + return nil, errors.New("metadata must be valid json") + } + var metadata map[string]any + if err := json.Unmarshal(raw, &metadata); err != nil { + return nil, errors.New("metadata must be a json object") + } + metadata["certificate_verification_mode"] = certificateVerificationMode + metadata["render_quality_profile"] = renderQualityProfile + encoded, err := json.Marshal(metadata) + if err != nil { + return nil, err + } + return json.RawMessage(encoded), nil +} + +func (m *Module) getByID(ctx context.Context, 
resourceID string) (Resource, error) { + row := m.db.QueryRow(ctx, ` + SELECT r.id, r.organization_id, r.name, r.address, r.protocol, r.secret_ref, + r.certificate_verification_mode, r.metadata, r.created_at, r.updated_at, + COALESCE(rp.clipboard_mode, 'disabled') AS clipboard_mode, + COALESCE(rp.file_transfer_mode, 'disabled') AS file_transfer_mode + FROM resources r + LEFT JOIN resource_policies rp ON rp.resource_id = r.id + WHERE r.id = $1 + `, resourceID) + return scanResource(row) +} + +func (m *Module) ensureResourceAccess(ctx context.Context, orgID, userID string, adminRequired bool) error { + role, err := m.getPlatformRole(ctx, userID) + if err != nil { + return err + } + if role == "platform_admin" || role == "platform_recovery_admin" { + return nil + } + var membershipRole string + if err := m.db.QueryRow(ctx, ` + SELECT role_id + FROM organization_memberships + WHERE organization_id = $1 AND user_id = $2 AND status = 'active' + `, orgID, userID).Scan(&membershipRole); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return errors.New("forbidden") + } + return err + } + if adminRequired && membershipRole != "org_owner" && membershipRole != "org_admin" { + return errors.New("forbidden") + } + return nil +} + +func (m *Module) getPlatformRole(ctx context.Context, userID string) (string, error) { + return authority.EffectivePlatformRole(ctx, m.db, m.authority, userID) +} + +type rowScanner interface { + Scan(dest ...any) error +} + +func scanResource(row rowScanner) (Resource, error) { + var resource Resource + if err := row.Scan( + &resource.ID, + &resource.OrganizationID, + &resource.Name, + &resource.Address, + &resource.Protocol, + &resource.SecretRef, + &resource.CertificateVerificationMode, + &resource.Metadata, + &resource.CreatedAt, + &resource.UpdatedAt, + &resource.ClipboardMode, + &resource.FileTransferMode, + ); err != nil { + return Resource{}, err + } + if len(resource.Metadata) == 0 { + resource.Metadata = json.RawMessage(`{}`) + } + if 
resource.CertificateVerificationMode == "" { + resource.CertificateVerificationMode = CertificateVerificationModeStrict + } + if resource.RenderQualityProfile == "" { + resource.RenderQualityProfile = renderQualityProfileFromMetadata(resource.Metadata) + } + if resource.ClipboardMode == "" { + resource.ClipboardMode = ClipboardModeDisabled + } + if resource.FileTransferMode == "" { + resource.FileTransferMode = FileTransferModeDisabled + } + return resource, nil +} + +func upsertResourcePolicy(ctx context.Context, tx pgx.Tx, resourceID, clipboardMode, fileTransferMode string, now time.Time) error { + clipboardEnabled := clipboardMode != ClipboardModeDisabled + fileTransferEnabled := fileTransferMode == FileTransferModeClientToServer || fileTransferMode == FileTransferModeBidirectional + _, err := tx.Exec(ctx, ` + INSERT INTO resource_policies ( + resource_id, clipboard_enabled, clipboard_mode, file_transfer_enabled, file_transfer_mode, created_at, updated_at + ) VALUES ($1, $2, $3, $4, $5, $6, $6) + ON CONFLICT (resource_id) DO UPDATE SET + clipboard_enabled = EXCLUDED.clipboard_enabled, + clipboard_mode = EXCLUDED.clipboard_mode, + file_transfer_enabled = EXCLUDED.file_transfer_enabled, + file_transfer_mode = EXCLUDED.file_transfer_mode, + updated_at = EXCLUDED.updated_at + `, resourceID, clipboardEnabled, clipboardMode, fileTransferEnabled, fileTransferMode, now) + return err +} + +func writeAuditEvent(ctx context.Context, tx pgx.Tx, eventType, actorUserID, targetType, targetID string, payload map[string]any) error { + encoded, err := json.Marshal(payload) + if err != nil { + return err + } + _, err = tx.Exec(ctx, ` + INSERT INTO audit_events ( + id, actor_user_id, event_type, target_type, target_id, payload, created_at + ) VALUES ( + $1::uuid, NULLIF($2, '')::uuid, $3, $4, $5, $6::jsonb, $7 + ) + `, uuid.NewString(), actorUserID, eventType, targetType, targetID, encoded, time.Now().UTC()) + return err +} + +func normalizeRenderQualityProfile(profile string) 
(string, error) { + switch profile { + case "", RenderQualityProfileBalanced: + return RenderQualityProfileBalanced, nil + case RenderQualityProfileLowBandwidth, RenderQualityProfileHighQuality, RenderQualityProfileTextPriority: + return profile, nil + default: + return "", errors.New("render_quality_profile must be one of: low_bandwidth, balanced, high_quality, text_priority") + } +} + +func renderQualityProfileFromMetadata(raw json.RawMessage) string { + if len(raw) == 0 { + return RenderQualityProfileBalanced + } + var metadata map[string]any + if err := json.Unmarshal(raw, &metadata); err != nil { + return RenderQualityProfileBalanced + } + if profile, ok := metadata["render_quality_profile"].(string); ok { + switch profile { + case RenderQualityProfileLowBandwidth, RenderQualityProfileBalanced, RenderQualityProfileHighQuality, RenderQualityProfileTextPriority: + return profile + } + } + return RenderQualityProfileBalanced +} diff --git a/backend/internal/modules/sessionbroker/dataplane.go b/backend/internal/modules/sessionbroker/dataplane.go new file mode 100644 index 0000000..205fb95 --- /dev/null +++ b/backend/internal/modules/sessionbroker/dataplane.go @@ -0,0 +1,219 @@ +package sessionbroker + +import ( + "crypto/rsa" + "fmt" + "strings" + "time" + + "github.com/golang-jwt/jwt/v5" + "github.com/google/uuid" + + "github.com/example/remote-access-platform/backend/internal/platform/secrets" + sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session" +) + +const ( + directWorkerTLSTrustModeSmokeInsecure = "smoke_insecure" + directWorkerTLSTrustModePublicCA = "public_ca" + directWorkerTLSTrustModePlatformCA = "platform_ca" +) + +type DataPlaneTokenClaims struct { + SessionID string `json:"session_id"` + AttachmentID string `json:"attachment_id"` + UserID string `json:"user_id"` + OrganizationID string `json:"organization_id"` + ClusterID string `json:"cluster_id,omitempty"` + WorkerID string `json:"worker_id"` + ResourceID 
string `json:"resource_id"` + AllowedChannels []string `json:"allowed_channels"` + ExpiresAtValue time.Time `json:"expires_at"` + jwt.RegisteredClaims +} + +func (s *Service) buildDataPlaneOffer(session RemoteSession, attachment SessionAttachment) (*sessioncontracts.DataPlaneOffer, error) { + if s.cfg.DataPlane.TokenTTL <= 0 || s.cfg.DataPlane.TokenPrivateKeyPEM == "" { + return nil, nil + } + + now := s.now().UTC() + expiresAt := now.Add(s.cfg.DataPlane.TokenTTL) + allowedChannels := dataPlaneAllowedChannelsFromSession(session) + jti := uuid.NewString() + claims := DataPlaneTokenClaims{ + SessionID: session.ID, + AttachmentID: attachment.ID, + UserID: attachment.UserID, + OrganizationID: session.OrganizationID, + WorkerID: session.WorkerID, + ResourceID: session.ResourceID, + AllowedChannels: allowedChannels, + ExpiresAtValue: expiresAt, + RegisteredClaims: jwt.RegisteredClaims{ + ID: jti, + Issuer: s.cfg.Auth.Issuer, + Subject: attachment.UserID, + Audience: jwt.ClaimStrings{"rap-data-plane", "worker:" + session.WorkerID}, + IssuedAt: jwt.NewNumericDate(now), + NotBefore: jwt.NewNumericDate(now), + ExpiresAt: jwt.NewNumericDate(expiresAt), + }, + } + token, err := signDataPlaneToken(claims, s.cfg.DataPlane.TokenPrivateKeyPEM) + if err != nil { + return nil, err + } + + candidates := s.buildDataPlaneCandidates(session) + preferred := sessioncontracts.DataPlaneCandidateBackendGateway + if len(candidates) > 0 { + preferred = candidates[0].Type + } + + return &sessioncontracts.DataPlaneOffer{ + Preferred: preferred, + Token: token, + ExpiresAt: expiresAt, + Candidates: candidates, + }, nil +} + +func (s *Service) buildDataPlaneCandidates(session RemoteSession) []sessioncontracts.DataPlaneCandidate { + var candidates []sessioncontracts.DataPlaneCandidate + if directURL := s.directWorkerWSSURL(session.WorkerID); directURL != "" && s.canAdvertiseDirectWorkerWSS() { + metadata := map[string]any(nil) + if s.cfg.DataPlane.DirectWorkerJSONRuntime { + metadata = 
map[string]any{ + "runtime_transport": "json_v1", + "traffic_ready": true, + } + s.addDirectWorkerTLSTrustMetadata(metadata) + if s.cfg.DataPlane.DirectWorkerBinaryRender { + metadata["render_transport"] = "binary_v1" + metadata["binary_render"] = true + metadata["supported_color_modes"] = []string{"full_color", "grayscale"} + metadata["default_color_mode"] = "full_color" + } + } + candidates = append(candidates, sessioncontracts.DataPlaneCandidate{ + Type: sessioncontracts.DataPlaneCandidateDirectWorkerWSS, + URL: directURL, + WorkerID: session.WorkerID, + Priority: 10, + Metadata: metadata, + }) + } + if s.cfg.DataPlane.BackendGatewayURL != "" { + candidates = append(candidates, sessioncontracts.DataPlaneCandidate{ + Type: sessioncontracts.DataPlaneCandidateBackendGateway, + URL: s.cfg.DataPlane.BackendGatewayURL, + Priority: 100, + }) + } + return candidates +} + +func (s *Service) canAdvertiseDirectWorkerWSS() bool { + trustMode := normalizeDirectWorkerTLSTrustMode(s.cfg.DataPlane.DirectWorkerTLSTrustMode) + return !secrets.IsProductionEnv(s.cfg.App.Env) || directWorkerTLSTrustModeIsProductionTrusted(trustMode) +} + +func (s *Service) addDirectWorkerTLSTrustMetadata(metadata map[string]any) { + trustMode := normalizeDirectWorkerTLSTrustMode(s.cfg.DataPlane.DirectWorkerTLSTrustMode) + metadata["tls_trust_mode"] = trustMode + metadata["production_trusted"] = directWorkerTLSTrustModeIsProductionTrusted(trustMode) + metadata["smoke_only"] = trustMode == directWorkerTLSTrustModeSmokeInsecure + if s.cfg.DataPlane.DirectWorkerTLSCARef != "" { + metadata["tls_ca_ref"] = s.cfg.DataPlane.DirectWorkerTLSCARef + } +} + +func normalizeDirectWorkerTLSTrustMode(mode string) string { + switch strings.ToLower(strings.TrimSpace(mode)) { + case directWorkerTLSTrustModePublicCA: + return directWorkerTLSTrustModePublicCA + case directWorkerTLSTrustModePlatformCA: + return directWorkerTLSTrustModePlatformCA + default: + return directWorkerTLSTrustModeSmokeInsecure + } +} + +func 
directWorkerTLSTrustModeIsProductionTrusted(mode string) bool { + return mode == directWorkerTLSTrustModePublicCA || mode == directWorkerTLSTrustModePlatformCA +} + +func (s *Service) directWorkerWSSURL(workerID string) string { + template := strings.TrimSpace(s.cfg.DataPlane.DirectWorkerWSSURLTemplate) + if template == "" || workerID == "" { + return "" + } + return strings.ReplaceAll(template, "{worker_id}", workerID) +} + +func signDataPlaneToken(claims DataPlaneTokenClaims, privateKeyPEM string) (string, error) { + privateKey, err := jwt.ParseRSAPrivateKeyFromPEM([]byte(privateKeyPEM)) + if err != nil { + return "", fmt.Errorf("parse data-plane private key: %w", err) + } + token := jwt.NewWithClaims(jwt.SigningMethodRS256, claims) + signed, err := token.SignedString(privateKey) + if err != nil { + return "", fmt.Errorf("sign data-plane token: %w", err) + } + return signed, nil +} + +func parseDataPlaneToken(tokenValue string, publicKey *rsa.PublicKey) (*DataPlaneTokenClaims, error) { + claims := &DataPlaneTokenClaims{} + token, err := jwt.ParseWithClaims(tokenValue, claims, func(token *jwt.Token) (any, error) { + if token.Method != jwt.SigningMethodRS256 { + return nil, fmt.Errorf("unexpected data-plane signing method: %s", token.Header["alg"]) + } + return publicKey, nil + }) + if err != nil { + return nil, err + } + if !token.Valid { + return nil, fmt.Errorf("data-plane token invalid") + } + return claims, nil +} + +func dataPlaneAllowedChannelsFromSession(session RemoteSession) []string { + channels := []string{ + sessioncontracts.DataPlaneChannelControl, + sessioncontracts.DataPlaneChannelInput, + sessioncontracts.DataPlaneChannelRender, + sessioncontracts.DataPlaneChannelTelemetry, + } + metadata := decodeJSONMap(session.Metadata) + policy, _ := metadata["policy"].(map[string]any) + if policy != nil { + if mode, _ := policy["clipboard_mode"].(string); mode != "" && mode != string(ResourceClipboardModeDisabled) { + channels = append(channels, 
sessioncontracts.DataPlaneChannelClipboard) + } + if mode, _ := policy["file_transfer_mode"].(string); fileTransferAllowsClientToServer(ResourceFileTransferMode(mode)) { + channels = append(channels, sessioncontracts.DataPlaneChannelFileUpload) + } + if mode, _ := policy["file_transfer_mode"].(string); fileTransferAllowsServerToClient(ResourceFileTransferMode(mode)) { + channels = append(channels, sessioncontracts.DataPlaneChannelFileDownload) + } + } + return channels +} + +func (s *Service) attachDataPlaneOffer(result *SessionControlResult) error { + if result == nil || result.Attachment == nil { + return nil + } + result.GatewayURL = s.cfg.DataPlane.BackendGatewayURL + offer, err := s.buildDataPlaneOffer(result.Session, *result.Attachment) + if err != nil { + return err + } + result.DataPlane = offer + return nil +} diff --git a/backend/internal/modules/sessionbroker/dataplane_test.go b/backend/internal/modules/sessionbroker/dataplane_test.go new file mode 100644 index 0000000..d2c875c --- /dev/null +++ b/backend/internal/modules/sessionbroker/dataplane_test.go @@ -0,0 +1,357 @@ +package sessionbroker + +import ( + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "encoding/json" + "encoding/pem" + "slices" + "strings" + "testing" + "time" + + "github.com/example/remote-access-platform/backend/internal/platform/config" + "github.com/example/remote-access-platform/backend/internal/platform/module" + sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session" +) + +func TestDataPlaneTokenScopeValidation(t *testing.T) { + now := time.Now().UTC().Truncate(time.Second) + privateKeyPEM, publicKey := testRS256Key(t) + service := &Service{ + cfg: module.Config{ + Auth: config.AuthConfig{ + Issuer: "rap-api-test", + }, + DataPlane: config.DataPlaneConfig{ + TokenTTL: time.Minute, + TokenPrivateKeyPEM: privateKeyPEM, + BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws", + }, + }, + now: func() time.Time { return now }, + } + 
session := RemoteSession{ + ID: "session-1", + OrganizationID: "org-1", + ResourceID: "resource-1", + WorkerID: "worker-1", + Metadata: mustJSON(t, map[string]any{"policy": map[string]any{"clipboard_mode": "bidirectional", "file_transfer_mode": "client_to_server"}}), + } + attachment := SessionAttachment{ + ID: "attachment-1", + UserID: "user-1", + } + + offer, err := service.buildDataPlaneOffer(session, attachment) + if err != nil { + t.Fatalf("buildDataPlaneOffer returned error: %v", err) + } + if offer == nil { + t.Fatal("expected data-plane offer") + } + + claims, err := parseDataPlaneToken(offer.Token, publicKey) + if err != nil { + t.Fatalf("parseDataPlaneToken returned error: %v", err) + } + assertEqual(t, claims.SessionID, session.ID, "session_id") + assertEqual(t, claims.AttachmentID, attachment.ID, "attachment_id") + assertEqual(t, claims.UserID, attachment.UserID, "user_id") + assertEqual(t, claims.OrganizationID, session.OrganizationID, "organization_id") + assertEqual(t, claims.WorkerID, session.WorkerID, "worker_id") + assertEqual(t, claims.ResourceID, session.ResourceID, "resource_id") + if claims.ID == "" { + t.Fatal("expected jti") + } + if claims.ExpiresAt == nil || !claims.ExpiresAt.Time.Equal(now.Add(time.Minute)) { + t.Fatalf("unexpected expires_at: %v", claims.ExpiresAt) + } + if !claims.ExpiresAtValue.Equal(now.Add(time.Minute)) { + t.Fatalf("unexpected expires_at claim value: %v", claims.ExpiresAtValue) + } + for _, channel := range []string{ + sessioncontracts.DataPlaneChannelControl, + sessioncontracts.DataPlaneChannelInput, + sessioncontracts.DataPlaneChannelRender, + sessioncontracts.DataPlaneChannelTelemetry, + sessioncontracts.DataPlaneChannelClipboard, + sessioncontracts.DataPlaneChannelFileUpload, + } { + if !slices.Contains(claims.AllowedChannels, channel) { + t.Fatalf("expected allowed channel %q in %v", channel, claims.AllowedChannels) + } + } +} + +func TestDataPlaneOfferResponseShapeCompatibility(t *testing.T) { + now := 
time.Now().UTC().Truncate(time.Second) + privateKeyPEM, _ := testRS256Key(t) + service := &Service{ + cfg: module.Config{ + Auth: config.AuthConfig{Issuer: "rap-api-test"}, + DataPlane: config.DataPlaneConfig{ + TokenTTL: time.Minute, + TokenPrivateKeyPEM: privateKeyPEM, + BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws", + DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane", + DirectWorkerJSONRuntime: true, + DirectWorkerTLSTrustMode: "smoke_insecure", + }, + }, + now: func() time.Time { return now }, + } + result := &SessionControlResult{ + Session: RemoteSession{ + ID: "session-1", + OrganizationID: "org-1", + ResourceID: "resource-1", + WorkerID: "worker-1", + Metadata: mustJSON(t, map[string]any{"policy": map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "disabled"}}), + }, + Attachment: &SessionAttachment{ID: "attachment-1", UserID: "user-1"}, + AttachToken: &sessioncontracts.AttachTokenClaims{ + Token: "existing-attach-token", + SessionID: "session-1", + AttachmentID: "attachment-1", + UserID: "user-1", + WorkerID: "worker-1", + ExpiresAt: now.Add(2 * time.Minute), + }, + } + + if err := service.attachDataPlaneOffer(result); err != nil { + t.Fatalf("attachDataPlaneOffer returned error: %v", err) + } + payload, err := json.Marshal(result) + if err != nil { + t.Fatalf("marshal response: %v", err) + } + var decoded map[string]any + if err := json.Unmarshal(payload, &decoded); err != nil { + t.Fatalf("decode response: %v", err) + } + if decoded["session"] == nil || decoded["attachment"] == nil || decoded["attach_token"] == nil { + t.Fatalf("response lost existing fields: %s", payload) + } + if decoded["data_plane"] == nil || decoded["gateway_url"] == nil { + t.Fatalf("response missing data-plane fields: %s", payload) + } + if result.DataPlane == nil { + t.Fatal("expected data-plane offer") + } + if result.DataPlane.Preferred != sessioncontracts.DataPlaneCandidateDirectWorkerWSS { + 
t.Fatalf("unexpected preferred candidate: %s", result.DataPlane.Preferred) + } + if len(result.DataPlane.Candidates) != 2 { + t.Fatalf("expected direct and fallback candidates, got %d", len(result.DataPlane.Candidates)) + } + if result.DataPlane.Candidates[0].URL != "wss://worker-1.worker.example.test/rap/v1/data-plane" { + t.Fatalf("unexpected direct candidate URL: %s", result.DataPlane.Candidates[0].URL) + } + if result.DataPlane.Candidates[0].Metadata["runtime_transport"] != "json_v1" { + t.Fatalf("direct candidate is missing json_v1 runtime metadata: %#v", result.DataPlane.Candidates[0].Metadata) + } + if result.DataPlane.Candidates[0].Metadata["traffic_ready"] != true { + t.Fatalf("direct candidate is missing traffic_ready metadata: %#v", result.DataPlane.Candidates[0].Metadata) + } + if result.DataPlane.Candidates[0].Metadata["smoke_only"] != true { + t.Fatalf("direct candidate should be marked smoke-only by default: %#v", result.DataPlane.Candidates[0].Metadata) + } + if result.DataPlane.Candidates[0].Metadata["production_trusted"] != false { + t.Fatalf("smoke direct candidate must not be production-trusted: %#v", result.DataPlane.Candidates[0].Metadata) + } + if !strings.Contains(result.DataPlane.Candidates[1].URL, "/api/v1/gateway/ws") { + t.Fatalf("unexpected backend candidate URL: %s", result.DataPlane.Candidates[1].URL) + } +} + +func TestDataPlaneDirectCandidateMetadataRequiresRuntimeFlag(t *testing.T) { + service := &Service{ + cfg: module.Config{ + DataPlane: config.DataPlaneConfig{ + BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws", + DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane", + DirectWorkerTLSTrustMode: "smoke_insecure", + }, + }, + } + candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"}) + if len(candidates) != 2 { + t.Fatalf("expected direct and fallback candidates, got %d", len(candidates)) + } + if candidates[0].Metadata != nil { + t.Fatalf("direct 
candidate must not advertise json_v1 before runtime flag is enabled: %#v", candidates[0].Metadata) + } +} + +func TestDataPlaneDirectCandidateAdvertisesBinaryRenderOnlyWhenEnabled(t *testing.T) { + service := &Service{ + cfg: module.Config{ + DataPlane: config.DataPlaneConfig{ + BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws", + DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane", + DirectWorkerJSONRuntime: true, + DirectWorkerBinaryRender: true, + DirectWorkerTLSTrustMode: "platform_ca", + DirectWorkerTLSCARef: "rap-platform-ca:v1", + }, + }, + } + candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"}) + if len(candidates) != 2 { + t.Fatalf("expected direct and fallback candidates, got %d", len(candidates)) + } + if candidates[0].Metadata["render_transport"] != "binary_v1" { + t.Fatalf("direct candidate is missing binary render metadata: %#v", candidates[0].Metadata) + } + if candidates[0].Metadata["binary_render"] != true { + t.Fatalf("direct candidate is missing binary_render metadata: %#v", candidates[0].Metadata) + } + if candidates[0].Metadata["default_color_mode"] != "full_color" { + t.Fatalf("direct candidate is missing default_color_mode metadata: %#v", candidates[0].Metadata) + } + if candidates[0].Metadata["production_trusted"] != true || candidates[0].Metadata["tls_trust_mode"] != "platform_ca" { + t.Fatalf("direct candidate is missing production trust metadata: %#v", candidates[0].Metadata) + } + if candidates[0].Metadata["tls_ca_ref"] != "rap-platform-ca:v1" { + t.Fatalf("direct candidate is missing tls_ca_ref metadata: %#v", candidates[0].Metadata) + } + modes, ok := candidates[0].Metadata["supported_color_modes"].([]string) + if !ok || !slices.Contains(modes, "full_color") || !slices.Contains(modes, "grayscale") { + t.Fatalf("direct candidate is missing supported_color_modes metadata: %#v", candidates[0].Metadata) + } +} + +func 
TestDataPlaneDirectCandidateOmittedInProductionWhenSmokeOnly(t *testing.T) { + service := &Service{ + cfg: module.Config{ + App: config.AppConfig{Env: "production"}, + DataPlane: config.DataPlaneConfig{ + BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws", + DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane", + DirectWorkerJSONRuntime: true, + DirectWorkerTLSTrustMode: "smoke_insecure", + }, + }, + } + candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"}) + if len(candidates) != 1 { + t.Fatalf("expected fallback-only candidates in production with smoke TLS, got %d", len(candidates)) + } + if candidates[0].Type != sessioncontracts.DataPlaneCandidateBackendGateway { + t.Fatalf("production must not advertise smoke-only direct candidate: %#v", candidates) + } +} + +func TestDataPlaneDirectCandidateAdvertisedInProductionWhenTrusted(t *testing.T) { + service := &Service{ + cfg: module.Config{ + App: config.AppConfig{Env: "production"}, + DataPlane: config.DataPlaneConfig{ + BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws", + DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane", + DirectWorkerJSONRuntime: true, + DirectWorkerTLSTrustMode: "public_ca", + }, + }, + } + candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"}) + if len(candidates) != 2 { + t.Fatalf("expected trusted direct and fallback candidates, got %d", len(candidates)) + } + if candidates[0].Metadata["production_trusted"] != true || candidates[0].Metadata["tls_trust_mode"] != "public_ca" { + t.Fatalf("trusted production direct candidate metadata mismatch: %#v", candidates[0].Metadata) + } +} + +func TestDataPlaneCandidatesFallbackOnlyWhenDirectTemplateMissing(t *testing.T) { + service := &Service{ + cfg: module.Config{ + DataPlane: config.DataPlaneConfig{ + BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws", + }, + }, + } + candidates 
:= service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"}) + if len(candidates) != 1 { + t.Fatalf("expected fallback-only candidate list, got %d", len(candidates)) + } + if candidates[0].Type != sessioncontracts.DataPlaneCandidateBackendGateway { + t.Fatalf("unexpected candidate type: %s", candidates[0].Type) + } +} + +func TestDataPlaneAllowedChannelsRespectRuntimePolicy(t *testing.T) { + cases := []struct { + name string + policy map[string]any + expected []string + blocked []string + }{ + { + name: "disabled policies expose only control input render telemetry", + policy: map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "disabled"}, + expected: []string{sessioncontracts.DataPlaneChannelControl, sessioncontracts.DataPlaneChannelInput, sessioncontracts.DataPlaneChannelRender, sessioncontracts.DataPlaneChannelTelemetry}, + blocked: []string{sessioncontracts.DataPlaneChannelClipboard, sessioncontracts.DataPlaneChannelFileUpload}, + }, + { + name: "clipboard policy adds clipboard channel", + policy: map[string]any{"clipboard_mode": "server_to_client", "file_transfer_mode": "disabled"}, + expected: []string{sessioncontracts.DataPlaneChannelClipboard}, + blocked: []string{sessioncontracts.DataPlaneChannelFileUpload}, + }, + { + name: "client upload policy adds file upload channel", + policy: map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "client_to_server"}, + expected: []string{sessioncontracts.DataPlaneChannelFileUpload}, + blocked: []string{sessioncontracts.DataPlaneChannelClipboard}, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + session := RemoteSession{Metadata: mustJSON(t, map[string]any{"policy": tc.policy})} + channels := dataPlaneAllowedChannelsFromSession(session) + for _, channel := range tc.expected { + if !slices.Contains(channels, channel) { + t.Fatalf("expected channel %q in %v", channel, channels) + } + } + for _, channel := range tc.blocked { + if 
// mustJSON marshals value with encoding/json and returns the bytes. A marshal
// error aborts the calling test through t.Fatalf.
func mustJSON(t *testing.T, value any) []byte {
	t.Helper()
	encoded, marshalErr := json.Marshal(value)
	if marshalErr == nil {
		return encoded
	}
	t.Fatalf("marshal test metadata: %v", marshalErr)
	return nil // not reached: Fatalf stops the test goroutine
}

// testRS256Key generates a fresh 2048-bit RSA key pair. It returns the private
// key as a PKCS#1 PEM string ("RSA PRIVATE KEY" block) together with a pointer
// to the matching public key. Generation failure aborts the calling test.
func testRS256Key(t *testing.T) (string, *rsa.PublicKey) {
	t.Helper()
	key, genErr := rsa.GenerateKey(rand.Reader, 2048)
	if genErr != nil {
		t.Fatalf("generate RSA key: %v", genErr)
	}
	block := pem.Block{
		Type:  "RSA PRIVATE KEY",
		Bytes: x509.MarshalPKCS1PrivateKey(key),
	}
	return string(pem.EncodeToMemory(&block)), &key.PublicKey
}

// assertEqual fails the calling test when got and want differ; name labels the
// compared value in the failure message.
func assertEqual(t *testing.T, got, want, name string) {
	t.Helper()
	if got == want {
		return
	}
	t.Fatalf("unexpected %s: got %q want %q", name, got, want)
}
// LiveStateStore is the storage contract for live session state: session
// snapshots, controller bindings, attach tokens, attachment heartbeats and
// worker routes. Several writers take a ttl; no implementation is visible in
// this file, so expiry behaviour should be confirmed against the concrete
// store.
type LiveStateStore interface {
	// Session snapshot: write, read by session ID, delete by session ID.
	UpsertSession(ctx context.Context, state LiveSessionState) error
	GetSession(ctx context.Context, sessionID string) (*LiveSessionState, error)
	DeleteSession(ctx context.Context, sessionID string) error

	// Controller binding of a session: write with a ttl, read, clear.
	BindController(ctx context.Context, binding sessioncontracts.ControllerBinding, ttl time.Duration) error
	GetControllerBinding(ctx context.Context, sessionID string) (*sessioncontracts.ControllerBinding, error)
	ClearControllerBinding(ctx context.Context, sessionID string) error

	// Attach tokens: claims are stored with a ttl and later looked up by token.
	// NOTE(review): the name ConsumeAttachToken suggests single-use semantics —
	// confirm in the implementation.
	StoreAttachToken(ctx context.Context, claims sessioncontracts.AttachTokenClaims, ttl time.Duration) error
	ConsumeAttachToken(ctx context.Context, token string) (*sessioncontracts.AttachTokenClaims, error)

	// Heartbeat for one attachment of a session, recorded with a ttl.
	TouchAttachmentHeartbeat(ctx context.Context, sessionID, attachmentID string, ttl time.Duration) error

	// Worker route of a session: write with a ttl, read, delete.
	UpdateWorkerRoute(ctx context.Context, route WorkerRoute, ttl time.Duration) error
	GetWorkerRoute(ctx context.Context, sessionID string) (*WorkerRoute, error)
	DeleteWorkerRoute(ctx context.Context, sessionID string) error
}
// WorkerRoute is the JSON-serialisable record written by
// LiveStateStore.UpdateWorkerRoute and returned by GetWorkerRoute. It ties a
// session ID to a worker ID, a lease ID and a control stream identifier.
// NOTE(review): the exact meaning of ControlStream (stream name, topic key, …)
// is not visible here — confirm against the store implementation.
type WorkerRoute struct {
	SessionID     string    `json:"session_id"`
	WorkerID      string    `json:"worker_id"`
	LeaseID       string    `json:"lease_id"`
	ControlStream string    `json:"control_stream"`
	UpdatedAt     time.Time `json:"updated_at"` // no omitempty: always present in the JSON
}
// RemoteSession is the in-memory form of a row in the remote_sessions table as
// read and written by the Postgres repository in this package.
type RemoteSession struct {
	ID             string
	OrganizationID string
	ResourceID     string
	Protocol       string
	State          sessioncontracts.State
	// WorkerID may be empty: the repository stores "" as NULL (NULLIF) and
	// reads NULL back as "" (COALESCE).
	WorkerID         string
	ControllerUserID string
	// DetachDeadlineAt and LastHeartbeatAt are nullable timestamps.
	DetachDeadlineAt *time.Time
	LastHeartbeatAt  *time.Time
	TakeoverVersion  int
	// RenderQualityProfile is not among the columns inserted or selected by the
	// repository queries in this package.
	// NOTE(review): presumably populated elsewhere (e.g. from Metadata) — confirm.
	RenderQualityProfile string
	// Metadata is a JSON document; the repository writes it to a jsonb column.
	Metadata  []byte
	CreatedAt time.Time
	UpdatedAt time.Time
}
// AuditEvent is one record destined for the audit_events table. EventType is
// expected to be one of the AuditEvent* constants declared in this package.
type AuditEvent struct {
	ID string
	// ActorUserID, ActorDeviceID and RemoteSessionID are optional. The Postgres
	// repository passes them through stringValue and NULLIF(..., ''), so an
	// empty value is stored as NULL.
	// NOTE(review): assumes stringValue maps nil to "" — confirm.
	ActorUserID     *string
	ActorDeviceID   *string
	EventType       string
	TargetType      string
	TargetID        string
	RemoteSessionID *string
	// Payload is a JSON document; the repository writes it to a jsonb column.
	Payload   []byte
	CreatedAt time.Time
}
r.Post("/{sessionID}/takeover", m.takeoverSession) + r.Post("/{sessionID}/terminate", m.terminateSession) + r.Post("/{sessionID}/fail", m.markFailed) + }) +} + +func (m *Module) listSessions(w http.ResponseWriter, r *http.Request) { + userID := r.URL.Query().Get("user_id") + if userID == "" { + httpx.WriteError(w, http.StatusBadRequest, "user_id is required") + return + } + sessions, err := m.service.ListSessions(r.Context(), userID) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"sessions": sessions}) +} + +func (m *Module) startSession(w http.ResponseWriter, r *http.Request) { + var cmd StartRemoteSessionCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid start session payload") + return + } + result, err := m.service.StartRemoteSession(r.Context(), cmd) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusCreated, result) +} + +func (m *Module) attachSession(w http.ResponseWriter, r *http.Request) { + var cmd AttachToSessionCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid attach session payload") + return + } + cmd.SessionID = chi.URLParam(r, "sessionID") + result, err := m.service.AttachToSession(r.Context(), cmd) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusOK, result) +} + +func (m *Module) detachSession(w http.ResponseWriter, r *http.Request) { + var cmd DetachFromSessionCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid detach session payload") + return + } + cmd.SessionID = chi.URLParam(r, "sessionID") + result, err := 
m.service.DetachFromSession(r.Context(), cmd) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusAccepted, result) +} + +func (m *Module) takeoverSession(w http.ResponseWriter, r *http.Request) { + var cmd TakeoverSessionCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid takeover session payload") + return + } + cmd.SessionID = chi.URLParam(r, "sessionID") + result, err := m.service.TakeoverSession(r.Context(), cmd) + if err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusOK, result) +} + +func (m *Module) terminateSession(w http.ResponseWriter, r *http.Request) { + var cmd TerminateSessionCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid terminate session payload") + return + } + cmd.SessionID = chi.URLParam(r, "sessionID") + if err := m.service.TerminateSession(r.Context(), cmd); err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{ + "status": "terminated", + "message": httpx.NewMessage( + "session.terminated", + "status.session.terminated", + "Session terminated.", + nil, + "", + ), + }) +} + +func (m *Module) markFailed(w http.ResponseWriter, r *http.Request) { + var cmd MarkSessionFailedCommand + if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil { + httpx.WriteError(w, http.StatusBadRequest, "invalid fail session payload") + return + } + cmd.SessionID = chi.URLParam(r, "sessionID") + if err := m.service.MarkSessionFailed(r.Context(), cmd); err != nil { + status, message := m.service.MapError(err) + httpx.WriteError(w, status, message) + return + } + httpx.WriteJSON(w, http.StatusAccepted, map[string]any{ + 
"status": "failed", + "message": httpx.NewMessage( + "session.failed", + "status.session.failed", + "Session marked as failed.", + nil, + "", + ), + }) +} diff --git a/backend/internal/modules/sessionbroker/orchestration.go b/backend/internal/modules/sessionbroker/orchestration.go new file mode 100644 index 0000000..d6b9532 --- /dev/null +++ b/backend/internal/modules/sessionbroker/orchestration.go @@ -0,0 +1,17 @@ +package sessionbroker + +import ( + "context" + + workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker" +) + +type WorkerOrchestrator interface { + Reserve(ctx context.Context, request workercontracts.AttachRequest) (*workercontracts.WorkerLease, error) + GetSessionLease(ctx context.Context, sessionID string) (*workercontracts.WorkerLease, error) + ReleaseSessionLease(ctx context.Context, sessionID string) error + PrepareAttachment(ctx context.Context, session RemoteSession, attachment SessionAttachment, runtimeMetadata map[string]any) error + NotifyDetachment(ctx context.Context, session RemoteSession, attachment SessionAttachment) error + TerminateRemoteSession(ctx context.Context, sessionID, attachmentID string) error + ValidateSessionRuntime(ctx context.Context, sessionID, workerID string) (bool, string, error) +} diff --git a/backend/internal/modules/sessionbroker/postgres_store.go b/backend/internal/modules/sessionbroker/postgres_store.go new file mode 100644 index 0000000..d9ae34e --- /dev/null +++ b/backend/internal/modules/sessionbroker/postgres_store.go @@ -0,0 +1,607 @@ +package sessionbroker + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "time" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/example/remote-access-platform/backend/internal/platform/authority" + postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres" +) + +type postgresStore struct { + db postgresplatform.DBTX + authority *authority.Verifier +} + +type 
PostgresTransactor struct { + pool *pgxpool.Pool + authority *authority.Verifier +} + +func NewPostgresStore(pool *pgxpool.Pool, verifiers ...*authority.Verifier) Store { + var authorityVerifier *authority.Verifier + if len(verifiers) > 0 { + authorityVerifier = verifiers[0] + } + return &postgresStore{db: pool, authority: authorityVerifier} +} + +func NewPostgresTransactor(pool *pgxpool.Pool, verifiers ...*authority.Verifier) *PostgresTransactor { + var authorityVerifier *authority.Verifier + if len(verifiers) > 0 { + authorityVerifier = verifiers[0] + } + return &PostgresTransactor{pool: pool, authority: authorityVerifier} +} + +func (t *PostgresTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error { + return postgresplatform.WithTransaction(ctx, t.pool, func(tx pgx.Tx) error { + return fn(&postgresStore{db: tx, authority: t.authority}) + }) +} + +func (s *postgresStore) RemoteSessions() RemoteSessionRepository { + return &postgresRemoteSessionRepository{db: s.db} +} + +func (s *postgresStore) SessionAttachments() SessionAttachmentRepository { + return &postgresSessionAttachmentRepository{db: s.db} +} + +func (s *postgresStore) ResourcePolicies() ResourcePolicyRepository { + return &postgresResourcePolicyRepository{db: s.db} +} + +func (s *postgresStore) ResourceRuntime() ResourceRuntimeRepository { + return &postgresResourceRuntimeRepository{db: s.db} +} + +func (s *postgresStore) AuditEvents() AuditEventRepository { + return &postgresAuditEventRepository{db: s.db} +} + +func (s *postgresStore) Access() AccessRepository { + return &postgresAccessRepository{db: s.db, authority: s.authority} +} + +type postgresRemoteSessionRepository struct { + db postgresplatform.DBTX +} + +type postgresSessionAttachmentRepository struct { + db postgresplatform.DBTX +} + +type postgresResourcePolicyRepository struct { + db postgresplatform.DBTX +} + +type postgresResourceRuntimeRepository struct { + db postgresplatform.DBTX +} + +type 
postgresAuditEventRepository struct { + db postgresplatform.DBTX +} + +type postgresAccessRepository struct { + db postgresplatform.DBTX + authority *authority.Verifier +} + +func (r *postgresRemoteSessionRepository) Create(ctx context.Context, session RemoteSession) error { + const query = ` +INSERT INTO remote_sessions ( + id, organization_id, resource_id, protocol, state, worker_id, controller_user_id, detach_deadline_at, + last_heartbeat_at, takeover_version, metadata, created_at, updated_at +) VALUES ( + $1::uuid, $2::uuid, $3::uuid, $4, $5, NULLIF($6, ''), $7::uuid, $8, $9, $10, $11::jsonb, $12, $13 +) +` + if _, err := r.db.Exec(ctx, query, + session.ID, + session.OrganizationID, + session.ResourceID, + session.Protocol, + session.State, + session.WorkerID, + session.ControllerUserID, + session.DetachDeadlineAt, + session.LastHeartbeatAt, + session.TakeoverVersion, + jsonPayload(session.Metadata), + session.CreatedAt, + session.UpdatedAt, + ); err != nil { + return fmt.Errorf("create remote session: %w", err) + } + return nil +} + +func (r *postgresRemoteSessionRepository) GetByID(ctx context.Context, sessionID string) (*RemoteSession, error) { + return r.getByID(ctx, sessionID, "") +} + +func (r *postgresRemoteSessionRepository) GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error) { + return r.getByID(ctx, sessionID, " FOR UPDATE") +} + +func (r *postgresRemoteSessionRepository) getByID(ctx context.Context, sessionID string, suffix string) (*RemoteSession, error) { + query := ` +SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text, + detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at +FROM remote_sessions +WHERE id = $1::uuid` + suffix + remoteSession, err := scanRemoteSession(r.db.QueryRow(ctx, query, sessionID)) + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return remoteSession, err +} + +func (r 
*postgresRemoteSessionRepository) ListByController(ctx context.Context, userID string) ([]RemoteSession, error) { + const query = ` +SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text, + detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at +FROM remote_sessions +WHERE controller_user_id = $1::uuid +ORDER BY updated_at DESC +` + rows, err := r.db.Query(ctx, query, userID) + if err != nil { + return nil, fmt.Errorf("list remote sessions: %w", err) + } + defer rows.Close() + + var sessions []RemoteSession + for rows.Next() { + item, err := scanRemoteSession(rows) + if err != nil { + return nil, err + } + sessions = append(sessions, *item) + } + return sessions, rows.Err() +} + +func (r *postgresRemoteSessionRepository) CountLiveByResource(ctx context.Context, resourceID string) (int, error) { + const query = ` +SELECT COUNT(*) +FROM remote_sessions +WHERE resource_id = $1::uuid AND state IN ('starting', 'active', 'detached', 'reconnecting') +` + var count int + if err := r.db.QueryRow(ctx, query, resourceID).Scan(&count); err != nil { + return 0, fmt.Errorf("count live remote sessions: %w", err) + } + return count, nil +} + +func (r *postgresRemoteSessionRepository) ListDetachedExpired(ctx context.Context, before time.Time, limit int) ([]RemoteSession, error) { + const query = ` +SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text, + detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at +FROM remote_sessions +WHERE state = 'detached' AND detach_deadline_at IS NOT NULL AND detach_deadline_at <= $1 +ORDER BY detach_deadline_at ASC +LIMIT $2 +` + rows, err := r.db.Query(ctx, query, before, limit) + if err != nil { + return nil, fmt.Errorf("list detached expired sessions: %w", err) + } + defer rows.Close() + + var sessions []RemoteSession + for 
rows.Next() { + item, err := scanRemoteSession(rows) + if err != nil { + return nil, err + } + sessions = append(sessions, *item) + } + return sessions, rows.Err() +} + +func (r *postgresRemoteSessionRepository) UpdateState(ctx context.Context, params UpdateRemoteSessionStateParams) error { + const query = ` +UPDATE remote_sessions +SET state = $2, + worker_id = NULLIF($3, ''), + detach_deadline_at = $4, + last_heartbeat_at = $5, + takeover_version = $6, + updated_at = $7 +WHERE id = $1::uuid +` + if _, err := r.db.Exec(ctx, query, + params.RemoteSessionID, + params.State, + params.WorkerID, + params.DetachDeadlineAt, + params.LastHeartbeatAt, + params.TakeoverVersion, + params.UpdatedAt, + ); err != nil { + return fmt.Errorf("update remote session state: %w", err) + } + return nil +} + +func (r *postgresSessionAttachmentRepository) Create(ctx context.Context, attachment SessionAttachment) error { + const query = ` +INSERT INTO session_attachments ( + id, remote_session_id, user_id, device_id, role, state, superseded_by, + takeover_of, attached_at, detached_at, last_input_at, metadata, created_at, updated_at +) VALUES ( + $1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6, NULLIF($7, '')::uuid, + NULLIF($8, '')::uuid, $9, $10, $11, $12::jsonb, $13, $14 +) +` + if _, err := r.db.Exec(ctx, query, + attachment.ID, + attachment.RemoteSessionID, + attachment.UserID, + attachment.DeviceID, + attachment.Role, + attachment.State, + stringValue(attachment.SupersededBy), + stringValue(attachment.TakeoverOf), + attachment.AttachedAt, + attachment.DetachedAt, + attachment.LastInputAt, + jsonPayload(attachment.Metadata), + attachment.CreatedAt, + attachment.UpdatedAt, + ); err != nil { + return fmt.Errorf("create session attachment: %w", err) + } + return nil +} + +func (r *postgresSessionAttachmentRepository) GetByID(ctx context.Context, attachmentID string) (*SessionAttachment, error) { + return r.getByID(ctx, attachmentID, "") +} + +func (r *postgresSessionAttachmentRepository) 
GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error) { + return r.getByID(ctx, attachmentID, " FOR UPDATE") +} + +func (r *postgresSessionAttachmentRepository) getByID(ctx context.Context, attachmentID string, suffix string) (*SessionAttachment, error) { + query := ` +SELECT id::text, remote_session_id::text, user_id::text, device_id::text, role, state, + superseded_by::text, takeover_of::text, attached_at, detached_at, last_input_at, metadata, created_at, updated_at +FROM session_attachments +WHERE id = $1::uuid` + suffix + attachment, err := scanSessionAttachment(r.db.QueryRow(ctx, query, attachmentID)) + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return attachment, err +} + +func (r *postgresSessionAttachmentRepository) ListByRemoteSession(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) { + return r.listByRemoteSession(ctx, remoteSessionID, "") +} + +func (r *postgresSessionAttachmentRepository) ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) { + return r.listByRemoteSession(ctx, remoteSessionID, " AND state IN ('attaching', 'active', 'reconnecting') FOR UPDATE") +} + +func (r *postgresSessionAttachmentRepository) listByRemoteSession(ctx context.Context, remoteSessionID string, suffix string) ([]SessionAttachment, error) { + query := ` +SELECT id::text, remote_session_id::text, user_id::text, device_id::text, role, state, + superseded_by::text, takeover_of::text, attached_at, detached_at, last_input_at, metadata, created_at, updated_at +FROM session_attachments +WHERE remote_session_id = $1::uuid` + suffix + rows, err := r.db.Query(ctx, query, remoteSessionID) + if err != nil { + return nil, fmt.Errorf("list session attachments: %w", err) + } + defer rows.Close() + + var attachments []SessionAttachment + for rows.Next() { + item, err := scanSessionAttachment(rows) + if err != nil { + return nil, err + } + attachments = 
append(attachments, *item) + } + return attachments, rows.Err() +} + +func (r *postgresSessionAttachmentRepository) UpdateState(ctx context.Context, params UpdateSessionAttachmentStateParams) error { + const query = ` +UPDATE session_attachments +SET state = $2, + detached_at = $3, + last_input_at = $4, + updated_at = $5 +WHERE id = $1::uuid +` + if _, err := r.db.Exec(ctx, query, + params.AttachmentID, + params.State, + params.DetachedAt, + params.LastInputAt, + params.UpdatedAt, + ); err != nil { + return fmt.Errorf("update session attachment state: %w", err) + } + return nil +} + +func (r *postgresSessionAttachmentRepository) Supersede(ctx context.Context, params SupersedeAttachmentParams) error { + const query = ` +UPDATE session_attachments +SET state = 'superseded', + superseded_by = $2::uuid, + detached_at = $3, + updated_at = $4 +WHERE id = $1::uuid +` + if _, err := r.db.Exec(ctx, query, + params.PreviousAttachmentID, + params.NextAttachmentID, + params.DetachedAt, + params.UpdatedAt, + ); err != nil { + return fmt.Errorf("supersede attachment: %w", err) + } + return nil +} + +func (r *postgresResourcePolicyRepository) GetByResourceID(ctx context.Context, resourceID string) (*ResourcePolicy, error) { + const query = ` +SELECT resource_id::text, max_concurrent_sessions, takeover_policy, require_trusted_device, + detach_grace_period_seconds, clipboard_enabled, clipboard_mode, file_transfer_enabled, + COALESCE(file_transfer_mode, 'disabled'), created_at, updated_at +FROM resource_policies +WHERE resource_id = $1::uuid +` + policy, err := scanResourcePolicy(r.db.QueryRow(ctx, query, resourceID)) + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return policy, err +} + +func (r *postgresResourcePolicyRepository) Upsert(ctx context.Context, policy ResourcePolicy) error { + const query = ` +INSERT INTO resource_policies ( + resource_id, max_concurrent_sessions, takeover_policy, require_trusted_device, + detach_grace_period_seconds, clipboard_enabled, 
clipboard_mode, file_transfer_enabled, file_transfer_mode, created_at, updated_at +) VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11) +ON CONFLICT (resource_id) DO UPDATE SET + max_concurrent_sessions = EXCLUDED.max_concurrent_sessions, + takeover_policy = EXCLUDED.takeover_policy, + require_trusted_device = EXCLUDED.require_trusted_device, + detach_grace_period_seconds = EXCLUDED.detach_grace_period_seconds, + clipboard_enabled = EXCLUDED.clipboard_enabled, + clipboard_mode = EXCLUDED.clipboard_mode, + file_transfer_enabled = EXCLUDED.file_transfer_enabled, + file_transfer_mode = EXCLUDED.file_transfer_mode, + updated_at = EXCLUDED.updated_at +` + clipboardMode := normalizeClipboardMode(policy.ClipboardMode) + fileTransferMode := normalizeFileTransferMode(policy.FileTransferMode) + if _, err := r.db.Exec(ctx, query, + policy.ResourceID, + policy.MaxConcurrentSessions, + policy.TakeoverPolicy, + policy.RequireTrustedDevice, + int(policy.DetachGracePeriod.Seconds()), + clipboardMode != ResourceClipboardModeDisabled, + clipboardMode, + fileTransferAllowsClientToServer(fileTransferMode), + fileTransferMode, + policy.CreatedAt, + policy.UpdatedAt, + ); err != nil { + return fmt.Errorf("upsert resource policy: %w", err) + } + return nil +} + +func (r *postgresResourceRuntimeRepository) GetByID(ctx context.Context, resourceID string) (*ResourceRuntimeSpec, error) { + const query = ` +SELECT id::text, organization_id::text, name, address, protocol, secret_ref, certificate_verification_mode, metadata +FROM resources +WHERE id = $1::uuid +` + item := &ResourceRuntimeSpec{} + var secretRef *string + var metadata []byte + if err := r.db.QueryRow(ctx, query, resourceID).Scan( + &item.ID, + &item.OrganizationID, + &item.Name, + &item.Address, + &item.Protocol, + &secretRef, + &item.CertificateVerificationMode, + &metadata, + ); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return nil, nil + } + return nil, fmt.Errorf("get resource runtime spec: %w", err) + } + 
item.SecretRef = secretRef + item.Metadata = metadata + return item, nil +} + +func (r *postgresAuditEventRepository) Create(ctx context.Context, event AuditEvent) error { + const query = ` +INSERT INTO audit_events ( + id, actor_user_id, actor_device_id, event_type, target_type, target_id, + remote_session_id, payload, created_at +) VALUES ( + $1::uuid, NULLIF($2, '')::uuid, NULLIF($3, '')::uuid, $4, $5, $6, + NULLIF($7, '')::uuid, $8::jsonb, $9 +) +` + if _, err := r.db.Exec(ctx, query, + event.ID, + stringValue(event.ActorUserID), + stringValue(event.ActorDeviceID), + event.EventType, + event.TargetType, + event.TargetID, + stringValue(event.RemoteSessionID), + jsonPayload(event.Payload), + event.CreatedAt, + ); err != nil { + return fmt.Errorf("create audit event: %w", err) + } + return nil +} + +func (r *postgresAccessRepository) IsTrustedDevice(ctx context.Context, userID, deviceID string) (bool, error) { + const query = ` +SELECT EXISTS( + SELECT 1 FROM devices + WHERE id = $1::uuid AND user_id = $2::uuid AND trust_status = 'trusted' AND revoked_at IS NULL +) +` + var trusted bool + if err := r.db.QueryRow(ctx, query, deviceID, userID).Scan(&trusted); err != nil { + return false, fmt.Errorf("check trusted device: %w", err) + } + return trusted, nil +} + +func (r *postgresAccessRepository) GetPlatformRole(ctx context.Context, userID string) (string, error) { + return authority.EffectivePlatformRole(ctx, r.db, r.authority, userID) +} + +func (r *postgresAccessRepository) GetOrganizationRole(ctx context.Context, organizationID, userID string) (string, bool, error) { + const query = ` +SELECT role_id +FROM organization_memberships +WHERE organization_id = $1::uuid AND user_id = $2::uuid AND status = 'active' +` + var role string + if err := r.db.QueryRow(ctx, query, organizationID, userID).Scan(&role); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return "", false, nil + } + return "", false, fmt.Errorf("get organization role: %w", err) + } + return role, 
true, nil +} + +type scanner interface { + Scan(dest ...any) error +} + +func scanRemoteSession(row scanner) (*RemoteSession, error) { + item := &RemoteSession{} + var detachDeadlineAt, lastHeartbeatAt *time.Time + var metadata []byte + if err := row.Scan( + &item.ID, + &item.OrganizationID, + &item.ResourceID, + &item.Protocol, + &item.State, + &item.WorkerID, + &item.ControllerUserID, + &detachDeadlineAt, + &lastHeartbeatAt, + &item.TakeoverVersion, + &metadata, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return nil, fmt.Errorf("scan remote session: %w", err) + } + item.DetachDeadlineAt = detachDeadlineAt + item.LastHeartbeatAt = lastHeartbeatAt + item.Metadata = metadata + item.RenderQualityProfile = renderQualityProfileFromSessionMetadata(metadata) + return item, nil +} + +func scanSessionAttachment(row scanner) (*SessionAttachment, error) { + item := &SessionAttachment{} + var supersededBy, takeoverOf *string + var attachedAt, detachedAt, lastInputAt *time.Time + var metadata []byte + if err := row.Scan( + &item.ID, + &item.RemoteSessionID, + &item.UserID, + &item.DeviceID, + &item.Role, + &item.State, + &supersededBy, + &takeoverOf, + &attachedAt, + &detachedAt, + &lastInputAt, + &metadata, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return nil, fmt.Errorf("scan session attachment: %w", err) + } + item.SupersededBy = supersededBy + item.TakeoverOf = takeoverOf + item.AttachedAt = attachedAt + item.DetachedAt = detachedAt + item.LastInputAt = lastInputAt + item.Metadata = metadata + return item, nil +} + +func scanResourcePolicy(row scanner) (*ResourcePolicy, error) { + item := &ResourcePolicy{} + var detachGraceSeconds int + if err := row.Scan( + &item.ResourceID, + &item.MaxConcurrentSessions, + &item.TakeoverPolicy, + &item.RequireTrustedDevice, + &detachGraceSeconds, + &item.ClipboardEnabled, + &item.ClipboardMode, + &item.FileTransferEnabled, + &item.FileTransferMode, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + 
return nil, fmt.Errorf("scan resource policy: %w", err) + } + item.DetachGracePeriod = time.Duration(detachGraceSeconds) * time.Second + item.ClipboardMode = normalizeClipboardMode(item.ClipboardMode) + item.ClipboardEnabled = item.ClipboardMode != ResourceClipboardModeDisabled + item.FileTransferMode = normalizeFileTransferMode(item.FileTransferMode) + item.FileTransferEnabled = fileTransferAllowsClientToServer(item.FileTransferMode) + return item, nil +} + +func jsonPayload(payload []byte) []byte { + if len(payload) == 0 { + return []byte(`{}`) + } + if json.Valid(payload) { + return payload + } + return []byte(`{}`) +} + +func stringValue(value *string) string { + if value == nil { + return "" + } + return *value +} diff --git a/backend/internal/modules/sessionbroker/redis_livestate.go b/backend/internal/modules/sessionbroker/redis_livestate.go new file mode 100644 index 0000000..c698fed --- /dev/null +++ b/backend/internal/modules/sessionbroker/redis_livestate.go @@ -0,0 +1,140 @@ +package sessionbroker + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/redis/go-redis/v9" + + sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session" +) + +type RedisLiveStateStore struct { + client *redis.Client +} + +func NewRedisLiveStateStore(client *redis.Client) *RedisLiveStateStore { + return &RedisLiveStateStore{client: client} +} + +func (s *RedisLiveStateStore) UpsertSession(ctx context.Context, state LiveSessionState) error { + return s.setJSON(ctx, liveSessionKey(state.SessionID), state, 0) +} + +func (s *RedisLiveStateStore) GetSession(ctx context.Context, sessionID string) (*LiveSessionState, error) { + var state LiveSessionState + ok, err := s.getJSON(ctx, liveSessionKey(sessionID), &state) + if err != nil || !ok { + return nil, err + } + return &state, nil +} + +func (s *RedisLiveStateStore) DeleteSession(ctx context.Context, sessionID string) error { + return s.client.Del(ctx, 
liveSessionKey(sessionID)).Err() +} + +func (s *RedisLiveStateStore) BindController(ctx context.Context, binding sessioncontracts.ControllerBinding, ttl time.Duration) error { + return s.setJSON(ctx, controllerBindingKey(binding.SessionID), binding, ttl) +} + +func (s *RedisLiveStateStore) GetControllerBinding(ctx context.Context, sessionID string) (*sessioncontracts.ControllerBinding, error) { + var binding sessioncontracts.ControllerBinding + ok, err := s.getJSON(ctx, controllerBindingKey(sessionID), &binding) + if err != nil || !ok { + return nil, err + } + return &binding, nil +} + +func (s *RedisLiveStateStore) ClearControllerBinding(ctx context.Context, sessionID string) error { + return s.client.Del(ctx, controllerBindingKey(sessionID)).Err() +} + +func (s *RedisLiveStateStore) StoreAttachToken(ctx context.Context, claims sessioncontracts.AttachTokenClaims, ttl time.Duration) error { + return s.setJSON(ctx, attachTokenKey(claims.Token), claims, ttl) +} + +func (s *RedisLiveStateStore) ConsumeAttachToken(ctx context.Context, token string) (*sessioncontracts.AttachTokenClaims, error) { + key := attachTokenKey(token) + payload, err := s.client.GetDel(ctx, key).Result() + if err == redis.Nil { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("consume attach token: %w", err) + } + var claims sessioncontracts.AttachTokenClaims + if err := json.Unmarshal([]byte(payload), &claims); err != nil { + return nil, fmt.Errorf("decode attach token: %w", err) + } + return &claims, nil +} + +func (s *RedisLiveStateStore) TouchAttachmentHeartbeat(ctx context.Context, sessionID, attachmentID string, ttl time.Duration) error { + return s.client.Set(ctx, attachmentHeartbeatKey(sessionID, attachmentID), time.Now().UTC().Format(time.RFC3339Nano), ttl).Err() +} + +func (s *RedisLiveStateStore) UpdateWorkerRoute(ctx context.Context, route WorkerRoute, ttl time.Duration) error { + return s.setJSON(ctx, workerRouteKey(route.SessionID), route, ttl) +} + +func (s 
*RedisLiveStateStore) GetWorkerRoute(ctx context.Context, sessionID string) (*WorkerRoute, error) { + var route WorkerRoute + ok, err := s.getJSON(ctx, workerRouteKey(sessionID), &route) + if err != nil || !ok { + return nil, err + } + return &route, nil +} + +func (s *RedisLiveStateStore) DeleteWorkerRoute(ctx context.Context, sessionID string) error { + return s.client.Del(ctx, workerRouteKey(sessionID)).Err() +} + +func (s *RedisLiveStateStore) setJSON(ctx context.Context, key string, value any, ttl time.Duration) error { + payload, err := json.Marshal(value) + if err != nil { + return fmt.Errorf("encode redis payload: %w", err) + } + if err := s.client.Set(ctx, key, payload, ttl).Err(); err != nil { + return fmt.Errorf("set redis key %s: %w", key, err) + } + return nil +} + +func (s *RedisLiveStateStore) getJSON(ctx context.Context, key string, dest any) (bool, error) { + payload, err := s.client.Get(ctx, key).Result() + if err == redis.Nil { + return false, nil + } + if err != nil { + return false, fmt.Errorf("get redis key %s: %w", key, err) + } + if err := json.Unmarshal([]byte(payload), dest); err != nil { + return false, fmt.Errorf("decode redis key %s: %w", key, err) + } + return true, nil +} + +func liveSessionKey(sessionID string) string { + return "live:session:" + sessionID +} + +func controllerBindingKey(sessionID string) string { + return "live:session:" + sessionID + ":controller" +} + +func attachTokenKey(token string) string { + return "live:attach:" + token +} + +func attachmentHeartbeatKey(sessionID, attachmentID string) string { + return "live:session:" + sessionID + ":attachment:" + attachmentID + ":heartbeat" +} + +func workerRouteKey(sessionID string) string { + return "live:session:" + sessionID + ":worker-route" +} diff --git a/backend/internal/modules/sessionbroker/repository.go b/backend/internal/modules/sessionbroker/repository.go new file mode 100644 index 0000000..d6b6433 --- /dev/null +++ 
b/backend/internal/modules/sessionbroker/repository.go @@ -0,0 +1,85 @@ +package sessionbroker + +import ( + "context" + "time" + + sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session" +) + +type RemoteSessionRepository interface { + Create(ctx context.Context, session RemoteSession) error + GetByID(ctx context.Context, sessionID string) (*RemoteSession, error) + GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error) + ListByController(ctx context.Context, userID string) ([]RemoteSession, error) + CountLiveByResource(ctx context.Context, resourceID string) (int, error) + ListDetachedExpired(ctx context.Context, before time.Time, limit int) ([]RemoteSession, error) + UpdateState(ctx context.Context, params UpdateRemoteSessionStateParams) error +} + +type SessionAttachmentRepository interface { + Create(ctx context.Context, attachment SessionAttachment) error + GetByID(ctx context.Context, attachmentID string) (*SessionAttachment, error) + GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error) + ListByRemoteSession(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) + ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) + UpdateState(ctx context.Context, params UpdateSessionAttachmentStateParams) error + Supersede(ctx context.Context, params SupersedeAttachmentParams) error +} + +type ResourcePolicyRepository interface { + GetByResourceID(ctx context.Context, resourceID string) (*ResourcePolicy, error) + Upsert(ctx context.Context, policy ResourcePolicy) error +} + +type AuditEventRepository interface { + Create(ctx context.Context, event AuditEvent) error +} + +type Store interface { + RemoteSessions() RemoteSessionRepository + SessionAttachments() SessionAttachmentRepository + ResourcePolicies() ResourcePolicyRepository + ResourceRuntime() ResourceRuntimeRepository + AuditEvents() 
AuditEventRepository + Access() AccessRepository +} + +type Transactor interface { + WithinTransaction(ctx context.Context, fn func(store Store) error) error +} + +type UpdateRemoteSessionStateParams struct { + RemoteSessionID string + State sessioncontracts.State + WorkerID string + DetachDeadlineAt *time.Time + LastHeartbeatAt *time.Time + TakeoverVersion int + UpdatedAt time.Time +} + +type UpdateSessionAttachmentStateParams struct { + AttachmentID string + State AttachmentState + DetachedAt *time.Time + LastInputAt *time.Time + UpdatedAt time.Time +} + +type SupersedeAttachmentParams struct { + PreviousAttachmentID string + NextAttachmentID string + DetachedAt time.Time + UpdatedAt time.Time +} + +type AccessRepository interface { + IsTrustedDevice(ctx context.Context, userID, deviceID string) (bool, error) + GetPlatformRole(ctx context.Context, userID string) (string, error) + GetOrganizationRole(ctx context.Context, organizationID, userID string) (string, bool, error) +} + +type ResourceRuntimeRepository interface { + GetByID(ctx context.Context, resourceID string) (*ResourceRuntimeSpec, error) +} diff --git a/backend/internal/modules/sessionbroker/secret_assignment_test.go b/backend/internal/modules/sessionbroker/secret_assignment_test.go new file mode 100644 index 0000000..fef709f --- /dev/null +++ b/backend/internal/modules/sessionbroker/secret_assignment_test.go @@ -0,0 +1,138 @@ +package sessionbroker + +import ( + "context" + "encoding/json" + "errors" + "testing" + + "github.com/example/remote-access-platform/backend/internal/platform/config" + "github.com/example/remote-access-platform/backend/internal/platform/module" + "github.com/example/remote-access-platform/backend/internal/platform/secrets" + workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker" +) + +type fakeSecretResolver struct { + response *secrets.ResolvedResourceSecret + err error + request secrets.ResolveResourceSecretRequest +} + +func 
testAppConfig(env string) config.AppConfig { + return config.AppConfig{Name: "rap-api-test", Env: env} +} + +func (r *fakeSecretResolver) ResolveForSession(_ context.Context, req secrets.ResolveResourceSecretRequest) (*secrets.ResolvedResourceSecret, error) { + r.request = req + if r.err != nil { + return nil, r.err + } + return r.response, nil +} + +func TestRuntimeAssignmentMetadataMergesResolvedSecretWithoutMutatingSessionMetadata(t *testing.T) { + resolver := &fakeSecretResolver{ + response: &secrets.ResolvedResourceSecret{ + Descriptor: secrets.ResourceSecretDescriptor{Version: 3}, + Payload: json.RawMessage(`{"username":"user","password":"secret","domain":"corp"}`), + }, + } + service := NewService(module.Dependencies{ + Config: module.Config{App: testAppConfig("production")}, + }, nil, nil, nil, nil, resolver) + sessionMetadata := mustJSON(t, map[string]any{ + "resource": map[string]any{ + "id": "resource-1", + "organization_id": "org-1", + "secret_ref": "rap-secret://org/org-1/resources/resource-1/primary", + "metadata": map[string]any{ + "rdp_host": "host", + }, + }, + }) + session := RemoteSession{ + ID: "session-1", + OrganizationID: "org-1", + ResourceID: "resource-1", + WorkerID: "worker-1", + Metadata: sessionMetadata, + } + metadata, secretRef, version, err := service.runtimeAssignmentMetadata(context.Background(), session, &workercontracts.WorkerLease{LeaseID: "lease-1"}) + if err != nil { + t.Fatalf("runtimeAssignmentMetadata returned error: %v", err) + } + if secretRef == "" || version != 3 { + t.Fatalf("expected secret ref and version, got ref=%q version=%d", secretRef, version) + } + resource := metadata["resource"].(map[string]any) + resourceMetadata := resource["metadata"].(map[string]any) + if resourceMetadata["username"] != "user" || resourceMetadata["password"] != "secret" || resourceMetadata["domain"] != "corp" { + t.Fatalf("resolved secret was not merged: %#v", resourceMetadata) + } + var persisted map[string]any + if err := 
json.Unmarshal(session.Metadata, &persisted); err != nil { + t.Fatalf("decode persisted metadata: %v", err) + } + persistedResource := persisted["resource"].(map[string]any) + persistedMetadata := persistedResource["metadata"].(map[string]any) + if _, ok := persistedMetadata["password"]; ok { + t.Fatalf("session metadata was mutated with plaintext secret") + } + if resolver.request.LeaseID != "lease-1" || resolver.request.WorkerID != "worker-1" { + t.Fatalf("resolver request missed lease/worker proof: %#v", resolver.request) + } +} + +func TestRuntimeAssignmentMetadataRequiresResolverInProduction(t *testing.T) { + service := NewService(module.Dependencies{ + Config: module.Config{App: testAppConfig("production")}, + }, nil, nil, nil, nil) + session := RemoteSession{ + ID: "session-1", + OrganizationID: "org-1", + ResourceID: "resource-1", + WorkerID: "worker-1", + Metadata: mustJSON(t, map[string]any{ + "resource": map[string]any{ + "secret_ref": "rap-secret://org/org-1/resources/resource-1/primary", + }, + }), + } + _, _, _, err := service.runtimeAssignmentMetadata(context.Background(), session, &workercontracts.WorkerLease{LeaseID: "lease-1"}) + if !errors.Is(err, secrets.ErrSecretEncryptionKeyMissing) { + t.Fatalf("expected missing resolver error, got %v", err) + } +} + +func TestRuntimeAssignmentMetadataAllowsDevelopmentMetadataWithoutResolver(t *testing.T) { + service := NewService(module.Dependencies{ + Config: module.Config{App: testAppConfig("development")}, + }, nil, nil, nil, nil) + session := RemoteSession{ + ID: "session-1", + OrganizationID: "org-1", + ResourceID: "resource-1", + WorkerID: "worker-1", + Metadata: mustJSON(t, map[string]any{ + "resource": map[string]any{ + "secret_ref": "rap-secret://org/org-1/resources/resource-1/primary", + "metadata": map[string]any{ + "username": "dev-user", + "password": "dev-password", + }, + }, + }), + } + metadata, secretRef, _, err := service.runtimeAssignmentMetadata(context.Background(), session, nil) + if 
err != nil { + t.Fatalf("development metadata should not require resolver: %v", err) + } + if secretRef != "" { + t.Fatalf("development fallback should not audit resolver use, got %q", secretRef) + } + resource := metadata["resource"].(map[string]any) + resourceMetadata := resource["metadata"].(map[string]any) + if resourceMetadata["password"] != "dev-password" { + t.Fatalf("development metadata was not preserved") + } +} diff --git a/backend/internal/modules/sessionbroker/service.go b/backend/internal/modules/sessionbroker/service.go new file mode 100644 index 0000000..8895346 --- /dev/null +++ b/backend/internal/modules/sessionbroker/service.go @@ -0,0 +1,1960 @@ +package sessionbroker + +import ( + "context" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "log/slog" + "time" + + "github.com/google/uuid" + + "github.com/example/remote-access-platform/backend/internal/platform/module" + "github.com/example/remote-access-platform/backend/internal/platform/secrets" + sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session" + workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker" +) + +type Service struct { + cfg module.Config + logger *slog.Logger + store Store + transactor Transactor + liveState LiveStateStore + orchestrator WorkerOrchestrator + secretResolver secrets.ResourceSecretResolver + now func() time.Time +} + +type StartRemoteSessionCommand struct { + ResourceID string `json:"resource_id"` + UserID string `json:"user_id"` + DeviceID string `json:"device_id"` +} + +type AttachToSessionCommand struct { + SessionID string `json:"session_id"` + UserID string `json:"user_id"` + DeviceID string `json:"device_id"` +} + +type DetachFromSessionCommand struct { + SessionID string `json:"session_id"` + AttachmentID string `json:"attachment_id"` + UserID string `json:"user_id"` + Reason string `json:"reason"` +} + +type TakeoverSessionCommand struct { + SessionID string `json:"session_id"` 
+ UserID string `json:"user_id"` + DeviceID string `json:"device_id"` + Reason string `json:"reason"` +} + +type TerminateSessionCommand struct { + SessionID string `json:"session_id"` + UserID string `json:"user_id"` + Reason string `json:"reason"` +} + +type MarkSessionFailedCommand struct { + SessionID string `json:"session_id"` + Reason string `json:"reason"` +} + +type SessionControlResult struct { + Session RemoteSession `json:"session"` + Attachment *SessionAttachment `json:"attachment,omitempty"` + AttachToken *sessioncontracts.AttachTokenClaims `json:"attach_token,omitempty"` + GatewayURL string `json:"gateway_url,omitempty"` + DataPlane *sessioncontracts.DataPlaneOffer `json:"data_plane,omitempty"` +} + +func NewService(deps module.Dependencies, store Store, transactor Transactor, liveState LiveStateStore, orchestrator WorkerOrchestrator, secretResolvers ...secrets.ResourceSecretResolver) *Service { + var secretResolver secrets.ResourceSecretResolver + if len(secretResolvers) > 0 { + secretResolver = secretResolvers[0] + } + return &Service{ + cfg: deps.Config, + logger: deps.Infra.Logger, + store: store, + transactor: transactor, + liveState: liveState, + orchestrator: orchestrator, + secretResolver: secretResolver, + now: time.Now, + } +} + +func (s *Service) StartRemoteSession(ctx context.Context, cmd StartRemoteSessionCommand) (*SessionControlResult, error) { + resource, err := s.store.ResourceRuntime().GetByID(ctx, cmd.ResourceID) + if err != nil { + return nil, err + } + if resource == nil { + return nil, ErrSessionNotFound + } + if err := s.ensureOrganizationMemberAccess(ctx, resource.OrganizationID, cmd.UserID); err != nil { + return nil, err + } + if err := secrets.ValidateResourceSecretReadiness(resource.Protocol, resource.SecretRef, resource.Metadata, s.cfg.App.Env); err != nil { + return nil, err + } + if reusable, err := s.findReusableSession(ctx, cmd.ResourceID, cmd.UserID); err != nil { + return nil, err + } else if reusable != nil { + 
return s.AttachToSession(ctx, AttachToSessionCommand{ + SessionID: reusable.ID, + UserID: cmd.UserID, + DeviceID: cmd.DeviceID, + }) + } + policy, err := s.loadPolicy(ctx, cmd.ResourceID) + if err != nil { + return nil, err + } + if err := s.ensureTrustedDevice(ctx, policy, cmd.UserID, cmd.DeviceID); err != nil { + return nil, err + } + if err := s.ensureSessionCapacity(ctx, policy, cmd.ResourceID); err != nil { + return nil, err + } + + resourceMetadata := decodeJSONMap(resource.Metadata) + resourceCertificateVerificationMode := resource.CertificateVerificationMode + if resourceCertificateVerificationMode == "" { + if mode, ok := resourceMetadata["certificate_verification_mode"].(string); ok && mode != "" { + resourceCertificateVerificationMode = mode + } else { + resourceCertificateVerificationMode = "strict" + } + } + renderQualityProfile := renderQualityProfileFromMetadata(resourceMetadata) + + sessionID := uuid.NewString() + lease, err := s.orchestrator.Reserve(ctx, workercontracts.AttachRequest{ + ResourceID: cmd.ResourceID, + ActorID: cmd.UserID, + SessionID: sessionID, + RequiredCapabilities: requiredCapabilities(policy), + RenderQualityProfile: renderQualityProfile, + }) + if err != nil { + return nil, err + } + + now := s.now().UTC() + attachmentID := uuid.NewString() + var result SessionControlResult + err = s.transactor.WithinTransaction(ctx, func(store Store) error { + sessionMetadata, err := json.Marshal(map[string]any{ + "resource": map[string]any{ + "id": resource.ID, + "organization_id": resource.OrganizationID, + "name": resource.Name, + "address": resource.Address, + "protocol": resource.Protocol, + "secret_ref": resource.SecretRef, + "certificate_verification_mode": resourceCertificateVerificationMode, + "render_quality_profile": renderQualityProfile, + "metadata": resourceMetadata, + }, + "policy": map[string]any{ + "detach_grace_period_seconds": int(policy.DetachGracePeriod.Seconds()), + "clipboard_enabled": policy.ClipboardMode != 
ResourceClipboardModeDisabled, + "clipboard_mode": policy.ClipboardMode, + "file_transfer_enabled": fileTransferAllowsClientToServer(policy.FileTransferMode), + "file_transfer_mode": policy.FileTransferMode, + }, + }) + if err != nil { + return err + } + session := RemoteSession{ + ID: sessionID, + OrganizationID: resource.OrganizationID, + ResourceID: cmd.ResourceID, + Protocol: string(lease.Protocol), + State: sessioncontracts.StateStarting, + WorkerID: lease.WorkerID, + ControllerUserID: cmd.UserID, + TakeoverVersion: 1, + RenderQualityProfile: renderQualityProfile, + Metadata: sessionMetadata, + LastHeartbeatAt: &now, + CreatedAt: now, + UpdatedAt: now, + } + attachment := SessionAttachment{ + ID: attachmentID, + RemoteSessionID: sessionID, + UserID: cmd.UserID, + DeviceID: cmd.DeviceID, + Role: AttachmentRoleController, + State: AttachmentStateAttaching, + AttachedAt: &now, + LastInputAt: &now, + Metadata: []byte(`{}`), + CreatedAt: now, + UpdatedAt: now, + } + if err := store.RemoteSessions().Create(ctx, session); err != nil { + return err + } + if err := store.SessionAttachments().Create(ctx, attachment); err != nil { + return err + } + if err := s.writeAuditEvent(ctx, store, AuditEventSessionStarted, cmd.UserID, cmd.DeviceID, session.ID, session.ID, map[string]any{ + "resource_id": cmd.ResourceID, + "worker_id": lease.WorkerID, + }); err != nil { + return err + } + if err := s.writeAuditEvent(ctx, store, AuditEventSessionAttached, cmd.UserID, cmd.DeviceID, attachment.ID, session.ID, map[string]any{ + "takeover": false, + }); err != nil { + return err + } + result.Session = session + result.Attachment = &attachment + return nil + }) + if err != nil { + _ = s.orchestrator.ReleaseSessionLease(ctx, sessionID) + return nil, err + } + + if err := s.prepareAttachment(ctx, result.Session, *result.Attachment, lease); err != nil { + return nil, err + } + attachToken, err := s.publishLiveState(ctx, result.Session, *result.Attachment, lease) + if err != nil { + return 
nil, err + } + result.AttachToken = attachToken + if err := s.attachDataPlaneOffer(&result); err != nil { + return nil, err + } + return &result, nil +} + +func (s *Service) AttachToSession(ctx context.Context, cmd AttachToSessionCommand) (*SessionControlResult, error) { + if _, err := s.ensureSessionRuntimeConsistencyByID(ctx, cmd.SessionID); err != nil { + return nil, err + } + now := s.now().UTC() + var result SessionControlResult + var existingAttachment *SessionAttachment + err := s.transactor.WithinTransaction(ctx, func(store Store) error { + session, err := store.RemoteSessions().GetByIDForUpdate(ctx, cmd.SessionID) + if err != nil { + return err + } + if session == nil { + return ErrSessionNotFound + } + if err := s.ensureOrganizationMemberAccessWithStore(ctx, store, session.OrganizationID, cmd.UserID); err != nil { + return err + } + policy, err := s.loadPolicyFromStore(ctx, store, session.ResourceID) + if err != nil { + return err + } + if err := s.ensureTrustedDeviceWithStore(ctx, store, policy, cmd.UserID, cmd.DeviceID); err != nil { + return err + } + activeAttachments, err := store.SessionAttachments().ListActiveByRemoteSessionForUpdate(ctx, cmd.SessionID) + if err != nil { + return err + } + var prior *SessionAttachment + for _, attachment := range activeAttachments { + if attachment.UserID != cmd.UserID || attachment.DeviceID != cmd.DeviceID { + return ErrActiveControllerPresent + } + if attachment.State == AttachmentStateActive && session.State == sessioncontracts.StateActive { + copy := attachment + existingAttachment = &copy + session.LastHeartbeatAt = &now + result.Session = *session + result.Attachment = existingAttachment + return nil + } + copy := attachment + prior = &copy + } + targetState := sessioncontracts.StateActive + if session.State == sessioncontracts.StateDetached { + targetState = sessioncontracts.StateReconnecting + } + if err := validateTransition(session.State, targetState); err != nil && session.State != sessioncontracts.StateActive { 
+ return err + } + if targetState == sessioncontracts.StateReconnecting { + if err := store.RemoteSessions().UpdateState(ctx, UpdateRemoteSessionStateParams{ + RemoteSessionID: session.ID, + State: sessioncontracts.StateReconnecting, + WorkerID: session.WorkerID, + TakeoverVersion: session.TakeoverVersion, + UpdatedAt: now, + }); err != nil { + return err + } + session.State = sessioncontracts.StateReconnecting + } + attachment := SessionAttachment{ + ID: uuid.NewString(), + RemoteSessionID: session.ID, + UserID: cmd.UserID, + DeviceID: cmd.DeviceID, + Role: AttachmentRoleController, + State: AttachmentStateActive, + AttachedAt: &now, + LastInputAt: &now, + CreatedAt: now, + UpdatedAt: now, + Metadata: []byte(`{}`), + } + if err := store.SessionAttachments().Create(ctx, attachment); err != nil { + return err + } + if prior != nil { + attachment.TakeoverOf = &prior.ID + if err := store.SessionAttachments().Supersede(ctx, SupersedeAttachmentParams{ + PreviousAttachmentID: prior.ID, + NextAttachmentID: attachment.ID, + DetachedAt: now, + UpdatedAt: now, + }); err != nil { + return err + } + } + if err := store.RemoteSessions().UpdateState(ctx, UpdateRemoteSessionStateParams{ + RemoteSessionID: session.ID, + State: sessioncontracts.StateActive, + WorkerID: session.WorkerID, + DetachDeadlineAt: nil, + LastHeartbeatAt: &now, + TakeoverVersion: session.TakeoverVersion, + UpdatedAt: now, + }); err != nil { + return err + } + if err := s.writeAuditEvent(ctx, store, AuditEventSessionAttached, cmd.UserID, cmd.DeviceID, attachment.ID, session.ID, map[string]any{ + "reconnect": prior != nil, + }); err != nil { + return err + } + session.State = sessioncontracts.StateActive + session.DetachDeadlineAt = nil + session.LastHeartbeatAt = &now + session.UpdatedAt = now + result.Session = *session + result.Attachment = &attachment + return nil + }) + if err != nil { + return nil, err + } + + lease := &workercontracts.WorkerLease{ + WorkerID: result.Session.WorkerID, + SessionID: 
result.Session.ID, + Protocol: workercontracts.Protocol(result.Session.Protocol), + ExpiresAt: now.Add(s.cfg.Worker.LeaseTTL), + ControlStream: "worker://control/" + result.Session.WorkerID, + RenderQualityProfile: result.Session.RenderQualityProfile, + } + if existingAttachment == nil { + actualLease, err := s.orchestrator.GetSessionLease(ctx, result.Session.ID) + if err != nil { + return nil, err + } + if actualLease != nil { + lease = actualLease + } + if err := s.prepareAttachment(ctx, result.Session, *result.Attachment, lease); err != nil { + return nil, err + } + } else { + lease = nil + } + currentLive, err := s.liveState.GetSession(ctx, result.Session.ID) + if err != nil { + return nil, err + } + attachToken, err := s.publishLiveState(ctx, result.Session, *result.Attachment, lease) + if err != nil { + return nil, err + } + if currentLive != nil { + live, liveErr := s.liveState.GetSession(ctx, result.Session.ID) + if liveErr != nil { + return nil, liveErr + } + if live != nil { + mergeLiveRenderTelemetry(live, currentLive) + if err := s.liveState.UpsertSession(ctx, *live); err != nil { + return nil, err + } + } + } + result.AttachToken = attachToken + if err := s.attachDataPlaneOffer(&result); err != nil { + return nil, err + } + return &result, nil +} + +func (s *Service) DetachFromSession(ctx context.Context, cmd DetachFromSessionCommand) (*SessionControlResult, error) { + now := s.now().UTC() + var result SessionControlResult + err := s.transactor.WithinTransaction(ctx, func(store Store) error { + session, err := store.RemoteSessions().GetByIDForUpdate(ctx, cmd.SessionID) + if err != nil { + return err + } + if session == nil { + return ErrSessionNotFound + } + if err := s.ensureSessionControllerAccessWithStore(ctx, store, session, cmd.UserID); err != nil { + return err + } + if session.State == sessioncontracts.StateDetached { + attachment, err := store.SessionAttachments().GetByID(ctx, cmd.AttachmentID) + if err != nil { + return err + } + if attachment 
== nil { + return ErrAttachmentNotFound + } + result.Session = *session + result.Attachment = attachment + return nil + } + if err := validateTransition(session.State, sessioncontracts.StateDetached); err != nil { + return ErrSessionNotAttachable + } + attachment, err := store.SessionAttachments().GetByIDForUpdate(ctx, cmd.AttachmentID) + if err != nil { + return err + } + if attachment == nil || attachment.RemoteSessionID != cmd.SessionID || attachment.UserID != cmd.UserID { + return ErrAttachmentNotFound + } + if attachment.State == AttachmentStateDetached { + result.Session = *session + result.Attachment = attachment + return nil + } + detachedAt := now + if err := store.SessionAttachments().UpdateState(ctx, UpdateSessionAttachmentStateParams{ + AttachmentID: attachment.ID, + State: AttachmentStateDetached, + DetachedAt: &detachedAt, + LastInputAt: attachment.LastInputAt, + UpdatedAt: now, + }); err != nil { + return err + } + deadline := now.Add(s.cfg.Session.DetachGracePeriod) + if err := store.RemoteSessions().UpdateState(ctx, UpdateRemoteSessionStateParams{ + RemoteSessionID: session.ID, + State: sessioncontracts.StateDetached, + WorkerID: session.WorkerID, + DetachDeadlineAt: &deadline, + LastHeartbeatAt: session.LastHeartbeatAt, + TakeoverVersion: session.TakeoverVersion, + UpdatedAt: now, + }); err != nil { + return err + } + if err := s.writeAuditEvent(ctx, store, AuditEventSessionDetached, cmd.UserID, attachment.DeviceID, attachment.ID, session.ID, map[string]any{ + "reason": cmd.Reason, + }); err != nil { + return err + } + attachment.State = AttachmentStateDetached + attachment.DetachedAt = &detachedAt + session.State = sessioncontracts.StateDetached + session.DetachDeadlineAt = &deadline + session.UpdatedAt = now + result.Session = *session + result.Attachment = attachment + return nil + }) + if err != nil { + return nil, err + } + + _ = s.orchestrator.NotifyDetachment(ctx, result.Session, *result.Attachment) + if err := 
s.liveState.ClearControllerBinding(ctx, result.Session.ID); err != nil { + return nil, err + } + currentLive, err := s.liveState.GetSession(ctx, result.Session.ID) + if err != nil { + return nil, err + } + detachedLive := LiveSessionState{ + SessionID: result.Session.ID, + ResourceID: result.Session.ResourceID, + WorkerID: result.Session.WorkerID, + State: result.Session.State, + ControllerID: result.Session.ControllerUserID, + AttachmentID: result.Attachment.ID, + TakeoverVersion: result.Session.TakeoverVersion, + UpdatedAt: now, + } + mergeLiveRenderTelemetry(&detachedLive, currentLive) + if err := s.liveState.UpsertSession(ctx, detachedLive); err != nil { + return nil, err + } + return &result, nil +} + +func (s *Service) TakeoverSession(ctx context.Context, cmd TakeoverSessionCommand) (*SessionControlResult, error) { + if _, err := s.ensureSessionRuntimeConsistencyByID(ctx, cmd.SessionID); err != nil { + return nil, err + } + now := s.now().UTC() + var result SessionControlResult + var existingAttachment *SessionAttachment + err := s.transactor.WithinTransaction(ctx, func(store Store) error { + session, err := store.RemoteSessions().GetByIDForUpdate(ctx, cmd.SessionID) + if err != nil { + return err + } + if session == nil { + return ErrSessionNotFound + } + if err := s.ensureOrganizationMemberAccessWithStore(ctx, store, session.OrganizationID, cmd.UserID); err != nil { + return err + } + policy, err := s.loadPolicyFromStore(ctx, store, session.ResourceID) + if err != nil { + return err + } + if err := s.ensureTakeoverAllowed(ctx, store, policy, session, cmd.UserID, cmd.DeviceID); err != nil { + return err + } + activeAttachments, err := store.SessionAttachments().ListActiveByRemoteSessionForUpdate(ctx, cmd.SessionID) + if err != nil { + return err + } + if len(activeAttachments) == 0 { + return ErrSessionNotAttachable + } + for _, attachment := range activeAttachments { + if attachment.UserID == cmd.UserID && attachment.DeviceID == cmd.DeviceID && 
attachment.State == AttachmentStateActive { + copy := attachment + existingAttachment = © + result.Session = *session + result.Attachment = existingAttachment + return nil + } + } + attachment := SessionAttachment{ + ID: uuid.NewString(), + RemoteSessionID: session.ID, + UserID: cmd.UserID, + DeviceID: cmd.DeviceID, + Role: AttachmentRoleController, + State: AttachmentStateActive, + AttachedAt: &now, + LastInputAt: &now, + CreatedAt: now, + UpdatedAt: now, + Metadata: []byte(`{}`), + } + if err := store.SessionAttachments().Create(ctx, attachment); err != nil { + return err + } + firstPrevious := activeAttachments[0].ID + attachment.TakeoverOf = &firstPrevious + for _, prior := range activeAttachments { + if err := store.SessionAttachments().Supersede(ctx, SupersedeAttachmentParams{ + PreviousAttachmentID: prior.ID, + NextAttachmentID: attachment.ID, + DetachedAt: now, + UpdatedAt: now, + }); err != nil { + return err + } + } + nextVersion := session.TakeoverVersion + 1 + if err := store.RemoteSessions().UpdateState(ctx, UpdateRemoteSessionStateParams{ + RemoteSessionID: session.ID, + State: sessioncontracts.StateActive, + WorkerID: session.WorkerID, + DetachDeadlineAt: nil, + LastHeartbeatAt: &now, + TakeoverVersion: nextVersion, + UpdatedAt: now, + }); err != nil { + return err + } + session.ControllerUserID = cmd.UserID + session.TakeoverVersion = nextVersion + session.State = sessioncontracts.StateActive + session.DetachDeadlineAt = nil + session.LastHeartbeatAt = &now + session.UpdatedAt = now + if err := s.writeAuditEvent(ctx, store, AuditEventSessionTakenOver, cmd.UserID, cmd.DeviceID, attachment.ID, session.ID, map[string]any{ + "reason": cmd.Reason, + }); err != nil { + return err + } + result.Session = *session + result.Attachment = &attachment + return nil + }) + if err != nil { + return nil, err + } + + lease := &workercontracts.WorkerLease{ + WorkerID: result.Session.WorkerID, + SessionID: result.Session.ID, + Protocol: 
workercontracts.Protocol(result.Session.Protocol), + ExpiresAt: now.Add(s.cfg.Worker.LeaseTTL), + ControlStream: "worker://control/" + result.Session.WorkerID, + } + if existingAttachment == nil { + actualLease, err := s.orchestrator.GetSessionLease(ctx, result.Session.ID) + if err != nil { + return nil, err + } + if actualLease != nil { + lease = actualLease + } + if err := s.prepareAttachment(ctx, result.Session, *result.Attachment, lease); err != nil { + return nil, err + } + } else { + lease = nil + } + currentLive, err := s.liveState.GetSession(ctx, result.Session.ID) + if err != nil { + return nil, err + } + attachToken, err := s.publishLiveState(ctx, result.Session, *result.Attachment, lease) + if err != nil { + return nil, err + } + if currentLive != nil { + live, liveErr := s.liveState.GetSession(ctx, result.Session.ID) + if liveErr != nil { + return nil, liveErr + } + if live != nil { + mergeLiveRenderTelemetry(live, currentLive) + if err := s.liveState.UpsertSession(ctx, *live); err != nil { + return nil, err + } + } + } + result.AttachToken = attachToken + if err := s.attachDataPlaneOffer(&result); err != nil { + return nil, err + } + return &result, nil +} + +func (s *Service) TerminateSession(ctx context.Context, cmd TerminateSessionCommand) error { + if err := s.finalizeSession(ctx, cmd.SessionID, cmd.UserID, "", AuditEventSessionTerminated, cmd.Reason, sessioncontracts.StateTerminated, true); err != nil { + return err + } + return s.orchestrator.ReleaseSessionLease(ctx, cmd.SessionID) +} + +func (s *Service) MarkSessionFailed(ctx context.Context, cmd MarkSessionFailedCommand) error { + if err := s.finalizeSession(ctx, cmd.SessionID, "", "", AuditEventSessionFailed, cmd.Reason, sessioncontracts.StateFailed, false); err != nil { + return err + } + return s.orchestrator.ReleaseSessionLease(ctx, cmd.SessionID) +} + +func (s *Service) RecoverDetachedSessions(ctx context.Context) error { + sessions, err := s.store.RemoteSessions().ListDetachedExpired(ctx, 
s.now().UTC(), s.cfg.Session.RecoveryBatchSize) + if err != nil { + return err + } + for _, session := range sessions { + if err := s.TerminateSession(ctx, TerminateSessionCommand{ + SessionID: session.ID, + Reason: "detach_grace_period_expired", + }); err != nil { + return err + } + } + return nil +} + +func (s *Service) ConsumeAttachToken(ctx context.Context, token string) (*sessioncontracts.AttachTokenClaims, *LiveSessionState, error) { + claims, err := s.liveState.ConsumeAttachToken(ctx, token) + if err != nil { + return nil, nil, err + } + if claims == nil || claims.ExpiresAt.Before(s.now().UTC()) { + return nil, nil, ErrAttachTokenInvalid + } + state, err := s.GetLiveSession(ctx, claims.SessionID) + if err != nil { + return nil, nil, err + } + if state == nil { + return nil, nil, ErrSessionNotFound + } + return claims, state, nil +} + +func (s *Service) GetControllerBinding(ctx context.Context, sessionID string) (*sessioncontracts.ControllerBinding, error) { + return s.liveState.GetControllerBinding(ctx, sessionID) +} + +func (s *Service) GetSessionSnapshot(ctx context.Context, sessionID string) (*RemoteSession, error) { + return s.ensureSessionRuntimeConsistencyByID(ctx, sessionID) +} + +func (s *Service) TouchAttachmentHeartbeat(ctx context.Context, sessionID, attachmentID string) error { + if err := s.liveState.TouchAttachmentHeartbeat(ctx, sessionID, attachmentID, s.cfg.Session.HeartbeatTTL); err != nil { + return err + } + binding, err := s.liveState.GetControllerBinding(ctx, sessionID) + if err != nil { + return err + } + if binding != nil && binding.AttachmentID == attachmentID { + if err := s.liveState.BindController(ctx, *binding, s.cfg.Session.LiveStateTTL); err != nil { + return err + } + } + route, err := s.liveState.GetWorkerRoute(ctx, sessionID) + if err != nil { + return err + } + if route != nil { + return s.liveState.UpdateWorkerRoute(ctx, *route, s.cfg.Session.LiveStateTTL) + } + return nil +} + +func (s *Service) GetLiveSession(ctx 
// finalizeSession moves a session into targetState and tears down its runtime
// footprint.
//
// Inside one transaction it locks the session row, optionally checks that
// actorUserID may administer or control the session (enforceActor), validates
// the state transition, closes every attachment that is not already closed or
// superseded, persists the new session state and writes an audit event of
// eventType carrying reason. After the commit it asks the orchestrator to
// terminate the remote session and removes the controller binding, worker
// route and live session record.
//
// Returns ErrSessionNotFound when the session does not exist and
// ErrSessionNotTerminable when the transition to targetState is not allowed.
func (s *Service) finalizeSession(ctx context.Context, sessionID, actorUserID, actorDeviceID, eventType, reason string, targetState sessioncontracts.State, enforceActor bool) error {
	now := s.now().UTC()
	// ID of the last attachment closed by this call; empty when none was open.
	lastAttachmentID := ""
	if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
		session, err := store.RemoteSessions().GetByIDForUpdate(ctx, sessionID)
		if err != nil {
			return err
		}
		if session == nil {
			return ErrSessionNotFound
		}
		if enforceActor {
			if err := s.ensureSessionAdminOrControllerAccessWithStore(ctx, store, session, actorUserID); err != nil {
				return err
			}
		}
		// Any transition error is reported as the single sentinel below.
		if err := validateTransition(session.State, targetState); err != nil {
			return ErrSessionNotTerminable
		}
		// Despite the variable name this lists all attachments of the session;
		// already-finished ones are skipped inside the loop.
		activeAttachments, err := store.SessionAttachments().ListByRemoteSession(ctx, sessionID)
		if err != nil {
			return err
		}
		for _, attachment := range activeAttachments {
			if attachment.State == AttachmentStateClosed || attachment.State == AttachmentStateSuperseded {
				continue
			}
			lastAttachmentID = attachment.ID
			detachedAt := now
			if err := store.SessionAttachments().UpdateState(ctx, UpdateSessionAttachmentStateParams{
				AttachmentID: attachment.ID,
				State: AttachmentStateClosed,
				DetachedAt: &detachedAt,
				LastInputAt: attachment.LastInputAt,
				UpdatedAt: now,
			}); err != nil {
				return err
			}
		}
		// DetachDeadlineAt and LastHeartbeatAt are left at their zero values in
		// the update parameters.
		if err := store.RemoteSessions().UpdateState(ctx, UpdateRemoteSessionStateParams{
			RemoteSessionID: session.ID,
			State: targetState,
			WorkerID: session.WorkerID,
			TakeoverVersion: session.TakeoverVersion,
			UpdatedAt: now,
		}); err != nil {
			return err
		}
		// The session ID is passed as both the audit target and the session reference.
		return s.writeAuditEvent(ctx, store, eventType, actorUserID, actorDeviceID, session.ID, session.ID, map[string]any{
			"reason": reason,
		})
	}); err != nil {
		return err
	}
	if lastAttachmentID != "" {
		// Best effort: the orchestrator's error is deliberately discarded.
		_ = s.orchestrator.TerminateRemoteSession(ctx, sessionID, lastAttachmentID)
	}
	// Live-state cleanup runs after the commit; a failure here is returned even
	// though the database state has already been finalized.
	if err := s.liveState.ClearControllerBinding(ctx, sessionID); err != nil {
		return err
	}
	if err := s.liveState.DeleteWorkerRoute(ctx, sessionID); err != nil {
		return err
	}
	return s.liveState.DeleteSession(ctx, sessionID)
}
store.SessionAttachments().ListByRemoteSession(ctx, sessionID) + if err != nil { + return err + } + for _, attachment := range attachments { + if attachment.State == AttachmentStateAttaching || attachment.State == AttachmentStateActive { + if err := store.SessionAttachments().UpdateState(ctx, UpdateSessionAttachmentStateParams{ + AttachmentID: attachment.ID, + State: AttachmentStateActive, + DetachedAt: attachment.DetachedAt, + LastInputAt: attachment.LastInputAt, + UpdatedAt: now, + }); err != nil { + return err + } + attachment.State = AttachmentStateActive + attachment.UpdatedAt = now + attachmentSnapshot = &attachment + break + } + } + if err := store.RemoteSessions().UpdateState(ctx, UpdateRemoteSessionStateParams{ + RemoteSessionID: session.ID, + State: sessioncontracts.StateActive, + WorkerID: session.WorkerID, + DetachDeadlineAt: nil, + LastHeartbeatAt: &now, + TakeoverVersion: session.TakeoverVersion, + UpdatedAt: now, + }); err != nil { + return err + } + session.State = sessioncontracts.StateActive + session.DetachDeadlineAt = nil + session.LastHeartbeatAt = &now + session.UpdatedAt = now + sessionSnapshot = session + return nil + }); err != nil { + return err + } + if staleTerminalState != "" { + s.logStaleWorkerEvent("connected", sessionID, staleTerminalState) + return nil + } + live, err := s.liveState.GetSession(ctx, sessionID) + if err != nil { + return err + } + if live != nil { + live.State = sessioncontracts.StateActive + live.UpdatedAt = now + return s.liveState.UpsertSession(ctx, *live) + } + if sessionSnapshot != nil && attachmentSnapshot != nil { + _, err = s.publishLiveState(ctx, *sessionSnapshot, *attachmentSnapshot, nil) + } + return err +} + +func (s *Service) HandleWorkerHeartbeat(ctx context.Context, sessionID string) error { + now := s.now().UTC() + var staleTerminalState sessioncontracts.State + if err := s.transactor.WithinTransaction(ctx, func(store Store) error { + session, err := store.RemoteSessions().GetByIDForUpdate(ctx, 
sessionID) + if err != nil { + return err + } + if session == nil { + return ErrSessionNotFound + } + if isTerminalSessionState(session.State) { + staleTerminalState = session.State + return nil + } + return store.RemoteSessions().UpdateState(ctx, UpdateRemoteSessionStateParams{ + RemoteSessionID: session.ID, + State: session.State, + WorkerID: session.WorkerID, + DetachDeadlineAt: session.DetachDeadlineAt, + LastHeartbeatAt: &now, + TakeoverVersion: session.TakeoverVersion, + UpdatedAt: now, + }) + }); err != nil { + return err + } + if staleTerminalState != "" { + s.logStaleWorkerEvent("heartbeat", sessionID, staleTerminalState) + return nil + } + live, err := s.liveState.GetSession(ctx, sessionID) + if err != nil { + return err + } + if live != nil { + live.UpdatedAt = now + return s.liveState.UpsertSession(ctx, *live) + } + return nil +} + +func (s *Service) UpdateWorkerRenderTelemetry(ctx context.Context, sessionID string, payload map[string]any) error { + if stale, state, err := s.isStaleTerminalWorkerEvent(ctx, sessionID); err != nil { + return err + } else if stale { + s.logStaleWorkerEvent("render_telemetry", sessionID, state) + return nil + } + live, err := s.GetLiveSession(ctx, sessionID) + if err != nil { + return err + } + if live == nil { + live, err = s.RebuildLiveStateFromStore(ctx, sessionID) + if err != nil { + return err + } + if live == nil { + return ErrSessionNotFound + } + } + now := s.now().UTC() + if profile, ok := payload["render_quality_profile"].(string); ok && profile != "" { + live.RenderQualityProfile = normalizeRenderQualityProfile(profile) + } + if state, ok := payload["render_state"].(string); ok && state != "" { + live.RenderState = state + } else if state, ok := payload["state"].(string); ok && state != "" { + live.RenderState = state + } + if width, ok := toInt(payload["width"]); ok { + live.RenderWidth = width + } + if height, ok := toInt(payload["height"]); ok { + live.RenderHeight = height + } + if sequence, ok := 
toInt64(payload["frame_sequence"]); ok { + live.RenderFrameSequence = sequence + } + if frameFormat, ok := payload["frame_format"].(string); ok && frameFormat != "" { + live.RenderFrameFormat = frameFormat + } + if desktopWidth, ok := toInt(payload["desktop_width"]); ok && desktopWidth > 0 { + live.RenderWidth = desktopWidth + } + if desktopHeight, ok := toInt(payload["desktop_height"]); ok && desktopHeight > 0 { + live.RenderHeight = desktopHeight + } + s.applyRenderFrameData(live, payload) + if correlationID, ok := payload["input_correlation_id"].(string); ok && correlationID != "" { + live.LastInputCorrelationID = correlationID + } + if capturedAt, ok := payload["worker_frame_captured_at"].(string); ok && capturedAt != "" { + live.WorkerFrameCapturedAt = capturedAt + } + if cursorX, ok := toInt(payload["cursor_x"]); ok { + live.CursorX = cursorX + } + if cursorY, ok := toInt(payload["cursor_y"]); ok { + live.CursorY = cursorY + } + if visible, ok := payload["cursor_visible"].(bool); ok { + live.CursorVisible = visible + } + if dirty, ok := toInt(payload["dirty_rectangles"]); ok { + live.DirtyRectangles = dirty + } + if cursorVisible, ok := payload["cursor_visible"].(bool); ok { + live.CursorVisible = cursorVisible + } + live.LastRenderAt = &now + live.UpdatedAt = now + return s.liveState.UpsertSession(ctx, *live) +} + +func (s *Service) applyRenderFrameData(live *LiveSessionState, payload map[string]any) { + frameData, ok := payload["frame_data"].(string) + if !ok || frameData == "" { + return + } + + updateKind, _ := payload["frame_update_kind"].(string) + if updateKind != "region" { + live.RenderFrameData = frameData + return + } + + desktopWidth := live.RenderWidth + desktopHeight := live.RenderHeight + if width, ok := toInt(payload["desktop_width"]); ok && width > 0 { + desktopWidth = width + } + if height, ok := toInt(payload["desktop_height"]); ok && height > 0 { + desktopHeight = height + } + frameWidth, frameWidthOK := toInt(payload["frame_width"]) + 
frameHeight, frameHeightOK := toInt(payload["frame_height"]) + regionX, regionXOK := toInt(payload["region_x"]) + regionY, regionYOK := toInt(payload["region_y"]) + regionWidth, regionWidthOK := toInt(payload["region_width"]) + regionHeight, regionHeightOK := toInt(payload["region_height"]) + if !frameWidthOK || !frameHeightOK { + frameWidth = regionWidth + frameHeight = regionHeight + } + if !regionXOK || !regionYOK || !regionWidthOK || !regionHeightOK || + desktopWidth <= 0 || desktopHeight <= 0 || + frameWidth <= 0 || frameHeight <= 0 || + regionWidth <= 0 || regionHeight <= 0 || + regionX < 0 || regionY < 0 || + regionX+regionWidth > desktopWidth || + regionY+regionHeight > desktopHeight { + s.logger.Warn("render live-state region frame ignored because metadata is invalid", + "session_id", live.SessionID, + "desktop_width", desktopWidth, + "desktop_height", desktopHeight, + "frame_width", frameWidth, + "frame_height", frameHeight, + "region_x", regionX, + "region_y", regionY, + "region_width", regionWidth, + "region_height", regionHeight) + return + } + if live.RenderFrameData == "" { + s.logger.Warn("render live-state region frame ignored because no full baseline exists", + "session_id", live.SessionID, + "frame_sequence", live.RenderFrameSequence) + return + } + + fullBytes, err := base64.StdEncoding.DecodeString(live.RenderFrameData) + if err != nil { + s.logger.Warn("render live-state region frame ignored because baseline decode failed", + "session_id", live.SessionID, + "error", err) + return + } + regionBytes, err := base64.StdEncoding.DecodeString(frameData) + if err != nil { + s.logger.Warn("render live-state region frame ignored because region decode failed", + "session_id", live.SessionID, + "error", err) + return + } + + fullStride := desktopWidth * 4 + regionStride := frameWidth * 4 + if stride, ok := toInt(payload["frame_stride"]); ok && stride > 0 { + regionStride = stride + } + requiredFullBytes := fullStride * desktopHeight + requiredRegionBytes 
:= regionStride * regionHeight + if len(fullBytes) < requiredFullBytes || len(regionBytes) < requiredRegionBytes || regionStride < regionWidth*4 { + s.logger.Warn("render live-state region frame ignored because byte lengths are invalid", + "session_id", live.SessionID, + "full_bytes", len(fullBytes), + "required_full_bytes", requiredFullBytes, + "region_bytes", len(regionBytes), + "required_region_bytes", requiredRegionBytes) + return + } + + for row := 0; row < regionHeight; row++ { + srcOffset := row * regionStride + dstOffset := (regionY+row)*fullStride + regionX*4 + copy(fullBytes[dstOffset:dstOffset+regionWidth*4], regionBytes[srcOffset:srcOffset+regionWidth*4]) + } + live.RenderFrameData = base64.StdEncoding.EncodeToString(fullBytes[:requiredFullBytes]) + live.RenderWidth = desktopWidth + live.RenderHeight = desktopHeight + live.RenderFrameFormat = "bgra32" +} + +func (s *Service) UpdateWorkerClipboardText(ctx context.Context, sessionID string, payload map[string]any) error { + if stale, state, err := s.isStaleTerminalWorkerEvent(ctx, sessionID); err != nil { + return err + } else if stale { + s.logStaleWorkerEvent("clipboard_text", sessionID, state) + return nil + } + mode, state, err := s.GetSessionClipboardPolicy(ctx, sessionID) + if err != nil { + return err + } + if state != sessioncontracts.StateActive || !clipboardAllowsServerToClient(mode) { + s.logger.Info("worker clipboard text ignored by policy or state", + "session_id", sessionID, + "state", state, + "clipboard_mode", mode) + return nil + } + text, _ := payload["text"].(string) + if text == "" { + s.logger.Info("worker clipboard text ignored because payload text is empty", + "session_id", sessionID) + return nil + } + origin, _ := payload["origin"].(string) + contentHash, _ := payload["content_hash"].(string) + live, err := s.GetLiveSession(ctx, sessionID) + if err != nil { + return err + } + if live == nil { + live, err = s.RebuildLiveStateFromStore(ctx, sessionID) + if err != nil { + return err 
+ } + if live == nil { + return ErrSessionNotFound + } + } + now := s.now().UTC() + live.ClipboardSequence++ + if sequence, ok := toInt64(payload["sequence_id"]); ok && sequence > live.ClipboardSequence { + live.ClipboardSequence = sequence + } + live.ClipboardText = text + live.ClipboardOrigin = origin + live.ClipboardContentHash = contentHash + live.ClipboardUpdatedAt = &now + live.UpdatedAt = now + s.logger.Info("worker clipboard text persisted to live state", + "session_id", sessionID, + "origin", origin, + "sequence_id", live.ClipboardSequence, + "content_hash", contentHash, + "text_bytes", len(text)) + return s.liveState.UpsertSession(ctx, *live) +} + +func (s *Service) UpdateWorkerFileDownloadEvent(ctx context.Context, sessionID, eventType string, payload map[string]any) error { + if stale, state, err := s.isStaleTerminalWorkerEvent(ctx, sessionID); err != nil { + return err + } else if stale { + s.logStaleWorkerEvent("file_download", sessionID, state) + return nil + } + mode, state, err := s.GetSessionFileTransferPolicy(ctx, sessionID) + if err != nil { + return err + } + if state != sessioncontracts.StateActive || !fileTransferAllowsServerToClient(mode) { + s.logger.Info("worker file download event ignored by policy or state", + "session_id", sessionID, + "state", state, + "file_transfer_mode", mode, + "event_type", eventType) + return nil + } + if payload == nil { + payload = map[string]any{} + } + payload["direction"] = "server_to_client" + live, err := s.GetLiveSession(ctx, sessionID) + if err != nil { + return err + } + if live == nil { + live, err = s.RebuildLiveStateFromStore(ctx, sessionID) + if err != nil { + return err + } + if live == nil { + return ErrSessionNotFound + } + } + now := s.now().UTC() + live.FileDownloadSequence++ + if sequence, ok := toInt64(payload["sequence"]); ok && sequence > live.FileDownloadSequence { + live.FileDownloadSequence = sequence + } + live.FileDownloadType = eventType + live.FileDownloadPayload = payload + 
live.FileDownloadUpdatedAt = &now + live.UpdatedAt = now + s.logger.Info("worker file download event persisted to live state", + "session_id", sessionID, + "event_type", eventType, + "sequence", live.FileDownloadSequence, + "transfer_id", payload["transfer_id"], + "file_id", payload["file_id"], + "file_name", payload["file_name"], + "status", payload["status"]) + return s.liveState.UpsertSession(ctx, *live) +} + +func (s *Service) prepareAttachment(ctx context.Context, session RemoteSession, attachment SessionAttachment, lease *workercontracts.WorkerLease) error { + runtimeMetadata, secretRef, secretVersion, err := s.runtimeAssignmentMetadata(ctx, session, lease) + if err != nil { + if secretRef != "" { + _ = s.writeAuditEvent(ctx, s.store, AuditEventSecretAccessDenied, attachment.UserID, attachment.DeviceID, secretRef, session.ID, map[string]any{ + "resource_id": session.ResourceID, + "organization_id": session.OrganizationID, + "worker_id": session.WorkerID, + "reason": err.Error(), + }) + } + return err + } + if err := s.orchestrator.PrepareAttachment(ctx, session, attachment, runtimeMetadata); err != nil { + return err + } + if secretRef != "" { + _ = s.writeAuditEvent(ctx, s.store, AuditEventSecretAccessed, attachment.UserID, attachment.DeviceID, secretRef, session.ID, map[string]any{ + "resource_id": session.ResourceID, + "organization_id": session.OrganizationID, + "worker_id": session.WorkerID, + "secret_ref": secretRef, + "version": secretVersion, + }) + } + return nil +} + +func (s *Service) runtimeAssignmentMetadata(ctx context.Context, session RemoteSession, lease *workercontracts.WorkerLease) (map[string]any, string, int, error) { + metadata := decodeJSONMap(session.Metadata) + secretRef := secretRefFromAssignmentMetadata(metadata) + if secretRef == "" { + return metadata, "", 0, nil + } + if s.secretResolver == nil { + if secrets.IsProductionEnv(s.cfg.App.Env) { + return nil, secretRef, 0, secrets.ErrSecretEncryptionKeyMissing + } + return metadata, 
"", 0, nil + } + leaseID := "" + if lease != nil { + leaseID = lease.LeaseID + } + resolved, err := s.secretResolver.ResolveForSession(ctx, secrets.ResolveResourceSecretRequest{ + SecretRef: secretRef, + OrganizationID: session.OrganizationID, + ResourceID: session.ResourceID, + SessionID: session.ID, + WorkerID: session.WorkerID, + LeaseID: leaseID, + }) + if err != nil { + return nil, secretRef, 0, err + } + merged, err := secrets.MergeResourceSecretIntoAssignmentMetadata(metadata, resolved.Payload) + if err != nil { + return nil, secretRef, resolved.Descriptor.Version, err + } + return merged.Metadata, secretRef, resolved.Descriptor.Version, nil +} + +func (s *Service) publishLiveState(ctx context.Context, session RemoteSession, attachment SessionAttachment, lease *workercontracts.WorkerLease) (*sessioncontracts.AttachTokenClaims, error) { + now := s.now().UTC() + renderQualityProfile := session.RenderQualityProfile + if renderQualityProfile == "" { + renderQualityProfile = renderQualityProfileFromSessionMetadata(session.Metadata) + } + if err := s.liveState.UpsertSession(ctx, LiveSessionState{ + SessionID: session.ID, + ResourceID: session.ResourceID, + WorkerID: session.WorkerID, + State: session.State, + ControllerID: session.ControllerUserID, + AttachmentID: attachment.ID, + TakeoverVersion: session.TakeoverVersion, + RenderQualityProfile: renderQualityProfile, + RenderState: "connecting", + UpdatedAt: now, + }); err != nil { + return nil, err + } + if err := s.liveState.BindController(ctx, sessioncontracts.ControllerBinding{ + SessionID: session.ID, + AttachmentID: attachment.ID, + UserID: attachment.UserID, + DeviceID: attachment.DeviceID, + TakeoverVersion: session.TakeoverVersion, + BoundAt: now, + }, s.cfg.Session.LiveStateTTL); err != nil { + return nil, err + } + if lease != nil { + if err := s.liveState.UpdateWorkerRoute(ctx, WorkerRoute{ + SessionID: session.ID, + WorkerID: lease.WorkerID, + LeaseID: lease.LeaseID, + ControlStream: 
lease.ControlStream, + UpdatedAt: now, + }, s.cfg.Session.LiveStateTTL); err != nil { + return nil, err + } + } + token := sessioncontracts.AttachTokenClaims{ + Token: uuid.NewString(), + SessionID: session.ID, + AttachmentID: attachment.ID, + UserID: attachment.UserID, + DeviceID: attachment.DeviceID, + WorkerID: session.WorkerID, + TakeoverVersion: session.TakeoverVersion, + ExpiresAt: now.Add(s.cfg.Session.AttachTokenTTL), + Reconnectable: true, + } + if err := s.liveState.StoreAttachToken(ctx, token, s.cfg.Session.AttachTokenTTL); err != nil { + return nil, err + } + return &token, s.liveState.TouchAttachmentHeartbeat(ctx, session.ID, attachment.ID, s.cfg.Session.HeartbeatTTL) +} + +func (s *Service) loadPolicy(ctx context.Context, resourceID string) (*ResourcePolicy, error) { + return s.loadPolicyFromStore(ctx, s.store, resourceID) +} + +func (s *Service) loadPolicyFromStore(ctx context.Context, store Store, resourceID string) (*ResourcePolicy, error) { + policy, err := store.ResourcePolicies().GetByResourceID(ctx, resourceID) + if err != nil { + return nil, err + } + if policy == nil { + policy = &ResourcePolicy{ + ResourceID: resourceID, + MaxConcurrentSessions: 1, + TakeoverPolicy: ResourceTakeoverPolicyTrustedDevice, + RequireTrustedDevice: true, + DetachGracePeriod: s.cfg.Session.DetachGracePeriod, + ClipboardEnabled: false, + ClipboardMode: ResourceClipboardModeDisabled, + FileTransferEnabled: false, + FileTransferMode: ResourceFileTransferModeDisabled, + } + } + policy.ClipboardMode = normalizeClipboardMode(policy.ClipboardMode) + policy.ClipboardEnabled = policy.ClipboardMode != ResourceClipboardModeDisabled + policy.FileTransferMode = normalizeFileTransferMode(policy.FileTransferMode) + policy.FileTransferEnabled = fileTransferAllowsClientToServer(policy.FileTransferMode) + return policy, nil +} + +func (s *Service) GetSessionClipboardPolicy(ctx context.Context, sessionID string) (ResourceClipboardMode, sessioncontracts.State, error) { + session, err 
:= s.store.RemoteSessions().GetByID(ctx, sessionID) + if err != nil { + return ResourceClipboardModeDisabled, "", err + } + if session == nil { + return ResourceClipboardModeDisabled, "", ErrSessionNotFound + } + policy, err := s.loadPolicy(ctx, session.ResourceID) + if err != nil { + return ResourceClipboardModeDisabled, "", err + } + return policy.ClipboardMode, session.State, nil +} + +func normalizeClipboardMode(mode ResourceClipboardMode) ResourceClipboardMode { + switch mode { + case ResourceClipboardModeClientToServer, ResourceClipboardModeServerToClient, ResourceClipboardModeBidirectional: + return mode + default: + return ResourceClipboardModeDisabled + } +} + +func clipboardAllowsClientToServer(mode ResourceClipboardMode) bool { + return mode == ResourceClipboardModeClientToServer || mode == ResourceClipboardModeBidirectional +} + +func clipboardAllowsServerToClient(mode ResourceClipboardMode) bool { + return mode == ResourceClipboardModeServerToClient || mode == ResourceClipboardModeBidirectional +} + +func (s *Service) GetSessionFileTransferPolicy(ctx context.Context, sessionID string) (ResourceFileTransferMode, sessioncontracts.State, error) { + session, err := s.store.RemoteSessions().GetByID(ctx, sessionID) + if err != nil { + return ResourceFileTransferModeDisabled, "", err + } + if session == nil { + return ResourceFileTransferModeDisabled, "", ErrSessionNotFound + } + policy, err := s.loadPolicy(ctx, session.ResourceID) + if err != nil { + return ResourceFileTransferModeDisabled, "", err + } + return policy.FileTransferMode, session.State, nil +} + +func normalizeFileTransferMode(mode ResourceFileTransferMode) ResourceFileTransferMode { + switch mode { + case ResourceFileTransferModeClientToServer, ResourceFileTransferModeServerToClient, ResourceFileTransferModeBidirectional: + return mode + default: + return ResourceFileTransferModeDisabled + } +} + +func fileTransferAllowsClientToServer(mode ResourceFileTransferMode) bool { + return mode == 
ResourceFileTransferModeClientToServer || mode == ResourceFileTransferModeBidirectional +} + +func fileTransferAllowsServerToClient(mode ResourceFileTransferMode) bool { + return mode == ResourceFileTransferModeServerToClient || mode == ResourceFileTransferModeBidirectional +} + +func (s *Service) ensureTrustedDevice(ctx context.Context, policy *ResourcePolicy, userID, deviceID string) error { + return s.ensureTrustedDeviceWithStore(ctx, s.store, policy, userID, deviceID) +} + +func (s *Service) ensureTrustedDeviceWithStore(ctx context.Context, store Store, policy *ResourcePolicy, userID, deviceID string) error { + if !policy.RequireTrustedDevice { + return nil + } + ok, err := store.Access().IsTrustedDevice(ctx, userID, deviceID) + if err != nil { + return err + } + if !ok { + return ErrTrustedDeviceRequired + } + return nil +} + +func (s *Service) ensureSessionCapacity(ctx context.Context, policy *ResourcePolicy, resourceID string) error { + count, err := s.store.RemoteSessions().CountLiveByResource(ctx, resourceID) + if err != nil { + return err + } + if count >= policy.MaxConcurrentSessions { + return ErrActiveControllerPresent + } + return nil +} + +func (s *Service) ensureTakeoverAllowed(ctx context.Context, store Store, policy *ResourcePolicy, session *RemoteSession, userID, deviceID string) error { + role, err := store.Access().GetPlatformRole(ctx, userID) + if err != nil { + return err + } + if isPlatformAdminRole(role) { + return nil + } + orgRole, ok, err := store.Access().GetOrganizationRole(ctx, session.OrganizationID, userID) + if err != nil { + return err + } + if !ok { + return ErrAccessDenied + } + if err := s.ensureTrustedDeviceWithStore(ctx, store, policy, userID, deviceID); err != nil { + return err + } + if isOrganizationAdminRole(orgRole) { + return nil + } + switch policy.TakeoverPolicy { + case ResourceTakeoverPolicyTrustedDevice: + return nil + case ResourceTakeoverPolicySameUser: + if session.ControllerUserID != userID { + return 
ErrTakeoverNotAllowed
+		}
+		return nil
+	case ResourceTakeoverPolicyAdminOnly:
+		return ErrTakeoverNotAllowed
+	default:
+		return ErrTakeoverNotAllowed
+	}
+}
+
+// writeAuditEvent JSON-encodes payload and stores an AuditEvent with target
+// type "remote_session". Empty actor user/device IDs are stored as nil
+// pointers; the session reference is always set, even when sessionID is empty.
+func (s *Service) writeAuditEvent(ctx context.Context, store Store, eventType, actorUserID, actorDeviceID, targetID, sessionID string, payload map[string]any) error {
+	encoded, err := json.Marshal(payload)
+	if err != nil {
+		return fmt.Errorf("marshal audit payload: %w", err)
+	}
+	var actorUser *string
+	if actorUserID != "" {
+		actorUser = &actorUserID
+	}
+	var actorDevice *string
+	if actorDeviceID != "" {
+		actorDevice = &actorDeviceID
+	}
+	sessionRef := sessionID
+	return store.AuditEvents().Create(ctx, AuditEvent{
+		ID: uuid.NewString(),
+		ActorUserID: actorUser,
+		ActorDeviceID: actorDevice,
+		EventType: eventType,
+		TargetType: "remote_session",
+		TargetID: targetID,
+		RemoteSessionID: &sessionRef,
+		Payload: encoded,
+		CreatedAt: s.now().UTC(),
+	})
+}
+
+// requiredCapabilities lists the capability names derived from policy:
+// "adaptive-quality" and "dirty-rects" always, plus "clipboard" and
+// "file-transfer" when the checks below pass.
+// NOTE(review): clipboard is keyed on policy.ClipboardEnabled rather than
+// policy.ClipboardMode, and "file-transfer" is added only for the
+// client-to-server direction - confirm both are intentional.
+func requiredCapabilities(policy *ResourcePolicy) []string {
+	caps := []string{"adaptive-quality", "dirty-rects"}
+	if policy.ClipboardEnabled {
+		caps = append(caps, "clipboard")
+	}
+	if fileTransferAllowsClientToServer(policy.FileTransferMode) {
+		caps = append(caps, "file-transfer")
+	}
+	return caps
+}
+
+// renderQualityProfileFromMetadata returns metadata["render_quality_profile"]
+// when it is one of the four recognised profile names, and "balanced" otherwise.
+func renderQualityProfileFromMetadata(metadata map[string]any) string {
+	if profile, ok := metadata["render_quality_profile"].(string); ok && profile != "" {
+		switch profile {
+		case "low_bandwidth", "balanced", "high_quality", "text_priority":
+			return profile
+		}
+	}
+	return "balanced"
+}
+
+// renderQualityProfileFromSessionMetadata decodes JSON session metadata and
+// resolves the profile from its "resource" object. Empty input, invalid JSON
+// and a missing or non-object "resource" all yield "balanced".
+func renderQualityProfileFromSessionMetadata(metadata []byte) string {
+	if len(metadata) == 0 {
+		return "balanced"
+	}
+	var decoded map[string]any
+	if err := json.Unmarshal(metadata, &decoded); err != nil {
+		return "balanced"
+	}
+	resource, _ := decoded["resource"].(map[string]any)
+	if resource == nil {
+		return "balanced"
+	}
+	return renderQualityProfileFromMetadata(resource)
+}
+
+// toInt converts a numeric value held in an interface to int; the second
+// result is false for any other dynamic type. Floating-point inputs lose
+// their fractional part in the conversion.
+func toInt(value any) (int, bool) {
+	switch v :=
value.(type) {
+	case int:
+		return v, true
+	case int32:
+		return int(v), true
+	case int64:
+		return int(v), true
+	case float64:
+		return int(v), true
+	case float32:
+		return int(v), true
+	default:
+		return 0, false
+	}
+}
+
+// toInt64 converts a numeric value held in an interface to int64; the second
+// result is false for any other dynamic type. Floating-point inputs lose
+// their fractional part in the conversion.
+func toInt64(value any) (int64, bool) {
+	switch v := value.(type) {
+	case int:
+		return int64(v), true
+	case int32:
+		return int64(v), true
+	case int64:
+		return v, true
+	case float64:
+		return int64(v), true
+	case float32:
+		return int64(v), true
+	default:
+		return 0, false
+	}
+}
+
+// decodeJSONMap decodes payload as a JSON object. Empty input and decode
+// failures both yield an empty, non-nil map; the error is not reported.
+func decodeJSONMap(payload []byte) map[string]any {
+	var out map[string]any
+	if len(payload) == 0 {
+		return map[string]any{}
+	}
+	if err := json.Unmarshal(payload, &out); err != nil {
+		return map[string]any{}
+	}
+	return out
+}
+
+// secretRefFromAssignmentMetadata returns metadata["resource"]["secret_ref"]
+// when both levels have the expected types, and "" otherwise.
+func secretRefFromAssignmentMetadata(metadata map[string]any) string {
+	resource, _ := metadata["resource"].(map[string]any)
+	if resource == nil {
+		return ""
+	}
+	secretRef, _ := resource["secret_ref"].(string)
+	return secretRef
+}
+
+// RebuildLiveStateFromStore reconstructs a session's live state from the
+// store: it loads the session, runs the runtime consistency check, and then
+// scans the session's attachments from last to first for one that is active,
+// attaching or detached to act as controller. Without such an attachment the
+// plain sessionToLiveState projection is returned and nothing is written.
+func (s *Service) RebuildLiveStateFromStore(ctx context.Context, sessionID string) (*LiveSessionState, error) {
+	session, err := s.store.RemoteSessions().GetByID(ctx, sessionID)
+	if err != nil || session == nil {
+		return sessionToLiveState(session), err
+	}
+	session, err = s.ensureSessionRuntimeConsistency(ctx, session)
+	if err != nil || session == nil {
+		return sessionToLiveState(session), err
+	}
+	attachments, err := s.store.SessionAttachments().ListByRemoteSession(ctx, sessionID)
+	if err != nil {
+		return nil, err
+	}
+	var controller *SessionAttachment
+	for i := len(attachments) - 1; i >= 0; i-- {
+		// attachment is a fresh copy per iteration, so taking its address is safe.
+		attachment := attachments[i]
+		if attachment.State == AttachmentStateActive || attachment.State == AttachmentStateAttaching || attachment.State == AttachmentStateDetached {
+			controller = &attachment
+			break
+		}
+	}
+	if controller == nil {
+		return sessionToLiveState(session), nil
+	}
+	live := &LiveSessionState{
+		SessionID: session.ID,
+		ResourceID: session.ResourceID,
+
WorkerID: session.WorkerID, + State: session.State, + ControllerID: session.ControllerUserID, + AttachmentID: controller.ID, + TakeoverVersion: session.TakeoverVersion, + UpdatedAt: s.now().UTC(), + } + if err := s.liveState.UpsertSession(ctx, *live); err != nil { + return nil, err + } + if err := s.liveState.BindController(ctx, sessioncontracts.ControllerBinding{ + SessionID: session.ID, + AttachmentID: controller.ID, + UserID: controller.UserID, + DeviceID: controller.DeviceID, + TakeoverVersion: session.TakeoverVersion, + BoundAt: s.now().UTC(), + }, s.cfg.Session.LiveStateTTL); err != nil { + return nil, err + } + return live, nil +} + +func (s *Service) ensureSessionRuntimeConsistencyByID(ctx context.Context, sessionID string) (*RemoteSession, error) { + session, err := s.store.RemoteSessions().GetByID(ctx, sessionID) + if err != nil || session == nil { + return session, err + } + return s.ensureSessionRuntimeConsistency(ctx, session) +} + +func (s *Service) ensureSessionRuntimeConsistency(ctx context.Context, session *RemoteSession) (*RemoteSession, error) { + if session == nil || !requiresLiveRuntime(session.State) { + return session, nil + } + healthy, reason, err := s.orchestrator.ValidateSessionRuntime(ctx, session.ID, session.WorkerID) + if err != nil { + return nil, err + } + if healthy { + return session, nil + } + s.logger.Warn("session runtime consistency check failed; marking session failed", + "session_id", session.ID, + "worker_id", session.WorkerID, + "state", session.State, + "reason", reason) + if err := s.MarkSessionFailed(ctx, MarkSessionFailedCommand{ + SessionID: session.ID, + Reason: reason, + }); err != nil && !errors.Is(err, ErrSessionNotFound) && !errors.Is(err, ErrSessionNotTerminable) { + return nil, err + } + return s.store.RemoteSessions().GetByID(ctx, session.ID) +} + +func requiresLiveRuntime(state sessioncontracts.State) bool { + switch state { + case sessioncontracts.StateStarting, sessioncontracts.StateActive, 
sessioncontracts.StateDetached, sessioncontracts.StateReconnecting:
+		return true
+	default:
+		return false
+	}
+}
+
+// isTerminalSessionState reports whether state is terminated or failed.
+func isTerminalSessionState(state sessioncontracts.State) bool {
+	switch state {
+	case sessioncontracts.StateTerminated, sessioncontracts.StateFailed:
+		return true
+	default:
+		return false
+	}
+}
+
+// isStaleTerminalWorkerEvent loads the session and reports whether its stored
+// state is terminal, returning that state alongside. A missing session or a
+// store error yields (false, "", err).
+func (s *Service) isStaleTerminalWorkerEvent(ctx context.Context, sessionID string) (bool, sessioncontracts.State, error) {
+	session, err := s.store.RemoteSessions().GetByID(ctx, sessionID)
+	if err != nil || session == nil {
+		return false, "", err
+	}
+	if isTerminalSessionState(session.State) {
+		return true, session.State, nil
+	}
+	return false, session.State, nil
+}
+
+// logStaleWorkerEvent records that a worker event was ignored. It is a no-op
+// without a logger; "render_telemetry" events are logged at debug level and
+// every other event type at info level.
+func (s *Service) logStaleWorkerEvent(eventType, sessionID string, state sessioncontracts.State) {
+	if s.logger == nil {
+		return
+	}
+	args := []any{
+		"event_type", eventType,
+		"session_id", sessionID,
+		"state", state,
+	}
+	if eventType == "render_telemetry" {
+		s.logger.Debug("stale worker event ignored because authoritative session state is terminal", args...)
+		return
+	}
+	s.logger.Info("stale worker event ignored because authoritative session state is terminal", args...)
+}
+
+// sessionToLiveState projects a stored session onto a LiveSessionState with a
+// normalized render quality profile and render state "connecting". A nil
+// session yields nil.
+func sessionToLiveState(session *RemoteSession) *LiveSessionState {
+	if session == nil {
+		return nil
+	}
+	return &LiveSessionState{
+		SessionID: session.ID,
+		ResourceID: session.ResourceID,
+		WorkerID: session.WorkerID,
+		State: session.State,
+		ControllerID: session.ControllerUserID,
+		TakeoverVersion: session.TakeoverVersion,
+		RenderQualityProfile: normalizeRenderQualityProfile(session.RenderQualityProfile),
+		RenderState: "connecting",
+	}
+}
+
+// mergeLiveRenderTelemetry copies render fields from src into dst, skipping
+// any src field that holds its zero value; CursorVisible is the exception and
+// is always copied. It does nothing when either argument is nil.
+// NOTE(review): because zero is treated as "unset", a cursor coordinate of 0
+// in src never overwrites dst - confirm that is acceptable.
+func mergeLiveRenderTelemetry(dst *LiveSessionState, src *LiveSessionState) {
+	if dst == nil || src == nil {
+		return
+	}
+	if src.RenderQualityProfile != "" {
+		dst.RenderQualityProfile = src.RenderQualityProfile
+	}
+	if src.RenderState != "" {
+		dst.RenderState = src.RenderState
+	}
+	if src.RenderWidth > 0 {
+		dst.RenderWidth = src.RenderWidth
+	}
+	if src.RenderHeight > 0 {
+		dst.RenderHeight = src.RenderHeight
+	}
+	if src.RenderFrameSequence > 0 {
+		dst.RenderFrameSequence = src.RenderFrameSequence
+	}
+	if src.RenderFrameFormat != "" {
+		dst.RenderFrameFormat = src.RenderFrameFormat
+	}
+	if src.RenderFrameData != "" {
+		dst.RenderFrameData = src.RenderFrameData
+	}
+	if src.CursorX != 0 {
+		dst.CursorX = src.CursorX
+	}
+	if src.CursorY != 0 {
+		dst.CursorY = src.CursorY
+	}
+	dst.CursorVisible = src.CursorVisible
+	if src.DirtyRectangles > 0 {
+		dst.DirtyRectangles = src.DirtyRectangles
+	}
+	if src.LastRenderAt != nil {
+		dst.LastRenderAt = src.LastRenderAt
+	}
+}
+
+// normalizeRenderQualityProfile returns profile when it is one of the four
+// recognised names and "balanced" otherwise.
+func normalizeRenderQualityProfile(profile string) string {
+	switch profile {
+	case "low_bandwidth", "balanced", "high_quality", "text_priority":
+		return profile
+	default:
+		return "balanced"
+	}
+}
+
+// findReusableSession lists the sessions controlled by userID and returns the
+// first one on resourceID that is starting, active, detached or reconnecting.
+// It returns (nil, nil) when there is none.
+func (s *Service) findReusableSession(ctx context.Context, resourceID, userID string) (*RemoteSession, error) {
+	sessions, err := s.store.RemoteSessions().ListByController(ctx, userID)
+	if err != nil {
+		return nil, err
+	}
+	for _, session := range sessions {
+		if session.ResourceID != resourceID {
+			continue
+		}
+		if
session.State == sessioncontracts.StateStarting || session.State == sessioncontracts.StateActive || session.State == sessioncontracts.StateDetached || session.State == sessioncontracts.StateReconnecting { + copy := session + return ©, nil + } + } + return nil, nil +} + +func (s *Service) MapError(err error) (int, string) { + switch { + case err == nil: + return 0, "" + case errors.Is(err, ErrSessionNotFound): + return 404, err.Error() + case errors.Is(err, ErrAttachmentNotFound): + return 404, err.Error() + case errors.Is(err, ErrAccessDenied), errors.Is(err, ErrTrustedDeviceRequired): + return 403, err.Error() + case errors.Is(err, ErrActiveControllerPresent), errors.Is(err, ErrTakeoverNotAllowed), errors.Is(err, ErrSessionNotAttachable), errors.Is(err, ErrSessionNotTerminable): + return 409, err.Error() + case errors.Is(err, ErrAttachTokenInvalid): + return 401, err.Error() + default: + return 500, err.Error() + } +} + +func (s *Service) ensureOrganizationMemberAccess(ctx context.Context, organizationID, userID string) error { + return s.ensureOrganizationMemberAccessWithStore(ctx, s.store, organizationID, userID) +} + +func (s *Service) ensureOrganizationMemberAccessWithStore(ctx context.Context, store Store, organizationID, userID string) error { + if userID == "" { + return ErrAccessDenied + } + allowed, err := s.hasOrganizationMemberAccessWithStore(ctx, store, organizationID, userID) + if err != nil { + return err + } + if !allowed { + return ErrAccessDenied + } + return nil +} + +func (s *Service) hasOrganizationMemberAccess(ctx context.Context, organizationID, userID string) (bool, error) { + return s.hasOrganizationMemberAccessWithStore(ctx, s.store, organizationID, userID) +} + +func (s *Service) hasOrganizationMemberAccessWithStore(ctx context.Context, store Store, organizationID, userID string) (bool, error) { + role, err := store.Access().GetPlatformRole(ctx, userID) + if err != nil { + return false, err + } + if isPlatformAdminRole(role) { + return 
true, nil
+	}
+	_, ok, err := store.Access().GetOrganizationRole(ctx, organizationID, userID)
+	if err != nil {
+		return false, err
+	}
+	return ok, nil
+}
+
+// ensureSessionControllerAccessWithStore allows only the session's current
+// controller, and only after the organization membership check has passed.
+func (s *Service) ensureSessionControllerAccessWithStore(ctx context.Context, store Store, session *RemoteSession, userID string) error {
+	if err := s.ensureOrganizationMemberAccessWithStore(ctx, store, session.OrganizationID, userID); err != nil {
+		return err
+	}
+	if session.ControllerUserID == userID {
+		return nil
+	}
+	return ErrAccessDenied
+}
+
+// ensureSessionAdminOrControllerAccessWithStore allows platform admins,
+// organization admins of the session's organization, and the session's
+// controller when that user holds a role in the organization. Everyone else
+// gets ErrAccessDenied.
+func (s *Service) ensureSessionAdminOrControllerAccessWithStore(ctx context.Context, store Store, session *RemoteSession, userID string) error {
+	role, err := store.Access().GetPlatformRole(ctx, userID)
+	if err != nil {
+		return err
+	}
+	if isPlatformAdminRole(role) {
+		return nil
+	}
+	orgRole, ok, err := store.Access().GetOrganizationRole(ctx, session.OrganizationID, userID)
+	if err != nil {
+		return err
+	}
+	if !ok {
+		return ErrAccessDenied
+	}
+	if isOrganizationAdminRole(orgRole) || session.ControllerUserID == userID {
+		return nil
+	}
+	return ErrAccessDenied
+}
+
+// isPlatformAdminRole reports whether role is one of the two platform admin roles.
+func isPlatformAdminRole(role string) bool {
+	return role == "platform_admin" || role == "platform_recovery_admin"
+}
+
+// isOrganizationAdminRole reports whether role is organization owner or admin.
+func isOrganizationAdminRole(role string) bool {
+	return role == "org_owner" || role == "org_admin"
+}
diff --git a/backend/internal/modules/sessionbroker/stale_worker_event_test.go b/backend/internal/modules/sessionbroker/stale_worker_event_test.go
new file mode 100644
index 0000000..e9dc997
--- /dev/null
+++ b/backend/internal/modules/sessionbroker/stale_worker_event_test.go
@@ -0,0 +1,391 @@
+package sessionbroker
+
+import (
+	"context"
+	"io"
+	"log/slog"
+	"testing"
+	"time"
+
+	"github.com/example/remote-access-platform/backend/internal/platform/module"
+	sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
+	workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
+)
+
+// Verifies that a "worker connected" event for an already terminated session
+// leaves both the stored session and the live state untouched.
+func
TestHandleWorkerConnectedIgnoresTerminalSession(t *testing.T) {
+	service, store, live, _ := newStaleWorkerEventTestService()
+	store.remote.sessions["session-1"] = RemoteSession{
+		ID: "session-1",
+		State: sessioncontracts.StateTerminated,
+		WorkerID: "worker-1",
+	}
+
+	if err := service.HandleWorkerConnected(context.Background(), "session-1"); err != nil {
+		t.Fatalf("HandleWorkerConnected returned error for stale terminal event: %v", err)
+	}
+	if got := store.remote.sessions["session-1"].State; got != sessioncontracts.StateTerminated {
+		t.Fatalf("stale connected event changed terminal state to %q", got)
+	}
+	if store.remote.updateCount != 0 {
+		t.Fatalf("stale connected event updated authoritative session %d times", store.remote.updateCount)
+	}
+	if live.upsertCount != 0 {
+		t.Fatalf("stale connected event recreated live state %d times", live.upsertCount)
+	}
+}
+
+// TestUpdateWorkerRenderTelemetryIgnoresTerminalSession verifies that render
+// telemetry for a terminated session is accepted without error and does not
+// create or leave behind any live state.
+func TestUpdateWorkerRenderTelemetryIgnoresTerminalSession(t *testing.T) {
+	service, store, live, _ := newStaleWorkerEventTestService()
+	store.remote.sessions["session-1"] = RemoteSession{
+		ID: "session-1",
+		State: sessioncontracts.StateTerminated,
+		WorkerID: "worker-1",
+	}
+
+	err := service.UpdateWorkerRenderTelemetry(context.Background(), "session-1", map[string]any{
+		"render_state": "ready",
+		"width": 1280,
+		"height": 720,
+		"frame_sequence": int64(99),
+		"frame_data": "stale-frame",
+	})
+	if err != nil {
+		t.Fatalf("UpdateWorkerRenderTelemetry returned error for stale terminal event: %v", err)
+	}
+	if live.upsertCount != 0 {
+		t.Fatalf("stale render event recreated live state %d times", live.upsertCount)
+	}
+	if live.sessions["session-1"] != nil {
+		t.Fatalf("stale render event left live state behind: %#v", live.sessions["session-1"])
+	}
+}
+
+// TestMarkSessionFailedTransitionsActiveSession verifies the full failure
+// path for an active session: state becomes failed, the attachment is
+// closed, one audit event is written, live state is deleted and the session
+// lease is released once.
+func TestMarkSessionFailedTransitionsActiveSession(t *testing.T) {
+	service, store, live, orchestrator := newStaleWorkerEventTestService()
+	store.remote.sessions["session-1"] = RemoteSession{
+		ID: "session-1",
+		State:
sessioncontracts.StateActive,
+		WorkerID: "worker-1",
+		TakeoverVersion: 3,
+	}
+	store.attachments.items["attachment-1"] = SessionAttachment{
+		ID: "attachment-1",
+		RemoteSessionID: "session-1",
+		State: AttachmentStateActive,
+	}
+	live.sessions["session-1"] = &LiveSessionState{SessionID: "session-1", State: sessioncontracts.StateActive}
+
+	if err := service.MarkSessionFailed(context.Background(), MarkSessionFailedCommand{SessionID: "session-1", Reason: "worker_lost"}); err != nil {
+		t.Fatalf("MarkSessionFailed returned error: %v", err)
+	}
+	if got := store.remote.sessions["session-1"].State; got != sessioncontracts.StateFailed {
+		t.Fatalf("expected failed state, got %q", got)
+	}
+	if got := store.attachments.items["attachment-1"].State; got != AttachmentStateClosed {
+		t.Fatalf("expected attachment closed, got %q", got)
+	}
+	if store.audit.createCount != 1 {
+		t.Fatalf("expected one audit event, got %d", store.audit.createCount)
+	}
+	if live.sessions["session-1"] != nil {
+		t.Fatal("expected failed session live state to be deleted")
+	}
+	if orchestrator.releaseCount != 1 {
+		t.Fatalf("expected session lease release, got %d", orchestrator.releaseCount)
+	}
+}
+
+// TestMarkSessionFailedAlreadyFailedIsIdempotent verifies that failing a
+// session that is already failed returns no error and keeps the state.
+func TestMarkSessionFailedAlreadyFailedIsIdempotent(t *testing.T) {
+	service, store, _, _ := newStaleWorkerEventTestService()
+	store.remote.sessions["session-1"] = RemoteSession{
+		ID: "session-1",
+		State: sessioncontracts.StateFailed,
+		WorkerID: "worker-1",
+		TakeoverVersion: 1,
+	}
+
+	if err := service.MarkSessionFailed(context.Background(), MarkSessionFailedCommand{SessionID: "session-1", Reason: "duplicate_worker_failure"}); err != nil {
+		t.Fatalf("duplicate MarkSessionFailed returned error: %v", err)
+	}
+	if got := store.remote.sessions["session-1"].State; got != sessioncontracts.StateFailed {
+		t.Fatalf("duplicate terminal event changed state to %q", got)
+	}
+}
+
+// newStaleWorkerEventTestService builds a Service wired to in-memory test
+// doubles, a discarding logger and a fixed clock, and returns the doubles so
+// tests can seed and inspect them.
+func newStaleWorkerEventTestService() (*Service, *staleWorkerEventTestStore, *staleWorkerEventLiveState,
*staleWorkerEventOrchestrator) {
+	store := &staleWorkerEventTestStore{
+		remote: &staleWorkerEventRemoteSessions{sessions: map[string]RemoteSession{}},
+		attachments: &staleWorkerEventAttachments{items: map[string]SessionAttachment{}},
+		policies: &staleWorkerEventPolicies{},
+		audit: &staleWorkerEventAudit{},
+	}
+	live := &staleWorkerEventLiveState{sessions: map[string]*LiveSessionState{}}
+	orchestrator := &staleWorkerEventOrchestrator{}
+	service := NewService(module.Dependencies{
+		Infra: module.Infra{Logger: slog.New(slog.NewTextHandler(io.Discard, nil))},
+	}, store, staleWorkerEventTransactor{store: store}, live, orchestrator)
+	// Pin the clock so timestamps written by the service are deterministic.
+	service.now = func() time.Time { return time.Unix(100, 0).UTC() }
+	return service, store, live, orchestrator
+}
+
+// staleWorkerEventTransactor runs "transactions" directly against the wrapped
+// store; there is no isolation or rollback.
+type staleWorkerEventTransactor struct {
+	store Store
+}
+
+func (t staleWorkerEventTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error {
+	return fn(t.store)
+}
+
+// staleWorkerEventTestStore is an in-memory Store assembled from the test
+// repositories below.
+type staleWorkerEventTestStore struct {
+	remote *staleWorkerEventRemoteSessions
+	attachments *staleWorkerEventAttachments
+	policies *staleWorkerEventPolicies
+	audit *staleWorkerEventAudit
+}
+
+func (s *staleWorkerEventTestStore) RemoteSessions() RemoteSessionRepository { return s.remote }
+func (s *staleWorkerEventTestStore) SessionAttachments() SessionAttachmentRepository {
+	return s.attachments
+}
+func (s *staleWorkerEventTestStore) ResourcePolicies() ResourcePolicyRepository { return s.policies }
+func (s *staleWorkerEventTestStore) ResourceRuntime() ResourceRuntimeRepository {
+	return staleWorkerEventResourceRuntime{}
+}
+func (s *staleWorkerEventTestStore) AuditEvents() AuditEventRepository { return s.audit }
+func (s *staleWorkerEventTestStore) Access() AccessRepository { return staleWorkerEventAccess{} }
+
+// staleWorkerEventRemoteSessions keeps sessions in a map and counts
+// UpdateState calls so tests can assert that nothing was written.
+type staleWorkerEventRemoteSessions struct {
+	sessions map[string]RemoteSession
+	updateCount int
+}
+
+func (r *staleWorkerEventRemoteSessions) Create(_ context.Context, session RemoteSession) error {
+
	r.sessions[session.ID] = session
+	return nil
+}
+
+// GetByID returns a pointer to a copy of the stored session, or (nil, nil)
+// when the ID is unknown.
+func (r *staleWorkerEventRemoteSessions) GetByID(_ context.Context, sessionID string) (*RemoteSession, error) {
+	session, ok := r.sessions[sessionID]
+	if !ok {
+		return nil, nil
+	}
+	return &session, nil
+}
+
+// GetByIDForUpdate is identical to GetByID in this test double.
+func (r *staleWorkerEventRemoteSessions) GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error) {
+	return r.GetByID(ctx, sessionID)
+}
+
+func (r *staleWorkerEventRemoteSessions) ListByController(_ context.Context, _ string) ([]RemoteSession, error) {
+	return nil, nil
+}
+
+func (r *staleWorkerEventRemoteSessions) CountLiveByResource(_ context.Context, _ string) (int, error) {
+	return 0, nil
+}
+
+func (r *staleWorkerEventRemoteSessions) ListDetachedExpired(_ context.Context, _ time.Time, _ int) ([]RemoteSession, error) {
+	return nil, nil
+}
+
+// UpdateState overwrites the mutable session fields from params and bumps
+// updateCount. An unknown session ID results in a new zero-valued entry
+// carrying only the updated fields.
+func (r *staleWorkerEventRemoteSessions) UpdateState(_ context.Context, params UpdateRemoteSessionStateParams) error {
+	session := r.sessions[params.RemoteSessionID]
+	session.State = params.State
+	session.WorkerID = params.WorkerID
+	session.DetachDeadlineAt = params.DetachDeadlineAt
+	session.LastHeartbeatAt = params.LastHeartbeatAt
+	session.TakeoverVersion = params.TakeoverVersion
+	session.UpdatedAt = params.UpdatedAt
+	r.sessions[params.RemoteSessionID] = session
+	r.updateCount++
+	return nil
+}
+
+// staleWorkerEventAttachments keeps session attachments in a map keyed by
+// attachment ID.
+type staleWorkerEventAttachments struct {
+	items map[string]SessionAttachment
+}
+
+func (r *staleWorkerEventAttachments) Create(_ context.Context, attachment SessionAttachment) error {
+	r.items[attachment.ID] = attachment
+	return nil
+}
+
+// GetByID returns a pointer to a copy of the stored attachment, or (nil, nil)
+// when the ID is unknown.
+func (r *staleWorkerEventAttachments) GetByID(_ context.Context, attachmentID string) (*SessionAttachment, error) {
+	attachment, ok := r.items[attachmentID]
+	if !ok {
+		return nil, nil
+	}
+	return &attachment, nil
+}
+
+// GetByIDForUpdate is identical to GetByID in this test double.
+func (r *staleWorkerEventAttachments) GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error) {
+	return r.GetByID(ctx, attachmentID)
+}
+
+// ListByRemoteSession returns every stored attachment that belongs to
+// remoteSessionID. The result order follows Go map iteration and is therefore
+// unspecified.
+func (r *staleWorkerEventAttachments) ListByRemoteSession(_ context.Context, remoteSessionID string) ([]SessionAttachment, error) {
+	attachments := make([]SessionAttachment, 0)
+	for _, attachment := range r.items {
+		if attachment.RemoteSessionID == remoteSessionID {
+			attachments = append(attachments, attachment)
+		}
+	}
+	return attachments, nil
+}
+
+// ListActiveByRemoteSessionForUpdate delegates to ListByRemoteSession; this
+// test double does not filter by attachment state.
+func (r *staleWorkerEventAttachments) ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) {
+	return r.ListByRemoteSession(ctx, remoteSessionID)
+}
+
+// UpdateState overwrites the attachment's state and timestamps from params.
+func (r *staleWorkerEventAttachments) UpdateState(_ context.Context, params UpdateSessionAttachmentStateParams) error {
+	attachment := r.items[params.AttachmentID]
+	attachment.State = params.State
+	attachment.DetachedAt = params.DetachedAt
+	attachment.LastInputAt = params.LastInputAt
+	attachment.UpdatedAt = params.UpdatedAt
+	r.items[params.AttachmentID] = attachment
+	return nil
+}
+
+// Supersede marks the previous attachment as superseded by the next one and
+// records when it was detached.
+func (r *staleWorkerEventAttachments) Supersede(_ context.Context, params SupersedeAttachmentParams) error {
+	attachment := r.items[params.PreviousAttachmentID]
+	attachment.State = AttachmentStateSuperseded
+	// params is this call's own copy, so pointers into it stay valid for the stored value.
+	attachment.SupersededBy = &params.NextAttachmentID
+	attachment.DetachedAt = &params.DetachedAt
+	attachment.UpdatedAt = params.UpdatedAt
+	r.items[params.PreviousAttachmentID] = attachment
+	return nil
+}
+
+// staleWorkerEventPolicies is a resource policy repository that stores
+// nothing and finds nothing.
+type staleWorkerEventPolicies struct{}
+
+func (r *staleWorkerEventPolicies) GetByResourceID(_ context.Context, _ string) (*ResourcePolicy, error) {
+	return nil, nil
+}
+
+func (r *staleWorkerEventPolicies) Upsert(_ context.Context, _ ResourcePolicy) error {
+	return nil
+}
+
+// staleWorkerEventAudit counts created audit events without storing them.
+type staleWorkerEventAudit struct {
+	createCount int
+}
+
+func (r *staleWorkerEventAudit) Create(_ context.Context, _ AuditEvent) error {
+	r.createCount++
+	return nil
+}
+
+// staleWorkerEventResourceRuntime is a runtime repository that finds nothing.
+type staleWorkerEventResourceRuntime struct{}
+
+func (staleWorkerEventResourceRuntime) GetByID(_ context.Context, _ string) (*ResourceRuntimeSpec, error) {
+	return nil,
nil
+}
+
+// staleWorkerEventAccess is an access repository in which nobody is trusted
+// and nobody holds any role.
+type staleWorkerEventAccess struct{}
+
+func (staleWorkerEventAccess) IsTrustedDevice(_ context.Context, _, _ string) (bool, error) {
+	return false, nil
+}
+
+func (staleWorkerEventAccess) GetPlatformRole(_ context.Context, _ string) (string, error) {
+	return "", nil
+}
+
+func (staleWorkerEventAccess) GetOrganizationRole(_ context.Context, _, _ string) (string, bool, error) {
+	return "", false, nil
+}
+
+// staleWorkerEventLiveState keeps live session state in a map and counts
+// upserts; controller bindings, attach tokens, heartbeats and worker routes
+// are accepted and discarded.
+type staleWorkerEventLiveState struct {
+	sessions map[string]*LiveSessionState
+	upsertCount int
+}
+
+// UpsertSession stores a private copy of state and bumps upsertCount.
+func (s *staleWorkerEventLiveState) UpsertSession(_ context.Context, state LiveSessionState) error {
+	copied := state
+	s.sessions[state.SessionID] = &copied
+	s.upsertCount++
+	return nil
+}
+
+// GetSession returns a copy of the stored state, or (nil, nil) when absent,
+// so callers cannot mutate the map entry through the result.
+func (s *staleWorkerEventLiveState) GetSession(_ context.Context, sessionID string) (*LiveSessionState, error) {
+	state := s.sessions[sessionID]
+	if state == nil {
+		return nil, nil
+	}
+	copied := *state
+	return &copied, nil
+}
+
+func (s *staleWorkerEventLiveState) DeleteSession(_ context.Context, sessionID string) error {
+	delete(s.sessions, sessionID)
+	return nil
+}
+
+func (s *staleWorkerEventLiveState) BindController(_ context.Context, _ sessioncontracts.ControllerBinding, _ time.Duration) error {
+	return nil
+}
+
+func (s *staleWorkerEventLiveState) GetControllerBinding(_ context.Context, _ string) (*sessioncontracts.ControllerBinding, error) {
+	return nil, nil
+}
+
+func (s *staleWorkerEventLiveState) ClearControllerBinding(_ context.Context, _ string) error {
+	return nil
+}
+
+func (s *staleWorkerEventLiveState) StoreAttachToken(_ context.Context, _ sessioncontracts.AttachTokenClaims, _ time.Duration) error {
+	return nil
+}
+
+func (s *staleWorkerEventLiveState) ConsumeAttachToken(_ context.Context, _ string) (*sessioncontracts.AttachTokenClaims, error) {
+	return nil, nil
+}
+
+func (s *staleWorkerEventLiveState) TouchAttachmentHeartbeat(_ context.Context, _, _ string, _ time.Duration) error {
+	return nil
+}
+
+func (s
*staleWorkerEventLiveState) UpdateWorkerRoute(_ context.Context, _ WorkerRoute, _ time.Duration) error {
+	return nil
+}
+
+func (s *staleWorkerEventLiveState) GetWorkerRoute(_ context.Context, _ string) (*WorkerRoute, error) {
+	return nil, nil
+}
+
+func (s *staleWorkerEventLiveState) DeleteWorkerRoute(_ context.Context, _ string) error {
+	return nil
+}
+
+// staleWorkerEventOrchestrator is an orchestrator double that counts lease
+// releases, reports every runtime as healthy and ignores all other calls.
+type staleWorkerEventOrchestrator struct {
+	releaseCount int
+}
+
+func (o *staleWorkerEventOrchestrator) Reserve(_ context.Context, _ workercontracts.AttachRequest) (*workercontracts.WorkerLease, error) {
+	return nil, nil
+}
+
+func (o *staleWorkerEventOrchestrator) GetSessionLease(_ context.Context, _ string) (*workercontracts.WorkerLease, error) {
+	return nil, nil
+}
+
+func (o *staleWorkerEventOrchestrator) ReleaseSessionLease(_ context.Context, _ string) error {
+	o.releaseCount++
+	return nil
+}
+
+func (o *staleWorkerEventOrchestrator) PrepareAttachment(_ context.Context, _ RemoteSession, _ SessionAttachment, _ map[string]any) error {
+	return nil
+}
+
+func (o *staleWorkerEventOrchestrator) NotifyDetachment(_ context.Context, _ RemoteSession, _ SessionAttachment) error {
+	return nil
+}
+
+func (o *staleWorkerEventOrchestrator) TerminateRemoteSession(_ context.Context, _, _ string) error {
+	return nil
+}
+
+func (o *staleWorkerEventOrchestrator) ValidateSessionRuntime(_ context.Context, _, _ string) (bool, string, error) {
+	return true, "", nil
+}
diff --git a/backend/internal/modules/sessionbroker/state_machine.go b/backend/internal/modules/sessionbroker/state_machine.go
new file mode 100644
index 0000000..94b8963
--- /dev/null
+++ b/backend/internal/modules/sessionbroker/state_machine.go
@@ -0,0 +1,44 @@
+package sessionbroker
+
+import (
+	"fmt"
+
+	sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
+)
+
+// allowedTransitions maps each non-terminal session state to the set of
+// states it may move to. States without an entry have no outgoing transitions.
+var allowedTransitions = map[sessioncontracts.State]map[sessioncontracts.State]struct{}{
+	sessioncontracts.StateStarting: {
+
		sessioncontracts.StateActive: {},
+		sessioncontracts.StateFailed: {},
+		sessioncontracts.StateTerminated: {},
+	},
+	sessioncontracts.StateActive: {
+		sessioncontracts.StateDetached: {},
+		sessioncontracts.StateReconnecting: {},
+		sessioncontracts.StateFailed: {},
+		sessioncontracts.StateTerminated: {},
+	},
+	sessioncontracts.StateDetached: {
+		sessioncontracts.StateReconnecting: {},
+		sessioncontracts.StateTerminated: {},
+		sessioncontracts.StateFailed: {},
+	},
+	sessioncontracts.StateReconnecting: {
+		sessioncontracts.StateActive: {},
+		sessioncontracts.StateDetached: {},
+		sessioncontracts.StateFailed: {},
+		sessioncontracts.StateTerminated: {},
+	},
+}
+
+// validateTransition returns nil when from == to or when the move is listed
+// in allowedTransitions, and a descriptive error otherwise.
+func validateTransition(from, to sessioncontracts.State) error {
+	if from == to {
+		return nil
+	}
+	if allowed, ok := allowedTransitions[from]; ok {
+		if _, ok := allowed[to]; ok {
+			return nil
+		}
+	}
+	return fmt.Errorf("invalid session state transition: %s -> %s", from, to)
+}
diff --git a/backend/internal/modules/sessiongateway/module.go b/backend/internal/modules/sessiongateway/module.go
new file mode 100644
index 0000000..6561977
--- /dev/null
+++ b/backend/internal/modules/sessiongateway/module.go
@@ -0,0 +1,1118 @@
+package sessiongateway
+
+import (
+	"context"
+	"encoding/base64"
+	"errors"
+	"fmt"
+	"hash/fnv"
+	"log/slog"
+	"net/http"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/go-chi/chi/v5"
+	"github.com/google/uuid"
+	"github.com/gorilla/websocket"
+
+	"github.com/example/remote-access-platform/backend/internal/modules/sessionbroker"
+	"github.com/example/remote-access-platform/backend/internal/modules/worker"
+	"github.com/example/remote-access-platform/backend/internal/platform/httpx"
+	"github.com/example/remote-access-platform/backend/internal/platform/module"
+	messagecontracts "github.com/example/remote-access-platform/backend/pkg/contracts/message"
+	sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
+
	workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
+)
+
+// Size limits in bytes: 1 MiB clipboard text, 25 MiB per upload, 256 KiB per
+// upload chunk.
+const maxClipboardTextBytes = 1024 * 1024
+const maxFileUploadBytes = 25 * 1024 * 1024
+const maxFileUploadChunkBytes = 256 * 1024
+const liveSyncInterval = time.Second
+
+// safeTransferIDPattern accepts exactly 36 characters drawn from hex digits
+// and '-'; safeFileIDPattern accepts 1 to 160 characters drawn from letters,
+// digits, '.', '_', ':' and '-'.
+var safeTransferIDPattern = regexp.MustCompile(`^[a-fA-F0-9-]{36}$`)
+var safeFileIDPattern = regexp.MustCompile(`^[a-zA-Z0-9._:-]{1,160}$`)
+
+// gatewayConnection wraps one websocket connection. writeMu serialises
+// writes issued through writeJSON and writeMessage; uploads is a map of
+// per-connection file upload state.
+type gatewayConnection struct {
+	conn *websocket.Conn
+	writeMu sync.Mutex
+	uploads map[string]*fileUploadState
+}
+
+// fileUploadState holds the bookkeeping fields for one file upload.
+type fileUploadState struct {
+	TransferID string
+	FileName string
+	FileSize int64
+	TotalChunks int64
+	Received int64
+	NextIndex int64
+	ContentHash string
+}
+
+// writeJSON sends value as a JSON message under the write lock, after setting
+// a write deadline of now+timeout. A failure to set the deadline is ignored.
+func (c *gatewayConnection) writeJSON(timeout time.Duration, value any) error {
+	c.writeMu.Lock()
+	defer c.writeMu.Unlock()
+	_ = c.conn.SetWriteDeadline(time.Now().Add(timeout))
+	return c.conn.WriteJSON(value)
+}
+
+// writeMessage sends a raw websocket message under the write lock, after
+// setting a write deadline of now+timeout.
+func (c *gatewayConnection) writeMessage(timeout time.Duration, messageType int, data []byte) error {
+	c.writeMu.Lock()
+	defer c.writeMu.Unlock()
+	_ = c.conn.SetWriteDeadline(time.Now().Add(timeout))
+	return c.conn.WriteMessage(messageType, data)
+}
+
+// Module is the session gateway: it serves the websocket endpoint and uses
+// the session broker and worker services it was constructed with.
+type Module struct {
+	cfg module.Config
+	logger *slog.Logger
+	broker *sessionbroker.Service
+	workers *worker.Service
+	upgrader websocket.Upgrader
+}
+
+// NewModule wires the gateway from shared dependencies.
+// NOTE(review): CheckOrigin accepts every origin, so browser cross-origin
+// protection is disabled for this endpoint - confirm the attach token is the
+// only intended gate.
+func NewModule(deps module.Dependencies, broker *sessionbroker.Service, workers *worker.Service) *Module {
+	return &Module{
+		cfg: deps.Config,
+		logger: deps.Infra.Logger,
+		broker: broker,
+		workers: workers,
+		upgrader: websocket.Upgrader{
+			CheckOrigin: func(_ *http.Request) bool { return true },
+		},
+	}
+}
+
+func (m *Module) Name() string {
+	return "session-gateway"
+}
+
+// RegisterRoutes mounts GET /gateway/status and GET /gateway/ws.
+func (m *Module) RegisterRoutes(router chi.Router) {
+	router.Route("/gateway", func(r chi.Router) {
+		r.Get("/status", m.status)
+		r.Get("/ws", m.handleWebSocket)
+	})
+}
+
+// status reports the gateway's transport settings as JSON.
+func (m *Module) status(w http.ResponseWriter, _ *http.Request) {
+	httpx.WriteJSON(w, http.StatusOK,
map[string]any{
+		"module": m.Name(),
+		"transport": "websocket",
+		"attach_handshake": "query.attach_token",
+		"ping_every": m.cfg.WebSocket.PingInterval.String(),
+		"pong_deadby": m.cfg.WebSocket.PongWait.String(),
+	})
+}
+
+// handleWebSocket authenticates a client through the attach_token query
+// parameter, upgrades the request to a websocket and then drives the
+// connection until it ends: a reader goroutine processes inbound envelopes
+// while this goroutine sends the initial session state, periodic live-state
+// syncs and pings.
+//
+// Replies before the upgrade: 400 for a missing token, 401 when the broker
+// rejects it, 409 when the broker returns no claims or no live state.
+func (m *Module) handleWebSocket(w http.ResponseWriter, r *http.Request) {
+	token := r.URL.Query().Get("attach_token")
+	if token == "" {
+		httpx.WriteError(w, http.StatusBadRequest, "attach_token is required")
+		return
+	}
+	claims, state, err := m.broker.ConsumeAttachToken(r.Context(), token)
+	if err != nil {
+		httpx.WriteError(w, http.StatusUnauthorized, err.Error())
+		return
+	}
+	// Both values are dereferenced unconditionally below; fail the handshake
+	// instead of panicking if the broker returned either one as nil.
+	if claims == nil || state == nil {
+		httpx.WriteError(w, http.StatusConflict, sessionbroker.ErrSessionNotAttachable.Error())
+		return
+	}
+	conn, err := m.upgrader.Upgrade(w, r, nil)
+	if err != nil {
+		// Upgrade has already replied to the client with an HTTP error, so a
+		// second response must not be written here; just record the failure.
+		if m.logger != nil {
+			m.logger.Warn("session gateway websocket upgrade failed", "error", err)
+		}
+		return
+	}
+	defer conn.Close()
+	gatewayConn := &gatewayConnection{conn: conn, uploads: make(map[string]*fileUploadState)}
+
+	sessionCtx, cancel := context.WithCancel(r.Context())
+	defer cancel()
+
+	_ = conn.SetReadDeadline(time.Now().Add(m.cfg.WebSocket.PongWait))
+	// Every pong refreshes the attachment heartbeat and extends the read deadline.
+	conn.SetPongHandler(func(string) error {
+		if err := m.broker.TouchAttachmentHeartbeat(sessionCtx, claims.SessionID, claims.AttachmentID); err != nil {
+			return err
+		}
+		return conn.SetReadDeadline(time.Now().Add(m.cfg.WebSocket.PongWait))
+	})
+
+	if err := m.writeEnvelope(gatewayConn, sessioncontracts.TransportEnvelope{
+		Type: "session.state",
+		SessionID: claims.SessionID,
+		Payload: map[string]any{
+			"state": state.State,
+			"attachment_id": claims.AttachmentID,
+			"takeover_version": state.TakeoverVersion,
+			"reconnectable": claims.Reconnectable,
+			"worker_routing_id": claims.WorkerID,
+			"render": renderPayloadFromLiveState(state),
+		},
+		Event: m.newEventMessage(
+			"session.state."+string(state.State),
+			"events.session.state."+string(state.State),
+			"Session state updated.",
+			map[string]any{
+				"state": state.State,
+			},
+		),
+	}); err != nil {
+		return
+	}
+	// Seed the sync cursors from the state that was just announced so the
+	// first periodic sync does not resend it.
+	lastFrameSequence := int64(0)
+	lastClipboardSequence := int64(0)
+	lastFileDownloadSequence := int64(0)
+	lastFrameSequence = state.RenderFrameSequence
+	lastClipboardSequence = state.ClipboardSequence
+	if state.RenderFrameSequence > 0 && state.RenderFrameData != "" {
+		_ = m.writeFrameEnvelope(gatewayConn, claims.SessionID, *state)
+	}
+	if state.ClipboardSequence > 0 && state.ClipboardText != "" {
+		// Clipboard content is forwarded only when the server-to-client check
+		// passes; the cursor already points past it either way.
+		if err := m.ensureClipboardAllowed(sessionCtx, claims.SessionID, "server_to_client"); err == nil {
+			_ = m.writeClipboardEnvelope(gatewayConn, claims.SessionID, *state)
+		}
+	}
+
+	// Buffered so the reader goroutine can finish even after this handler returned.
+	readErr := make(chan error, 1)
+	go func() {
+		readErr <- m.readLoop(sessionCtx, gatewayConn, claims)
+	}()
+
+	pingTicker := time.NewTicker(m.cfg.WebSocket.PingInterval)
+	defer pingTicker.Stop()
+	syncTicker := time.NewTicker(liveSyncInterval)
+	defer syncTicker.Stop()
+
+	for {
+		select {
+		case err := <-readErr:
+			if err == nil {
+				return
+			}
+			// Best effort: the peer may already be gone.
+			_ = m.writeEnvelope(gatewayConn, sessioncontracts.TransportEnvelope{
+				Type: "transport.closed",
+				SessionID: claims.SessionID,
+				Payload: map[string]any{
+					"reason": err.Error(),
+				},
+				Event: m.newEventMessage(
+					"transport.closed",
+					"events.session.transport_closed",
+					"The session transport closed.",
+					map[string]any{
+						"reason": err.Error(),
+					},
+				),
+			})
+			return
+		case <-syncTicker.C:
+			if err := m.syncConnection(sessionCtx, gatewayConn, claims, &lastFrameSequence, &lastClipboardSequence, &lastFileDownloadSequence); err != nil {
+				return
+			}
+		case <-pingTicker.C:
+			if err := gatewayConn.writeMessage(m.cfg.WebSocket.WriteTimeout, websocket.PingMessage, []byte("ping")); err != nil {
+				return
+			}
+		case <-sessionCtx.Done():
+			return
+		}
+	}
+}
+
+// readLoop decodes inbound JSON envelopes and dispatches each one to
+// handleEnvelope until a read or handler error ends the loop.
+func (m *Module) readLoop(ctx context.Context, conn *gatewayConnection, claims *sessioncontracts.AttachTokenClaims) error {
+	for {
+		var envelope sessioncontracts.TransportEnvelope
+		if err := conn.conn.ReadJSON(&envelope); err != nil {
+			return err
+		}
+		if err := m.handleEnvelope(ctx, conn, claims,
envelope); err != nil { + return err + } + } +} + +func (m *Module) handleEnvelope(ctx context.Context, conn *gatewayConnection, claims *sessioncontracts.AttachTokenClaims, envelope sessioncontracts.TransportEnvelope) error { + switch envelope.Type { + case "heartbeat": + if err := m.broker.TouchAttachmentHeartbeat(ctx, claims.SessionID, claims.AttachmentID); err != nil { + return err + } + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "heartbeat.ack", + SessionID: claims.SessionID, + Event: m.newEventMessage( + "session.heartbeat_ack", + "events.session.heartbeat_ack", + "Session heartbeat acknowledged.", + nil, + ), + }) + case "control": + action, _ := envelope.Payload["action"].(string) + if action == "detach" { + _, err := m.broker.DetachFromSession(ctx, sessionbroker.DetachFromSessionCommand{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + UserID: claims.UserID, + Reason: "client_requested_detach", + }) + return err + } + return m.workers.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "control", + Payload: envelope.Payload, + }) + case "input": + session, err := m.broker.GetSessionSnapshot(ctx, claims.SessionID) + if err != nil { + return err + } + if session == nil { + return sessionbroker.ErrSessionNotFound + } + if session.State != sessioncontracts.StateActive { + m.logger.Warn("session gateway input envelope rejected because session is not active", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "state", session.State) + return sessionbroker.ErrSessionNotAttachable + } + action, _ := envelope.Payload["action"].(string) + kind, _ := envelope.Payload["kind"].(string) + correlationID, _ := envelope.Payload["correlation_id"].(string) + m.logger.Info("session gateway input envelope received", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "takeover_version", 
claims.TakeoverVersion, + "kind", kind, + "action", action, + "correlation_id", correlationID, + "trace_stage", "backend_receive") + err = m.workers.PublishInput(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "input", + Payload: envelope.Payload, + }) + if err != nil { + m.logger.Warn("session gateway input envelope rejected", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "kind", kind, + "action", action, + "reason", err.Error()) + return err + } + m.logger.Info("session gateway input envelope routed to worker", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "kind", kind, + "action", action, + "correlation_id", correlationID, + "trace_stage", "backend_route") + return nil + case "clipboard": + if ok, err := m.isCurrentController(ctx, claims); err != nil { + return err + } else if !ok { + return m.writeTakenOver(conn, claims.SessionID) + } + if err := m.ensureClipboardAllowed(ctx, claims.SessionID, "client_to_server"); err != nil { + return m.writeClipboardBlocked(conn, claims.SessionID, err.Error()) + } + text, _ := envelope.Payload["text"].(string) + if text == "" || len([]byte(text)) > maxClipboardTextBytes { + return m.writeClipboardBlocked(conn, claims.SessionID, "clipboard text is required") + } + contentHash, _ := envelope.Payload["content_hash"].(string) + if contentHash == "" { + contentHash = clipboardContentHash(text) + } + return m.workers.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "clipboard", + Payload: map[string]any{ + "direction": "client_to_server", + "text": text, + "sequence_id": time.Now().UTC().UnixNano(), + "origin": claims.AttachmentID, + "content_hash": contentHash, + "attachment_id": claims.AttachmentID, + }, + }) + case "file_upload.start": + return m.handleFileUploadStart(ctx, conn, claims, envelope) + case "file_upload.chunk": + 
return m.handleFileUploadChunk(ctx, conn, claims, envelope) + case "file_upload.cancel": + return m.handleFileUploadCancel(ctx, conn, claims, envelope) + case "file_download.start": + return m.handleFileDownloadControl(ctx, conn, claims, envelope, "start") + case "file_download.ack": + return m.handleFileDownloadControl(ctx, conn, claims, envelope, "ack") + case "file_download.cancel": + return m.handleFileDownloadControl(ctx, conn, claims, envelope, "cancel") + case "gui": + return m.workers.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "gui", + Payload: envelope.Payload, + }) + default: + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "error", + SessionID: claims.SessionID, + Payload: map[string]any{ + "message": "unsupported envelope type", + "type": envelope.Type, + }, + Event: m.newEventMessage( + "session.unsupported_envelope", + "events.session.unsupported_envelope", + "Unsupported session envelope type.", + map[string]any{ + "type": envelope.Type, + }, + ), + }) + } +} + +func (m *Module) handleFileUploadStart(ctx context.Context, conn *gatewayConnection, claims *sessioncontracts.AttachTokenClaims, envelope sessioncontracts.TransportEnvelope) error { + if ok, err := m.isCurrentController(ctx, claims); err != nil { + return err + } else if !ok { + return m.writeTakenOver(conn, claims.SessionID) + } + if err := m.ensureFileUploadAllowed(ctx, claims.SessionID); err != nil { + return m.writeFileUploadBlocked(conn, claims.SessionID, stringValue(envelope.Payload, "transfer_id"), err.Error()) + } + transferID, err := validateTransferID(stringValue(envelope.Payload, "transfer_id")) + if err != nil { + return m.writeFileUploadBlocked(conn, claims.SessionID, "", err.Error()) + } + fileName, err := sanitizeUploadFileName(stringValue(envelope.Payload, "file_name")) + if err != nil { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, 
err.Error()) + } + fileSize, ok := numberValue(envelope.Payload, "file_size") + if !ok || fileSize <= 0 || fileSize > maxFileUploadBytes { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "file size exceeds policy") + } + totalChunks, ok := numberValue(envelope.Payload, "total_chunks") + if !ok || totalChunks <= 0 || totalChunks > (maxFileUploadBytes/maxFileUploadChunkBytes)+1 { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "invalid upload chunk count") + } + if _, exists := conn.uploads[transferID]; exists { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "duplicate transfer_id") + } + contentHash := stringValue(envelope.Payload, "content_hash") + conn.uploads[transferID] = &fileUploadState{ + TransferID: transferID, + FileName: fileName, + FileSize: fileSize, + TotalChunks: totalChunks, + ContentHash: contentHash, + } + m.logger.Info("session gateway file upload start accepted", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "transfer_id", transferID, + "file_name", fileName, + "file_size", fileSize, + "total_chunks", totalChunks) + if err := m.workers.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "file_upload", + Payload: map[string]any{ + "action": "start", + "direction": "client_to_server", + "transfer_id": transferID, + "file_name": fileName, + "file_size": fileSize, + "total_chunks": totalChunks, + "content_hash": contentHash, + "attachment_id": claims.AttachmentID, + }, + }); err != nil { + delete(conn.uploads, transferID) + return err + } + return m.writeFileUploadProgress(conn, claims.SessionID, transferID, 0, fileSize, "started") +} + +func (m *Module) handleFileUploadChunk(ctx context.Context, conn *gatewayConnection, claims *sessioncontracts.AttachTokenClaims, envelope sessioncontracts.TransportEnvelope) error { + if ok, err := m.isCurrentController(ctx, claims); err != nil { + 
return err + } else if !ok { + return m.writeTakenOver(conn, claims.SessionID) + } + if err := m.ensureFileUploadAllowed(ctx, claims.SessionID); err != nil { + return m.writeFileUploadBlocked(conn, claims.SessionID, stringValue(envelope.Payload, "transfer_id"), err.Error()) + } + transferID, err := validateTransferID(stringValue(envelope.Payload, "transfer_id")) + if err != nil { + return m.writeFileUploadBlocked(conn, claims.SessionID, "", err.Error()) + } + state := conn.uploads[transferID] + if state == nil { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "unknown transfer_id") + } + chunkIndex, ok := numberValue(envelope.Payload, "chunk_index") + if !ok || chunkIndex != state.NextIndex || chunkIndex >= state.TotalChunks { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "invalid chunk index") + } + offset, ok := numberValue(envelope.Payload, "offset") + if !ok || offset != state.Received { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "invalid chunk offset") + } + chunkBytes := stringValue(envelope.Payload, "chunk_bytes") + decoded, err := base64.StdEncoding.DecodeString(chunkBytes) + if err != nil || len(decoded) == 0 || len(decoded) > maxFileUploadChunkBytes { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "invalid chunk payload") + } + if state.Received+int64(len(decoded)) > state.FileSize { + return m.writeFileUploadBlocked(conn, claims.SessionID, transferID, "chunk exceeds declared file size") + } + state.Received += int64(len(decoded)) + state.NextIndex++ + status := "transferring" + if state.Received == state.FileSize && state.NextIndex == state.TotalChunks { + status = "completed" + } + m.logger.Info("session gateway file upload chunk accepted", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "transfer_id", transferID, + "chunk_index", chunkIndex, + "chunk_size", len(decoded), + "received", state.Received, + "file_size", state.FileSize) 
+ if err := m.workers.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "file_upload", + Payload: map[string]any{ + "action": "chunk", + "direction": "client_to_server", + "transfer_id": transferID, + "chunk_index": chunkIndex, + "offset": offset, + "chunk_size": len(decoded), + "chunk_bytes": chunkBytes, + "attachment_id": claims.AttachmentID, + }, + }); err != nil { + return err + } + if status == "completed" { + delete(conn.uploads, transferID) + } + return m.writeFileUploadProgress(conn, claims.SessionID, transferID, state.Received, state.FileSize, status) +} + +func (m *Module) handleFileUploadCancel(ctx context.Context, conn *gatewayConnection, claims *sessioncontracts.AttachTokenClaims, envelope sessioncontracts.TransportEnvelope) error { + transferID, err := validateTransferID(stringValue(envelope.Payload, "transfer_id")) + if err != nil { + return m.writeFileUploadBlocked(conn, claims.SessionID, "", err.Error()) + } + delete(conn.uploads, transferID) + if err := m.workers.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "file_upload", + Payload: map[string]any{ + "action": "cancel", + "direction": "client_to_server", + "transfer_id": transferID, + "attachment_id": claims.AttachmentID, + }, + }); err != nil { + return err + } + return m.writeFileUploadProgress(conn, claims.SessionID, transferID, 0, 0, "cancelled") +} + +func (m *Module) handleFileDownloadControl(ctx context.Context, conn *gatewayConnection, claims *sessioncontracts.AttachTokenClaims, envelope sessioncontracts.TransportEnvelope, action string) error { + if ok, err := m.isCurrentController(ctx, claims); err != nil { + return err + } else if !ok { + return m.writeTakenOver(conn, claims.SessionID) + } + if err := m.ensureFileDownloadAllowed(ctx, claims.SessionID); err != nil { + return m.writeFileDownloadBlocked(conn, claims.SessionID, 
stringValue(envelope.Payload, "transfer_id"), err.Error()) + } + transferID, err := validateTransferID(stringValue(envelope.Payload, "transfer_id")) + if err != nil { + return m.writeFileDownloadBlocked(conn, claims.SessionID, "", err.Error()) + } + payload := map[string]any{ + "action": action, + "direction": "server_to_client", + "transfer_id": transferID, + "attachment_id": claims.AttachmentID, + } + switch action { + case "start": + fileID := strings.TrimSpace(stringValue(envelope.Payload, "file_id")) + if fileID == "" || !safeFileIDPattern.MatchString(fileID) { + return m.writeFileDownloadBlocked(conn, claims.SessionID, transferID, "invalid file_id") + } + payload["file_id"] = fileID + case "ack": + if sequence, ok := numberValue(envelope.Payload, "sequence"); ok && sequence >= 0 { + payload["sequence"] = sequence + } + if offset, ok := numberValue(envelope.Payload, "offset"); ok && offset >= 0 { + payload["offset"] = offset + } + case "cancel": + default: + return m.writeFileDownloadBlocked(conn, claims.SessionID, transferID, "invalid download action") + } + m.logger.Info("session gateway file download control accepted", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "transfer_id", transferID, + "action", action, + "file_id", payload["file_id"]) + return m.workers.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: claims.SessionID, + AttachmentID: claims.AttachmentID, + Type: "file_download", + Payload: payload, + }) +} + +func (m *Module) syncConnection(ctx context.Context, conn *gatewayConnection, claims *sessioncontracts.AttachTokenClaims, lastFrameSequence *int64, lastClipboardSequence *int64, lastFileDownloadSequence *int64) error { + binding, err := m.broker.GetControllerBinding(ctx, claims.SessionID) + if err != nil { + return err + } + if binding == nil { + session, err := m.broker.GetSessionSnapshot(ctx, claims.SessionID) + if err != nil { + return err + } + if session != nil { + switch session.State { + 
case sessioncontracts.StateFailed, sessioncontracts.StateTerminated: + if err := m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "session.state", + SessionID: claims.SessionID, + Payload: map[string]any{ + "state": session.State, + "takeover_version": session.TakeoverVersion, + "attachment_id": claims.AttachmentID, + "render": renderPayloadFromSessionSnapshot(session), + }, + Event: m.newEventMessage( + "session.state."+string(session.State), + "events.session.state."+string(session.State), + "Session state updated.", + map[string]any{ + "state": session.State, + }, + ), + }); err != nil { + return err + } + _ = m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "transport.closed", + SessionID: claims.SessionID, + Payload: map[string]any{ + "reason": string(session.State), + }, + Event: m.newEventMessage( + "transport.closed", + "events.session.transport_closed", + "The session transport closed.", + map[string]any{ + "reason": session.State, + }, + ), + }) + return websocket.ErrCloseSent + } + } + _ = m.writeTakenOver(conn, claims.SessionID) + return websocket.ErrCloseSent + } + if binding.AttachmentID != claims.AttachmentID || binding.TakeoverVersion != claims.TakeoverVersion { + _ = m.writeTakenOver(conn, claims.SessionID) + return websocket.ErrCloseSent + } + state, err := m.broker.GetLiveSession(ctx, claims.SessionID) + if err != nil { + return err + } + if state != nil { + if err := m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "session.state", + SessionID: claims.SessionID, + Payload: map[string]any{ + "state": state.State, + "takeover_version": state.TakeoverVersion, + "attachment_id": state.AttachmentID, + "render": renderPayloadFromLiveState(state), + }, + Event: m.newEventMessage( + "session.state."+string(state.State), + "events.session.state."+string(state.State), + "Session state updated.", + map[string]any{ + "state": state.State, + }, + ), + }); err != nil { + return err + } + if 
state.RenderFrameSequence > 0 && state.RenderFrameSequence != *lastFrameSequence && state.RenderFrameData != "" { + if err := m.writeFrameEnvelope(conn, claims.SessionID, *state); err != nil { + return err + } + *lastFrameSequence = state.RenderFrameSequence + } + m.logger.Info("session gateway clipboard sync evaluation", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "clipboard_sequence", state.ClipboardSequence, + "last_clipboard_sequence", *lastClipboardSequence, + "clipboard_text_bytes", len(state.ClipboardText), + "clipboard_origin", state.ClipboardOrigin) + if state.ClipboardSequence > 0 && state.ClipboardSequence != *lastClipboardSequence && state.ClipboardText != "" { + if err := m.ensureClipboardAllowed(ctx, claims.SessionID, "server_to_client"); err == nil { + m.logger.Info("session gateway clipboard sync allowed", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "sequence_id", state.ClipboardSequence) + if err := m.writeClipboardEnvelope(conn, claims.SessionID, *state); err != nil { + return err + } + } else { + m.logger.Warn("session gateway clipboard sync blocked", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "sequence_id", state.ClipboardSequence, + "reason", err.Error()) + } + *lastClipboardSequence = state.ClipboardSequence + } + if state.FileDownloadSequence > 0 && state.FileDownloadSequence != *lastFileDownloadSequence && state.FileDownloadType != "" { + if err := m.ensureFileDownloadAllowed(ctx, claims.SessionID); err == nil { + if err := m.writeFileDownloadEnvelope(conn, claims.SessionID, *state); err != nil { + return err + } + } else { + m.logger.Warn("session gateway file download sync blocked", + "session_id", claims.SessionID, + "attachment_id", claims.AttachmentID, + "sequence", state.FileDownloadSequence, + "reason", err.Error()) + } + *lastFileDownloadSequence = state.FileDownloadSequence + } + } + return nil +} + +func (m *Module) 
isCurrentController(ctx context.Context, claims *sessioncontracts.AttachTokenClaims) (bool, error) { + binding, err := m.broker.GetControllerBinding(ctx, claims.SessionID) + if err != nil { + return false, err + } + return binding != nil && binding.AttachmentID == claims.AttachmentID && binding.TakeoverVersion == claims.TakeoverVersion, nil +} + +func (m *Module) writeTakenOver(conn *gatewayConnection, sessionID string) error { + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "session.taken_over", + SessionID: sessionID, + Payload: map[string]any{ + "message": "controller binding changed", + }, + Event: m.newEventMessage( + "session.taken_over", + "events.session.taken_over", + "This session was taken over from another device.", + map[string]any{ + "reason": "controller_binding_changed", + }, + ), + }) +} + +func (m *Module) writeClipboardEnvelope(conn *gatewayConnection, sessionID string, state sessionbroker.LiveSessionState) error { + m.logger.Info("session gateway writing clipboard envelope", + "session_id", sessionID, + "sequence_id", state.ClipboardSequence, + "origin", state.ClipboardOrigin, + "content_hash", state.ClipboardContentHash, + "text_bytes", len(state.ClipboardText)) + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "clipboard.text", + SessionID: sessionID, + Payload: map[string]any{ + "direction": "server_to_client", + "text": state.ClipboardText, + "clipboard_sequence": state.ClipboardSequence, + "sequence_id": state.ClipboardSequence, + "origin": state.ClipboardOrigin, + "content_hash": state.ClipboardContentHash, + }, + Event: m.newEventMessage( + "clipboard.text", + "events.clipboard.text_received", + "Clipboard text received from the remote session.", + map[string]any{"direction": "server_to_client"}, + ), + }) +} + +func clipboardContentHash(text string) string { + hash := fnv.New64a() + _, _ = hash.Write([]byte(text)) + return fmt.Sprintf("%016x", hash.Sum64()) +} + +func (m *Module) 
writeClipboardBlocked(conn *gatewayConnection, sessionID, reason string) error { + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "clipboard.blocked", + SessionID: sessionID, + Payload: map[string]any{ + "reason": reason, + }, + Event: m.newEventMessage( + "clipboard.blocked", + "events.clipboard.blocked", + "Clipboard transfer is blocked by session state or resource policy.", + map[string]any{"reason": reason}, + ), + }) +} + +func (m *Module) writeFileUploadProgress(conn *gatewayConnection, sessionID, transferID string, received, total int64, status string) error { + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "file_upload.progress", + SessionID: sessionID, + Payload: map[string]any{ + "transfer_id": transferID, + "received": received, + "total": total, + "status": status, + }, + Event: m.newEventMessage( + "file_upload."+status, + "events.file_upload."+status, + "File upload status updated.", + map[string]any{"transfer_id": transferID, "status": status}, + ), + }) +} + +func (m *Module) writeFileUploadBlocked(conn *gatewayConnection, sessionID, transferID, reason string) error { + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "file_upload.blocked", + SessionID: sessionID, + Payload: map[string]any{ + "transfer_id": transferID, + "reason": reason, + }, + Event: m.newEventMessage( + "file_upload.blocked", + "events.file_upload.blocked", + "File upload is blocked by session state or resource policy.", + map[string]any{"reason": reason, "transfer_id": transferID}, + ), + }) +} + +func (m *Module) writeFileDownloadEnvelope(conn *gatewayConnection, sessionID string, state sessionbroker.LiveSessionState) error { + payload := map[string]any{} + for key, value := range state.FileDownloadPayload { + payload[key] = value + } + payload["sequence"] = state.FileDownloadSequence + payload["direction"] = "server_to_client" + envelopeType := fileDownloadEnvelopeType(state.FileDownloadType) + 
m.logger.Info("session gateway writing file download envelope", + "session_id", sessionID, + "type", envelopeType, + "sequence", state.FileDownloadSequence, + "transfer_id", payload["transfer_id"], + "file_id", payload["file_id"], + "file_name", payload["file_name"], + "status", payload["status"]) + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: envelopeType, + SessionID: sessionID, + Payload: payload, + Event: m.newEventMessage( + fileDownloadEventCode(envelopeType), + fileDownloadMessageKey(envelopeType), + fileDownloadFallback(envelopeType), + map[string]any{ + "transfer_id": payload["transfer_id"], + "file_id": payload["file_id"], + "file_name": payload["file_name"], + "status": payload["status"], + }, + ), + }) +} + +func (m *Module) writeFileDownloadBlocked(conn *gatewayConnection, sessionID, transferID, reason string) error { + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "file_download.blocked", + SessionID: sessionID, + Payload: map[string]any{ + "transfer_id": transferID, + "reason": reason, + "direction": "server_to_client", + }, + Event: m.newEventMessage( + "file_download.blocked", + "events.file_download.blocked", + "File download is blocked by session state or resource policy.", + map[string]any{"reason": reason, "transfer_id": transferID}, + ), + }) +} + +func (m *Module) ensureClipboardAllowed(ctx context.Context, sessionID, direction string) error { + mode, state, err := m.broker.GetSessionClipboardPolicy(ctx, sessionID) + if err != nil { + return err + } + if state != sessioncontracts.StateActive { + return sessionbroker.ErrSessionNotAttachable + } + switch direction { + case "client_to_server": + if mode == sessionbroker.ResourceClipboardModeClientToServer || mode == sessionbroker.ResourceClipboardModeBidirectional { + return nil + } + case "server_to_client": + if mode == sessionbroker.ResourceClipboardModeServerToClient || mode == sessionbroker.ResourceClipboardModeBidirectional { + return 
nil + } + } + return sessionbroker.ErrAccessDenied +} + +func (m *Module) ensureFileUploadAllowed(ctx context.Context, sessionID string) error { + mode, state, err := m.broker.GetSessionFileTransferPolicy(ctx, sessionID) + if err != nil { + return err + } + if state != sessioncontracts.StateActive { + return sessionbroker.ErrSessionNotAttachable + } + if mode == sessionbroker.ResourceFileTransferModeClientToServer || mode == sessionbroker.ResourceFileTransferModeBidirectional { + return nil + } + return sessionbroker.ErrAccessDenied +} + +func (m *Module) ensureFileDownloadAllowed(ctx context.Context, sessionID string) error { + mode, state, err := m.broker.GetSessionFileTransferPolicy(ctx, sessionID) + if err != nil { + return err + } + if state != sessioncontracts.StateActive { + return sessionbroker.ErrSessionNotAttachable + } + if mode == sessionbroker.ResourceFileTransferModeServerToClient || mode == sessionbroker.ResourceFileTransferModeBidirectional { + return nil + } + return sessionbroker.ErrAccessDenied +} + +func fileDownloadEnvelopeType(workerEventType string) string { + switch workerEventType { + case worker.SessionEventFileDownloadAvailable: + return "file_download.available" + case worker.SessionEventFileDownloadChunk: + return "file_download.chunk" + case worker.SessionEventFileDownloadCompleted: + return "file_download.completed" + case worker.SessionEventFileDownloadFailed: + return "file_download.failed" + case worker.SessionEventFileDownloadBlocked: + return "file_download.blocked" + default: + return "file_download.progress" + } +} + +func fileDownloadEventCode(envelopeType string) string { + return strings.ReplaceAll(envelopeType, ".", "_") +} + +func fileDownloadMessageKey(envelopeType string) string { + return "events." + envelopeType +} + +func fileDownloadFallback(envelopeType string) string { + switch envelopeType { + case "file_download.available": + return "A file is available for download from the remote session." 
+ case "file_download.chunk": + return "File download chunk received." + case "file_download.completed": + return "File download completed." + case "file_download.failed": + return "File download failed." + case "file_download.blocked": + return "File download is blocked by session state or resource policy." + default: + return "File download status updated." + } +} + +func validateTransferID(value string) (string, error) { + if value == "" || !safeTransferIDPattern.MatchString(value) { + return "", errors.New("invalid transfer_id") + } + parsed, err := uuid.Parse(value) + if err != nil { + return "", errors.New("invalid transfer_id") + } + return parsed.String(), nil +} + +func sanitizeUploadFileName(value string) (string, error) { + name := strings.TrimSpace(value) + if name == "" || name == "." || name == ".." { + return "", errors.New("invalid file name") + } + if strings.Contains(name, "..") || strings.ContainsAny(name, `/\:`) || filepath.IsAbs(name) { + return "", errors.New("invalid file name") + } + if len([]byte(name)) > 255 { + return "", errors.New("file name is too long") + } + return name, nil +} + +func stringValue(payload map[string]any, key string) string { + if payload == nil { + return "" + } + value, _ := payload[key].(string) + return value +} + +func numberValue(payload map[string]any, key string) (int64, bool) { + if payload == nil { + return 0, false + } + switch value := payload[key].(type) { + case int: + return int64(value), true + case int64: + return value, true + case float64: + return int64(value), value == float64(int64(value)) + default: + return 0, false + } +} + +func (m *Module) writeFrameEnvelope(conn *gatewayConnection, sessionID string, state sessionbroker.LiveSessionState) error { + if state.LastInputCorrelationID != "" { + m.logger.Info("session gateway frame envelope writing", + "session_id", sessionID, + "frame_sequence", state.RenderFrameSequence, + "correlation_id", state.LastInputCorrelationID, + 
"worker_frame_captured_at", state.WorkerFrameCapturedAt, + "trace_stage", "backend_frame_to_client") + } + return m.writeEnvelope(conn, sessioncontracts.TransportEnvelope{ + Type: "session.frame", + SessionID: sessionID, + Payload: map[string]any{ + "frame_sequence": state.RenderFrameSequence, + "frame_format": state.RenderFrameFormat, + "frame_data": state.RenderFrameData, + "frame_width": state.RenderWidth, + "frame_height": state.RenderHeight, + "frame_stride": state.RenderWidth * 4, + "frame_update_kind": "full", + "desktop_width": state.RenderWidth, + "desktop_height": state.RenderHeight, + "region_x": 0, + "region_y": 0, + "region_width": state.RenderWidth, + "region_height": state.RenderHeight, + "input_correlation_id": state.LastInputCorrelationID, + "worker_frame_captured_at": state.WorkerFrameCapturedAt, + "render": renderPayloadFromLiveState(&state), + }, + Event: m.newEventMessage( + "session.frame", + "events.session.frame", + "The session desktop frame updated.", + map[string]any{ + "frame_sequence": state.RenderFrameSequence, + }, + ), + }) +} + +func renderPayloadFromLiveState(state *sessionbroker.LiveSessionState) map[string]any { + if state == nil { + return map[string]any{} + } + return map[string]any{ + "quality_profile": state.RenderQualityProfile, + "render_quality_profile": state.RenderQualityProfile, + "state": state.RenderState, + "render_state": state.RenderState, + "width": state.RenderWidth, + "height": state.RenderHeight, + "frame_sequence": state.RenderFrameSequence, + "frame_format": state.RenderFrameFormat, + "cursor_x": state.CursorX, + "cursor_y": state.CursorY, + "cursor_visible": state.CursorVisible, + "dirty_rectangles": state.DirtyRectangles, + "last_render_at": state.LastRenderAt, + } +} + +func renderPayloadFromSessionSnapshot(session *sessionbroker.RemoteSession) map[string]any { + if session == nil { + return map[string]any{} + } + return map[string]any{ + "quality_profile": session.RenderQualityProfile, + 
"render_quality_profile": session.RenderQualityProfile, + "state": session.State, + "render_state": session.State, + "width": 0, + "height": 0, + "cursor_x": 0, + "cursor_y": 0, + "cursor_visible": true, + "dirty_rectangles": 0, + "last_render_at": session.LastHeartbeatAt, + } +} + +func (m *Module) writeEnvelope(conn *gatewayConnection, envelope sessioncontracts.TransportEnvelope) error { + return conn.writeJSON(m.cfg.WebSocket.WriteTimeout, envelope) +} + +func (m *Module) newEventMessage(code, messageKey, fallback string, details map[string]any) *messagecontracts.Message { + event := httpx.NewMessage(code, messageKey, fallback, details, "") + return &event +} diff --git a/backend/internal/modules/worker/events.go b/backend/internal/modules/worker/events.go new file mode 100644 index 0000000..4b07985 --- /dev/null +++ b/backend/internal/modules/worker/events.go @@ -0,0 +1,29 @@ +package worker + +type SessionEvent struct { + Type string `json:"type"` + SessionID string `json:"session_id"` + WorkerID string `json:"worker_id"` + Payload map[string]any `json:"payload,omitempty"` +} + +const ( + SessionEventConnected = "session_connected" + SessionEventHeartbeat = "session_heartbeat" + SessionEventFailed = "session_failed" + SessionEventTerminated = "session_terminated" + SessionEventDisplayReady = "session_display_ready" + SessionEventRenderReady = "session_render_ready" + SessionEventRenderDirty = "session_render_dirty" + SessionEventRenderResized = "session_render_resized" + SessionEventCursorUpdated = "session_cursor_updated" + SessionEventFrame = "session_frame" + SessionEventClipboardText = "session_clipboard_text" + SessionEventFileUploaded = "session_file_upload_completed" + SessionEventFileDownloadAvailable = "session_file_download_available" + SessionEventFileDownloadChunk = "session_file_download_chunk" + SessionEventFileDownloadProgress = "session_file_download_progress" + SessionEventFileDownloadCompleted = "session_file_download_completed" + 
SessionEventFileDownloadFailed = "session_file_download_failed" + SessionEventFileDownloadBlocked = "session_file_download_blocked" +) diff --git a/backend/internal/modules/worker/monitor.go b/backend/internal/modules/worker/monitor.go new file mode 100644 index 0000000..06f590c --- /dev/null +++ b/backend/internal/modules/worker/monitor.go @@ -0,0 +1,52 @@ +package worker + +import ( + "context" + "errors" + "time" + + "github.com/example/remote-access-platform/backend/internal/modules/sessionbroker" +) + +type LeaseMonitor struct { + service *Service + broker *sessionbroker.Service + interval time.Duration +} + +func NewLeaseMonitor(service *Service, broker *sessionbroker.Service, interval time.Duration) *LeaseMonitor { + if interval <= 0 { + interval = 15 * time.Second + } + return &LeaseMonitor{ + service: service, + broker: broker, + interval: interval, + } +} + +func (m *LeaseMonitor) Run(ctx context.Context) error { + ticker := time.NewTicker(m.interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + stale, err := m.service.RecoverStaleLeases(ctx) + if err != nil { + return err + } + for _, lease := range stale { + err := m.broker.MarkSessionFailed(ctx, sessionbroker.MarkSessionFailedCommand{ + SessionID: lease.SessionID, + Reason: "worker_lease_stale_or_worker_missing", + }) + if err != nil && !errors.Is(err, sessionbroker.ErrSessionNotFound) && !errors.Is(err, sessionbroker.ErrSessionNotTerminable) { + return err + } + } + } + } +} diff --git a/backend/internal/modules/worker/processor.go b/backend/internal/modules/worker/processor.go new file mode 100644 index 0000000..dff7fae --- /dev/null +++ b/backend/internal/modules/worker/processor.go @@ -0,0 +1,153 @@ +package worker + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "time" + + "github.com/redis/go-redis/v9" + + "github.com/example/remote-access-platform/backend/internal/modules/sessionbroker" +) + +type EventProcessor 
struct { + client *redis.Client + broker *sessionbroker.Service +} + +func NewEventProcessor(client *redis.Client, broker *sessionbroker.Service) *EventProcessor { + return &EventProcessor{client: client, broker: broker} +} + +func (p *EventProcessor) Run(ctx context.Context) error { + for { + select { + case <-ctx.Done(): + return nil + default: + } + + result, err := p.client.BLPop(ctx, 5*time.Second, "worker:events").Result() + if err == redis.Nil { + continue + } + if err != nil { + if ctx.Err() != nil { + return nil + } + return fmt.Errorf("consume worker event: %w", err) + } + if len(result) != 2 { + continue + } + + var event SessionEvent + if err := json.Unmarshal([]byte(result[1]), &event); err != nil { + continue + } + if err := p.handleEvent(ctx, event); err != nil { + return err + } + } +} + +func (p *EventProcessor) handleEvent(ctx context.Context, event SessionEvent) error { + switch event.Type { + case SessionEventConnected, SessionEventDisplayReady: + if err := p.broker.HandleWorkerConnected(ctx, event.SessionID); err != nil { + return err + } + if len(event.Payload) > 0 { + if err := p.broker.UpdateWorkerRenderTelemetry(ctx, event.SessionID, event.Payload); err != nil && !errors.Is(err, sessionbroker.ErrSessionNotFound) { + return err + } + } + return nil + case SessionEventHeartbeat: + return p.broker.HandleWorkerHeartbeat(ctx, event.SessionID) + case SessionEventRenderReady, SessionEventRenderDirty, SessionEventRenderResized, SessionEventCursorUpdated, SessionEventFrame: + if len(event.Payload) == 0 { + return nil + } + if correlationID, _ := event.Payload["input_correlation_id"].(string); correlationID != "" { + slog.Info("worker frame event received", + "session_id", event.SessionID, + "worker_id", event.WorkerID, + "frame_sequence", event.Payload["frame_sequence"], + "correlation_id", correlationID, + "worker_frame_captured_at", event.Payload["worker_frame_captured_at"], + "trace_stage", "backend_frame_receive") + } + return 
p.updateRenderTelemetryWithRetry(ctx, event.SessionID, event.Payload) + case SessionEventClipboardText: + if len(event.Payload) == 0 { + return nil + } + slog.Info("worker clipboard event received", + "session_id", event.SessionID, + "worker_id", event.WorkerID, + "origin", event.Payload["origin"], + "sequence_id", event.Payload["sequence_id"], + "content_hash", event.Payload["content_hash"]) + return p.broker.UpdateWorkerClipboardText(ctx, event.SessionID, event.Payload) + case SessionEventFileUploaded: + slog.Info("worker file upload completed", + "session_id", event.SessionID, + "worker_id", event.WorkerID, + "transfer_id", event.Payload["transfer_id"], + "file_name", event.Payload["file_name"], + "file_size", event.Payload["file_size"], + "content_hash", event.Payload["content_hash"], + "storage_path", event.Payload["storage_path"]) + return nil + case SessionEventFileDownloadAvailable, SessionEventFileDownloadChunk, SessionEventFileDownloadProgress, + SessionEventFileDownloadCompleted, SessionEventFileDownloadFailed, SessionEventFileDownloadBlocked: + slog.Info("worker file download event received", + "session_id", event.SessionID, + "worker_id", event.WorkerID, + "event_type", event.Type, + "transfer_id", event.Payload["transfer_id"], + "file_id", event.Payload["file_id"], + "file_name", event.Payload["file_name"], + "status", event.Payload["status"]) + return p.broker.UpdateWorkerFileDownloadEvent(ctx, event.SessionID, event.Type, event.Payload) + case SessionEventFailed: + reason, _ := event.Payload["reason"].(string) + err := p.broker.MarkSessionFailed(ctx, sessionbroker.MarkSessionFailedCommand{ + SessionID: event.SessionID, + Reason: reason, + }) + if errors.Is(err, sessionbroker.ErrSessionNotFound) || errors.Is(err, sessionbroker.ErrSessionNotTerminable) { + return nil + } + return err + case SessionEventTerminated: + reason, _ := event.Payload["reason"].(string) + err := p.broker.TerminateSession(ctx, sessionbroker.TerminateSessionCommand{ + SessionID: 
event.SessionID, + Reason: reason, + }) + if errors.Is(err, sessionbroker.ErrSessionNotFound) || errors.Is(err, sessionbroker.ErrSessionNotTerminable) { + return nil + } + return err + default: + return nil + } +} + +func (p *EventProcessor) updateRenderTelemetryWithRetry(ctx context.Context, sessionID string, payload map[string]any) error { + var lastErr error + for attempt := 0; attempt < 10; attempt++ { + err := p.broker.UpdateWorkerRenderTelemetry(ctx, sessionID, payload) + if err == nil || errors.Is(err, sessionbroker.ErrSessionNotFound) { + return nil + } + lastErr = err + time.Sleep(100 * time.Millisecond) + } + return lastErr +} diff --git a/backend/internal/modules/worker/redis_store.go b/backend/internal/modules/worker/redis_store.go new file mode 100644 index 0000000..d3e827d --- /dev/null +++ b/backend/internal/modules/worker/redis_store.go @@ -0,0 +1,264 @@ +package worker + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "time" + + "github.com/redis/go-redis/v9" + + workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker" +) + +type RedisStore struct { + client *redis.Client +} + +func NewRedisStore(client *redis.Client) *RedisStore { + return &RedisStore{client: client} +} + +func (s *RedisStore) RegisterWorker(ctx context.Context, registration workercontracts.WorkerRegistration, ttl time.Duration) error { + payload, err := json.Marshal(registration) + if err != nil { + return fmt.Errorf("marshal worker registration: %w", err) + } + pipe := s.client.TxPipeline() + pipe.Set(ctx, workerKey(registration.WorkerID), payload, ttl) + pipe.SAdd(ctx, workerSetKey(), registration.WorkerID) + _, err = pipe.Exec(ctx) + if err != nil { + return fmt.Errorf("register worker: %w", err) + } + return nil +} + +func (s *RedisStore) TouchWorkerHeartbeat(ctx context.Context, heartbeat workercontracts.WorkerHeartbeat, ttl time.Duration) error { + registration, err := s.GetWorker(ctx, heartbeat.WorkerID) + if err != nil { + 
return err + } + if registration == nil { + registration = &workercontracts.WorkerRegistration{ + WorkerID: heartbeat.WorkerID, + Protocol: workercontracts.ProtocolRDP, + } + } + registration.Status = heartbeat.Status + registration.LastHeartbeatAt = heartbeat.LastHeartbeatAt + return s.RegisterWorker(ctx, *registration, ttl) +} + +func (s *RedisStore) ListWorkers(ctx context.Context) ([]workercontracts.WorkerRegistration, error) { + ids, err := s.client.SMembers(ctx, workerSetKey()).Result() + if err != nil { + return nil, fmt.Errorf("list worker ids: %w", err) + } + workers := make([]workercontracts.WorkerRegistration, 0, len(ids)) + for _, id := range ids { + worker, err := s.GetWorker(ctx, id) + if err != nil { + return nil, err + } + if worker != nil { + workers = append(workers, *worker) + } + } + return workers, nil +} + +func (s *RedisStore) GetWorker(ctx context.Context, workerID string) (*workercontracts.WorkerRegistration, error) { + payload, err := s.client.Get(ctx, workerKey(workerID)).Result() + if err == redis.Nil { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("get worker: %w", err) + } + var registration workercontracts.WorkerRegistration + if err := json.Unmarshal([]byte(payload), &registration); err != nil { + return nil, fmt.Errorf("decode worker registration: %w", err) + } + return &registration, nil +} + +func (s *RedisStore) AcquireLease(ctx context.Context, lease workercontracts.WorkerLease, ttl time.Duration) error { + payload, err := json.Marshal(lease) + if err != nil { + return fmt.Errorf("marshal lease: %w", err) + } + ok, err := s.client.SetNX(ctx, leaseKey(lease.LeaseID), payload, ttl).Result() + if err != nil { + return fmt.Errorf("acquire lease: %w", err) + } + if !ok { + return fmt.Errorf("lease already exists") + } + pipe := s.client.TxPipeline() + pipe.SAdd(ctx, leaseSetKey(), lease.LeaseID) + pipe.Set(ctx, sessionLeaseKey(lease.SessionID), lease.LeaseID, ttl) + _, err = pipe.Exec(ctx) + if err != nil { + return
fmt.Errorf("index lease: %w", err) + } + return nil +} + +func (s *RedisStore) GetLease(ctx context.Context, leaseID string) (*workercontracts.WorkerLease, error) { + payload, err := s.client.Get(ctx, leaseKey(leaseID)).Result() + if err == redis.Nil { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("get lease: %w", err) + } + var lease workercontracts.WorkerLease + if err := json.Unmarshal([]byte(payload), &lease); err != nil { + return nil, fmt.Errorf("decode lease: %w", err) + } + return &lease, nil +} + +func (s *RedisStore) GetLeaseBySession(ctx context.Context, sessionID string) (*workercontracts.WorkerLease, error) { + leaseID, err := s.client.Get(ctx, sessionLeaseKey(sessionID)).Result() + if err == redis.Nil { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("get lease by session: %w", err) + } + return s.GetLease(ctx, leaseID) +} + +func (s *RedisStore) RenewLease(ctx context.Context, lease workercontracts.WorkerLease, ttl time.Duration) error { + payload, err := json.Marshal(lease) + if err != nil { + return fmt.Errorf("marshal lease renewal: %w", err) + } + pipe := s.client.TxPipeline() + pipe.Set(ctx, leaseKey(lease.LeaseID), payload, ttl) + pipe.Set(ctx, sessionLeaseKey(lease.SessionID), lease.LeaseID, ttl) + _, err = pipe.Exec(ctx) + if err != nil { + return fmt.Errorf("renew lease: %w", err) + } + return nil +} + +func (s *RedisStore) ReleaseLease(ctx context.Context, leaseID string) error { + lease, err := s.GetLease(ctx, leaseID) + if err != nil { + return err + } + pipe := s.client.TxPipeline() + pipe.Del(ctx, leaseKey(leaseID)) + pipe.SRem(ctx, leaseSetKey(), leaseID) + if lease != nil { + pipe.Del(ctx, sessionLeaseKey(lease.SessionID)) + } + _, err = pipe.Exec(ctx) + if err != nil { + return fmt.Errorf("release lease: %w", err) + } + return nil +} + +func (s *RedisStore) ListLeases(ctx context.Context) ([]workercontracts.WorkerLease, error) { + ids, err := s.client.SMembers(ctx, leaseSetKey()).Result() + if err 
!= nil { + return nil, fmt.Errorf("list lease ids: %w", err) + } + leases := make([]workercontracts.WorkerLease, 0, len(ids)) + for _, id := range ids { + lease, err := s.GetLease(ctx, id) + if err != nil { + return nil, err + } + if lease != nil { + leases = append(leases, *lease) + } + } + return leases, nil +} + +func (s *RedisStore) AppendEnvelope(ctx context.Context, envelope workercontracts.RoutedEnvelope) error { + payload, err := json.Marshal(envelope) + if err != nil { + return fmt.Errorf("marshal routed envelope: %w", err) + } + key := workerQueueKey(envelope.SessionID) + if err := s.client.RPush(ctx, key, payload).Err(); err != nil { + return fmt.Errorf("append routed envelope: %w", err) + } + if envelope.Type == "input" { + correlationID, _ := envelope.Payload["correlation_id"].(string) + if correlationID != "" { + if length, err := s.client.LLen(ctx, key).Result(); err == nil { + slog.Info("worker queue length after input append", + "session_id", envelope.SessionID, + "attachment_id", envelope.AttachmentID, + "correlation_id", correlationID, + "queue_key", key, + "queue_length", length, + "trace_stage", "redis_queue_append") + } + } + } + return s.client.Expire(ctx, key, 10*time.Minute).Err() +} + +func (s *RedisStore) AppendAssignment(ctx context.Context, workerID string, payload map[string]any) error { + encoded, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshal worker assignment: %w", err) + } + if err := s.client.RPush(ctx, workerControlQueueKey(workerID), encoded).Err(); err != nil { + return fmt.Errorf("append worker assignment: %w", err) + } + return s.client.Expire(ctx, workerControlQueueKey(workerID), 10*time.Minute).Err() +} + +func (s *RedisStore) AppendEvent(ctx context.Context, payload map[string]any) error { + encoded, err := json.Marshal(payload) + if err != nil { + return fmt.Errorf("marshal worker event: %w", err) + } + if err := s.client.RPush(ctx, workerEventsKey(), encoded).Err(); err != nil { + return 
fmt.Errorf("append worker event: %w", err) + } + return s.client.Expire(ctx, workerEventsKey(), 10*time.Minute).Err() +} + +func workerKey(workerID string) string { + return "worker:registration:" + workerID +} + +func workerSetKey() string { + return "worker:registrations" +} + +func leaseKey(leaseID string) string { + return "worker:lease:" + leaseID +} + +func leaseSetKey() string { + return "worker:leases" +} + +func sessionLeaseKey(sessionID string) string { + return "worker:session-lease:" + sessionID +} + +func workerQueueKey(sessionID string) string { + return "worker:queue:" + sessionID +} + +func workerControlQueueKey(workerID string) string { + return "worker:control:" + workerID +} + +func workerEventsKey() string { + return "worker:events" +} diff --git a/backend/internal/modules/worker/service.go b/backend/internal/modules/worker/service.go new file mode 100644 index 0000000..55ac056 --- /dev/null +++ b/backend/internal/modules/worker/service.go @@ -0,0 +1,274 @@ +package worker + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "slices" + "time" + + "github.com/google/uuid" + + "github.com/example/remote-access-platform/backend/internal/modules/sessionbroker" + "github.com/example/remote-access-platform/backend/internal/platform/module" + workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker" +) + +var ErrNoWorkerAvailable = errors.New("no worker available") + +type Service struct { + cfg module.Config + store Store + now func() time.Time +} + +func NewService(deps module.Dependencies, store Store) *Service { + return &Service{ + cfg: deps.Config, + store: store, + now: time.Now, + } +} + +func (s *Service) Register(ctx context.Context, registration workercontracts.WorkerRegistration) error { + if registration.WorkerID == "" { + return fmt.Errorf("worker id is required") + } + registration.LastHeartbeatAt = s.now().UTC() + return s.store.RegisterWorker(ctx, registration, s.cfg.Worker.HeartbeatTTL) +} + 
+func (s *Service) Heartbeat(ctx context.Context, heartbeat workercontracts.WorkerHeartbeat) error { + heartbeat.LastHeartbeatAt = s.now().UTC() + return s.store.TouchWorkerHeartbeat(ctx, heartbeat, s.cfg.Worker.HeartbeatTTL) +} + +func (s *Service) Reserve(ctx context.Context, request workercontracts.AttachRequest) (*workercontracts.WorkerLease, error) { + registration, err := s.reserveWorker(ctx, workercontracts.ProtocolRDP, request.RequiredCapabilities) + if err != nil { + return nil, err + } + return s.AcquireLease(ctx, registration.WorkerID, request) +} + +func (s *Service) reserveWorker(ctx context.Context, protocol workercontracts.Protocol, capabilities []string) (*workercontracts.WorkerRegistration, error) { + workers, err := s.store.ListWorkers(ctx) + if err != nil { + return nil, err + } + now := s.now().UTC() + for _, worker := range workers { + if worker.Protocol != protocol || worker.Status != workercontracts.StatusOnline { + continue + } + if now.Sub(worker.LastHeartbeatAt) > s.cfg.Worker.StaleLeaseGracePeriod+s.cfg.Worker.HeartbeatTTL { + continue + } + if !hasCapabilities(worker.Capabilities, capabilities) { + continue + } + return &worker, nil + } + return nil, ErrNoWorkerAvailable +} + +func (s *Service) AcquireLease(ctx context.Context, workerID string, request workercontracts.AttachRequest) (*workercontracts.WorkerLease, error) { + if request.SessionID == "" { + request.SessionID = uuid.NewString() + } + now := s.now().UTC() + lease := workercontracts.WorkerLease{ + LeaseID: uuid.NewString(), + WorkerID: workerID, + Protocol: workercontracts.ProtocolRDP, + ResourceID: request.ResourceID, + SessionID: request.SessionID, + Capabilities: request.RequiredCapabilities, + ControlStream: "worker://control/" + workerID, + ExpiresAt: now.Add(s.cfg.Worker.LeaseTTL), + RenderQualityProfile: normalizeRenderQualityProfile(request.RenderQualityProfile), + } + if err := s.store.AcquireLease(ctx, lease, s.cfg.Worker.LeaseTTL); err != nil { + return nil, err + } 
+ return &lease, nil +} + +func (s *Service) GetSessionLease(ctx context.Context, sessionID string) (*workercontracts.WorkerLease, error) { + return s.store.GetLeaseBySession(ctx, sessionID) +} + +func (s *Service) RenewLease(ctx context.Context, leaseID string) (*workercontracts.WorkerLease, error) { + lease, err := s.store.GetLease(ctx, leaseID) + if err != nil || lease == nil { + return lease, err + } + lease.ExpiresAt = s.now().UTC().Add(s.cfg.Worker.LeaseTTL) + if err := s.store.RenewLease(ctx, *lease, s.cfg.Worker.LeaseTTL); err != nil { + return nil, err + } + return lease, nil +} + +func (s *Service) ReleaseLease(ctx context.Context, leaseID string) error { + return s.store.ReleaseLease(ctx, leaseID) +} + +func (s *Service) ReleaseSessionLease(ctx context.Context, sessionID string) error { + lease, err := s.store.GetLeaseBySession(ctx, sessionID) + if err != nil || lease == nil { + return err + } + return s.store.ReleaseLease(ctx, lease.LeaseID) +} + +func (s *Service) RecoverStaleLeases(ctx context.Context) ([]workercontracts.WorkerLease, error) { + leases, err := s.store.ListLeases(ctx) + if err != nil { + return nil, err + } + var stale []workercontracts.WorkerLease + now := s.now().UTC() + for _, lease := range leases { + registration, err := s.store.GetWorker(ctx, lease.WorkerID) + if err != nil { + return nil, err + } + if registration == nil || now.Sub(registration.LastHeartbeatAt) > s.cfg.Worker.StaleLeaseGracePeriod+s.cfg.Worker.HeartbeatTTL { + if err := s.store.ReleaseLease(ctx, lease.LeaseID); err != nil { + return nil, err + } + stale = append(stale, lease) + } + } + return stale, nil +} + +func (s *Service) ValidateSessionRuntime(ctx context.Context, sessionID, workerID string) (bool, string, error) { + lease, err := s.store.GetLeaseBySession(ctx, sessionID) + if err != nil { + return false, "", err + } + if lease == nil { + return false, "worker_lease_missing", nil + } + if workerID != "" && lease.WorkerID != workerID { + _ = 
s.store.ReleaseLease(ctx, lease.LeaseID) + return false, "worker_binding_mismatch", nil + } + now := s.now().UTC() + if !lease.ExpiresAt.After(now) { + _ = s.store.ReleaseLease(ctx, lease.LeaseID) + return false, "worker_lease_expired", nil + } + registration, err := s.store.GetWorker(ctx, lease.WorkerID) + if err != nil { + return false, "", err + } + if registration == nil { + _ = s.store.ReleaseLease(ctx, lease.LeaseID) + return false, "worker_registration_missing", nil + } + if registration.Status != workercontracts.StatusOnline { + return false, "worker_not_online", nil + } + if now.Sub(registration.LastHeartbeatAt) > s.cfg.Worker.StaleLeaseGracePeriod+s.cfg.Worker.HeartbeatTTL { + _ = s.store.ReleaseLease(ctx, lease.LeaseID) + return false, "worker_heartbeat_stale", nil + } + return true, "", nil +} + +func (s *Service) PublishControl(ctx context.Context, envelope workercontracts.RoutedEnvelope) error { + return s.store.AppendEnvelope(ctx, envelope) +} + +func (s *Service) PublishInput(ctx context.Context, envelope workercontracts.RoutedEnvelope) error { + return s.store.AppendEnvelope(ctx, envelope) +} + +func hasCapabilities(workerCaps, required []string) bool { + for _, capability := range required { + if !slices.Contains(workerCaps, capability) { + return false + } + } + return true +} + +func (s *Service) PrepareAttachment(ctx context.Context, session sessionbroker.RemoteSession, attachment sessionbroker.SessionAttachment, runtimeMetadata map[string]any) error { + renderQualityProfile := normalizeRenderQualityProfile(session.RenderQualityProfile) + if renderQualityProfile == "balanced" { + renderQualityProfile = renderQualityProfileFromMetadata(session.Metadata) + } + if runtimeMetadata == nil { + runtimeMetadata = decodeMetadata(session.Metadata) + } + return s.store.AppendAssignment(ctx, session.WorkerID, map[string]any{ + "type": "session_assignment", + "session_id": session.ID, + "worker_id": session.WorkerID, + "attachment_id": attachment.ID, + 
"user_id": attachment.UserID, + "device_id": attachment.DeviceID, + "takeover_of": attachment.TakeoverOf, + "state": session.State, + "render_quality_profile": renderQualityProfile, + "metadata": runtimeMetadata, + }) +} + +func (s *Service) NotifyDetachment(ctx context.Context, session sessionbroker.RemoteSession, attachment sessionbroker.SessionAttachment) error { + return s.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: session.ID, + AttachmentID: attachment.ID, + Type: "control", + Payload: map[string]any{ + "action": "detach", + }, + }) +} + +func (s *Service) TerminateRemoteSession(ctx context.Context, sessionID, attachmentID string) error { + return s.PublishControl(ctx, workercontracts.RoutedEnvelope{ + SessionID: sessionID, + AttachmentID: attachmentID, + Type: "control", + Payload: map[string]any{ + "action": "terminate", + }, + }) +} + +func decodeMetadata(payload []byte) map[string]any { + var out map[string]any + if len(payload) == 0 { + return map[string]any{} + } + if err := json.Unmarshal(payload, &out); err != nil { + return map[string]any{} + } + return out +} + +func normalizeRenderQualityProfile(profile string) string { + switch profile { + case "low_bandwidth", "balanced", "high_quality", "text_priority": + return profile + default: + return "balanced" + } +} + +func renderQualityProfileFromMetadata(metadata []byte) string { + decoded := decodeMetadata(metadata) + resource, _ := decoded["resource"].(map[string]any) + if resource == nil { + return "balanced" + } + if profile, ok := resource["render_quality_profile"].(string); ok && profile != "" { + return normalizeRenderQualityProfile(profile) + } + return "balanced" +} diff --git a/backend/internal/modules/worker/store.go b/backend/internal/modules/worker/store.go new file mode 100644 index 0000000..a73eb20 --- /dev/null +++ b/backend/internal/modules/worker/store.go @@ -0,0 +1,24 @@ +package worker + +import ( + "context" + "time" + + workercontracts 
"github.com/example/remote-access-platform/backend/pkg/contracts/worker" +) + +type Store interface { + RegisterWorker(ctx context.Context, registration workercontracts.WorkerRegistration, ttl time.Duration) error + TouchWorkerHeartbeat(ctx context.Context, heartbeat workercontracts.WorkerHeartbeat, ttl time.Duration) error + ListWorkers(ctx context.Context) ([]workercontracts.WorkerRegistration, error) + GetWorker(ctx context.Context, workerID string) (*workercontracts.WorkerRegistration, error) + AcquireLease(ctx context.Context, lease workercontracts.WorkerLease, ttl time.Duration) error + GetLease(ctx context.Context, leaseID string) (*workercontracts.WorkerLease, error) + GetLeaseBySession(ctx context.Context, sessionID string) (*workercontracts.WorkerLease, error) + RenewLease(ctx context.Context, lease workercontracts.WorkerLease, ttl time.Duration) error + ReleaseLease(ctx context.Context, leaseID string) error + ListLeases(ctx context.Context) ([]workercontracts.WorkerLease, error) + AppendAssignment(ctx context.Context, workerID string, payload map[string]any) error + AppendEnvelope(ctx context.Context, envelope workercontracts.RoutedEnvelope) error + AppendEvent(ctx context.Context, payload map[string]any) error +} diff --git a/backend/internal/platform/authority/authority.go b/backend/internal/platform/authority/authority.go new file mode 100644 index 0000000..01a6582 --- /dev/null +++ b/backend/internal/platform/authority/authority.go @@ -0,0 +1,329 @@ +package authority + +import ( + "context" + "crypto/ed25519" + "crypto/sha256" + "crypto/x509" + "encoding/base64" + "encoding/hex" + "encoding/json" + "encoding/pem" + "errors" + "fmt" + "strings" + "time" + + "github.com/jackc/pgx/v5" + + "github.com/example/remote-access-platform/backend/internal/platform/config" + postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres" +) + +const ( + ModeStrict = "strict" + ModeLegacy = "legacy" + + ActivationSchemaVersion = 
"rap.installation.activation.v1" + + PlatformRoleUser = "user" + PlatformRoleAdmin = "platform_admin" + PlatformRoleRecoveryAdmin = "platform_recovery_admin" +) + +var ( + ErrInvalidAuthorityMode = errors.New("invalid installation authority mode") + ErrProductRootKeyNeeded = errors.New("product root public key is required") + ErrInvalidActivation = errors.New("invalid installation activation") + ErrInvalidGrant = errors.New("invalid platform role grant") +) + +type ActivationPayload struct { + SchemaVersion string `json:"schema_version"` + InstallID string `json:"install_id"` + OwnerEmail string `json:"owner_email"` + PlatformRole string `json:"platform_role"` + IssuedAt time.Time `json:"issued_at"` + ExpiresAt *time.Time `json:"expires_at,omitempty"` + Nonce string `json:"nonce,omitempty"` + Environment string `json:"environment,omitempty"` +} + +type Verifier struct { + mode string + rootPublicKey ed25519.PublicKey + rootFingerprint string + allowInsecureBootstrap bool + now func() time.Time +} + +func NewVerifier(cfg config.InstallationConfig) (*Verifier, error) { + mode := strings.ToLower(strings.TrimSpace(cfg.AuthorityMode)) + if mode == "" { + mode = ModeLegacy + } + verifier := &Verifier{ + mode: mode, + allowInsecureBootstrap: cfg.AllowInsecureBootstrap, + now: time.Now, + } + + switch mode { + case ModeLegacy: + return verifier, nil + case ModeStrict: + publicKey, err := decodeEd25519PublicKey(cfg.ProductRootPublicKeyBase64) + if err != nil { + return nil, err + } + verifier.rootPublicKey = publicKey + fingerprint := sha256.Sum256(publicKey) + verifier.rootFingerprint = hex.EncodeToString(fingerprint[:]) + return verifier, nil + default: + return nil, fmt.Errorf("%w: %s", ErrInvalidAuthorityMode, mode) + } +} + +func (v *Verifier) Mode() string { + if v == nil || v.mode == "" { + return ModeLegacy + } + return v.mode +} + +func (v *Verifier) Strict() bool { + return v != nil && v.mode == ModeStrict +} + +func (v *Verifier) AllowInsecureBootstrap() bool { + 
return v != nil && v.allowInsecureBootstrap +} + +func (v *Verifier) RootFingerprint() string { + if v == nil { + return "" + } + return v.rootFingerprint +} + +func (v *Verifier) VerifyActivation(payload json.RawMessage, signature string) (ActivationPayload, error) { + if v == nil || !v.Strict() { + return ActivationPayload{}, ErrProductRootKeyNeeded + } + activation, canonical, err := parseActivationPayload(payload) + if err != nil { + return ActivationPayload{}, err + } + if err := activation.validate(v.now().UTC()); err != nil { + return ActivationPayload{}, err + } + if err := v.verifySignature(canonical, signature); err != nil { + return ActivationPayload{}, fmt.Errorf("%w: %v", ErrInvalidActivation, err) + } + return activation, nil +} + +func (v *Verifier) VerifyPlatformRoleGrant(payload json.RawMessage, signature, expectedInstallID, expectedEmail, expectedRole string) (ActivationPayload, error) { + activation, err := v.VerifyActivation(payload, signature) + if err != nil { + return ActivationPayload{}, fmt.Errorf("%w: %v", ErrInvalidGrant, err) + } + if activation.InstallID != strings.TrimSpace(expectedInstallID) { + return ActivationPayload{}, fmt.Errorf("%w: install_id mismatch", ErrInvalidGrant) + } + if !strings.EqualFold(activation.OwnerEmail, strings.TrimSpace(expectedEmail)) { + return ActivationPayload{}, fmt.Errorf("%w: owner_email mismatch", ErrInvalidGrant) + } + if activation.PlatformRole != strings.TrimSpace(expectedRole) { + return ActivationPayload{}, fmt.Errorf("%w: platform_role mismatch", ErrInvalidGrant) + } + return activation, nil +} + +func CanonicalJSON(raw json.RawMessage) ([]byte, error) { + if len(raw) == 0 { + return nil, fmt.Errorf("%w: empty payload", ErrInvalidActivation) + } + var value any + if err := json.Unmarshal(raw, &value); err != nil { + return nil, fmt.Errorf("%w: invalid json: %v", ErrInvalidActivation, err) + } + canonical, err := json.Marshal(value) + if err != nil { + return nil, fmt.Errorf("%w: canonical json: 
%v", ErrInvalidActivation, err) + } + return canonical, nil +} + +func EffectivePlatformRole(ctx context.Context, db postgresplatform.DBTX, verifier *Verifier, userID string) (string, error) { + userID = strings.TrimSpace(userID) + if userID == "" { + return PlatformRoleUser, nil + } + if verifier == nil || !verifier.Strict() { + return legacyPlatformRole(ctx, db, userID) + } + + var email string + if err := db.QueryRow(ctx, `SELECT email FROM users WHERE id = $1::uuid`, userID).Scan(&email); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return PlatformRoleUser, nil + } + return "", fmt.Errorf("get user email for platform grant: %w", err) + } + + rows, err := db.Query(ctx, ` +SELECT prg.role, prg.install_id, prg.grant_payload, prg.grant_signature +FROM platform_role_grants prg +JOIN installation_authority ia + ON ia.id = 1 + AND ia.install_id = prg.install_id + AND ia.authority_state = 'active' +WHERE prg.user_id = $1::uuid + AND prg.revoked_at IS NULL + AND (prg.expires_at IS NULL OR prg.expires_at > NOW()) +ORDER BY CASE prg.role + WHEN 'platform_recovery_admin' THEN 0 + WHEN 'platform_admin' THEN 1 + ELSE 2 +END, prg.granted_at DESC +`, userID) + if err != nil { + return "", fmt.Errorf("query platform role grants: %w", err) + } + defer rows.Close() + + bestRole := PlatformRoleUser + for rows.Next() { + var role, installID, signature string + var payload []byte + if err := rows.Scan(&role, &installID, &payload, &signature); err != nil { + return "", fmt.Errorf("scan platform role grant: %w", err) + } + if _, err := verifier.VerifyPlatformRoleGrant(json.RawMessage(payload), signature, installID, email, role); err != nil { + continue + } + if role == PlatformRoleRecoveryAdmin { + return role, nil + } + if role == PlatformRoleAdmin { + bestRole = role + } + } + if err := rows.Err(); err != nil { + return "", fmt.Errorf("iterate platform role grants: %w", err) + } + return bestRole, nil +} + +func legacyPlatformRole(ctx context.Context, db 
postgresplatform.DBTX, userID string) (string, error) { + var role string + if err := db.QueryRow(ctx, `SELECT platform_role FROM users WHERE id = $1::uuid`, userID).Scan(&role); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return PlatformRoleUser, nil + } + return "", fmt.Errorf("get platform role: %w", err) + } + if role == "" { + return PlatformRoleUser, nil + } + return role, nil +} + +func parseActivationPayload(raw json.RawMessage) (ActivationPayload, []byte, error) { + canonical, err := CanonicalJSON(raw) + if err != nil { + return ActivationPayload{}, nil, err + } + var activation ActivationPayload + if err := json.Unmarshal(canonical, &activation); err != nil { + return ActivationPayload{}, nil, fmt.Errorf("%w: decode activation: %v", ErrInvalidActivation, err) + } + activation.SchemaVersion = strings.TrimSpace(activation.SchemaVersion) + activation.InstallID = strings.TrimSpace(activation.InstallID) + activation.OwnerEmail = strings.ToLower(strings.TrimSpace(activation.OwnerEmail)) + activation.PlatformRole = strings.TrimSpace(activation.PlatformRole) + activation.Nonce = strings.TrimSpace(activation.Nonce) + activation.Environment = strings.TrimSpace(activation.Environment) + return activation, canonical, nil +} + +func (p ActivationPayload) validate(now time.Time) error { + if p.SchemaVersion != ActivationSchemaVersion { + return fmt.Errorf("%w: schema_version must be %s", ErrInvalidActivation, ActivationSchemaVersion) + } + if p.InstallID == "" { + return fmt.Errorf("%w: install_id is required", ErrInvalidActivation) + } + if p.OwnerEmail == "" || !strings.Contains(p.OwnerEmail, "@") { + return fmt.Errorf("%w: owner_email is required", ErrInvalidActivation) + } + switch p.PlatformRole { + case PlatformRoleAdmin, PlatformRoleRecoveryAdmin: + default: + return fmt.Errorf("%w: platform_role must be platform_admin or platform_recovery_admin", ErrInvalidActivation) + } + if p.IssuedAt.IsZero() { + return fmt.Errorf("%w: issued_at is required", 
ErrInvalidActivation) + } + if p.IssuedAt.After(now.Add(5 * time.Minute)) { + return fmt.Errorf("%w: issued_at is too far in the future", ErrInvalidActivation) + } + if p.ExpiresAt != nil && !p.ExpiresAt.After(now) { + return fmt.Errorf("%w: activation expired", ErrInvalidActivation) + } + return nil +} + +func (v *Verifier) verifySignature(payload []byte, signatureText string) error { + signature, err := decodeBase64(strings.TrimSpace(signatureText)) + if err != nil { + return fmt.Errorf("signature must be base64 encoded: %w", err) + } + if len(signature) != ed25519.SignatureSize { + return fmt.Errorf("signature must decode to %d bytes", ed25519.SignatureSize) + } + if !ed25519.Verify(v.rootPublicKey, payload, signature) { + return errors.New("signature verification failed") + } + return nil +} + +func decodeEd25519PublicKey(value string) (ed25519.PublicKey, error) { + value = strings.TrimSpace(value) + if value == "" { + return nil, ErrProductRootKeyNeeded + } + if block, _ := pem.Decode([]byte(value)); block != nil { + parsed, err := x509.ParsePKIXPublicKey(block.Bytes) + if err != nil { + return nil, fmt.Errorf("parse product root public key PEM: %w", err) + } + publicKey, ok := parsed.(ed25519.PublicKey) + if !ok { + return nil, fmt.Errorf("product root public key PEM must contain an Ed25519 public key") + } + return publicKey, nil + } + decoded, err := decodeBase64(value) + if err != nil { + return nil, fmt.Errorf("product root public key must be base64 encoded: %w", err) + } + if len(decoded) != ed25519.PublicKeySize { + return nil, fmt.Errorf("product root public key must decode to %d bytes", ed25519.PublicKeySize) + } + return ed25519.PublicKey(decoded), nil +} + +func decodeBase64(value string) ([]byte, error) { + decoded, err := base64.StdEncoding.DecodeString(value) + if err == nil { + return decoded, nil + } + decoded, rawErr := base64.RawStdEncoding.DecodeString(value) + if rawErr == nil { + return decoded, nil + } + return nil, err +} diff --git 
a/backend/internal/platform/authority/authority_test.go b/backend/internal/platform/authority/authority_test.go new file mode 100644 index 0000000..0f586ee --- /dev/null +++ b/backend/internal/platform/authority/authority_test.go @@ -0,0 +1,85 @@ +package authority + +import ( + "crypto/ed25519" + "encoding/base64" + "encoding/json" + "strings" + "testing" + "time" + + "github.com/example/remote-access-platform/backend/internal/platform/config" +) + +func TestVerifierAcceptsSignedActivation(t *testing.T) { + publicKey, privateKey, err := ed25519.GenerateKey(nil) + if err != nil { + t.Fatalf("generate key: %v", err) + } + verifier, err := NewVerifier(config.InstallationConfig{ + AuthorityMode: ModeStrict, + ProductRootPublicKeyBase64: base64.StdEncoding.EncodeToString(publicKey), + }) + if err != nil { + t.Fatalf("NewVerifier: %v", err) + } + verifier.now = func() time.Time { return time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) } + + payload := json.RawMessage(`{ + "platform_role":"platform_admin", + "owner_email":"Owner@Example.test", + "install_id":"install-1", + "schema_version":"rap.installation.activation.v1", + "issued_at":"2026-04-28T11:00:00Z", + "expires_at":"2026-04-29T11:00:00Z" + }`) + canonical, err := CanonicalJSON(payload) + if err != nil { + t.Fatalf("CanonicalJSON: %v", err) + } + signature := base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)) + + activation, err := verifier.VerifyActivation(payload, signature) + if err != nil { + t.Fatalf("VerifyActivation: %v", err) + } + if activation.OwnerEmail != "owner@example.test" || activation.PlatformRole != PlatformRoleAdmin { + t.Fatalf("unexpected activation: %+v", activation) + } + if verifier.RootFingerprint() == "" { + t.Fatal("expected root fingerprint") + } +} + +func TestVerifierRejectsTamperedActivation(t *testing.T) { + publicKey, privateKey, err := ed25519.GenerateKey(nil) + if err != nil { + t.Fatalf("generate key: %v", err) + } + verifier, err := 
NewVerifier(config.InstallationConfig{ + AuthorityMode: ModeStrict, + ProductRootPublicKeyBase64: base64.StdEncoding.EncodeToString(publicKey), + }) + if err != nil { + t.Fatalf("NewVerifier: %v", err) + } + verifier.now = func() time.Time { return time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC) } + + payload := json.RawMessage(`{ + "schema_version":"rap.installation.activation.v1", + "install_id":"install-1", + "owner_email":"owner@example.test", + "platform_role":"platform_admin", + "issued_at":"2026-04-28T11:00:00Z" + }`) + canonical, err := CanonicalJSON(payload) + if err != nil { + t.Fatalf("CanonicalJSON: %v", err) + } + signature := base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)) + tampered := json.RawMessage(strings.Replace(string(payload), "platform_admin", "platform_recovery_admin", 1)) + + if _, err := verifier.VerifyActivation(tampered, signature); err == nil { + t.Fatal("expected tampered activation to fail") + } +} diff --git a/backend/internal/platform/clusterauth/authority.go b/backend/internal/platform/clusterauth/authority.go new file mode 100644 index 0000000..9992a9f --- /dev/null +++ b/backend/internal/platform/clusterauth/authority.go @@ -0,0 +1,186 @@ +package clusterauth + +import ( + "crypto/ed25519" + "crypto/rand" + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "strings" + "time" +) + +const ( + AuthoritySchemaVersion = "rap.cluster_authority.v1" + SignatureSchemaVersion = "rap.cluster_authority.signature.v1" + AlgorithmEd25519 = "ed25519" +) + +var ( + ErrInvalidKey = errors.New("invalid cluster authority key") + ErrInvalidSignature = errors.New("invalid cluster authority signature") + ErrInvalidPayload = errors.New("invalid cluster authority payload") +) + +type KeyPair struct { + PublicKeyB64 string + PrivateKeyB64 string + Fingerprint string +} + +type Signature struct { + SchemaVersion string `json:"schema_version"` + Algorithm string `json:"algorithm"` + 
KeyFingerprint string `json:"key_fingerprint"` + Signature string `json:"signature"` + SignedAt time.Time `json:"signed_at"` +} + +func GenerateKeyPair() (KeyPair, error) { + publicKey, privateKey, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + return KeyPair{}, err + } + fingerprint := Fingerprint(publicKey) + return KeyPair{ + PublicKeyB64: base64.StdEncoding.EncodeToString(publicKey), + PrivateKeyB64: base64.StdEncoding.EncodeToString(privateKey), + Fingerprint: fingerprint, + }, nil +} + +func Fingerprint(publicKey ed25519.PublicKey) string { + sum := sha256.Sum256(publicKey) + return "rap-ca-ed25519-" + hex.EncodeToString(sum[:16]) +} + +func FingerprintFromBase64(publicKeyB64 string) (string, error) { + publicKey, err := DecodePublicKey(publicKeyB64) + if err != nil { + return "", err + } + return Fingerprint(publicKey), nil +} + +func SignRaw(privateKeyB64 string, payload json.RawMessage, signedAt time.Time) (Signature, error) { + privateKey, err := DecodePrivateKey(privateKeyB64) + if err != nil { + return Signature{}, err + } + canonical, err := CanonicalJSON(payload) + if err != nil { + return Signature{}, err + } + publicKey, ok := privateKey.Public().(ed25519.PublicKey) + if !ok { + return Signature{}, ErrInvalidKey + } + signature := ed25519.Sign(privateKey, canonical) + return Signature{ + SchemaVersion: SignatureSchemaVersion, + Algorithm: AlgorithmEd25519, + KeyFingerprint: Fingerprint(publicKey), + Signature: base64.StdEncoding.EncodeToString(signature), + SignedAt: signedAt.UTC(), + }, nil +} + +func SignPayload(privateKeyB64 string, payload any, signedAt time.Time) (json.RawMessage, Signature, error) { + raw, err := json.Marshal(payload) + if err != nil { + return nil, Signature{}, fmt.Errorf("%w: marshal: %v", ErrInvalidPayload, err) + } + signature, err := SignRaw(privateKeyB64, raw, signedAt) + if err != nil { + return nil, Signature{}, err + } + return json.RawMessage(raw), signature, nil +} + +func VerifyRaw(publicKeyB64 string, 
payload json.RawMessage, signature Signature) error { + if signature.SchemaVersion != SignatureSchemaVersion { + return fmt.Errorf("%w: schema_version must be %s", ErrInvalidSignature, SignatureSchemaVersion) + } + if signature.Algorithm != AlgorithmEd25519 { + return fmt.Errorf("%w: algorithm must be %s", ErrInvalidSignature, AlgorithmEd25519) + } + publicKey, err := DecodePublicKey(publicKeyB64) + if err != nil { + return err + } + if signature.KeyFingerprint != Fingerprint(publicKey) { + return fmt.Errorf("%w: key fingerprint mismatch", ErrInvalidSignature) + } + canonical, err := CanonicalJSON(payload) + if err != nil { + return err + } + decodedSignature, err := decodeBase64(strings.TrimSpace(signature.Signature)) + if err != nil || len(decodedSignature) != ed25519.SignatureSize { + return fmt.Errorf("%w: signature must be base64 ed25519 signature", ErrInvalidSignature) + } + if !ed25519.Verify(publicKey, canonical, decodedSignature) { + return ErrInvalidSignature + } + return nil +} + +func CanonicalJSON(raw json.RawMessage) ([]byte, error) { + if len(raw) == 0 { + return nil, fmt.Errorf("%w: empty payload", ErrInvalidPayload) + } + var value any + if err := json.Unmarshal(raw, &value); err != nil { + return nil, fmt.Errorf("%w: invalid json: %v", ErrInvalidPayload, err) + } + canonical, err := json.Marshal(value) + if err != nil { + return nil, fmt.Errorf("%w: canonical json: %v", ErrInvalidPayload, err) + } + return canonical, nil +} + +func HashRaw(raw json.RawMessage) (string, error) { + canonical, err := CanonicalJSON(raw) + if err != nil { + return "", err + } + sum := sha256.Sum256(canonical) + return hex.EncodeToString(sum[:]), nil +} + +func DecodePublicKey(value string) (ed25519.PublicKey, error) { + decoded, err := decodeBase64(strings.TrimSpace(value)) + if err != nil { + return nil, fmt.Errorf("%w: public key must be base64 encoded", ErrInvalidKey) + } + if len(decoded) != ed25519.PublicKeySize { + return nil, fmt.Errorf("%w: public key must 
// decodeBase64 decodes value as standard-alphabet base64, with or without
// padding.
//
// An empty string is rejected up front. Otherwise the padded decoder is tried
// first and, if it fails, whatever the unpadded decoder returns (bytes and
// error) is handed back as-is.
func decodeBase64(value string) ([]byte, error) {
	if len(value) == 0 {
		return nil, errors.New("empty base64 value")
	}
	if padded, paddedErr := base64.StdEncoding.DecodeString(value); paddedErr == nil {
		return padded, nil
	}
	// Fall back to the unpadded form; its result is final either way.
	return base64.RawStdEncoding.DecodeString(value)
}
// InstallationConfig holds the installation-authority settings that Load reads
// from the INSTALLATION_* environment variables.
type InstallationConfig struct {
	// AuthorityMode comes from INSTALLATION_AUTHORITY_MODE. Load trims and
	// lower-cases it; an empty value resolves to "strict" when a product root
	// public key is configured and to "legacy" otherwise. When APP_ENV is
	// "production" or "prod", Load fails unless the result is "strict".
	AuthorityMode string
	// ProductRootPublicKeyBase64 comes from
	// INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_B64. When that variable is empty and
	// ProductRootPublicKeyFile is set, Load fills this field with the
	// whitespace-trimmed contents of that file.
	ProductRootPublicKeyBase64 string
	// ProductRootPublicKeyFile comes from
	// INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_FILE and is only read when the
	// inline value above is empty.
	ProductRootPublicKeyFile string
	// AllowInsecureBootstrap comes from INSTALLATION_INSECURE_BOOTSTRAP_ENABLED
	// and defaults to false.
	AllowInsecureBootstrap bool
}
// SecretConfig holds the secret-encryption key settings that Load reads from
// the SECRET_ENCRYPTION_* environment variables.
type SecretConfig struct {
	// EncryptionKeyBase64 comes from SECRET_ENCRYPTION_KEY_B64. When that
	// variable is empty and EncryptionKeyFile is set, Load fills this field with
	// the whitespace-trimmed contents of that file. A non-empty value must be
	// standard base64 (padded or unpadded) that decodes to exactly 32 bytes,
	// and a value is mandatory when APP_ENV is "production" or "prod".
	EncryptionKeyBase64 string
	// EncryptionKeyFile comes from SECRET_ENCRYPTION_KEY_FILE and is only read
	// when the inline value above is empty.
	EncryptionKeyFile string
	// EncryptionKeyID comes from SECRET_ENCRYPTION_KEY_ID and defaults to
	// "local-v1".
	EncryptionKeyID string
}
RefreshHashSecret: getEnv("AUTH_REFRESH_HASH_SECRET", ""), + }, + Installation: InstallationConfig{ + AuthorityMode: getEnv("INSTALLATION_AUTHORITY_MODE", ""), + ProductRootPublicKeyBase64: getEnv("INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_B64", ""), + ProductRootPublicKeyFile: getEnv("INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_FILE", ""), + AllowInsecureBootstrap: getBool("INSTALLATION_INSECURE_BOOTSTRAP_ENABLED", false), + }, + DataPlane: DataPlaneConfig{ + TokenTTL: getDuration("DATA_PLANE_TOKEN_TTL", 1*time.Minute), + TokenPrivateKeyPEM: getEnv("DATA_PLANE_TOKEN_PRIVATE_KEY_PEM", ""), + TokenPrivateKeyFile: getEnv("DATA_PLANE_TOKEN_PRIVATE_KEY_FILE", ""), + BackendGatewayURL: getEnv("DATA_PLANE_BACKEND_GATEWAY_URL", "/api/v1/gateway/ws"), + DirectWorkerWSSURLTemplate: getEnv("DATA_PLANE_DIRECT_WORKER_WSS_URL_TEMPLATE", ""), + DirectWorkerJSONRuntime: getBool("DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME", false), + DirectWorkerBinaryRender: getBool("DATA_PLANE_DIRECT_WORKER_BINARY_RENDER", false), + DirectWorkerTLSTrustMode: getEnv("DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE", "smoke_insecure"), + DirectWorkerTLSCARef: getEnv("DATA_PLANE_DIRECT_WORKER_TLS_CA_REF", ""), + }, + Secret: SecretConfig{ + EncryptionKeyBase64: getEnv("SECRET_ENCRYPTION_KEY_B64", ""), + EncryptionKeyFile: getEnv("SECRET_ENCRYPTION_KEY_FILE", ""), + EncryptionKeyID: getEnv("SECRET_ENCRYPTION_KEY_ID", "local-v1"), + }, + Session: SessionConfig{ + HeartbeatTTL: getDuration("SESSION_HEARTBEAT_TTL", 90*time.Second), + DetachGracePeriod: getDuration("SESSION_DETACH_GRACE_PERIOD", 30*time.Minute), + AttachTokenTTL: getDuration("SESSION_ATTACH_TOKEN_TTL", 2*time.Minute), + LiveStateTTL: getDuration("SESSION_LIVE_STATE_TTL", 2*time.Minute), + RecoveryBatchSize: getInt("SESSION_RECOVERY_BATCH_SIZE", 100), + }, + Worker: WorkerConfig{ + LeaseTTL: getDuration("WORKER_LEASE_TTL", 45*time.Second), + HeartbeatTTL: getDuration("WORKER_HEARTBEAT_TTL", 15*time.Second), + StaleLeaseGracePeriod: 
getDuration("WORKER_STALE_LEASE_GRACE_PERIOD", 30*time.Second), + }, + WebSocket: WebSocketConfig{ + WriteTimeout: getDuration("WEBSOCKET_WRITE_TIMEOUT", 10*time.Second), + PingInterval: getDuration("WEBSOCKET_PING_INTERVAL", 20*time.Second), + PongWait: getDuration("WEBSOCKET_PONG_WAIT", 40*time.Second), + }, + } + + if cfg.Postgres.DSN == "" { + return Config{}, fmt.Errorf("POSTGRES_DSN is required") + } + if cfg.Auth.AccessTokenSecret == "" { + return Config{}, fmt.Errorf("AUTH_ACCESS_TOKEN_SECRET is required") + } + if cfg.Auth.RefreshHashSecret == "" { + return Config{}, fmt.Errorf("AUTH_REFRESH_HASH_SECRET is required") + } + if cfg.Installation.ProductRootPublicKeyBase64 == "" && cfg.Installation.ProductRootPublicKeyFile != "" { + publicKey, err := os.ReadFile(cfg.Installation.ProductRootPublicKeyFile) + if err != nil { + return Config{}, fmt.Errorf("read INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_FILE: %w", err) + } + cfg.Installation.ProductRootPublicKeyBase64 = strings.TrimSpace(string(publicKey)) + } + cfg.Installation.AuthorityMode = normalizeInstallationAuthorityMode(cfg.Installation.AuthorityMode, cfg.Installation.ProductRootPublicKeyBase64) + if isProductionEnv(cfg.App.Env) && cfg.Installation.AuthorityMode != "strict" { + return Config{}, fmt.Errorf("INSTALLATION_AUTHORITY_MODE=strict with INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_B64 or file is required in production") + } + if cfg.DataPlane.TokenPrivateKeyPEM == "" && cfg.DataPlane.TokenPrivateKeyFile != "" { + privateKey, err := os.ReadFile(cfg.DataPlane.TokenPrivateKeyFile) + if err != nil { + return Config{}, fmt.Errorf("read DATA_PLANE_TOKEN_PRIVATE_KEY_FILE: %w", err) + } + cfg.DataPlane.TokenPrivateKeyPEM = string(privateKey) + } + if cfg.Secret.EncryptionKeyBase64 == "" && cfg.Secret.EncryptionKeyFile != "" { + secretKey, err := os.ReadFile(cfg.Secret.EncryptionKeyFile) + if err != nil { + return Config{}, fmt.Errorf("read SECRET_ENCRYPTION_KEY_FILE: %w", err) + } + cfg.Secret.EncryptionKeyBase64 = 
strings.TrimSpace(string(secretKey)) + } + if cfg.Secret.EncryptionKeyBase64 != "" { + decoded, err := base64.StdEncoding.DecodeString(cfg.Secret.EncryptionKeyBase64) + if err != nil { + if decodedRaw, rawErr := base64.RawStdEncoding.DecodeString(cfg.Secret.EncryptionKeyBase64); rawErr == nil { + decoded = decodedRaw + } else { + return Config{}, fmt.Errorf("SECRET_ENCRYPTION_KEY_B64 must be base64 encoded: %w", err) + } + } + if len(decoded) != 32 { + return Config{}, fmt.Errorf("SECRET_ENCRYPTION_KEY_B64 must decode to 32 bytes for AES-256-GCM") + } + } + if isProductionEnv(cfg.App.Env) && cfg.Secret.EncryptionKeyBase64 == "" { + return Config{}, fmt.Errorf("SECRET_ENCRYPTION_KEY_B64 or SECRET_ENCRYPTION_KEY_FILE is required in production") + } + + return cfg, nil +} + +func normalizeInstallationAuthorityMode(mode string, rootPublicKey string) string { + mode = strings.ToLower(strings.TrimSpace(mode)) + switch mode { + case "strict", "legacy": + return mode + case "": + if strings.TrimSpace(rootPublicKey) != "" { + return "strict" + } + return "legacy" + default: + return mode + } +} + +func isProductionEnv(appEnv string) bool { + switch strings.ToLower(strings.TrimSpace(appEnv)) { + case "production", "prod": + return true + default: + return false + } +} + +func getEnv(key, fallback string) string { + if value := os.Getenv(key); value != "" { + return value + } + return fallback +} + +func getInt(key string, fallback int) int { + value := os.Getenv(key) + if value == "" { + return fallback + } + + parsed, err := strconv.Atoi(value) + if err != nil { + return fallback + } + + return parsed +} + +func getBool(key string, fallback bool) bool { + value := os.Getenv(key) + if value == "" { + return fallback + } + switch value { + case "1", "true", "TRUE", "yes", "on": + return true + case "0", "false", "FALSE", "no", "off": + return false + default: + return fallback + } +} + +func getDuration(key string, fallback time.Duration) time.Duration { + value := 
os.Getenv(key) + if value == "" { + return fallback + } + + parsed, err := time.ParseDuration(value) + if err != nil { + return fallback + } + + return parsed +} diff --git a/backend/internal/platform/httpserver/server.go b/backend/internal/platform/httpserver/server.go new file mode 100644 index 0000000..a6fbd1d --- /dev/null +++ b/backend/internal/platform/httpserver/server.go @@ -0,0 +1,20 @@ +package httpserver + +import ( + "fmt" + "net/http" + "time" + + "github.com/example/remote-access-platform/backend/internal/platform/config" +) + +func New(cfg config.HTTPConfig, handler http.Handler) *http.Server { + return &http.Server{ + Addr: fmt.Sprintf("%s:%d", cfg.Host, cfg.Port), + Handler: handler, + ReadHeaderTimeout: 5 * time.Second, + ReadTimeout: cfg.ReadTimeout, + WriteTimeout: cfg.WriteTimeout, + IdleTimeout: cfg.IdleTimeout, + } +} diff --git a/backend/internal/platform/httpx/json.go b/backend/internal/platform/httpx/json.go new file mode 100644 index 0000000..744e98a --- /dev/null +++ b/backend/internal/platform/httpx/json.go @@ -0,0 +1,45 @@ +package httpx + +import ( + "encoding/json" + "net/http" +) + +func WriteJSON(w http.ResponseWriter, status int, payload any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(payload) +} + +func WriteError(w http.ResponseWriter, status int, message string) { + traceID := ensureTraceID(w) + WriteJSON(w, status, ErrorResponse{ + Error: NewErrorMessage(status, message, nil, traceID), + }) +} + +func WriteErrorMessage(w http.ResponseWriter, status int, message any) { + traceID := ensureTraceID(w) + switch payload := message.(type) { + case string: + WriteJSON(w, status, ErrorResponse{ + Error: NewErrorMessage(status, payload, nil, traceID), + }) + case ErrorResponse: + payload.Error.TraceID = traceID + WriteJSON(w, status, payload) + case *ErrorResponse: + if payload == nil { + WriteJSON(w, status, ErrorResponse{ + Error: NewErrorMessage(status, "", nil, 
traceID), + }) + return + } + payload.Error.TraceID = traceID + WriteJSON(w, status, payload) + default: + WriteJSON(w, status, ErrorResponse{ + Error: NewErrorMessage(status, "Request failed.", nil, traceID), + }) + } +} diff --git a/backend/internal/platform/httpx/message.go b/backend/internal/platform/httpx/message.go new file mode 100644 index 0000000..099afc5 --- /dev/null +++ b/backend/internal/platform/httpx/message.go @@ -0,0 +1,131 @@ +package httpx + +import ( + "net/http" + "strings" + "unicode" + + "github.com/google/uuid" + + messagecontracts "github.com/example/remote-access-platform/backend/pkg/contracts/message" +) + +type ErrorResponse struct { + Error messagecontracts.Message `json:"error"` +} + +func NewMessage(code, messageKey, fallbackMessage string, details map[string]any, traceID string) messagecontracts.Message { + if traceID == "" { + traceID = uuid.NewString() + } + if details == nil { + details = map[string]any{} + } + return messagecontracts.Message{ + Code: code, + MessageKey: messageKey, + FallbackMessage: fallbackMessage, + Details: details, + TraceID: traceID, + } +} + +func NewErrorMessage(status int, fallbackMessage string, details map[string]any, traceID string) messagecontracts.Message { + normalizedFallback, normalizedDetails := normalizeErrorFallback(status, fallbackMessage, details) + code := deriveErrorCode(status, normalizedFallback) + return NewMessage(code, "errors."+code, normalizedFallback, normalizedDetails, traceID) +} + +func ensureTraceID(w http.ResponseWriter) string { + traceID := w.Header().Get("X-Trace-Id") + if traceID == "" { + traceID = uuid.NewString() + w.Header().Set("X-Trace-Id", traceID) + } + return traceID +} + +func normalizeErrorFallback(status int, fallbackMessage string, details map[string]any) (string, map[string]any) { + if details == nil { + details = map[string]any{} + } + details["http_status"] = status + + if status >= http.StatusInternalServerError { + return "An internal server error 
occurred.", details + } + + trimmed := strings.TrimSpace(fallbackMessage) + switch strings.ToLower(trimmed) { + case "forbidden", "access denied": + return "Access denied.", details + } + + if field, ok := extractRequiredField(trimmed); ok { + details["field"] = field + } + + return trimmed, details +} + +func deriveErrorCode(status int, fallbackMessage string) string { + switch strings.ToLower(strings.TrimSpace(fallbackMessage)) { + case "invalid credentials": + return "auth.invalid_credentials" + case "session expired. please sign in again.": + return "auth.session_expired" + case "access denied.": + return "common.access_denied" + } + + statusPrefix := map[int]string{ + http.StatusBadRequest: "bad_request", + http.StatusUnauthorized: "unauthorized", + http.StatusForbidden: "forbidden", + http.StatusNotFound: "not_found", + http.StatusConflict: "conflict", + http.StatusUnprocessableEntity: "unprocessable_entity", + http.StatusInternalServerError: "internal_server_error", + }[status] + if statusPrefix == "" { + statusPrefix = "http_" + strings.ReplaceAll(http.StatusText(status), " ", "_") + statusPrefix = strings.ToLower(statusPrefix) + } + + slug := slugifyMessage(fallbackMessage) + if slug == "" { + slug = "message" + } + if status >= http.StatusInternalServerError { + return "common." + statusPrefix + } + return statusPrefix + "." 
// extractRequiredField recognizes messages of the form "<field> is required"
// (suffix matched case-insensitively) and returns the field name in
// snake_case: trimmed, spaces replaced by underscores, lower-cased.
//
// The boolean is false when the message does not end with the suffix or when
// nothing but whitespace precedes it.
func extractRequiredField(message string) (string, bool) {
	const suffix = " is required"

	// Compare the tail of the original string rather than a lower-cased copy:
	// lower-casing can change byte lengths (U+0130 becomes a one-byte "i"), and
	// an offset computed on the copy could then cut the original mid-rune.
	cut := len(message) - len(suffix)
	if cut < 0 || !strings.EqualFold(message[cut:], suffix) {
		return "", false
	}

	field := strings.TrimSpace(message[:cut])
	field = strings.ReplaceAll(field, " ", "_")
	field = strings.ToLower(field)
	return field, field != ""
}
config.WorkerConfig + WebSocket config.WebSocketConfig +} + +type Infra struct { + Logger *slog.Logger + DB *pgxpool.Pool + Redis *redis.Client +} + +type Module interface { + Name() string + RegisterRoutes(router chi.Router) +} diff --git a/backend/internal/platform/postgres/postgres.go b/backend/internal/platform/postgres/postgres.go new file mode 100644 index 0000000..596db9e --- /dev/null +++ b/backend/internal/platform/postgres/postgres.go @@ -0,0 +1,33 @@ +package postgres + +import ( + "context" + "fmt" + + "github.com/jackc/pgx/v5/pgxpool" + + "github.com/example/remote-access-platform/backend/internal/platform/config" +) + +func Open(ctx context.Context, cfg config.PostgresConfig) (*pgxpool.Pool, error) { + poolConfig, err := pgxpool.ParseConfig(cfg.DSN) + if err != nil { + return nil, fmt.Errorf("parse postgres dsn: %w", err) + } + + poolConfig.MaxConns = cfg.MaxConns + poolConfig.MinConns = cfg.MinConns + poolConfig.ConnConfig.ConnectTimeout = cfg.ConnectTimeout + + pool, err := pgxpool.NewWithConfig(ctx, poolConfig) + if err != nil { + return nil, fmt.Errorf("create postgres pool: %w", err) + } + + if err := pool.Ping(ctx); err != nil { + pool.Close() + return nil, fmt.Errorf("ping postgres: %w", err) + } + + return pool, nil +} diff --git a/backend/internal/platform/postgres/tx.go b/backend/internal/platform/postgres/tx.go new file mode 100644 index 0000000..b37cf66 --- /dev/null +++ b/backend/internal/platform/postgres/tx.go @@ -0,0 +1,34 @@ +package postgres + +import ( + "context" + "fmt" + + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgxpool" +) + +type DBTX interface { + Exec(ctx context.Context, sql string, arguments ...any) (pgconn.CommandTag, error) + Query(ctx context.Context, sql string, args ...any) (pgx.Rows, error) + QueryRow(ctx context.Context, sql string, args ...any) pgx.Row +} + +func WithTransaction(ctx context.Context, pool *pgxpool.Pool, fn func(tx pgx.Tx) error) error { + tx, err := 
pool.BeginTx(ctx, pgx.TxOptions{}) + if err != nil { + return fmt.Errorf("begin transaction: %w", err) + } + + if err := fn(tx); err != nil { + _ = tx.Rollback(ctx) + return err + } + + if err := tx.Commit(ctx); err != nil { + return fmt.Errorf("commit transaction: %w", err) + } + + return nil +} diff --git a/backend/internal/platform/redis/redis.go b/backend/internal/platform/redis/redis.go new file mode 100644 index 0000000..147bf92 --- /dev/null +++ b/backend/internal/platform/redis/redis.go @@ -0,0 +1,26 @@ +package redis + +import ( + "context" + "fmt" + + goredis "github.com/redis/go-redis/v9" + + "github.com/example/remote-access-platform/backend/internal/platform/config" +) + +func Open(ctx context.Context, cfg config.RedisConfig) (*goredis.Client, error) { + client := goredis.NewClient(&goredis.Options{ + Addr: cfg.Addr, + Password: cfg.Password, + DB: cfg.DB, + DialTimeout: cfg.DialTimeout, + }) + + if err := client.Ping(ctx).Err(); err != nil { + _ = client.Close() + return nil, fmt.Errorf("ping redis: %w", err) + } + + return client, nil +} diff --git a/backend/internal/platform/runtime/app.go b/backend/internal/platform/runtime/app.go new file mode 100644 index 0000000..9327a50 --- /dev/null +++ b/backend/internal/platform/runtime/app.go @@ -0,0 +1,220 @@ +package runtime + +import ( + "context" + "errors" + "fmt" + "log/slog" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + chimiddleware "github.com/go-chi/chi/v5/middleware" + + "github.com/example/remote-access-platform/backend/internal/modules/auth" + "github.com/example/remote-access-platform/backend/internal/modules/cluster" + "github.com/example/remote-access-platform/backend/internal/modules/identitysource" + "github.com/example/remote-access-platform/backend/internal/modules/node" + "github.com/example/remote-access-platform/backend/internal/modules/nodeagent" + "github.com/example/remote-access-platform/backend/internal/modules/organization" + 
"github.com/example/remote-access-platform/backend/internal/modules/resource" + "github.com/example/remote-access-platform/backend/internal/modules/sessionbroker" + "github.com/example/remote-access-platform/backend/internal/modules/sessiongateway" + "github.com/example/remote-access-platform/backend/internal/modules/worker" + "github.com/example/remote-access-platform/backend/internal/platform/authority" + "github.com/example/remote-access-platform/backend/internal/platform/config" + "github.com/example/remote-access-platform/backend/internal/platform/httpserver" + "github.com/example/remote-access-platform/backend/internal/platform/logging" + "github.com/example/remote-access-platform/backend/internal/platform/module" + postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres" + redisplatform "github.com/example/remote-access-platform/backend/internal/platform/redis" + "github.com/example/remote-access-platform/backend/internal/platform/secrets" +) + +type App struct { + cfg config.Config + logger *slog.Logger + httpServer *http.Server + workers []backgroundRunner + db closeFunc + redis closeFunc +} + +type closeFunc func() error +type backgroundRunner func(context.Context) error + +func NewApp(ctx context.Context) (*App, error) { + cfg, err := config.Load() + if err != nil { + return nil, fmt.Errorf("load config: %w", err) + } + + logger := logging.New(cfg.App.Env) + + db, err := postgresplatform.Open(ctx, cfg.Postgres) + if err != nil { + return nil, err + } + + redisClient, err := redisplatform.Open(ctx, cfg.Redis) + if err != nil { + db.Close() + return nil, err + } + + authorityVerifier, err := authority.NewVerifier(cfg.Installation) + if err != nil { + redisClient.Close() + db.Close() + return nil, fmt.Errorf("create installation authority verifier: %w", err) + } + + deps := module.Dependencies{ + Config: module.Config{ + App: cfg.App, + Auth: cfg.Auth, + Installation: cfg.Installation, + DataPlane: cfg.DataPlane, + 
Secret: cfg.Secret, + Session: cfg.Session, + Worker: cfg.Worker, + WebSocket: cfg.WebSocket, + }, + Infra: module.Infra{ + Logger: logger, + DB: db, + Redis: redisClient, + }, + } + + workerStore := worker.NewRedisStore(redisClient) + workerService := worker.NewService(deps, workerStore) + authStore := auth.NewPostgresStore(db) + authTx := auth.NewPostgresTransactor(db) + authService := auth.NewService(deps, authStore, authTx, authorityVerifier) + var resourceSecretStore *secrets.ResourceSecretStore + if cfg.Secret.EncryptionKeyBase64 != "" { + secretEncryptor, err := secrets.NewEncryptor(cfg.Secret.EncryptionKeyBase64, cfg.Secret.EncryptionKeyID) + if err != nil { + redisClient.Close() + db.Close() + return nil, fmt.Errorf("create resource secret encryptor: %w", err) + } + resourceSecretStore = secrets.NewResourceSecretStore(db, secretEncryptor) + } + + brokerStore := sessionbroker.NewPostgresStore(db, authorityVerifier) + brokerTx := sessionbroker.NewPostgresTransactor(db, authorityVerifier) + liveStateStore := sessionbroker.NewRedisLiveStateStore(redisClient) + brokerService := sessionbroker.NewService(deps, brokerStore, brokerTx, liveStateStore, workerService, resourceSecretStore) + workerEvents := worker.NewEventProcessor(redisClient, brokerService) + leaseMonitor := worker.NewLeaseMonitor(workerService, brokerService, cfg.Worker.StaleLeaseGracePeriod) + + brokerModule := sessionbroker.NewModule(brokerService) + authModule := auth.NewModule(deps, authService) + clusterModule := cluster.NewModule(deps, authorityVerifier) + organizationModule := organization.NewModule(deps) + identitySourceModule := identitysource.NewModule(deps) + resourceModule := resource.NewModule(deps, resourceSecretStore) + nodeModule := node.NewModule(deps) + nodeAgentModule := nodeagent.NewModule(deps) + sessionGatewayModule := sessiongateway.NewModule(deps, brokerModule.Service(), workerService) + + router := buildRouter( + logger, + authModule, + clusterModule, + organizationModule, + 
identitySourceModule, + resourceModule, + brokerModule, + nodeModule, + nodeAgentModule, + sessionGatewayModule, + ) + + return &App{ + cfg: cfg, + logger: logger, + httpServer: httpserver.New(cfg.HTTP, router), + workers: []backgroundRunner{workerEvents.Run, leaseMonitor.Run}, + db: func() error { + db.Close() + return nil + }, + redis: redisClient.Close, + }, nil +} + +func (a *App) Run(ctx context.Context) error { + errCh := make(chan error, 1) + + go func() { + a.logger.Info("http server starting", "addr", a.httpServer.Addr, "service", a.cfg.App.Name) + if err := a.httpServer.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + errCh <- err + return + } + errCh <- nil + }() + + for _, runner := range a.workers { + runner := runner + go func() { + if err := runner(ctx); err != nil { + errCh <- err + } + }() + } + + select { + case <-ctx.Done(): + a.logger.Info("shutdown signal received") + case err := <-errCh: + if err != nil { + return err + } + return nil + } + + shutdownCtx, cancel := context.WithTimeout(context.Background(), a.cfg.HTTP.ShutdownTimeout) + defer cancel() + + if err := a.httpServer.Shutdown(shutdownCtx); err != nil { + return fmt.Errorf("shutdown http server: %w", err) + } + + if err := a.redis(); err != nil { + return fmt.Errorf("close redis: %w", err) + } + + if err := a.db(); err != nil { + return fmt.Errorf("close postgres: %w", err) + } + + a.logger.Info("app stopped", "at", time.Now().UTC()) + return nil +} + +func buildRouter(logger *slog.Logger, modules ...module.Module) http.Handler { + router := chi.NewRouter() + router.Use(chimiddleware.RequestID) + router.Use(chimiddleware.RealIP) + router.Use(chimiddleware.Recoverer) + router.Use(chimiddleware.Timeout(60 * time.Second)) + router.Use(chimiddleware.Heartbeat("/healthz")) + + router.Get("/readyz", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ready")) + }) + + router.Route("/api/v1", func(r chi.Router) { + 
for _, mod := range modules { + logger.Info("register module routes", "module", mod.Name()) + mod.RegisterRoutes(r) + } + }) + + return router +} diff --git a/backend/internal/platform/secrets/assignment.go b/backend/internal/platform/secrets/assignment.go new file mode 100644 index 0000000..064019e --- /dev/null +++ b/backend/internal/platform/secrets/assignment.go @@ -0,0 +1,37 @@ +package secrets + +import ( + "encoding/json" + "fmt" +) + +type AssignmentSecretMergeResult struct { + Metadata map[string]any + Keys []string +} + +func MergeResourceSecretIntoAssignmentMetadata(metadata map[string]any, payload json.RawMessage) (AssignmentSecretMergeResult, error) { + if metadata == nil { + metadata = map[string]any{} + } + var secretPayload map[string]any + if err := json.Unmarshal(payload, &secretPayload); err != nil { + return AssignmentSecretMergeResult{}, fmt.Errorf("decode resolved resource secret: %w", err) + } + resource, _ := metadata["resource"].(map[string]any) + if resource == nil { + resource = map[string]any{} + metadata["resource"] = resource + } + resourceMetadata, _ := resource["metadata"].(map[string]any) + if resourceMetadata == nil { + resourceMetadata = map[string]any{} + resource["metadata"] = resourceMetadata + } + keys := make([]string, 0, len(secretPayload)) + for key, value := range secretPayload { + resourceMetadata[key] = value + keys = append(keys, key) + } + return AssignmentSecretMergeResult{Metadata: metadata, Keys: keys}, nil +} diff --git a/backend/internal/platform/secrets/encryption.go b/backend/internal/platform/secrets/encryption.go new file mode 100644 index 0000000..d44536c --- /dev/null +++ b/backend/internal/platform/secrets/encryption.go @@ -0,0 +1,113 @@ +package secrets + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/rand" + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "errors" + "fmt" + "io" + "strings" +) + +const AlgorithmAES256GCM = "AES-256-GCM" + +var ( + ErrSecretEncryptionKeyMissing = 
errors.New("secret encryption key is not configured") + ErrSecretPayloadInvalid = errors.New("secret payload must be a json object") +) + +type Encryptor struct { + aead cipher.AEAD + keyID string +} + +type EncryptedPayload struct { + Algorithm string + KeyID string + Nonce []byte + Ciphertext []byte + PayloadSHA256 string +} + +func NewEncryptor(masterKeyBase64, keyID string) (*Encryptor, error) { + masterKeyBase64 = strings.TrimSpace(masterKeyBase64) + if masterKeyBase64 == "" { + return nil, ErrSecretEncryptionKeyMissing + } + key, err := base64.StdEncoding.DecodeString(masterKeyBase64) + if err != nil { + if rawKey, rawErr := base64.RawStdEncoding.DecodeString(masterKeyBase64); rawErr == nil { + key = rawKey + } else { + return nil, fmt.Errorf("decode secret encryption key: %w", err) + } + } + if len(key) != 32 { + return nil, fmt.Errorf("secret encryption key must decode to 32 bytes") + } + block, err := aes.NewCipher(key) + if err != nil { + return nil, fmt.Errorf("create secret cipher: %w", err) + } + aead, err := cipher.NewGCM(block) + if err != nil { + return nil, fmt.Errorf("create secret gcm: %w", err) + } + if strings.TrimSpace(keyID) == "" { + keyID = "local-v1" + } + return &Encryptor{aead: aead, keyID: keyID}, nil +} + +func (e *Encryptor) KeyID() string { + if e == nil { + return "" + } + return e.keyID +} + +func (e *Encryptor) Encrypt(plaintext, aad []byte) (EncryptedPayload, error) { + if e == nil { + return EncryptedPayload{}, ErrSecretEncryptionKeyMissing + } + nonce := make([]byte, e.aead.NonceSize()) + if _, err := io.ReadFull(rand.Reader, nonce); err != nil { + return EncryptedPayload{}, fmt.Errorf("generate secret nonce: %w", err) + } + hash := sha256.Sum256(plaintext) + return EncryptedPayload{ + Algorithm: AlgorithmAES256GCM, + KeyID: e.keyID, + Nonce: nonce, + Ciphertext: e.aead.Seal(nil, nonce, plaintext, aad), + PayloadSHA256: hex.EncodeToString(hash[:]), + }, nil +} + +func (e *Encryptor) Decrypt(payload EncryptedPayload, aad []byte) 
([]byte, error) { + if e == nil { + return nil, ErrSecretEncryptionKeyMissing + } + if payload.Algorithm != "" && payload.Algorithm != AlgorithmAES256GCM { + return nil, fmt.Errorf("unsupported secret algorithm %q", payload.Algorithm) + } + plaintext, err := e.aead.Open(nil, payload.Nonce, payload.Ciphertext, aad) + if err != nil { + return nil, fmt.Errorf("decrypt secret payload: %w", err) + } + return plaintext, nil +} + +func ResourceSecretAAD(organizationID, resourceID, secretRef, protocol string) []byte { + return []byte(strings.Join([]string{ + "rap-resource-secret-v1", + strings.TrimSpace(organizationID), + strings.TrimSpace(resourceID), + strings.TrimSpace(secretRef), + strings.ToLower(strings.TrimSpace(protocol)), + }, "|")) +} diff --git a/backend/internal/platform/secrets/encryption_test.go b/backend/internal/platform/secrets/encryption_test.go new file mode 100644 index 0000000..66c253d --- /dev/null +++ b/backend/internal/platform/secrets/encryption_test.go @@ -0,0 +1,65 @@ +package secrets + +import ( + "encoding/base64" + "encoding/json" + "testing" +) + +func TestEncryptorRoundTrip(t *testing.T) { + key := base64.StdEncoding.EncodeToString([]byte("0123456789abcdef0123456789abcdef")) + encryptor, err := NewEncryptor(key, "test-key") + if err != nil { + t.Fatalf("NewEncryptor returned error: %v", err) + } + aad := ResourceSecretAAD("org-1", "resource-1", "rap-secret://test", "rdp") + encrypted, err := encryptor.Encrypt([]byte(`{"username":"user","password":"secret"}`), aad) + if err != nil { + t.Fatalf("Encrypt returned error: %v", err) + } + plaintext, err := encryptor.Decrypt(encrypted, aad) + if err != nil { + t.Fatalf("Decrypt returned error: %v", err) + } + if string(plaintext) != `{"username":"user","password":"secret"}` { + t.Fatalf("unexpected plaintext: %s", plaintext) + } +} + +func TestEncryptorRejectsWrongAAD(t *testing.T) { + key := base64.StdEncoding.EncodeToString([]byte("0123456789abcdef0123456789abcdef")) + encryptor, err := 
NewEncryptor(key, "test-key") + if err != nil { + t.Fatalf("NewEncryptor returned error: %v", err) + } + encrypted, err := encryptor.Encrypt([]byte(`{"password":"secret"}`), ResourceSecretAAD("org-1", "resource-1", "ref", "rdp")) + if err != nil { + t.Fatalf("Encrypt returned error: %v", err) + } + if _, err := encryptor.Decrypt(encrypted, ResourceSecretAAD("org-2", "resource-1", "ref", "rdp")); err == nil { + t.Fatalf("expected decrypt with wrong aad to fail") + } +} + +func TestMergeResourceSecretIntoAssignmentMetadata(t *testing.T) { + metadata := map[string]any{ + "resource": map[string]any{ + "id": "resource-1", + "metadata": map[string]any{ + "rdp_host": "host", + }, + }, + } + merged, err := MergeResourceSecretIntoAssignmentMetadata(metadata, json.RawMessage(`{"username":"user","password":"secret","domain":"corp"}`)) + if err != nil { + t.Fatalf("MergeResourceSecretIntoAssignmentMetadata returned error: %v", err) + } + resource := merged.Metadata["resource"].(map[string]any) + resourceMetadata := resource["metadata"].(map[string]any) + if resourceMetadata["rdp_host"] != "host" { + t.Fatalf("existing metadata was not preserved") + } + if resourceMetadata["username"] != "user" || resourceMetadata["password"] != "secret" || resourceMetadata["domain"] != "corp" { + t.Fatalf("secret payload was not merged: %#v", resourceMetadata) + } +} diff --git a/backend/internal/platform/secrets/resource_metadata.go b/backend/internal/platform/secrets/resource_metadata.go new file mode 100644 index 0000000..70dbb96 --- /dev/null +++ b/backend/internal/platform/secrets/resource_metadata.go @@ -0,0 +1,132 @@ +package secrets + +import ( + "encoding/json" + "errors" + "fmt" + "slices" + "sort" + "strings" +) + +var ( + ErrPlaintextResourceCredentials = errors.New("plaintext resource credentials are not allowed in metadata in production") + ErrMissingResourceSecretRef = errors.New("secret_ref is required for this resource protocol in production") +) + +var credentialKeyFragments 
= []string{ + "accesstoken", + "clientsecret", + "credential", + "credentials", + "domain", + "password", + "privatekey", + "refreshtoken", + "secret", + "secrets", + "token", + "user", + "username", +} + +var safeReferenceKeys = []string{ + "certificateverificationmode", + "renderqualityprofile", + "secretref", + "secretreference", + "vaultref", +} + +func ValidateResourceSecretReadiness(protocol string, secretRef *string, metadata json.RawMessage, appEnv string) error { + if !IsProductionEnv(appEnv) { + return nil + } + + paths, err := PlaintextCredentialMetadataPaths(metadata) + if err != nil { + return err + } + if len(paths) > 0 { + return fmt.Errorf("%w: %s", ErrPlaintextResourceCredentials, strings.Join(paths, ", ")) + } + if ResourceProtocolRequiresSecretRef(protocol) && (secretRef == nil || strings.TrimSpace(*secretRef) == "") { + return ErrMissingResourceSecretRef + } + return nil +} + +func IsProductionEnv(appEnv string) bool { + switch strings.ToLower(strings.TrimSpace(appEnv)) { + case "prod", "production": + return true + default: + return false + } +} + +func ResourceProtocolRequiresSecretRef(protocol string) bool { + switch strings.ToLower(strings.TrimSpace(protocol)) { + case "rdp", "vnc", "ssh": + return true + default: + return false + } +} + +func PlaintextCredentialMetadataPaths(raw json.RawMessage) ([]string, error) { + if len(raw) == 0 { + return nil, nil + } + var value any + if err := json.Unmarshal(raw, &value); err != nil { + return nil, errors.New("metadata must be valid json") + } + metadata, ok := value.(map[string]any) + if !ok { + return nil, errors.New("metadata must be a json object") + } + var paths []string + collectCredentialPaths(metadata, "", &paths) + sort.Strings(paths) + return slices.Compact(paths), nil +} + +func collectCredentialPaths(value any, prefix string, paths *[]string) { + switch typed := value.(type) { + case map[string]any: + for key, child := range typed { + path := key + if prefix != "" { + path = prefix + 
"." + key + } + if isCredentialMetadataKey(key) { + *paths = append(*paths, path) + } + collectCredentialPaths(child, path, paths) + } + case []any: + for index, child := range typed { + collectCredentialPaths(child, fmt.Sprintf("%s[%d]", prefix, index), paths) + } + } +} + +func isCredentialMetadataKey(key string) bool { + normalized := normalizeMetadataKey(key) + if slices.Contains(safeReferenceKeys, normalized) { + return false + } + for _, fragment := range credentialKeyFragments { + if normalized == fragment || strings.HasSuffix(normalized, fragment) { + return true + } + } + return false +} + +func normalizeMetadataKey(key string) string { + key = strings.ToLower(strings.TrimSpace(key)) + replacer := strings.NewReplacer("_", "", "-", "", " ", "", ".", "") + return replacer.Replace(key) +} diff --git a/backend/internal/platform/secrets/resource_metadata_test.go b/backend/internal/platform/secrets/resource_metadata_test.go new file mode 100644 index 0000000..20d9111 --- /dev/null +++ b/backend/internal/platform/secrets/resource_metadata_test.go @@ -0,0 +1,52 @@ +package secrets + +import ( + "encoding/json" + "errors" + "slices" + "testing" +) + +func TestValidateResourceSecretReadinessAllowsPlaintextInDevelopment(t *testing.T) { + metadata := json.RawMessage(`{"username":"m","password":"secret"}`) + if err := ValidateResourceSecretReadiness("rdp", nil, metadata, "development"); err != nil { + t.Fatalf("development metadata should remain allowed for smoke/dev: %v", err) + } +} + +func TestValidateResourceSecretReadinessRejectsPlaintextCredentialsInProduction(t *testing.T) { + metadata := json.RawMessage(`{"rdp_host":"host","credentials":{"username":"m","password":"secret"}}`) + err := ValidateResourceSecretReadiness("rdp", stringPtr("vault://org/resource"), metadata, "production") + if !errors.Is(err, ErrPlaintextResourceCredentials) { + t.Fatalf("expected plaintext credential rejection, got %v", err) + } + + paths, err := 
PlaintextCredentialMetadataPaths(metadata) + if err != nil { + t.Fatalf("metadata paths: %v", err) + } + for _, expected := range []string{"credentials", "credentials.password", "credentials.username"} { + if !slices.Contains(paths, expected) { + t.Fatalf("expected sensitive path %q in %v", expected, paths) + } + } +} + +func TestValidateResourceSecretReadinessRequiresSecretRefForProductionRDP(t *testing.T) { + metadata := json.RawMessage(`{"rdp_host":"host","rdp_port":3389}`) + err := ValidateResourceSecretReadiness("rdp", nil, metadata, "production") + if !errors.Is(err, ErrMissingResourceSecretRef) { + t.Fatalf("expected missing secret_ref rejection, got %v", err) + } +} + +func TestValidateResourceSecretReadinessAllowsProductionSecretRef(t *testing.T) { + metadata := json.RawMessage(`{"rdp_host":"host","rdp_port":3389,"secret_ref":"vault://org/resource"}`) + if err := ValidateResourceSecretReadiness("rdp", stringPtr("vault://org/resource"), metadata, "production"); err != nil { + t.Fatalf("production secret_ref metadata should be accepted: %v", err) + } +} + +func stringPtr(value string) *string { + return &value +} diff --git a/backend/internal/platform/secrets/resource_secret_store.go b/backend/internal/platform/secrets/resource_secret_store.go new file mode 100644 index 0000000..cdec71a --- /dev/null +++ b/backend/internal/platform/secrets/resource_secret_store.go @@ -0,0 +1,259 @@ +package secrets + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "strings" + "time" + + "github.com/jackc/pgx/v5" + + postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres" +) + +var ( + ErrResourceSecretNotFound = errors.New("resource secret not found") + ErrSecretAccessDenied = errors.New("resource secret access denied") + ErrSecretLeaseRequired = errors.New("resource secret resolution requires lease proof") +) + +type ResourceSecretStore struct { + db postgresplatform.DBTX + encryptor *Encryptor + now func() time.Time +} + 
+type ResourceSecretResolver interface { + ResolveForSession(ctx context.Context, req ResolveResourceSecretRequest) (*ResolvedResourceSecret, error) +} + +type ResourceSecretDescriptor struct { + ID string `json:"id"` + OrganizationID string `json:"organization_id"` + ResourceID string `json:"resource_id"` + SecretRef string `json:"secret_ref"` + Protocol string `json:"protocol"` + Version int `json:"version"` + KeyID string `json:"key_id"` + Algorithm string `json:"algorithm"` + Metadata json.RawMessage `json:"metadata"` + CreatedAt time.Time `json:"created_at"` + RotatedAt *time.Time `json:"rotated_at,omitempty"` +} + +type UpsertResourceSecretCommand struct { + OrganizationID string + ResourceID string + Protocol string + SecretRef string + Payload json.RawMessage + Metadata json.RawMessage + ActorUserID string +} + +type ResolveResourceSecretRequest struct { + SecretRef string + OrganizationID string + ResourceID string + SessionID string + WorkerID string + LeaseID string +} + +type ResolvedResourceSecret struct { + Descriptor ResourceSecretDescriptor + Payload json.RawMessage +} + +func NewResourceSecretStore(db postgresplatform.DBTX, encryptor *Encryptor) *ResourceSecretStore { + return &ResourceSecretStore{db: db, encryptor: encryptor, now: time.Now} +} + +func (s *ResourceSecretStore) WithDB(db postgresplatform.DBTX) *ResourceSecretStore { + if s == nil { + return nil + } + return &ResourceSecretStore{db: db, encryptor: s.encryptor, now: s.now} +} + +func DefaultResourceSecretRef(organizationID, resourceID string) string { + return "rap-secret://org/" + strings.TrimSpace(organizationID) + "/resources/" + strings.TrimSpace(resourceID) + "/primary" +} + +func (s *ResourceSecretStore) Upsert(ctx context.Context, cmd UpsertResourceSecretCommand) (*ResourceSecretDescriptor, error) { + if s == nil || s.encryptor == nil { + return nil, ErrSecretEncryptionKeyMissing + } + payload, err := normalizeJSONObject(cmd.Payload) + if err != nil { + return nil, err + } + 
metadata, err := normalizeJSONObjectAllowEmpty(cmd.Metadata) + if err != nil { + return nil, err + } + secretRef := strings.TrimSpace(cmd.SecretRef) + if secretRef == "" { + secretRef = DefaultResourceSecretRef(cmd.OrganizationID, cmd.ResourceID) + } + protocol := strings.ToLower(strings.TrimSpace(cmd.Protocol)) + encrypted, err := s.encryptor.Encrypt(payload, ResourceSecretAAD(cmd.OrganizationID, cmd.ResourceID, secretRef, protocol)) + if err != nil { + return nil, err + } + now := s.now().UTC() + const query = ` +INSERT INTO resource_secrets ( + organization_id, resource_id, secret_ref, protocol, version, key_id, + algorithm, nonce, ciphertext, payload_sha256, metadata, created_by_user_id, + created_at, rotated_at +) VALUES ( + $1::uuid, $2::uuid, $3, $4, 1, $5, + $6, $7, $8, $9, $10::jsonb, NULLIF($11, '')::uuid, + $12, NULL +) +ON CONFLICT (resource_id) DO UPDATE SET + secret_ref = EXCLUDED.secret_ref, + protocol = EXCLUDED.protocol, + version = resource_secrets.version + 1, + key_id = EXCLUDED.key_id, + algorithm = EXCLUDED.algorithm, + nonce = EXCLUDED.nonce, + ciphertext = EXCLUDED.ciphertext, + payload_sha256 = EXCLUDED.payload_sha256, + metadata = EXCLUDED.metadata, + created_by_user_id = EXCLUDED.created_by_user_id, + rotated_at = EXCLUDED.created_at +RETURNING id::text, organization_id::text, resource_id::text, secret_ref, + protocol, version, key_id, algorithm, metadata, created_at, rotated_at +` + var descriptor ResourceSecretDescriptor + if err := s.db.QueryRow(ctx, query, + cmd.OrganizationID, + cmd.ResourceID, + secretRef, + protocol, + encrypted.KeyID, + encrypted.Algorithm, + encrypted.Nonce, + encrypted.Ciphertext, + encrypted.PayloadSHA256, + metadata, + cmd.ActorUserID, + now, + ).Scan( + &descriptor.ID, + &descriptor.OrganizationID, + &descriptor.ResourceID, + &descriptor.SecretRef, + &descriptor.Protocol, + &descriptor.Version, + &descriptor.KeyID, + &descriptor.Algorithm, + &descriptor.Metadata, + &descriptor.CreatedAt, + 
&descriptor.RotatedAt, + ); err != nil { + return nil, fmt.Errorf("upsert resource secret: %w", err) + } + return &descriptor, nil +} + +func (s *ResourceSecretStore) ResolveForSession(ctx context.Context, req ResolveResourceSecretRequest) (*ResolvedResourceSecret, error) { + if s == nil || s.encryptor == nil { + return nil, ErrSecretEncryptionKeyMissing + } + if strings.TrimSpace(req.LeaseID) == "" { + return nil, ErrSecretLeaseRequired + } + const query = ` +SELECT sec.id::text, sec.organization_id::text, sec.resource_id::text, sec.secret_ref, + sec.protocol, sec.version, sec.key_id, sec.algorithm, sec.metadata, + sec.created_at, sec.rotated_at, sec.nonce, sec.ciphertext, + rs.organization_id::text, rs.resource_id::text, COALESCE(rs.worker_id, ''), rs.state +FROM resource_secrets sec +JOIN remote_sessions rs ON rs.resource_id = sec.resource_id +WHERE sec.secret_ref = $1 AND rs.id = $2::uuid +` + var descriptor ResourceSecretDescriptor + var nonce, ciphertext []byte + var sessionOrganizationID, sessionResourceID, sessionWorkerID, sessionState string + if err := s.db.QueryRow(ctx, query, req.SecretRef, req.SessionID).Scan( + &descriptor.ID, + &descriptor.OrganizationID, + &descriptor.ResourceID, + &descriptor.SecretRef, + &descriptor.Protocol, + &descriptor.Version, + &descriptor.KeyID, + &descriptor.Algorithm, + &descriptor.Metadata, + &descriptor.CreatedAt, + &descriptor.RotatedAt, + &nonce, + &ciphertext, + &sessionOrganizationID, + &sessionResourceID, + &sessionWorkerID, + &sessionState, + ); err != nil { + if errors.Is(err, pgx.ErrNoRows) { + return nil, ErrResourceSecretNotFound + } + return nil, fmt.Errorf("resolve resource secret: %w", err) + } + if descriptor.OrganizationID != req.OrganizationID || + descriptor.ResourceID != req.ResourceID || + sessionOrganizationID != req.OrganizationID || + sessionResourceID != req.ResourceID || + sessionWorkerID != req.WorkerID || + !secretResolvableSessionState(sessionState) { + return nil, ErrSecretAccessDenied + } + 
plaintext, err := s.encryptor.Decrypt(EncryptedPayload{ + Algorithm: descriptor.Algorithm, + KeyID: descriptor.KeyID, + Nonce: nonce, + Ciphertext: ciphertext, + }, ResourceSecretAAD(descriptor.OrganizationID, descriptor.ResourceID, descriptor.SecretRef, descriptor.Protocol)) + if err != nil { + return nil, err + } + return &ResolvedResourceSecret{ + Descriptor: descriptor, + Payload: json.RawMessage(plaintext), + }, nil +} + +func normalizeJSONObject(raw json.RawMessage) (json.RawMessage, error) { + if len(raw) == 0 || !json.Valid(raw) { + return nil, ErrSecretPayloadInvalid + } + var decoded map[string]any + if err := json.Unmarshal(raw, &decoded); err != nil { + return nil, ErrSecretPayloadInvalid + } + encoded, err := json.Marshal(decoded) + if err != nil { + return nil, err + } + return json.RawMessage(encoded), nil +} + +func normalizeJSONObjectAllowEmpty(raw json.RawMessage) (json.RawMessage, error) { + if len(raw) == 0 { + return json.RawMessage(`{}`), nil + } + return normalizeJSONObject(raw) +} + +func secretResolvableSessionState(state string) bool { + switch state { + case "starting", "active", "reconnecting": + return true + default: + return false + } +} diff --git a/backend/migrations/000001_init.down.sql b/backend/migrations/000001_init.down.sql new file mode 100644 index 0000000..ebc884f --- /dev/null +++ b/backend/migrations/000001_init.down.sql @@ -0,0 +1,6 @@ +DROP TABLE IF EXISTS audit_logs; +DROP TABLE IF EXISTS secrets; +DROP TABLE IF EXISTS sessions; +DROP TABLE IF EXISTS resources; +DROP TABLE IF EXISTS devices; +DROP TABLE IF EXISTS users; diff --git a/backend/migrations/000001_init.up.sql b/backend/migrations/000001_init.up.sql new file mode 100644 index 0000000..59c454b --- /dev/null +++ b/backend/migrations/000001_init.up.sql @@ -0,0 +1,65 @@ +CREATE EXTENSION IF NOT EXISTS "pgcrypto"; + +CREATE TABLE IF NOT EXISTS users ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + email TEXT NOT NULL UNIQUE, + password_hash TEXT NOT NULL, + 
mfa_enabled BOOLEAN NOT NULL DEFAULT FALSE, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS devices ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, + device_fingerprint TEXT NOT NULL, + trusted_at TIMESTAMPTZ, + last_seen_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS resources ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + name TEXT NOT NULL, + address TEXT NOT NULL, + protocol TEXT NOT NULL, + secret_ref TEXT, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS sessions ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + resource_id UUID NOT NULL REFERENCES resources(id) ON DELETE RESTRICT, + controller_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + worker_id TEXT, + state TEXT NOT NULL, + detached_until TIMESTAMPTZ, + last_heartbeat_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS secrets ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + scope TEXT NOT NULL, + encrypted_payload BYTEA NOT NULL, + key_version TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS audit_logs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + actor_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + action TEXT NOT NULL, + target_type TEXT NOT NULL, + target_id TEXT NOT NULL, + payload JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_devices_user_id ON devices(user_id); +CREATE INDEX IF NOT EXISTS idx_sessions_resource_state ON sessions(resource_id, state); +CREATE INDEX IF NOT 
EXISTS idx_audit_logs_created_at ON audit_logs(created_at DESC); diff --git a/backend/migrations/000002_auth_foundation.down.sql b/backend/migrations/000002_auth_foundation.down.sql new file mode 100644 index 0000000..4c88d60 --- /dev/null +++ b/backend/migrations/000002_auth_foundation.down.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS auth_sessions; +DROP INDEX IF EXISTS idx_devices_user_fingerprint; + +ALTER TABLE devices + DROP COLUMN IF EXISTS updated_at, + DROP COLUMN IF EXISTS revoked_reason, + DROP COLUMN IF EXISTS revoked_at, + DROP COLUMN IF EXISTS trust_status, + DROP COLUMN IF EXISTS device_label; diff --git a/backend/migrations/000002_auth_foundation.up.sql b/backend/migrations/000002_auth_foundation.up.sql new file mode 100644 index 0000000..0226a6f --- /dev/null +++ b/backend/migrations/000002_auth_foundation.up.sql @@ -0,0 +1,32 @@ +ALTER TABLE devices + ADD COLUMN IF NOT EXISTS device_label TEXT, + ADD COLUMN IF NOT EXISTS trust_status TEXT NOT NULL DEFAULT 'pending', + ADD COLUMN IF NOT EXISTS revoked_at TIMESTAMPTZ, + ADD COLUMN IF NOT EXISTS revoked_reason TEXT, + ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_devices_user_fingerprint + ON devices(user_id, device_fingerprint); + +CREATE TABLE IF NOT EXISTS auth_sessions ( + id UUID PRIMARY KEY, + user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, + device_id UUID NOT NULL REFERENCES devices(id) ON DELETE RESTRICT, + refresh_token_hash TEXT NOT NULL, + refresh_expires_at TIMESTAMPTZ NOT NULL, + last_seen_at TIMESTAMPTZ, + last_rotated_at TIMESTAMPTZ, + revoked_at TIMESTAMPTZ, + revoked_reason TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_auth_sessions_user_id + ON auth_sessions(user_id); + +CREATE INDEX IF NOT EXISTS idx_auth_sessions_device_id + ON auth_sessions(device_id); + +CREATE INDEX IF NOT EXISTS 
idx_auth_sessions_revoked_at + ON auth_sessions(revoked_at); diff --git a/backend/migrations/000003_session_storage_foundation.down.sql b/backend/migrations/000003_session_storage_foundation.down.sql new file mode 100644 index 0000000..ed03924 --- /dev/null +++ b/backend/migrations/000003_session_storage_foundation.down.sql @@ -0,0 +1,4 @@ +DROP TABLE IF EXISTS audit_events; +DROP TABLE IF EXISTS session_attachments; +DROP TABLE IF EXISTS remote_sessions; +DROP TABLE IF EXISTS resource_policies; diff --git a/backend/migrations/000003_session_storage_foundation.up.sql b/backend/migrations/000003_session_storage_foundation.up.sql new file mode 100644 index 0000000..c051105 --- /dev/null +++ b/backend/migrations/000003_session_storage_foundation.up.sql @@ -0,0 +1,79 @@ +CREATE TABLE IF NOT EXISTS resource_policies ( + resource_id UUID PRIMARY KEY REFERENCES resources(id) ON DELETE CASCADE, + max_concurrent_sessions INTEGER NOT NULL DEFAULT 1, + takeover_policy TEXT NOT NULL DEFAULT 'trusted_device', + require_trusted_device BOOLEAN NOT NULL DEFAULT TRUE, + detach_grace_period_seconds INTEGER NOT NULL DEFAULT 1800, + clipboard_enabled BOOLEAN NOT NULL DEFAULT TRUE, + file_transfer_enabled BOOLEAN NOT NULL DEFAULT TRUE, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE TABLE IF NOT EXISTS remote_sessions ( + id UUID PRIMARY KEY, + resource_id UUID NOT NULL REFERENCES resources(id) ON DELETE RESTRICT, + protocol TEXT NOT NULL, + state TEXT NOT NULL, + worker_id TEXT, + controller_user_id UUID NOT NULL REFERENCES users(id) ON DELETE RESTRICT, + detach_deadline_at TIMESTAMPTZ, + last_heartbeat_at TIMESTAMPTZ, + takeover_version INTEGER NOT NULL DEFAULT 1, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_remote_sessions_resource_id + ON remote_sessions(resource_id); + +CREATE INDEX 
IF NOT EXISTS idx_remote_sessions_controller_user_id + ON remote_sessions(controller_user_id); + +CREATE INDEX IF NOT EXISTS idx_remote_sessions_state + ON remote_sessions(state); + +CREATE TABLE IF NOT EXISTS session_attachments ( + id UUID PRIMARY KEY, + remote_session_id UUID NOT NULL REFERENCES remote_sessions(id) ON DELETE CASCADE, + user_id UUID NOT NULL REFERENCES users(id) ON DELETE RESTRICT, + device_id UUID NOT NULL REFERENCES devices(id) ON DELETE RESTRICT, + role TEXT NOT NULL, + state TEXT NOT NULL, + superseded_by UUID REFERENCES session_attachments(id) ON DELETE SET NULL, + takeover_of UUID REFERENCES session_attachments(id) ON DELETE SET NULL, + attached_at TIMESTAMPTZ, + detached_at TIMESTAMPTZ, + last_input_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_session_attachments_remote_session_id + ON session_attachments(remote_session_id); + +CREATE INDEX IF NOT EXISTS idx_session_attachments_user_id + ON session_attachments(user_id); + +CREATE INDEX IF NOT EXISTS idx_session_attachments_state + ON session_attachments(state); + +CREATE TABLE IF NOT EXISTS audit_events ( + id UUID PRIMARY KEY, + actor_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + actor_device_id UUID REFERENCES devices(id) ON DELETE SET NULL, + event_type TEXT NOT NULL, + target_type TEXT NOT NULL, + target_id TEXT NOT NULL, + remote_session_id UUID REFERENCES remote_sessions(id) ON DELETE SET NULL, + payload JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_audit_events_created_at + ON audit_events(created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_audit_events_remote_session_id + ON audit_events(remote_session_id); diff --git a/backend/migrations/000004_resource_certificate_verification_mode.down.sql 
b/backend/migrations/000004_resource_certificate_verification_mode.down.sql new file mode 100644 index 0000000..487c321 --- /dev/null +++ b/backend/migrations/000004_resource_certificate_verification_mode.down.sql @@ -0,0 +1,5 @@ +ALTER TABLE resources + DROP CONSTRAINT IF EXISTS resources_certificate_verification_mode_check; /* constraint is dropped before the column it checks */ + +ALTER TABLE resources + DROP COLUMN IF EXISTS certificate_verification_mode; diff --git a/backend/migrations/000004_resource_certificate_verification_mode.up.sql b/backend/migrations/000004_resource_certificate_verification_mode.up.sql new file mode 100644 index 0000000..7b83204 --- /dev/null +++ b/backend/migrations/000004_resource_certificate_verification_mode.up.sql @@ -0,0 +1,9 @@ +ALTER TABLE resources + ADD COLUMN IF NOT EXISTS certificate_verification_mode TEXT NOT NULL DEFAULT 'strict'; /* IF NOT EXISTS keeps a re-run from failing on an existing column; existing rows take the strict default */ + +ALTER TABLE resources + DROP CONSTRAINT IF EXISTS resources_certificate_verification_mode_check; /* dropped first so the ADD below is re-runnable, same pattern as the 000007 and 000008 up migrations */ + +ALTER TABLE resources + ADD CONSTRAINT resources_certificate_verification_mode_check + CHECK (certificate_verification_mode IN ('strict', 'ignore')); diff --git a/backend/migrations/000005_platform_core_v2.down.sql b/backend/migrations/000005_platform_core_v2.down.sql new file mode 100644 index 0000000..4f32444 --- /dev/null +++ b/backend/migrations/000005_platform_core_v2.down.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS node_agent_update_runs; /* teardown order: node tables, identity tables, the organization_id columns, then the organization tables they reference */ +DROP TABLE IF EXISTS node_partition_states; +DROP TABLE IF EXISTS node_update_policies; +DROP TABLE IF EXISTS node_services; +DROP TABLE IF EXISTS node_capabilities; +DROP TABLE IF EXISTS nodes; +DROP TABLE IF EXISTS identity_mappings; +DROP TABLE IF EXISTS identity_sources; + +DROP INDEX IF EXISTS idx_remote_sessions_organization_id; +ALTER TABLE remote_sessions + DROP COLUMN IF EXISTS organization_id; + +DROP INDEX IF EXISTS idx_resources_organization_id; +ALTER TABLE resources + DROP COLUMN IF EXISTS organization_id; + +DROP TABLE IF EXISTS organization_memberships; +DROP TABLE IF EXISTS organization_roles; +DROP TABLE IF EXISTS organizations; + +ALTER TABLE users + DROP CONSTRAINT IF EXISTS users_platform_role_check;
+ +ALTER TABLE users + DROP COLUMN IF EXISTS platform_role; diff --git a/backend/migrations/000005_platform_core_v2.up.sql b/backend/migrations/000005_platform_core_v2.up.sql new file mode 100644 index 0000000..57bf5b6 --- /dev/null +++ b/backend/migrations/000005_platform_core_v2.up.sql @@ -0,0 +1,222 @@ +ALTER TABLE users + ADD COLUMN IF NOT EXISTS platform_role TEXT NOT NULL DEFAULT 'user'; /* existing users take the user default */ + +ALTER TABLE users + DROP CONSTRAINT IF EXISTS users_platform_role_check; /* every other statement in this file is re-runnable; without this drop a second run fails on the ADD CONSTRAINT below */ + +ALTER TABLE users + ADD CONSTRAINT users_platform_role_check + CHECK (platform_role IN ('user', 'platform_admin', 'platform_recovery_admin')); + +CREATE TABLE IF NOT EXISTS organizations ( /* slug is the unique natural key used by the later default-organization seed and backfills */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + slug TEXT NOT NULL UNIQUE, + name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT organizations_status_check + CHECK (status IN ('active', 'suspended', 'archived')) +); + +CREATE TABLE IF NOT EXISTS organization_roles ( /* role catalogue keyed by a textual id; scope currently admits only the organization value */ + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + scope TEXT NOT NULL DEFAULT 'organization', + description TEXT NOT NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT organization_roles_scope_check + CHECK (scope IN ('organization')) +); + +INSERT INTO organization_roles (id, name, scope, description) +VALUES + ('org_owner', 'Organization Owner', 'organization', 'Full organization control including membership administration.'), + ('org_admin', 'Organization Admin', 'organization', 'Administrative access within one organization.'), + ('org_operator', 'Organization Operator', 'organization', 'Operational access within one organization.'), + ('org_member', 'Organization Member', 'organization', 'Standard organization member access.'), + ('org_viewer', 'Organization Viewer', 'organization', 'Read-only organization visibility.') +ON CONFLICT (id) DO UPDATE SET /* upsert: a re-run refreshes name and description of the built-in roles but leaves scope untouched */ + name = EXCLUDED.name, + description = EXCLUDED.description; + +CREATE TABLE IF NOT
EXISTS organization_memberships ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + organization_id UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE, + user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, + role_id TEXT NOT NULL REFERENCES organization_roles(id) ON DELETE RESTRICT, /* a role cannot be deleted while any membership still uses it */ + status TEXT NOT NULL DEFAULT 'active', + invited_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (organization_id, user_id), /* at most one membership row per user per organization */ + CONSTRAINT organization_memberships_status_check + CHECK (status IN ('active', 'suspended', 'revoked')) +); + +CREATE INDEX IF NOT EXISTS idx_organization_memberships_user_id + ON organization_memberships(user_id); + +INSERT INTO organizations (slug, name, status, metadata) +VALUES ('default', 'Default Organization', 'active', '{"bootstrap":true}'::jsonb) +ON CONFLICT (slug) DO NOTHING; /* seeds the default organization once; an existing row with that slug is left as is */ + +ALTER TABLE resources + ADD COLUMN IF NOT EXISTS organization_id UUID REFERENCES organizations(id) ON DELETE RESTRICT; /* added nullable first, backfilled below, then tightened to NOT NULL */ + +UPDATE resources +SET organization_id = organizations.id +FROM organizations +WHERE organizations.slug = 'default' + AND resources.organization_id IS NULL; /* only rows without an organization are assigned to the default one */ + +ALTER TABLE resources + ALTER COLUMN organization_id SET NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_resources_organization_id + ON resources(organization_id); + +ALTER TABLE remote_sessions + ADD COLUMN IF NOT EXISTS organization_id UUID REFERENCES organizations(id) ON DELETE RESTRICT; + +UPDATE remote_sessions +SET organization_id = resources.organization_id +FROM resources +WHERE resources.id = remote_sessions.resource_id + AND remote_sessions.organization_id IS NULL; /* sessions inherit the organization of the resource they reference */ + +ALTER TABLE remote_sessions + ALTER COLUMN organization_id SET NOT NULL; + +CREATE INDEX IF NOT EXISTS idx_remote_sessions_organization_id + ON remote_sessions(organization_id); + +CREATE TABLE IF NOT EXISTS identity_sources ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + organization_id
UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE, + kind TEXT NOT NULL, + name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + config JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT identity_sources_kind_check + CHECK (kind IN ('local', 'ldap_ad', 'oidc')), + CONSTRAINT identity_sources_status_check + CHECK (status IN ('active', 'disabled')) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_identity_sources_org_name + ON identity_sources(organization_id, name); /* source names are unique within one organization, not globally */ + +CREATE TABLE IF NOT EXISTS identity_mappings ( /* mappings are deleted together with their identity source */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + identity_source_id UUID NOT NULL REFERENCES identity_sources(id) ON DELETE CASCADE, + mapping_type TEXT NOT NULL, + external_selector JSONB NOT NULL DEFAULT '{}'::JSONB, + internal_target JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT identity_mappings_type_check + CHECK (mapping_type IN ('group_binding', 'claim_binding', 'user_binding')) +); + +CREATE INDEX IF NOT EXISTS idx_identity_mappings_source_id + ON identity_mappings(identity_source_id); + +CREATE TABLE IF NOT EXISTS nodes ( /* owner organization is optional and is nulled, not cascaded, when that organization is deleted */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + owner_organization_id UUID REFERENCES organizations(id) ON DELETE SET NULL, + node_key TEXT NOT NULL UNIQUE, + name TEXT NOT NULL, + ownership_type TEXT NOT NULL, + registration_status TEXT NOT NULL DEFAULT 'pending', + health_status TEXT NOT NULL DEFAULT 'unknown', + version_state TEXT NOT NULL DEFAULT 'unknown', + partition_state TEXT NOT NULL DEFAULT 'healthy', + desired_version TEXT, + reported_version TEXT, + last_seen_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT nodes_ownership_type_check + CHECK (ownership_type IN ('platform_managed',
'customer_managed')), + CONSTRAINT nodes_registration_status_check + CHECK (registration_status IN ('pending', 'active', 'disabled', 'revoked')), + CONSTRAINT nodes_health_status_check + CHECK (health_status IN ('unknown', 'healthy', 'warning', 'critical')), + CONSTRAINT nodes_version_state_check + CHECK (version_state IN ('unknown', 'current', 'outdated', 'updating', 'rollback', 'failed')), + CONSTRAINT nodes_partition_state_check + CHECK (partition_state IN ('healthy', 'degraded', 'recovery', 'isolated')) +); + +CREATE TABLE IF NOT EXISTS node_capabilities ( /* one JSONB value per (node, capability) pair; rows are removed with their node */ + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + capability TEXT NOT NULL, + value JSONB NOT NULL DEFAULT '{}'::JSONB, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (node_id, capability) +); + +CREATE TABLE IF NOT EXISTS node_services ( /* one row per (node, service_type); desired_state and reported_state are tracked in separate columns with separate value sets */ + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + service_type TEXT NOT NULL, + enabled BOOLEAN NOT NULL DEFAULT FALSE, + desired_state TEXT NOT NULL DEFAULT 'disabled', + reported_state TEXT NOT NULL DEFAULT 'unknown', + last_reported_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (node_id, service_type), + CONSTRAINT node_services_desired_state_check + CHECK (desired_state IN ('enabled', 'disabled', 'drain')), + CONSTRAINT node_services_reported_state_check + CHECK (reported_state IN ('unknown', 'starting', 'running', 'degraded', 'stopped', 'failed')) +); + +CREATE TABLE IF NOT EXISTS node_update_policies ( /* node_id is the primary key, so each node has at most one update policy */ + node_id UUID PRIMARY KEY REFERENCES nodes(id) ON DELETE CASCADE, + mode TEXT NOT NULL DEFAULT 'manual', + channel TEXT NOT NULL DEFAULT 'stable', + maintenance_window JSONB NOT NULL DEFAULT '{}'::JSONB, + canary BOOLEAN NOT NULL DEFAULT FALSE, + automatic_rollout BOOLEAN NOT NULL DEFAULT FALSE, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT node_update_policies_mode_check + CHECK
(mode IN ('manual', 'automatic', 'staged', 'canary')) +); + +CREATE TABLE IF NOT EXISTS node_partition_states ( /* at most one partition-state row per node; cluster_state admits the same four values as nodes.partition_state */ + node_id UUID PRIMARY KEY REFERENCES nodes(id) ON DELETE CASCADE, + cluster_state TEXT NOT NULL DEFAULT 'healthy', + recovery_mode TEXT NOT NULL DEFAULT 'normal', + notes TEXT, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT node_partition_states_cluster_state_check + CHECK (cluster_state IN ('healthy', 'degraded', 'recovery', 'isolated')), + CONSTRAINT node_partition_states_recovery_mode_check + CHECK (recovery_mode IN ('normal', 'manual_recovery', 'emergency')) +); + +CREATE TABLE IF NOT EXISTS node_agent_update_runs ( /* log of update and rollback requests per node, with nullable acknowledged_at and completed_at timestamps */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + action TEXT NOT NULL, + target_version TEXT, + status TEXT NOT NULL DEFAULT 'pending', + requested_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + acknowledged_at TIMESTAMPTZ, + completed_at TIMESTAMPTZ, + payload JSONB NOT NULL DEFAULT '{}'::JSONB, + CONSTRAINT node_agent_update_runs_action_check + CHECK (action IN ('update', 'rollback')), + CONSTRAINT node_agent_update_runs_status_check + CHECK (status IN ('pending', 'acknowledged', 'succeeded', 'failed')) +); + +CREATE INDEX IF NOT EXISTS idx_node_agent_update_runs_node_id + ON node_agent_update_runs(node_id, requested_at DESC); diff --git a/backend/migrations/000006_default_org_memberships_backfill.down.sql b/backend/migrations/000006_default_org_memberships_backfill.down.sql new file mode 100644 index 0000000..8f36e49 --- /dev/null +++ b/backend/migrations/000006_default_org_memberships_backfill.down.sql @@ -0,0 +1,7 @@ +DELETE FROM organization_memberships +WHERE organization_id IN ( + SELECT id + FROM organizations + WHERE slug = 'default' +) +AND invited_by_user_id IS NULL; /* NOTE(review): this matches every default-organization membership with a NULL inviter, not only rows written by the up migration; confirm no other code path creates such rows */ diff --git a/backend/migrations/000006_default_org_memberships_backfill.up.sql b/backend/migrations/000006_default_org_memberships_backfill.up.sql new file mode 100644 index
0000000..a85dde1 --- /dev/null +++ b/backend/migrations/000006_default_org_memberships_backfill.up.sql @@ -0,0 +1,29 @@ +INSERT INTO organization_memberships ( + id, + organization_id, + user_id, + role_id, + status, + invited_by_user_id, + created_at, + updated_at +) +SELECT + gen_random_uuid(), + org.id, + u.id, + CASE + WHEN u.platform_role IN ('platform_admin', 'platform_recovery_admin') THEN 'org_owner' /* platform-level admins become owners of the default organization, everyone else a plain member */ + ELSE 'org_member' + END, + 'active', + NULL, + NOW(), + NOW() +FROM users u +CROSS JOIN organizations org +LEFT JOIN organization_memberships om + ON om.organization_id = org.id + AND om.user_id = u.id +WHERE org.slug = 'default' + AND om.id IS NULL; /* anti-join: only users with no existing default-organization membership get a row */ diff --git a/backend/migrations/000007_clipboard_policy_mode.down.sql b/backend/migrations/000007_clipboard_policy_mode.down.sql new file mode 100644 index 0000000..ed51872 --- /dev/null +++ b/backend/migrations/000007_clipboard_policy_mode.down.sql @@ -0,0 +1,5 @@ +ALTER TABLE resource_policies + DROP CONSTRAINT IF EXISTS resource_policies_clipboard_mode_check; + +ALTER TABLE resource_policies + DROP COLUMN IF EXISTS clipboard_mode; /* the legacy clipboard_enabled column is not touched by either direction of this migration */ diff --git a/backend/migrations/000007_clipboard_policy_mode.up.sql b/backend/migrations/000007_clipboard_policy_mode.up.sql new file mode 100644 index 0000000..2923e2b --- /dev/null +++ b/backend/migrations/000007_clipboard_policy_mode.up.sql @@ -0,0 +1,16 @@ +ALTER TABLE resource_policies + ADD COLUMN IF NOT EXISTS clipboard_mode TEXT NOT NULL DEFAULT 'disabled'; + +UPDATE resource_policies +SET clipboard_mode = CASE /* seeds the new mode from the legacy boolean: enabled maps to bidirectional */ + WHEN clipboard_enabled THEN 'bidirectional' + ELSE 'disabled' +END +WHERE clipboard_mode = 'disabled'; /* rows already moved off the disabled default are left alone */ + +ALTER TABLE resource_policies + DROP CONSTRAINT IF EXISTS resource_policies_clipboard_mode_check; /* drop-then-add keeps the constraint step re-runnable */ + +ALTER TABLE resource_policies + ADD CONSTRAINT resource_policies_clipboard_mode_check + CHECK (clipboard_mode IN ('disabled', 'client_to_server', 'server_to_client', 'bidirectional')); diff --git a/backend/migrations/000008_file_transfer_policy_mode.down.sql
b/backend/migrations/000008_file_transfer_policy_mode.down.sql new file mode 100644 index 0000000..043fa7d --- /dev/null +++ b/backend/migrations/000008_file_transfer_policy_mode.down.sql @@ -0,0 +1,5 @@ +ALTER TABLE resource_policies + DROP CONSTRAINT IF EXISTS resource_policies_file_transfer_mode_check; + +ALTER TABLE resource_policies + DROP COLUMN IF EXISTS file_transfer_mode; /* NOTE(review): file_transfer_enabled, which the up migration overwrites, is not restored here */ diff --git a/backend/migrations/000008_file_transfer_policy_mode.up.sql b/backend/migrations/000008_file_transfer_policy_mode.up.sql new file mode 100644 index 0000000..0915c0c --- /dev/null +++ b/backend/migrations/000008_file_transfer_policy_mode.up.sql @@ -0,0 +1,16 @@ +ALTER TABLE resource_policies + ADD COLUMN IF NOT EXISTS file_transfer_mode TEXT NOT NULL DEFAULT 'disabled'; + +UPDATE resource_policies +SET file_transfer_mode = 'disabled', + file_transfer_enabled = FALSE /* unlike the clipboard migration, the legacy flag is not consulted: it is forced to FALSE on every row whose mode is still disabled */ +WHERE file_transfer_mode IS NULL + OR file_transfer_mode = '' + OR file_transfer_mode = 'disabled'; /* NOTE(review): when the column was just added above as NOT NULL DEFAULT disabled, this predicate matches every row */ + +ALTER TABLE resource_policies + DROP CONSTRAINT IF EXISTS resource_policies_file_transfer_mode_check; + +ALTER TABLE resource_policies + ADD CONSTRAINT resource_policies_file_transfer_mode_check + CHECK (file_transfer_mode IN ('disabled', 'client_to_server', 'server_to_client', 'bidirectional')); diff --git a/backend/migrations/000009_resource_secrets.down.sql b/backend/migrations/000009_resource_secrets.down.sql new file mode 100644 index 0000000..0e502d6 --- /dev/null +++ b/backend/migrations/000009_resource_secrets.down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS resource_secrets; /* dropping the table also removes the indexes created on it by the up migration */ diff --git a/backend/migrations/000009_resource_secrets.up.sql b/backend/migrations/000009_resource_secrets.up.sql new file mode 100644 index 0000000..53427bb --- /dev/null +++ b/backend/migrations/000009_resource_secrets.up.sql @@ -0,0 +1,27 @@ +CREATE TABLE IF NOT EXISTS resource_secrets ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + organization_id UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE, + resource_id UUID
NOT NULL REFERENCES resources(id) ON DELETE CASCADE, + secret_ref TEXT NOT NULL UNIQUE, + protocol TEXT NOT NULL, + version INTEGER NOT NULL DEFAULT 1, + key_id TEXT NOT NULL, + algorithm TEXT NOT NULL DEFAULT 'AES-256-GCM', + nonce BYTEA NOT NULL, /* the table stores nonce, ciphertext and a key_id reference, with no plaintext column */ + ciphertext BYTEA NOT NULL, + payload_sha256 TEXT NOT NULL, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + rotated_at TIMESTAMPTZ, + UNIQUE (resource_id) /* at most one secret row per resource */ +); + +CREATE INDEX IF NOT EXISTS idx_resource_secrets_organization_id + ON resource_secrets(organization_id); + +CREATE INDEX IF NOT EXISTS idx_resource_secrets_resource_id + ON resource_secrets(resource_id); /* NOTE(review): resource_id already has a UNIQUE constraint above, which PostgreSQL backs with its own index; this one looks redundant */ + +CREATE INDEX IF NOT EXISTS idx_resource_secrets_secret_ref + ON resource_secrets(secret_ref); /* NOTE(review): secret_ref is declared UNIQUE above, so this index looks redundant as well */ diff --git a/backend/migrations/000010_cluster_node_admin_foundation.down.sql b/backend/migrations/000010_cluster_node_admin_foundation.down.sql new file mode 100644 index 0000000..a5cf354 --- /dev/null +++ b/backend/migrations/000010_cluster_node_admin_foundation.down.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS cluster_audit_events; /* exact reverse of the creation order in the up migration; clusters goes last because the other tables reference it */ +DROP TABLE IF EXISTS node_latest_heartbeats; +DROP TABLE IF EXISTS node_heartbeats; +DROP TABLE IF EXISTS node_role_assignments; +DROP TABLE IF EXISTS node_join_requests; +DROP TABLE IF EXISTS node_join_tokens; +DROP TABLE IF EXISTS node_identities; +DROP TABLE IF EXISTS cluster_memberships; +DROP TABLE IF EXISTS clusters; diff --git a/backend/migrations/000010_cluster_node_admin_foundation.up.sql b/backend/migrations/000010_cluster_node_admin_foundation.up.sql new file mode 100644 index 0000000..97f0458 --- /dev/null +++ b/backend/migrations/000010_cluster_node_admin_foundation.up.sql @@ -0,0 +1,186 @@ +CREATE TABLE IF NOT EXISTS clusters ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + slug TEXT NOT NULL UNIQUE, + name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + region TEXT, + metadata JSONB NOT NULL
DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT clusters_status_check + CHECK (status IN ('active', 'disabled', 'archived', 'degraded')) +); + +INSERT INTO clusters (slug, name, status, region, metadata) +VALUES ('default', 'Default Cluster', 'active', NULL, '{"bootstrap":true}'::jsonb) +ON CONFLICT (slug) DO NOTHING; /* seeds the default cluster once; an existing row with that slug is left as is */ + +CREATE TABLE IF NOT EXISTS cluster_memberships ( /* join table between clusters and nodes; a node can appear once per cluster */ + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + membership_status TEXT NOT NULL DEFAULT 'active', + joined_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + last_seen_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + PRIMARY KEY (cluster_id, node_id), + CONSTRAINT cluster_memberships_status_check + CHECK (membership_status IN ('active', 'draining', 'disabled', 'revoked')) +); + +INSERT INTO cluster_memberships (cluster_id, node_id, membership_status, joined_at, last_seen_at, metadata) +SELECT c.id, n.id, 'active', COALESCE(n.created_at, NOW()), n.last_seen_at, '{"backfilled":true}'::jsonb +FROM clusters c +CROSS JOIN nodes n +WHERE c.slug = 'default' +ON CONFLICT (cluster_id, node_id) DO NOTHING; /* backfill: every node that already exists is enrolled as active in the default cluster and tagged backfilled in metadata */ + +CREATE TABLE IF NOT EXISTS node_identities ( /* node_id is the primary key, so each node has at most one identity row */ + node_id UUID PRIMARY KEY REFERENCES nodes(id) ON DELETE CASCADE, + public_key TEXT NOT NULL, + certificate_serial TEXT, + certificate_not_before TIMESTAMPTZ, + certificate_not_after TIMESTAMPTZ, + identity_status TEXT NOT NULL DEFAULT 'pending', + rotated_at TIMESTAMPTZ, + revoked_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT node_identities_status_check + CHECK (identity_status IN ('pending', 'active', 'rotating', 'revoked')) +); + +CREATE TABLE IF NOT EXISTS node_join_tokens ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL
REFERENCES clusters(id) ON DELETE CASCADE, + token_hash TEXT NOT NULL UNIQUE, /* the column holds a hash and is unique across all clusters */ + scope JSONB NOT NULL DEFAULT '{}'::JSONB, + expires_at TIMESTAMPTZ NOT NULL, + max_uses INTEGER NOT NULL DEFAULT 1, + used_count INTEGER NOT NULL DEFAULT 0, + status TEXT NOT NULL DEFAULT 'active', + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + revoked_at TIMESTAMPTZ, + CONSTRAINT node_join_tokens_status_check + CHECK (status IN ('active', 'revoked', 'expired')), + CONSTRAINT node_join_tokens_max_uses_check + CHECK (max_uses > 0), + CONSTRAINT node_join_tokens_used_count_check + CHECK (used_count >= 0) /* NOTE(review): no constraint here keeps used_count from exceeding max_uses; presumably the application enforces it, confirm */ +); + +CREATE INDEX IF NOT EXISTS idx_node_join_tokens_cluster_status + ON node_join_tokens(cluster_id, status, expires_at); + +CREATE TABLE IF NOT EXISTS node_join_requests ( /* token, reviewer and approved-node references are nullable and set to NULL on delete, so a request row survives their removal */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + join_token_id UUID REFERENCES node_join_tokens(id) ON DELETE SET NULL, + node_name TEXT NOT NULL, + node_fingerprint TEXT NOT NULL, + public_key TEXT NOT NULL, + reported_capabilities JSONB NOT NULL DEFAULT '{}'::JSONB, + reported_facts JSONB NOT NULL DEFAULT '{}'::JSONB, + requested_roles JSONB NOT NULL DEFAULT '[]'::JSONB, /* defaults to an empty JSON array, unlike the object defaults on the other JSONB columns */ + status TEXT NOT NULL DEFAULT 'pending', + reviewed_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + reviewed_at TIMESTAMPTZ, + approved_node_id UUID REFERENCES nodes(id) ON DELETE SET NULL, + rejection_reason TEXT, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT node_join_requests_status_check + CHECK (status IN ('pending', 'approved', 'rejected', 'cancelled')) +); + +CREATE INDEX IF NOT EXISTS idx_node_join_requests_cluster_status + ON node_join_requests(cluster_id, status, created_at DESC); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_node_join_requests_pending_fingerprint + ON node_join_requests(cluster_id, node_fingerprint) + WHERE
status = 'pending'; /* partial unique index: one pending request per fingerprint per cluster; non-pending rows are not constrained */ + +CREATE TABLE IF NOT EXISTS node_role_assignments ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + organization_id UUID REFERENCES organizations(id) ON DELETE CASCADE, /* nullable: an assignment may exist without an organization */ + role TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + policy JSONB NOT NULL DEFAULT '{}'::JSONB, + assigned_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + assigned_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + revoked_at TIMESTAMPTZ, + CONSTRAINT node_role_assignments_status_check + CHECK (status IN ('active', 'disabled', 'revoked')), + CONSTRAINT node_role_assignments_role_check + CHECK (role IN ( + 'entry-node', + 'relay-node', + 'core-mesh', + 'rdp-worker', + 'vnc-worker', + 'vpn-exit', + 'vpn-connector', + 'file-storage-cache', + 'update-cache', + 'video-relay' + )) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_node_role_assignments_unique_active + ON node_role_assignments(cluster_id, node_id, role, COALESCE(organization_id, '00000000-0000-0000-0000-000000000000'::uuid)) /* NULL organization_id is folded to the all-zero UUID so organization-less assignments also collide in this unique index */ + WHERE status = 'active'; + +CREATE INDEX IF NOT EXISTS idx_node_role_assignments_cluster + ON node_role_assignments(cluster_id, role, status); + +CREATE TABLE IF NOT EXISTS node_heartbeats ( /* one row per heartbeat, each with its own generated id */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + health_status TEXT NOT NULL DEFAULT 'unknown', + reported_version TEXT, + capabilities JSONB NOT NULL DEFAULT '{}'::JSONB, + service_states JSONB NOT NULL DEFAULT '{}'::JSONB, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT node_heartbeats_health_status_check + CHECK (health_status IN ('unknown', 'healthy', 'warning', 'critical')) +); + +CREATE INDEX IF NOT EXISTS idx_node_heartbeats_cluster_node_observed + ON
node_heartbeats(cluster_id, node_id, observed_at DESC); + +CREATE TABLE IF NOT EXISTS node_latest_heartbeats ( /* one row per (cluster, node) repeating the heartbeat columns; heartbeat_id is nulled if the referenced heartbeat row is deleted */ + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + heartbeat_id UUID REFERENCES node_heartbeats(id) ON DELETE SET NULL, + health_status TEXT NOT NULL DEFAULT 'unknown', + reported_version TEXT, + capabilities JSONB NOT NULL DEFAULT '{}'::JSONB, + service_states JSONB NOT NULL DEFAULT '{}'::JSONB, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (cluster_id, node_id), + CONSTRAINT node_latest_heartbeats_health_status_check + CHECK (health_status IN ('unknown', 'healthy', 'warning', 'critical')) +); + +CREATE TABLE IF NOT EXISTS cluster_audit_events ( /* cluster and actor references are nulled on delete, so audit rows survive removal of the cluster or user */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID REFERENCES clusters(id) ON DELETE SET NULL, + actor_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + event_type TEXT NOT NULL, + target_type TEXT NOT NULL, + target_id TEXT, + payload JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_cluster_audit_events_cluster_created + ON cluster_audit_events(cluster_id, created_at DESC); + +CREATE INDEX IF NOT EXISTS idx_cluster_audit_events_type_created + ON cluster_audit_events(event_type, created_at DESC); diff --git a/backend/migrations/000011_service_workload_supervision_contract.down.sql b/backend/migrations/000011_service_workload_supervision_contract.down.sql new file mode 100644 index 0000000..99fe7de --- /dev/null +++ b/backend/migrations/000011_service_workload_supervision_contract.down.sql @@ -0,0 +1,3 @@ +DROP TABLE IF EXISTS node_workload_latest_statuses; /* dropped before node_workload_status_reports, which it references */ +DROP TABLE IF EXISTS node_workload_status_reports; +DROP TABLE IF EXISTS node_workload_desired_states; diff --git a/backend/migrations/000011_service_workload_supervision_contract.up.sql
b/backend/migrations/000011_service_workload_supervision_contract.up.sql new file mode 100644 index 0000000..1111f6c --- /dev/null +++ b/backend/migrations/000011_service_workload_supervision_contract.up.sql @@ -0,0 +1,57 @@ +CREATE TABLE IF NOT EXISTS node_workload_desired_states ( /* one desired-state row per (cluster, node, service_type) */ + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + service_type TEXT NOT NULL, + desired_state TEXT NOT NULL DEFAULT 'disabled', + version TEXT, + runtime_mode TEXT NOT NULL DEFAULT 'container', + artifact_ref TEXT, + config JSONB NOT NULL DEFAULT '{}'::JSONB, + environment JSONB NOT NULL DEFAULT '{}'::JSONB, + updated_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (cluster_id, node_id, service_type), + CONSTRAINT node_workload_desired_states_desired_state_check + CHECK (desired_state IN ('enabled', 'disabled', 'drain')), /* same three values as node_services.desired_state in migration 000005 */ + CONSTRAINT node_workload_desired_states_runtime_mode_check + CHECK (runtime_mode IN ('native', 'container')) +); + +CREATE INDEX IF NOT EXISTS idx_node_workload_desired_states_cluster + ON node_workload_desired_states(cluster_id, service_type, desired_state); + +CREATE TABLE IF NOT EXISTS node_workload_status_reports ( /* one row per status report, each with its own generated id */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + service_type TEXT NOT NULL, + reported_state TEXT NOT NULL DEFAULT 'unknown', + runtime_mode TEXT NOT NULL DEFAULT 'container', + version TEXT, + status_payload JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT node_workload_status_reports_reported_state_check + CHECK (reported_state IN ('unknown', 'starting', 'running', 'degraded', 'stopped', 'failed', 'not_implemented')), /* adds not_implemented to the value set used by node_services.reported_state */ + CONSTRAINT node_workload_status_reports_runtime_mode_check + CHECK (runtime_mode IN
('native', 'container')) +); + +CREATE INDEX IF NOT EXISTS idx_node_workload_status_reports_node_observed + ON node_workload_status_reports(cluster_id, node_id, observed_at DESC); + +CREATE TABLE IF NOT EXISTS node_workload_latest_statuses ( /* one row per (cluster, node, service_type) repeating the report columns; status_report_id is nulled if the referenced report row is deleted */ + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + service_type TEXT NOT NULL, + status_report_id UUID REFERENCES node_workload_status_reports(id) ON DELETE SET NULL, + reported_state TEXT NOT NULL DEFAULT 'unknown', + runtime_mode TEXT NOT NULL DEFAULT 'container', + version TEXT, + status_payload JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (cluster_id, node_id, service_type), + CONSTRAINT node_workload_latest_statuses_reported_state_check + CHECK (reported_state IN ('unknown', 'starting', 'running', 'degraded', 'stopped', 'failed', 'not_implemented')), + CONSTRAINT node_workload_latest_statuses_runtime_mode_check + CHECK (runtime_mode IN ('native', 'container')) +); diff --git a/backend/migrations/000012_mesh_control_plane_preparation.down.sql b/backend/migrations/000012_mesh_control_plane_preparation.down.sql new file mode 100644 index 0000000..ac5a6b3 --- /dev/null +++ b/backend/migrations/000012_mesh_control_plane_preparation.down.sql @@ -0,0 +1,4 @@ +DROP TABLE IF EXISTS mesh_qos_policies; /* exact reverse of the creation order in the up migration; mesh_latest_links goes before the observations table it references */ +DROP TABLE IF EXISTS mesh_route_intents; +DROP TABLE IF EXISTS mesh_latest_links; +DROP TABLE IF EXISTS mesh_link_observations; diff --git a/backend/migrations/000012_mesh_control_plane_preparation.up.sql b/backend/migrations/000012_mesh_control_plane_preparation.up.sql new file mode 100644 index 0000000..55d4fb6 --- /dev/null +++ b/backend/migrations/000012_mesh_control_plane_preparation.up.sql @@ -0,0 +1,94 @@ +CREATE TABLE IF NOT EXISTS mesh_link_observations ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + source_node_id
UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + target_node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, /* NOTE(review): no constraint prevents source_node_id and target_node_id from being the same node; confirm whether self-links are meaningful */ + link_status TEXT NOT NULL DEFAULT 'unknown', + latency_ms INTEGER, + quality_score INTEGER, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT mesh_link_observations_status_check + CHECK (link_status IN ('unknown', 'reachable', 'degraded', 'unreachable')), + CONSTRAINT mesh_link_observations_quality_check + CHECK (quality_score IS NULL OR (quality_score >= 0 AND quality_score <= 100)) /* quality_score is optional; when present it must lie in the closed range 0 to 100 */ +); + +CREATE INDEX IF NOT EXISTS idx_mesh_link_observations_cluster_observed + ON mesh_link_observations(cluster_id, observed_at DESC); + +CREATE TABLE IF NOT EXISTS mesh_latest_links ( /* one row per directed (cluster, source, target) triple repeating the observation columns; observation_id is nulled if the referenced observation is deleted */ + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + source_node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + target_node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + observation_id UUID REFERENCES mesh_link_observations(id) ON DELETE SET NULL, + link_status TEXT NOT NULL DEFAULT 'unknown', + latency_ms INTEGER, + quality_score INTEGER, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (cluster_id, source_node_id, target_node_id), + CONSTRAINT mesh_latest_links_status_check + CHECK (link_status IN ('unknown', 'reachable', 'degraded', 'unreachable')), + CONSTRAINT mesh_latest_links_quality_check + CHECK (quality_score IS NULL OR (quality_score >= 0 AND quality_score <= 100)) +); + +CREATE TABLE IF NOT EXISTS mesh_route_intents ( /* source and destination are free-form JSONB selectors, not foreign keys to nodes */ + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + source_selector JSONB NOT NULL DEFAULT '{}'::JSONB, + destination_selector JSONB NOT NULL DEFAULT '{}'::JSONB, + service_class TEXT NOT NULL, + priority INTEGER NOT NULL DEFAULT 100, + status TEXT NOT NULL DEFAULT 'active', + policy JSONB NOT NULL DEFAULT '{}'::JSONB, +
created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT mesh_route_intents_service_class_check + CHECK (service_class IN ('input', 'control', 'render', 'clipboard', 'file_transfer', 'vpn_packets', 'telemetry')), + CONSTRAINT mesh_route_intents_status_check + CHECK (status IN ('active', 'disabled')) +); + +CREATE INDEX IF NOT EXISTS idx_mesh_route_intents_cluster_class + ON mesh_route_intents(cluster_id, service_class, status); + +CREATE TABLE IF NOT EXISTS mesh_qos_policies ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + service_class TEXT NOT NULL, + priority INTEGER NOT NULL, + reliability_mode TEXT NOT NULL, + drop_policy TEXT NOT NULL, + bandwidth_policy JSONB NOT NULL DEFAULT '{}'::JSONB, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (cluster_id, service_class), /* at most one QoS policy per service class per cluster; this is the conflict target of the seed insert below */ + CONSTRAINT mesh_qos_policies_service_class_check + CHECK (service_class IN ('input', 'control', 'render', 'clipboard', 'file_transfer', 'vpn_packets', 'telemetry')), /* same seven classes as mesh_route_intents */ + CONSTRAINT mesh_qos_policies_reliability_check + CHECK (reliability_mode IN ('reliable', 'droppable', 'adaptive')), + CONSTRAINT mesh_qos_policies_drop_policy_check + CHECK (drop_policy IN ('never', 'latest_only', 'adaptive')) +); + +INSERT INTO mesh_qos_policies ( + cluster_id, service_class, priority, reliability_mode, drop_policy, bandwidth_policy, metadata +) +SELECT c.id, defaults.service_class, defaults.priority, defaults.reliability_mode, + defaults.drop_policy, '{}'::jsonb, '{"default":true}'::jsonb +FROM clusters c +CROSS JOIN ( /* every existing cluster is paired with each of the seven default rows listed below */ + VALUES + ('input', 10, 'reliable', 'never'), + ('control', 20, 'reliable', 'never'), + ('clipboard', 40, 'reliable', 'never'), + ('render', 60, 'droppable', 'latest_only'), + ('file_transfer', 80,
'reliable', 'never'), + ('telemetry', 120, 'adaptive', 'adaptive'), + ('vpn_packets', 160, 'adaptive', 'adaptive') +) AS defaults(service_class, priority, reliability_mode, drop_policy) +ON CONFLICT (cluster_id, service_class) DO NOTHING; diff --git a/backend/migrations/000013_multi_cluster_partition_hardening.down.sql b/backend/migrations/000013_multi_cluster_partition_hardening.down.sql new file mode 100644 index 0000000..cf25fb8 --- /dev/null +++ b/backend/migrations/000013_multi_cluster_partition_hardening.down.sql @@ -0,0 +1,2 @@ +DROP VIEW IF EXISTS cluster_admin_summaries; +DROP TABLE IF EXISTS cluster_authority_states; diff --git a/backend/migrations/000013_multi_cluster_partition_hardening.up.sql b/backend/migrations/000013_multi_cluster_partition_hardening.up.sql new file mode 100644 index 0000000..7089ba0 --- /dev/null +++ b/backend/migrations/000013_multi_cluster_partition_hardening.up.sql @@ -0,0 +1,40 @@ +CREATE TABLE IF NOT EXISTS cluster_authority_states ( + cluster_id UUID PRIMARY KEY REFERENCES clusters(id) ON DELETE CASCADE, + authority_state TEXT NOT NULL DEFAULT 'authoritative', + mutation_mode TEXT NOT NULL DEFAULT 'normal', + term BIGINT NOT NULL DEFAULT 1, + notes TEXT, + updated_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT cluster_authority_states_authority_check + CHECK (authority_state IN ('authoritative', 'minority', 'isolated', 'recovery')), + CONSTRAINT cluster_authority_states_mutation_check + CHECK (mutation_mode IN ('normal', 'read_only', 'recovery_override')) +); + +INSERT INTO cluster_authority_states (cluster_id, authority_state, mutation_mode, term, notes) +SELECT id, 'authoritative', 'normal', 1, 'default authority state' +FROM clusters +ON CONFLICT (cluster_id) DO NOTHING; + +CREATE OR REPLACE VIEW cluster_admin_summaries AS +SELECT + c.id AS cluster_id, + c.slug, + c.name, + c.status, + c.region, + COALESCE(cas.authority_state, 'authoritative') AS 
authority_state, + COALESCE(cas.mutation_mode, 'normal') AS mutation_mode, + COUNT(DISTINCT cm.node_id) AS node_count, + COUNT(DISTINCT CASE WHEN n.health_status = 'healthy' THEN n.id END) AS healthy_node_count, + COUNT(DISTINCT CASE WHEN njr.status = 'pending' THEN njr.id END) AS pending_join_count, + COUNT(DISTINCT nra.id) AS active_role_assignment_count, + MAX(n.last_seen_at) AS last_node_seen_at +FROM clusters c +LEFT JOIN cluster_authority_states cas ON cas.cluster_id = c.id +LEFT JOIN cluster_memberships cm ON cm.cluster_id = c.id +LEFT JOIN nodes n ON n.id = cm.node_id +LEFT JOIN node_join_requests njr ON njr.cluster_id = c.id +LEFT JOIN node_role_assignments nra ON nra.cluster_id = c.id AND nra.status = 'active' +GROUP BY c.id, c.slug, c.name, c.status, c.region, cas.authority_state, cas.mutation_mode; diff --git a/backend/migrations/000014_vpn_ip_tunnel_control_plane.down.sql b/backend/migrations/000014_vpn_ip_tunnel_control_plane.down.sql new file mode 100644 index 0000000..5dc36e0 --- /dev/null +++ b/backend/migrations/000014_vpn_ip_tunnel_control_plane.down.sql @@ -0,0 +1,4 @@ +DROP TABLE IF EXISTS vpn_connection_leases; +DROP TABLE IF EXISTS vpn_connection_route_policies; +DROP TABLE IF EXISTS vpn_connection_allowed_nodes; +DROP TABLE IF EXISTS vpn_connections; diff --git a/backend/migrations/000014_vpn_ip_tunnel_control_plane.up.sql b/backend/migrations/000014_vpn_ip_tunnel_control_plane.up.sql new file mode 100644 index 0000000..ddb155f --- /dev/null +++ b/backend/migrations/000014_vpn_ip_tunnel_control_plane.up.sql @@ -0,0 +1,125 @@ +CREATE TABLE IF NOT EXISTS vpn_connections ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + organization_id UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE, + name TEXT NOT NULL, + target_endpoint JSONB NOT NULL DEFAULT '{}'::JSONB, + protocol_family TEXT NOT NULL DEFAULT 'generic', + credential_ref TEXT, + mode TEXT NOT NULL DEFAULT 
'single_active', + desired_state TEXT NOT NULL DEFAULT 'disabled', + allowed_node_policy JSONB NOT NULL DEFAULT '{"mode":"explicit","node_ids":[]}'::JSONB, + routing_usage JSONB NOT NULL DEFAULT '[]'::JSONB, + route_policy JSONB NOT NULL DEFAULT '{}'::JSONB, + qos_policy JSONB NOT NULL DEFAULT '{}'::JSONB, + placement_policy JSONB NOT NULL DEFAULT '{}'::JSONB, + status TEXT NOT NULL DEFAULT 'disabled', + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + updated_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT vpn_connections_mode_check + CHECK (mode IN ('single_active')), + CONSTRAINT vpn_connections_desired_state_check + CHECK (desired_state IN ('enabled', 'disabled')), + CONSTRAINT vpn_connections_status_check + CHECK (status IN ('disabled', 'enabled', 'connecting', 'active', 'degraded', 'failed')), + CONSTRAINT vpn_connections_protocol_family_check + CHECK (protocol_family IN ('generic', 'wireguard', 'ipsec', 'openvpn')) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_vpn_connections_cluster_org_name + ON vpn_connections(cluster_id, organization_id, lower(name)); + +CREATE INDEX IF NOT EXISTS idx_vpn_connections_cluster_org_state + ON vpn_connections(cluster_id, organization_id, desired_state, status); + +CREATE TABLE IF NOT EXISTS vpn_connection_allowed_nodes ( + vpn_connection_id UUID NOT NULL REFERENCES vpn_connections(id) ON DELETE CASCADE, + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + role_preference TEXT NOT NULL DEFAULT 'candidate', + status TEXT NOT NULL DEFAULT 'active', + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (vpn_connection_id, node_id), + CONSTRAINT 
vpn_connection_allowed_nodes_membership_fk + FOREIGN KEY (cluster_id, node_id) + REFERENCES cluster_memberships(cluster_id, node_id) + ON DELETE CASCADE, + CONSTRAINT vpn_connection_allowed_nodes_preference_check + CHECK (role_preference IN ('candidate', 'standby', 'preferred')), + CONSTRAINT vpn_connection_allowed_nodes_status_check + CHECK (status IN ('active', 'disabled')) +); + +CREATE INDEX IF NOT EXISTS idx_vpn_connection_allowed_nodes_cluster_node + ON vpn_connection_allowed_nodes(cluster_id, node_id, status); + +CREATE TABLE IF NOT EXISTS vpn_connection_route_policies ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + vpn_connection_id UUID NOT NULL REFERENCES vpn_connections(id) ON DELETE CASCADE, + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + organization_id UUID NOT NULL REFERENCES organizations(id) ON DELETE CASCADE, + route_type TEXT NOT NULL, + destination TEXT NOT NULL, + action TEXT NOT NULL DEFAULT 'allow', + service_type TEXT, + priority INTEGER NOT NULL DEFAULT 100, + policy JSONB NOT NULL DEFAULT '{}'::JSONB, + status TEXT NOT NULL DEFAULT 'active', + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT vpn_connection_route_policies_route_type_check + CHECK (route_type IN ('cidr', 'dns_suffix', 'service', 'resource')), + CONSTRAINT vpn_connection_route_policies_action_check + CHECK (action IN ('allow', 'deny')), + CONSTRAINT vpn_connection_route_policies_status_check + CHECK (status IN ('active', 'disabled')) +); + +CREATE INDEX IF NOT EXISTS idx_vpn_connection_route_policies_connection + ON vpn_connection_route_policies(vpn_connection_id, status, priority); + +CREATE INDEX IF NOT EXISTS idx_vpn_connection_route_policies_cluster_org + ON vpn_connection_route_policies(cluster_id, organization_id, route_type, status); + +CREATE TABLE IF NOT EXISTS vpn_connection_leases ( + id UUID PRIMARY KEY DEFAULT 
gen_random_uuid(), + vpn_connection_id UUID NOT NULL REFERENCES vpn_connections(id) ON DELETE CASCADE, + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + owner_node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + lease_generation BIGINT NOT NULL, + fencing_token TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + acquired_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + renewed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + expires_at TIMESTAMPTZ NOT NULL, + released_at TIMESTAMPTZ, + fenced_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + CONSTRAINT vpn_connection_leases_membership_fk + FOREIGN KEY (cluster_id, owner_node_id) + REFERENCES cluster_memberships(cluster_id, node_id) + ON DELETE CASCADE, + CONSTRAINT vpn_connection_leases_status_check + CHECK (status IN ('active', 'released', 'expired', 'fenced')), + CONSTRAINT vpn_connection_leases_expiry_check + CHECK (expires_at > acquired_at) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_vpn_connection_leases_generation + ON vpn_connection_leases(vpn_connection_id, lease_generation); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_vpn_connection_leases_single_active + ON vpn_connection_leases(vpn_connection_id) + WHERE status = 'active'; + +CREATE INDEX IF NOT EXISTS idx_vpn_connection_leases_owner + ON vpn_connection_leases(cluster_id, owner_node_id, status, expires_at); + +CREATE INDEX IF NOT EXISTS idx_vpn_connection_leases_connection_time + ON vpn_connection_leases(vpn_connection_id, acquired_at DESC); diff --git a/backend/migrations/000015_vpn_assignment_status_reports.down.sql b/backend/migrations/000015_vpn_assignment_status_reports.down.sql new file mode 100644 index 0000000..6398b7f --- /dev/null +++ b/backend/migrations/000015_vpn_assignment_status_reports.down.sql @@ -0,0 +1,2 @@ +DROP TABLE IF EXISTS vpn_connection_assignment_latest_statuses; +DROP TABLE IF EXISTS vpn_connection_assignment_status_reports; diff --git 
a/backend/migrations/000015_vpn_assignment_status_reports.up.sql b/backend/migrations/000015_vpn_assignment_status_reports.up.sql new file mode 100644 index 0000000..84db959 --- /dev/null +++ b/backend/migrations/000015_vpn_assignment_status_reports.up.sql @@ -0,0 +1,42 @@ +CREATE TABLE IF NOT EXISTS vpn_connection_assignment_status_reports ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + vpn_connection_id UUID NOT NULL REFERENCES vpn_connections(id) ON DELETE CASCADE, + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + observed_status TEXT NOT NULL DEFAULT 'unknown', + status_payload JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT vpn_connection_assignment_status_membership_fk + FOREIGN KEY (cluster_id, node_id) + REFERENCES cluster_memberships(cluster_id, node_id) + ON DELETE CASCADE, + CONSTRAINT vpn_connection_assignment_status_check + CHECK (observed_status IN ('not_started', 'assigned', 'lease_required', 'blocked', 'unknown')) +); + +CREATE INDEX IF NOT EXISTS idx_vpn_assignment_status_reports_node + ON vpn_connection_assignment_status_reports(cluster_id, node_id, observed_at DESC); + +CREATE INDEX IF NOT EXISTS idx_vpn_assignment_status_reports_connection + ON vpn_connection_assignment_status_reports(vpn_connection_id, observed_at DESC); + +CREATE TABLE IF NOT EXISTS vpn_connection_assignment_latest_statuses ( + vpn_connection_id UUID NOT NULL REFERENCES vpn_connections(id) ON DELETE CASCADE, + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + report_id UUID NOT NULL REFERENCES vpn_connection_assignment_status_reports(id) ON DELETE CASCADE, + observed_status TEXT NOT NULL, + status_payload JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY 
(vpn_connection_id, node_id), + CONSTRAINT vpn_connection_assignment_latest_membership_fk + FOREIGN KEY (cluster_id, node_id) + REFERENCES cluster_memberships(cluster_id, node_id) + ON DELETE CASCADE, + CONSTRAINT vpn_connection_assignment_latest_status_check + CHECK (observed_status IN ('not_started', 'assigned', 'lease_required', 'blocked', 'unknown')) +); + +CREATE INDEX IF NOT EXISTS idx_vpn_assignment_latest_node + ON vpn_connection_assignment_latest_statuses(cluster_id, node_id, updated_at DESC); diff --git a/backend/migrations/000016_fabric_testing_telemetry.down.sql b/backend/migrations/000016_fabric_testing_telemetry.down.sql new file mode 100644 index 0000000..6e46592 --- /dev/null +++ b/backend/migrations/000016_fabric_testing_telemetry.down.sql @@ -0,0 +1,5 @@ +DROP INDEX IF EXISTS idx_node_telemetry_cluster_node_observed; +DROP TABLE IF EXISTS node_telemetry_observations; + +DROP INDEX IF EXISTS idx_fabric_testing_flags_unique_scope; +DROP TABLE IF EXISTS fabric_testing_flags; diff --git a/backend/migrations/000016_fabric_testing_telemetry.up.sql b/backend/migrations/000016_fabric_testing_telemetry.up.sql new file mode 100644 index 0000000..88a26d3 --- /dev/null +++ b/backend/migrations/000016_fabric_testing_telemetry.up.sql @@ -0,0 +1,43 @@ +CREATE TABLE IF NOT EXISTS fabric_testing_flags ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + scope_type TEXT NOT NULL, + scope_id UUID, + cluster_id UUID REFERENCES clusters(id) ON DELETE CASCADE, + enabled BOOLEAN NOT NULL DEFAULT FALSE, + telemetry_enabled BOOLEAN NOT NULL DEFAULT FALSE, + synthetic_links_enabled BOOLEAN NOT NULL DEFAULT FALSE, + history_retention_hours INTEGER NOT NULL DEFAULT 24, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + updated_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT fabric_testing_flags_scope_check + CHECK (scope_type IN ('platform', 'organization', 'node')), + CONSTRAINT 
fabric_testing_flags_retention_check + CHECK (history_retention_hours BETWEEN 1 AND 720) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_fabric_testing_flags_unique_scope + ON fabric_testing_flags ( + scope_type, + COALESCE(scope_id, '00000000-0000-0000-0000-000000000000'::uuid), + COALESCE(cluster_id, '00000000-0000-0000-0000-000000000000'::uuid) + ); + +CREATE TABLE IF NOT EXISTS node_telemetry_observations ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + cpu_percent DOUBLE PRECISION, + memory_used_bytes BIGINT, + memory_total_bytes BIGINT, + disk_used_bytes BIGINT, + disk_total_bytes BIGINT, + network_rx_bytes BIGINT, + network_tx_bytes BIGINT, + process_count INTEGER, + payload JSONB NOT NULL DEFAULT '{}'::JSONB, + observed_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_node_telemetry_cluster_node_observed + ON node_telemetry_observations(cluster_id, node_id, observed_at DESC); diff --git a/backend/migrations/000017_cluster_node_groups.down.sql b/backend/migrations/000017_cluster_node_groups.down.sql new file mode 100644 index 0000000..49e0ab4 --- /dev/null +++ b/backend/migrations/000017_cluster_node_groups.down.sql @@ -0,0 +1,2 @@ +DROP TABLE IF EXISTS cluster_node_group_memberships; +DROP TABLE IF EXISTS cluster_node_groups; diff --git a/backend/migrations/000017_cluster_node_groups.up.sql b/backend/migrations/000017_cluster_node_groups.up.sql new file mode 100644 index 0000000..397820a --- /dev/null +++ b/backend/migrations/000017_cluster_node_groups.up.sql @@ -0,0 +1,40 @@ +CREATE TABLE IF NOT EXISTS cluster_node_groups ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + parent_group_id UUID REFERENCES cluster_node_groups(id) ON DELETE SET NULL, + name TEXT NOT NULL, + description TEXT, + sort_order INTEGER NOT NULL DEFAULT 
0, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT cluster_node_groups_name_check CHECK (length(trim(name)) > 0), + CONSTRAINT cluster_node_groups_not_self_parent CHECK (parent_group_id IS NULL OR parent_group_id <> id), + UNIQUE (cluster_id, id) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_cluster_node_groups_unique_sibling_name + ON cluster_node_groups(cluster_id, COALESCE(parent_group_id, '00000000-0000-0000-0000-000000000000'::uuid), lower(name)); + +CREATE INDEX IF NOT EXISTS idx_cluster_node_groups_parent + ON cluster_node_groups(cluster_id, parent_group_id, sort_order, name); + +CREATE TABLE IF NOT EXISTS cluster_node_group_memberships ( + cluster_id UUID NOT NULL, + node_id UUID NOT NULL, + group_id UUID NOT NULL, + assigned_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + assigned_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + PRIMARY KEY (cluster_id, node_id), + FOREIGN KEY (cluster_id, node_id) + REFERENCES cluster_memberships(cluster_id, node_id) + ON DELETE CASCADE, + FOREIGN KEY (cluster_id, group_id) + REFERENCES cluster_node_groups(cluster_id, id) + ON DELETE CASCADE +); + +CREATE INDEX IF NOT EXISTS idx_cluster_node_group_memberships_group + ON cluster_node_group_memberships(cluster_id, group_id); diff --git a/backend/migrations/000018_fabric_service_endpoints.down.sql b/backend/migrations/000018_fabric_service_endpoints.down.sql new file mode 100644 index 0000000..42e9651 --- /dev/null +++ b/backend/migrations/000018_fabric_service_endpoints.down.sql @@ -0,0 +1,4 @@ +DROP TABLE IF EXISTS fabric_egress_pool_nodes; +DROP TABLE IF EXISTS fabric_egress_pools; +DROP TABLE IF EXISTS fabric_entry_point_nodes; +DROP TABLE IF EXISTS fabric_entry_points; diff --git a/backend/migrations/000018_fabric_service_endpoints.up.sql 
b/backend/migrations/000018_fabric_service_endpoints.up.sql new file mode 100644 index 0000000..3c8c539 --- /dev/null +++ b/backend/migrations/000018_fabric_service_endpoints.up.sql @@ -0,0 +1,83 @@ +CREATE TABLE IF NOT EXISTS fabric_entry_points ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + endpoint_type TEXT NOT NULL DEFAULT 'client_access', + public_endpoint TEXT, + policy JSONB NOT NULL DEFAULT '{}'::JSONB, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT fabric_entry_points_status_check + CHECK (status IN ('active', 'disabled', 'maintenance')), + CONSTRAINT fabric_entry_points_type_check + CHECK (endpoint_type IN ('client_access', 'admin', 'api', 'other')) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_fabric_entry_points_cluster_name + ON fabric_entry_points(cluster_id, LOWER(name)); + +CREATE INDEX IF NOT EXISTS idx_fabric_entry_points_cluster_status + ON fabric_entry_points(cluster_id, status, endpoint_type); + +CREATE TABLE IF NOT EXISTS fabric_entry_point_nodes ( + entry_point_id UUID NOT NULL REFERENCES fabric_entry_points(id) ON DELETE CASCADE, + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + status TEXT NOT NULL DEFAULT 'active', + priority INTEGER NOT NULL DEFAULT 100, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + added_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + added_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (entry_point_id, node_id), + CONSTRAINT fabric_entry_point_nodes_status_check + CHECK (status IN ('active', 'disabled', 'maintenance')), + CONSTRAINT fabric_entry_point_nodes_priority_check + CHECK (priority >= 0) +); + 
+CREATE INDEX IF NOT EXISTS idx_fabric_entry_point_nodes_cluster_node + ON fabric_entry_point_nodes(cluster_id, node_id, status); + +CREATE TABLE IF NOT EXISTS fabric_egress_pools ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'active', + description TEXT, + route_scope JSONB NOT NULL DEFAULT '{}'::JSONB, + policy JSONB NOT NULL DEFAULT '{}'::JSONB, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT fabric_egress_pools_status_check + CHECK (status IN ('active', 'disabled', 'maintenance')) +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_fabric_egress_pools_cluster_name + ON fabric_egress_pools(cluster_id, LOWER(name)); + +CREATE INDEX IF NOT EXISTS idx_fabric_egress_pools_cluster_status + ON fabric_egress_pools(cluster_id, status); + +CREATE TABLE IF NOT EXISTS fabric_egress_pool_nodes ( + egress_pool_id UUID NOT NULL REFERENCES fabric_egress_pools(id) ON DELETE CASCADE, + cluster_id UUID NOT NULL REFERENCES clusters(id) ON DELETE CASCADE, + node_id UUID NOT NULL REFERENCES nodes(id) ON DELETE CASCADE, + status TEXT NOT NULL DEFAULT 'active', + priority INTEGER NOT NULL DEFAULT 100, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + added_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + added_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + PRIMARY KEY (egress_pool_id, node_id), + CONSTRAINT fabric_egress_pool_nodes_status_check + CHECK (status IN ('active', 'disabled', 'maintenance')), + CONSTRAINT fabric_egress_pool_nodes_priority_check + CHECK (priority >= 0) +); + +CREATE INDEX IF NOT EXISTS idx_fabric_egress_pool_nodes_cluster_node + ON fabric_egress_pool_nodes(cluster_id, node_id, status); diff --git 
a/backend/migrations/000019_mesh_latest_link_observation_key.down.sql b/backend/migrations/000019_mesh_latest_link_observation_key.down.sql new file mode 100644 index 0000000..d22e79d --- /dev/null +++ b/backend/migrations/000019_mesh_latest_link_observation_key.down.sql @@ -0,0 +1,23 @@ +WITH ranked AS ( + SELECT ctid, + ROW_NUMBER() OVER ( + PARTITION BY cluster_id, source_node_id, target_node_id + ORDER BY observed_at DESC + ) AS rn + FROM mesh_latest_links +) +DELETE FROM mesh_latest_links +USING ranked +WHERE mesh_latest_links.ctid = ranked.ctid + AND ranked.rn > 1; + +DROP INDEX IF EXISTS idx_mesh_latest_links_cluster_observed; + +ALTER TABLE mesh_latest_links + DROP CONSTRAINT IF EXISTS mesh_latest_links_pkey; + +ALTER TABLE mesh_latest_links + ADD PRIMARY KEY (cluster_id, source_node_id, target_node_id); + +ALTER TABLE mesh_latest_links + DROP COLUMN IF EXISTS observation_key; diff --git a/backend/migrations/000019_mesh_latest_link_observation_key.up.sql b/backend/migrations/000019_mesh_latest_link_observation_key.up.sql new file mode 100644 index 0000000..fa1e20a --- /dev/null +++ b/backend/migrations/000019_mesh_latest_link_observation_key.up.sql @@ -0,0 +1,11 @@ +ALTER TABLE mesh_latest_links + ADD COLUMN IF NOT EXISTS observation_key TEXT NOT NULL DEFAULT 'default'; + +ALTER TABLE mesh_latest_links + DROP CONSTRAINT IF EXISTS mesh_latest_links_pkey; + +ALTER TABLE mesh_latest_links + ADD PRIMARY KEY (cluster_id, source_node_id, target_node_id, observation_key); + +CREATE INDEX IF NOT EXISTS idx_mesh_latest_links_cluster_observed + ON mesh_latest_links(cluster_id, observed_at DESC); diff --git a/backend/migrations/000020_installation_authority.down.sql b/backend/migrations/000020_installation_authority.down.sql new file mode 100644 index 0000000..babdbd9 --- /dev/null +++ b/backend/migrations/000020_installation_authority.down.sql @@ -0,0 +1,4 @@ +DROP INDEX IF EXISTS idx_platform_role_grants_unique_install_role; +DROP INDEX IF EXISTS 
idx_platform_role_grants_user_active; +DROP TABLE IF EXISTS platform_role_grants; +DROP TABLE IF EXISTS installation_authority; diff --git a/backend/migrations/000020_installation_authority.up.sql b/backend/migrations/000020_installation_authority.up.sql new file mode 100644 index 0000000..416eb50 --- /dev/null +++ b/backend/migrations/000020_installation_authority.up.sql @@ -0,0 +1,39 @@ +CREATE TABLE IF NOT EXISTS installation_authority ( + id SMALLINT PRIMARY KEY DEFAULT 1, + install_id TEXT NOT NULL, + authority_state TEXT NOT NULL DEFAULT 'active', + product_root_key_fingerprint TEXT NOT NULL DEFAULT '', + activation_payload JSONB NOT NULL, + activation_signature TEXT NOT NULL, + bootstrapped_owner_email TEXT NOT NULL, + bootstrapped_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + CONSTRAINT installation_authority_singleton_check CHECK (id = 1), + CONSTRAINT installation_authority_state_check CHECK (authority_state IN ('active', 'recovery_required', 'locked')) +); + +CREATE TABLE IF NOT EXISTS platform_role_grants ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE, + role TEXT NOT NULL, + install_id TEXT NOT NULL, + grant_payload JSONB NOT NULL, + grant_signature TEXT NOT NULL, + grant_source TEXT NOT NULL DEFAULT 'installation_activation', + granted_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + expires_at TIMESTAMPTZ, + revoked_at TIMESTAMPTZ, + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + CONSTRAINT platform_role_grants_role_check + CHECK (role IN ('platform_admin', 'platform_recovery_admin')), + CONSTRAINT platform_role_grants_source_check + CHECK (grant_source IN ('installation_activation', 'recovery_manifest', 'dev_insecure')) +); + +CREATE INDEX IF NOT EXISTS idx_platform_role_grants_user_active + ON platform_role_grants(user_id, role, revoked_at, expires_at); + +CREATE UNIQUE INDEX IF NOT EXISTS 
idx_platform_role_grants_unique_install_role + ON platform_role_grants(user_id, role, install_id) + WHERE revoked_at IS NULL; diff --git a/backend/migrations/000021_cluster_authority_keys.down.sql b/backend/migrations/000021_cluster_authority_keys.down.sql new file mode 100644 index 0000000..1837731 --- /dev/null +++ b/backend/migrations/000021_cluster_authority_keys.down.sql @@ -0,0 +1,33 @@ +DROP VIEW IF EXISTS cluster_admin_summaries; + +CREATE VIEW cluster_admin_summaries AS +SELECT + c.id AS cluster_id, + c.slug, + c.name, + c.status, + c.region, + COALESCE(cas.authority_state, 'authoritative') AS authority_state, + COALESCE(cas.mutation_mode, 'normal') AS mutation_mode, + COUNT(DISTINCT cm.node_id) AS node_count, + COUNT(DISTINCT CASE WHEN n.health_status = 'healthy' THEN n.id END) AS healthy_node_count, + COUNT(DISTINCT CASE WHEN njr.status = 'pending' THEN njr.id END) AS pending_join_count, + COUNT(DISTINCT nra.id) AS active_role_assignment_count, + MAX(n.last_seen_at) AS last_node_seen_at +FROM clusters c +LEFT JOIN cluster_authority_states cas ON cas.cluster_id = c.id +LEFT JOIN cluster_memberships cm ON cm.cluster_id = c.id +LEFT JOIN nodes n ON n.id = cm.node_id +LEFT JOIN node_join_requests njr ON njr.cluster_id = c.id +LEFT JOIN node_role_assignments nra ON nra.cluster_id = c.id AND nra.status = 'active' +GROUP BY c.id, c.slug, c.name, c.status, c.region, cas.authority_state, cas.mutation_mode; + +ALTER TABLE node_join_requests + DROP COLUMN IF EXISTS approval_signature, + DROP COLUMN IF EXISTS approval_payload; + +ALTER TABLE node_join_tokens + DROP COLUMN IF EXISTS authority_signature, + DROP COLUMN IF EXISTS authority_payload; + +DROP TABLE IF EXISTS cluster_authorities; diff --git a/backend/migrations/000021_cluster_authority_keys.up.sql b/backend/migrations/000021_cluster_authority_keys.up.sql new file mode 100644 index 0000000..566e39a --- /dev/null +++ b/backend/migrations/000021_cluster_authority_keys.up.sql @@ -0,0 +1,54 @@ +CREATE TABLE IF 
NOT EXISTS cluster_authorities ( + cluster_id UUID PRIMARY KEY REFERENCES clusters(id) ON DELETE CASCADE, + authority_state TEXT NOT NULL DEFAULT 'active', + key_algorithm TEXT NOT NULL DEFAULT 'ed25519', + public_key TEXT NOT NULL, + public_key_fingerprint TEXT NOT NULL, + private_key TEXT NOT NULL, + created_by_user_id UUID REFERENCES users(id) ON DELETE SET NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + metadata JSONB NOT NULL DEFAULT '{}'::JSONB, + CONSTRAINT cluster_authorities_state_check + CHECK (authority_state IN ('active', 'rotating', 'revoked', 'recovery_required')), + CONSTRAINT cluster_authorities_algorithm_check + CHECK (key_algorithm = 'ed25519') +); + +CREATE UNIQUE INDEX IF NOT EXISTS idx_cluster_authorities_fingerprint + ON cluster_authorities(public_key_fingerprint); + +ALTER TABLE node_join_tokens + ADD COLUMN IF NOT EXISTS authority_payload JSONB NOT NULL DEFAULT '{}'::JSONB, + ADD COLUMN IF NOT EXISTS authority_signature JSONB NOT NULL DEFAULT '{}'::JSONB; + +ALTER TABLE node_join_requests + ADD COLUMN IF NOT EXISTS approval_payload JSONB NOT NULL DEFAULT '{}'::JSONB, + ADD COLUMN IF NOT EXISTS approval_signature JSONB NOT NULL DEFAULT '{}'::JSONB; + +DROP VIEW IF EXISTS cluster_admin_summaries; + +CREATE VIEW cluster_admin_summaries AS +SELECT + c.id AS cluster_id, + c.slug, + c.name, + c.status, + c.region, + COALESCE(cas.authority_state, 'authoritative') AS authority_state, + COALESCE(cas.mutation_mode, 'normal') AS mutation_mode, + ca.key_algorithm AS cluster_key_algorithm, + ca.public_key_fingerprint AS cluster_key_fingerprint, + COUNT(DISTINCT cm.node_id) AS node_count, + COUNT(DISTINCT CASE WHEN n.health_status = 'healthy' THEN n.id END) AS healthy_node_count, + COUNT(DISTINCT CASE WHEN njr.status = 'pending' THEN njr.id END) AS pending_join_count, + COUNT(DISTINCT nra.id) AS active_role_assignment_count, + MAX(n.last_seen_at) AS last_node_seen_at +FROM clusters c +LEFT JOIN 
cluster_authority_states cas ON cas.cluster_id = c.id +LEFT JOIN cluster_authorities ca ON ca.cluster_id = c.id +LEFT JOIN cluster_memberships cm ON cm.cluster_id = c.id +LEFT JOIN nodes n ON n.id = cm.node_id +LEFT JOIN node_join_requests njr ON njr.cluster_id = c.id +LEFT JOIN node_role_assignments nra ON nra.cluster_id = c.id AND nra.status = 'active' +GROUP BY c.id, c.slug, c.name, c.status, c.region, cas.authority_state, cas.mutation_mode, ca.key_algorithm, ca.public_key_fingerprint; diff --git a/backend/migrations/000022_synthetic_mesh_service_class.down.sql b/backend/migrations/000022_synthetic_mesh_service_class.down.sql new file mode 100644 index 0000000..9cae624 --- /dev/null +++ b/backend/migrations/000022_synthetic_mesh_service_class.down.sql @@ -0,0 +1,21 @@ +UPDATE mesh_route_intents +SET service_class = 'control', + updated_at = NOW() +WHERE service_class = 'synthetic'; + +DELETE FROM mesh_qos_policies +WHERE service_class = 'synthetic'; + +ALTER TABLE mesh_route_intents + DROP CONSTRAINT IF EXISTS mesh_route_intents_service_class_check; + +ALTER TABLE mesh_route_intents + ADD CONSTRAINT mesh_route_intents_service_class_check + CHECK (service_class IN ('input', 'control', 'render', 'clipboard', 'file_transfer', 'vpn_packets', 'telemetry')); + +ALTER TABLE mesh_qos_policies + DROP CONSTRAINT IF EXISTS mesh_qos_policies_service_class_check; + +ALTER TABLE mesh_qos_policies + ADD CONSTRAINT mesh_qos_policies_service_class_check + CHECK (service_class IN ('input', 'control', 'render', 'clipboard', 'file_transfer', 'vpn_packets', 'telemetry')); diff --git a/backend/migrations/000022_synthetic_mesh_service_class.up.sql b/backend/migrations/000022_synthetic_mesh_service_class.up.sql new file mode 100644 index 0000000..7f07410 --- /dev/null +++ b/backend/migrations/000022_synthetic_mesh_service_class.up.sql @@ -0,0 +1,21 @@ +ALTER TABLE mesh_route_intents + DROP CONSTRAINT IF EXISTS mesh_route_intents_service_class_check; + +ALTER TABLE mesh_route_intents + 
ADD CONSTRAINT mesh_route_intents_service_class_check + CHECK (service_class IN ('input', 'control', 'synthetic', 'render', 'clipboard', 'file_transfer', 'vpn_packets', 'telemetry')); + +ALTER TABLE mesh_qos_policies + DROP CONSTRAINT IF EXISTS mesh_qos_policies_service_class_check; + +ALTER TABLE mesh_qos_policies + ADD CONSTRAINT mesh_qos_policies_service_class_check + CHECK (service_class IN ('input', 'control', 'synthetic', 'render', 'clipboard', 'file_transfer', 'vpn_packets', 'telemetry')); + +INSERT INTO mesh_qos_policies ( + cluster_id, service_class, priority, reliability_mode, drop_policy, bandwidth_policy, metadata +) +SELECT c.id, 'synthetic', 30, 'reliable', 'never', '{}'::jsonb, + '{"default":true,"control_plane_only":true}'::jsonb +FROM clusters c +ON CONFLICT (cluster_id, service_class) DO NOTHING; diff --git a/backend/pkg/contracts/message/message.go b/backend/pkg/contracts/message/message.go new file mode 100644 index 0000000..bb124ad --- /dev/null +++ b/backend/pkg/contracts/message/message.go @@ -0,0 +1,9 @@ +package message + +type Message struct { + Code string `json:"code"` + MessageKey string `json:"message_key"` + FallbackMessage string `json:"fallback_message"` + Details map[string]any `json:"details,omitempty"` + TraceID string `json:"trace_id,omitempty"` +} diff --git a/backend/pkg/contracts/session/dataplane.go b/backend/pkg/contracts/session/dataplane.go new file mode 100644 index 0000000..3905aa2 --- /dev/null +++ b/backend/pkg/contracts/session/dataplane.go @@ -0,0 +1,33 @@ +package session + +import "time" + +const ( + DataPlaneCandidateDirectWorkerWSS = "direct_worker_wss" + DataPlaneCandidateBackendGateway = "backend_gateway" + + DataPlaneChannelControl = "control" + DataPlaneChannelInput = "input" + DataPlaneChannelRender = "render" + DataPlaneChannelClipboard = "clipboard" + DataPlaneChannelFileUpload = "file_upload" + DataPlaneChannelFileDownload = "file_download" + DataPlaneChannelTelemetry = "telemetry" +) + +type 
// DataPlaneCandidate describes one endpoint a client may use to reach the
// session data plane. Candidates are listed in DataPlaneOffer.Candidates.
type DataPlaneCandidate struct {
	// Type is the candidate kind; expected to be one of the
	// DataPlaneCandidate* constants declared in this file
	// ("direct_worker_wss" or "backend_gateway").
	Type string `json:"type"`
	// URL is the endpoint address of this candidate.
	URL string `json:"url"`
	// WorkerID, NodeID and ClusterID identify where the candidate lives; each
	// is omitted from JSON when empty.
	WorkerID  string `json:"worker_id,omitempty"`
	NodeID    string `json:"node_id,omitempty"`
	ClusterID string `json:"cluster_id,omitempty"`
	// Priority is omitted from JSON when zero.
	// NOTE(review): whether a lower or higher value wins is not visible here — confirm.
	Priority int `json:"priority,omitempty"`
	// Metadata carries free-form candidate attributes. The Windows client
	// README documents these keys as read from direct worker candidates:
	// runtime_transport, traffic_ready, production_trusted, tls_trust_mode,
	// render_transport, binary_render and supported_color_modes.
	Metadata map[string]any `json:"metadata,omitempty"`
}
// SessionDescriptor is the JSON contract describing a single session: which
// resource it targets, which worker hosts it, its lifecycle State, and the
// timing parameters that govern heartbeats and detach handling.
type SessionDescriptor struct {
	ID         string `json:"id"`
	ResourceID string `json:"resource_id"`
	WorkerID   string `json:"worker_id"`
	// State is one of the State* constants declared above
	// (starting, active, detached, reconnecting, terminated, failed).
	State State `json:"state"`
	// NOTE(review): presumably identifies the attachment/user currently in
	// control of the session — confirm against the session gateway.
	ControllerID string `json:"controller_id"`
	// HeartbeatTTL and DetachGraceTime are time.Duration values, which
	// encoding/json writes as integer nanoseconds; non-Go consumers must
	// convert accordingly.
	HeartbeatTTL    time.Duration `json:"heartbeat_ttl"`
	DetachGraceTime time.Duration `json:"detach_grace_time"`
}
// RoutedEnvelope is a typed message addressed to one attachment of one
// session.
// NOTE(review): the direction of travel (backend to worker, worker to backend,
// or both) is not visible in this file — confirm with the routing code.
type RoutedEnvelope struct {
	// SessionID and AttachmentID together identify the routing target.
	SessionID    string `json:"session_id"`
	AttachmentID string `json:"attachment_id"`
	// Type discriminates how Payload should be interpreted.
	Type string `json:"type"`
	// Payload has no omitempty tag, so a nil map is serialized as
	// "payload": null rather than being dropped.
	Payload map[string]any `json:"payload"`
}
settings persistence + +## Build assumptions + +- Current repository toolchain is `.NET SDK 10.0.101` +- The solution file is `RemoteAccessPlatform.Windows.slnx` +- WPF target framework is `net10.0-windows` + +## Build + +```powershell +dotnet build clients/windows/RemoteAccessPlatform.Windows.slnx +``` + +## Run + +```powershell +dotnet run --project clients/windows/src/RemoteAccessPlatform.Windows.App/RemoteAccessPlatform.Windows.App.csproj +``` + +For a live desktop smoke pass against an already running backend/worker: + +```powershell +pwsh -Sta -ExecutionPolicy Bypass -File scripts/windows-smoke/desktop-smoke.ps1 +``` + +For the worker-death/stale-lease verification variant against the current smoke host: + +```powershell +pwsh -Sta -ExecutionPolicy Bypass -File scripts/windows-smoke/desktop-smoke.ps1 ` + -VerifyWorkerDeath ` + -RemoteDockerHost 192.168.200.61 ` + -RemoteDockerUser test ` + -RemoteDockerPassword <password> +``` + +The same worker-death command also covers input-fidelity regression checks. It proves: + +- focused session-surface lifecycle in the real WPF window +- keyboard input forwarding through the gateway to the worker +- mouse input forwarding through the gateway to the worker +- attach, detach, reattach, takeover, and worker-death handling without breaking the proven session lifecycle + +## Backend endpoints + +Default endpoints are defined in [appsettings.json](src/RemoteAccessPlatform.Windows.App/appsettings.json).
+ +- API: `http://192.168.200.61:8080/api/v1` +- WebSocket gateway: `ws://192.168.200.61:8080/api/v1/gateway/ws` +- Direct data-plane preference: `prefer_direct_data_plane`, default `true` +- Client environment: `environment`, default `development`; use `production` + or `prod` for production trust behavior +- Direct data-plane connect timeout: `direct_data_plane_connect_timeout_ms`, default `750` +- Direct data-plane color mode: `direct_data_plane_color_mode`, default `full_color`; smoke can set `grayscale` when the worker candidate advertises it +- Direct data-plane platform CA bundle: + `direct_data_plane_platform_ca_bundle`, optional app-local PEM/DER CA bundle + used only for direct worker WSS candidates with `tls_trust_mode=platform_ca` +- Smoke-only direct TLS override: `allow_insecure_direct_data_plane_tls_for_smoke`, default `false` + +The project default test backend runs on Docker host `192.168.200.61` +through Docker context `test-ubuntu` / SSH alias `docker-test`. + +## MVP behavior + +- Login stores access and refresh tokens in DPAPI-protected local storage +- Refresh rotation is handled by the auth service and transport retry path +- Organization selection is restored per user from local settings +- Resource and session actions rely on backend business rules; the client does not reimplement org or session policies +- Session windows connect to the existing gateway using backend-issued attach tokens +- When a session response includes a direct worker WSS data-plane candidate, the client uses it only if the candidate metadata explicitly marks it data-capable (`runtime_transport=json_v1` or `traffic_ready=true`); otherwise it stays on the backend gateway with no user-visible delay +- In production client environment, direct worker WSS is used only when the + candidate is marked `production_trusted=true` or uses `tls_trust_mode` of + `public_ca`/`platform_ca`. Smoke-only or untrusted direct candidates are + skipped and backend gateway fallback is used. 
+- For `tls_trust_mode=platform_ca`, the client may validate the worker + certificate against `direct_data_plane_platform_ca_bundle` with normal + hostname/SAN validation still enforced. This override is scoped only to the + direct worker WSS connection and is never applied to backend API/gateway + traffic. +- `allow_insecure_direct_data_plane_tls_for_smoke=true` bypasses certificate + validation only in non-production and only for smoke-only direct candidates. +- When direct candidate metadata advertises `render_transport=binary_v1` or `binary_render=true`, the client requests binary direct render frames and feeds them into the same latest-frame presenter path +- When direct candidate metadata advertises `supported_color_modes`, the client requests `direct_data_plane_color_mode`; unsupported modes fall back to `full_color` +- Backend gateway fallback continues to use JSON/base64 render frames for debug/fallback compatibility +- Direct worker WSS attempts are bounded by a short timeout and automatically fall back to the backend gateway +- `session.taken_over` closes the current transport and updates the window status + +## Localization Foundation + +- User-facing UI strings now live in [Strings.resx](src/RemoteAccessPlatform.Windows.Application/Resources/Strings.resx). +- A placeholder [Strings.ru.resx](src/RemoteAccessPlatform.Windows.Application/Resources/Strings.ru.resx) exists only to keep the structure ready for future translations. +- XAML uses strongly named static accessors from [Strings.cs](src/RemoteAccessPlatform.Windows.Application/Localization/Strings.cs). +- View models resolve backend `message_key` values through the same localization helper and fall back to backend `fallback_message` if a resource key is not available.
+- The client should never display raw backend internal text directly as the primary UI contract. + +## Structured Messaging + +- Backend HTTP errors are parsed as structured envelopes instead of legacy raw strings. +- WebSocket envelopes may now include `event` alongside the existing `type` and `payload`. +- `BackendApiClient` preserves compatibility with legacy `"error": "..."` responses, but prefers the structured form when available. +- `SessionWindowViewModel` resolves websocket event messages by `message_key` first and only then falls back to `fallback_message`. +- Runtime smoke now proves both paths against the live backend: `message_key` resolution for invalid-login errors and fallback rendering for websocket state events whose keys are intentionally not present in client resources. + +## Developer Rules + +- Add every new user-visible client string to `Strings.resx` first. +- Keep English as the source language during development. +- When introducing a new backend user-facing error or websocket event, define a stable `code` and `message_key`. +- Do not bind UI directly to raw backend English messages when a structured `message_key` is available. + +## Known limitations + +- RDP rendering is usable but still has small redraw artifacts that must be + hardened in the next RDP visual correctness pass. +- Advanced graphics modes, codecs, tiles, GPU acceleration, fullscreen polish, + audio, printer, webcam, and multi-monitor are not implemented. +- Clipboard is text-only. +- File upload is accepted client-to-server. Server-to-client download has a + runtime-proven core data path and lifecycle blocking through the restricted + `RAP_Transfers\ToClient` drop-zone model; manual Windows-client UI proof is + still pending before full Stage 5.2 acceptance. +- Restricted drive visibility is accepted with the current + `rap-rdp-worker:rdp-p1-region-order2` baseline. +- Linux and mobile clients are not implemented. 
+- Mesh, node-agent updater runtime, VPN/IP tunnel runtime, and identity sync + runtime are not implemented here. + +## Desktop smoke status + +- proven against a live backend: login, organization switching, org-scoped resource refresh, and token refresh after forced short access-token expiry +- proven against the live localization-ready messaging model: invalid-login UI error rendering, structured websocket event rendering, fallback websocket state message rendering, takeover event messaging, and worker-death failure messaging +- proven backend/runtime bug fixed during smoke: auth device upsert now casts `trusted_at` and other timestamps explicitly in [backend/internal/modules/auth/postgres_store.go](../../backend/internal/modules/auth/postgres_store.go) +- proven client runtime bugs fixed during smoke: + - empty organization/resource/session views now initialize correctly via code-behind constructors + - dynamic status/session text controls no longer override automation-visible text with static automation names +- proven from the desktop client in Stage 2: focused session-surface lifecycle, keyboard forwarding, and mouse forwarding against the live backend/worker +- proven from the desktop client: `SessionWindow` creation, first render, DataContext binding, gateway connect, attach, detach, reattach, and takeover open path against the live backend +- proven via desktop smoke plus client diagnostics: takeover reuses the same session id and does not create a new desktop session shell for a different remote session +- proven backend fix for desktop failure semantics: [backend/internal/modules/sessiongateway/module.go](../../backend/internal/modules/sessiongateway/module.go) now distinguishes terminal session states from controller takeover instead of treating missing live binding as `session.taken_over` +- proven client hardening for gateway-close fallback: 
[SessionWindowViewModel.cs](src/RemoteAccessPlatform.Windows.Application/ViewModels/SessionWindowViewModel.cs) now refreshes authoritative session state after abrupt gateway closure +- proven end-to-end in the current smoke harness: worker-death desktop handling now reaches the failed session state in the real client UI +- current smoke harness for the remaining path is in [scripts/windows-smoke/desktop-smoke.ps1](../../scripts/windows-smoke/desktop-smoke.ps1) +## Clipboard Text MVP + +The session window exposes explicit text-only clipboard actions. The client +does not install global clipboard hooks and does not watch the OS clipboard in +the background. Clipboard text is read or written only when the user invokes +the session-window clipboard buttons inside an active session context. + +Blocked clipboard transfers are surfaced through localized session event-log +messages. The client treats backend `message_key` as the localization key and +falls back to the backend English `fallback_message` only when the key is not +present locally. + +The client never sends images, HTML, RTF, binary blobs, or file-like clipboard +payloads. Only Unicode text is read from or written to the local clipboard. 
/// <summary>
/// WPF application entry point. Initializes tracing, wires last-chance
/// exception logging, builds the <see cref="ClientRuntime"/> and shows the
/// main window; disposes the runtime again on exit.
/// </summary>
public partial class App : System.Windows.Application
{
    // Owned for the lifetime of the process; created in OnStartup.
    private ClientRuntime? _runtime;

    /// <summary>
    /// Boots the client: tracing first, then exception hooks, then the
    /// runtime and the main window.
    /// </summary>
    protected override void OnStartup(StartupEventArgs e)
    {
        base.OnStartup(e);

        ClientTrace.EnsureInitialized();
        ClientTrace.Write("App startup");

        RegisterUnhandledExceptionTracing();

        ClientRuntime runtime = ClientRuntime.Create();
        _runtime = runtime;
        ClientTrace.Write("ClientRuntime created");

        var shell = new MainWindow(runtime.MainViewModel);
        MainWindow = shell;
        ClientTrace.Write("MainWindow constructed");

        shell.Show();
        ClientTrace.Write($"MainWindow shown. Title={shell.Title}, IsVisible={shell.IsVisible}");
    }

    /// <summary>
    /// Logs the exit and releases the runtime before WPF completes shutdown.
    /// </summary>
    protected override void OnExit(ExitEventArgs e)
    {
        ClientTrace.Write("App exit");
        _runtime?.Dispose();
        base.OnExit(e);
    }

    /// <summary>
    /// Subscribes trace-only handlers to the dispatcher, AppDomain and task
    /// scheduler exception events. The handlers only log: they do not mark
    /// the exceptions as handled or observed.
    /// </summary>
    private void RegisterUnhandledExceptionTracing()
    {
        DispatcherUnhandledException += (_, dispatcherArgs) =>
            ClientTrace.Write($"DispatcherUnhandledException: {dispatcherArgs.Exception}");

        AppDomain.CurrentDomain.UnhandledException += (_, domainArgs) =>
            ClientTrace.Write($"AppDomainUnhandledException: {domainArgs.ExceptionObject}");

        TaskScheduler.UnobservedTaskException += (_, taskArgs) =>
            ClientTrace.Write($"TaskScheduler.UnobservedTaskException: {taskArgs.Exception}");
    }
}
public AuthenticationService AuthenticationService { get; } + public MainViewModel MainViewModel { get; } + + public static ClientRuntime Create() + { + BackendEndpointOptions endpointOptions = LoadEndpointOptions(); + + var httpClient = new HttpClient + { + Timeout = TimeSpan.FromSeconds(30) + }; + + var tokenStore = new DpapiTokenStore(); + var localSettingsStore = new JsonLocalSettingsStore(); + var deviceIdentityProvider = new LocalDeviceIdentityProvider(localSettingsStore); + + AuthenticationService? authenticationService = null; + var apiClient = new BackendApiClient( + httpClient, + endpointOptions, + cancellationToken => authenticationService!.GetAccessTokenAsync(cancellationToken), + cancellationToken => authenticationService!.TryRefreshAsync(cancellationToken)); + + authenticationService = new AuthenticationService(apiClient, tokenStore, deviceIdentityProvider); + var organizationContextService = new OrganizationContextService(apiClient, localSettingsStore); + var resourceCatalogService = new ResourceCatalogService(apiClient); + var sessionService = new SessionService(apiClient); + var sessionGatewayClient = new SessionGatewayClient(endpointOptions); + var mainViewModel = new MainViewModel( + authenticationService, + organizationContextService, + resourceCatalogService, + sessionService, + sessionGatewayClient); + + return new ClientRuntime(httpClient, authenticationService, mainViewModel); + } + + public void Dispose() + { + HttpClient.Dispose(); + } + + private static BackendEndpointOptions LoadEndpointOptions() + { + string path = Path.Combine(AppContext.BaseDirectory, "appsettings.json"); + if (!File.Exists(path)) + { + return new BackendEndpointOptions(); + } + + using FileStream stream = File.OpenRead(path); + using JsonDocument document = JsonDocument.Parse(stream); + if (!document.RootElement.TryGetProperty("backend", out JsonElement backend)) + { + return new BackendEndpointOptions(); + } + + return new BackendEndpointOptions + { + ApiBaseUrl = 
backend.TryGetProperty("api_base_url", out JsonElement apiBaseUrl) + ? apiBaseUrl.GetString() ?? "http://192.168.200.61:8080/api/v1" + : "http://192.168.200.61:8080/api/v1", + GatewayWebSocketUrl = backend.TryGetProperty("gateway_websocket_url", out JsonElement gatewayUrl) + ? gatewayUrl.GetString() ?? "ws://192.168.200.61:8080/api/v1/gateway/ws" + : "ws://192.168.200.61:8080/api/v1/gateway/ws", + PreferDirectDataPlane = !backend.TryGetProperty("prefer_direct_data_plane", out JsonElement preferDirect) || preferDirect.GetBoolean(), + DirectDataPlaneConnectTimeoutMs = backend.TryGetProperty("direct_data_plane_connect_timeout_ms", out JsonElement directTimeout) + ? Math.Max(100, directTimeout.GetInt32()) + : 750, + AllowInsecureDirectDataPlaneTlsForSmoke = backend.TryGetProperty("allow_insecure_direct_data_plane_tls_for_smoke", out JsonElement insecureDirectTls) && insecureDirectTls.GetBoolean(), + DirectDataPlaneColorMode = backend.TryGetProperty("direct_data_plane_color_mode", out JsonElement colorMode) + ? colorMode.GetString() ?? "full_color" + : "full_color", + DirectDataPlanePlatformCaBundle = backend.TryGetProperty("direct_data_plane_platform_ca_bundle", out JsonElement platformCaBundle) + ? platformCaBundle.GetString() + : null, + Environment = backend.TryGetProperty("environment", out JsonElement environment) + ? environment.GetString() ?? 
"development" + : "development" + }; + } +} diff --git a/clients/windows/src/RemoteAccessPlatform.Windows.App/Converters/InverseBooleanToVisibilityConverter.cs b/clients/windows/src/RemoteAccessPlatform.Windows.App/Converters/InverseBooleanToVisibilityConverter.cs new file mode 100644 index 0000000..2aafc71 --- /dev/null +++ b/clients/windows/src/RemoteAccessPlatform.Windows.App/Converters/InverseBooleanToVisibilityConverter.cs @@ -0,0 +1,19 @@ +using System.Globalization; +using System.Windows; +using System.Windows.Data; + +namespace RemoteAccessPlatform.Windows.App.Converters; + +public sealed class InverseBooleanToVisibilityConverter : IValueConverter +{ + public object Convert(object value, Type targetType, object parameter, CultureInfo culture) + { + bool visible = value is bool boolValue && !boolValue; + return visible ? Visibility.Visible : Visibility.Collapsed; + } + + public object ConvertBack(object value, Type targetType, object parameter, CultureInfo culture) + { + return value is Visibility visibility && visibility != Visibility.Visible; + } +} diff --git a/clients/windows/src/RemoteAccessPlatform.Windows.App/Diagnostics/ClientTrace.cs b/clients/windows/src/RemoteAccessPlatform.Windows.App/Diagnostics/ClientTrace.cs new file mode 100644 index 0000000..248f460 --- /dev/null +++ b/clients/windows/src/RemoteAccessPlatform.Windows.App/Diagnostics/ClientTrace.cs @@ -0,0 +1,56 @@ +using System.Diagnostics; +using System.IO; + +namespace RemoteAccessPlatform.Windows.App.Diagnostics; + +internal static class ClientTrace +{ + private static readonly object Sync = new(); + private static bool _initialized; + private static string? 
/// <summary>
/// Translates WPF keyboard, focus and mouse input into
/// <see cref="SessionInputEventDto"/> messages for the remote session.
/// Keyboard events carry hardware scan codes; pointer events carry
/// coordinates normalized to the 0..1 range of the session surface.
/// </summary>
internal static class SessionInputMapper
{
    // MAPVK_VK_TO_VSC_EX: virtual key -> scan code, with the 0xE0/0xE1 prefix
    // reported in the high byte of the low word for prefixed keys.
    private const uint MapVkToScEx = 4;

    private const uint ScanCodeMask = 0x00FF;
    private const uint ScanCodePrefixMask = 0xFF00;
    private const uint ExtendedPrefixE0 = 0xE000;
    private const uint ExtendedPrefixE1 = 0xE100;

    /// <summary>
    /// Builds a keyboard event for a WPF key transition.
    /// </summary>
    /// <param name="args">The WPF key event; <c>SystemKey</c> is used when <c>Key</c> is <c>Key.System</c>.</param>
    /// <param name="isKeyDown">True for key press, false for key release.</param>
    /// <returns>
    /// The event to forward, or null when the key cannot be expressed as a
    /// single scan code plus extended flag (no key, IME/dead-key processing,
    /// no virtual key, no scan code, or a 0xE1-prefixed key).
    /// </returns>
    public static SessionInputEventDto? CreateKeyboardEvent(KeyEventArgs args, bool isKeyDown)
    {
        Key key = args.Key == Key.System ? args.SystemKey : args.Key;
        if (key is Key.None or Key.ImeProcessed or Key.DeadCharProcessed)
        {
            return null;
        }

        int virtualKey = KeyInterop.VirtualKeyFromKey(key);
        if (virtualKey <= 0)
        {
            return null;
        }

        uint mapped = MapVirtualKey((uint)virtualKey, MapVkToScEx);
        if (mapped == 0)
        {
            return null;
        }

        uint prefix = mapped & ScanCodePrefixMask;
        if (prefix == ExtendedPrefixE1)
        {
            // The DTO can only express "plain" or "0xE0-extended". Dropping the
            // 0xE1 prefix would forward the low byte as a different key's scan
            // code, so such keys are not forwarded at all.
            return null;
        }

        return new SessionInputEventDto
        {
            Kind = "keyboard",
            Action = isKeyDown ? "key_down" : "key_up",
            ScanCode = mapped & ScanCodeMask,
            // Trust the prefix reported by the OS, and keep the table as a
            // fallback for virtual keys that are mapped without a prefix.
            IsExtended = prefix == ExtendedPrefixE0 || IsExtendedKey(virtualKey)
        };
    }

    /// <summary>Builds a focus-gained or focus-lost event for the session surface.</summary>
    public static SessionInputEventDto CreateFocusEvent(bool focused)
    {
        return new SessionInputEventDto
        {
            Kind = "focus",
            Action = focused ? "focus_in" : "focus_out",
            Focused = focused
        };
    }

    /// <summary>Builds a pointer-move event at <paramref name="position"/> within <paramref name="surfaceSize"/>.</summary>
    public static SessionInputEventDto CreateMouseMoveEvent(Point position, Size surfaceSize)
    {
        (double x, double y) = Normalize(position, surfaceSize);
        return new SessionInputEventDto
        {
            Kind = "mouse",
            Action = "move",
            NormalizedX = x,
            NormalizedY = y,
            SurfaceWidth = surfaceSize.Width,
            SurfaceHeight = surfaceSize.Height
        };
    }

    /// <summary>Builds a button press or release event for the named <paramref name="button"/>.</summary>
    public static SessionInputEventDto CreateMouseButtonEvent(string button, bool pressed, Point position, Size surfaceSize)
    {
        (double x, double y) = Normalize(position, surfaceSize);
        return new SessionInputEventDto
        {
            Kind = "mouse",
            Action = pressed ? "button_down" : "button_up",
            Button = button,
            NormalizedX = x,
            NormalizedY = y,
            SurfaceWidth = surfaceSize.Width,
            SurfaceHeight = surfaceSize.Height
        };
    }

    /// <summary>Builds a vertical or horizontal wheel event carrying the raw <paramref name="wheelDelta"/>.</summary>
    public static SessionInputEventDto CreateMouseWheelEvent(int wheelDelta, bool horizontal, Point position, Size surfaceSize)
    {
        (double x, double y) = Normalize(position, surfaceSize);
        return new SessionInputEventDto
        {
            Kind = "mouse",
            Action = "wheel",
            WheelDelta = wheelDelta,
            IsHorizontalWheel = horizontal,
            NormalizedX = x,
            NormalizedY = y,
            SurfaceWidth = surfaceSize.Width,
            SurfaceHeight = surfaceSize.Height
        };
    }

    /// <summary>
    /// Converts a surface-relative position to 0..1 coordinates. Each
    /// dimension is treated as at least 1 so a collapsed surface cannot cause
    /// a division by zero, and results are clamped to the surface bounds.
    /// </summary>
    private static (double X, double Y) Normalize(Point position, Size surfaceSize)
    {
        double width = Math.Max(1d, surfaceSize.Width);
        double height = Math.Max(1d, surfaceSize.Height);
        double x = Math.Clamp(position.X / width, 0d, 1d);
        double y = Math.Clamp(position.Y / height, 0d, 1d);
        return (x, y);
    }

    /// <summary>
    /// Fallback table of virtual keys that are sent with the extended flag
    /// even when the OS mapping reports no 0xE0 prefix for them.
    /// </summary>
    private static bool IsExtendedKey(int virtualKey)
    {
        return virtualKey switch
        {
            0x21 or // PAGE UP
            0x22 or // PAGE DOWN
            0x23 or // END
            0x24 or // HOME
            0x25 or // LEFT
            0x26 or // UP
            0x27 or // RIGHT
            0x28 or // DOWN
            0x2D or // INSERT
            0x2E or // DELETE
            0x5B or // LEFT WINDOWS
            0x5C or // RIGHT WINDOWS
            0x5D or // APPLICATIONS (context menu)
            0x6F or // NUMPAD DIVIDE
            0x90 or // NUMLOCK
            0xA3 or // RIGHT CTRL
            0xA5 => true, // RIGHT ALT
            _ => false
        };
    }

    [DllImport("user32.dll", EntryPoint = "MapVirtualKeyW", ExactSpelling = true)]
    private static extern uint MapVirtualKey(uint code, uint mapType);
}