Initial project snapshot

This commit is contained in:
2026-04-28 22:29:50 +03:00
commit 8ba0561f4f
365 changed files with 91832 additions and 0 deletions
+22
View File
@@ -0,0 +1,22 @@
# Development image: Debian 12.10 slim with a C/C++ toolchain plus the
# FreeRDP 2 / WinPR 2, Boost and OpenSSL development packages.
# NOTE(review): presumably the image used by the dev container, since WORKDIR
# matches its workspace folder (/workspaces/rdp-proxy) - confirm.
FROM debian:12.10-slim
# Keep apt/debconf from prompting for input during the image build.
ENV DEBIAN_FRONTEND=noninteractive
# Install everything in a single layer without recommended packages, then
# delete the apt package lists so they are not kept in that layer.
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
bash \
build-essential \
ca-certificates \
cmake \
curl \
gdb \
git \
freerdp2-dev \
libboost-dev \
libssl-dev \
libwinpr2-dev \
ninja-build \
pkg-config \
&& rm -rf /var/lib/apt/lists/*
# Default working directory for commands run inside the container.
WORKDIR /workspaces/rdp-proxy
+22
View File
@@ -0,0 +1,22 @@
{
  "name": "rdp-proxy-worker",
  "build": {
    "dockerfile": "Dockerfile",
    "context": ".."
  },
  "workspaceFolder": "/workspaces/rdp-proxy",
  "customizations": {
    "vscode": {
      "settings": {
        "cmake.sourceDirectory": "${workspaceFolder}/workers/rdp-worker",
        "cmake.buildDirectory": "${workspaceFolder}/workers/rdp-worker/build"
      },
      "extensions": [
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools"
      ]
    }
  },
  "postCreateCommand": "cd workers/rdp-worker && cmake --preset dev",
  "remoteUser": "root"
}
+28
View File
@@ -0,0 +1,28 @@
# CI workflow: compiles the Go backend and builds the RDP worker container image.
name: build
# Triggered by every push and every pull request event.
on:
push:
pull_request:
jobs:
# Downloads modules and compiles all Go packages under backend/ with a pinned Go version.
backend:
runs-on: ubuntu-24.04
defaults:
run:
working-directory: backend
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: '1.23.8'
- run: go mod download
- run: go build ./...
# Builds the worker image from workers/rdp-worker and checks its installed binaries.
worker:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Build worker image
run: docker build --tag rap-rdp-worker:ci --file workers/rdp-worker/Dockerfile workers/rdp-worker
# Overrides the image entrypoint with /bin/sh; the step fails unless all three
# files under /usr/local/bin exist and are executable inside the image.
- name: Verify worker binary path
run: docker run --rm --entrypoint /bin/sh rap-rdp-worker:ci -lc "test -x /usr/local/bin/rdp-worker && test -x /usr/local/bin/rdp-worker-dataplane-token-probe && test -x /usr/local/bin/rdp-worker-dataplane-bind-probe"
+64
View File
@@ -0,0 +1,64 @@
# OS
.DS_Store
Thumbs.db
# IDE
.idea/
.vscode/
*.swp
*.swo
# Logs
*.log
logs/
tmp/
temp/
# Env
# All local env files are ignored; the negation below keeps .env.example trackable.
.env
.env.*
!.env.example
# Go
# backend/bin/ is also covered by the unanchored bin/ rule in the .NET / WPF section.
backend/bin/
backend/.cache/
backend/vendor/
*.test
coverage.out
# C/C++
build/
cmake-build-*/
CMakeFiles/
CMakeCache.txt
compile_commands.json
*.o
*.obj
*.so
*.a
*.dll
*.exe
# .NET / WPF
# bin/ and obj/ have no leading slash, so they match directories at any depth.
bin/
obj/
*.user
*.suo
# Node / React
# build/ repeats the entry from the C/C++ section; the duplicate is redundant but harmless.
node_modules/
dist/
build/
.next/
coverage/
# Python (if scripts appear later)
__pycache__/
*.pyc
# Docker
*.local.yml
# Generated artifacts
# NOTE(review): the CODEX context document cites a runtime report stored under
# artifacts/; this rule keeps that directory out of version control - confirm intended.
artifacts/
out/
+618
View File
@@ -0,0 +1,618 @@
# CODEX CONTEXT
## Project identity
This project is a production-grade distributed secure access platform.
It started as a custom RDP proxy with persistent server-side sessions, but the final target architecture is broader:
- distributed secure access fabric
- multi-tenant platform
- session broker for GUI and future non-GUI protocols
- cluster mesh of nodes
- connector/VPN layer
- customer-managed and platform-managed nodes
- node-agent based self-update / rollback / health supervision
## Current proven foundation
The current codebase has already proven the riskiest low-level lifecycle assumptions for RDP:
- real FreeRDP connect works
- session state transition to active works
- terminate works
- detach works without killing the remote session
- reattach works without recreating the remote session
- takeover works without recreating the remote session
- per-resource certificate verification policy exists
- `certificate_verification_mode = strict | ignore`
- `strict` is default
- `ignore` works on a per-resource basis
- worker build is reproducible
- backend build is reproducible
This proven lifecycle must NOT be broken by future architecture work.
## Current architecture baseline
Current audit and baseline snapshot:
- `docs/audits/PROJECT_AUDIT_2026-04-26.md`
- `docs/audits/CURRENT_BASELINE_MATRIX.md`
### Test environment
- Canonical test Docker host: `192.168.200.61`
- Canonical Docker context: `test-ubuntu`
- Canonical SSH alias: `docker-test`
- Backend API for local/client smoke runs: `http://192.168.200.61:8080/api/v1`
- WebSocket gateway for local/client smoke runs: `ws://192.168.200.61:8080/api/v1/gateway/ws`
- Stage C17 planning is completed.
- C17A synthetic mesh runtime skeleton is implemented and test-proven in
`rap-node-agent` only. It is disabled by default and carries synthetic
`fabric.probe` / `fabric.probe_ack` messages only.
- C17B route health and failover probes are implemented and test-proven in
`rap-node-agent` only. They are disabled by default and carry synthetic
`fabric.route_health` / `fabric.route_health_ack` messages only.
- C17C relay semantic hardening is implemented and test-proven in
`rap-node-agent` only. It is disabled by default and models synthetic
per-channel queues/QoS/backpressure only.
- C17D non-production test-service path is implemented and test-proven in
`rap-node-agent` only. It is disabled by default and carries only bounded
`synthetic.echo` test payloads.
- C17E/C17F/C17G are implemented and proven for live synthetic HTTP transport,
scoped synthetic route config, and Control Plane scoped synthetic config
consumption.
- C17H deployed multi-agent synthetic config smoke is runtime-proven on
`docker-test`: five running `rap-node-agent` containers consume
backend-issued node-scoped synthetic config, direct and single-relay
synthetic route-health observations return to the Control Plane, and
production forwarding remains disabled.
- C17I production forwarding gate foundation is implemented and test-proven:
`rap-node-agent` has an explicit production-forwarding gate, while
`/mesh/v1/forward` still refuses production payload forwarding until a later
approved runtime stage.
- C17J production envelope contract is implemented and test-proven:
`/mesh/v1/forward` validates route-bound production envelopes for
`fabric_control` / `fabric.control` only when the gate is enabled, rejects
service channels, and still refuses production forwarding.
- C17K production envelope observation is implemented and test-proven:
valid accepted envelopes can be observed locally as metadata-only records
after validation; rejected envelopes are not observed, observation failure
fails closed, and production forwarding remains unavailable.
- C17L bounded production observation sink is implemented and test-proven:
accepted metadata-only observations can be retained locally with fixed
capacity, oldest-entry drop behavior, and no payload body storage.
- C17M production observation sink wiring is implemented and test-proven:
node-agent can wire the bounded local metadata-only sink when
`RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` is explicitly greater than
zero; the wiring is disabled by default and exposes no read API.
- C17N production observation sink metrics are implemented and test-proven:
local sink metrics expose only capacity, current depth, accepted total, and
dropped-oldest total; they expose no observation records or payload metadata.
- C17O production observation sink local metrics logging is implemented and
test-proven: node-agent logs aggregate sink metrics locally when the sink is
explicitly enabled; no read API or Control Plane reporting is added.
- C17P production observation sink change-driven metrics logging is implemented
and test-proven: node-agent suppresses repeated identical local sink metrics
logs; no read API or Control Plane reporting is added.
- C17Q production forwarding gate/runtime log boundary is implemented and
test-proven: node-agent logs production forwarding gate state separately from
production forwarding runtime state. Runtime state remained false until
C17Z introduced gate-controlled `fabric.control` direct forwarding.
- C17R production observation sink capacity guard is implemented and
test-proven: `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` is rejected
above `10000`.
- C17S production observation panic fail-closed hardening is implemented and
test-proven: observer errors and observer panics both fail closed as
observation failure.
- C17T production envelope payload boundary is implemented and test-proven:
validated production `fabric.control` envelope payloads are bounded to
`4096` bytes and oversized envelopes are rejected before observation.
- C17U production envelope created-at skew boundary is implemented and
test-proven: validated production `fabric.control` envelopes whose
`created_at` is more than one minute in the future are rejected before
observation.
- C17V peer endpoint candidate model is implemented and test-proven:
node-scoped synthetic mesh config now carries route-scoped endpoint
candidates with transport, address, reachability, NAT type, connectivity
mode, priority, policy tags, verification time, and metadata. This is a
model/config boundary only; no production route scoring, NAT traversal,
shortcut routing, or forwarding runtime is implemented.
- C17W peer endpoint candidate scoring model is implemented and test-proven:
`rap-node-agent` can rank already-scoped endpoint candidates using soft
inputs such as transport, reachability, connectivity mode, NAT type,
priority, region, policy tags, channel class, and verification age. This is
a scoring helper only; it does not open connections, choose production
routes, or forward payloads.
- C17X health-aware endpoint candidate scoring overlay is implemented and
test-proven: endpoint candidate scoring can optionally use local health
observations keyed by `endpoint_id`, including latency, success/failure
history, recent failure reason, reliability score, and observation freshness.
This remains advisory scoring only and is not wired into production route
execution.
- C17Y Platform Owner synthetic mesh visibility is implemented and
build/test-proven: `web-admin` reads node-scoped synthetic mesh config and
shows config enabled state, route counts, peer endpoints, endpoint
candidates, C17X advisory scoring boundary, and `production_forwarding`.
This remains platform-owner visibility only and does not enable production
forwarding.
- C17Z production fabric-control direct forwarding boundary is implemented and
test-proven: when `RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`,
`/mesh/v1/forward` can deliver valid route-bound `fabric.control` envelopes
at the local destination or forward them to a direct next hop from explicit
peer endpoint config. Service channels, arbitrary relay forwarding,
multi-hop production route execution, and RDP/VPN/file/video/service payloads
remain unavailable.
- C17Z1 production fabric-control multi-hop route-path boundary is implemented
and test-proven: production `fabric.control` envelopes can carry
`route_path` and `visited_node_ids`; relay nodes validate path position,
forward only to the next path node, update TTL/hop/visited metadata, and
reject loops. Service payloads remain unavailable.
- C17Z2 production fabric-control forwarding observability boundary is
implemented and test-proven: node-agent emits local
`mesh_production_forward_event` logs for accepted, forwarded, delivered, and
rejected production `fabric.control` envelopes. Logs are metadata-only and
include no payload bodies or read API.
- C17Z3 production fabric-control route-config boundary is implemented and
test-proven: when scoped/control-plane mesh routes are available locally,
production `fabric.control` envelopes must match configured route_id/path/
next-hop/channel/expiry/TTL/hop limits before forwarding.
- C17Z4 scoped peer directory and recovery seeds boundary is implemented and
test/build-proven: node-scoped mesh config carries scoped `peer_directory`
and explicit bounded `recovery_seeds`; node-agent parses/validates them and
web-admin shows counts.
- C17Z5 node-agent peer cache runtime boundary is implemented and test-proven:
node-agent builds a local `PeerCache`, selects bounded warm peers, probes warm
peers with `/mesh/v1/health`, and reports metadata-only mesh-link
observations when synthetic mesh testing is enabled.
- C17Z6 dynamic endpoint reporting boundary is implemented and test-proven:
node-agent reports explicit advertised mesh endpoint metadata in heartbeat,
and Control Plane projects latest reported endpoints/candidates into
node-scoped synthetic mesh config.
- C17Z7 private/corporate endpoint candidate boundary is implemented and
test-proven: node-agent reports multiple advertised endpoint candidates,
scoring rewards private/corporate same-site candidates, and peer cache can
use the best candidate address for warm health.
- C17Z8 peer connection state machine boundary is implemented and test-proven:
node-agent tracks warm-peer states `disconnected`, `connecting`, `ready`,
`degraded`, and `backoff`, with bounded backoff after repeated health probe
failures.
- C17Z9 peer recovery planner boundary is implemented and test-proven:
node-agent targets a bounded stable ready-peer set, enters recovery when
ready peers fall below target, and selects bounded recovery probes from warm
peers, recovery seeds, and other connectable scoped peers.
- C17Z10 peer connection intent planner boundary is implemented and
test-proven: node-agent classifies bounded peer work as maintain/probe/
recover and classifies transport readiness as direct/private_lan/
corporate_lan/outbound_only/relay_required, with rendezvous-required
metadata only.
- C17Z11 peer connection manager runtime boundary is implemented and
test-proven: node-agent uses a reusable HTTP keep-alive client for real
control-plane health probes of direct/private/corporate peers and records
`waiting_rendezvous` for outbound-only/relay-required peers.
- C17Z12 rendezvous/relay control-plane contract is implemented and
docker-test-runtime-proven: backend issues node-scoped `rendezvous_leases`,
node-agent resolves matching `waiting_rendezvous` intents into
`relay_control`, probes relay `/mesh/v1/health`, records and maintains
`relay_ready`, and keeps service payload forwarding disabled.
- C17Z13 rendezvous lease telemetry is implemented and
docker-test-runtime-proven: node-agent reports
`mesh_rendezvous_lease_report` with relay admission, peer admission,
TTL/renewal posture, `relay_ready`, and explicit no-payload boundary flags;
web-admin shows `rv leases` in recent heartbeat tables.
- C17Z14 rendezvous lease refresh contract is implemented and
docker-test-runtime-proven: node-agent refreshes renewal-needed/stale
rendezvous leases through node-scoped synthetic config reload, updates the
running peer cache/route/lease state, and reports refresh plus stale relay
withdrawal/reselection telemetry. Service payload forwarding remains
unavailable.
- C17Z15 backend relay replacement policy is implemented and
docker-test-runtime-proven: backend consumes recent stale-relay heartbeat
feedback, withdraws stale explicit rendezvous leases, scores alternate relay
candidates from route adjacency, endpoint priority, policy tags, and recent
mesh-link health, and returns replacement leases plus
`rendezvous_relay_policy` decisions in node-scoped synthetic config.
Node-agent reports `c17z15.mesh_rendezvous_lease_report.v1` and keeps stale
state scoped to the exact lease/relay, so replacement leases for the same
peer are not marked stale by association. Service payload forwarding remains
unavailable.
- C17Z16 route/path decision artifact is implemented and
docker-test-runtime-proven: backend `c17z16.synthetic.v1` config includes
`route_path_decisions` with original hops, effective hops, local previous/
next hop, selected replacement relay, generation, score reasons, and
no-payload boundary flags. Node-agent stores the control-plane route
generation and reports `c17z16.mesh_route_path_decision_report.v1` plus
`c17z16.mesh_rendezvous_lease_report.v1`. Service payload forwarding remains
unavailable.
- C17Z17 node-side route generation tracker is implemented and
docker-test-runtime-proven: backend `c17z17.synthetic.v1` config and
node-agent `mesh_route_generation_report` track active/applied/unchanged/
withdrawn route decisions, generation changes, total counters, and
`withdrawn_by_replacement` records for stale relay paths when replacement is
first observed. Service payload forwarding remains unavailable.
- C17Z18 synthetic route-health effective path runtime is implemented and
docker-test-runtime-proven: backend `c17z18.synthetic.v1` config and
node-agent `mesh_route_health_config_report` apply Control Plane
`route_path_decisions` to synthetic route-health route config only. The
synthetic runtime probes selected effective paths through replacement relays,
reports expected/observed hops and drift state, and backend latest mesh links
preserve route-health observations separately from connection-manager
observations. Service payload forwarding remains unavailable.
- C17Z19 synthetic route-health feedback scoring is implemented and
docker-test-runtime-proven: backend consumes recent `synthetic_route_health`
observations in relay scoring, uses drift/unreachable/failure metadata to
mark the exact selected relay stale, boosts healthy low-latency relay
candidates, and returns replacement leases/route decisions through the
existing synthetic config contract. Migration `000022` adds the `synthetic`
mesh service class. Service payload forwarding remains unavailable.
- C17Z20 node-side route-health feedback refresh is implemented and
docker-test-runtime-proven: after reporting synthetic route-health
drift/unreachable/failure, node-agent performs a bounded node-scoped
synthetic-config refresh, applies returned replacement route decisions to
route-health config immediately, and reports
`c17z20.mesh_route_health_feedback_refresh_report.v1`. Service payload
forwarding remains unavailable.
- Installation Authority foundation is implemented: production requires strict
Product Root public key config, first-owner bootstrap uses signed Ed25519
activation manifests, `installation_authority` and signed
`platform_role_grants` are persisted, and strict platform-admin checks ignore
direct `users.platform_role` database edits without a valid signed grant.
Web-admin exposes installation status/first-owner bootstrap, and
`scripts/installation/product-root-tool.go` generates keys/manifests for
offline product-root operations.
- Cluster Authority and node enrollment bootstrap are docker-test lifecycle
smoke-proven in run `dev-bootstrap-20260428-201430`: a fresh dev install
bootstrapped the first owner, created a cluster, issued a signed join token,
accepted real `rap-node-agent` enrollment, owner-approved the join request,
agent-polled signed bootstrap, persisted cluster authority pin, heartbeated,
and verified signed `c17z18.synthetic.v1` Control Plane config. Production
service payload forwarding remains unavailable.
- Migration `000021_cluster_authority_keys` drops/recreates
`cluster_admin_summaries` because fresh replay proved PostgreSQL cannot
change that view layout via `CREATE OR REPLACE VIEW`.
- `rap-node-agent` desired-workload polling/status reporting is gated by
`RAP_WORKLOAD_SUPERVISION_ENABLED`, which defaults to `false` while service
runtime supervision remains a stub.
- C18 VPN/IP tunnel service target design is completed as documentation only.
- C18A VPN/IP tunnel control-plane data model foundation is implemented and
backend-test-proven.
- C18B VPN/IP tunnel lease/fencing hardening is implemented and
backend-test-proven.
- C18C VPN/IP tunnel node-agent desired-state consumption/reporting is
implemented and backend-test-proven.
- No next platform-core implementation step is automatically authorized after
C17Z20. The next mesh layer should stay limited to route-health feedback
refresh dampening/no-change cooldown unless the user explicitly chooses
another staged task.
- Latest RDP performance reference image:
`rap-rdp-worker:rdp-perf6-dirty-region`
- Stage 5.2 file-download runtime artifacts remain preserved for when RDP work
resumes, but they are not the active next task.
- Do not use `docker.cin.su` for this project unless explicitly requested for a separate one-off check.
### Backend
- Go
- PostgreSQL = source of truth
- Redis = live coordination / routing only
- REST for control plane
- WebSocket for live session channel
### Worker
- C++ worker
- FreeRDP integration
- worker runtime hides FreeRDP details from backend
- The C++ worker remains the primary RDP runtime.
- Target RDP performance direction: `docs/architecture/RDP_SERVICE_CPP_PERFORMANCE_TARGET.md`.
- The RDP performance rewrite scope is limited to C++ RDP service adapter
internals. It must not redesign backend control plane, cluster transport,
organizations, leases, or session lifecycle.
- The C# RDP service skeleton is inactive research scaffolding and is not the
current runtime direction.
- Current RDP Adapter baseline: RDP-Perf-6 dirty-region direct binary rendering
is completed and smoke-proven on `docker-test`. RDP work is paused by product
decision; next active work is Fabric Core / cluster foundation.
- P3/P3.1 security-readiness foundation exists: production mode rejects
plaintext credential-like resource metadata, requires `secret_ref` for
RDP/VNC/SSH resources, and has an encrypted PostgreSQL-backed resource secret
storage/resolver MVP. P3.2 direct-worker TLS/PKI guard exists.
- P3.3 production-like test-stand smoke is complete on `docker-test`: backend
runs in `APP_ENV=production` with a test-only secret key file, a secret-backed
RDP resource starts real sessions through the resolver path, metadata/audit do
not contain plaintext credentials, and backend gateway fallback remains
available when direct worker WSS trust is `smoke_insecure`.
- P3.4 production direct-worker WSS trust model is documented in
`docs/architecture/PRODUCTION_DIRECT_WORKER_WSS_TRUST.md`; it defines
platform CA/public CA behavior, worker certificate SAN/identity requirements,
app-local Windows trust direction, rotation/revocation, and the future
`platform_ca` smoke plan. No RDP runtime behavior changed in P3.4.
- P3.5 app-local platform CA trust is implemented and runtime-proven on
`docker-test`: Windows client validates direct worker WSS with an app-local
platform CA bundle, keeps hostname/SAN validation enabled, selects
`direct_worker_wss` without insecure TLS bypass, and falls back to backend
gateway for unknown CA / smoke-only production cases.
- P3.6 stale Redis worker/live event idempotency is implemented and
runtime-proven: stale worker events for terminal PostgreSQL sessions are
ignored, backend restart survives stale Redis events, and terminal sessions
are not reopened.
- Stage 5.2 server-to-client file download core data path is runtime-proven:
direct worker WSS and backend gateway fallback both download text/binary
files from `RAP_Transfers\ToClient` with matching size/hash, and direct
policy blocking is proven for `disabled` and `client_to_server`. Lifecycle
blocking is also runtime-proven for detach, old-client takeover, and worker
failure. Runtime report:
`artifacts/stage5-2-file-download-runtime-report.md`.
- Stage 5.2 is not fully accepted yet. Remaining proof: Windows desktop UI
download path and regression matrix for rendering/input/clipboard/upload/
reconnect/takeover.
### Clients
- future native clients:
- Windows: native desktop client first
- Linux: native desktop client later
- web UI is admin/control plane, not the primary power-user client
## Final architecture direction
The long-term target architecture is documented in:
- `docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`
- `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md`
- `docs/architecture/WEB_INGRESS_AND_ADMIN_UI_MODEL.md`
`SECURE_ACCESS_FABRIC_TARGET.md` defines the target Secure Access Fabric architecture only. It is not the current implementation scope and must not be used as permission to start mesh, VPN, multi-cluster, updater, or realtime data-plane migration work without an explicit staged prompt.
`CLUSTER_NODE_ADMIN_FOUNDATION.md` defines the next platform-core planning
baseline for clusters, node enrollment, native node-agent identity, platform
admin console, multi-cluster administration, and future organization admin
visibility. It is a staged foundation document, not permission to implement
mesh packet routing or VPN runtime.
`WEB_INGRESS_AND_ADMIN_UI_MODEL.md` defines WEB as HTTP/HTTPS ingress and
Admin UI presentation only. Cluster configuration remains Control Plane
ownership through scoped APIs, PostgreSQL source-of-truth mutations, and audit.
Dynamic pages must be safe schema-driven projections and must not embed
internal topology, peer caches, route caches, secrets, raw credentials, or
arbitrary executable code.
Admin endpoint placement is explicit. Fabric Storage / Config Storage nodes do
not automatically host or move the cluster panel. Platform Owner Console
remains global platform-owner scope. Cluster Admin Endpoint requires explicit
admin/web ingress role assignment, cluster health/trust readiness, and Control
Plane authorization. Organization Admin Panel remains a tenant-safe projection.
The final platform must support:
1. Multi-tenancy / Organizations
- platform has many organizations
- each organization has isolated users, groups, resources, policies, audit, connectors
- users may belong to multiple organizations
- organization admins only see their organization
- platform admins see platform scope
2. Identity federation
- local users
- LDAP / Active Directory
- OIDC
- future extensibility for more identity sources
- access mappings based on external groups / claims
3. Cluster of nodes
- no mandatory single central node
- many nodes across many sites
- nodes can be platform-managed or customer-managed
- customer-managed nodes are sandboxed cluster participants, not full cluster owners
4. Node agent
- small stable always-running agent on every node
- supervises services
- downloads updates
- verifies signed artifacts
- can rollback to previous version
- can restart crashed services
- can work on thin or thick nodes
5. Service-based node model
A node is not monolithic.
A node has:
- capabilities: what it can do physically/technically
- enabled services: what it is allowed/assigned to do
Possible services include:
- ingress-gateway
- mesh-router
- relay
- connector-host
- vpn-adapter
- session-worker
- media-relay
- file-relay
- update-cache
- config-replica
- audit-sink
- metrics-exporter
6. Cluster mesh and routing
- encrypted inter-node communication
- dynamic topology
- no need for full mesh
- multi-hop routing allowed
- route failover
- client failover between ingress nodes
- connector failover between nodes
7. Split-brain prevention
- quorum-based cluster behavior
- minority partition must not become a second authoritative cluster
- degraded / recovery / isolated modes
- manual recovery / promotion decision by the platform recovery admin
8. Connector / VPN layer
- connectors are reusable network access methods
- one connector may be used by multiple resources
- connector placement and failover are controlled by policy
- nodes may be allowed or disallowed to host connectors
- direct access, VPN, relay and future egress modes must fit this model
9. Future exit mode
- split tunnel
- full tunnel
- internet access through cluster
- not first implementation priority
## Non-negotiable design rules
- Do not rewrite proven session lifecycle carelessly.
- Do not turn Redis into a source of truth.
- Do not make certificate-ignore a global worker setting.
- Do not make customer-managed nodes platform-wide trusted by default.
- Do not create a separate cluster per organization.
- Do not assume a single permanently reachable central node.
- Do not rely on “secret protocol with no docs” as security.
- Security must come from crypto, auth, isolation, policy and observability.
- Prefer incremental evolution from current proven system.
- Do not collapse platform control plane and data plane into one vague layer.
## Implementation strategy
The codebase must evolve in phases.
Current implementation focus remains:
- RDP work is paused by product decision
- preserve the accepted RDP Adapter baseline and Stage 5.x file-transfer work
- do not delete or rewrite the current RDP MVP while platform-core work starts
- C1-C9 platform-core foundations are implemented and verified: clusters,
node enrollment, node-agent scaffold, platform admin console, workload
supervision contract, mesh control-plane prep, mesh skeleton, multi-cluster
hardening, and organization admin foundation
- C10 Fabric Core configuration distribution design is completed
- C11 signed scoped cluster snapshot model is completed
- C12 node local state store is completed
- C13 Fabric Storage / Config Storage service foundation is completed
- C14 peer directory and cache model is completed
- C15 Fabric Routing Engine skeleton is completed
- C16 secure node-to-node channel lifecycle is completed
- C17 mesh routing runtime implementation plan is completed
- C17A synthetic mesh runtime skeleton is implemented and test-proven with
synthetic fabric messages only, no RDP/VPN/production service traffic
- C17B route health and failover probes are implemented and test-proven with
synthetic traffic only, no RDP/VPN/production service traffic
- C17C relay semantic hardening is implemented and test-proven with synthetic
channel classes only, no RDP/VPN/production service traffic
- C17D non-production test-service path is implemented and test-proven with
bounded `synthetic.echo` traffic only, no RDP/VPN/production service traffic
- C17E live node-to-node synthetic HTTP transport is implemented and
smoke-proven with synthetic traffic only
- C17F scoped synthetic route config loading and route-health reporting is
implemented and smoke-proven with synthetic traffic only
- C17G Control Plane scoped synthetic config read/consume is implemented and
test-proven with synthetic traffic only
- C17H deployed multi-agent synthetic config smoke is implemented and
runtime-proven on `docker-test` with synthetic traffic only
- C17I production forwarding gate foundation is implemented and test-proven;
production forwarding remains unavailable
- C17J production envelope contract validation is implemented and test-proven;
production forwarding remains unavailable
- C17K production envelope observation is implemented and test-proven;
production forwarding remains unavailable
- C17L bounded production observation sink is implemented and test-proven;
production forwarding remains unavailable
- C17M production observation sink wiring is implemented and test-proven;
production forwarding remains unavailable
- C17N production observation sink metrics are implemented and test-proven;
production forwarding remains unavailable
- C17O production observation sink local metrics logging is implemented and
test-proven; production forwarding remains unavailable
- C17P production observation sink change-driven metrics logging is implemented
and test-proven; production forwarding remains unavailable
- C17Q production forwarding gate/runtime log boundary is implemented and
test-proven; production forwarding remains unavailable
- C17R production observation sink capacity guard is implemented and
test-proven; production forwarding remains unavailable
- C17S production observation panic fail-closed hardening is implemented and
test-proven; production forwarding remains unavailable
- C17T production envelope payload boundary is implemented and test-proven;
production forwarding remains unavailable
- C17U production envelope created-at skew boundary is implemented and
test-proven; production forwarding remains unavailable
- C17V peer endpoint candidate model and NAT/connectivity hints are
implemented and test-proven; production forwarding remains unavailable
- C17W peer endpoint candidate scoring model is implemented and test-proven;
production forwarding remains unavailable
- C17X health-aware endpoint candidate scoring overlay is implemented and
test-proven; production forwarding remains unavailable
- C17Y Platform Owner synthetic mesh visibility is implemented and
build/test-proven; production forwarding remains unavailable
- C17Z production fabric-control direct forwarding is implemented and
test-proven; production service traffic remains unavailable
- C17Z1 production fabric-control multi-hop route-path forwarding is
implemented and test-proven; production service traffic remains unavailable
- C17Z2 production fabric-control forwarding observability is implemented and
test-proven; production service traffic remains unavailable
- C17Z3 production fabric-control route-config boundary is implemented and
test-proven; production service traffic remains unavailable
- C17Z4 scoped peer directory/recovery seed boundary is implemented and
test/build-proven; production service traffic remains unavailable
- C17Z5 node-agent peer cache runtime boundary is implemented and test-proven;
production service traffic remains unavailable
- C17Z6 dynamic endpoint reporting boundary is implemented and test-proven;
production service traffic remains unavailable
- C17Z7 private/corporate endpoint candidate boundary is implemented and
test-proven; production service traffic remains unavailable
- C17Z8 peer connection state machine boundary is implemented and test-proven;
production service traffic remains unavailable
- C17Z9 peer recovery planner boundary is implemented and test-proven;
production service traffic remains unavailable
- C17Z10 peer connection intent planner boundary is implemented and
test-proven; production service traffic remains unavailable
- C17Z11 peer connection manager runtime boundary is implemented and
test-proven; production service traffic remains unavailable
- C17Z12 rendezvous/relay control-plane contract is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z13 rendezvous lease telemetry is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z14 rendezvous lease refresh contract is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z15 backend relay replacement policy is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z16 route/path decision artifact is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z17 node-side route generation tracker is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z18 synthetic route-health effective path runtime is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z19 synthetic route-health feedback scoring is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- C17Z20 node-side route-health feedback refresh is implemented and
docker-test-runtime-proven; production service traffic remains unavailable
- Cluster Authority plus node enrollment bootstrap polling are docker-test
lifecycle-smoke-proven; fresh install migration replay is fixed for
`cluster_admin_summaries`
- C18 VPN/IP tunnel service target design is completed as documentation only
- C18A VPN/IP tunnel control-plane data model foundation is implemented and
backend-test-proven
- C18B VPN/IP tunnel lease/fencing hardening is implemented and
backend-test-proven
- C18C VPN/IP tunnel node-agent desired-state consumption/reporting is
implemented and backend-test-proven
- Version Storage / Update Repository is documented as a future Fabric Core
service for signed release manifests, OS/arch artifacts,
stable/current/candidate channels, update-cache mirroring, node-agent
update supervision, rollback, and explicit data-structure migration bundles.
Runtime updater behavior is not implemented.
- no next platform-core implementation step is automatically authorized after
C17Z20; choose the next narrow staged prompt explicitly before continuing
- preserve the proven RDP lifecycle behavior
- keep the current backend gateway available as the active/fallback implementation path
The current phase is NOT:
- full mesh routing implementation
- full VPN orchestration
- multi-cluster runtime traffic handling
- production data-plane migration
- updater runtime
- video meetings
- final native client UI redesign
Future mesh, VPN, multi-cluster, node-agent updater, and production realtime data-plane work must be introduced only through explicit, narrow, staged implementation prompts.
Always keep the project production-oriented. Do not simplify it into a toy app.
+119
View File
@@ -0,0 +1,119 @@
# Remote Access Platform
Production-oriented Secure Access Fabric platform.
The project started as an RDP proxy, but the target architecture is broader:
- multi-tenant control plane
- direct realtime data plane
- service adapters for RDP now and VNC/SSH/VPN/file/video later
- native Access Clients
- future secure mesh / node-agent / updater / connector model
RDP is the first proven service baseline. RDP work is currently paused by
product decision while the project moves to the Secure Access Fabric
platform-core foundation: clusters, node enrollment, node-agent identity, role
assignments, and platform administration.
## Current Baseline
- Backend: Go control plane with PostgreSQL as source of truth and Redis for
live coordination/routing only.
- Worker: active C++ RDP Adapter worker based on FreeRDP.
- Windows client: C# / WPF native Access Client.
- Data plane: direct worker WSS for realtime RDP when available, backend
WebSocket gateway retained as fallback/debug.
- Current test Docker host: `docker-test` / `192.168.200.61`.
- Current test Docker deployment for Stage 5.2 proof:
`rap-backend-smoke:stage5-2-download` and
`rap-rdp-worker:stage5-2-download`.
See the current audit and baseline matrix before starting new work:
- `docs/audits/PROJECT_AUDIT_2026-04-26.md`
- `docs/audits/CURRENT_BASELINE_MATRIX.md`
## Proven RDP Capabilities
- real RDP connect through worker
- active/detach/reattach/takeover/terminate lifecycle
- takeover without remote session recreation
- worker-death/orphan-active-session recovery
- direct worker WSS data plane
- binary direct render frames
- backend gateway JSON/base64 fallback
- Windows client rendering and input
- text clipboard with policy enforcement
- client-to-server file upload into controlled worker storage
- restricted transfer-drive visibility through `RAP_Transfers`, runtime-proven
- server-to-client file download implementation through
`RAP_Transfers\ToClient`; core direct/fallback data path is runtime-proven,
lifecycle blocking is runtime-proven, and manual desktop UI acceptance remains
pending
## Repository Structure
- `backend/` - Go control plane
- `workers/rdp-worker/` - active C++ RDP Adapter worker
- `workers/rdp-service-csharp/` - inactive research scaffold, not current runtime
- `clients/windows/` - Windows native Access Client
- `docs/architecture/` - target and staged architecture documents
- `docs/codex/` - current Codex status and next-step prompts
- `docs/audits/` - current audits and baseline matrices
- `scripts/` - smoke and helper scripts
- `deploy/` - deployment assets
- `web-admin/` - future/admin UI area
## Read Order
1. `CODEX_CONTEXT.md`
2. `docs/audits/PROJECT_AUDIT_2026-04-26.md`
3. `docs/audits/CURRENT_BASELINE_MATRIX.md`
4. `docs/codex/CURRENT_STATUS.md`
5. `docs/codex/ARCHITECTURE_GUARDRAILS.md`
6. `docs/architecture/RDP_ADAPTER_RUNTIME.md`
7. `docs/architecture/DATA_PLANE_V1.md`
8. `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md`
9. `docs/codex/NEXT_STEP_PROMPT.md`
Do not use `docs/_legacy_v1` for implementation decisions. Legacy files are
historical reference only.
## Current Next Step
RDP work is paused. Platform-core stages C1-C9 are implemented and verified:
cluster/node model foundation, node enrollment hardening, native
`rap-node-agent` MVP, Platform Admin Console MVP, service workload supervision
contract, mesh control-plane preparation, Mesh MVP skeleton, multi-cluster
hardening, and organization admin foundation.
Planning baseline:
- `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md`
- `artifacts/c1-cluster-node-foundation-report.md`
- `artifacts/c2-node-enrollment-hardening-report.md`
- `artifacts/c3-rap-node-agent-mvp-report.md`
- `artifacts/c4-platform-admin-console-report.md`
- `artifacts/c5-service-workload-supervision-contract-report.md`
- `artifacts/c6-mesh-control-plane-preparation-report.md`
- `artifacts/c7-mesh-mvp-skeleton-report.md`
- `artifacts/c8-multi-cluster-hardening-report.md`
- `artifacts/c9-organization-admin-foundation-report.md`
Stage C1 implemented the backend foundation for:
- explicit clusters
- cluster memberships
- node join tokens with hashed tokens only
- node join requests
- node identity/certificate metadata
- node role assignments
- node heartbeat/latest health
- cluster audit events
- safe migration/backfill of existing node data into a default cluster
Recommended next step is a decision between platform auth hardening for
web-admin, node PKI hardening, node-agent runtime hardening, mesh runtime DP-0,
or returning to paused RDP work.
Do not start all continuation paths at once.
+13
View File
@@ -0,0 +1,13 @@
# Build stage: compiles the rap-node-agent binary with the Go toolchain image.
FROM golang:1.23-bookworm AS build
WORKDIR /src
# go.mod is copied and resolved before the rest of the sources; presumably this
# is so the module download layer can be reused when only source files change.
COPY agents/rap-node-agent/go.mod ./
RUN go mod download
COPY agents/rap-node-agent/ ./
# CGO is disabled and the target is pinned to linux/amd64; the binary is written
# to /out so the runtime stage can copy it.
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/rap-node-agent ./cmd/rap-node-agent
# Runtime stage: distroless static base (nonroot variant) holding only the
# compiled agent binary from the build stage.
FROM gcr.io/distroless/static-debian12:nonroot
COPY --from=build /out/rap-node-agent /usr/local/bin/rap-node-agent
# Run as the unprivileged nonroot user and group.
USER nonroot:nonroot
ENTRYPOINT ["/usr/local/bin/rap-node-agent"]
+487
View File
@@ -0,0 +1,487 @@
# rap-node-agent
Native node agent MVP for the Secure Access Fabric.
Status: Stage C17Z18 synthetic route-health effective path boundary.
This agent is intentionally native. Containers may package service workloads,
but the host-level node identity belongs to `rap-node-agent`.
## Current Scope
Implemented:
- config loading from flags/environment
- local identity state file
- enrollment request client
- heartbeat client
- capability/facts payload
- status-only service reporting payload
- mesh control-channel skeleton
- route-health message skeleton
- relay skeleton that refuses production payload forwarding
- disabled-by-default synthetic mesh runtime for `fabric.probe` /
`fabric.probe_ack`
- direct and single-relay synthetic route tests
- synthetic `fabric.route_health` / `fabric.route_health_ack`
- local route success/failure observations
- fallback route selection for test topology
- route cache invalidation on version changes
- synthetic relay envelope validation
- per-channel bounded queues for synthetic traffic
- QoS dequeue order: `fabric_control`, then `route_control`, then `telemetry`
- telemetry-only stale message drop under backpressure
- reliable fabric/control queue rejection when full
- bounded non-production `synthetic.echo` test-service path
- direct, single-relay, and forced-fallback test-service proofs
- live HTTP peer transport for synthetic mesh envelopes
- disabled-by-default synthetic mesh HTTP endpoint in `rap-node-agent`
- `mesh-live-smoke` harness proving direct and single-relay synthetic traffic
over real local HTTP endpoints
- scoped synthetic mesh config file loading for peer endpoints and routes
- Control Plane synthetic mesh config read fallback when no local scoped config
file is set
- synthetic route-health observations reported to the Control Plane when test
flags allow synthetic links
- explicit production mesh forwarding gate config; production forwarding still
has no runtime implementation and remains unavailable
- route-bound production mesh envelope contract and fail-closed validation on
`/mesh/v1/forward`
- metadata-only production envelope observation hook for valid envelopes, still
without forwarding payloads
- bounded metadata-only production envelope observation sink for accepted
observations
- disabled-by-default node-agent wiring for the bounded observation sink
- local metrics for the bounded observation sink without exposing observation
records
- local node-agent logging for bounded observation sink metrics
- change-driven suppression for unchanged bounded observation sink metrics logs
- explicit local log distinction between production forwarding gate state and
production forwarding runtime state
- node-scoped rendezvous lease refresh through Control Plane synthetic config
- stale relay withdrawal/reselection telemetry
- relay replacement contract reporting for stale rendezvous relays
- route/path decision contract reporting for control-plane route generations
- route generation apply/withdraw tracking for control-plane path decisions
- synthetic route-health route config refresh from Control Plane path
decisions
- route-health expected/observed effective path drift reporting
- maximum capacity guard for the local production observation sink
- panic-safe fail-closed production envelope observation wrapper
- explicit `4096` byte payload boundary for validated production
fabric-control envelopes
- explicit future-skew boundary for validated production envelope `created_at`
- scoped synthetic peer endpoint candidate config with reachability,
NAT/connectivity hints, priority, policy tags, and metadata
- deterministic local peer endpoint candidate scoring model for synthetic
config candidates
- optional local health observation overlay for endpoint candidate scoring
- gate-controlled production `fabric.control` direct next-hop delivery
- route-path-bound production `fabric.control` multi-hop forwarding
- local metadata-only production `fabric.control` forwarding event logs
- route-config-bound production `fabric.control` forwarding validation
- scoped peer directory and bounded recovery seed config parsing/validation
- node-local peer cache with bounded warm peer health probes
- advertised mesh endpoint reporting through heartbeat metadata
- multiple advertised endpoint candidates, including private/corporate LAN
  addresses
- peer connection state machine for warm-peer health
- bounded peer recovery planner over peer cache and connection states
- peer connection intent planner with transport readiness classification
- peer connection manager for real control-plane health over reusable
HTTP keep-alive transport
- route-health effective-path runtime through replacement relay control paths
Not implemented yet:
- mesh packet routing
- production mesh service traffic
- VPN runtime
- production workload supervision
- certificate issuance/rotation
- updater runtime
- privileged host route/firewall control
## Build
```powershell
cd agents\rap-node-agent
go test ./...
go build -o bin\rap-node-agent.exe .\cmd\rap-node-agent
go build -o bin\mesh-live-smoke.exe .\cmd\mesh-live-smoke
```
## First Enrollment
Create a join token from the platform control plane, then run:
```powershell
.\bin\rap-node-agent.exe `
-backend-url http://192.168.200.61:8080/api/v1 `
-cluster-id <cluster_id> `
-join-token <raw_join_token> `
-node-name test-node-1 `
-state-dir C:\ProgramData\RapNodeAgent
```
The agent submits a pending join request and then waits for approval (see
Enrollment Approval below) rather than activating itself. A platform admin must
approve the join request.
## Enrollment Approval
When the agent enrolls, it stores the returned `pending_join_request_id` and
polls the Control Plane bootstrap endpoint until the platform owner approves
the request or the enrollment timeout expires. After approval, the agent
verifies the signed bootstrap contract and writes the approved `node_id`,
`cluster_id`, `identity_status=active`, `cluster_authority_public_key`, and
`cluster_authority_fingerprint` into `identity.json`.
Future C3 hardening can add signed node certificates and automatic secure
certificate material exchange.
Then run the agent again:
```powershell
.\bin\rap-node-agent.exe `
-backend-url http://192.168.200.61:8080/api/v1 `
-state-dir C:\ProgramData\RapNodeAgent
```
It sends periodic heartbeats to:
```text
/api/v1/clusters/{clusterID}/nodes/{nodeID}/heartbeats
```
## Environment Variables
- `RAP_BACKEND_URL`
- `RAP_CLUSTER_ID`
- `RAP_CLUSTER_AUTHORITY_PUBLIC_KEY`
- `RAP_CLUSTER_AUTHORITY_FINGERPRINT`
- `RAP_JOIN_TOKEN`
- `RAP_NODE_NAME`
- `RAP_NODE_STATE_DIR`
- `RAP_WORKLOAD_SUPERVISION_ENABLED`
- `RAP_HEARTBEAT_INTERVAL_SECONDS`
- `RAP_ENROLLMENT_POLL_INTERVAL_SECONDS`
- `RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS`
- `RAP_MESH_SYNTHETIC_RUNTIME_ENABLED`
- `RAP_MESH_LISTEN_ADDR`
- `RAP_MESH_ADVERTISE_ENDPOINT`
- `RAP_MESH_ADVERTISE_ENDPOINTS_JSON`
- `RAP_MESH_ADVERTISE_TRANSPORT`
- `RAP_MESH_CONNECTIVITY_MODE`
- `RAP_MESH_NAT_TYPE`
- `RAP_MESH_REGION`
- `RAP_MESH_SYNTHETIC_CONFIG`
- `RAP_MESH_PEER_ENDPOINTS_JSON`
- `RAP_MESH_SYNTHETIC_ROUTES_JSON`
- `RAP_MESH_PRODUCTION_FORWARDING_ENABLED`
- `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY`
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED` defaults to `false`. It gates only the
C17A/C17B/C17C/C17D/C17E synthetic probe, route-health, relay scheduling,
bounded `synthetic.echo` test-service runtime, and live synthetic HTTP endpoint.
It must not be used for RDP, VPN, file, video, or other production service
traffic.
`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. While service runtime
supervision is still a stub, the agent does not poll desired workloads or report
workload status unless this flag is explicitly enabled.
`RAP_MESH_LISTEN_ADDR` starts the C17E/C17F/C17G synthetic HTTP endpoint only when
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true`. `RAP_MESH_SYNTHETIC_CONFIG` points to
a scoped synthetic mesh config snapshot and is preferred over debug JSON.
`RAP_MESH_PEER_ENDPOINTS_JSON` is a JSON object mapping peer node IDs to
endpoint URLs. `RAP_MESH_SYNTHETIC_ROUTES_JSON` is a JSON array of synthetic
route objects. If no local scoped config file is set, the agent asks the
Control Plane for:
```text
/clusters/{clusterID}/nodes/{nodeID}/mesh/synthetic-config
```
The JSON variables are debug fallback only.
Control Plane synthetic config with `authority_required=true` must include a
signed `authority_payload` / `authority_signature` envelope and a
`cluster_authority` descriptor. The agent verifies the signature, validates the
config hash, and rejects mismatched pinned authority values when
`RAP_CLUSTER_AUTHORITY_PUBLIC_KEY`, `RAP_CLUSTER_AUTHORITY_FINGERPRINT`, or the
same fields in `identity.json` are set.
`RAP_MESH_PRODUCTION_FORWARDING_ENABLED` defaults to `false`. It is the
production-forwarding gate. Through C17Y, turning it on did not enable
production mesh payload forwarding: `/mesh/v1/forward` validated the
route-bound production envelope contract and then returned an unavailable
runtime response. From C17Z onward (described below), enabling the gate allows
only route-bound `fabric.control` forwarding; service payload forwarding stays
unavailable until a later approved production mesh stage implements
route-bound, policy-bound forwarding for it.
The production envelope contract requires route, hop, TTL, expiry, payload
length, and SHA-256 payload hash fields. C17J accepts only the
`fabric_control` channel class and `fabric.control` message type for
validation. RDP, VPN, render, file, video, and service workload channels are
rejected.
C17K adds a local metadata-only observation hook after successful production
envelope validation. Observations include route/message/hop/channel metadata and
payload length/hash, not the payload body. Observation failure fails closed, and
the endpoint still does not forward payloads.
C17L adds a bounded in-memory observation sink for accepted metadata-only
observations. The sink drops the oldest observation when full and still stores
no payload bodies.
`RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` defaults to `0`. When set above
zero, C17M wires the bounded metadata-only sink into the node-agent mesh server.
This remains local-only, exposes no read API, stores no payload bodies, and
does not enable production forwarding. C17R rejects values above `10000`.
C17N adds local sink metrics: configured capacity, current depth, accepted
total, and dropped-oldest total. Metrics do not expose observation records,
route IDs, message IDs, hashes, payload metadata, or payload bodies.
C17O logs those aggregate metrics locally from the node-agent loop when the
sink is explicitly enabled. This does not add a read API or Control Plane
reporting.
C17P logs aggregate sink metrics only when they change, so steady heartbeat
loops do not repeat identical local metrics lines.
C17Q logs `production_forwarding_gate_enabled` separately from
`production_forwarding_runtime_enabled`. The runtime field remains `false`;
turning on the gate still does not enable production forwarding.
C17S makes production envelope observation panic-safe. Observer errors and
observer panics both fail closed as observation failure; forwarding remains
unavailable.
C17T limits validated production `fabric.control` envelope payloads to 4096
bytes. Oversized envelopes are rejected before observation.
C17U rejects production `fabric.control` envelopes whose `created_at` is more
than one minute in the future.
C17V adds scoped peer endpoint candidates to synthetic mesh config. Candidate
entries describe possible per-node endpoints with transport, address,
reachability, NAT type, connectivity mode, priority, policy tags, verification
time, and metadata. They are model/config hints only; no production route
scoring, NAT traversal, shortcut routing, or forwarding runtime is implemented.
C17W adds deterministic local scoring for scoped endpoint candidates. Scoring
uses transport, reachability, connectivity mode, NAT type, priority, preferred
region, policy tags, channel class, and verification age. It returns ranked
candidates and reason labels only; it does not select production routes, open
connections, perform NAT traversal, or forward payloads.
C17X extends candidate scoring with optional local health observations keyed by
`endpoint_id`. Observations can contribute latency, success/failure history,
recent failure reason, reliability score, and freshness/staleness signals.
The score remains advisory only and is not wired into production forwarding.
C17Z adds the first narrow production forwarding runtime. When
`RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`, `/mesh/v1/forward` can deliver
route-bound `fabric.control` envelopes at the local destination or forward them
to a direct next hop from explicit peer endpoint config. Service channels,
RDP/VPN/file/video payloads, arbitrary relay forwarding, and multi-hop
production route execution remain unavailable.
C17Z1 adds route-path-bound multi-hop forwarding for production
`fabric.control` only. Envelopes may carry `route_path` and
`visited_node_ids`; each relay validates its path position, forwards only to
the next route-path node, updates TTL/hop/visited metadata, and rejects loops.
Service payloads remain unavailable.
C17Z2 emits local `mesh_production_forward_event` logs for production
`fabric.control` forwarding outcomes: accepted, forwarded, delivered, and
rejected. Logs include metadata only: route, message, hop, channel, status,
reason, TTL, hop count, route path length, visited count, and payload length.
Payload bodies are not logged, no observation read API is added, and service
payloads remain unavailable.
C17Z3 binds production `fabric.control` forwarding to loaded scoped or
Control Plane route config when routes are available locally. Configured
envelopes must match `route_id`, cluster, source, destination, route path,
next hop, allowed channel, expiry, max TTL, and max hop count before
forwarding. If no route config is present, existing C17Z1 behavior is
preserved. Service payloads remain unavailable.
C17Z4 adds scoped peer directory and recovery seed config. `peer_directory`
describes only peers needed by the node-scoped mesh config. `recovery_seeds`
is an explicit, bounded bootstrap/recovery list and is not a full cluster node
list. The node-agent parses and validates these fields, but does not yet
implement a persistent connection manager, NAT traversal, or
relay/rendezvous runtime.
C17Z5 turns scoped peer directory and recovery seed config into node-local
runtime `PeerCache` state. The cache builds a bounded warm peer set from
route-adjacent peers, recovery seeds, peer endpoints, and endpoint candidates.
When synthetic mesh testing is enabled, the node-agent probes warm peers with
`/mesh/v1/health` and reports metadata-only mesh-link observations. This is not
a persistent connection manager and does not forward service payloads.
C17Z6 adds advertised mesh endpoint reporting. When
`RAP_MESH_ADVERTISE_ENDPOINT` is set, node-agent includes a
`mesh_endpoint_report` in heartbeat metadata with transport, connectivity mode,
NAT hint, region, observed time, and endpoint candidate metadata. Control Plane
can project the latest reported endpoint into node-scoped synthetic mesh config
for route-path peers. This does not perform automatic public IP discovery,
STUN/TURN/ICE NAT classification, or service payload forwarding.
C17Z7 adds `RAP_MESH_ADVERTISE_ENDPOINTS_JSON` for multiple advertised
endpoints per node. Candidates can describe public, private, corporate/LAN,
outbound, or relay-style addresses. Endpoint scoring rewards `private-lan`,
`corp-lan`, and `same-site` policy tags, and peer cache can use the best
candidate address for warm-peer health probes. This supports corporate-network
cluster segments without enabling service payload forwarding.
C17Z8 adds a node-local peer connection state machine on top of warm-peer
health probes. Warm peers move through `disconnected`, `connecting`, `ready`,
`degraded`, and `backoff`; repeated probe failures enter bounded backoff, and
successful probes recover to `ready`. Mesh-link observations include
metadata-only connection state. This is not a persistent socket/session manager
and does not forward service payloads.
C17Z9 adds a node-local peer recovery planner. The node targets a bounded
stable ready-peer set, defaulting to three connectable peers when available,
instead of probing every known cluster node. When ready peers fall below target,
the planner selects bounded recovery probes from warm peers, recovery seeds,
and other connectable scoped peers, skipping active backoff entries. Heartbeats
include metadata-only `mesh_peer_recovery_report` state. This is not persistent
connection transport, NAT traversal, relay/rendezvous runtime, or service
payload forwarding.
C17Z10 adds a node-local peer connection intent planner over the C17Z9 recovery
plan. It classifies bounded peer work as `maintain`, `probe`, or `recover`,
and classifies transport readiness as `direct`, `private_lan`,
`corporate_lan`, `outbound_only`, or `relay_required`. Heartbeats include
metadata-only `mesh_peer_connection_intent_report` counts. This is not
persistent connection transport, STUN/TURN/ICE, NAT traversal, relay runtime,
or service payload forwarding.
C17Z11 adds the first real node-local peer connection manager for mesh
control-plane health. It uses a reusable HTTP keep-alive client to probe
direct/private/corporate peer endpoints selected by C17Z10 intents, updates
the shared peer connection tracker, and records `waiting_rendezvous` for
outbound-only or relay-required peers. Heartbeats include metadata-only
`mesh_peer_connection_manager_report` state. This is not STUN/TURN/ICE,
relay/rendezvous runtime, route lease generation, VPN runtime, or service
payload forwarding.
C17Z12 adds a node-scoped rendezvous/relay control-plane lease contract for
peers that would otherwise remain `waiting_rendezvous`. The agent consumes
`rendezvous_leases`, resolves matching intents into `relay_control`, probes the
relay node `/mesh/v1/health`, and records `relay_ready` for the peer control
path. This remains control-plane health only and does not enable
RDP/VPN/file/video/service payload forwarding, arbitrary relay packet
forwarding, STUN/TURN/ICE, or host networking changes.
C17Z13 adds heartbeat telemetry for rendezvous lease admission and renewal
posture. The agent emits `mesh_rendezvous_lease_report` with local role,
relay/peer admission counts, TTL, renewal-after time, renewal-needed status,
`relay_ready`, and explicit no-payload boundary flags. This remains
metadata-only control-plane telemetry and does not enable service payload
forwarding.
C17Z14 adds a control-plane refresh contract for rendezvous leases. When a
lease is renewal-needed, expired, invalid, or tied to a stale relay state, the
agent reloads node-scoped synthetic config from Control Plane, updates the
running peer cache/route/lease state, and reports refresh counters plus stale
relay withdrawal/reselection fields. This remains control-plane health only
and does not enable service payload forwarding.
C17Z15 adds the node side of backend relay replacement policy. The agent
advertises the relay replacement contract capability and emits
`c17z15.mesh_rendezvous_lease_report.v1`; stale relay state is matched to the
exact rendezvous lease/relay when that metadata is present, so an alternate
replacement lease for the same peer is not treated as stale by association.
This remains control-plane health only and does not enable service payload
forwarding.
C17Z16 adds route/path decision reporting. The agent consumes
`route_path_decisions` from Control Plane synthetic config, keeps the latest
control-plane generation in local state, and emits
`c17z18.mesh_route_path_decision_report.v1` with effective hops, previous/next
hop, selected replacement relay, generation, and no-payload boundary flags.
This remains metadata-only route planning and does not enable service payload
forwarding.
C17Z17 adds node-side route generation tracking for Control Plane
`route_path_decisions`. The agent emits
`c17z18.mesh_route_generation_report.v1` with active, applied, unchanged, and
withdrawn decision counts, total counters, generation change state, active
decision details, and withdrawn decision details. When the first observed
config already contains a stale relay replacement, the tracker emits a
`withdrawn_by_replacement` record for the old relay path. This remains
metadata-only route planning and does not enable service payload forwarding.
C17Z18 applies Control Plane `route_path_decisions` to synthetic route-health
route config only. The agent keeps base routes separate from route-health
routes, periodically refreshes scoped config, emits
`c17z18.mesh_route_health_config_report.v1`, and reports route-health
observations with expected/observed hops and drift status. This probes
replacement relay effective paths for control-plane health only and does not
enable service payload forwarding.
Scoped synthetic config shape:
```json
{
"schema_version": "c17z18.synthetic.v1",
"cluster_id": "cluster-1",
"local_node_id": "node-a",
"config_version": "config-v1",
"peer_directory_version": "peers-v1",
"policy_version": "policy-v1",
"peer_endpoints": {
"node-b": "http://127.0.0.1:19002"
},
"peer_endpoint_candidates": {
"node-b": [
{
"endpoint_id": "node-b-public",
"node_id": "node-b",
"transport": "direct_tcp_tls",
"address": "203.0.113.20:443",
"reachability": "public",
"nat_type": "restricted",
"connectivity_mode": "direct",
"priority": 10
}
]
},
"routes": [],
"route_path_decisions": {
"schema_version": "c17z18.route_path_decisions.v1",
"decisions": []
}
}
```
## C17E Live Synthetic Smoke
Run:
```powershell
cd agents\rap-node-agent
go run .\cmd\mesh-live-smoke
```
Expected:
- scoped synthetic config loads
- direct `node-a -> node-b` synthetic probe succeeds
- relay `node-a -> node-r -> node-b` synthetic probe succeeds
- bounded `synthetic.echo` test-service succeeds
- `production_forwarding=false`
## Safety Rules
- The agent never assigns roles to itself.
- The agent reports capabilities only.
- Platform policy assigns roles.
- No RDP/VPN/production service traffic is carried by the C17A-C17Z18 staged
mesh runtime.
- Production forwarding remains disabled by default and limited to
`fabric.control` when explicitly enabled.
- No privileged operations are performed by the current agent.
@@ -0,0 +1,186 @@
package main
import (
"context"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
)
// smokeNode bundles one participant of the live smoke topology: its mesh
// identity, its synthetic runtime, and the URL other nodes use to reach it.
type smokeNode struct {
// Local is the cluster/node identity this node runs as.
Local mesh.PeerIdentity
// Runtime is assigned by run after the node has been constructed, once the
// routes and peer endpoints for the topology are known.
Runtime *mesh.SyntheticRuntime
// URL is the address handed to other nodes as this node's peer endpoint.
URL string
// server is the local httptest server owned by this node.
// NOTE(review): presumably it backs URL and is shut down by Close — confirm
// against newSmokeNode and Close, which are outside the reviewed range.
server *httptest.Server
}
// smokeReport is the JSON document printed by main after a successful smoke
// run. Each *Accepted flag records whether the corresponding ack carried the
// expected message type.
type smokeReport struct {
Stage string `json:"stage"`
// ProductionForwarding is always reported as false by run.
ProductionForwarding bool `json:"production_forwarding"`
// ScopedConfigLoaded is true when the loaded config carries the version
// written by writeSmokeScopedConfig.
ScopedConfigLoaded bool `json:"scoped_config_loaded"`
DirectProbeAccepted bool `json:"direct_probe_accepted"`
// DirectPath is the hop path decoded from the direct probe ack payload.
DirectPath []string `json:"direct_path"`
RelayProbeAccepted bool `json:"relay_probe_accepted"`
// RelayPath is the hop path decoded from the relay probe ack payload.
RelayPath []string `json:"relay_path"`
TestServiceAccepted bool `json:"test_service_accepted"`
TestServiceEchoPayload string `json:"test_service_echo_payload"`
// PeerEndpoints maps node IDs to the httptest server URLs used in the run.
PeerEndpoints map[string]any `json:"peer_endpoints"`
}
// main runs the live smoke scenario once and prints the resulting report as
// indented JSON on stdout. Any failure is reported on stderr and terminates
// the process with exit status 1.
func main() {
	// fail prints a formatted diagnostic and exits non-zero.
	fail := func(format string, cause error) {
		fmt.Fprintf(os.Stderr, format, cause)
		os.Exit(1)
	}

	report, runErr := run(context.Background())
	if runErr != nil {
		fail("mesh live smoke failed: %v\n", runErr)
	}

	encoded, marshalErr := json.MarshalIndent(report, "", " ")
	if marshalErr != nil {
		fail("marshal report: %v\n", marshalErr)
	}
	fmt.Println(string(encoded))
}
// run executes the live smoke scenario end to end and returns the report.
//
// It starts three in-process nodes (node-a, node-r, node-b) behind httptest
// servers, writes and loads a scoped synthetic config for node-a, wires a
// synthetic runtime into every node, and then sends from node-a:
//   - a probe over the direct route node-a -> node-b,
//   - a probe over the relay route node-a -> node-r -> node-b,
//   - a synthetic test-service request over the relay route.
//
// The first failing step aborts the run with a wrapped error. The temporary
// scoped config file is removed before run returns.
func run(ctx context.Context) (smokeReport, error) {
	nodeA := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
	defer nodeA.Close()
	nodeR := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
	defer nodeR.Close()
	nodeB := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
	defer nodeB.Close()

	directRoute := smokeRoute("route-direct", []string{"node-a", "node-b"})
	relayRoute := smokeRoute("route-relay", []string{"node-a", "node-r", "node-b"})
	routes := []mesh.SyntheticRoute{directRoute, relayRoute}

	nodeAConfigPath, err := writeSmokeScopedConfig(nodeA.Local, map[string]string{
		"node-r": nodeR.URL,
		"node-b": nodeB.URL,
	}, routes)
	if err != nil {
		return smokeReport{}, fmt.Errorf("write node-a scoped config: %w", err)
	}
	// The file only exists to exercise the loader; do not leave it behind in
	// the temp directory. Removal is best-effort, so its error is ignored.
	defer func() { _ = os.Remove(nodeAConfigPath) }()

	nodeAConfig, err := mesh.LoadScopedSyntheticConfig(nodeAConfigPath, nodeA.Local)
	if err != nil {
		return smokeReport{}, fmt.Errorf("load node-a scoped config: %w", err)
	}

	// node-a is driven purely by the loaded scoped config; the relay and the
	// destination get the full route list and only the peers they forward to.
	nodeA.Runtime = smokeRuntime(nodeA.Local, nodeAConfig.Routes, nodeAConfig.PeerEndpoints)
	nodeR.Runtime = smokeRuntime(nodeR.Local, routes, map[string]string{
		"node-b": nodeB.URL,
	})
	nodeB.Runtime = smokeRuntime(nodeB.Local, routes, map[string]string{})

	directAck, err := nodeA.Runtime.SendProbe(ctx, directRoute.RouteID, mesh.SyntheticChannelFabricControl, "smoke-direct")
	if err != nil {
		return smokeReport{}, fmt.Errorf("direct probe: %w", err)
	}
	relayAck, err := nodeA.Runtime.SendProbe(ctx, relayRoute.RouteID, mesh.SyntheticChannelFabricControl, "smoke-relay")
	if err != nil {
		return smokeReport{}, fmt.Errorf("relay probe: %w", err)
	}
	testService, err := nodeA.Runtime.SendTestService(ctx, relayRoute.RouteID, mesh.SyntheticChannelRouteControl, mesh.SyntheticTestServiceRequest{
		RequestID:      "smoke-test-service",
		OrganizationID: mesh.SyntheticDefaultTestOrganizationID,
		ServiceType:    mesh.SyntheticTestServiceType,
		Payload:        "hello-c17e",
		SentAt:         time.Now().UTC(),
	})
	if err != nil {
		return smokeReport{}, fmt.Errorf("test service: %w", err)
	}

	return smokeReport{
		Stage:                  "C17F scoped synthetic config plus live HTTP transport",
		ProductionForwarding:   false,
		ScopedConfigLoaded:     nodeAConfig.ConfigVersion == "smoke-config-v1",
		DirectProbeAccepted:    directAck.MessageType == mesh.SyntheticMessageProbeAck,
		DirectPath:             decodeProbePath(directAck),
		RelayProbeAccepted:     relayAck.MessageType == mesh.SyntheticMessageProbeAck,
		RelayPath:              decodeProbePath(relayAck),
		TestServiceAccepted:    testService.Ack.MessageType == mesh.SyntheticMessageTestServiceAck,
		TestServiceEchoPayload: testService.Response.EchoPayload,
		PeerEndpoints: map[string]any{
			"node-a": nodeA.URL,
			"node-r": nodeR.URL,
			"node-b": nodeB.URL,
		},
	}, nil
}
// writeSmokeScopedConfig serialises a scoped synthetic config for the given
// local identity, peer endpoint map and routes, writes it with mode 0600 to a
// fixed file name inside the OS temp directory, and returns that path.
func writeSmokeScopedConfig(local mesh.PeerIdentity, peers map[string]string, routes []mesh.SyntheticRoute) (string, error) {
	scoped := mesh.ScopedSyntheticConfig{
		SchemaVersion:        "c17f.synthetic.v1",
		ClusterID:            local.ClusterID,
		LocalNodeID:          local.NodeID,
		ConfigVersion:        "smoke-config-v1",
		PeerDirectoryVersion: "smoke-peers-v1",
		PolicyVersion:        "smoke-policy-v1",
		PeerEndpoints:        peers,
		Routes:               routes,
	}

	encoded, marshalErr := json.Marshal(scoped)
	if marshalErr != nil {
		return "", marshalErr
	}

	target := filepath.Join(os.TempDir(), "rap-c17e-node-a-scoped-mesh.json")
	if writeErr := os.WriteFile(target, encoded, 0o600); writeErr != nil {
		return "", writeErr
	}
	return target, nil
}
// newSmokeNode starts an httptest server for the given identity and returns
// the node wrapping it. The handler builds a mesh.Server per request from the
// node's current fields, so a Runtime assigned later is picked up.
func newSmokeNode(local mesh.PeerIdentity) *smokeNode {
	node := new(smokeNode)
	node.Local = local

	serve := func(w http.ResponseWriter, r *http.Request) {
		srv := mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime}
		srv.Handler().ServeHTTP(w, r)
	}
	node.server = httptest.NewServer(http.HandlerFunc(serve))
	node.URL = node.server.URL
	return node
}
// Close shuts down the node's httptest server, if one was started.
func (n *smokeNode) Close() {
	if n.server == nil {
		return
	}
	n.server.Close()
}
// smokeRuntime builds an enabled synthetic runtime for the given identity
// that allows the fabric-control and route-control channels and reaches its
// peers over HTTP using the supplied endpoint map.
func smokeRuntime(local mesh.PeerIdentity, routes []mesh.SyntheticRoute, peers map[string]string) *mesh.SyntheticRuntime {
	channels := []string{
		mesh.SyntheticChannelFabricControl,
		mesh.SyntheticChannelRouteControl,
	}
	cfg := mesh.SyntheticRuntimeConfig{
		Enabled:         true,
		Local:           local,
		Routes:          routes,
		AllowedChannels: channels,
		Transport:       mesh.NewHTTPPeerTransport(peers),
	}
	return mesh.NewSyntheticRuntime(cfg)
}
// smokeRoute describes a synthetic route in cluster-1 that follows hops in
// order. The first hop becomes the source and the last hop the destination,
// so hops must not be empty. The route expires one hour from now.
func smokeRoute(routeID string, hops []string) mesh.SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]
	channels := []string{mesh.SyntheticChannelFabricControl, mesh.SyntheticChannelRouteControl}
	expiry := time.Now().UTC().Add(time.Hour)

	return mesh.SyntheticRoute{
		RouteID:              routeID,
		ClusterID:            "cluster-1",
		SourceNodeID:         source,
		DestinationNodeID:    destination,
		Hops:                 hops,
		AllowedChannels:      channels,
		MaxTTL:               8,
		MaxHops:              8,
		ExpiresAt:            expiry,
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
}
// decodeProbePath extracts the hop path from a probe ack envelope.
//
// It returns nil when the envelope payload cannot be decoded as a probe ack,
// so a malformed ack never yields a partially populated path in the report.
func decodeProbePath(envelope mesh.SyntheticEnvelope) []string {
	var payload mesh.SyntheticProbeAckPayload
	if err := json.Unmarshal(envelope.Payload, &payload); err != nil {
		return nil
	}
	return payload.Path
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+3
View File
@@ -0,0 +1,3 @@
module github.com/example/remote-access-platform/agents/rap-node-agent
go 1.23.2
@@ -0,0 +1,70 @@
package agent
import (
"runtime"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// Version is the node-agent version string reported in the enrollment,
// heartbeat and telemetry payloads built by this package.
const Version = "0.1.0-c3"
// EnrollmentPayload builds the enrollment request for the given cluster, join
// token and local identity. It reports a fixed capability set (only
// can_run_rdp_worker is true), basic host facts, and an empty, non-nil list
// of requested roles.
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
	capabilities := map[string]any{
		"can_accept_client_ingress":   false,
		"can_accept_node_ingress":     false,
		"can_route_mesh":              false,
		"can_run_rdp_worker":          true,
		"can_run_vnc_worker":          false,
		"can_run_vpn_exit":            false,
		"can_run_vpn_connector":       false,
		"can_run_file_cache":          false,
		"can_run_update_cache":        false,
		"can_run_video_relay":         false,
		"native_node_agent_version":   Version,
		"service_supervision_enabled": false,
	}
	facts := map[string]any{
		"os":        runtime.GOOS,
		"arch":      runtime.GOARCH,
		"agent":     "rap-node-agent",
		"agent_ver": Version,
	}

	var request client.EnrollRequest
	request.ClusterID = clusterID
	request.JoinToken = joinToken
	request.NodeName = identity.NodeName
	request.NodeFingerprint = identity.NodeFingerprint
	request.PublicKey = identity.PublicKey
	request.ReportedCapabilities = capabilities
	request.ReportedFacts = facts
	// Deliberately an empty slice rather than nil so it encodes as [] in JSON.
	request.RequestedRoles = []string{}
	return request
}
// HeartbeatPayload builds the static heartbeat request: a healthy status, the
// agent version, and markers stating that workload supervision is not
// implemented at this stage.
func HeartbeatPayload() client.HeartbeatRequest {
	var request client.HeartbeatRequest
	request.HealthStatus = "healthy"
	request.ReportedVersion = Version
	request.Capabilities = map[string]any{"native_node_agent": true}
	request.ServiceStates = map[string]any{"workload_supervision": "not_implemented_c3"}
	request.Metadata = map[string]any{"stage": "c3"}
	return request
}
// MeshSelfObservationPayload builds a mesh link observation in which the node
// reports itself as both source and target, marked reachable, with metadata
// stating that traffic forwarding is off.
func MeshSelfObservationPayload(identity state.Identity) client.MeshLinkObservationRequest {
	self := identity.NodeID
	metadata := map[string]any{
		"stage":              "c6",
		"traffic_forwarding": false,
		"observation_type":   "self",
	}

	var observation client.MeshLinkObservationRequest
	observation.SourceNodeID = self
	observation.TargetNodeID = self
	observation.LinkStatus = "reachable"
	observation.Metadata = metadata
	return observation
}
@@ -0,0 +1,44 @@
package agent
import (
"testing"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
func TestEnrollmentPayloadDoesNotRequestRolesByDefault(t *testing.T) {
payload := EnrollmentPayload("cluster-1", "join-token", state.Identity{
NodeName: "node-a",
NodeFingerprint: "fp",
PublicKey: "pub",
})
if payload.ClusterID != "cluster-1" || payload.JoinToken != "join-token" {
t.Fatalf("unexpected enrollment payload: %+v", payload)
}
if len(payload.RequestedRoles) != 0 {
t.Fatalf("agent must not self-assign roles: %+v", payload.RequestedRoles)
}
if payload.ReportedCapabilities["can_run_rdp_worker"] != true {
t.Fatalf("expected rdp capability in MVP payload: %+v", payload.ReportedCapabilities)
}
}
// TestHeartbeatPayloadIsStatusOnly checks that the heartbeat reports a
// healthy status and never claims workload supervision is running.
func TestHeartbeatPayloadIsStatusOnly(t *testing.T) {
	heartbeat := HeartbeatPayload()

	if got := heartbeat.HealthStatus; got != "healthy" {
		t.Fatalf("HealthStatus = %q", got)
	}
	if supervision := heartbeat.ServiceStates["workload_supervision"]; supervision == "running" {
		t.Fatal("C3 must not pretend workload supervision is implemented")
	}
}
// TestMeshSelfObservationDoesNotEnableTrafficForwarding checks that the self
// observation uses the node ID on both ends and reports forwarding as false.
func TestMeshSelfObservationDoesNotEnableTrafficForwarding(t *testing.T) {
	const nodeID = "node-1"
	observation := MeshSelfObservationPayload(state.Identity{NodeID: nodeID})

	isSelfLink := observation.SourceNodeID == nodeID && observation.TargetNodeID == nodeID
	if !isSelfLink {
		t.Fatalf("unexpected mesh self observation payload: %+v", observation)
	}
	if forwarding := observation.Metadata["traffic_forwarding"]; forwarding != false {
		t.Fatalf("traffic forwarding must stay disabled in C6: %+v", observation.Metadata)
	}
}
@@ -0,0 +1,33 @@
package agent
import (
"runtime"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// TelemetryPayload builds a telemetry report for the running agent process.
//
// Memory figures come from runtime.MemStats (Alloc as used, Sys as total) and
// ProcessCount carries the number of goroutines. The goroutine count and the
// clock are each sampled exactly once, so ProcessCount always agrees with the
// "goroutines" payload entry and uptime_seconds is measured against the same
// instant that is reported as ObservedAt.
//
// identity supplies the node name; startedAt is the instant the agent started
// and is used to derive uptime_seconds.
func TelemetryPayload(identity state.Identity, startedAt time.Time) client.TelemetryRequest {
	var mem runtime.MemStats
	runtime.ReadMemStats(&mem)
	used := int64(mem.Alloc)
	total := int64(mem.Sys)

	// Take one sample of each volatile value so the report is self-consistent.
	goroutines := runtime.NumGoroutine()
	processCount := goroutines
	now := time.Now()

	return client.TelemetryRequest{
		MemoryUsedBytes:  &used,
		MemoryTotalBytes: &total,
		ProcessCount:     &processCount,
		Payload: map[string]any{
			"agent":            "rap-node-agent",
			"agent_version":    Version,
			"node_name":        identity.NodeName,
			"os":               runtime.GOOS,
			"arch":             runtime.GOARCH,
			"goroutines":       goroutines,
			"uptime_seconds":   int64(now.Sub(startedAt).Seconds()),
			"telemetry_source": "testing_flag",
		},
		ObservedAt: now.UTC(),
	}
}
@@ -0,0 +1,110 @@
package authority
import (
"crypto/ed25519"
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"strings"
)
// Schema and algorithm identifiers for cluster authority documents.
const (
AuthoritySchemaVersion = "rap.cluster_authority.v1"
// SignatureSchemaVersion is the only Signature.SchemaVersion VerifyRaw accepts.
SignatureSchemaVersion = "rap.cluster_authority.signature.v1"
// AlgorithmEd25519 is the only Signature.Algorithm VerifyRaw accepts.
AlgorithmEd25519 = "ed25519"
)
// Sentinel errors returned (wrapped with detail) by this package; match them
// with errors.Is.
var (
// ErrInvalidKey reports a public key that is not valid base64 or has the wrong length.
ErrInvalidKey = errors.New("invalid cluster authority key")
// ErrInvalidSignature reports a signature that is malformed or does not verify.
ErrInvalidSignature = errors.New("invalid cluster authority signature")
// ErrInvalidPayload reports a payload that is empty or not valid JSON.
ErrInvalidPayload = errors.New("invalid cluster authority payload")
)
// Signature is the detached signature envelope checked by VerifyRaw.
type Signature struct {
// SchemaVersion must equal SignatureSchemaVersion.
SchemaVersion string `json:"schema_version"`
// Algorithm must equal AlgorithmEd25519.
Algorithm string `json:"algorithm"`
// KeyFingerprint must equal Fingerprint of the verifying public key.
KeyFingerprint string `json:"key_fingerprint"`
// Signature is the base64 (padded or unpadded) Ed25519 signature.
Signature string `json:"signature"`
}
// VerifyRaw checks that signature is a valid Ed25519 signature over the
// canonical JSON form of payload, made by the key given as base64 in
// publicKeyB64.
//
// Checks run in this order: signature schema version, algorithm, public key
// decoding, key fingerprint match, payload canonicalisation, signature
// decoding and length, and finally the cryptographic verification. Signature
// problems are reported as errors wrapping ErrInvalidSignature; key and
// payload problems surface the errors of decodePublicKey and CanonicalJSON.
func VerifyRaw(publicKeyB64 string, payload json.RawMessage, signature Signature) error {
	switch {
	case signature.SchemaVersion != SignatureSchemaVersion:
		return fmt.Errorf("%w: schema_version must be %s", ErrInvalidSignature, SignatureSchemaVersion)
	case signature.Algorithm != AlgorithmEd25519:
		return fmt.Errorf("%w: algorithm must be %s", ErrInvalidSignature, AlgorithmEd25519)
	}

	key, keyErr := decodePublicKey(publicKeyB64)
	if keyErr != nil {
		return keyErr
	}
	if Fingerprint(key) != signature.KeyFingerprint {
		return fmt.Errorf("%w: key fingerprint mismatch", ErrInvalidSignature)
	}

	message, canonErr := CanonicalJSON(payload)
	if canonErr != nil {
		return canonErr
	}

	sigBytes, sigErr := decodeBase64(strings.TrimSpace(signature.Signature))
	if sigErr != nil || len(sigBytes) != ed25519.SignatureSize {
		return fmt.Errorf("%w: signature must be base64 ed25519 signature", ErrInvalidSignature)
	}

	if ed25519.Verify(key, message, sigBytes) {
		return nil
	}
	return ErrInvalidSignature
}
// Fingerprint returns the identifier for publicKey: the prefix
// "rap-ca-ed25519-" followed by the hex encoding of the first 16 bytes of the
// key's SHA-256 digest.
func Fingerprint(publicKey ed25519.PublicKey) string {
	const prefix = "rap-ca-ed25519-"
	digest := sha256.Sum256(publicKey)
	truncated := digest[:16]
	return prefix + hex.EncodeToString(truncated)
}
// HashRaw returns the lowercase hex SHA-256 digest of the canonical JSON form
// of raw. It fails with the error from CanonicalJSON when raw is empty or not
// valid JSON.
func HashRaw(raw json.RawMessage) (string, error) {
	normalized, canonErr := CanonicalJSON(raw)
	if canonErr != nil {
		return "", canonErr
	}
	digest := sha256.Sum256(normalized)
	encoded := hex.EncodeToString(digest[:])
	return encoded, nil
}
// CanonicalJSON normalises raw by decoding it into a generic value and
// re-encoding it with encoding/json, which emits object keys in sorted order
// and drops insignificant whitespace. Empty or invalid input yields an error
// wrapping ErrInvalidPayload.
//
// Numbers pass through float64 during the round trip, as is standard for
// decoding into an untyped value.
func CanonicalJSON(raw json.RawMessage) ([]byte, error) {
	if len(raw) == 0 {
		return nil, fmt.Errorf("%w: empty payload", ErrInvalidPayload)
	}

	var decoded any
	if decodeErr := json.Unmarshal(raw, &decoded); decodeErr != nil {
		return nil, fmt.Errorf("%w: invalid json: %v", ErrInvalidPayload, decodeErr)
	}

	encoded, encodeErr := json.Marshal(decoded)
	if encodeErr != nil {
		return nil, fmt.Errorf("%w: canonical json: %v", ErrInvalidPayload, encodeErr)
	}
	return encoded, nil
}
// decodePublicKey parses a base64 Ed25519 public key, ignoring surrounding
// whitespace. It returns an error wrapping ErrInvalidKey when the value is
// not base64 or does not decode to exactly ed25519.PublicKeySize bytes.
func decodePublicKey(value string) (ed25519.PublicKey, error) {
	raw, decodeErr := decodeBase64(strings.TrimSpace(value))
	switch {
	case decodeErr != nil:
		return nil, fmt.Errorf("%w: public key must be base64 encoded", ErrInvalidKey)
	case len(raw) != ed25519.PublicKeySize:
		return nil, fmt.Errorf("%w: public key must decode to %d bytes", ErrInvalidKey, ed25519.PublicKeySize)
	}
	return ed25519.PublicKey(raw), nil
}
// decodeBase64 decodes value as standard base64, accepting both the padded
// and the unpadded form. An empty string is rejected. When both decoders
// fail, the error from the unpadded decoder is returned.
func decodeBase64(value string) ([]byte, error) {
	if len(value) == 0 {
		return nil, errors.New("empty base64 value")
	}
	if padded, err := base64.StdEncoding.DecodeString(value); err == nil {
		return padded, nil
	}
	// Fall back to the unpadded alphabet for inputs without trailing '='.
	return base64.RawStdEncoding.DecodeString(value)
}
@@ -0,0 +1,52 @@
package authority
import (
"crypto/ed25519"
"encoding/base64"
"encoding/json"
"errors"
"testing"
)
// TestVerifyRawAcceptsSignedPayload signs the canonical form of a payload
// with a freshly generated key and expects VerifyRaw to accept it.
func TestVerifyRawAcceptsSignedPayload(t *testing.T) {
	pub, priv, genErr := ed25519.GenerateKey(nil)
	if genErr != nil {
		t.Fatalf("GenerateKey: %v", genErr)
	}

	raw := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`)
	message, canonErr := CanonicalJSON(raw)
	if canonErr != nil {
		t.Fatalf("CanonicalJSON: %v", canonErr)
	}

	sig := Signature{
		SchemaVersion:  SignatureSchemaVersion,
		Algorithm:      AlgorithmEd25519,
		KeyFingerprint: Fingerprint(pub),
		Signature:      base64.StdEncoding.EncodeToString(ed25519.Sign(priv, message)),
	}
	encodedKey := base64.StdEncoding.EncodeToString(pub)
	if verifyErr := VerifyRaw(encodedKey, raw, sig); verifyErr != nil {
		t.Fatalf("VerifyRaw: %v", verifyErr)
	}
}
// TestVerifyRawRejectsTamperedPayload signs one payload and then verifies a
// modified payload against that signature, expecting ErrInvalidSignature.
func TestVerifyRawRejectsTamperedPayload(t *testing.T) {
	pub, priv, genErr := ed25519.GenerateKey(nil)
	if genErr != nil {
		t.Fatalf("GenerateKey: %v", genErr)
	}

	signed := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`)
	message, canonErr := CanonicalJSON(signed)
	if canonErr != nil {
		t.Fatalf("CanonicalJSON: %v", canonErr)
	}

	sig := Signature{
		SchemaVersion:  SignatureSchemaVersion,
		Algorithm:      AlgorithmEd25519,
		KeyFingerprint: Fingerprint(pub),
		Signature:      base64.StdEncoding.EncodeToString(ed25519.Sign(priv, message)),
	}

	tampered := json.RawMessage(`{"cluster_id":"cluster-2","schema_version":"test.v1"}`)
	encodedKey := base64.StdEncoding.EncodeToString(pub)
	err := VerifyRaw(encodedKey, tampered, sig)
	if !errors.Is(err, ErrInvalidSignature) {
		t.Fatalf("err = %v, want ErrInvalidSignature", err)
	}
}
@@ -0,0 +1,400 @@
package client
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"
)
// Client is the HTTP client for the backend API used by the node agent.
// Construct it with New.
type Client struct {
// baseURL is prepended verbatim to every request path.
baseURL string
httpClient *http.Client
}
// EnrollRequest is the JSON body posted to /node-agents/enroll by Enroll.
type EnrollRequest struct {
ClusterID string `json:"cluster_id"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
NodeFingerprint string `json:"node_fingerprint"`
PublicKey string `json:"public_key"`
ReportedCapabilities map[string]any `json:"reported_capabilities"`
ReportedFacts map[string]any `json:"reported_facts"`
RequestedRoles []string `json:"requested_roles"`
}
// EnrollResponse is the decoded reply to Enroll.
type EnrollResponse struct {
Status string `json:"status"`
// JoinRequest is kept as raw JSON; this package does not decode it.
JoinRequest json.RawMessage `json:"join_request"`
}
// EnrollmentBootstrapRequest is the JSON body posted by BootstrapEnrollment.
type EnrollmentBootstrapRequest struct {
ClusterID string `json:"cluster_id"`
NodeFingerprint string `json:"node_fingerprint"`
PublicKey string `json:"public_key"`
}
// EnrollmentBootstrapResponse is the decoded reply to BootstrapEnrollment.
type EnrollmentBootstrapResponse struct {
Status string `json:"status"`
// JoinRequest is kept as raw JSON; this package does not decode it.
JoinRequest json.RawMessage `json:"join_request"`
// Bootstrap is optional in the wire format ("node_bootstrap", omitempty).
Bootstrap *NodeBootstrap `json:"node_bootstrap,omitempty"`
}
// NodeBootstrap is the bootstrap document carried in an
// EnrollmentBootstrapResponse, including optional cluster authority material.
type NodeBootstrap struct {
NodeID string `json:"node_id"`
ClusterID string `json:"cluster_id"`
IdentityStatus string `json:"identity_status"`
Certificate map[string]any `json:"certificate"`
HeartbeatEndpoint string `json:"heartbeat_endpoint"`
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
// AuthorityPayload is kept as raw JSON.
// NOTE(review): presumably the bytes covered by AuthoritySignature; confirm against the verifier.
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
}
// HeartbeatRequest is the JSON body posted by Heartbeat.
type HeartbeatRequest struct {
HealthStatus string `json:"health_status"`
ReportedVersion string `json:"reported_version,omitempty"`
Capabilities map[string]any `json:"capabilities"`
ServiceStates map[string]any `json:"service_states"`
Metadata map[string]any `json:"metadata"`
}
// HeartbeatResponse is the decoded reply to Heartbeat.
type HeartbeatResponse struct {
// Heartbeat is kept as raw JSON; this package does not decode it.
Heartbeat json.RawMessage `json:"heartbeat"`
TestingFlags EffectiveTestingFlags `json:"testing_flags"`
}
// EffectiveTestingFlags is the testing-flag set returned with each heartbeat
// response.
type EffectiveTestingFlags struct {
Enabled bool `json:"enabled"`
TelemetryEnabled bool `json:"telemetry_enabled"`
SyntheticLinksEnabled bool `json:"synthetic_links_enabled"`
HistoryRetentionHours int `json:"history_retention_hours"`
AppliedScopes []string `json:"applied_scopes"`
}
// DesiredWorkload is one entry of the "desired_workloads" list returned by
// DesiredWorkloads.
type DesiredWorkload struct {
ServiceType string `json:"service_type"`
DesiredState string `json:"desired_state"`
Version string `json:"version,omitempty"`
RuntimeMode string `json:"runtime_mode"`
ArtifactRef string `json:"artifact_ref,omitempty"`
Config map[string]any `json:"config"`
Environment map[string]any `json:"environment"`
}
// WorkloadStatusRequest is the JSON body posted by ReportWorkloadStatus.
type WorkloadStatusRequest struct {
ReportedState string `json:"reported_state"`
RuntimeMode string `json:"runtime_mode"`
Version string `json:"version,omitempty"`
StatusPayload map[string]any `json:"status_payload"`
}
// MeshLinkObservationRequest is the JSON body posted by ReportMeshLink.
type MeshLinkObservationRequest struct {
SourceNodeID string `json:"source_node_id"`
TargetNodeID string `json:"target_node_id"`
LinkStatus string `json:"link_status"`
// LatencyMs and QualityScore are pointers so that nil omits them from the JSON.
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
Metadata map[string]any `json:"metadata"`
}
// TelemetryRequest is the JSON body posted by ReportTelemetry. Every metric
// is a pointer so that nil omits it from the JSON.
type TelemetryRequest struct {
CPUPercent *float64 `json:"cpu_percent,omitempty"`
MemoryUsedBytes *int64 `json:"memory_used_bytes,omitempty"`
MemoryTotalBytes *int64 `json:"memory_total_bytes,omitempty"`
DiskUsedBytes *int64 `json:"disk_used_bytes,omitempty"`
DiskTotalBytes *int64 `json:"disk_total_bytes,omitempty"`
NetworkRxBytes *int64 `json:"network_rx_bytes,omitempty"`
NetworkTxBytes *int64 `json:"network_tx_bytes,omitempty"`
ProcessCount *int `json:"process_count,omitempty"`
Payload map[string]any `json:"payload"`
ObservedAt time.Time `json:"observed_at"`
}
// SyntheticMeshRouteConfig is one entry of SyntheticMeshConfig.Routes.
type SyntheticMeshRouteConfig struct {
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
// NOTE(review): presumably the ordered node IDs from source to destination; confirm.
Hops []string `json:"hops"`
AllowedChannels []string `json:"allowed_channels"`
ExpiresAt time.Time `json:"expires_at"`
MaxTTL int `json:"max_ttl"`
MaxHops int `json:"max_hops"`
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticMeshConfig is the node-scoped synthetic mesh configuration
// returned by Client.SyntheticMeshConfig under the "synthetic_mesh_config"
// key.
type SyntheticMeshConfig struct {
Enabled bool `json:"enabled"`
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
AuthorityRequired bool `json:"authority_required"`
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
// AuthorityPayload is kept as raw JSON.
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
ConfigVersion string `json:"config_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
// NOTE(review): presumably keyed by peer node ID; confirm.
PeerEndpoints map[string]string `json:"peer_endpoints"`
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"`
PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"`
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"`
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"`
RendezvousRelayPolicy *RendezvousRelayPolicyReport `json:"rendezvous_relay_policy,omitempty"`
RoutePathDecisions *RoutePathDecisionReport `json:"route_path_decisions,omitempty"`
Routes []SyntheticMeshRouteConfig `json:"routes"`
ProductionForwarding bool `json:"production_forwarding"`
}
// ClusterAuthorityDescriptor describes a cluster authority key as delivered
// in bootstrap and synthetic mesh config responses.
type ClusterAuthorityDescriptor struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
AuthorityState string `json:"authority_state"`
KeyAlgorithm string `json:"key_algorithm"`
PublicKey string `json:"public_key"`
PublicKeyFingerprint string `json:"public_key_fingerprint"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// ClusterSignature is the wire form of a cluster authority signature,
// including the time it was signed.
type ClusterSignature struct {
SchemaVersion string `json:"schema_version"`
Algorithm string `json:"algorithm"`
KeyFingerprint string `json:"key_fingerprint"`
Signature string `json:"signature"`
SignedAt time.Time `json:"signed_at"`
}
// PeerDirectoryEntry is one entry of SyntheticMeshConfig.PeerDirectory,
// summarising a peer node.
type PeerDirectoryEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
}
// PeerRecoverySeed is one entry of SyntheticMeshConfig.RecoverySeeds.
type PeerRecoverySeed struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
// Metadata is kept as raw JSON; this package does not decode it.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerRendezvousLease is one entry of SyntheticMeshConfig.RendezvousLeases:
// a time-bounded lease naming a relay node and endpoint for a peer.
type PeerRendezvousLease struct {
LeaseID string `json:"lease_id"`
PeerNodeID string `json:"peer_node_id"`
RelayNodeID string `json:"relay_node_id"`
RelayEndpoint string `json:"relay_endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
RouteIDs []string `json:"route_ids,omitempty"`
AllowedChannels []string `json:"allowed_channels,omitempty"`
Priority int `json:"priority"`
ControlPlaneOnly bool `json:"control_plane_only"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
Reason string `json:"reason,omitempty"`
// Metadata is kept as raw JSON; this package does not decode it.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// RendezvousRelayPolicyDecision is one entry of
// RendezvousRelayPolicyReport.Decisions.
type RendezvousRelayPolicyDecision struct {
RouteID string `json:"route_id,omitempty"`
PeerNodeID string `json:"peer_node_id"`
WithdrawnLeaseID string `json:"withdrawn_lease_id,omitempty"`
StaleRelayNodeID string `json:"stale_relay_node_id,omitempty"`
SelectedRelayID string `json:"selected_relay_id,omitempty"`
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
Score int `json:"score,omitempty"`
Reason string `json:"reason"`
ScoreReasons []string `json:"score_reasons,omitempty"`
ReporterNodeID string `json:"reporter_node_id,omitempty"`
}
// RendezvousRelayPolicyReport is the optional "rendezvous_relay_policy"
// section of SyntheticMeshConfig: summary counters plus per-peer decisions.
type RendezvousRelayPolicyReport struct {
SchemaVersion string `json:"schema_version"`
ScoringMode string `json:"scoring_mode"`
FeedbackMaxAgeSeconds int `json:"feedback_max_age_seconds"`
StaleRelayCount int `json:"stale_relay_count"`
WithdrawnLeaseCount int `json:"withdrawn_lease_count"`
ReplacementLeaseCount int `json:"replacement_lease_count"`
Decisions []RendezvousRelayPolicyDecision `json:"decisions,omitempty"`
}
// RoutePathDecision is one entry of RoutePathDecisionReport.Decisions. It
// carries both the original and the effective hop list for a route, together
// with the relay and lease details attached to the decision.
type RoutePathDecision struct {
DecisionID string `json:"decision_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
OriginalHops []string `json:"original_hops"`
EffectiveHops []string `json:"effective_hops"`
PreviousHopID string `json:"previous_hop_id,omitempty"`
NextHopID string `json:"next_hop_id,omitempty"`
LocalRole string `json:"local_role"`
SelectedRelayID string `json:"selected_relay_id,omitempty"`
SelectedRelayEndpoint string `json:"selected_relay_endpoint,omitempty"`
StaleRelayNodeID string `json:"stale_relay_node_id,omitempty"`
RendezvousPeerNodeID string `json:"rendezvous_peer_node_id,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RendezvousLeaseReason string `json:"rendezvous_lease_reason,omitempty"`
DecisionSource string `json:"decision_source"`
Generation string `json:"generation"`
PathScore int `json:"path_score,omitempty"`
ScoreReasons []string `json:"score_reasons,omitempty"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
ExpiresAt time.Time `json:"expires_at"`
}
// RoutePathDecisionReport is the optional "route_path_decisions" section of
// SyntheticMeshConfig: summary counters plus the individual decisions.
type RoutePathDecisionReport struct {
SchemaVersion string `json:"schema_version"`
DecisionMode string `json:"decision_mode"`
Generation string `json:"generation"`
DecisionCount int `json:"decision_count"`
ReplacementDecisionCount int `json:"replacement_decision_count"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
Decisions []RoutePathDecision `json:"decisions,omitempty"`
}
// PeerEndpointCandidate is one candidate endpoint for a peer, as listed in
// SyntheticMeshConfig.PeerEndpointCandidates.
type PeerEndpointCandidate struct {
EndpointID string `json:"endpoint_id"`
NodeID string `json:"node_id"`
Transport string `json:"transport"`
Address string `json:"address"`
AddressFamily string `json:"address_family,omitempty"`
Reachability string `json:"reachability"`
NATType string `json:"nat_type,omitempty"`
ConnectivityMode string `json:"connectivity_mode"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
PolicyTags []string `json:"policy_tags,omitempty"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
// Metadata is kept as raw JSON; this package does not decode it.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// New returns a Client that sends requests to baseURL using an HTTP client
// with a 15-second overall timeout. baseURL is used verbatim as the prefix of
// every request path.
func New(baseURL string) *Client {
	transport := &http.Client{Timeout: 15 * time.Second}
	client := &Client{baseURL: baseURL, httpClient: transport}
	return client
}
// Enroll posts request to /node-agents/enroll and returns the decoded reply.
// On any failure the zero EnrollResponse is returned with the error.
func (c *Client) Enroll(ctx context.Context, request EnrollRequest) (EnrollResponse, error) {
	var decoded EnrollResponse
	err := c.postJSON(ctx, "/node-agents/enroll", request, &decoded)
	if err != nil {
		return EnrollResponse{}, err
	}
	return decoded, nil
}
// BootstrapEnrollment posts request to
// /node-agents/enrollments/{joinRequestID}/bootstrap and returns the decoded
// reply. On any failure the zero response is returned with the error.
//
// joinRequestID is path-escaped so that an ID containing '/', '?' or '#'
// cannot change the request target.
func (c *Client) BootstrapEnrollment(ctx context.Context, joinRequestID string, request EnrollmentBootstrapRequest) (EnrollmentBootstrapResponse, error) {
	var response EnrollmentBootstrapResponse
	path := fmt.Sprintf("/node-agents/enrollments/%s/bootstrap", url.PathEscape(joinRequestID))
	if err := c.postJSON(ctx, path, request, &response); err != nil {
		return EnrollmentBootstrapResponse{}, err
	}
	return response, nil
}
// Heartbeat posts request to /clusters/{clusterID}/nodes/{nodeID}/heartbeats
// and returns the decoded reply. On any failure the zero response is returned
// with the error.
//
// clusterID and nodeID are path-escaped so that a value containing '/', '?'
// or '#' cannot change the request target.
func (c *Client) Heartbeat(ctx context.Context, clusterID, nodeID string, request HeartbeatRequest) (HeartbeatResponse, error) {
	var response HeartbeatResponse
	path := fmt.Sprintf("/clusters/%s/nodes/%s/heartbeats", url.PathEscape(clusterID), url.PathEscape(nodeID))
	if err := c.postJSON(ctx, path, request, &response); err != nil {
		return HeartbeatResponse{}, err
	}
	return response, nil
}
// DesiredWorkloads fetches /clusters/{clusterID}/nodes/{nodeID}/workloads/desired
// and returns the "desired_workloads" list from the reply.
//
// clusterID and nodeID are path-escaped so that a value containing '/', '?'
// or '#' cannot change the request target.
func (c *Client) DesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]DesiredWorkload, error) {
	var response struct {
		DesiredWorkloads []DesiredWorkload `json:"desired_workloads"`
	}
	path := fmt.Sprintf("/clusters/%s/nodes/%s/workloads/desired", url.PathEscape(clusterID), url.PathEscape(nodeID))
	if err := c.getJSON(ctx, path, &response); err != nil {
		return nil, err
	}
	return response.DesiredWorkloads, nil
}
// ReportWorkloadStatus posts request to
// /clusters/{clusterID}/nodes/{nodeID}/workloads/{serviceType}/status. The
// response body is not decoded.
//
// All three identifiers are path-escaped so that a value containing '/', '?'
// or '#' cannot change the request target.
func (c *Client) ReportWorkloadStatus(ctx context.Context, clusterID, nodeID, serviceType string, request WorkloadStatusRequest) error {
	path := fmt.Sprintf(
		"/clusters/%s/nodes/%s/workloads/%s/status",
		url.PathEscape(clusterID), url.PathEscape(nodeID), url.PathEscape(serviceType),
	)
	return c.postJSON(ctx, path, request, nil)
}
// ReportMeshLink posts request to /clusters/{clusterID}/mesh/links. The
// response body is not decoded.
//
// clusterID is path-escaped so that a value containing '/', '?' or '#' cannot
// change the request target.
func (c *Client) ReportMeshLink(ctx context.Context, clusterID string, request MeshLinkObservationRequest) error {
	path := fmt.Sprintf("/clusters/%s/mesh/links", url.PathEscape(clusterID))
	return c.postJSON(ctx, path, request, nil)
}
// ReportTelemetry posts request to
// /clusters/{clusterID}/nodes/{nodeID}/telemetry. The response body is not
// decoded.
//
// clusterID and nodeID are path-escaped so that a value containing '/', '?'
// or '#' cannot change the request target.
func (c *Client) ReportTelemetry(ctx context.Context, clusterID, nodeID string, request TelemetryRequest) error {
	path := fmt.Sprintf("/clusters/%s/nodes/%s/telemetry", url.PathEscape(clusterID), url.PathEscape(nodeID))
	return c.postJSON(ctx, path, request, nil)
}
// SyntheticMeshConfig fetches
// /clusters/{clusterID}/nodes/{nodeID}/mesh/synthetic-config and returns the
// "synthetic_mesh_config" object from the reply. On any failure the zero
// config is returned with the error.
//
// clusterID and nodeID are path-escaped so that a value containing '/', '?'
// or '#' cannot change the request target.
func (c *Client) SyntheticMeshConfig(ctx context.Context, clusterID, nodeID string) (SyntheticMeshConfig, error) {
	var response struct {
		Config SyntheticMeshConfig `json:"synthetic_mesh_config"`
	}
	path := fmt.Sprintf("/clusters/%s/nodes/%s/mesh/synthetic-config", url.PathEscape(clusterID), url.PathEscape(nodeID))
	if err := c.getJSON(ctx, path, &response); err != nil {
		return SyntheticMeshConfig{}, err
	}
	return response.Config, nil
}
// getJSON issues a GET for baseURL+path and, when response is non-nil,
// decodes the JSON body into it. Any status outside 200-299 is returned as an
// error carrying the status code.
func (c *Client) getJSON(ctx context.Context, path string, response any) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return err
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < 200 || code >= 300 {
		return fmt.Errorf("backend returned status %d", code)
	}
	if response != nil {
		return json.NewDecoder(resp.Body).Decode(response)
	}
	return nil
}
// postJSON marshals request as JSON, POSTs it to baseURL+path with a JSON
// content type and, when response is non-nil, decodes the JSON reply into it.
// Any status outside 200-299 is returned as an error carrying the status
// code.
func (c *Client) postJSON(ctx context.Context, path string, request any, response any) error {
	body, err := json.Marshal(request)
	if err != nil {
		return err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(body))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < 200 || code >= 300 {
		return fmt.Errorf("backend returned status %d", code)
	}
	if response != nil {
		return json.NewDecoder(resp.Body).Decode(response)
	}
	return nil
}
@@ -0,0 +1,183 @@
package config
import (
"errors"
"flag"
"os"
"path/filepath"
"strconv"
"strings"
"time"
)
// MaxMeshProductionObservationSinkCapacity is presumably the upper bound for
// Config.MeshProductionObservationSinkCapacity; the enforcing code is not
// visible here — TODO confirm.
const MaxMeshProductionObservationSinkCapacity = 10000
// Config holds the node-agent settings resolved by Load from command-line
// flags, with environment variables supplying the flag defaults.
type Config struct {
// BackendURL is the backend API base URL.
BackendURL string
ClusterID string
// ClusterAuthorityPublicKey is the pinned cluster authority Ed25519 public key.
ClusterAuthorityPublicKey string
// ClusterAuthorityFingerprint is the pinned cluster authority key fingerprint.
ClusterAuthorityFingerprint string
// JoinToken is the short-lived node join token.
JoinToken string
// NodeName is the node display name.
NodeName string
// StateDir is the local node-agent state directory.
StateDir string
// WorkloadSupervisionEnabled turns on desired workload polling and status
// reporting; it defaults to false.
WorkloadSupervisionEnabled bool
HeartbeatInterval time.Duration
EnrollmentPollInterval time.Duration
EnrollmentPollTimeout time.Duration
// MeshSyntheticRuntimeEnabled turns on the synthetic fabric probe runtime;
// it defaults to false.
MeshSyntheticRuntimeEnabled bool
// MeshProductionForwardingEnabled turns on the production fabric-control
// forwarding gate; it defaults to false.
MeshProductionForwardingEnabled bool
// MeshProductionObservationSinkCapacity sizes the local observation sink;
// 0 disables it.
MeshProductionObservationSinkCapacity int
MeshListenAddr string
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
MeshRegion string
MeshSyntheticConfigPath string
MeshPeerEndpointsJSON string
MeshSyntheticRoutesJSON string
}
// Load builds a Config from command-line arguments and environment variables.
//
// Every setting is seeded from env (or, when env is nil, from the process
// environment) and may then be overridden by a flag in args. String values are
// whitespace-trimmed; BackendURL and MeshAdvertiseEndpoint additionally lose
// trailing slashes.
//
// A zero Config and an error are returned when flag parsing fails, when the
// backend URL, node name or state dir is empty, when the heartbeat or
// enrollment poll interval is not positive, when the enrollment poll timeout
// is negative, or when the observation sink capacity lies outside
// [0, MaxMeshProductionObservationSinkCapacity].
func Load(args []string, env map[string]string) (Config, error) {
	if env == nil {
		env = readEnv()
	}

	var cfg Config
	flags := flag.NewFlagSet("rap-node-agent", flag.ContinueOnError)

	// Small registration helpers: each flag defaults to its environment value.
	stringFlag := func(dst *string, name, envKey, fallback, usage string) {
		flags.StringVar(dst, name, getEnv(env, envKey, fallback), usage)
	}
	boolFlag := func(dst *bool, name, envKey, usage string) {
		flags.BoolVar(dst, name, getEnvBool(env, envKey, false), usage)
	}
	secondsFlag := func(dst *time.Duration, name, envKey string, fallbackSeconds int, usage string) {
		flags.DurationVar(dst, name, time.Duration(getEnvInt(env, envKey, fallbackSeconds))*time.Second, usage)
	}

	stringFlag(&cfg.BackendURL, "backend-url", "RAP_BACKEND_URL", "http://127.0.0.1:8080/api/v1", "Backend API base URL.")
	stringFlag(&cfg.ClusterID, "cluster-id", "RAP_CLUSTER_ID", "", "Cluster ID.")
	stringFlag(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", "", "Pinned cluster authority Ed25519 public key.")
	stringFlag(&cfg.ClusterAuthorityFingerprint, "cluster-authority-fingerprint", "RAP_CLUSTER_AUTHORITY_FINGERPRINT", "", "Pinned cluster authority key fingerprint.")
	stringFlag(&cfg.JoinToken, "join-token", "RAP_JOIN_TOKEN", "", "Short-lived node join token.")
	stringFlag(&cfg.NodeName, "node-name", "RAP_NODE_NAME", hostnameOrDefault(), "Node display name.")
	stringFlag(&cfg.StateDir, "state-dir", "RAP_NODE_STATE_DIR", filepath.Join(".", ".rap-node-agent"), "Local node-agent state directory.")

	boolFlag(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", "RAP_WORKLOAD_SUPERVISION_ENABLED", "Enable desired workload polling and status reporting. Disabled by default while service runtime is not implemented.")
	boolFlag(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", "Enable C17A synthetic fabric probe runtime. Disabled by default.")
	boolFlag(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")

	// The sink capacity keeps negative env values so validation can reject them.
	flags.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")

	stringFlag(&cfg.MeshListenAddr, "mesh-listen-addr", "RAP_MESH_LISTEN_ADDR", "", "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
	stringFlag(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", "RAP_MESH_ADVERTISE_ENDPOINT", "", "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
	stringFlag(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", "", "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
	stringFlag(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls", "Transport label for the advertised mesh endpoint.")
	stringFlag(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", "RAP_MESH_CONNECTIVITY_MODE", "direct", "Connectivity mode reported with the advertised mesh endpoint.")
	stringFlag(&cfg.MeshNATType, "mesh-nat-type", "RAP_MESH_NAT_TYPE", "unknown", "NAT type hint reported with the advertised mesh endpoint.")
	stringFlag(&cfg.MeshRegion, "mesh-region", "RAP_MESH_REGION", "", "Optional region/site hint for the advertised mesh endpoint.")
	stringFlag(&cfg.MeshSyntheticConfigPath, "mesh-synthetic-config", "RAP_MESH_SYNTHETIC_CONFIG", "", "Path to scoped synthetic mesh config snapshot. Preferred over debug JSON env.")
	stringFlag(&cfg.MeshPeerEndpointsJSON, "mesh-peer-endpoints-json", "RAP_MESH_PEER_ENDPOINTS_JSON", "", "JSON object mapping peer node_id to synthetic mesh endpoint URL.")
	stringFlag(&cfg.MeshSyntheticRoutesJSON, "mesh-synthetic-routes-json", "RAP_MESH_SYNTHETIC_ROUTES_JSON", "", "JSON array of synthetic mesh routes for test-only runtime.")

	secondsFlag(&cfg.HeartbeatInterval, "heartbeat-interval", "RAP_HEARTBEAT_INTERVAL_SECONDS", 15, "Heartbeat interval.")
	secondsFlag(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5, "Enrollment approval polling interval.")
	secondsFlag(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600, "Enrollment approval polling timeout.")

	if err := flags.Parse(args); err != nil {
		return Config{}, err
	}

	// Normalise: URL-like values also drop trailing slashes, everything else is
	// only whitespace-trimmed.
	cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
	cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
	for _, field := range []*string{
		&cfg.ClusterID,
		&cfg.ClusterAuthorityPublicKey,
		&cfg.ClusterAuthorityFingerprint,
		&cfg.JoinToken,
		&cfg.NodeName,
		&cfg.StateDir,
		&cfg.MeshListenAddr,
		&cfg.MeshAdvertiseEndpointsJSON,
		&cfg.MeshAdvertiseTransport,
		&cfg.MeshConnectivityMode,
		&cfg.MeshNATType,
		&cfg.MeshRegion,
		&cfg.MeshSyntheticConfigPath,
		&cfg.MeshPeerEndpointsJSON,
		&cfg.MeshSyntheticRoutesJSON,
	} {
		*field = strings.TrimSpace(*field)
	}

	// Validation: the first failing rule wins.
	switch {
	case cfg.BackendURL == "":
		return Config{}, errors.New("backend URL is required")
	case cfg.NodeName == "":
		return Config{}, errors.New("node name is required")
	case cfg.StateDir == "":
		return Config{}, errors.New("state dir is required")
	case cfg.HeartbeatInterval <= 0:
		return Config{}, errors.New("heartbeat interval must be positive")
	case cfg.EnrollmentPollInterval <= 0:
		return Config{}, errors.New("enrollment poll interval must be positive")
	case cfg.EnrollmentPollTimeout < 0:
		return Config{}, errors.New("enrollment poll timeout must not be negative")
	case cfg.MeshProductionObservationSinkCapacity < 0:
		return Config{}, errors.New("mesh production observation sink capacity must not be negative")
	case cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity:
		return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
	}
	return cfg, nil
}
// readEnv snapshots the process environment into a map keyed by variable name.
// Each entry is split at its first "="; entries without "=" are skipped.
func readEnv() map[string]string {
	snapshot := map[string]string{}
	for _, entry := range os.Environ() {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			continue
		}
		snapshot[entry[:sep]] = entry[sep+1:]
	}
	return snapshot
}
// getEnv returns the whitespace-trimmed value stored under key in env, or
// fallback when the key is absent or its value is blank.
func getEnv(env map[string]string, key, fallback string) string {
	trimmed := strings.TrimSpace(env[key])
	if trimmed == "" {
		return fallback
	}
	return trimmed
}
// getEnvInt returns the value stored under key in env parsed as a strictly
// positive integer. A missing, blank, unparsable, zero or negative value
// yields fallback.
func getEnvInt(env map[string]string, key string, fallback int) int {
	// strconv.Atoi rejects the empty string, so blank values need no special case.
	if n, err := strconv.Atoi(strings.TrimSpace(env[key])); err == nil && n > 0 {
		return n
	}
	return fallback
}
// getEnvSignedInt returns the value stored under key in env parsed as an
// integer of any sign. A missing, blank or unparsable value yields fallback.
func getEnvSignedInt(env map[string]string, key string, fallback int) int {
	// strconv.Atoi rejects the empty string, so blank values need no special case.
	n, err := strconv.Atoi(strings.TrimSpace(env[key]))
	if err != nil {
		return fallback
	}
	return n
}
// getEnvBool interprets the value stored under key in env as a boolean,
// ignoring case and surrounding whitespace. "1", "true", "yes", "y" and "on"
// mean true; "0", "false", "no", "n" and "off" mean false; anything else,
// including a missing key, yields fallback.
func getEnvBool(env map[string]string, key string, fallback bool) bool {
	result := fallback
	switch strings.ToLower(strings.TrimSpace(env[key])) {
	case "1", "true", "yes", "y", "on":
		result = true
	case "0", "false", "no", "n", "off":
		result = false
	}
	return result
}
// hostnameOrDefault returns the host name reported by the operating system,
// or "rap-node" when the lookup fails or the reported name is blank.
func hostnameOrDefault() string {
	if name, err := os.Hostname(); err == nil && strings.TrimSpace(name) != "" {
		return name
	}
	return "rap-node"
}
@@ -0,0 +1,104 @@
package config
import (
"testing"
"time"
)
// TestLoadConfigFromEnvAndArgs loads a configuration in which every supported
// environment variable is set and one value (the node name) is also passed as
// a flag. It checks that the flag wins over the environment, that trailing
// slashes are stripped from the URL-like values, and that the remaining values
// reach the matching Config fields.
func TestLoadConfigFromEnvAndArgs(t *testing.T) {
cfg, err := Load([]string{"-node-name", "node-b"}, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1/",
"RAP_CLUSTER_ID": "cluster-1",
"RAP_CLUSTER_AUTHORITY_PUBLIC_KEY": "public-key-b64",
"RAP_CLUSTER_AUTHORITY_FINGERPRINT": "rap-ca-ed25519-test",
"RAP_JOIN_TOKEN": "join-token",
"RAP_NODE_NAME": "node-a",
"RAP_NODE_STATE_DIR": "/tmp/rap-node",
"RAP_WORKLOAD_SUPERVISION_ENABLED": "true",
"RAP_HEARTBEAT_INTERVAL_SECONDS": "7",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS": "3",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
"RAP_MESH_NAT_TYPE": "symmetric",
"RAP_MESH_REGION": "eu",
"RAP_MESH_SYNTHETIC_CONFIG": "/tmp/rap-node/mesh-synthetic.json",
"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"http://127.0.0.1:19002"}`,
"RAP_MESH_SYNTHETIC_ROUTES_JSON": `[{"route_id":"route-1"}]`,
})
if err != nil {
t.Fatalf("load config: %v", err)
}
// The trailing slash of RAP_BACKEND_URL must be removed.
if cfg.BackendURL != "http://backend/api/v1" {
t.Fatalf("BackendURL = %q", cfg.BackendURL)
}
// The -node-name flag overrides RAP_NODE_NAME.
if cfg.NodeName != "node-b" {
t.Fatalf("NodeName = %q", cfg.NodeName)
}
if cfg.ClusterAuthorityPublicKey != "public-key-b64" || cfg.ClusterAuthorityFingerprint != "rap-ca-ed25519-test" {
t.Fatalf("unexpected cluster authority pin config: %+v", cfg)
}
// Interval environment values are expressed in whole seconds.
if cfg.HeartbeatInterval != 7*time.Second {
t.Fatalf("HeartbeatInterval = %s", cfg.HeartbeatInterval)
}
if cfg.EnrollmentPollInterval != 3*time.Second || cfg.EnrollmentPollTimeout != 30*time.Second {
t.Fatalf("unexpected enrollment polling config: %+v", cfg)
}
if !cfg.WorkloadSupervisionEnabled {
t.Fatal("WorkloadSupervisionEnabled = false, want true")
}
if !cfg.MeshSyntheticRuntimeEnabled {
t.Fatal("MeshSyntheticRuntimeEnabled = false, want true")
}
if !cfg.MeshProductionForwardingEnabled {
t.Fatal("MeshProductionForwardingEnabled = false, want true")
}
if cfg.MeshProductionObservationSinkCapacity != 5 {
t.Fatalf("MeshProductionObservationSinkCapacity = %d, want 5", cfg.MeshProductionObservationSinkCapacity)
}
if cfg.MeshListenAddr != "127.0.0.1:19001" {
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
}
// The advertised endpoint also loses its trailing slash.
if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" ||
cfg.MeshAdvertiseEndpointsJSON == "" ||
cfg.MeshAdvertiseTransport != "wss" ||
cfg.MeshConnectivityMode != "outbound_only" ||
cfg.MeshNATType != "symmetric" ||
cfg.MeshRegion != "eu" {
t.Fatalf("unexpected mesh advertise config: %+v", cfg)
}
if cfg.MeshSyntheticConfigPath != "/tmp/rap-node/mesh-synthetic.json" {
t.Fatalf("MeshSyntheticConfigPath = %q", cfg.MeshSyntheticConfigPath)
}
// The JSON blobs are only checked for presence; Load does not parse them.
if cfg.MeshPeerEndpointsJSON == "" || cfg.MeshSyntheticRoutesJSON == "" {
t.Fatalf("mesh live synthetic config was not loaded: %+v", cfg)
}
}
// TestLoadConfigRejectsNegativeProductionObservationSinkCapacity verifies that
// Load fails when the observation sink capacity from the environment is
// negative.
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
	env := map[string]string{
		"RAP_BACKEND_URL": "http://backend/api/v1",
		"RAP_NODE_NAME":   "node-a",
		"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "-1",
	}
	if _, err := Load(nil, env); err == nil {
		t.Fatal("Load returned nil error for negative sink capacity")
	}
}
// TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity verifies that
// Load fails when the observation sink capacity from the environment is one
// above the allowed maximum of 10000.
func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T) {
	env := map[string]string{
		"RAP_BACKEND_URL": "http://backend/api/v1",
		"RAP_NODE_NAME":   "node-a",
		"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "10001",
	}
	if _, err := Load(nil, env); err == nil {
		t.Fatal("Load returned nil error for too-large sink capacity")
	}
}
@@ -0,0 +1,111 @@
package mesh
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
)
// Client sends mesh messages to a single peer over HTTP.
type Client struct {
// BaseURL is prepended verbatim to each endpoint path (for example
// "/mesh/v1/health"), so it should not end in a slash.
BaseURL string
// HTTPClient performs the requests. When nil, the Send* methods fall back to
// http.DefaultClient.
HTTPClient *http.Client
}
// NewClient returns a Client for the peer at baseURL whose HTTP client has a
// five-second overall request timeout. baseURL is stored as given.
func NewClient(baseURL string) Client {
	client := Client{BaseURL: baseURL}
	client.HTTPClient = &http.Client{Timeout: 5 * time.Second}
	return client
}
// SendHealth posts message to the peer's /mesh/v1/health endpoint and returns
// the decoded acknowledgement.
//
// A zero HealthAck and a non-nil error are returned when the message cannot be
// encoded, the request fails, the peer answers with a non-2xx status, or the
// reply body is not valid JSON.
func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) {
	var ack HealthAck
	if err := c.postMeshJSON(ctx, "/mesh/v1/health", "mesh health", message, &ack); err != nil {
		return HealthAck{}, err
	}
	return ack, nil
}

// SendSynthetic posts envelope to the peer's /mesh/v1/synthetic/probe endpoint
// and returns the envelope the peer replied with.
//
// A zero SyntheticEnvelope and a non-nil error are returned when the envelope
// cannot be encoded, the request fails, the peer answers with a non-2xx
// status, or the reply body is not valid JSON.
func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var ack SyntheticEnvelope
	if err := c.postMeshJSON(ctx, "/mesh/v1/synthetic/probe", "mesh synthetic probe", envelope, &ack); err != nil {
		return SyntheticEnvelope{}, err
	}
	return ack, nil
}

// SendProduction posts envelope to the peer's /mesh/v1/forward endpoint and
// returns the decoded forwarding result.
//
// A zero ProductionForwardResult and a non-nil error are returned when the
// envelope cannot be encoded, the request fails, the peer answers with a
// non-2xx status, or the reply body is not valid JSON.
func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	var result ProductionForwardResult
	if err := c.postMeshJSON(ctx, "/mesh/v1/forward", "mesh production forward", envelope, &result); err != nil {
		return ProductionForwardResult{}, err
	}
	return result, nil
}

// postMeshJSON is the transport shared by the Send* methods. It marshals
// request to JSON, POSTs it to BaseURL+path with a Content-Type of
// application/json, and decodes the JSON reply into response.
//
// subject names the operation in the error produced for a non-2xx reply,
// which reads "<subject> rejected with status <code>". When HTTPClient is nil
// the request is sent with http.DefaultClient.
func (c Client) postMeshJSON(ctx context.Context, path, subject string, request, response any) error {
	payload, err := json.Marshal(request)
	if err != nil {
		return err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+path, bytes.NewReader(payload))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	httpClient := c.HTTPClient
	if httpClient == nil {
		httpClient = http.DefaultClient
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("%s rejected with status %d", subject, resp.StatusCode)
	}
	return json.NewDecoder(resp.Body).Decode(response)
}
@@ -0,0 +1,288 @@
package mesh
import (
"encoding/json"
"errors"
"time"
)
// ProtocolVersion is the version identifier of the mesh control protocol.
// NOTE(review): presumably the value expected in the protocol_version fields of
// mesh messages — confirm against the senders and validators.
const ProtocolVersion = "mesh-control-v1"
// Sentinel errors of the mesh package; compare with errors.Is.
var (
// Peer identity validation (returned by ValidatePeer).
ErrClusterMismatch = errors.New("mesh peer cluster mismatch")
ErrNodeMismatch = errors.New("mesh peer node mismatch")
// Production forwarding.
ErrForwardDisabled = errors.New("production payload forwarding is disabled by mesh production gate")
ErrForwardRuntimeUnavailable = errors.New("production mesh forwarding runtime is unavailable for this route or stage")
ErrForwardPeerUnavailable = errors.New("production mesh next peer is unavailable")
ErrForwardEnvelopeInvalid = errors.New("production mesh envelope is invalid")
ErrForwardObservationFailed = errors.New("production mesh envelope observation failed")
// Synthetic runtime: routing, relay queue and test service.
ErrMeshRuntimeDisabled = errors.New("mesh synthetic runtime is disabled")
ErrUnsupportedSyntheticMessage = errors.New("unsupported synthetic mesh message")
ErrRouteIDRequired = errors.New("mesh synthetic route id is required")
ErrRouteNotFound = errors.New("mesh synthetic route not found")
ErrInvalidRoutePath = errors.New("mesh synthetic route path is invalid")
ErrRouteExpired = errors.New("mesh synthetic route is expired")
ErrTTLExhausted = errors.New("mesh synthetic route ttl exhausted")
ErrLoopDetected = errors.New("mesh synthetic route loop detected")
ErrUnauthorizedChannel = errors.New("mesh synthetic channel is not authorized")
ErrSyntheticPeerUnavailable = errors.New("mesh synthetic next peer is unavailable")
ErrNoHealthySyntheticRoute = errors.New("mesh synthetic no healthy route available")
ErrSyntheticRelayQueueFull = errors.New("mesh synthetic relay queue is full")
ErrSyntheticRelayQueueEmpty = errors.New("mesh synthetic relay queue is empty")
ErrSyntheticPayloadTooLarge = errors.New("mesh synthetic payload is too large")
ErrSyntheticOrganizationMismatch = errors.New("mesh synthetic organization mismatch")
ErrUnsupportedSyntheticService = errors.New("unsupported synthetic test service")
ErrSyntheticRequestInvalid = errors.New("mesh synthetic request is invalid")
)
// Protocol constants for the synthetic and production mesh paths.
const (
// Synthetic message type identifiers.
SyntheticMessageProbe = "fabric.probe"
SyntheticMessageProbeAck = "fabric.probe_ack"
SyntheticMessageRouteHealth = "fabric.route_health"
SyntheticMessageRouteHealthAck = "fabric.route_health_ack"
SyntheticMessageTelemetry = "fabric.telemetry"
SyntheticMessageTestService = "fabric.test_service"
SyntheticMessageTestServiceAck = "fabric.test_service_ack"
// Synthetic test service defaults.
// NOTE(review): the payload limit is presumably enforced by the test service
// handler — the enforcement is not part of this file section.
SyntheticTestServiceType = "synthetic.echo"
SyntheticDefaultTestOrganizationID = "org-test"
SyntheticDefaultMaxTestPayloadBytes = 4096
// Synthetic channel names.
SyntheticChannelFabricControl = "fabric_control"
SyntheticChannelRouteControl = "route_control"
SyntheticChannelTelemetry = "telemetry"
// Synthetic route health states.
SyntheticRouteStateUnknown = "unknown"
SyntheticRouteStateHealthy = "healthy"
SyntheticRouteStateDegraded = "degraded"
SyntheticRouteStateFailed = "failed"
// Production envelope channel, message type and limits.
// NOTE(review): the two Max* values look like validation bounds for
// ProductionEnvelope payload size and timestamp skew — confirm against the
// envelope validator.
ProductionChannelFabricControl = "fabric_control"
ProductionMessageFabricControl = "fabric.control"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionEnvelopeFutureSkew = time.Minute
)
// PeerIdentity names a mesh peer by the cluster it belongs to and its node ID.
type PeerIdentity struct {
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
}
// SyntheticRoute is the JSON description of one synthetic mesh route: its
// endpoints, hop list, permitted channels, expiry and TTL/hop limits, plus
// optional version stamps.
// NOTE(review): how Hops relates to the source and destination node IDs is
// decided by the route validator, which is not part of this file section.
type SyntheticRoute struct {
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
Hops []string `json:"hops"`
AllowedChannels []string `json:"allowed_channels"`
ExpiresAt time.Time `json:"expires_at"`
MaxTTL int `json:"max_ttl"`
MaxHops int `json:"max_hops"`
// Optional version stamps; omitted from JSON when empty.
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticEnvelope is the JSON wire format of a synthetic mesh message. It is
// both the request and the reply type of Client.SendSynthetic.
type SyntheticEnvelope struct {
ProtocolVersion string `json:"protocol_version"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
Channel string `json:"channel"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
Visited []string `json:"visited"`
Sequence uint64 `json:"sequence"`
SentAt time.Time `json:"sent_at"`
// Payload is kept as raw JSON so it can be decoded later according to
// MessageType; it is omitted from the encoding when empty.
Payload json.RawMessage `json:"payload,omitempty"`
}
// SyntheticProbePayload is the JSON body of a synthetic probe: a probe ID and
// the time it was sent.
// NOTE(review): presumably carried in SyntheticEnvelope.Payload for
// "fabric.probe" messages — confirm against the probe sender.
type SyntheticProbePayload struct {
ProbeID string `json:"probe_id"`
SentAt time.Time `json:"sent_at"`
}
// SyntheticProbeAckPayload is the JSON body of a probe acknowledgement: the
// probe ID, a path, and the acceptance time.
// NOTE(review): Path presumably lists the node IDs the probe traversed —
// confirm against the code that fills it.
type SyntheticProbeAckPayload struct {
ProbeID string `json:"probe_id"`
Path []string `json:"path"`
AcceptedAt time.Time `json:"accepted_at"`
}
// SyntheticRouteObservation is the JSON record of a synthetic route's observed
// health: its state, success/failure counters, last latency and the version
// stamps of the route it describes.
type SyntheticRouteObservation struct {
RouteID string `json:"route_id"`
State string `json:"state"`
// NOTE: encoding/json's omitempty never treats a struct value as empty, so a
// zero time.Time in the two fields below is still written to the JSON output.
LastSuccessAt time.Time `json:"last_success_at,omitempty"`
LastFailureAt time.Time `json:"last_failure_at,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
SuccessCount uint64 `json:"success_count"`
FailureCount uint64 `json:"failure_count"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticRouteHealthResult is the JSON result of a route health check: the
// requested and selected route IDs, a fallback flag, the acknowledgement
// envelope and the resulting route observation.
// NOTE(review): FallbackUsed presumably means SelectedRouteID differs from
// RequestedRouteID — confirm against the code that produces this result.
type SyntheticRouteHealthResult struct {
RequestedRouteID string `json:"requested_route_id"`
SelectedRouteID string `json:"selected_route_id"`
FallbackUsed bool `json:"fallback_used"`
Ack SyntheticEnvelope `json:"ack"`
Observation SyntheticRouteObservation `json:"observation"`
}
// SyntheticTestServiceRequest is the JSON request body for the synthetic test
// service: a request ID, organization ID, service type, string payload and
// send time.
type SyntheticTestServiceRequest struct {
RequestID string `json:"request_id"`
OrganizationID string `json:"organization_id"`
ServiceType string `json:"service_type"`
Payload string `json:"payload"`
SentAt time.Time `json:"sent_at"`
}
// SyntheticTestServiceResponse is the JSON response body of the synthetic test
// service.
// NOTE(review): EchoPayload presumably repeats the request payload and Path
// presumably lists the traversed node IDs — confirm against the service
// handler.
type SyntheticTestServiceResponse struct {
RequestID string `json:"request_id"`
OrganizationID string `json:"organization_id"`
ServiceType string `json:"service_type"`
EchoPayload string `json:"echo_payload"`
Path []string `json:"path"`
AcceptedAt time.Time `json:"accepted_at"`
}
// SyntheticTestServiceResult is the JSON result of a synthetic test service
// call: the requested and selected route IDs, a fallback flag, the
// acknowledgement envelope, the decoded service response and the resulting
// route observation.
type SyntheticTestServiceResult struct {
RequestedRouteID string `json:"requested_route_id"`
SelectedRouteID string `json:"selected_route_id"`
FallbackUsed bool `json:"fallback_used"`
Ack SyntheticEnvelope `json:"ack"`
Response SyntheticTestServiceResponse `json:"response"`
Observation SyntheticRouteObservation `json:"observation"`
}
// SyntheticRouteCacheVersion groups the route, policy and peer directory
// version stamps. Each field is omitted from JSON when empty.
type SyntheticRouteCacheVersion struct {
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticRelayQueuePolicy is the JSON relay queue policy for one channel:
// its capacity and whether its entries are droppable.
// NOTE(review): the exact drop behaviour when the queue is full is defined by
// the queue implementation, which is not part of this file section.
type SyntheticRelayQueuePolicy struct {
Channel string `json:"channel"`
Capacity int `json:"capacity"`
Droppable bool `json:"droppable"`
}
// SyntheticRelayEnqueueResult is the JSON outcome of one relay queue enqueue:
// the channel, the queue depth and capacity, whether an entry was dropped, and
// the sequence numbers of the dropped and accepted entries.
type SyntheticRelayEnqueueResult struct {
Channel string `json:"channel"`
QueueDepth int `json:"queue_depth"`
QueueCapacity int `json:"queue_capacity"`
Dropped bool `json:"dropped"`
// DroppedSequence is omitted from JSON when zero.
DroppedSequence uint64 `json:"dropped_sequence,omitempty"`
AcceptedSequence uint64 `json:"accepted_sequence"`
}
// SyntheticRelayQueueMetrics is the JSON counter snapshot of the relay queue:
// enqueued, dequeued, dropped and rejected totals, the last reject reason and
// a map of queue depths.
// NOTE(review): QueueDepths is presumably keyed by channel name — confirm
// against the queue implementation.
type SyntheticRelayQueueMetrics struct {
Enqueued uint64 `json:"enqueued"`
Dequeued uint64 `json:"dequeued"`
Dropped uint64 `json:"dropped"`
Rejected uint64 `json:"rejected"`
LastRejectReason string `json:"last_reject_reason,omitempty"`
QueueDepths map[string]int `json:"queue_depths"`
}
// HealthMessage is the JSON body that Client.SendHealth posts to a peer's
// /mesh/v1/health endpoint.
type HealthMessage struct {
ProtocolVersion string `json:"protocol_version"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
ObservedAt time.Time `json:"observed_at"`
LinkStatus string `json:"link_status"`
// LatencyMs and QualityScore are pointers so that an absent measurement
// (nil, omitted from JSON) is distinct from a measured zero.
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
}
// HealthAck is the JSON reply that Client.SendHealth decodes from a peer: the
// protocol version, an accepted flag and the identity of the answering peer.
type HealthAck struct {
ProtocolVersion string `json:"protocol_version"`
Accepted bool `json:"accepted"`
By PeerIdentity `json:"by"`
}
// ProductionEnvelope is the JSON body that Client.SendProduction posts to a
// peer's /mesh/v1/forward endpoint. It carries routing metadata (route,
// endpoints, current and next hop, path and visited lists), channel and
// message type, TTL/hop counters, creation and expiry times, and the payload
// with its declared length and hash.
// NOTE(review): PayloadLength and PayloadHash presumably describe Payload; the
// hash algorithm and the validation rules are not shown in this file section.
type ProductionEnvelope struct {
FabricProtocolVersion string `json:"fabric_protocol_version"`
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
CurrentHopNodeID string `json:"current_hop_node_id"`
NextHopNodeID string `json:"next_hop_node_id"`
RoutePath []string `json:"route_path,omitempty"`
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
ChannelClass string `json:"channel_class"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
CreatedAt time.Time `json:"created_at"`
ExpiresAt time.Time `json:"expires_at"`
PayloadLength int `json:"payload_length"`
PayloadHash string `json:"payload_hash"`
// Payload is kept as raw JSON and omitted from the encoding when empty.
Payload json.RawMessage `json:"payload,omitempty"`
}
// ProductionEnvelopeObservation is a metadata-only JSON record of a production
// envelope. It repeats the envelope's routing fields and its payload length
// and hash, has no payload field, and adds the observation time.
type ProductionEnvelopeObservation struct {
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
CurrentHopNodeID string `json:"current_hop_node_id"`
NextHopNodeID string `json:"next_hop_node_id"`
RoutePath []string `json:"route_path,omitempty"`
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
ChannelClass string `json:"channel_class"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
PayloadLength int `json:"payload_length"`
PayloadHash string `json:"payload_hash"`
ObservedAt time.Time `json:"observed_at"`
}
// ProductionForwardResult is the JSON reply that Client.SendProduction decodes
// from a peer: accepted/delivered/forwarded flags, the identity of the
// answering peer, and the message and route IDs.
// NOTE(review): NextNodeID is presumably only set when the envelope was
// forwarded onward — confirm against the forward handler.
type ProductionForwardResult struct {
Accepted bool `json:"accepted"`
Delivered bool `json:"delivered"`
Forwarded bool `json:"forwarded"`
By PeerIdentity `json:"by"`
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
NextNodeID string `json:"next_node_id,omitempty"`
}
// ProductionForwardLogEntry is a JSON log record for a production forwarding
// event. Only Event and OccurredAt are always encoded; every other field is
// omitted when it holds its zero value. The entry records lengths and counts
// (route path, visited list, payload) rather than the contents themselves.
type ProductionForwardLogEntry struct {
Event string `json:"event"`
RouteID string `json:"route_id,omitempty"`
MessageID string `json:"message_id,omitempty"`
ClusterID string `json:"cluster_id,omitempty"`
LocalNodeID string `json:"local_node_id,omitempty"`
SourceNodeID string `json:"source_node_id,omitempty"`
DestinationNodeID string `json:"destination_node_id,omitempty"`
CurrentHopNodeID string `json:"current_hop_node_id,omitempty"`
NextHopNodeID string `json:"next_hop_node_id,omitempty"`
ChannelClass string `json:"channel_class,omitempty"`
MessageType string `json:"message_type,omitempty"`
Reason string `json:"reason,omitempty"`
StatusCode int `json:"status_code,omitempty"`
TTL int `json:"ttl,omitempty"`
HopCount int `json:"hop_count,omitempty"`
RoutePathLength int `json:"route_path_length,omitempty"`
VisitedCount int `json:"visited_count,omitempty"`
PayloadLength int `json:"payload_length,omitempty"`
OccurredAt time.Time `json:"occurred_at"`
}
// ValidatePeer checks that remote may talk to local.
//
// It returns ErrClusterMismatch when either cluster ID is empty or the two
// differ, ErrNodeMismatch when remote has no node ID, and nil otherwise. The
// local node ID is not inspected.
func ValidatePeer(local PeerIdentity, remote PeerIdentity) error {
	switch {
	case local.ClusterID == "", remote.ClusterID == "", local.ClusterID != remote.ClusterID:
		return ErrClusterMismatch
	case remote.NodeID == "":
		return ErrNodeMismatch
	default:
		return nil
	}
}
@@ -0,0 +1,258 @@
package mesh
import (
"sort"
"strings"
"time"
)
// EndpointCandidateScoreOptions tunes how RankPeerEndpointCandidates scores
// endpoint candidates. The zero value disables every optional adjustment.
type EndpointCandidateScoreOptions struct {
// ChannelClass enables the control-channel adjustments when it equals
// SyntheticChannelFabricControl or SyntheticChannelRouteControl.
ChannelClass string
// PreferredRegion is compared case-insensitively with the candidate's region;
// the comparison is skipped when either side is empty.
PreferredRegion string
// Now is the reference time for the freshness checks; a zero value disables
// them.
Now time.Time
// MaxVerificationAge is the oldest LastVerifiedAt still counted as fresh;
// values <= 0 disable the verification freshness check.
MaxVerificationAge time.Duration
// Observations holds health observations, looked up by the candidate's
// EndpointID.
Observations map[string]EndpointCandidateHealthObservation
// MaxObservationAge is the oldest ObservedAt still counted as fresh; values
// <= 0 disable the observation freshness check.
MaxObservationAge time.Duration
}
// EndpointCandidateHealthObservation is the JSON health record of one endpoint
// candidate that feeds into candidate scoring: last latency, success and
// failure counters, last failure reason, a reliability score and the
// observation time.
type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
// ReliabilityScore is only scored when positive; scoring treats >= 90 as
// high and >= 70 as moderate.
ReliabilityScore int `json:"reliability_score,omitempty"`
// NOTE: encoding/json's omitempty never treats a struct value as empty, so a
// zero ObservedAt is still written to the JSON output.
ObservedAt time.Time `json:"observed_at,omitempty"`
}
// ScoredPeerEndpointCandidate pairs an endpoint candidate with the score it
// received and the list of reason labels that contributed to that score.
type ScoredPeerEndpointCandidate struct {
Candidate PeerEndpointCandidate `json:"candidate"`
Score int `json:"score"`
Reasons []string `json:"reasons,omitempty"`
}
// RankPeerEndpointCandidates scores every candidate with opts and returns the
// scored candidates ordered best first.
//
// Ordering is by descending score, then ascending Priority, then ascending
// NodeID, then ascending EndpointID; the sort is stable. The result is nil for
// an empty input. The input slice is not modified.
func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts EndpointCandidateScoreOptions) []ScoredPeerEndpointCandidate {
	if len(candidates) == 0 {
		return nil
	}

	ranked := make([]ScoredPeerEndpointCandidate, len(candidates))
	for i := range candidates {
		ranked[i] = scorePeerEndpointCandidate(candidates[i], opts)
	}

	sort.SliceStable(ranked, func(a, b int) bool {
		left, right := &ranked[a], &ranked[b]
		switch {
		case left.Score != right.Score:
			return left.Score > right.Score
		case left.Candidate.Priority != right.Candidate.Priority:
			return left.Candidate.Priority < right.Candidate.Priority
		case left.Candidate.NodeID != right.Candidate.NodeID:
			return left.Candidate.NodeID < right.Candidate.NodeID
		default:
			return left.Candidate.EndpointID < right.Candidate.EndpointID
		}
	})
	return ranked
}
// scorePeerEndpointCandidate computes the ranking score of one candidate.
//
// The score starts at 100 and is adjusted, in this order, for transport,
// reachability, connectivity mode, NAT type, priority (subtracted when
// positive), region preference, policy tags, control-channel suitability,
// verification freshness and any health observation recorded for the
// candidate's EndpointID. Every adjustment appends a label to Reasons, so the
// reasons list mirrors the order in which the score was built.
func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions) ScoredPeerEndpointCandidate {
	total := 100
	why := []string{"base"}
	// adjust applies one score delta and records the reason for it.
	adjust := func(delta int, reason string) {
		total += delta
		why = append(why, reason)
	}

	switch candidate.Transport {
	case "direct_tcp_tls":
		adjust(35, "transport:direct_tcp_tls")
	case "wss":
		adjust(25, "transport:wss")
	case "outbound_reverse":
		adjust(10, "transport:outbound_reverse")
	case "relay":
		adjust(5, "transport:relay")
	default:
		// An unrecognised transport cancels the whole base score.
		adjust(-100, "transport:unknown")
	}

	switch candidate.Reachability {
	case "public":
		adjust(30, "reachability:public")
	case "private":
		adjust(15, "reachability:private")
	case "relay":
		adjust(5, "reachability:relay")
	case "outbound_only":
		adjust(-5, "reachability:outbound_only")
	default:
		adjust(-15, "reachability:unknown")
	}

	switch candidate.ConnectivityMode {
	case "direct":
		adjust(30, "connectivity:direct")
	case "outbound_only":
		adjust(5, "connectivity:outbound_only")
	case "relay_required":
		adjust(-5, "connectivity:relay_required")
	default:
		adjust(-10, "connectivity:unknown")
	}

	switch candidate.NATType {
	case "", "none":
		// An unspecified NAT type is treated like "none".
		adjust(15, "nat:none")
	case "full_cone":
		adjust(10, "nat:full_cone")
	case "restricted", "port_restricted":
		adjust(3, "nat:restricted")
	case "symmetric":
		adjust(-20, "nat:symmetric")
	case "blocked":
		adjust(-60, "nat:blocked")
	default:
		adjust(-8, "nat:unknown")
	}

	// Lower priority numbers rank higher: a positive priority is a penalty.
	if candidate.Priority > 0 {
		adjust(-candidate.Priority, "priority")
	}

	if opts.PreferredRegion != "" && candidate.Region != "" {
		if strings.EqualFold(candidate.Region, opts.PreferredRegion) {
			adjust(12, "region:preferred")
		} else {
			adjust(-4, "region:remote")
		}
	}

	if hasPolicyTag(candidate.PolicyTags, "fast-path") {
		adjust(10, "policy:fast-path")
	}
	// Any one of the LAN-style tags earns the bonus once.
	for _, lanTag := range [...]string{"private-lan", "corp-lan", "same-site"} {
		if hasPolicyTag(candidate.PolicyTags, lanTag) {
			adjust(18, "policy:private-lan")
			break
		}
	}
	if hasPolicyTag(candidate.PolicyTags, "costly") {
		adjust(-10, "policy:costly")
	}

	controlChannel := opts.ChannelClass == SyntheticChannelFabricControl || opts.ChannelClass == SyntheticChannelRouteControl
	if controlChannel {
		if candidate.ConnectivityMode == "direct" {
			adjust(8, "channel:control-direct")
		}
		if candidate.Transport == "relay" {
			adjust(-8, "channel:control-relay-penalty")
		}
	}

	if !opts.Now.IsZero() && candidate.LastVerifiedAt != nil && opts.MaxVerificationAge > 0 {
		// A verification time in the future counts as stale.
		if age := opts.Now.Sub(candidate.LastVerifiedAt.UTC()); age >= 0 && age <= opts.MaxVerificationAge {
			adjust(8, "verified:fresh")
		} else {
			adjust(-12, "verified:stale")
		}
	}

	if observation, found := opts.Observations[candidate.EndpointID]; found {
		delta, observationReasons := scoreEndpointCandidateObservation(observation, opts)
		total += delta
		why = append(why, observationReasons...)
	}

	return ScoredPeerEndpointCandidate{Candidate: candidate, Score: total, Reasons: why}
}
// scoreEndpointCandidateObservation converts one health observation into a
// score delta and the reason labels that explain it.
//
// When opts.Now, observation.ObservedAt and opts.MaxObservationAge are all
// set, an observation that is older than MaxObservationAge or dated in the
// future is rejected outright as (-12, ["observation:stale"]); otherwise it
// earns a freshness bonus. The remaining adjustments cover latency,
// reliability score, success/failure history and a recorded failure reason.
//
// Latency is scored only when a measurement is present (LastLatencyMs > 0): a
// zero or negative value means "not measured" and must not earn the
// moderate-latency bonus. The success and failure counters are clamped while
// still unsigned, so extreme values cannot overflow the int conversion and
// invert the bonus or penalty.
func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
	score := 0
	reasons := []string{"observation:present"}

	if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 {
		age := opts.Now.Sub(observation.ObservedAt.UTC())
		if age < 0 || age > opts.MaxObservationAge {
			// A stale observation is not trusted for any other signal.
			return -12, []string{"observation:stale"}
		}
		score += 6
		reasons = append(reasons, "observation:fresh")
	}

	if latency := observation.LastLatencyMs; latency > 0 {
		switch {
		case latency <= 50:
			score += 18
			reasons = append(reasons, "latency:low")
		case latency <= 150:
			score += 8
			reasons = append(reasons, "latency:moderate")
		default:
			score -= 10
			reasons = append(reasons, "latency:high")
		}
	}

	if observation.ReliabilityScore > 0 {
		switch {
		case observation.ReliabilityScore >= 90:
			score += 15
			reasons = append(reasons, "reliability:high")
		case observation.ReliabilityScore >= 70:
			score += 5
			reasons = append(reasons, "reliability:moderate")
		default:
			score -= 12
			reasons = append(reasons, "reliability:low")
		}
	}

	if successes := observation.SuccessCount; successes > 0 {
		// Clamp before converting: int(uint64) wraps negative for huge counters.
		if successes > 10 {
			successes = 10
		}
		score += boundedInt(int(successes), 1, 10)
		reasons = append(reasons, "history:success")
	}
	if failures := observation.FailureCount; failures > 0 {
		// Five failures already reach the 30-point cap (5 * 6).
		if failures > 5 {
			failures = 5
		}
		score -= boundedInt(int(failures)*6, 6, 30)
		reasons = append(reasons, "history:failure")
	}

	if strings.TrimSpace(observation.LastFailureReason) != "" {
		score -= 8
		reasons = append(reasons, "failure:recent")
	}
	return score, reasons
}
// hasPolicyTag reports whether tags contains needle. Each tag is trimmed of
// surrounding whitespace and compared case-insensitively; needle itself is
// used as given.
func hasPolicyTag(tags []string, needle string) bool {
	for i := 0; i < len(tags); i++ {
		if trimmed := strings.TrimSpace(tags[i]); strings.EqualFold(trimmed, needle) {
			return true
		}
	}
	return false
}
// boundedInt clamps value into [minValue, maxValue]. The lower bound is
// checked first, so it wins if the bounds are passed in inverted order.
func boundedInt(value, minValue, maxValue int) int {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	default:
		return value
	}
}
@@ -0,0 +1,278 @@
package mesh
import (
"testing"
"time"
)
// TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath ranks a freshly
// verified relay candidate, a freshly verified public direct candidate tagged
// "fast-path", and a private candidate whose verification is two hours old
// (beyond the one-hour MaxVerificationAge). It expects the public candidate
// to come first with a strictly higher score than the runner-up and to carry
// the "policy:fast-path" and "verified:fresh" reasons.
func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
fresh := now.Add(-time.Minute)
stale := now.Add(-2 * time.Hour)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Address: "relay.example.test/node-b",
Reachability: "relay",
NATType: "symmetric",
ConnectivityMode: "relay_required",
Region: "us",
Priority: 1,
LastVerifiedAt: &fresh,
},
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Region: "eu",
Priority: 10,
PolicyTags: []string{"fast-path"},
LastVerifiedAt: &fresh,
},
{
EndpointID: "node-b-private-stale",
NodeID: "node-b",
Transport: "wss",
Address: "10.0.0.5:443",
Reachability: "private",
NATType: "restricted",
ConnectivityMode: "direct",
Region: "eu",
Priority: 5,
LastVerifiedAt: &stale,
},
}
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: "eu",
Now: now,
MaxVerificationAge: time.Hour,
})
// Ranking must keep every candidate; it orders them rather than filtering.
if len(ranked) != 3 {
t.Fatalf("ranked length = %d, want 3", len(ranked))
}
if ranked[0].Candidate.EndpointID != "node-b-public" {
t.Fatalf("top endpoint = %q, want node-b-public: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if ranked[0].Score <= ranked[1].Score {
t.Fatalf("top score = %d, second = %d", ranked[0].Score, ranked[1].Score)
}
if !containsReason(ranked[0].Reasons, "policy:fast-path") || !containsReason(ranked[0].Reasons, "verified:fresh") {
t.Fatalf("top reasons missing expected hints: %+v", ranked[0].Reasons)
}
}
// TestRankPeerEndpointCandidatesUsesDeterministicTieBreak ranks two candidates
// that differ only in EndpointID and Address, supplied as "endpoint-b" then
// "endpoint-a", with zero-value scoring options. It expects "endpoint-a" to be
// ranked first, i.e. the order must not simply follow the input order.
func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
candidates := []PeerEndpointCandidate{
{
EndpointID: "endpoint-b",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.21:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 10,
},
{
EndpointID: "endpoint-a",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 10,
},
}
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{})
if ranked[0].Candidate.EndpointID != "endpoint-a" {
t.Fatalf("tie top endpoint = %q, want endpoint-a", ranked[0].Candidate.EndpointID)
}
}
// TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint ranks a public
// direct candidate against a private direct candidate tagged "corp-lan" and
// "same-site", both in the preferred region "corp-eu". It expects the private
// corporate candidate to rank first, even though its Priority value is lower,
// and to carry the "policy:private-lan" and "region:preferred" reasons.
func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Region: "corp-eu",
Priority: 10,
},
{
EndpointID: "node-b-corp-lan",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "10.24.10.20:19001",
Reachability: "private",
NATType: "none",
ConnectivityMode: "direct",
Region: "corp-eu",
Priority: 1,
PolicyTags: []string{"corp-lan", "same-site"},
},
}
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: "corp-eu",
Now: now,
})
if ranked[0].Candidate.EndpointID != "node-b-corp-lan" {
t.Fatalf("top endpoint = %q, want node-b-corp-lan: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[0].Reasons, "policy:private-lan") || !containsReason(ranked[0].Reasons, "region:preferred") {
t.Fatalf("corp LAN reasons missing: %+v", ranked[0].Reasons)
}
}
// TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback ranks an
// outbound-only candidate and a relay-required candidate for the route-control
// channel. It expects both candidates to survive ranking with their
// EndpointID intact; the relative order is not asserted.
func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T) {
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-outbound",
NodeID: "node-b",
Transport: "outbound_reverse",
Address: "node-b.reverse.local",
Reachability: "outbound_only",
NATType: "symmetric",
ConnectivityMode: "outbound_only",
Priority: 20,
},
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Address: "relay.example.test/node-b",
Reachability: "relay",
NATType: "blocked",
ConnectivityMode: "relay_required",
Priority: 30,
},
}
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelRouteControl,
})
if len(ranked) != 2 {
t.Fatalf("ranked length = %d, want 2", len(ranked))
}
for _, item := range ranked {
if item.Candidate.EndpointID == "" {
t.Fatalf("ranked candidate lost identity: %+v", item)
}
}
}
// TestRankPeerEndpointCandidatesUsesHealthObservationOverlay ranks a direct
// TCP/TLS candidate and a WSS candidate with equal Priority, then overlays
// fresh health observations: the direct endpoint is slow (240 ms) with three
// failures, a recorded failure reason and reliability 50, while the WSS
// endpoint is fast (35 ms) with eight successes and reliability 95. It expects
// the WSS endpoint to rank first with "latency:low" and "reliability:high",
// and the direct endpoint to carry "history:failure" and "failure:recent".
func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 10,
},
{
EndpointID: "node-b-wss",
NodeID: "node-b",
Transport: "wss",
Address: "node-b.example.test",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
Priority: 10,
},
}
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
Now: now,
MaxObservationAge: 5 * time.Minute,
// Observations are keyed by EndpointID; both are one minute old, well
// inside MaxObservationAge.
Observations: map[string]EndpointCandidateHealthObservation{
"node-b-direct": {
EndpointID: "node-b-direct",
LastLatencyMs: 240,
FailureCount: 3,
LastFailureReason: "connect_timeout",
ReliabilityScore: 50,
ObservedAt: now.Add(-time.Minute),
},
"node-b-wss": {
EndpointID: "node-b-wss",
LastLatencyMs: 35,
SuccessCount: 8,
ReliabilityScore: 95,
ObservedAt: now.Add(-time.Minute),
},
},
})
if ranked[0].Candidate.EndpointID != "node-b-wss" {
t.Fatalf("top endpoint = %q, want node-b-wss: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[0].Reasons, "latency:low") || !containsReason(ranked[0].Reasons, "reliability:high") {
t.Fatalf("top reasons missing health hints: %+v", ranked[0].Reasons)
}
if !containsReason(ranked[1].Reasons, "history:failure") || !containsReason(ranked[1].Reasons, "failure:recent") {
t.Fatalf("failed endpoint reasons missing failure hints: %+v", ranked[1].Reasons)
}
}
// TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty supplies a
// one-hour-old observation with a 20 ms latency while MaxObservationAge is
// five minutes. It expects the candidate to carry "observation:stale" and not
// "latency:low": a stale observation must not contribute its measurements.
func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T) {
now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)
candidates := []PeerEndpointCandidate{
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 10,
},
}
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
Now: now,
MaxObservationAge: 5 * time.Minute,
Observations: map[string]EndpointCandidateHealthObservation{
"node-b-direct": {
EndpointID: "node-b-direct",
LastLatencyMs: 20,
ObservedAt: now.Add(-time.Hour),
},
},
})
if !containsReason(ranked[0].Reasons, "observation:stale") {
t.Fatalf("reasons missing stale observation: %+v", ranked[0].Reasons)
}
if containsReason(ranked[0].Reasons, "latency:low") {
t.Fatalf("stale observation should not contribute latency: %+v", ranked[0].Reasons)
}
}
// containsReason reports whether reason appears verbatim in reasons.
func containsReason(reasons []string, reason string) bool {
	for idx := range reasons {
		if reasons[idx] == reason {
			return true
		}
	}
	return false
}
@@ -0,0 +1,42 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// HTTPPeerTransport sends synthetic mesh envelopes to explicitly configured
// peer endpoints. It is intentionally narrow: production forwarding remains
// disabled and only SyntheticRuntime messages use this transport.
type HTTPPeerTransport struct {
// PeerURLs maps a peer node ID to the base URL used to reach it. Peers
// without an entry (or with a blank one) are reported as unavailable by
// SendSynthetic.
PeerURLs map[string]string
// HTTPClient, when non-nil, replaces the HTTP client of the per-call mesh
// client created in SendSynthetic.
HTTPClient *http.Client
}
// NewHTTPPeerTransport returns a transport whose PeerURLs is a normalized copy
// of peerURLs: node IDs are trimmed of whitespace, base URLs are trimmed of
// whitespace and trailing slashes, and pairs where either side ends up empty
// are dropped. The input map is not retained.
func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport {
	transport := &HTTPPeerTransport{PeerURLs: make(map[string]string, len(peerURLs))}
	for rawID, rawURL := range peerURLs {
		id := strings.TrimSpace(rawID)
		if id == "" {
			continue
		}
		base := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if base == "" {
			continue
		}
		transport.PeerURLs[id] = base
	}
	return transport
}
// SendSynthetic forwards envelope to the peer registered under nextNodeID and
// returns that peer's reply.
//
// It returns ErrSyntheticPeerUnavailable when the receiver is nil or when no
// non-blank base URL is configured for nextNodeID. A fresh mesh client is
// built for each call; if t.HTTPClient is set it is used for the request.
func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var empty SyntheticEnvelope
	if t == nil {
		return empty, ErrSyntheticPeerUnavailable
	}
	peerURL := strings.TrimSpace(t.PeerURLs[nextNodeID])
	peerURL = strings.TrimRight(peerURL, "/")
	if peerURL == "" {
		return empty, ErrSyntheticPeerUnavailable
	}
	peerClient := NewClient(peerURL)
	if override := t.HTTPClient; override != nil {
		peerClient.HTTPClient = override
	}
	return peerClient.SendSynthetic(ctx, envelope)
}
@@ -0,0 +1,130 @@
package mesh
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestHTTPPeerTransportDirectSyntheticProbe starts two live HTTP test nodes,
// wires node-a to reach node-b over HTTPPeerTransport on a two-hop route, and
// sends a probe from node-a. It expects a probe acknowledgement whose recorded
// path is exactly node-a -> node-b.
func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) {
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
defer nodeA.Close()
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
defer nodeB.Close()
route := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"})
routes := []SyntheticRoute{route}
// Runtimes are attached after the servers exist because each runtime needs
// the URL of the next hop's server.
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-b": nodeB.URL})
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-direct")
if err != nil {
t.Fatalf("send live direct probe: %v", err)
}
if ack.MessageType != SyntheticMessageProbeAck {
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
}
payload := decodeAckPayload(t, ack)
if got, want := payload.Path, []string{"node-a", "node-b"}; !sameStrings(got, want) {
t.Fatalf("path = %v, want %v", got, want)
}
}
// TestHTTPPeerTransportSingleRelaySyntheticProbe starts three live HTTP test
// nodes on the route node-a -> node-r -> node-b, where each node only knows
// the URL of its next hop. It sends a probe from node-a and expects a probe
// acknowledgement whose recorded path lists all three hops in order.
func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) {
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
defer nodeA.Close()
nodeR := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
defer nodeR.Close()
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
defer nodeB.Close()
route := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"})
routes := []SyntheticRoute{route}
// node-a can only reach node-r, and node-r can only reach node-b, so the
// probe has to be relayed.
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-r": nodeR.URL})
nodeR.Runtime = newLiveRuntime(nodeR.Local, routes, map[string]string{"node-b": nodeB.URL})
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-relay")
if err != nil {
t.Fatalf("send live relay probe: %v", err)
}
if ack.MessageType != SyntheticMessageProbeAck {
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
}
payload := decodeAckPayload(t, ack)
if got, want := payload.Path, []string{"node-a", "node-r", "node-b"}; !sameStrings(got, want) {
t.Fatalf("path = %v, want %v", got, want)
}
}
// TestHTTPPeerTransportMissingPeer checks that sending to a node ID with no
// configured URL fails with ErrSyntheticPeerUnavailable.
func TestHTTPPeerTransportMissingPeer(t *testing.T) {
	emptyTransport := NewHTTPPeerTransport(map[string]string{})
	if _, sendErr := emptyTransport.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{}); !errors.Is(sendErr, ErrSyntheticPeerUnavailable) {
		t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", sendErr)
	}
}
// liveSyntheticNode is a test fixture pairing a mesh identity and synthetic
// runtime with a real httptest server that serves the mesh handler.
type liveSyntheticNode struct {
// Local is the identity the node's server reports.
Local PeerIdentity
// Runtime is assigned by tests after construction; the server handler
// reads it on every request.
Runtime *SyntheticRuntime
// URL is the base URL of the node's test server.
URL string
// server is the underlying test server, shut down by Close.
server *httptest.Server
}
// newLiveSyntheticNode starts an httptest server for local and returns the
// fixture with URL populated. The mesh Server is constructed on every request
// so that a Runtime assigned to the node after this call is picked up.
// Callers are responsible for calling Close.
func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode {
	t.Helper()
	node := &liveSyntheticNode{Local: local}
	serve := func(w http.ResponseWriter, r *http.Request) {
		meshServer := Server{Local: node.Local, SyntheticRuntime: node.Runtime}
		meshServer.Handler().ServeHTTP(w, r)
	}
	node.server = httptest.NewServer(http.HandlerFunc(serve))
	node.URL = node.server.URL
	return node
}
// Close shuts down the node's test server, if one was started.
func (n *liveSyntheticNode) Close() {
	if n.server == nil {
		return
	}
	n.server.Close()
}
// newLiveRuntime builds an enabled SyntheticRuntime for local that knows the
// given routes and reaches the listed peers (node ID -> base URL) through an
// HTTPPeerTransport.
func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime {
	cfg := SyntheticRuntimeConfig{
		Enabled: true,
		Local:   local,
		Routes:  routes,
	}
	cfg.Transport = NewHTTPPeerTransport(peers)
	return NewSyntheticRuntime(cfg)
}
// liveSyntheticRoute returns a route in cluster-1 that runs through hops in
// order, allows only the fabric-control channel, and expires one hour from
// now. The first and last hop become the source and destination; hops must be
// non-empty.
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
	source, destination := hops[0], hops[len(hops)-1]
	route := SyntheticRoute{
		RouteID:              routeID,
		ClusterID:            "cluster-1",
		SourceNodeID:         source,
		DestinationNodeID:    destination,
		Hops:                 hops,
		AllowedChannels:      []string{SyntheticChannelFabricControl},
		MaxTTL:               8,
		MaxHops:              8,
		ExpiresAt:            time.Now().UTC().Add(time.Hour),
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
	return route
}
// sameStrings reports whether left and right have the same length and the
// same elements in the same order.
func sameStrings(left, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for idx, want := range right {
		if left[idx] != want {
			return false
		}
	}
	return true
}
@@ -0,0 +1,374 @@
package mesh
import (
"sort"
"strings"
"time"
)
// DefaultWarmPeerLimit is the maximum number of warm peers NewPeerCache
// selects when PeerCacheConfig.WarmPeerLimit is zero or negative.
const DefaultWarmPeerLimit = 8
// PeerCacheConfig is the input to NewPeerCache. Every source below may
// mention the same peer; NewPeerCache merges them into one entry per node ID
// and ignores anything that refers to Local.NodeID as a peer.
type PeerCacheConfig struct {
// Local identifies the node the cache is built for.
Local PeerIdentity
// PeerEndpoints maps a peer node ID to a single known endpoint.
PeerEndpoints map[string]string
// PeerEndpointCandidates maps a peer node ID to its endpoint candidates;
// they are ranked and the top one fills the entry's Best* fields.
PeerEndpointCandidates map[string][]PeerEndpointCandidate
// PeerDirectory contributes route IDs, endpoint/candidate counts,
// connectivity modes and the recovery-seed flag.
PeerDirectory []PeerDirectoryEntry
// RecoverySeeds marks peers as recovery seeds and supplies an endpoint
// for peers that have none yet.
RecoverySeeds []PeerRecoverySeed
// RendezvousLeases contributes relay information; only leases accepted
// by leaseUsableForPeerCache are used.
RendezvousLeases []PeerRendezvousLease
// Routes that contain the local node add their route ID to every other
// node on the path and mark the local node's immediate neighbours.
Routes []SyntheticRoute
// WarmPeerLimit caps how many peers are marked warm; values <= 0 fall
// back to DefaultWarmPeerLimit.
WarmPeerLimit int
// PreferredRegion is forwarded to endpoint candidate scoring.
PreferredRegion string
// Now is the build timestamp; the zero value means the current UTC time.
Now time.Time
}
// PeerCache holds the peer snapshot computed by NewPeerCache. Read it through
// Snapshot or WarmPeerIDs.
type PeerCache struct {
snapshot PeerCacheSnapshot
}
// PeerCacheSnapshot is the JSON-serializable view of a PeerCache.
type PeerCacheSnapshot struct {
// ClusterID and LocalNodeID are copied from the local identity.
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
// PeerCount is len(Entries).
PeerCount int `json:"peer_count"`
// WarmPeerCount is the number of entries with Warm set.
WarmPeerCount int `json:"warm_peer_count"`
// RecoverySeedCount is the number of entries with RecoverySeed set.
RecoverySeedCount int `json:"recovery_seed_count"`
// RendezvousLeaseCount is the number of configured leases that were
// usable at build time.
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
// BuiltAt is the timestamp the snapshot was built with.
BuiltAt time.Time `json:"built_at"`
// Entries holds one entry per known peer, sorted by NodeID.
Entries []PeerCacheEntry `json:"entries"`
}
// PeerCacheEntry is the merged view of one remote peer.
type PeerCacheEntry struct {
NodeID string `json:"node_id"`
// RouteIDs lists the routes this peer takes part in (sorted, de-duplicated).
RouteIDs []string `json:"route_ids,omitempty"`
// Endpoint is the address chosen for the peer: the configured endpoint,
// the best candidate's address, a recovery seed endpoint or a relay
// endpoint, depending on what was available.
Endpoint string `json:"endpoint,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
// ConnectivityModes is the sorted, de-duplicated union of modes reported
// by every source that mentioned the peer.
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
// Warm marks the peer as selected for warm connectivity; WarmReason
// names the strongest reason it was selected.
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
// The Best* fields describe the top-ranked endpoint candidate, or the
// rendezvous lease when the lease endpoint was preferred.
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
// The fields below are filled from a usable rendezvous lease for the peer.
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
}
// peerCacheBuildEntry is the working form of a PeerCacheEntry used while
// NewPeerCache runs. The extra fields feed warm-peer prioritisation and are
// not part of the published snapshot.
type peerCacheBuildEntry struct {
PeerCacheEntry
// adjacentRoutePeer is true when the peer sits directly before or after
// the local node on at least one route.
adjacentRoutePeer bool
// bestScore is the score used for warm prioritisation: the top candidate
// score, raised to at least 500 when a rendezvous lease endpoint is used.
bestScore int
}
// NewPeerCache builds a PeerCache from cfg by merging every configured peer
// source into one entry per remote node ID, then choosing which peers are
// "warm".
//
// Sources are applied in a fixed order, and later sources may override
// earlier ones: peer directory, single endpoints, ranked endpoint candidates
// (the top candidate's address replaces the endpoint), routes, recovery seeds
// (endpoint only used if none is set yet) and rendezvous leases. The local
// node is never added as a peer.
//
// Warm selection sorts entries by warmPeerPriority (descending, ties broken by
// NodeID) and marks at most WarmPeerLimit entries with a positive priority.
// The published entries are finally sorted by NodeID.
func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
now := cfg.Now.UTC()
if now.IsZero() {
now = time.Now().UTC()
}
limit := cfg.WarmPeerLimit
if limit <= 0 {
limit = DefaultWarmPeerLimit
}
entries := map[string]*peerCacheBuildEntry{}
// Peer directory: merge route IDs, counts, connectivity modes and the
// recovery-seed flag.
for _, item := range cfg.PeerDirectory {
nodeID := strings.TrimSpace(item.NodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RouteIDs = mergeStrings(entry.RouteIDs, item.RouteIDs)
entry.EndpointCount = maxInt(entry.EndpointCount, item.EndpointCount)
entry.CandidateCount = maxInt(entry.CandidateCount, item.CandidateCount)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, item.ConnectivityModes)
entry.RecoverySeed = entry.RecoverySeed || item.RecoverySeed
}
// Single configured endpoints.
for nodeID, endpoint := range cfg.PeerEndpoints {
nodeID = strings.TrimSpace(nodeID)
endpoint = strings.TrimSpace(endpoint)
if nodeID == "" || nodeID == cfg.Local.NodeID || endpoint == "" {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.Endpoint = endpoint
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
}
// Endpoint candidates: rank them for the fabric-control channel and record
// the winner in the Best* fields.
for nodeID, candidates := range cfg.PeerEndpointCandidates {
nodeID = strings.TrimSpace(nodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID || len(candidates) == 0 {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.CandidateCount = maxInt(entry.CandidateCount, len(candidates))
for _, candidate := range candidates {
if strings.TrimSpace(candidate.ConnectivityMode) != "" {
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{candidate.ConnectivityMode})
}
}
scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: cfg.PreferredRegion,
Now: now,
MaxVerificationAge: time.Hour,
})
if len(scored) > 0 {
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
entry.BestReachability = scored[0].Candidate.Reachability
entry.BestConnectivity = scored[0].Candidate.ConnectivityMode
entry.BestNATType = scored[0].Candidate.NATType
entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...)
entry.BestCandidateScore = scored[0].Score
entry.bestScore = scored[0].Score
// The best candidate's address takes precedence over a plain
// configured endpoint.
if strings.TrimSpace(scored[0].Candidate.Address) != "" {
entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address)
}
}
}
// Routes: only routes whose path contains the local node are relevant.
for _, route := range cfg.Routes {
path := routePath(route)
localIndex := indexOf(path, cfg.Local.NodeID)
if localIndex < 0 {
continue
}
for _, nodeID := range path {
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RouteIDs = mergeStrings(entry.RouteIDs, []string{route.RouteID})
}
// Flag the hops immediately before and after the local node.
for _, adjacentIndex := range []int{localIndex - 1, localIndex + 1} {
if adjacentIndex < 0 || adjacentIndex >= len(path) {
continue
}
nodeID := path[adjacentIndex]
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
peerCacheEntry(entries, nodeID).adjacentRoutePeer = true
}
}
// Recovery seeds: the seed endpoint is only a fallback.
for _, seed := range cfg.RecoverySeeds {
nodeID := strings.TrimSpace(seed.NodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RecoverySeed = true
if entry.Endpoint == "" {
entry.Endpoint = strings.TrimSpace(seed.Endpoint)
}
if strings.TrimSpace(seed.ConnectivityMode) != "" {
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{seed.ConnectivityMode})
}
}
// Rendezvous leases: a lease either describes how to reach a remote peer
// through a relay, or (when the local node is the leased peer) names the
// relay the local node should stay connected to.
rendezvousLeases := 0
for _, lease := range cfg.RendezvousLeases {
if !leaseUsableForPeerCache(lease, cfg.Local.NodeID, now) {
continue
}
rendezvousLeases++
if lease.PeerNodeID != cfg.Local.NodeID {
entry := peerCacheEntry(entries, lease.PeerNodeID)
// Decide from the entry's state before the lease modifies it.
useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry)
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
entry.RelayControl = true
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
if useLeaseEndpoint {
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control")
entry.BestReachability = "relay"
entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required")
entry.Endpoint = entry.RelayEndpoint
entry.BestCandidateID = lease.LeaseID
entry.BestCandidateAddr = entry.RelayEndpoint
entry.bestScore = maxInt(entry.bestScore, 500)
}
}
if lease.PeerNodeID == cfg.Local.NodeID && lease.RelayNodeID != "" && lease.RelayNodeID != cfg.Local.NodeID {
entry := peerCacheEntry(entries, lease.RelayNodeID)
if entry.Endpoint == "" {
entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
}
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"})
}
}
// Flatten the map, normalising list order and counting recovery seeds.
out := make([]peerCacheBuildEntry, 0, len(entries))
recoverySeeds := 0
for _, entry := range entries {
sort.Strings(entry.RouteIDs)
sort.Strings(entry.ConnectivityModes)
if entry.RecoverySeed {
recoverySeeds++
}
out = append(out, *entry)
}
// Highest warm priority first; NodeID breaks ties so the selection does
// not depend on map iteration order.
sort.SliceStable(out, func(i, j int) bool {
left := warmPeerPriority(out[i])
right := warmPeerPriority(out[j])
if left != right {
return left > right
}
return out[i].NodeID < out[j].NodeID
})
warm := 0
for i := range out {
if warm >= limit {
break
}
if warmPeerPriority(out[i]) <= 0 {
continue
}
out[i].Warm = true
out[i].WarmReason = warmPeerReason(out[i])
warm++
}
// Publish entries in NodeID order.
sort.SliceStable(out, func(i, j int) bool {
return out[i].NodeID < out[j].NodeID
})
snapshotEntries := make([]PeerCacheEntry, 0, len(out))
for _, entry := range out {
snapshotEntries = append(snapshotEntries, entry.PeerCacheEntry)
}
return &PeerCache{snapshot: PeerCacheSnapshot{
ClusterID: cfg.Local.ClusterID,
LocalNodeID: cfg.Local.NodeID,
PeerCount: len(snapshotEntries),
WarmPeerCount: warm,
RecoverySeedCount: recoverySeeds,
RendezvousLeaseCount: rendezvousLeases,
BuiltAt: now,
Entries: snapshotEntries,
}}
}
// Snapshot returns a copy of the cache's snapshot with its own Entries slice,
// so callers may reorder or replace entries without affecting the cache. The
// copy is shallow: slices inside individual entries are still shared. A nil
// cache yields the zero snapshot.
func (c *PeerCache) Snapshot() PeerCacheSnapshot {
	if c == nil {
		return PeerCacheSnapshot{}
	}
	copied := c.snapshot
	copied.Entries = make([]PeerCacheEntry, len(c.snapshot.Entries))
	copy(copied.Entries, c.snapshot.Entries)
	return copied
}
// WarmPeerIDs returns the node IDs of all warm peers, in snapshot entry order.
// It is safe to call on a nil cache, which yields an empty slice.
func (c *PeerCache) WarmPeerIDs() []string {
	current := c.Snapshot()
	warmIDs := make([]string, 0, current.WarmPeerCount)
	for i := range current.Entries {
		if !current.Entries[i].Warm {
			continue
		}
		warmIDs = append(warmIDs, current.Entries[i].NodeID)
	}
	return warmIDs
}
// peerCacheEntry returns the build entry stored under nodeID, creating and
// registering an empty one (with only NodeID set) on first use.
func peerCacheEntry(entries map[string]*peerCacheBuildEntry, nodeID string) *peerCacheBuildEntry {
	existing, found := entries[nodeID]
	if !found {
		existing = &peerCacheBuildEntry{}
		existing.NodeID = nodeID
		entries[nodeID] = existing
	}
	return existing
}
// warmPeerPriority scores how strongly entry should be kept warm. It starts
// from the candidate count and adds 1000 for a route-adjacent peer, 500 for a
// recovery seed, 300 for a relay-controlled peer, 100 for any known endpoint,
// and the best candidate score when that score is positive.
func warmPeerPriority(entry peerCacheBuildEntry) int {
	priority := entry.CandidateCount
	if entry.adjacentRoutePeer {
		priority += 1000
	}
	if entry.RecoverySeed {
		priority += 500
	}
	if entry.RelayControl {
		priority += 300
	}
	if entry.Endpoint != "" {
		priority += 100
	}
	if entry.bestScore > 0 {
		priority += entry.bestScore
	}
	return priority
}
// warmPeerReason names the most significant reason entry qualifies as warm.
// Reasons are checked from strongest to weakest; "scoped_peer" is the
// fallback when none of the specific reasons applies.
func warmPeerReason(entry peerCacheBuildEntry) string {
	switch {
	case entry.adjacentRoutePeer:
		return "route_adjacent"
	case entry.RecoverySeed:
		return "recovery_seed"
	case entry.RelayControl:
		return "rendezvous_lease"
	case entry.BestCandidateID != "":
		return "endpoint_candidate"
	case entry.Endpoint != "":
		return "peer_endpoint"
	default:
		return "scoped_peer"
	}
}
// leaseUsableForPeerCache reports whether lease may contribute to the peer
// cache. A usable lease has non-blank lease ID, peer node ID, relay node ID
// and relay endpoint, an expiry strictly after now, is control-plane only,
// and does not name the local node as both peer and relay.
func leaseUsableForPeerCache(lease PeerRendezvousLease, localNodeID string, now time.Time) bool {
	switch {
	case strings.TrimSpace(lease.LeaseID) == "":
		return false
	case strings.TrimSpace(lease.PeerNodeID) == "":
		return false
	case strings.TrimSpace(lease.RelayNodeID) == "":
		return false
	case strings.TrimSpace(lease.RelayEndpoint) == "":
		return false
	case lease.ExpiresAt.IsZero():
		return false
	case !lease.ExpiresAt.After(now):
		return false
	case !lease.ControlPlaneOnly:
		return false
	}
	selfRelayed := lease.PeerNodeID == localNodeID && lease.RelayNodeID == localNodeID
	return !selfRelayed
}
// shouldUseRendezvousEndpoint reports whether a rendezvous lease endpoint
// should replace entry's current endpoint. That is the case when the entry has
// no endpoint, or when its current best path is itself relay- or
// outbound-based (judged case-insensitively from transport, reachability and
// connectivity mode).
func shouldUseRendezvousEndpoint(entry peerCacheBuildEntry) bool {
	if strings.TrimSpace(entry.Endpoint) == "" {
		return true
	}
	normalize := func(raw string) string {
		return strings.ToLower(strings.TrimSpace(raw))
	}
	bestTransport := normalize(entry.BestTransport)
	if strings.Contains(bestTransport, "relay") || strings.Contains(bestTransport, "outbound") {
		return true
	}
	switch normalize(entry.BestReachability) {
	case "relay", "outbound_only":
		return true
	}
	switch normalize(entry.BestConnectivity) {
	case "relay_required", "outbound_only":
		return true
	}
	return false
}
// mergeStrings returns the trimmed, non-empty, de-duplicated values of
// existing followed by incoming, preserving first-seen order. The result is
// always a fresh, non-nil slice.
//
// Neither input is modified. The two inputs are walked separately rather than
// joined with append(existing, incoming...), because that append would write
// into any spare capacity of existing's backing array and could clobber data
// the caller still holds.
func mergeStrings(existing []string, incoming []string) []string {
	total := len(existing) + len(incoming)
	seen := make(map[string]struct{}, total)
	out := make([]string, 0, total)
	for _, source := range [2][]string{existing, incoming} {
		for _, value := range source {
			value = strings.TrimSpace(value)
			if value == "" {
				continue
			}
			if _, ok := seen[value]; ok {
				continue
			}
			seen[value] = struct{}{}
			out = append(out, value)
		}
	}
	return out
}
// maxInt returns the larger of left and right.
func maxInt(left, right int) int {
	if right >= left {
		return right
	}
	return left
}
@@ -0,0 +1,170 @@
package mesh
import (
"testing"
"time"
)
// TestPeerCacheSelectsAdjacentWarmPeersWithinLimit builds a cache for node-b
// on the route node-a -> node-b -> node-r -> node-c, plus one recovery seed,
// with a warm limit of two. It expects exactly the two route neighbours
// (node-a and node-r) to be warm, and the snapshot to report four peers and
// one recovery seed.
func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-a": "http://node-a:19000",
"node-r": "http://node-r:19000",
"node-c": "http://node-c:19000",
},
Routes: []SyntheticRoute{
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"}),
},
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10},
},
WarmPeerLimit: 2,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
})
warm := cache.WarmPeerIDs()
if len(warm) != 2 || warm[0] != "node-a" || warm[1] != "node-r" {
t.Fatalf("warm peers = %+v, want adjacent node-a/node-r", warm)
}
snapshot := cache.Snapshot()
if snapshot.PeerCount != 4 || snapshot.RecoverySeedCount != 1 {
t.Fatalf("unexpected snapshot counts: %+v", snapshot)
}
}
// TestPeerCachePromotesRecoverySeedAfterRoutePeers builds a cache for node-b
// with two route neighbours and one recovery seed, with a warm limit of
// three. It expects all three peers to be warm and the seed entry to be
// flagged as a recovery seed with warm reason "recovery_seed".
func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
Routes: []SyntheticRoute{
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r"}),
},
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1},
},
WarmPeerLimit: 3,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
})
// WarmPeerIDs follows snapshot order, which is sorted by node ID.
warm := cache.WarmPeerIDs()
if len(warm) != 3 || warm[0] != "node-a" || warm[1] != "node-r" || warm[2] != "node-seed" {
t.Fatalf("warm peers = %+v, want adjacent peers then seed", warm)
}
seed, ok := peerCacheEntryByID(cache.Snapshot(), "node-seed")
if !ok || !seed.RecoverySeed || seed.WarmReason != "recovery_seed" {
t.Fatalf("unexpected seed entry: %+v", seed)
}
}
// TestPeerCacheUsesBestEndpointCandidate gives node-b a relay candidate and a
// freshly verified public direct candidate. It expects the cache entry to
// name the public candidate as best and to be marked warm.
func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Address: "relay.example.test",
Reachability: "relay",
ConnectivityMode: "relay_required",
Priority: 20,
},
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 1,
LastVerifiedAt: &now,
},
},
},
WarmPeerLimit: 1,
Now: now,
})
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
if !ok {
t.Fatal("node-b missing from cache")
}
if entry.BestCandidateID != "node-b-public" || !entry.Warm {
t.Fatalf("unexpected candidate selection: %+v", entry)
}
}
// TestPeerCacheUsesPreferredCorporateEndpointAddress configures node-b with a
// public endpoint and two candidates in the preferred region: the same public
// address and a private "corp-lan" address. It expects the corporate LAN
// candidate to be chosen as best and its address to replace the configured
// public endpoint on the cache entry.
func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "https://node-b.public.example.test:443",
},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "https://node-b.public.example.test:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Region: "corp-eu",
Priority: 10,
},
{
EndpointID: "node-b-corp-lan",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "http://10.24.10.20:19001",
Reachability: "private",
NATType: "none",
ConnectivityMode: "direct",
Region: "corp-eu",
Priority: 1,
PolicyTags: []string{"corp-lan"},
},
},
},
PreferredRegion: "corp-eu",
WarmPeerLimit: 1,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
})
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
if !ok {
t.Fatal("node-b missing from peer cache")
}
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" {
t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry)
}
}
// peerCacheRoute returns a cluster-1 route over a private copy of hops that
// allows only the fabric-control channel and expires one hour from now. The
// first and last hop become the source and destination; hops must be
// non-empty.
func peerCacheRoute(routeID string, hops []string) SyntheticRoute {
	source, destination := hops[0], hops[len(hops)-1]
	hopsCopy := make([]string, len(hops))
	copy(hopsCopy, hops)
	route := SyntheticRoute{
		RouteID:           routeID,
		ClusterID:         "cluster-1",
		SourceNodeID:      source,
		DestinationNodeID: destination,
		Hops:              hopsCopy,
		AllowedChannels:   []string{SyntheticChannelFabricControl},
		ExpiresAt:         time.Now().UTC().Add(time.Hour),
	}
	return route
}
// peerCacheEntryByID returns the first snapshot entry whose NodeID equals
// nodeID, and whether one was found.
func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEntry, bool) {
	for idx := range snapshot.Entries {
		if snapshot.Entries[idx].NodeID != nodeID {
			continue
		}
		return snapshot.Entries[idx], true
	}
	return PeerCacheEntry{}, false
}
@@ -0,0 +1,303 @@
package mesh
import (
"net"
"net/netip"
"net/url"
"sort"
"strings"
"time"
)
// Action names for a peer connection intent.
// NOTE(review): presumably these are the values stored in
// PeerConnectionIntent.Action and tallied by the plan's Maintain/Probe/Recover
// counters — confirm against connectionIntentAction.
const (
PeerConnectionIntentMaintain = "maintain"
PeerConnectionIntentProbe = "probe"
PeerConnectionIntentRecover = "recover"
)
// Transport mode names for a peer connection intent.
// NOTE(review): presumably these are the values classifyPeerTransport assigns
// to PeerConnectionIntent.TransportMode — confirm against that function.
const (
PeerTransportModeDirect = "direct"
PeerTransportModePrivateLAN = "private_lan"
PeerTransportModeCorporateLAN = "corporate_lan"
PeerTransportModeOutboundOnly = "outbound_only"
PeerTransportModeRelayRequired = "relay_required"
PeerTransportModeRelayControl = "relay_control"
PeerTransportModeUnknown = "unknown"
)
// PeerConnectionIntentPlanConfig is the input to PlanPeerConnectionIntents.
type PeerConnectionIntentPlanConfig struct {
// PeerCache supplies per-peer details (best transport, reachability,
// relay information) looked up by node ID.
PeerCache PeerCacheSnapshot
// RecoveryPlan supplies the candidates that are turned into intents, and
// the plan mode.
RecoveryPlan PeerRecoveryPlan
// RendezvousLeases are consulted for intents that require rendezvous.
RendezvousLeases []PeerRendezvousLease
// Now is the planning timestamp stamped onto each intent.
Now time.Time
}
// PeerConnectionIntentPlan is the JSON-serializable result of
// PlanPeerConnectionIntents.
// NOTE(review): the per-action, per-transport-mode and rendezvous counters
// look like tallies over Intents — confirm against the code that fills them.
type PeerConnectionIntentPlan struct {
// Mode is copied from the recovery plan.
Mode string `json:"mode"`
// IntentCount is len(Intents).
IntentCount int `json:"intent_count"`
MaintainCount int `json:"maintain_count"`
ProbeCount int `json:"probe_count"`
RecoverCount int `json:"recover_count"`
DirectCount int `json:"direct_count"`
PrivateLANCount int `json:"private_lan_count"`
CorporateLANCount int `json:"corporate_lan_count"`
OutboundOnlyCount int `json:"outbound_only_count"`
RelayRequiredCount int `json:"relay_required_count"`
RelayControlCount int `json:"relay_control_count"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
GeneratedAt time.Time `json:"generated_at"`
// Intents are ordered by descending Priority, then by NodeID.
Intents []PeerConnectionIntent `json:"intents,omitempty"`
}
// PeerConnectionIntent describes what should be done about the connection to
// one peer. It combines a recovery-plan candidate with the matching peer cache
// entry.
type PeerConnectionIntent struct {
NodeID string `json:"node_id"`
// Action is derived from the recovery candidate; Reason, Endpoint,
// ConnectionState and Priority are copied from it.
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
ConnectionState string `json:"connection_state"`
// Transport prefers the recovery candidate's best transport and falls
// back to the cache entry's.
Transport string `json:"transport,omitempty"`
// TransportMode, RequiresRendezvous and DirectCandidate are the result of
// classifying the intent's transport.
TransportMode string `json:"transport_mode"`
// Reachability, ConnectivityMode, NATType and PolicyTags come from the
// cache entry's best candidate.
Reachability string `json:"reachability,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
NATType string `json:"nat_type,omitempty"`
PolicyTags []string `json:"policy_tags,omitempty"`
RequiresRendezvous bool `json:"requires_rendezvous"`
// NOTE(review): RendezvousResolved is presumably set when a usable lease
// is applied to the intent — confirm against applyRendezvousLease.
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
// RelayCandidate and ControlPlaneOnly both start from the cache entry's
// RelayControl flag.
RelayCandidate bool `json:"relay_candidate"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ControlPlaneOnly bool `json:"control_plane_only"`
// RecoverySeed is true when either the recovery candidate or the cache
// entry marks the peer as a recovery seed.
RecoverySeed bool `json:"recovery_seed"`
Priority int `json:"priority"`
GeneratedAt time.Time `json:"generated_at"`
}
// PlanPeerConnectionIntents converts the candidates of a recovery plan into
// connection intents.
//
// Each candidate with a non-blank node ID is merged with the matching peer
// cache entry (if any), classified by transport, and, when it still requires
// rendezvous, resolved against the supplied rendezvous leases. The returned
// plan lists the intents ordered by descending priority and then by node ID,
// together with per-action, per-transport-mode and rendezvous counters.
func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectionIntentPlan {
	now := normalizedNow(cfg.Now)

	// Index cache entries by node ID; a later duplicate overrides an earlier one.
	cached := make(map[string]PeerCacheEntry, len(cfg.PeerCache.Entries))
	for i := range cfg.PeerCache.Entries {
		peer := cfg.PeerCache.Entries[i]
		if strings.TrimSpace(peer.NodeID) != "" {
			cached[peer.NodeID] = peer
		}
	}

	plan := PeerConnectionIntentPlan{
		Mode:        cfg.RecoveryPlan.Mode,
		GeneratedAt: now,
	}
	intents := make([]PeerConnectionIntent, 0, len(cfg.RecoveryPlan.Candidates))

	for i := range cfg.RecoveryPlan.Candidates {
		candidate := cfg.RecoveryPlan.Candidates[i]
		if strings.TrimSpace(candidate.NodeID) == "" {
			continue
		}
		// A missing cache entry yields the zero value, so only the
		// candidate's own hints are used in that case.
		peer := cached[candidate.NodeID]

		intent := PeerConnectionIntent{
			NodeID:            candidate.NodeID,
			Action:            connectionIntentAction(candidate),
			Reason:            candidate.Reason,
			Endpoint:          candidate.Endpoint,
			ConnectionState:   candidate.ConnectionState,
			Transport:         firstNonEmpty(candidate.BestTransport, peer.BestTransport),
			Reachability:      peer.BestReachability,
			ConnectivityMode:  peer.BestConnectivity,
			NATType:           peer.BestNATType,
			PolicyTags:        append([]string{}, peer.BestPolicyTags...),
			BestCandidateID:   firstNonEmpty(candidate.BestCandidateID, peer.BestCandidateID),
			RendezvousLeaseID: peer.RendezvousLeaseID,
			RelayNodeID:       peer.RelayNodeID,
			RelayEndpoint:     peer.RelayEndpoint,
			RelayCandidate:    peer.RelayControl,
			ControlPlaneOnly:  peer.RelayControl,
			RecoverySeed:      candidate.RecoverySeed || peer.RecoverySeed,
			Priority:          candidate.Priority,
			GeneratedAt:       now,
		}
		intent.TransportMode, intent.RequiresRendezvous, intent.DirectCandidate = classifyPeerTransport(intent)

		if intent.RequiresRendezvous {
			lease, found := rendezvousLeaseForPeer(cfg.RendezvousLeases, intent.NodeID, now)
			if found {
				applyRendezvousLease(&intent, lease)
			}
		}

		// Tally the finished intent; the counters do not depend on order.
		if intent.Action == PeerConnectionIntentMaintain {
			plan.MaintainCount++
		} else if intent.Action == PeerConnectionIntentProbe {
			plan.ProbeCount++
		} else if intent.Action == PeerConnectionIntentRecover {
			plan.RecoverCount++
		}
		switch intent.TransportMode {
		case PeerTransportModeDirect:
			plan.DirectCount++
		case PeerTransportModePrivateLAN:
			plan.PrivateLANCount++
		case PeerTransportModeCorporateLAN:
			plan.CorporateLANCount++
		case PeerTransportModeOutboundOnly:
			plan.OutboundOnlyCount++
		case PeerTransportModeRelayRequired:
			plan.RelayRequiredCount++
		case PeerTransportModeRelayControl:
			plan.RelayControlCount++
		}
		if intent.RequiresRendezvous {
			plan.RendezvousRequiredCount++
		}
		if intent.RendezvousResolved {
			plan.RendezvousResolvedCount++
		}
		if intent.RendezvousLeaseID != "" {
			plan.RendezvousLeaseCount++
		}

		intents = append(intents, intent)
	}

	sort.SliceStable(intents, func(a, b int) bool {
		left, right := &intents[a], &intents[b]
		if left.Priority == right.Priority {
			return left.NodeID < right.NodeID
		}
		return left.Priority > right.Priority
	})

	plan.IntentCount = len(intents)
	plan.Intents = intents
	return plan
}
// connectionIntentAction maps a recovery candidate's reason to an intent
// action: "maintain_ready" maintains, the "recover_*" reasons recover, and
// every other reason results in a probe.
func connectionIntentAction(candidate PeerRecoveryCandidate) string {
	reason := candidate.Reason
	if reason == "maintain_ready" {
		return PeerConnectionIntentMaintain
	}
	if reason == "recover_degraded" || reason == "recover_seed" || reason == "recover_warm" || reason == "recover_peer" {
		return PeerConnectionIntentRecover
	}
	return PeerConnectionIntentProbe
}
// classifyPeerTransport derives the transport mode of an intent from its
// transport, connectivity mode, reachability, policy tags and endpoint.
//
// It returns the mode, whether a rendezvous is required, and whether the peer
// is a direct candidate. Rules are evaluated in order and the first match
// wins: relay, outbound-only, corporate LAN, private LAN, direct, unknown.
func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
	normalize := func(value string) string {
		return strings.ToLower(strings.TrimSpace(value))
	}
	transport := normalize(intent.Transport)
	connectivity := normalize(intent.ConnectivityMode)
	reachability := normalize(intent.Reachability)
	tags := lowerStringSet(intent.PolicyTags)

	switch {
	case strings.Contains(transport, "relay"), connectivity == "relay_required", reachability == "relay":
		return PeerTransportModeRelayRequired, true, false
	case connectivity == "outbound_only", reachability == "outbound_only":
		return PeerTransportModeOutboundOnly, true, false
	case tags["corp-lan"], tags["same-site"]:
		return PeerTransportModeCorporateLAN, false, true
	case tags["private-lan"], reachability == "private", endpointHasPrivateHost(intent.Endpoint):
		return PeerTransportModePrivateLAN, false, true
	case strings.Contains(transport, "direct"), reachability == "public", connectivity == "direct":
		return PeerTransportModeDirect, false, true
	}
	return PeerTransportModeUnknown, false, false
}
// rendezvousLeaseForPeer picks the preferred usable rendezvous lease for
// peerNodeID.
//
// A lease is usable when it targets the peer, names a relay node and a relay
// endpoint, is control-plane only, and has a non-zero expiry strictly after
// now. Among usable leases the lowest priority value wins (non-positive
// priorities count as 100), then the latest expiry, then the smallest lease
// ID; remaining ties go to the lease that appears first in the input. The
// boolean result is false when no lease qualifies.
func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now time.Time) (PeerRendezvousLease, bool) {
	now = normalizedNow(now)

	// outranks reports whether a is strictly preferred over b.
	outranks := func(a, b PeerRendezvousLease) bool {
		aPriority, bPriority := a.Priority, b.Priority
		if aPriority <= 0 {
			aPriority = 100
		}
		if bPriority <= 0 {
			bPriority = 100
		}
		if aPriority != bPriority {
			return aPriority < bPriority
		}
		if !a.ExpiresAt.Equal(b.ExpiresAt) {
			return a.ExpiresAt.After(b.ExpiresAt)
		}
		return a.LeaseID < b.LeaseID
	}

	var best PeerRendezvousLease
	found := false
	for i := range leases {
		lease := leases[i]
		usable := strings.TrimSpace(lease.PeerNodeID) == peerNodeID &&
			strings.TrimSpace(lease.RelayEndpoint) != "" &&
			strings.TrimSpace(lease.RelayNodeID) != "" &&
			lease.ControlPlaneOnly &&
			!lease.ExpiresAt.IsZero() &&
			lease.ExpiresAt.After(now)
		if !usable {
			continue
		}
		// Replace only on a strict improvement so the earliest of equally
		// ranked leases is kept.
		if !found || outranks(lease, best) {
			best = lease
			found = true
		}
	}
	return best, found
}
// applyRendezvousLease rewrites intent so that it targets the relay named by
// lease: the endpoint becomes the relay endpoint (trimmed, without trailing
// slashes), the transport mode becomes relay control, and the rendezvous is
// marked resolved rather than required. The lease's connectivity mode
// replaces the intent's only when it is non-empty.
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) {
	relayEndpoint := strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")

	// Where to connect.
	intent.Endpoint = relayEndpoint
	intent.RelayEndpoint = relayEndpoint
	intent.RelayNodeID = lease.RelayNodeID
	intent.RendezvousLeaseID = lease.LeaseID

	// How to connect.
	intent.Transport = firstNonEmpty(lease.Transport, "relay_control")
	intent.TransportMode = PeerTransportModeRelayControl
	if lease.ConnectivityMode != "" {
		intent.ConnectivityMode = lease.ConnectivityMode
	}

	// Rendezvous bookkeeping.
	intent.RequiresRendezvous = false
	intent.RendezvousResolved = true
	intent.DirectCandidate = false
	intent.RelayCandidate = true
	intent.ControlPlaneOnly = true
}
// endpointHasPrivateHost reports whether the host of rawEndpoint is a literal
// IP address that is private, loopback or link-local unicast.
//
// rawEndpoint may be a URL, a host:port pair or a bare host. Host names are
// never resolved, so anything that is not an IP literal yields false.
func endpointHasPrivateHost(rawEndpoint string) bool {
	endpoint := strings.TrimSpace(rawEndpoint)
	if endpoint == "" {
		return false
	}

	// Prefer the URL authority when the endpoint parses as a URL with one.
	hostPort := endpoint
	if parsed, parseErr := url.Parse(endpoint); parseErr == nil && parsed.Host != "" {
		hostPort = parsed.Host
	}

	// Drop the port when present; otherwise keep the value unchanged.
	bareHost := hostPort
	if h, _, splitErr := net.SplitHostPort(hostPort); splitErr == nil {
		bareHost = h
	}

	ip, err := netip.ParseAddr(strings.Trim(bareHost, "[]"))
	switch {
	case err != nil:
		return false
	case ip.IsPrivate(), ip.IsLoopback(), ip.IsLinkLocalUnicast():
		return true
	default:
		return false
	}
}
// lowerStringSet returns the set of trimmed, lower-cased values, omitting
// entries that are blank after trimming. The result is never nil.
func lowerStringSet(values []string) map[string]bool {
	set := make(map[string]bool)
	for i := range values {
		key := strings.ToLower(strings.TrimSpace(values[i]))
		if key == "" {
			continue
		}
		set[key] = true
	}
	return set
}
// firstNonEmpty returns the first value that is non-blank after trimming
// surrounding whitespace, in its trimmed form, or "" when there is none.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
@@ -0,0 +1,234 @@
package mesh
import (
"testing"
"time"
)
// TestPeerConnectionIntentsClassifyCorporateDirect checks that a ready peer
// whose cache entry carries the corp-lan and same-site tags yields a single
// maintain intent classified as corporate LAN with no rendezvous required.
func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
BestTransport: "direct_tcp_tls",
BestReachability: "private",
BestConnectivity: "direct",
BestPolicyTags: []string{"corp-lan", "same-site"},
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeSteady,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
ConnectionState: PeerConnectionReady,
Reason: "maintain_ready",
Priority: 100,
},
},
},
Now: now,
})
if plan.IntentCount != 1 || plan.MaintainCount != 1 || plan.CorporateLANCount != 1 {
t.Fatalf("unexpected plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.Action != PeerConnectionIntentMaintain || intent.TransportMode != PeerTransportModeCorporateLAN || intent.RequiresRendezvous {
t.Fatalf("unexpected corporate intent: %+v", intent)
}
}
// TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired checks
// that, with no leases supplied, an outbound-only peer and a relay-required
// peer both produce recover intents that still require rendezvous.
func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
BestTransport: "direct_tcp_tls",
BestReachability: "outbound_only",
BestConnectivity: "outbound_only",
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 90,
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_seed",
Priority: 80,
},
},
},
Now: now,
})
if plan.RecoverCount != 2 || plan.OutboundOnlyCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 2 {
t.Fatalf("unexpected rendezvous counts: %+v", plan)
}
if plan.Intents[0].Action != PeerConnectionIntentRecover || plan.Intents[1].Action != PeerConnectionIntentRecover {
t.Fatalf("unexpected actions: %+v", plan.Intents)
}
}
// TestPeerConnectionIntentsResolveRendezvousWithRelayLease checks that a
// relay-required peer with a valid, unexpired control-plane lease is rewritten
// to a relay-control intent that points at the relay endpoint and is marked
// resolved instead of rendezvous-required.
func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 100,
},
},
},
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
},
Now: now,
})
if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
intent.Endpoint != "http://node-r:19000" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
!intent.RelayCandidate ||
!intent.RendezvousResolved ||
intent.RequiresRendezvous {
t.Fatalf("unexpected resolved rendezvous intent: %+v", intent)
}
}
// TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect checks that
// an expired lease is ignored even though its priority value is more
// preferred, and that the remaining active lease is selected instead.
func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
ConnectionState: PeerConnectionWaiting,
Reason: "recover_warm",
Priority: 100,
},
},
},
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-expired-preferred",
PeerNodeID: "node-b",
RelayNodeID: "node-r-old",
RelayEndpoint: "http://node-r-old:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 1,
ControlPlaneOnly: true,
IssuedAt: now.Add(-10 * time.Minute),
ExpiresAt: now.Add(-time.Second),
},
{
LeaseID: "lease-active-reselected",
PeerNodeID: "node-b",
RelayNodeID: "node-r-new",
RelayEndpoint: "http://node-r-new:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 20,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
},
Now: now,
})
if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected reselected plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.RendezvousLeaseID != "lease-active-reselected" ||
intent.RelayNodeID != "node-r-new" ||
intent.Endpoint != "http://node-r-new:19000" {
t.Fatalf("expired lease was not skipped: %+v", intent)
}
}
// TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints checks
// that a peer with no transport, reachability or tag hints is still classified
// as private LAN (and a direct candidate) from its private endpoint address.
func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) {
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"},
}},
RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://192.168.10.20:19001",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_peer",
Priority: 10,
},
}},
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
})
if plan.PrivateLANCount != 1 || plan.Intents[0].TransportMode != PeerTransportModePrivateLAN || !plan.Intents[0].DirectCandidate {
t.Fatalf("unexpected private endpoint classification: %+v", plan)
}
}
@@ -0,0 +1,304 @@
package mesh
import (
"context"
"net/http"
"strings"
"sync"
"time"
)
// Link statuses reported in PeerConnectionProbeResult.LinkStatus.
const (
// The health message was sent without error.
PeerConnectionProbeReachable = "reachable"
// The health message failed; the failure is recorded in the tracker.
PeerConnectionProbeUnreachable = "unreachable"
// No probe was sent because rendezvous is still required or no usable
// endpoint/candidate exists.
PeerConnectionProbeDeferred = "deferred"
// No probe was sent because the tracker declined it (reported as
// "backoff_active").
PeerConnectionProbeSkipped = "skipped"
)
const (
// DefaultPeerConnectionProbeTimeout is used when the manager is
// configured with a non-positive ProbeTimeout.
DefaultPeerConnectionProbeTimeout = 2 * time.Second
)
// PeerConnectionManagerConfig carries the dependencies and tunables for
// NewPeerConnectionManager.
type PeerConnectionManagerConfig struct {
Local PeerIdentity
PeerCache *PeerCache
Tracker *PeerConnectionTracker
// RendezvousLeases is copied by the constructor.
RendezvousLeases []PeerRendezvousLease
// HTTPClient is optional; when nil the constructor builds a client whose
// timeout is the probe timeout plus one second.
HTTPClient *http.Client
// ProbeTimeout falls back to DefaultPeerConnectionProbeTimeout when <= 0.
ProbeTimeout time.Duration
// Now is optional; when nil the constructor uses time.Now in UTC.
Now func() time.Time
}
// PeerConnectionManager plans and executes peer health probes and keeps the
// outcome of the most recent cycle.
type PeerConnectionManager struct {
local PeerIdentity
peerCache *PeerCache
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
probeTimeout time.Duration
now func() time.Time
// mu is held when lastCycle is read or written, and by UpdatePeerConfig
// and peerConfigSnapshot when they access peerCache and rendezvousLeases.
mu sync.Mutex
lastCycle PeerConnectionManagerCycle
}
// PeerConnectionManagerCycle is the report of one ProbeOnce run: the plans
// that drove it, the per-intent results and counters over those results.
type PeerConnectionManagerCycle struct {
// Mode is the mode of the recovery plan computed for this cycle.
Mode string `json:"mode"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
ProbeTimeoutMs int `json:"probe_timeout_ms"`
IntentCount int `json:"intent_count"`
// Attempted counts results that are reachable or unreachable, i.e. probes
// that were actually sent.
Attempted int `json:"attempted"`
Succeeded int `json:"succeeded"`
Failed int `json:"failed"`
Deferred int `json:"deferred"`
Skipped int `json:"skipped"`
// The three counters below are copied from the intent plan.
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RelayControlCount int `json:"relay_control_count"`
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
// Results holds one entry per processed intent, in intent-plan order.
Results []PeerConnectionProbeResult `json:"results,omitempty"`
}
// PeerConnectionManagerSnapshot is the value returned by
// PeerConnectionManager.Snapshot; it exposes the most recently stored cycle.
type PeerConnectionManagerSnapshot struct {
LastCycle PeerConnectionManagerCycle `json:"last_cycle"`
}
// PeerConnectionProbeResult records what happened to a single intent during a
// probe cycle. Most fields are copied from the intent.
type PeerConnectionProbeResult struct {
NodeID string `json:"node_id"`
// LinkStatus is one of the PeerConnectionProbe* constants.
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
// ConnectionState is the tracker state after the outcome was recorded.
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
// LatencyMs is set only for reachable results.
LatencyMs int `json:"latency_ms,omitempty"`
// FailureReason carries the error text or the defer/skip reason.
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
// NewPeerConnectionManager builds a manager from cfg, filling in defaults:
// a non-positive ProbeTimeout becomes DefaultPeerConnectionProbeTimeout, a nil
// HTTPClient is replaced by a pooled client whose timeout is the probe timeout
// plus one second, and a nil Now becomes the current time in UTC. The
// rendezvous leases are copied so later changes to cfg do not affect the
// manager.
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
	manager := &PeerConnectionManager{
		local:            cfg.Local,
		peerCache:        cfg.PeerCache,
		tracker:          cfg.Tracker,
		rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
		httpClient:       cfg.HTTPClient,
		probeTimeout:     cfg.ProbeTimeout,
		now:              cfg.Now,
	}

	if manager.probeTimeout <= 0 {
		manager.probeTimeout = DefaultPeerConnectionProbeTimeout
	}
	if manager.httpClient == nil {
		transport := &http.Transport{
			MaxIdleConns:        64,
			MaxIdleConnsPerHost: 8,
			IdleConnTimeout:     90 * time.Second,
		}
		manager.httpClient = &http.Client{
			Transport: transport,
			// Slightly above the per-probe deadline, which is enforced
			// separately through the request context.
			Timeout: manager.probeTimeout + time.Second,
		}
	}
	if manager.now == nil {
		manager.now = func() time.Time { return time.Now().UTC() }
	}
	return manager
}
// ProbeOnce runs a single probe cycle and returns its report.
//
// It snapshots the peer cache and rendezvous leases, derives a recovery plan
// and an intent plan from them, and then processes the intents sequentially,
// recording every outcome in the tracker. The finished cycle is also stored
// so that Snapshot can return it. A nil manager, peer cache or tracker yields
// the zero cycle.
//
// Once ctx is cancelled or past its deadline, the remaining intents are left
// unprocessed. Probing them would fail immediately with a context error and
// be recorded as peer failures, pushing healthy peers towards degraded or
// backoff. In that case Results holds fewer entries than IntentCount.
func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionManagerCycle {
	// peerConfigSnapshot is nil-safe, so it may run before the nil check.
	peerCache, rendezvousLeases := m.peerConfigSnapshot()
	if m == nil || peerCache == nil || m.tracker == nil {
		return PeerConnectionManagerCycle{}
	}

	startedAt := normalizedNow(m.now())
	peerSnapshot := peerCache.Snapshot()
	recoveryPlan := PlanPeerRecovery(PeerRecoveryPlanConfig{
		PeerCache:          peerSnapshot,
		Connections:        m.tracker.Snapshot(),
		TargetReadyPeers:   DefaultStablePeerTarget,
		MaxProbeCandidates: DefaultRecoveryProbeLimit,
		Now:                startedAt,
	})
	intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
		PeerCache:        peerSnapshot,
		RecoveryPlan:     recoveryPlan,
		RendezvousLeases: rendezvousLeases,
		Now:              startedAt,
	})

	cycle := PeerConnectionManagerCycle{
		Mode:                    recoveryPlan.Mode,
		StartedAt:               startedAt,
		ProbeTimeoutMs:          int(m.probeTimeout.Milliseconds()),
		IntentCount:             intentPlan.IntentCount,
		RendezvousRequiredCount: intentPlan.RendezvousRequiredCount,
		RendezvousResolvedCount: intentPlan.RendezvousResolvedCount,
		RelayControlCount:       intentPlan.RelayControlCount,
		RecoveryPlan:            recoveryPlan,
		IntentPlan:              intentPlan,
		Results:                 make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
	}

	for _, intent := range intentPlan.Intents {
		// Stop early on cancellation so that it is not mistaken for peer
		// failure. The nil check keeps cycles without network probes
		// working for callers that pass a nil context.
		if ctx != nil && ctx.Err() != nil {
			break
		}
		result := m.probeIntent(ctx, intent)
		cycle.Results = append(cycle.Results, result)
		switch result.LinkStatus {
		case PeerConnectionProbeReachable:
			cycle.Attempted++
			cycle.Succeeded++
		case PeerConnectionProbeUnreachable:
			cycle.Attempted++
			cycle.Failed++
		case PeerConnectionProbeDeferred:
			cycle.Deferred++
		case PeerConnectionProbeSkipped:
			cycle.Skipped++
		}
	}

	cycle.CompletedAt = normalizedNow(m.now())
	m.mu.Lock()
	m.lastCycle = cycle
	m.mu.Unlock()
	return cycle
}
// Snapshot returns the most recently stored probe cycle. A nil manager
// returns the zero snapshot.
func (m *PeerConnectionManager) Snapshot() PeerConnectionManagerSnapshot {
	var snapshot PeerConnectionManagerSnapshot
	if m != nil {
		m.mu.Lock()
		snapshot.LastCycle = m.lastCycle
		m.mu.Unlock()
	}
	return snapshot
}
// UpdatePeerConfig replaces the peer cache and rendezvous leases used by
// subsequent probe cycles. The lease slice is copied, so the caller may keep
// modifying its own slice afterwards. It is a no-op on a nil manager.
func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvousLeases []PeerRendezvousLease) {
	if m == nil {
		return
	}
	leases := append([]PeerRendezvousLease{}, rendezvousLeases...)

	m.mu.Lock()
	m.peerCache, m.rendezvousLeases = peerCache, leases
	m.mu.Unlock()
}
// peerConfigSnapshot returns the current peer cache pointer and a private
// copy of the rendezvous leases, read under the manager's mutex. A nil
// manager returns (nil, nil).
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
	if m == nil {
		return nil, nil
	}
	m.mu.Lock()
	cache := m.peerCache
	leases := make([]PeerRendezvousLease, len(m.rendezvousLeases))
	copy(leases, m.rendezvousLeases)
	m.mu.Unlock()
	return cache, leases
}
// probeIntent processes one intent and reports the outcome.
//
// The intent is deferred when it still requires rendezvous, or when it has no
// endpoint or is neither a direct nor a relay candidate. It is skipped when
// the tracker's ShouldProbe declines it. Otherwise a health message is sent
// to the intent's endpoint with a deadline of probeTimeout, and the success
// or failure is recorded in the tracker. The returned result carries the
// tracker state after recording.
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
startedAt := normalizedNow(m.now())
result := PeerConnectionProbeResult{
NodeID: intent.NodeID,
Action: intent.Action,
Reason: intent.Reason,
Endpoint: intent.Endpoint,
TransportMode: intent.TransportMode,
RequiresRendezvous: intent.RequiresRendezvous,
RendezvousResolved: intent.RendezvousResolved,
DirectCandidate: intent.DirectCandidate,
RelayCandidate: intent.RelayCandidate,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
StartedAt: startedAt,
}
// Cache-entry view of the intent, passed to the tracker methods that take
// a PeerCacheEntry.
peer := PeerCacheEntry{
NodeID: intent.NodeID,
Endpoint: intent.Endpoint,
Warm: true,
WarmReason: intent.Reason,
RecoverySeed: intent.RecoverySeed,
BestCandidateID: intent.BestCandidateID,
BestTransport: intent.Transport,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
}
// Unresolved rendezvous: nothing can be dialled yet.
if intent.RequiresRendezvous {
result.LinkStatus = PeerConnectionProbeDeferred
result.FailureReason = "rendezvous_required"
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
result.CompletedAt = normalizedNow(m.now())
return result
}
// No endpoint, or neither a direct nor a relay path: also deferred. The
// reason names the relay when the intent is a relay candidate.
if strings.TrimSpace(intent.Endpoint) == "" || (!intent.DirectCandidate && !intent.RelayCandidate) {
result.LinkStatus = PeerConnectionProbeDeferred
result.FailureReason = "direct_candidate_unavailable"
if intent.RelayCandidate {
result.FailureReason = "relay_candidate_unavailable"
}
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
result.CompletedAt = normalizedNow(m.now())
return result
}
// The tracker declined the probe; report the current state unchanged.
if !m.tracker.ShouldProbe(intent.NodeID, startedAt) {
result.LinkStatus = PeerConnectionProbeSkipped
result.FailureReason = "backoff_active"
result.ConnectionState = m.connectionState(intent.NodeID)
result.CompletedAt = normalizedNow(m.now())
return result
}
m.tracker.BeginProbe(peer, startedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
defer cancel()
target := PeerIdentity{
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
}
// When probing through a relay, the health message is addressed to the
// relay node rather than to the peer itself.
if intent.RelayCandidate && intent.RelayNodeID != "" {
target.NodeID = intent.RelayNodeID
}
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
completedAt := normalizedNow(m.now())
if err != nil {
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = err.Error()
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
result.CompletedAt = completedAt
return result
}
// Latency is measured with the manager's clock and clamped at zero.
latency := int(completedAt.Sub(startedAt).Milliseconds())
if latency < 0 {
latency = 0
}
result.LinkStatus = PeerConnectionProbeReachable
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
}
result.CompletedAt = completedAt
return result
}
// connectionState returns the tracker's current state for nodeID, or a
// disconnected placeholder when the tracker has no entry for it.
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
	tracked := m.tracker.Snapshot().Entries
	for i := range tracked {
		if tracked[i].NodeID == nodeID {
			return tracked[i]
		}
	}
	return PeerConnectionState{NodeID: nodeID, State: PeerConnectionDisconnected}
}
// withHTTPClient returns a copy of c that uses httpClient. The receiver is
// passed by value, so the caller's Client is left untouched.
func (c Client) withHTTPClient(httpClient *http.Client) Client {
	configured := c
	configured.HTTPClient = httpClient
	return configured
}
@@ -0,0 +1,190 @@
package mesh
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestPeerConnectionManagerProbesDirectAndDefersRendezvous runs one cycle
// against a live test server for node-b and a relay-only node-c without a
// lease. It expects node-b to be probed successfully first and node-c to be
// deferred as rendezvous-required.
func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: server.URL,
Reachability: "private",
ConnectivityMode: "direct",
PolicyTags: []string{"corp-lan", "same-site"},
Priority: 1,
},
},
"node-c": {
{
EndpointID: "node-c-relay",
NodeID: "node-c",
Transport: "relay",
Address: "relay://fabric/node-c",
Reachability: "relay",
ConnectivityMode: "relay_required",
Priority: 1,
},
},
},
WarmPeerLimit: 2,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
ProbeTimeout: time.Second,
// Stepping clock: every call advances time by 10ms.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Deferred != 1 || cycle.RendezvousRequiredCount != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || snapshot.Waiting != 1 {
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
}
if cycle.Results[0].NodeID != "node-b" || cycle.Results[0].LinkStatus != PeerConnectionProbeReachable {
t.Fatalf("direct peer was not probed first: %+v", cycle.Results)
}
if cycle.Results[1].NodeID != "node-c" || cycle.Results[1].LinkStatus != PeerConnectionProbeDeferred {
t.Fatalf("relay peer was not deferred: %+v", cycle.Results)
}
}
// TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff probes
// the endpoint http://127.0.0.1:1 three times, expects the tracker to report
// one peer in backoff, and then expects a fourth cycle to attempt nothing
// and produce no results.
func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "http://127.0.0.1:1",
},
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 20 * time.Millisecond},
ProbeTimeout: 20 * time.Millisecond,
// Stepping clock: every call advances time by 10ms.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
for i := 0; i < 3; i++ {
manager.ProbeOnce(context.Background())
}
backoff := tracker.Snapshot()
if backoff.Backoff != 1 {
t.Fatalf("expected backoff after repeated failures: %+v", backoff)
}
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 0 || len(cycle.Results) != 0 {
t.Fatalf("active backoff peer should not be attempted: %+v", cycle)
}
}
// TestPeerConnectionManagerProbesRelayControlLease gives relay-only node-b a
// valid lease through relay node-r (served by a test server). It expects the
// cycle to probe the relay successfully and the tracker to end up with node-b
// in the relay_ready state.
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
}.Handler())
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
leases := []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: server.URL,
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Address: "relay://fabric/node-b",
Reachability: "relay",
ConnectivityMode: "relay_required",
Priority: 10,
},
},
},
RendezvousLeases: leases,
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
RendezvousLeases: leases,
ProbeTimeout: time.Second,
// Stepping clock: every call advances time by 10ms.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 ||
cycle.Succeeded != 1 ||
cycle.Deferred != 0 ||
cycle.RelayControlCount != 1 ||
cycle.RendezvousResolvedCount != 1 ||
cycle.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control cycle: %+v", cycle)
}
if len(cycle.Results) != 1 ||
cycle.Results[0].NodeID != "node-b" ||
cycle.Results[0].RelayNodeID != "node-r" ||
cycle.Results[0].ConnectionState.State != PeerConnectionRelayReady {
t.Fatalf("unexpected relay-control result: %+v", cycle.Results)
}
snapshot := tracker.Snapshot()
if snapshot.RelayReady != 1 || snapshot.Waiting != 0 {
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
}
}
@@ -0,0 +1,284 @@
package mesh
import (
"sort"
"sync"
"time"
)
// Values of PeerConnectionState.State.
const (
// Initial state of every peer registered by NewPeerConnectionTracker.
PeerConnectionDisconnected = "disconnected"
// Set by BeginProbe while a probe is in flight.
PeerConnectionConnecting = "connecting"
// Set by RecordSuccess when latency is below 500ms.
PeerConnectionReady = "ready"
// Set by RecordRelayReady.
PeerConnectionRelayReady = "relay_ready"
// Set by RecordSuccess at 500ms latency or more, and by RecordFailure
// before the third consecutive failure.
PeerConnectionDegraded = "degraded"
// Set by RecordFailure from the third consecutive failure onwards.
PeerConnectionBackoff = "backoff"
// Set by RecordDeferred.
PeerConnectionWaiting = "waiting_rendezvous"
)
// NOTE(review): these two values are not referenced in the visible part of
// this file; presumably they bound peerConnectionBackoffDuration — confirm.
const (
peerConnectionBackoffBase = 5 * time.Second
peerConnectionBackoffMax = time.Minute
)
// PeerConnectionTracker holds the connection state of each known peer, keyed
// by node ID. Its methods lock mu around every access to entries.
type PeerConnectionTracker struct {
mu sync.Mutex
entries map[string]PeerConnectionState
}
// PeerConnectionState is the tracker's record for a single peer.
type PeerConnectionState struct {
NodeID string `json:"node_id"`
// State is one of the PeerConnection* state constants.
State string `json:"state"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
// Each counter is reset to zero when the opposite outcome is recorded.
ConsecutiveSuccesses int `json:"consecutive_successes"`
ConsecutiveFailures int `json:"consecutive_failures"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
LastTransitionAt time.Time `json:"last_transition_at"`
LastProbeAt time.Time `json:"last_probe_at,omitempty"`
// BackoffUntil is set by RecordFailure on entering backoff; ShouldProbe
// declines probes while the state is backoff and this time is in the future.
BackoffUntil time.Time `json:"backoff_until,omitempty"`
}
// PeerConnectionSnapshot is a point-in-time copy of the tracker: every entry
// plus per-state totals.
type PeerConnectionSnapshot struct {
Total int `json:"total"`
Ready int `json:"ready"`
RelayReady int `json:"relay_ready"`
Degraded int `json:"degraded"`
Backoff int `json:"backoff"`
Waiting int `json:"waiting_rendezvous"`
Connecting int `json:"connecting"`
Disconnected int `json:"disconnected"`
// StateCounts maps each state name to the number of entries in it.
StateCounts map[string]int `json:"state_counts"`
// Entries is sorted by NodeID.
Entries []PeerConnectionState `json:"entries"`
// LastTransitionAt is the latest LastTransitionAt across all entries.
LastTransitionAt time.Time `json:"last_transition_at,omitempty"`
}
// NewPeerConnectionTracker creates a tracker seeded with one disconnected
// entry for every warm peer in peerSnapshot that has a node ID. Each seeded
// entry records now (after normalization) as its last transition time.
func NewPeerConnectionTracker(peerSnapshot PeerCacheSnapshot, now time.Time) *PeerConnectionTracker {
	createdAt := normalizedNow(now)
	seeded := map[string]PeerConnectionState{}
	for i := range peerSnapshot.Entries {
		peer := peerSnapshot.Entries[i]
		if peer.Warm && peer.NodeID != "" {
			seeded[peer.NodeID] = PeerConnectionState{
				NodeID:           peer.NodeID,
				State:            PeerConnectionDisconnected,
				Warm:             peer.Warm,
				WarmReason:       peer.WarmReason,
				Endpoint:         peer.Endpoint,
				BestCandidateID:  peer.BestCandidateID,
				LastTransitionAt: createdAt,
			}
		}
	}
	return &PeerConnectionTracker{entries: seeded}
}
// ShouldProbe reports whether nodeID may be probed at now. It returns false
// only while the peer is in the backoff state with a non-zero BackoffUntil
// that lies strictly after now. A nil tracker and unknown peers are always
// probeable.
func (t *PeerConnectionTracker) ShouldProbe(nodeID string, now time.Time) bool {
	if t == nil {
		return true
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	state, tracked := t.entries[nodeID]
	if !tracked {
		return true
	}
	current := normalizedNow(now)
	backingOff := state.State == PeerConnectionBackoff &&
		!state.BackoffUntil.IsZero() &&
		state.BackoffUntil.After(current)
	return !backingOff
}
// BeginProbe marks the start of a probe for peer and returns the updated
// state. A nil tracker returns the zero state.
//
// Peers with an established link (ready, relay_ready or degraded) keep their
// state while being re-probed; every other state moves to connecting.
// relay_ready is kept like ready so that a healthy relay link does not flap
// to connecting and back on every cycle. That flapping would bump
// LastTransitionAt each time and defeat the state-change check in
// RecordRelayReady. LastProbeAt is always set to now.
func (t *PeerConnectionTracker) BeginProbe(peer PeerCacheEntry, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	now = normalizedNow(now)
	entry := t.entry(peer, now)
	switch entry.State {
	case PeerConnectionReady, PeerConnectionRelayReady, PeerConnectionDegraded:
		// Established link: keep the state and its transition time.
	default:
		entry.State = PeerConnectionConnecting
		entry.LastTransitionAt = now
	}
	entry.LastProbeAt = now
	t.entries[peer.NodeID] = entry
	return entry
}
// RecordSuccess records a successful probe of nodeID and returns the updated
// state. The peer becomes ready, or degraded when latencyMs is 500 or more.
// The failure counter, failure reason and backoff deadline are cleared, and
// LastTransitionAt changes only when the state changes. A nil tracker returns
// the zero state.
func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	recordedAt := normalizedNow(now)
	state := t.entries[nodeID]

	target := PeerConnectionReady
	if latencyMs >= 500 {
		target = PeerConnectionDegraded
	}
	if state.State != target {
		state.State = target
		state.LastTransitionAt = recordedAt
	}

	state.NodeID = nodeID
	state.ConsecutiveSuccesses++
	state.ConsecutiveFailures = 0
	state.LastLatencyMs = latencyMs
	state.LastFailureReason = ""
	state.LastProbeAt = recordedAt
	state.BackoffUntil = time.Time{}

	t.entries[nodeID] = state
	return state
}
// RecordRelayReady records a successful probe made through a relay and
// returns the updated state. The peer becomes relay_ready; the failure
// counter, failure reason and backoff deadline are cleared, and
// LastTransitionAt changes only when the state changes. A nil tracker returns
// the zero state.
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	recordedAt := normalizedNow(now)
	state := t.entry(peer, recordedAt)

	if state.State != PeerConnectionRelayReady {
		state.State = PeerConnectionRelayReady
		state.LastTransitionAt = recordedAt
	}
	state.ConsecutiveSuccesses++
	state.ConsecutiveFailures = 0
	state.LastLatencyMs = latencyMs
	state.LastFailureReason = ""
	state.LastProbeAt = recordedAt
	state.BackoffUntil = time.Time{}

	t.entries[peer.NodeID] = state
	return state
}
// RecordFailure stores a failed probe of nodeID observed at now and returns
// the updated state. The success streak is reset and the failure streak
// incremented. Fewer than three consecutive failures leave the peer degraded;
// from the third failure on the peer enters backoff and BackoffUntil is set
// from peerConnectionBackoffDuration. A nil tracker returns the zero
// PeerConnectionState.
func (t *PeerConnectionTracker) RecordFailure(nodeID string, reason string, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	at := normalizedNow(now)
	state := t.entries[nodeID]
	state.NodeID = nodeID
	state.ConsecutiveSuccesses = 0
	state.ConsecutiveFailures++
	state.LastProbeAt = at
	state.LastFailureReason = reason

	target := PeerConnectionDegraded
	if failures := state.ConsecutiveFailures; failures >= 3 {
		target = PeerConnectionBackoff
		state.BackoffUntil = at.Add(peerConnectionBackoffDuration(failures))
	}
	if state.State != target {
		state.LastTransitionAt = at
		state.State = target
	}

	t.entries[nodeID] = state
	return state
}
// RecordDeferred marks peer as waiting at now, storing reason as the last
// failure reason, and returns the updated state. The last-probe time and the
// backoff deadline are cleared and LastTransitionAt is always set to now.
// A nil tracker returns the zero PeerConnectionState.
func (t *PeerConnectionTracker) RecordDeferred(peer PeerCacheEntry, reason string, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	at := normalizedNow(now)
	var unset time.Time

	state := t.entry(peer, at)
	state.State = PeerConnectionWaiting
	state.LastTransitionAt = at
	state.LastFailureReason = reason
	state.LastProbeAt = unset
	state.BackoffUntil = unset

	t.entries[peer.NodeID] = state
	return state
}
// Snapshot returns a point-in-time copy of every tracked peer, sorted by node
// ID, together with per-state counts and the most recent transition time.
// StateCounts always contains the seven known states (possibly at zero). A
// nil tracker yields a snapshot holding only an empty StateCounts map.
func (t *PeerConnectionTracker) Snapshot() PeerConnectionSnapshot {
	if t == nil {
		return PeerConnectionSnapshot{StateCounts: map[string]int{}}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	// Seed every known state so that absent states are reported as zero.
	stateCounts := make(map[string]int, 7)
	for _, name := range []string{
		PeerConnectionDisconnected,
		PeerConnectionConnecting,
		PeerConnectionReady,
		PeerConnectionRelayReady,
		PeerConnectionDegraded,
		PeerConnectionBackoff,
		PeerConnectionWaiting,
	} {
		stateCounts[name] = 0
	}

	states := make([]PeerConnectionState, 0, len(t.entries))
	var newest time.Time
	for _, state := range t.entries {
		stateCounts[state.State]++
		if state.LastTransitionAt.After(newest) {
			newest = state.LastTransitionAt
		}
		states = append(states, state)
	}
	// Map iteration order is random; sort for a deterministic result.
	sort.SliceStable(states, func(a, b int) bool {
		return states[a].NodeID < states[b].NodeID
	})

	var snapshot PeerConnectionSnapshot
	snapshot.Total = len(states)
	snapshot.Ready = stateCounts[PeerConnectionReady]
	snapshot.RelayReady = stateCounts[PeerConnectionRelayReady]
	snapshot.Degraded = stateCounts[PeerConnectionDegraded]
	snapshot.Backoff = stateCounts[PeerConnectionBackoff]
	snapshot.Waiting = stateCounts[PeerConnectionWaiting]
	snapshot.Connecting = stateCounts[PeerConnectionConnecting]
	snapshot.Disconnected = stateCounts[PeerConnectionDisconnected]
	snapshot.StateCounts = stateCounts
	snapshot.Entries = states
	snapshot.LastTransitionAt = newest
	return snapshot
}
// entry returns the tracked state for peer with its peer-derived fields
// refreshed from the cache entry. An untracked peer starts as disconnected
// with LastTransitionAt set to now. The result is not stored; it does no
// locking of its own (its callers in this file hold t.mu).
func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerConnectionState {
	state, tracked := t.entries[peer.NodeID]
	if !tracked {
		// state is the zero value here, so only the non-zero fields are set.
		state.NodeID = peer.NodeID
		state.State = PeerConnectionDisconnected
		state.LastTransitionAt = now
	}

	state.Endpoint = peer.Endpoint
	state.Warm = peer.Warm
	state.WarmReason = peer.WarmReason
	state.BestCandidateID = peer.BestCandidateID
	state.RendezvousLeaseID = peer.RendezvousLeaseID
	state.RelayNodeID = peer.RelayNodeID
	state.RelayEndpoint = peer.RelayEndpoint
	state.RelayControl = peer.RelayControl
	return state
}
// peerConnectionBackoffDuration returns how long a peer stays in backoff
// after the given number of consecutive failures. Fewer than three failures
// yield zero; otherwise the duration grows linearly by
// peerConnectionBackoffBase per failure beyond the second and is capped at
// peerConnectionBackoffMax.
func peerConnectionBackoffDuration(failures int) time.Duration {
	if failures < 3 {
		return 0
	}
	if wait := peerConnectionBackoffBase * time.Duration(failures-2); wait <= peerConnectionBackoffMax {
		return wait
	}
	return peerConnectionBackoffMax
}
// normalizedNow returns now converted to UTC, substituting the current wall
// clock time when now is the zero time.
func normalizedNow(now time.Time) time.Time {
	instant := now
	if instant.IsZero() {
		instant = time.Now()
	}
	return instant.UTC()
}
@@ -0,0 +1,76 @@
package mesh
import (
"testing"
"time"
)
func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"},
},
}, now)
begin := tracker.BeginProbe(PeerCacheEntry{NodeID: "node-b", Warm: true}, now.Add(time.Second))
if begin.State != PeerConnectionConnecting {
t.Fatalf("begin state = %q, want connecting", begin.State)
}
ready := tracker.RecordSuccess("node-b", 42, now.Add(2*time.Second))
if ready.State != PeerConnectionReady || ready.ConsecutiveSuccesses != 1 || ready.ConsecutiveFailures != 0 {
t.Fatalf("ready state unexpected: %+v", ready)
}
degraded := tracker.RecordSuccess("node-b", 800, now.Add(3*time.Second))
if degraded.State != PeerConnectionDegraded || degraded.LastLatencyMs != 800 {
t.Fatalf("degraded state unexpected: %+v", degraded)
}
}
func TestPeerConnectionTrackerBackoffAfterRepeatedFailures(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{{NodeID: "node-b", Warm: true}},
}, now)
first := tracker.RecordFailure("node-b", "timeout", now.Add(time.Second))
if first.State != PeerConnectionDegraded {
t.Fatalf("first failure state = %q, want degraded", first.State)
}
_ = tracker.RecordFailure("node-b", "timeout", now.Add(2*time.Second))
third := tracker.RecordFailure("node-b", "timeout", now.Add(3*time.Second))
if third.State != PeerConnectionBackoff || third.BackoffUntil.IsZero() {
t.Fatalf("third failure did not enter backoff: %+v", third)
}
if tracker.ShouldProbe("node-b", now.Add(4*time.Second)) {
t.Fatal("ShouldProbe returned true during backoff")
}
if !tracker.ShouldProbe("node-b", third.BackoffUntil.Add(time.Millisecond)) {
t.Fatal("ShouldProbe returned false after backoff")
}
recovered := tracker.RecordSuccess("node-b", 12, third.BackoffUntil.Add(time.Second))
if recovered.State != PeerConnectionReady || recovered.ConsecutiveFailures != 0 || !recovered.BackoffUntil.IsZero() {
t.Fatalf("success did not recover from backoff: %+v", recovered)
}
}
func TestPeerConnectionTrackerSnapshotCountsStates(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-a", Warm: true},
{NodeID: "node-b", Warm: true},
{NodeID: "node-c", Warm: true},
},
}, now)
tracker.RecordSuccess("node-a", 25, now.Add(time.Second))
tracker.RecordFailure("node-b", "timeout", now.Add(time.Second))
tracker.RecordFailure("node-c", "timeout", now.Add(time.Second))
tracker.RecordFailure("node-c", "timeout", now.Add(2*time.Second))
tracker.RecordFailure("node-c", "timeout", now.Add(3*time.Second))
snapshot := tracker.Snapshot()
if snapshot.Total != 3 || snapshot.Ready != 1 || snapshot.Degraded != 1 || snapshot.Backoff != 1 {
t.Fatalf("unexpected snapshot: %+v", snapshot)
}
}
@@ -0,0 +1,276 @@
package mesh
import (
"sort"
"strings"
"time"
)
// Planner modes reported in PeerRecoveryPlan.Mode.
const (
// PeerRecoveryModeSteady is chosen by PlanPeerRecovery when the ready-peer
// deficit is zero.
PeerRecoveryModeSteady = "steady"
// PeerRecoveryModeRecovery is chosen by PlanPeerRecovery when fewer peers are
// ready than the (capped) target.
PeerRecoveryModeRecovery = "recovery"
)
// Defaults applied by PlanPeerRecovery when the corresponding config value is
// zero or negative.
const (
// DefaultStablePeerTarget replaces PeerRecoveryPlanConfig.TargetReadyPeers.
DefaultStablePeerTarget = 3
// DefaultRecoveryProbeLimit replaces PeerRecoveryPlanConfig.MaxProbeCandidates.
DefaultRecoveryProbeLimit = 6
)
// PeerRecoveryPlanConfig is the input to PlanPeerRecovery.
type PeerRecoveryPlanConfig struct {
// PeerCache lists the known peers; only entries with a non-blank NodeID and
// Endpoint are considered connectable.
PeerCache PeerCacheSnapshot
// Connections carries the current per-peer connection states.
Connections PeerConnectionSnapshot
// TargetReadyPeers is the desired number of ready peers; values <= 0 fall
// back to DefaultStablePeerTarget, and the target is capped at the number of
// connectable peers.
TargetReadyPeers int
// MaxProbeCandidates bounds the returned candidates; values <= 0 fall back to
// DefaultRecoveryProbeLimit and the limit is never below the target.
MaxProbeCandidates int
// Now is the planning time; the zero value means the current time.
Now time.Time
}
// PeerRecoveryPlan is the result of PlanPeerRecovery: the ready-peer health
// summary plus the prioritized peers selected for probing.
type PeerRecoveryPlan struct {
// Mode is PeerRecoveryModeSteady or PeerRecoveryModeRecovery.
Mode string `json:"mode"`
// Healthy is true when Deficit is zero.
Healthy bool `json:"healthy"`
// TargetReadyPeers is the effective target after defaulting and capping.
TargetReadyPeers int `json:"target_ready_peers"`
// ReadyPeerCount counts connectable peers in the ready or relay-ready state.
ReadyPeerCount int `json:"ready_peer_count"`
DegradedPeerCount int `json:"degraded_peer_count"`
BackoffPeerCount int `json:"backoff_peer_count"`
// ConnectablePeerCount counts cache entries with a NodeID and an Endpoint.
ConnectablePeerCount int `json:"connectable_peer_count"`
// Deficit is TargetReadyPeers minus ReadyPeerCount, floored at zero.
Deficit int `json:"deficit"`
// ProbeCandidateCount equals len(Candidates).
ProbeCandidateCount int `json:"probe_candidate_count"`
// RecoverySeedCandidateCount counts Candidates flagged as recovery seeds.
RecoverySeedCandidateCount int `json:"recovery_seed_candidate_count"`
// GeneratedAt is the normalized (UTC) planning time.
GeneratedAt time.Time `json:"generated_at"`
// Candidates is ordered by descending Priority, then ascending NodeID.
Candidates []PeerRecoveryCandidate `json:"candidates,omitempty"`
}
// PeerRecoveryCandidate describes one peer selected by PlanPeerRecovery. The
// peer-level fields are copied from the peer cache entry and the connection
// fields from the peer's tracked connection state.
type PeerRecoveryCandidate struct {
NodeID string `json:"node_id"`
// Endpoint is the cache entry's endpoint with surrounding whitespace removed.
Endpoint string `json:"endpoint,omitempty"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
// ConnectionState is the tracked state, or disconnected when none is tracked.
ConnectionState string `json:"connection_state"`
ConsecutiveFailures int `json:"consecutive_failures,omitempty"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
BackoffUntil time.Time `json:"backoff_until,omitempty"`
// Reason is the value returned by peerRecoveryCandidateReason
// (for example "maintain_ready" or "recover_seed").
Reason string `json:"reason"`
// Priority is the score from peerRecoveryCandidatePriority; higher sorts first.
Priority int `json:"priority"`
}
// peerRecoveryCandidateBuild wraps a PeerRecoveryCandidate by embedding it and
// currently adds no fields of its own.
type peerRecoveryCandidateBuild struct {
PeerRecoveryCandidate
}
// PlanPeerRecovery decides which peers should be probed to keep (steady mode)
// or regain (recovery mode) the desired number of ready peers.
//
// The target defaults to DefaultStablePeerTarget and is capped at the number
// of connectable peers; the candidate limit defaults to
// DefaultRecoveryProbeLimit, is never below the target, and equals the target
// in steady mode. Peers without a node ID or endpoint, and peers whose backoff
// deadline is still in the future, are never candidates. Candidates are
// ordered by descending priority, then ascending node ID, and truncated to
// the limit.
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
	generatedAt := normalizedNow(cfg.Now)

	// Resolve defaults, then clamp target and limit against reality.
	targetReady := cfg.TargetReadyPeers
	if targetReady <= 0 {
		targetReady = DefaultStablePeerTarget
	}
	probeLimit := cfg.MaxProbeCandidates
	if probeLimit <= 0 {
		probeLimit = DefaultRecoveryProbeLimit
	}
	connectableCount := connectablePeerCount(cfg.PeerCache)
	if connectableCount < targetReady {
		targetReady = connectableCount
	}
	if probeLimit < targetReady {
		probeLimit = targetReady
	}

	// Index connection states and cache entries by node ID.
	stateByNode := map[string]PeerConnectionState{}
	for _, state := range cfg.Connections.Entries {
		if strings.TrimSpace(state.NodeID) != "" {
			stateByNode[state.NodeID] = state
		}
	}
	peerByNode := map[string]PeerCacheEntry{}
	for _, peer := range cfg.PeerCache.Entries {
		if strings.TrimSpace(peer.NodeID) != "" {
			peerByNode[peer.NodeID] = peer
		}
	}

	// Tally states, counting only peers that are still connectable.
	var readyCount, degradedCount, backoffCount int
	for nodeID, state := range stateByNode {
		peer, cached := peerByNode[nodeID]
		if !cached || strings.TrimSpace(peer.Endpoint) == "" {
			continue
		}
		switch state.State {
		case PeerConnectionReady, PeerConnectionRelayReady:
			readyCount++
		case PeerConnectionDegraded:
			degradedCount++
		case PeerConnectionBackoff:
			backoffCount++
		}
	}

	deficit := targetReady - readyCount
	if deficit < 0 {
		deficit = 0
	}
	mode := PeerRecoveryModeRecovery
	if deficit == 0 {
		// Steady mode only maintains as many peers as the target asks for.
		mode = PeerRecoveryModeSteady
		probeLimit = targetReady
	}

	built := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
	for _, peer := range cfg.PeerCache.Entries {
		endpoint := strings.TrimSpace(peer.Endpoint)
		if strings.TrimSpace(peer.NodeID) == "" || endpoint == "" {
			continue
		}
		state := stateByNode[peer.NodeID]
		if state.State == "" {
			state.State = PeerConnectionDisconnected
		}
		if state.State == PeerConnectionBackoff && state.BackoffUntil.After(generatedAt) {
			// Still backing off: leave the peer alone for now.
			continue
		}
		reason, eligible := peerRecoveryCandidateReason(mode, peer, state)
		if !eligible {
			continue
		}
		built = append(built, peerRecoveryCandidateBuild{
			PeerRecoveryCandidate: PeerRecoveryCandidate{
				NodeID:              peer.NodeID,
				Endpoint:            endpoint,
				Warm:                peer.Warm,
				WarmReason:          peer.WarmReason,
				RecoverySeed:        peer.RecoverySeed,
				BestCandidateID:     peer.BestCandidateID,
				BestTransport:       peer.BestTransport,
				ConnectionState:     state.State,
				ConsecutiveFailures: state.ConsecutiveFailures,
				LastLatencyMs:       state.LastLatencyMs,
				BackoffUntil:        state.BackoffUntil,
				Reason:              reason,
				Priority:            peerRecoveryCandidatePriority(peer, state, reason),
			},
		})
	}

	sort.SliceStable(built, func(i, j int) bool {
		left, right := built[i], built[j]
		if left.Priority == right.Priority {
			return left.NodeID < right.NodeID
		}
		return left.Priority > right.Priority
	})
	if probeLimit < len(built) {
		built = built[:probeLimit]
	}

	selected := make([]PeerRecoveryCandidate, 0, len(built))
	seedCount := 0
	for _, item := range built {
		if item.RecoverySeed {
			seedCount++
		}
		selected = append(selected, item.PeerRecoveryCandidate)
	}

	var plan PeerRecoveryPlan
	plan.Mode = mode
	plan.Healthy = deficit == 0
	plan.TargetReadyPeers = targetReady
	plan.ReadyPeerCount = readyCount
	plan.DegradedPeerCount = degradedCount
	plan.BackoffPeerCount = backoffCount
	plan.ConnectablePeerCount = connectableCount
	plan.Deficit = deficit
	plan.ProbeCandidateCount = len(selected)
	plan.RecoverySeedCandidateCount = seedCount
	plan.GeneratedAt = generatedAt
	plan.Candidates = selected
	return plan
}
// peerRecoveryCandidateReason returns why a peer should be probed and whether
// it is eligible at all. Ready and relay-ready peers are always maintained.
// In steady mode nothing else is eligible; in recovery mode every other peer
// is eligible, labelled by the first match of: degraded connection, warm
// peer, recovery seed, plain peer.
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) {
	switch connection.State {
	case PeerConnectionReady, PeerConnectionRelayReady:
		return "maintain_ready", true
	}
	if mode == PeerRecoveryModeSteady {
		return "", false
	}
	switch {
	case connection.State == PeerConnectionDegraded:
		return "recover_degraded", true
	case entry.Warm:
		return "recover_warm", true
	case entry.RecoverySeed:
		return "recover_seed", true
	}
	return "recover_peer", true
}
// peerRecoveryCandidatePriority scores a candidate; higher scores are probed
// first. The score adds bonuses for warm peers, the warm reason, recovery
// seeds, a known best candidate (plus a tenth of its score), the connection
// state and the selection reason, then subtracts a tenth of the last observed
// latency. The result is never negative.
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int {
	warmReasonBonus := 0
	switch entry.WarmReason {
	case "route_adjacent":
		warmReasonBonus = 500
	case "recovery_seed":
		warmReasonBonus = 350
	case "endpoint_candidate":
		warmReasonBonus = 200
	case "peer_endpoint":
		warmReasonBonus = 100
	}

	stateBonus := 0
	switch connection.State {
	case PeerConnectionReady, PeerConnectionRelayReady:
		stateBonus = 600
	case PeerConnectionDegraded:
		stateBonus = 350
	case PeerConnectionConnecting:
		stateBonus = 200
	case PeerConnectionDisconnected:
		stateBonus = 100
	}

	reasonBonus := 0
	switch reason {
	case "maintain_ready":
		reasonBonus = 500
	case "recover_degraded":
		reasonBonus = 300
	case "recover_seed":
		reasonBonus = 250
	case "recover_warm":
		reasonBonus = 150
	}

	total := warmReasonBonus + stateBonus + reasonBonus + entry.BestCandidateScore/10
	if entry.Warm {
		total += 1000
	}
	if entry.RecoverySeed {
		total += 250
	}
	if entry.BestCandidateID != "" {
		total += 150
	}
	// Slower peers rank lower; latency is only known when positive.
	if latency := connection.LastLatencyMs; latency > 0 {
		total -= latency / 10
	}
	if total < 0 {
		return 0
	}
	return total
}
// connectablePeerCount returns how many snapshot entries have both a
// non-blank node ID and a non-blank endpoint.
func connectablePeerCount(snapshot PeerCacheSnapshot) int {
	total := 0
	for _, peer := range snapshot.Entries {
		hasNodeID := strings.TrimSpace(peer.NodeID) != ""
		if hasNodeID && strings.TrimSpace(peer.Endpoint) != "" {
			total++
		}
	}
	return total
}
@@ -0,0 +1,139 @@
package mesh
import (
"testing"
"time"
)
func TestPeerRecoveryPlanMaintainsBoundedReadyPeers(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
recoveryPlanPeer("node-c", true, false, "peer_endpoint"),
recoveryPlanPeer("node-d", true, false, "peer_endpoint"),
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 40},
{NodeID: "node-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-c", State: PeerConnectionReady, LastLatencyMs: 30},
{NodeID: "node-d", State: PeerConnectionReady, LastLatencyMs: 10},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
t.Fatalf("unexpected plan health: %+v", plan)
}
if plan.TargetReadyPeers != DefaultStablePeerTarget || len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("unexpected bounded candidates: %+v", plan)
}
for _, candidate := range plan.Candidates {
if candidate.Reason != "maintain_ready" {
t.Fatalf("unexpected candidate reason: %+v", candidate)
}
}
}
func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
recoveryPlanPeer("node-seed", false, true, ""),
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-b", State: PeerConnectionBackoff, BackoffUntil: now.Add(time.Minute)},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy {
t.Fatalf("unexpected recovery mode: %+v", plan)
}
if plan.Deficit != 2 || plan.RecoverySeedCandidateCount != 1 {
t.Fatalf("unexpected deficit/seed count: %+v", plan)
}
if !recoveryPlanHasCandidate(plan, "node-seed", "recover_seed") {
t.Fatalf("recovery seed was not selected: %+v", plan.Candidates)
}
if recoveryPlanHasCandidate(plan, "node-b", "") {
t.Fatalf("active backoff peer should not be selected: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{
NodeID: "node-c",
Endpoint: "http://relay:19001",
Warm: true,
WarmReason: "rendezvous_lease",
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "http://relay:19001",
RelayControl: true,
},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-c", State: PeerConnectionRelayReady, LastLatencyMs: 15},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
t.Fatalf("unexpected steady plan: %+v", plan)
}
if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") {
t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-a", Warm: true, WarmReason: "route_adjacent"},
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
}},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-b", State: PeerConnectionReady},
}},
Now: now,
})
if plan.TargetReadyPeers != 1 || !plan.Healthy {
t.Fatalf("target should be capped by connectable peers: %+v", plan)
}
}
// recoveryPlanPeer builds a peer cache entry for tests whose endpoint is
// "http://<nodeID>:19001".
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
	var peer PeerCacheEntry
	peer.NodeID = nodeID
	peer.Endpoint = "http://" + nodeID + ":19001"
	peer.Warm = warm
	peer.WarmReason = warmReason
	peer.RecoverySeed = recoverySeed
	return peer
}
// recoveryPlanHasCandidate reports whether plan contains a candidate for
// nodeID. Only the first candidate with that node ID is inspected; an empty
// reason matches any reason.
func recoveryPlanHasCandidate(plan PeerRecoveryPlan, nodeID string, reason string) bool {
	for i := range plan.Candidates {
		if found := plan.Candidates[i]; found.NodeID == nodeID {
			return reason == "" || found.Reason == reason
		}
	}
	return false
}
@@ -0,0 +1,149 @@
package mesh
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"time"
)
// ValidateProductionEnvelope checks that envelope is a well-formed
// fabric-control message addressed to the local node at time now. Checks run
// in a fixed order and the first failure is returned: protocol version and
// identifiers, cluster and hop identity, the optional route path, channel
// class and message type, TTL and hop count, timestamps, and finally the
// payload length, size limit and SHA-256 hash. It returns nil when every
// check passes.
func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope, now time.Time) error {
	// Identity and addressing.
	switch {
	case envelope.FabricProtocolVersion != ProtocolVersion:
		return fmt.Errorf("%w: unsupported fabric_protocol_version", ErrForwardEnvelopeInvalid)
	case envelope.MessageID == "":
		return fmt.Errorf("%w: message_id is required", ErrForwardEnvelopeInvalid)
	case envelope.RouteID == "":
		return fmt.Errorf("%w: route_id is required", ErrForwardEnvelopeInvalid)
	case envelope.ClusterID == "", envelope.ClusterID != local.ClusterID:
		return ErrClusterMismatch
	case envelope.SourceNodeID == "", envelope.DestinationNodeID == "":
		return fmt.Errorf("%w: source_node_id and destination_node_id are required", ErrForwardEnvelopeInvalid)
	case envelope.CurrentHopNodeID != local.NodeID:
		return ErrNodeMismatch
	case envelope.NextHopNodeID == "":
		return fmt.Errorf("%w: next_hop_node_id is required", ErrForwardEnvelopeInvalid)
	}

	// The explicit route path is optional; validate it only when present.
	if len(envelope.RoutePath) != 0 {
		if err := validateProductionRoutePath(local, envelope); err != nil {
			return err
		}
	}

	// Channel, lifetime and payload framing.
	reference := now.UTC()
	switch {
	case envelope.ChannelClass != ProductionChannelFabricControl:
		return ErrUnauthorizedChannel
	case envelope.MessageType != ProductionMessageFabricControl:
		return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
	case envelope.TTL <= 0:
		return ErrTTLExhausted
	case envelope.HopCount < 0:
		return fmt.Errorf("%w: hop_count must not be negative", ErrForwardEnvelopeInvalid)
	case envelope.CreatedAt.IsZero(), envelope.ExpiresAt.IsZero():
		return fmt.Errorf("%w: created_at and expires_at are required", ErrForwardEnvelopeInvalid)
	case envelope.CreatedAt.After(reference.Add(MaxProductionEnvelopeFutureSkew)):
		return fmt.Errorf("%w: created_at exceeds allowed future skew", ErrForwardEnvelopeInvalid)
	case !envelope.ExpiresAt.After(reference):
		return ErrRouteExpired
	case envelope.PayloadLength != len(envelope.Payload):
		return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
	case envelope.PayloadLength > MaxProductionEnvelopePayloadBytes:
		return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
	case envelope.PayloadHash == "":
		return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
	}

	// The declared hash must be the lowercase hex SHA-256 of the payload.
	digest := sha256.Sum256(envelope.Payload)
	if hex.EncodeToString(digest[:]) != envelope.PayloadHash {
		return fmt.Errorf("%w: payload_hash mismatch", ErrForwardEnvelopeInvalid)
	}
	return nil
}
// validateProductionRoutePath checks the envelope's explicit route path from
// the local node's point of view: the path has at least two distinct,
// non-empty hops running from source to destination; the local node is on the
// path and is the current hop; the local node has not been visited already;
// every visited node lies on the path; and the next hop is the local node
// itself at the destination, or the local node's successor on the path
// otherwise.
func validateProductionRoutePath(local PeerIdentity, envelope ProductionEnvelope) error {
	path := envelope.RoutePath
	last := len(path) - 1
	if last < 1 {
		return ErrInvalidRoutePath
	}
	if path[0] != envelope.SourceNodeID || path[last] != envelope.DestinationNodeID {
		return ErrInvalidRoutePath
	}

	localIndex := -1
	onPath := make(map[string]struct{}, len(path))
	for position, hop := range path {
		if hop == "" {
			return ErrInvalidRoutePath
		}
		if _, repeated := onPath[hop]; repeated {
			return ErrLoopDetected
		}
		onPath[hop] = struct{}{}
		if hop == local.NodeID {
			localIndex = position
		}
	}
	if localIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
		return ErrNodeMismatch
	}

	if containsProductionNodeID(envelope.VisitedNodeIDs, local.NodeID) {
		return ErrLoopDetected
	}
	// onPath now holds exactly the hops of the route path, so it doubles as
	// the membership test for visited nodes.
	for _, visited := range envelope.VisitedNodeIDs {
		if visited == "" {
			return ErrInvalidRoutePath
		}
		if _, known := onPath[visited]; !known {
			return ErrInvalidRoutePath
		}
	}

	switch {
	case envelope.DestinationNodeID == local.NodeID:
		// At the destination the envelope must name the local node as next hop.
		if envelope.NextHopNodeID == local.NodeID {
			return nil
		}
	case localIndex < last && envelope.NextHopNodeID == path[localIndex+1]:
		return nil
	}
	return ErrInvalidRoutePath
}
// containsProductionNodeID reports whether needle occurs in values.
func containsProductionNodeID(values []string, needle string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == needle {
			return true
		}
	}
	return false
}
// NewProductionEnvelopeObservation captures the routing and payload metadata
// of envelope as observed at observedAt (stored in UTC). RoutePath and
// VisitedNodeIDs are copied into fresh, non-nil slices so the observation
// does not alias the envelope.
func NewProductionEnvelopeObservation(envelope ProductionEnvelope, observedAt time.Time) ProductionEnvelopeObservation {
	cloneIDs := func(ids []string) []string {
		out := make([]string, len(ids))
		copy(out, ids)
		return out
	}

	var observation ProductionEnvelopeObservation
	observation.MessageID = envelope.MessageID
	observation.RouteID = envelope.RouteID
	observation.ClusterID = envelope.ClusterID
	observation.SourceNodeID = envelope.SourceNodeID
	observation.DestinationNodeID = envelope.DestinationNodeID
	observation.CurrentHopNodeID = envelope.CurrentHopNodeID
	observation.NextHopNodeID = envelope.NextHopNodeID
	observation.RoutePath = cloneIDs(envelope.RoutePath)
	observation.VisitedNodeIDs = cloneIDs(envelope.VisitedNodeIDs)
	observation.ChannelClass = envelope.ChannelClass
	observation.MessageType = envelope.MessageType
	observation.TTL = envelope.TTL
	observation.HopCount = envelope.HopCount
	observation.PayloadLength = envelope.PayloadLength
	observation.PayloadHash = envelope.PayloadHash
	observation.ObservedAt = observedAt.UTC()
	return observation
}
@@ -0,0 +1,81 @@
package mesh
import (
"context"
"sync"
)
// ProductionEnvelopeObservationSink is a bounded in-memory buffer of envelope
// observations. Once the buffer holds capacity items, each new observation
// evicts the oldest one. Its methods serialize access through mu.
type ProductionEnvelopeObservationSink struct {
// mu is held by every method while it reads or writes the fields below.
mu sync.Mutex
// capacity is the maximum number of retained observations.
capacity int
// items holds the retained observations, oldest first.
items []ProductionEnvelopeObservation
// accepted counts every call to Observe.
accepted uint64
// dropped counts observations evicted to make room for newer ones.
dropped uint64
}
// ProductionEnvelopeObservationSinkMetrics is the counter snapshot returned by
// ProductionEnvelopeObservationSink.Metrics.
type ProductionEnvelopeObservationSinkMetrics struct {
// Capacity is the sink's maximum number of retained observations.
Capacity int `json:"capacity"`
// CurrentDepth is the number of observations currently retained.
CurrentDepth int `json:"current_depth"`
// AcceptedTotal is the number of observations passed to Observe so far.
AcceptedTotal uint64 `json:"accepted_total"`
// DroppedOldest is the number of observations evicted because the sink was full.
DroppedOldest uint64 `json:"dropped_oldest"`
}
// NewProductionEnvelopeObservationSink returns an empty sink that retains at
// most capacity observations. Capacities below one are raised to one.
func NewProductionEnvelopeObservationSink(capacity int) *ProductionEnvelopeObservationSink {
	size := capacity
	if size < 1 {
		size = 1
	}
	sink := new(ProductionEnvelopeObservationSink)
	sink.capacity = size
	sink.items = make([]ProductionEnvelopeObservation, 0, size)
	return sink
}
// Observe appends observation to the sink and always returns nil. When the
// sink already holds capacity items the oldest one is evicted and counted as
// dropped. The context argument is ignored.
//
// A sink that was not built by NewProductionEnvelopeObservationSink (for
// example a zero-value struct) has capacity 0; it is treated as capacity 1,
// matching the constructor's minimum, instead of panicking on the empty
// buffer.
func (s *ProductionEnvelopeObservationSink) Observe(_ context.Context, observation ProductionEnvelopeObservation) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.capacity < 1 {
		// Without this, len(items) == capacity == 0 would take the eviction
		// branch and slice an empty buffer out of range.
		s.capacity = 1
	}

	s.accepted++
	if len(s.items) >= s.capacity {
		// Full: shift everything one slot towards the front (dropping the
		// oldest item) and place the new observation last.
		copy(s.items, s.items[1:])
		s.items[len(s.items)-1] = observation
		s.dropped++
		return nil
	}
	s.items = append(s.items, observation)
	return nil
}
// Snapshot returns a copy of the retained observations, oldest first. The
// returned slice is never nil and is not affected by later Observe calls.
func (s *ProductionEnvelopeObservationSink) Snapshot() []ProductionEnvelopeObservation {
	s.mu.Lock()
	defer s.mu.Unlock()

	retained := make([]ProductionEnvelopeObservation, 0, len(s.items))
	return append(retained, s.items...)
}
// Len returns the number of observations currently retained.
func (s *ProductionEnvelopeObservationSink) Len() int {
	s.mu.Lock()
	depth := len(s.items)
	s.mu.Unlock()
	return depth
}
// Capacity returns the maximum number of observations the sink retains.
func (s *ProductionEnvelopeObservationSink) Capacity() int {
	s.mu.Lock()
	limit := s.capacity
	s.mu.Unlock()
	return limit
}
// Metrics returns a consistent snapshot of the sink's capacity, current depth
// and accepted/dropped counters.
func (s *ProductionEnvelopeObservationSink) Metrics() ProductionEnvelopeObservationSinkMetrics {
	s.mu.Lock()
	defer s.mu.Unlock()

	var metrics ProductionEnvelopeObservationSinkMetrics
	metrics.Capacity = s.capacity
	metrics.CurrentDepth = len(s.items)
	metrics.AcceptedTotal = s.accepted
	metrics.DroppedOldest = s.dropped
	return metrics
}
@@ -0,0 +1,80 @@
package mesh
import (
"fmt"
"time"
)
// ValidateProductionEnvelopeRouteConfig checks envelope against the
// configured routes. With no routes configured it accepts everything.
// Otherwise the envelope's route must exist, belong to the local cluster,
// match the envelope's source and destination, be unexpired and outlive the
// envelope, and allow the fabric-control channel. The envelope's path (which
// is mandatory for routes longer than two hops) must equal the configured
// path, the local node must be the current hop on it, the next hop must be
// the configured successor (or the local node at the destination), and TTL
// and hop count must respect the route's limits when those are set.
func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope ProductionEnvelope, routes []SyntheticRoute, now time.Time) error {
	if len(routes) == 0 {
		return nil
	}

	route, found := productionRouteByID(routes, envelope.RouteID)
	switch {
	case !found:
		return ErrRouteNotFound
	case route.ClusterID != envelope.ClusterID, route.ClusterID != local.ClusterID:
		return ErrClusterMismatch
	case route.SourceNodeID != envelope.SourceNodeID, route.DestinationNodeID != envelope.DestinationNodeID:
		return ErrInvalidRoutePath
	case route.ExpiresAt.IsZero(), !route.ExpiresAt.After(now.UTC()), envelope.ExpiresAt.After(route.ExpiresAt):
		return ErrRouteExpired
	case !contains(route.AllowedChannels, ProductionChannelFabricControl):
		return ErrUnauthorizedChannel
	}

	configured := routePath(route)
	hops := len(configured)
	switch {
	case hops < 2, configured[0] != route.SourceNodeID, configured[hops-1] != route.DestinationNodeID:
		return ErrInvalidRoutePath
	case len(envelope.RoutePath) > 0 && !sameNodePath(envelope.RoutePath, configured):
		return ErrInvalidRoutePath
	case hops > 2 && len(envelope.RoutePath) == 0:
		// Multi-hop routes must carry their path explicitly.
		return ErrInvalidRoutePath
	}

	localIndex := indexOf(configured, local.NodeID)
	if localIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
		return ErrNodeMismatch
	}

	// At the destination the next hop is the local node itself; elsewhere it
	// is the local node's successor on the configured path.
	nextHop := local.NodeID
	if local.NodeID != envelope.DestinationNodeID {
		if localIndex >= hops-1 {
			return ErrInvalidRoutePath
		}
		nextHop = configured[localIndex+1]
	}
	if envelope.NextHopNodeID != nextHop {
		return ErrInvalidRoutePath
	}

	switch {
	case route.MaxTTL > 0 && envelope.TTL > route.MaxTTL:
		return fmt.Errorf("%w: ttl exceeds configured route max_ttl", ErrForwardEnvelopeInvalid)
	case route.MaxHops > 0 && envelope.HopCount > route.MaxHops:
		return fmt.Errorf("%w: hop_count exceeds configured route max_hops", ErrForwardEnvelopeInvalid)
	}
	return nil
}
// productionRouteByID returns the first route whose RouteID equals routeID,
// or the zero SyntheticRoute and false when none matches.
func productionRouteByID(routes []SyntheticRoute, routeID string) (SyntheticRoute, bool) {
	for i := range routes {
		if routes[i].RouteID == routeID {
			return routes[i], true
		}
	}
	return SyntheticRoute{}, false
}
// sameNodePath reports whether a and b contain the same node IDs in the same
// order.
func sameNodePath(a []string, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	for position, nodeID := range a {
		if b[position] != nodeID {
			return false
		}
	}
	return true
}
@@ -0,0 +1,43 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// ProductionForwardTransport sends a production envelope to the peer
// identified by nextNodeID and returns the result of that send.
// HTTPProductionForwardTransport is the implementation visible in this file.
type ProductionForwardTransport interface {
SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error)
}
// HTTPProductionForwardTransport is a ProductionForwardTransport that sends
// envelopes to peers through a per-call Client built from the peer's base URL.
type HTTPProductionForwardTransport struct {
// PeerURLs maps a peer node ID to that peer's base URL.
PeerURLs map[string]string
// HTTPClient, when non-nil, replaces the HTTPClient of the Client created
// for each send.
HTTPClient *http.Client
}
// NewHTTPProductionForwardTransport returns a transport whose PeerURLs is a
// normalized copy of peerURLs: node IDs and URLs are trimmed of surrounding
// whitespace, trailing slashes are removed from URLs, and pairs with an empty
// node ID or URL are dropped. HTTPClient is left nil.
func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport {
	cleaned := make(map[string]string, len(peerURLs))
	for rawID, rawURL := range peerURLs {
		id := strings.TrimSpace(rawID)
		base := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if id == "" || base == "" {
			continue
		}
		cleaned[id] = base
	}
	transport := new(HTTPProductionForwardTransport)
	transport.PeerURLs = cleaned
	return transport
}
// SendProduction forwards envelope to the peer registered under nextNodeID.
// It returns ErrForwardPeerUnavailable when the transport is nil or no
// non-empty base URL is known for that peer; otherwise it delegates to a
// Client created for the peer's base URL, using t.HTTPClient when one is set.
func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	if t == nil {
		return ProductionForwardResult{}, ErrForwardPeerUnavailable
	}

	target := strings.TrimSpace(t.PeerURLs[nextNodeID])
	target = strings.TrimRight(target, "/")
	if target == "" {
		return ProductionForwardResult{}, ErrForwardPeerUnavailable
	}

	peerClient := NewClient(target)
	if override := t.HTTPClient; override != nil {
		peerClient.HTTPClient = override
	}
	return peerClient.SendProduction(ctx, envelope)
}
@@ -0,0 +1,241 @@
package mesh
import (
"encoding/json"
"fmt"
"os"
"strings"
"time"
)
// ScopedSyntheticConfig is the JSON mesh configuration read by
// LoadScopedSyntheticConfig and checked by Validate against the local peer
// identity.
type ScopedSyntheticConfig struct {
// SchemaVersion is required.
SchemaVersion string `json:"schema_version"`
// ClusterID is required and must equal the local identity's cluster ID.
ClusterID string `json:"cluster_id"`
// LocalNodeID is required and must equal the local identity's node ID.
LocalNodeID string `json:"local_node_id"`
ConfigVersion string `json:"config_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
// PeerEndpoints maps node ID to endpoint; neither may be blank.
PeerEndpoints map[string]string `json:"peer_endpoints"`
// PeerEndpointCandidates maps node ID to candidates, each of which must name
// that same node.
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"`
PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"`
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"`
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"`
// Routes must all belong to ClusterID, contain LocalNodeID on their path and
// carry an expiry in the future.
Routes []SyntheticRoute `json:"routes"`
}
// PeerDirectoryEntry is one peer_directory record of a ScopedSyntheticConfig.
// validatePeerDirectory requires a non-blank NodeID that differs from the
// local node, unique (trimmed) node IDs, and non-negative counts.
type PeerDirectoryEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
}
// PeerRecoverySeed is one recovery_seeds record of a ScopedSyntheticConfig.
// validateRecoverySeeds requires non-blank NodeID, Endpoint and Transport and
// rejects two seeds sharing the same (NodeID, Endpoint) pair.
type PeerRecoverySeed struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
// Metadata is kept as raw JSON and not interpreted by the validation code
// visible here.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerRendezvousLease is one rendezvous_leases record of a
// ScopedSyntheticConfig. The records are checked by validateRendezvousLeases
// together with the configured routes and the local node ID; see that
// function for the exact rules.
type PeerRendezvousLease struct {
LeaseID string `json:"lease_id"`
PeerNodeID string `json:"peer_node_id"`
RelayNodeID string `json:"relay_node_id"`
RelayEndpoint string `json:"relay_endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
RouteIDs []string `json:"route_ids,omitempty"`
AllowedChannels []string `json:"allowed_channels,omitempty"`
Priority int `json:"priority"`
ControlPlaneOnly bool `json:"control_plane_only"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
Reason string `json:"reason,omitempty"`
// Metadata is kept as raw JSON.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerEndpointCandidate is one entry of
// ScopedSyntheticConfig.PeerEndpointCandidates. Validate requires non-blank
// EndpointID, NodeID, Transport, Address, Reachability and ConnectivityMode,
// and a NodeID equal to the map key the candidate is listed under.
type PeerEndpointCandidate struct {
EndpointID string `json:"endpoint_id"`
NodeID string `json:"node_id"`
Transport string `json:"transport"`
Address string `json:"address"`
AddressFamily string `json:"address_family,omitempty"`
Reachability string `json:"reachability"`
NATType string `json:"nat_type,omitempty"`
ConnectivityMode string `json:"connectivity_mode"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
PolicyTags []string `json:"policy_tags,omitempty"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
// Metadata is kept as raw JSON.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// LoadScopedSyntheticConfig reads the JSON file at path, decodes it into a
// ScopedSyntheticConfig and validates it against local. Read errors are
// returned unchanged, decode errors are wrapped, and validation errors are
// returned as produced by Validate. On any error the zero config is returned.
func LoadScopedSyntheticConfig(path string, local PeerIdentity) (ScopedSyntheticConfig, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return ScopedSyntheticConfig{}, err
	}

	var parsed ScopedSyntheticConfig
	if err = json.Unmarshal(raw, &parsed); err != nil {
		return ScopedSyntheticConfig{}, fmt.Errorf("parse scoped synthetic mesh config: %w", err)
	}
	if err = parsed.Validate(local); err != nil {
		return ScopedSyntheticConfig{}, err
	}
	return parsed, nil
}
// Validate checks the configuration against the local peer identity and
// returns the first problem found, or nil. In order it verifies: the schema
// version is set; cluster and local node IDs match local; peer endpoints and
// endpoint candidates are complete; the peer directory, recovery seeds and
// rendezvous leases pass their dedicated validators; and every route belongs
// to the cluster, has a path of at least two nodes that includes the local
// node, and has an expiry that lies in the future (by the wall clock).
func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
	blank := func(value string) bool { return strings.TrimSpace(value) == "" }

	switch {
	case cfg.SchemaVersion == "":
		return fmt.Errorf("scoped synthetic mesh config schema_version is required")
	case cfg.ClusterID == "", cfg.ClusterID != local.ClusterID:
		return ErrClusterMismatch
	case cfg.LocalNodeID == "", cfg.LocalNodeID != local.NodeID:
		return ErrNodeMismatch
	}

	for nodeID, endpoint := range cfg.PeerEndpoints {
		if blank(nodeID) || blank(endpoint) {
			return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
		}
	}

	for nodeID, candidates := range cfg.PeerEndpointCandidates {
		if blank(nodeID) {
			return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint candidate node")
		}
		for _, candidate := range candidates {
			incomplete := blank(candidate.EndpointID) ||
				blank(candidate.NodeID) ||
				blank(candidate.Transport) ||
				blank(candidate.Address) ||
				blank(candidate.Reachability) ||
				blank(candidate.ConnectivityMode)
			// A candidate must be filed under the node it describes.
			if incomplete || candidate.NodeID != nodeID {
				return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
			}
		}
	}

	if err := validatePeerDirectory(cfg.PeerDirectory, cfg.LocalNodeID); err != nil {
		return err
	}
	if err := validateRecoverySeeds(cfg.RecoverySeeds); err != nil {
		return err
	}
	if err := validateRendezvousLeases(cfg.RendezvousLeases, cfg.Routes, cfg.LocalNodeID); err != nil {
		return err
	}

	for _, route := range cfg.Routes {
		if route.ClusterID != cfg.ClusterID {
			return ErrClusterMismatch
		}
		hops := routePath(route)
		switch {
		case len(hops) < 2:
			return ErrInvalidRoutePath
		case !contains(hops, cfg.LocalNodeID):
			return ErrNodeMismatch
		case route.ExpiresAt.IsZero():
			return fmt.Errorf("scoped synthetic route %q expires_at is required", route.RouteID)
		case !route.ExpiresAt.After(time.Now().UTC()):
			return ErrRouteExpired
		}
	}
	return nil
}
// validatePeerDirectory checks the peer directory entries of a scoped config.
//
// Each entry must name a non-blank node other than localNodeID, node IDs
// (compared after trimming whitespace) must be unique, and the endpoint and
// candidate counts must not be negative.
func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) error {
	known := map[string]struct{}{}
	for _, item := range entries {
		id := strings.TrimSpace(item.NodeID)
		_, repeated := known[id]
		switch {
		case id == "", id == localNodeID:
			return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory entry")
		case repeated:
			return fmt.Errorf("scoped synthetic mesh config contains duplicate peer directory entry")
		case item.EndpointCount < 0, item.CandidateCount < 0:
			return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory count")
		}
		known[id] = struct{}{}
	}
	return nil
}
// validateRecoverySeeds checks the recovery seeds of a scoped config.
//
// At most 20 seeds are accepted. Every seed needs a non-blank node ID,
// endpoint and transport, and the (node ID, endpoint) pair, compared after
// trimming whitespace, must be unique.
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
	const maxSeeds = 20
	if len(seeds) > maxSeeds {
		return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds")
	}

	known := map[string]struct{}{}
	for _, seed := range seeds {
		nodeID := strings.TrimSpace(seed.NodeID)
		endpoint := strings.TrimSpace(seed.Endpoint)
		transport := strings.TrimSpace(seed.Transport)
		if nodeID == "" || endpoint == "" || transport == "" {
			return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
		}

		// NUL separates the two parts of the de-duplication key.
		pair := nodeID + "\x00" + endpoint
		if _, repeated := known[pair]; repeated {
			return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed")
		}
		known[pair] = struct{}{}
	}
	return nil
}
// validateRendezvousLeases checks the rendezvous leases of a scoped config.
//
// At most 20 leases are accepted. Every lease needs non-blank identifiers,
// relay endpoint and transport, distinct peer and relay nodes, the
// ControlPlaneOnly flag set, a non-zero expiry in the future, and valid JSON
// metadata when metadata is present. Lease IDs must be unique.
//
// A lease that lists route IDs must reference only routes present in routes,
// and at least one of them must have a path containing the local node, the
// peer node and the relay node. Leases without route IDs skip that check.
func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRoute, localNodeID string) error {
	if len(leases) > 20 {
		return fmt.Errorf("scoped synthetic mesh config contains too many rendezvous leases")
	}

	// Index routes by ID; routes with a blank ID cannot be referenced.
	index := map[string]SyntheticRoute{}
	for _, route := range routes {
		if strings.TrimSpace(route.RouteID) == "" {
			continue
		}
		index[route.RouteID] = route
	}

	blank := func(value string) bool {
		return strings.TrimSpace(value) == ""
	}

	known := map[string]struct{}{}
	now := time.Now().UTC()
	for _, lease := range leases {
		switch {
		case blank(lease.LeaseID),
			blank(lease.PeerNodeID),
			blank(lease.RelayNodeID),
			blank(lease.RelayEndpoint),
			blank(lease.Transport),
			lease.PeerNodeID == lease.RelayNodeID,
			!lease.ControlPlaneOnly,
			lease.ExpiresAt.IsZero(),
			!lease.ExpiresAt.After(now),
			len(lease.Metadata) > 0 && !json.Valid(lease.Metadata):
			return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
		}

		if _, repeated := known[lease.LeaseID]; repeated {
			return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease")
		}
		known[lease.LeaseID] = struct{}{}

		if len(lease.RouteIDs) == 0 {
			continue
		}

		// Every referenced route must exist, so the loop keeps going even
		// after a route that puts the lease in scope has been found.
		inScope := false
		for _, routeID := range lease.RouteIDs {
			route, found := index[routeID]
			if !found {
				return fmt.Errorf("scoped synthetic mesh config contains rendezvous lease for unknown route")
			}
			hops := routePath(route)
			if contains(hops, localNodeID) && contains(hops, lease.PeerNodeID) && contains(hops, lease.RelayNodeID) {
				inScope = true
			}
		}
		if !inScope {
			return fmt.Errorf("scoped synthetic mesh config contains out-of-scope rendezvous lease")
		}
	}
	return nil
}
@@ -0,0 +1,235 @@
package mesh
import (
"encoding/json"
"errors"
"os"
"path/filepath"
"testing"
"time"
)
func TestLoadScopedSyntheticConfig(t *testing.T) {
expiresAt := time.Now().UTC().Add(time.Hour)
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
ConfigVersion: "config-v1",
PeerDirectoryVersion: "peers-v1",
PolicyVersion: "policy-v1",
PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
Priority: 10,
},
},
},
PeerDirectory: []PeerDirectoryEntry{
{
NodeID: "node-b",
RouteIDs: []string{"route-a-b"},
EndpointCount: 1,
CandidateCount: 1,
ConnectivityModes: []string{"direct"},
RecoverySeed: true,
},
},
RecoverySeeds: []PeerRecoverySeed{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
Transport: "direct_tcp_tls",
ConnectivityMode: "direct",
Priority: 10,
},
},
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
RouteIDs: []string{"route-a-b"},
AllowedChannels: []string{"fabric_control", "route_control"},
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: expiresAt.Add(-time.Minute),
ExpiresAt: expiresAt,
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
cfg, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err != nil {
t.Fatalf("load scoped config: %v", err)
}
if cfg.ConfigVersion != "config-v1" || cfg.PeerEndpoints["node-b"] == "" || len(cfg.Routes) != 1 {
t.Fatalf("unexpected config: %+v", cfg)
}
if got := cfg.PeerEndpointCandidates["node-b"]; len(got) != 1 || got[0].EndpointID != "node-b-public" {
t.Fatalf("unexpected endpoint candidates: %+v", cfg.PeerEndpointCandidates)
}
if len(cfg.PeerDirectory) != 1 || cfg.PeerDirectory[0].NodeID != "node-b" || !cfg.PeerDirectory[0].RecoverySeed {
t.Fatalf("unexpected peer directory: %+v", cfg.PeerDirectory)
}
if len(cfg.RecoverySeeds) != 1 || cfg.RecoverySeeds[0].NodeID != "node-b" {
t.Fatalf("unexpected recovery seeds: %+v", cfg.RecoverySeeds)
}
if len(cfg.RendezvousLeases) != 1 || cfg.RendezvousLeases[0].RelayNodeID != "node-r" {
t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases)
}
}
func TestLoadScopedSyntheticConfigRejectsWrongCluster(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-2",
LocalNodeID: "node-a",
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if !errors.Is(err, ErrClusterMismatch) {
t.Fatalf("err = %v, want ErrClusterMismatch", err)
}
}
func TestLoadScopedSyntheticConfigRejectsWrongNode(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-x",
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if !errors.Is(err, ErrNodeMismatch) {
t.Fatalf("err = %v, want ErrNodeMismatch", err)
}
}
// TestLoadScopedSyntheticConfigRejectsExpiredRoute expects ErrRouteExpired
// when a route in the config has an expiry one minute in the past.
func TestLoadScopedSyntheticConfigRejectsExpiredRoute(t *testing.T) {
	stale := liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})
	stale.ExpiresAt = time.Now().UTC().Add(-time.Minute)

	fixture := ScopedSyntheticConfig{
		SchemaVersion: "c17f.synthetic.v1",
		ClusterID:     "cluster-1",
		LocalNodeID:   "node-a",
		Routes:        []SyntheticRoute{stale},
	}
	configPath := writeScopedConfig(t, fixture)
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	if _, err := LoadScopedSyntheticConfig(configPath, self); !errors.Is(err, ErrRouteExpired) {
		t.Fatalf("err = %v, want ErrRouteExpired", err)
	}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-c",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
ConnectivityMode: "direct",
},
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid peer endpoint candidate error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerDirectory(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerDirectory: []PeerDirectoryEntry{
{NodeID: "node-a"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid peer directory error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid recovery seed error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RouteIDs: []string{"route-a-b"},
ExpiresAt: time.Now().UTC().Add(time.Hour),
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid rendezvous lease error")
}
}
// writeScopedConfig serialises cfg as JSON into "mesh-config.json" inside a
// per-test temporary directory and returns the file path. Any failure aborts
// the calling test.
func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
	t.Helper()

	encoded, marshalErr := json.Marshal(cfg)
	if marshalErr != nil {
		t.Fatalf("marshal config: %v", marshalErr)
	}

	target := filepath.Join(t.TempDir(), "mesh-config.json")
	writeErr := os.WriteFile(target, encoded, 0o600)
	if writeErr != nil {
		t.Fatalf("write config: %v", writeErr)
	}
	return target
}
@@ -0,0 +1,291 @@
package mesh
import (
	"context"
	"encoding/json"
	"errors"
	"net/http"
	"time"
)
// ProductionEnvelopeObserver is a callback that receives an observation of a
// production envelope. The forward handler invokes it after the envelope has
// passed validation; a returned error makes the handler answer with
// ErrForwardObservationFailed.
type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error
// ProductionForwardLogger is a callback that receives one log entry per
// production forwarding event (accepted, rejected, delivered, forwarded).
type ProductionForwardLogger func(ProductionForwardLogEntry)
// Server exposes the mesh HTTP endpoints (health, forward and synthetic
// probe) for one local peer. Use Handler to obtain the http.Handler.
type Server struct {
// Local is the identity of this node; incoming messages are validated
// against it and it is reported back in acknowledgements and results.
Local PeerIdentity
// SyntheticRuntime handles synthetic probe envelopes. When nil, the probe
// endpoint answers with ErrMeshRuntimeDisabled.
SyntheticRuntime *SyntheticRuntime
// ProductionForwardingEnabled gates the forward endpoint. When false,
// every forward request is rejected with ErrForwardDisabled.
ProductionForwardingEnabled bool
// ProductionEnvelopeObserver, when non-nil, is called for each validated
// production envelope before it is delivered or forwarded.
ProductionEnvelopeObserver ProductionEnvelopeObserver
// ProductionForwardTransport sends envelopes to the next hop. When nil,
// envelopes that are not addressed to this node cannot be forwarded.
ProductionForwardTransport ProductionForwardTransport
// ProductionForwardLogger, when non-nil, receives forwarding log entries.
ProductionForwardLogger ProductionForwardLogger
// ProductionRoutes is the route configuration passed to
// ValidateProductionEnvelopeRouteConfig for every incoming envelope.
ProductionRoutes []SyntheticRoute
}
// Handler returns an http.Handler that serves the mesh endpoints:
// /mesh/v1/health, /mesh/v1/forward and /mesh/v1/synthetic/probe.
func (s Server) Handler() http.Handler {
	endpoints := []struct {
		pattern string
		serve   func(http.ResponseWriter, *http.Request)
	}{
		{"/mesh/v1/health", s.handleHealth},
		{"/mesh/v1/forward", s.handleForward},
		{"/mesh/v1/synthetic/probe", s.handleSyntheticProbe},
	}

	router := http.NewServeMux()
	for _, endpoint := range endpoints {
		router.HandleFunc(endpoint.pattern, endpoint.serve)
	}
	return router
}
// handleHealth serves POST /mesh/v1/health.
//
// It decodes a HealthMessage, requires the current ProtocolVersion, validates
// the sender against the local identity and, when the message names a target
// node, requires that target to be this node. On success it replies with a
// JSON HealthAck; otherwise with 405, 400 or 403 as appropriate.
func (s Server) handleHealth(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}

	var incoming HealthMessage
	if decodeErr := json.NewDecoder(r.Body).Decode(&incoming); decodeErr != nil {
		http.Error(w, "invalid health message", http.StatusBadRequest)
		return
	}
	if incoming.ProtocolVersion != ProtocolVersion {
		http.Error(w, "unsupported mesh protocol version", http.StatusBadRequest)
		return
	}

	if peerErr := ValidatePeer(s.Local, incoming.From); peerErr != nil {
		http.Error(w, peerErr.Error(), http.StatusForbidden)
		return
	}
	// An empty target is accepted; a named target must be this node.
	if target := incoming.To.NodeID; target != "" && target != s.Local.NodeID {
		http.Error(w, ErrNodeMismatch.Error(), http.StatusForbidden)
		return
	}

	ack := HealthAck{
		ProtocolVersion: ProtocolVersion,
		Accepted:        true,
		By:              s.Local,
	}
	w.Header().Set("Content-Type", "application/json")
	_ = json.NewEncoder(w).Encode(ack)
}
// handleForward serves POST /mesh/v1/forward for production envelopes.
//
// The request is rejected with 405 for non-POST methods, with
// ErrForwardDisabled (501) while ProductionForwardingEnabled is false, and
// with 400 when the body is not a decodable ProductionEnvelope. A decoded
// envelope is validated on its own and against ProductionRoutes, then handed
// to the optional observer. It is delivered locally when this node is the
// destination; otherwise it is rewritten for the next hop and sent through
// ProductionForwardTransport. Every outcome is reported to the optional
// ProductionForwardLogger.
func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
w.WriteHeader(http.StatusMethodNotAllowed)
return
}
// Forwarding is gated: while disabled, the body is not even decoded.
if !s.ProductionForwardingEnabled {
s.logProductionForward(ProductionForwardLogEntry{
Event: "production_forward_rejected",
ClusterID: s.Local.ClusterID,
LocalNodeID: s.Local.NodeID,
Reason: ErrForwardDisabled.Error(),
StatusCode: http.StatusNotImplemented,
OccurredAt: time.Now().UTC(),
})
http.Error(w, ErrForwardDisabled.Error(), http.StatusNotImplemented)
return
}
var envelope ProductionEnvelope
if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil {
// No envelope fields are available yet, so the log entry carries only
// the local identity.
s.logProductionForward(ProductionForwardLogEntry{
Event: "production_forward_rejected",
ClusterID: s.Local.ClusterID,
LocalNodeID: s.Local.NodeID,
Reason: "invalid production mesh envelope",
StatusCode: http.StatusBadRequest,
OccurredAt: time.Now().UTC(),
})
http.Error(w, "invalid production mesh envelope", http.StatusBadRequest)
return
}
// Validate the envelope itself, then against the configured routes.
if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
return
}
if err := ValidateProductionEnvelopeRouteConfig(s.Local, envelope, s.ProductionRoutes, time.Now().UTC()); err != nil {
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
return
}
// The accepted event is logged with status code 0: no response has been
// chosen at this point.
s.logProductionForward(productionForwardLogEntry("production_forward_accepted", s.Local, envelope, "", 0))
if s.ProductionEnvelopeObserver != nil {
observation := NewProductionEnvelopeObservation(envelope, time.Now().UTC())
// The observer's own error is not exposed to the client; the response
// always carries ErrForwardObservationFailed.
if err := observeProductionEnvelope(r.Context(), s.ProductionEnvelopeObserver, observation); err != nil {
s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardObservationFailed.Error(), http.StatusInternalServerError))
http.Error(w, ErrForwardObservationFailed.Error(), http.StatusInternalServerError)
return
}
}
// Local delivery: this node is the final destination.
if envelope.DestinationNodeID == s.Local.NodeID {
s.logProductionForward(productionForwardLogEntry("production_forward_delivered", s.Local, envelope, "", http.StatusOK))
writeProductionForwardResult(w, ProductionForwardResult{
Accepted: true,
Delivered: true,
By: s.Local,
MessageID: envelope.MessageID,
RouteID: envelope.RouteID,
})
return
}
// Not the destination, yet named as the next hop: forwarding would send
// the envelope back to this node.
if envelope.NextHopNodeID == s.Local.NodeID {
s.rejectProductionForward(w, envelope, ErrLoopDetected, forwardStatusCode(ErrLoopDetected))
return
}
// Without a route path only a direct hop to the destination is supported.
if len(envelope.RoutePath) == 0 && envelope.NextHopNodeID != envelope.DestinationNodeID {
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
return
}
if s.ProductionForwardTransport == nil {
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
return
}
// Forwarding decrements the TTL, so a TTL of 1 or less cannot be forwarded.
if envelope.TTL <= 1 {
s.rejectProductionForward(w, envelope, ErrTTLExhausted, forwardStatusCode(ErrTTLExhausted))
return
}
// Build the envelope as the next hop will see it: it becomes the current
// hop, and its own next hop is looked up from the route path.
forwarded := envelope
forwarded.CurrentHopNodeID = envelope.NextHopNodeID
forwarded.NextHopNodeID = nextProductionHopAfter(envelope.RoutePath, envelope.NextHopNodeID, envelope.DestinationNodeID)
forwarded.TTL = envelope.TTL - 1
forwarded.HopCount = envelope.HopCount + 1
// Copy the visited list before appending so the incoming envelope's slice
// is not modified.
forwarded.VisitedNodeIDs = append(append([]string{}, envelope.VisitedNodeIDs...), s.Local.NodeID)
result, err := s.ProductionForwardTransport.SendProduction(r.Context(), envelope.NextHopNodeID, forwarded)
if err != nil {
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
return
}
s.logProductionForward(productionForwardLogEntry("production_forward_forwarded", s.Local, envelope, "", http.StatusOK))
// Re-stamp the downstream result from this node's point of view; fields
// not overwritten here keep the values returned by the next hop.
result.Accepted = true
result.Forwarded = true
result.By = s.Local
result.MessageID = envelope.MessageID
result.RouteID = envelope.RouteID
result.NextNodeID = envelope.NextHopNodeID
writeProductionForwardResult(w, result)
}
// rejectProductionForward logs a "production_forward_rejected" event for the
// envelope and answers the request with err's message and statusCode.
func (s Server) rejectProductionForward(w http.ResponseWriter, envelope ProductionEnvelope, err error, statusCode int) {
	rejection := productionForwardLogEntry(
		"production_forward_rejected",
		s.Local,
		envelope,
		err.Error(),
		statusCode,
	)
	s.logProductionForward(rejection)
	http.Error(w, err.Error(), statusCode)
}
// logProductionForward passes entry to the configured
// ProductionForwardLogger, stamping it with the current UTC time when
// OccurredAt is unset. It does nothing when no logger is configured.
func (s Server) logProductionForward(entry ProductionForwardLogEntry) {
	sink := s.ProductionForwardLogger
	if sink != nil {
		if entry.OccurredAt.IsZero() {
			entry.OccurredAt = time.Now().UTC()
		}
		sink(entry)
	}
}
// productionForwardLogEntry builds a log entry for the named event from the
// envelope's routing fields, the local node ID, the given reason and status
// code, timestamped with the current UTC time. Route path and visited list
// are recorded as lengths only.
func productionForwardLogEntry(event string, local PeerIdentity, envelope ProductionEnvelope, reason string, statusCode int) ProductionForwardLogEntry {
	var entry ProductionForwardLogEntry

	// What happened and why.
	entry.Event = event
	entry.Reason = reason
	entry.StatusCode = statusCode

	// Message identity.
	entry.RouteID = envelope.RouteID
	entry.MessageID = envelope.MessageID
	entry.ClusterID = envelope.ClusterID
	entry.ChannelClass = envelope.ChannelClass
	entry.MessageType = envelope.MessageType

	// Where the message is on its path.
	entry.LocalNodeID = local.NodeID
	entry.SourceNodeID = envelope.SourceNodeID
	entry.DestinationNodeID = envelope.DestinationNodeID
	entry.CurrentHopNodeID = envelope.CurrentHopNodeID
	entry.NextHopNodeID = envelope.NextHopNodeID

	// Counters and sizes.
	entry.TTL = envelope.TTL
	entry.HopCount = envelope.HopCount
	entry.RoutePathLength = len(envelope.RoutePath)
	entry.VisitedCount = len(envelope.VisitedNodeIDs)
	entry.PayloadLength = envelope.PayloadLength

	entry.OccurredAt = time.Now().UTC()
	return entry
}
// nextProductionHopAfter returns the node that follows currentNodeID in
// routePath.
//
// The first occurrence of currentNodeID is used. If it is the last element of
// the path, currentNodeID itself is returned. If the path is empty or does
// not contain currentNodeID, destinationNodeID is returned.
func nextProductionHopAfter(routePath []string, currentNodeID string, destinationNodeID string) string {
	for position := 0; position < len(routePath); position++ {
		if routePath[position] != currentNodeID {
			continue
		}
		if following := position + 1; following < len(routePath) {
			return routePath[following]
		}
		// currentNodeID terminates the path: there is nothing after it.
		return currentNodeID
	}
	return destinationNodeID
}
// writeProductionForwardResult writes result to w as a JSON response body
// with the application/json content type. Encoding errors are ignored.
func writeProductionForwardResult(w http.ResponseWriter, result ProductionForwardResult) {
	headers := w.Header()
	headers.Set("Content-Type", "application/json")

	encoder := json.NewEncoder(w)
	_ = encoder.Encode(result)
}
// observeProductionEnvelope calls observer with the observation and returns
// its error. A nil observer is a no-op. A panic inside the observer is
// recovered and reported as ErrForwardObservationFailed.
func observeProductionEnvelope(ctx context.Context, observer ProductionEnvelopeObserver, observation ProductionEnvelopeObservation) (err error) {
	if observer != nil {
		// The named result lets the deferred recovery replace the return
		// value after a panic.
		defer func() {
			if panicked := recover(); panicked != nil {
				err = ErrForwardObservationFailed
			}
		}()
		err = observer(ctx, observation)
	}
	return err
}
// handleSyntheticProbe serves POST /mesh/v1/synthetic/probe.
//
// It answers 405 for other methods, ErrMeshRuntimeDisabled (503) when no
// SyntheticRuntime is configured and 400 for an undecodable body. Otherwise
// the envelope is handed to the runtime and its acknowledgement is returned
// as JSON; a runtime error is mapped through syntheticStatusCode.
func (s Server) handleSyntheticProbe(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}

	rt := s.SyntheticRuntime
	if rt == nil {
		http.Error(w, ErrMeshRuntimeDisabled.Error(), http.StatusServiceUnavailable)
		return
	}

	var probe SyntheticEnvelope
	if decodeErr := json.NewDecoder(r.Body).Decode(&probe); decodeErr != nil {
		http.Error(w, "invalid synthetic mesh envelope", http.StatusBadRequest)
		return
	}

	ack, receiveErr := rt.Receive(r.Context(), probe)
	if receiveErr != nil {
		http.Error(w, receiveErr.Error(), syntheticStatusCode(receiveErr))
		return
	}

	w.Header().Set("Content-Type", "application/json")
	_ = json.NewEncoder(w).Encode(ack)
}
// NewHealthMessage builds a health message from one peer to another using the
// current ProtocolVersion, the current UTC time as the observation time and
// the link status "reachable".
func NewHealthMessage(from, to PeerIdentity) HealthMessage {
	var message HealthMessage
	message.ProtocolVersion = ProtocolVersion
	message.From = from
	message.To = to
	message.ObservedAt = time.Now().UTC()
	message.LinkStatus = "reachable"
	return message
}
// syntheticStatusCode maps an error from the synthetic runtime to the HTTP
// status code used for the probe response.
//
// Errors are matched with errors.Is, so a sentinel wrapped with additional
// context (for example via fmt.Errorf with %w) maps to the same status as the
// bare sentinel. Unrecognised errors, including nil, map to 400.
func syntheticStatusCode(err error) int {
	// matches reports whether err is, or wraps, any of the given sentinels.
	matches := func(targets ...error) bool {
		for _, target := range targets {
			if errors.Is(err, target) {
				return true
			}
		}
		return false
	}

	switch {
	case matches(ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected):
		return http.StatusForbidden
	case matches(ErrMeshRuntimeDisabled):
		return http.StatusServiceUnavailable
	case matches(ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrUnsupportedSyntheticMessage, ErrRouteIDRequired):
		return http.StatusBadRequest
	case matches(ErrRouteNotFound, ErrSyntheticPeerUnavailable):
		return http.StatusNotFound
	default:
		return http.StatusBadRequest
	}
}
// forwardStatusCode maps a production forwarding error to the HTTP status
// code used for the forward response.
//
// Errors are matched with errors.Is rather than by identity. The forward
// handler applies this function to errors returned by the forward transport,
// which may wrap a sentinel with extra context; such errors now map to the
// sentinel's status instead of falling through to the default. Unrecognised
// errors, including nil, map to 400.
func forwardStatusCode(err error) int {
	// matches reports whether err is, or wraps, any of the given sentinels.
	matches := func(targets ...error) bool {
		for _, target := range targets {
			if errors.Is(err, target) {
				return true
			}
		}
		return false
	}

	switch {
	case matches(ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected):
		return http.StatusForbidden
	case matches(ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrRouteIDRequired):
		return http.StatusBadRequest
	case matches(ErrForwardRuntimeUnavailable):
		return http.StatusNotImplemented
	case matches(ErrRouteNotFound):
		return http.StatusNotFound
	case matches(ErrForwardPeerUnavailable):
		return http.StatusBadGateway
	default:
		return http.StatusBadRequest
	}
}
@@ -0,0 +1,802 @@
package mesh
import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestMeshHealthAcceptsSameCluster sends a health message between two peers
// of the same cluster and expects an accepting ack signed by the receiver.
func TestMeshHealthAcceptsSameCluster(t *testing.T) {
	receiver := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	sender := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	ts := httptest.NewServer(Server{Local: receiver}.Handler())
	defer ts.Close()

	client := NewClient(ts.URL)
	ack, err := client.SendHealth(context.Background(), NewHealthMessage(sender, receiver))
	if err != nil {
		t.Fatalf("send health: %v", err)
	}
	if !ack.Accepted || ack.By.NodeID != "node-b" {
		t.Fatalf("unexpected ack: %+v", ack)
	}
}
// TestMeshHealthRejectsClusterMismatch posts a health message from a peer in
// another cluster and expects 403 Forbidden.
func TestMeshHealthRejectsClusterMismatch(t *testing.T) {
	receiver := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	ts := httptest.NewServer(Server{Local: receiver}.Handler())
	defer ts.Close()

	outsider := PeerIdentity{ClusterID: "cluster-2", NodeID: "node-a"}
	body, err := json.Marshal(NewHealthMessage(outsider, receiver))
	if err != nil {
		t.Fatalf("marshal message: %v", err)
	}

	resp, err := http.Post(ts.URL+"/mesh/v1/health", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post health: %v", err)
	}
	defer resp.Body.Close()

	if got, want := resp.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
func TestMeshForwardingDisabled(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/octet-stream", bytes.NewReader([]byte("payload")))
if err != nil {
t.Fatalf("post forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusNotImplemented {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented)
}
}
// TestMeshForwardingGateEnabledStillHasNoProductionRuntime enables the
// forwarding gate without configuring a forward transport and expects the
// envelope from validProductionEnvelope to be answered with 501.
func TestMeshForwardingGateEnabledStillHasNoProductionRuntime(t *testing.T) {
	node := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	gateway := Server{
		Local:                       node,
		ProductionForwardingEnabled: true,
	}
	ts := httptest.NewServer(gateway.Handler())
	defer ts.Close()

	body, err := json.Marshal(validProductionEnvelope(node))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(ts.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()

	if got, want := resp.StatusCode, http.StatusNotImplemented; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateDeliversFabricControlAtDestination posts an envelope
// whose destination is the receiving node and expects a delivered (not
// forwarded) result plus "accepted" and "delivered" log events.
func TestMeshForwardingGateDeliversFabricControlAtDestination(t *testing.T) {
	node := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}

	var logged []ProductionForwardLogEntry
	record := func(entry ProductionForwardLogEntry) {
		logged = append(logged, entry)
	}
	ts := httptest.NewServer(Server{
		Local:                       node,
		ProductionForwardingEnabled: true,
		ProductionForwardLogger:     record,
	}.Handler())
	defer ts.Close()

	msg := validProductionEnvelope(node)
	msg.SourceNodeID = "node-a"
	msg.DestinationNodeID = node.NodeID
	msg.CurrentHopNodeID = node.NodeID
	msg.NextHopNodeID = node.NodeID
	body, err := json.Marshal(msg)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(ts.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
	}

	var outcome ProductionForwardResult
	if err := json.NewDecoder(resp.Body).Decode(&outcome); err != nil {
		t.Fatalf("decode result: %v", err)
	}
	if !outcome.Accepted || !outcome.Delivered || outcome.Forwarded || outcome.By.NodeID != node.NodeID {
		t.Fatalf("unexpected result: %+v", outcome)
	}

	sawAccepted := hasProductionForwardEvent(logged, "production_forward_accepted")
	if !sawAccepted || !hasProductionForwardEvent(logged, "production_forward_delivered") {
		t.Fatalf("missing production forward events: %+v", logged)
	}
}
// TestMeshForwardingGateForwardsDirectFabricControlToNextHop posts an
// envelope to node-b whose next hop and destination are both node-c. It
// expects node-b to forward it, node-c to deliver it, and node-c's observer
// to see the forwarded envelope.
func TestMeshForwardingGateForwardsDirectFabricControlToNextHop(t *testing.T) {
	destination := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}

	var observed ProductionEnvelopeObservation
	destinationServer := httptest.NewServer(Server{
		Local:                       destination,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(_ context.Context, seen ProductionEnvelopeObservation) error {
			observed = seen
			return nil
		},
	}.Handler())
	defer destinationServer.Close()

	entry := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	peers := map[string]string{destination.NodeID: destinationServer.URL}
	entryServer := httptest.NewServer(Server{
		Local:                       entry,
		ProductionForwardingEnabled: true,
		ProductionForwardTransport:  NewHTTPProductionForwardTransport(peers),
	}.Handler())
	defer entryServer.Close()

	msg := validProductionEnvelope(entry)
	msg.SourceNodeID = "node-a"
	msg.DestinationNodeID = destination.NodeID
	msg.CurrentHopNodeID = entry.NodeID
	msg.NextHopNodeID = destination.NodeID
	body, err := json.Marshal(msg)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(entryServer.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
	}

	var outcome ProductionForwardResult
	if err := json.NewDecoder(resp.Body).Decode(&outcome); err != nil {
		t.Fatalf("decode result: %v", err)
	}
	relayed := outcome.Accepted && outcome.Forwarded && outcome.Delivered
	if !relayed || outcome.NextNodeID != destination.NodeID || outcome.By.NodeID != entry.NodeID {
		t.Fatalf("unexpected forward result: %+v", outcome)
	}
	if observed.CurrentHopNodeID != destination.NodeID || observed.MessageID != msg.MessageID {
		t.Fatalf("destination did not observe forwarded envelope: %+v", observed)
	}
}
// TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath sends an
// envelope along the explicit route path node-a -> node-b -> node-r ->
// node-c, entering at node-b. It expects delivery at node-c, the visited
// list to record node-b and node-r in order, and a "forwarded" log event on
// both relays.
func TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath(t *testing.T) {
	destination := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}

	var (
		observed    ProductionEnvelopeObservation
		relayEvents []ProductionForwardLogEntry
		entryEvents []ProductionForwardLogEntry
	)

	destinationServer := httptest.NewServer(Server{
		Local:                       destination,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(_ context.Context, seen ProductionEnvelopeObservation) error {
			observed = seen
			return nil
		},
	}.Handler())
	defer destinationServer.Close()

	relay := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
	relayServer := httptest.NewServer(Server{
		Local:                       relay,
		ProductionForwardingEnabled: true,
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			destination.NodeID: destinationServer.URL,
		}),
		ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
			relayEvents = append(relayEvents, entry)
		},
	}.Handler())
	defer relayServer.Close()

	entry := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	entryServer := httptest.NewServer(Server{
		Local:                       entry,
		ProductionForwardingEnabled: true,
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			relay.NodeID: relayServer.URL,
		}),
		ProductionForwardLogger: func(logEntry ProductionForwardLogEntry) {
			entryEvents = append(entryEvents, logEntry)
		},
	}.Handler())
	defer entryServer.Close()

	msg := validProductionEnvelope(entry)
	msg.SourceNodeID = "node-a"
	msg.DestinationNodeID = destination.NodeID
	msg.CurrentHopNodeID = entry.NodeID
	msg.NextHopNodeID = relay.NodeID
	msg.RoutePath = []string{"node-a", entry.NodeID, relay.NodeID, destination.NodeID}
	body, err := json.Marshal(msg)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(entryServer.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
	}

	var outcome ProductionForwardResult
	if err := json.NewDecoder(resp.Body).Decode(&outcome); err != nil {
		t.Fatalf("decode result: %v", err)
	}
	relayed := outcome.Accepted && outcome.Forwarded && outcome.Delivered
	if !relayed || outcome.NextNodeID != relay.NodeID || outcome.By.NodeID != entry.NodeID {
		t.Fatalf("unexpected multi-hop result: %+v", outcome)
	}
	if observed.CurrentHopNodeID != destination.NodeID || observed.NextHopNodeID != destination.NodeID {
		t.Fatalf("destination did not observe final hop: %+v", observed)
	}

	visited := observed.VisitedNodeIDs
	if len(visited) != 2 || visited[0] != entry.NodeID || visited[1] != relay.NodeID {
		t.Fatalf("visited path not propagated: %+v", observed.VisitedNodeIDs)
	}

	entryForwarded := hasProductionForwardEvent(entryEvents, "production_forward_forwarded")
	if !entryForwarded || !hasProductionForwardEvent(relayEvents, "production_forward_forwarded") {
		t.Fatalf("missing relay forward events: nodeB=%+v nodeR=%+v", entryEvents, relayEvents)
	}
}
// TestMeshForwardingGateForwardsConfiguredProductionRoute configures the same
// production route on three servers (node-b, node-r, node-c), posts an
// envelope following that route to node-b and expects node-c's observer to
// see it delivered under the configured route ID.
func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
	destination := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
	route := configuredProductionRoute("route-1", []string{"node-a", "node-b", "node-r", destination.NodeID})
	configured := []SyntheticRoute{route}

	var observed ProductionEnvelopeObservation
	destinationServer := httptest.NewServer(Server{
		Local:                       destination,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            configured,
		ProductionEnvelopeObserver: func(_ context.Context, seen ProductionEnvelopeObservation) error {
			observed = seen
			return nil
		},
	}.Handler())
	defer destinationServer.Close()

	relay := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
	relayServer := httptest.NewServer(Server{
		Local:                       relay,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            configured,
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			destination.NodeID: destinationServer.URL,
		}),
	}.Handler())
	defer relayServer.Close()

	entry := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	entryServer := httptest.NewServer(Server{
		Local:                       entry,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            configured,
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			relay.NodeID: relayServer.URL,
		}),
	}.Handler())
	defer entryServer.Close()

	msg := validProductionEnvelope(entry)
	msg.SourceNodeID = "node-a"
	msg.DestinationNodeID = destination.NodeID
	msg.CurrentHopNodeID = entry.NodeID
	msg.NextHopNodeID = relay.NodeID
	msg.RoutePath = route.Hops
	body, err := json.Marshal(msg)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(entryServer.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
	}
	if observed.RouteID != route.RouteID || observed.CurrentHopNodeID != destination.NodeID {
		t.Fatalf("configured route was not delivered: %+v", observed)
	}
}
// TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute configures
// only "route-other" on the server, posts the envelope produced by
// validProductionEnvelope and expects 404 Not Found.
func TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute(t *testing.T) {
	node := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	unrelated := configuredProductionRoute("route-other", []string{"node-a", node.NodeID, "node-c"})
	ts := httptest.NewServer(Server{
		Local:                       node,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{unrelated},
	}.Handler())
	defer ts.Close()

	body, err := json.Marshal(validProductionEnvelope(node))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(ts.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()

	if got, want := resp.StatusCode, http.StatusNotFound; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop posts an
// envelope that follows a configured route but names node-c as next hop,
// skipping node-r, and expects 400 Bad Request.
func TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop(t *testing.T) {
	node := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	route := configuredProductionRoute("route-1", []string{"node-a", node.NodeID, "node-r", "node-c"})
	ts := httptest.NewServer(Server{
		Local:                       node,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{route},
	}.Handler())
	defer ts.Close()

	msg := validProductionEnvelope(node)
	msg.SourceNodeID = "node-a"
	msg.DestinationNodeID = "node-c"
	msg.CurrentHopNodeID = node.NodeID
	msg.NextHopNodeID = "node-c"
	msg.RoutePath = route.Hops
	body, err := json.Marshal(msg)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(ts.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()

	if got, want := resp.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsRoutePathWrongNextHop sends an envelope whose
// next hop ("node-x") does not appear in its own route path and expects a 400
// response plus a "production_forward_rejected" log event.
func TestMeshForwardingGateRejectsRoutePathWrongNextHop(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	var logged []ProductionForwardLogEntry
	record := func(entry ProductionForwardLogEntry) {
		logged = append(logged, entry)
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionForwardLogger:     record,
	}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	env.SourceNodeID = "node-a"
	env.DestinationNodeID = "node-c"
	env.CurrentHopNodeID = self.NodeID
	env.NextHopNodeID = "node-x"
	env.RoutePath = []string{"node-a", self.NodeID, "node-r", "node-c"}

	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if found := hasProductionForwardEvent(logged, "production_forward_rejected"); !found {
		t.Fatalf("missing reject event: %+v", logged)
	}
}
// TestMeshForwardingGateRejectsRoutePathLoop sends an envelope whose route
// path lists the local node twice and expects 403 Forbidden.
func TestMeshForwardingGateRejectsRoutePathLoop(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{Local: self, ProductionForwardingEnabled: true}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	env.SourceNodeID = "node-a"
	env.DestinationNodeID = "node-c"
	env.CurrentHopNodeID = self.NodeID
	env.NextHopNodeID = "node-r"
	// The local node appears at positions 1 and 3 of the path.
	env.RoutePath = []string{"node-a", self.NodeID, "node-r", self.NodeID, "node-c"}

	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsInvalidProductionEnvelope replaces the payload
// hash with a bogus value and expects 400 Bad Request.
func TestMeshForwardingGateRejectsInvalidProductionEnvelope(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{Local: self, ProductionForwardingEnabled: true}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	env.PayloadHash = "bad-hash"

	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload builds a
// JSON string payload longer than MaxProductionEnvelopePayloadBytes, with a
// matching length and hash, and expects 400 without the observer being called.
func TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	observerCalled := false
	observer := func(context.Context, ProductionEnvelopeObservation) error {
		observerCalled = true
		return nil
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  observer,
	}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	filler := bytes.Repeat([]byte("a"), MaxProductionEnvelopePayloadBytes+1)
	env.Payload = json.RawMessage(`"` + string(filler) + `"`)
	// Keep length and hash consistent so only the size limit can trip.
	digest := sha256.Sum256(env.Payload)
	env.PayloadLength = len(env.Payload)
	env.PayloadHash = hex.EncodeToString(digest[:])

	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalled {
		t.Fatal("observer called for oversized envelope")
	}
}
// TestMeshForwardingGateRejectsFutureCreatedAt stamps the envelope one second
// beyond MaxProductionEnvelopeFutureSkew and expects 400 without the observer
// being called.
func TestMeshForwardingGateRejectsFutureCreatedAt(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	observerCalled := false
	observer := func(context.Context, ProductionEnvelopeObservation) error {
		observerCalled = true
		return nil
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  observer,
	}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	env.CreatedAt = time.Now().UTC().Add(MaxProductionEnvelopeFutureSkew + time.Second)
	env.ExpiresAt = env.CreatedAt.Add(time.Minute)

	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalled {
		t.Fatal("observer called for future-created envelope")
	}
}
// TestMeshForwardingGateObservesValidEnvelopeWithoutPayload posts a valid
// envelope, expects 501 Not Implemented, and checks that the observer received
// the message ID, route ID, payload hash and payload length.
func TestMeshForwardingGateObservesValidEnvelopeWithoutPayload(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	var seen ProductionEnvelopeObservation
	capture := func(_ context.Context, observation ProductionEnvelopeObservation) error {
		seen = observation
		return nil
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  capture,
	}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusNotImplemented; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	identityMatches := seen.MessageID == env.MessageID && seen.RouteID == env.RouteID
	if !identityMatches {
		t.Fatalf("unexpected observation: %+v", seen)
	}
	payloadMetaMatches := seen.PayloadHash == env.PayloadHash && seen.PayloadLength == env.PayloadLength
	if !payloadMetaMatches {
		t.Fatalf("payload metadata missing from observation: %+v", seen)
	}
}
// TestMeshForwardingGateDoesNotObserveRejectedEnvelope sends an envelope for a
// foreign cluster and expects 403 Forbidden with the observer left untouched.
func TestMeshForwardingGateDoesNotObserveRejectedEnvelope(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	observerCalled := false
	observer := func(context.Context, ProductionEnvelopeObservation) error {
		observerCalled = true
		return nil
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  observer,
	}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	env.ClusterID = "wrong-cluster"

	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalled {
		t.Fatal("observer called for rejected envelope")
	}
}
// TestMeshForwardingGateFailsClosedWhenObservationFails installs an observer
// that always returns an error and expects the endpoint to answer 500.
func TestMeshForwardingGateFailsClosedWhenObservationFails(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	failing := func(context.Context, ProductionEnvelopeObservation) error {
		return errors.New("observer down")
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  failing,
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(self))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusInternalServerError; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateFailsClosedWhenObservationPanics installs an observer
// that panics and expects the endpoint to still answer with a 500 status.
func TestMeshForwardingGateFailsClosedWhenObservationPanics(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	panicking := func(context.Context, ProductionEnvelopeObservation) error {
		panic("observer panic")
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  panicking,
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(self))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusInternalServerError; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestObserveProductionEnvelopeAllowsNilObserver verifies that
// observeProductionEnvelope returns no error when given a nil observer.
func TestObserveProductionEnvelopeAllowsNilObserver(t *testing.T) {
	err := observeProductionEnvelope(context.Background(), nil, ProductionEnvelopeObservation{})
	if err == nil {
		return
	}
	t.Fatalf("observeProductionEnvelope nil observer err = %v", err)
}
// TestProductionEnvelopeObservationSinkKeepsBoundedMetadata pushes three
// envelopes through a sink of capacity two and checks that only the two newest
// observations remain, that payload metadata is kept, and that the sink
// metrics report three accepted and one dropped.
func TestProductionEnvelopeObservationSinkKeepsBoundedMetadata(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	sink := NewProductionEnvelopeObservationSink(2)
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  sink.Observe,
	}.Handler())
	defer srv.Close()

	for _, messageID := range []string{"message-1", "message-2", "message-3"} {
		env := validProductionEnvelope(self)
		env.MessageID = messageID
		body, err := json.Marshal(env)
		if err != nil {
			t.Fatalf("marshal envelope: %v", err)
		}
		res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
		if err != nil {
			t.Fatalf("post forward: %v", err)
		}
		// Closed inline rather than deferred so each iteration releases its body.
		res.Body.Close()
		if got, want := res.StatusCode, http.StatusNotImplemented; got != want {
			t.Fatalf("status = %d, want %d", got, want)
		}
	}

	kept := sink.Snapshot()
	if len(kept) != 2 {
		t.Fatalf("observation count = %d, want 2", len(kept))
	}
	if kept[0].MessageID != "message-2" || kept[1].MessageID != "message-3" {
		t.Fatalf("unexpected bounded observations: %+v", kept)
	}
	if kept[0].PayloadHash == "" || kept[0].PayloadLength == 0 {
		t.Fatalf("payload metadata missing from bounded observation: %+v", kept[0])
	}

	stats := sink.Metrics()
	statsOK := stats.Capacity == 2 && stats.CurrentDepth == 2 && stats.AcceptedTotal == 3 && stats.DroppedOldest == 1
	if !statsOK {
		t.Fatalf("unexpected sink metrics: %+v", stats)
	}
}
// TestProductionEnvelopeObservationSinkMetricsStartEmpty checks that a fresh
// sink reports its configured capacity and zero for every counter.
func TestProductionEnvelopeObservationSinkMetricsStartEmpty(t *testing.T) {
	stats := NewProductionEnvelopeObservationSink(3).Metrics()
	pristine := stats.Capacity == 3 &&
		stats.CurrentDepth == 0 &&
		stats.AcceptedTotal == 0 &&
		stats.DroppedOldest == 0
	if !pristine {
		t.Fatalf("unexpected empty metrics: %+v", stats)
	}
}
// TestMeshForwardingGateRejectsServiceChannel sets the envelope's channel
// class to "render" and expects 403 Forbidden.
func TestMeshForwardingGateRejectsServiceChannel(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{Local: self, ProductionForwardingEnabled: true}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	env.ChannelClass = "render"

	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()
	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
func TestMeshForwardingRequiresPost(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Get(server.URL + "/mesh/v1/forward")
if err != nil {
t.Fatalf("get forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusMethodNotAllowed {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusMethodNotAllowed)
}
}
// validProductionEnvelope returns a fabric-control envelope for "route-1"
// travelling node-a -> local -> node-c in local's cluster. Its payload length
// and SHA-256 hash match the embedded payload, and it expires one minute after
// its creation time (the current UTC time).
func validProductionEnvelope(local PeerIdentity) ProductionEnvelope {
	body := json.RawMessage(`{"kind":"control"}`)
	digest := sha256.Sum256(body)
	createdAt := time.Now().UTC()

	var env ProductionEnvelope
	env.FabricProtocolVersion = ProtocolVersion
	env.MessageID = "message-1"
	env.RouteID = "route-1"
	env.ClusterID = local.ClusterID
	env.SourceNodeID = "node-a"
	env.DestinationNodeID = "node-c"
	env.CurrentHopNodeID = local.NodeID
	env.NextHopNodeID = "node-c"
	env.ChannelClass = ProductionChannelFabricControl
	env.MessageType = ProductionMessageFabricControl
	env.TTL = 4
	env.HopCount = 1
	env.CreatedAt = createdAt
	env.ExpiresAt = createdAt.Add(time.Minute)
	env.PayloadLength = len(body)
	env.PayloadHash = hex.EncodeToString(digest[:])
	env.Payload = body
	return env
}
// configuredProductionRoute builds a route in "cluster-1" over hops, using the
// first hop as source and the last as destination. The route allows only the
// fabric-control channel, expires in one hour, and caps TTL and hops at 8.
// hops must be non-empty; it is copied so the caller's slice is not shared.
func configuredProductionRoute(routeID string, hops []string) SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]
	path := append([]string{}, hops...)

	var route SyntheticRoute
	route.RouteID = routeID
	route.ClusterID = "cluster-1"
	route.SourceNodeID = source
	route.DestinationNodeID = destination
	route.Hops = path
	route.AllowedChannels = []string{ProductionChannelFabricControl}
	route.ExpiresAt = time.Now().UTC().Add(time.Hour)
	route.MaxTTL = 8
	route.MaxHops = 8
	return route
}
// hasProductionForwardEvent reports whether any entry in events carries the
// given event name.
func hasProductionForwardEvent(events []ProductionForwardLogEntry, event string) bool {
	for i := range events {
		if events[i].Event == event {
			return true
		}
	}
	return false
}
func TestSyntheticEndpointDisabledByDefault(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Post(server.URL+"/mesh/v1/synthetic/probe", "application/json", bytes.NewReader([]byte(`{}`)))
if err != nil {
t.Fatalf("post synthetic probe: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusServiceUnavailable {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusServiceUnavailable)
}
}
@@ -0,0 +1,280 @@
package mesh
import (
"sync"
"time"
)
// SyntheticRelaySchedulerConfig holds the settings NewSyntheticRelayScheduler
// uses to build a SyntheticRelayScheduler.
type SyntheticRelaySchedulerConfig struct {
// Enabled turns the scheduler on; when false, Enqueue and Dequeue return
// ErrMeshRuntimeDisabled.
Enabled bool
// Local identifies this node; envelopes must be addressed to it and belong
// to its cluster.
Local PeerIdentity
// QueuePolicies defines one queue per channel. Slice order is the dequeue
// priority order. When empty, built-in defaults for fabric control, route
// control and telemetry are used.
QueuePolicies []SyntheticRelayQueuePolicy
// AllowedChannels adds channels to the allowed set. A channel still needs a
// queue policy to be accepted by the scheduler.
AllowedChannels []string
// AllowedMessageTypes lists accepted message types; when empty, a built-in
// default list is used.
AllowedMessageTypes []string
// Now supplies timestamps for log entries; defaults to time.Now in UTC.
Now func() time.Time
// Logger receives scheduler log entries; nil disables logging.
Logger func(SyntheticLogEntry)
}
// SyntheticRelayScheduler queues synthetic envelopes per channel and hands
// them back in channel priority order. Build it with
// NewSyntheticRelayScheduler. It contains a mutex, so use it by pointer.
type SyntheticRelayScheduler struct {
// Configuration fixed at construction time.
enabled bool
local PeerIdentity
// policies maps a channel name to its queue policy.
policies map[string]SyntheticRelayQueuePolicy
allowedChannels map[string]struct{}
allowedMessageTypes map[string]struct{}
// priorityOrder lists channels in the order Dequeue scans them.
priorityOrder []string
now func() time.Time
logger func(SyntheticLogEntry)
// mu guards queues and metrics.
mu sync.Mutex
// queues holds the pending envelopes per channel, oldest first.
queues map[string][]SyntheticEnvelope
metrics SyntheticRelayQueueMetrics
}
// NewSyntheticRelayScheduler builds a scheduler from cfg.
//
// When cfg.QueuePolicies is empty, three default queues are used, in priority
// order: fabric control (64, reliable), route control (64, reliable) and
// telemetry (16, droppable). Policies with an empty channel are skipped and a
// non-positive capacity is raised to 1. Every policy channel is allowed, and
// cfg.AllowedChannels adds to that set. When cfg.AllowedMessageTypes is empty
// the built-in synthetic message types are allowed. A nil cfg.Now defaults to
// the current UTC time.
func NewSyntheticRelayScheduler(cfg SyntheticRelaySchedulerConfig) *SyntheticRelayScheduler {
	queuePolicies := cfg.QueuePolicies
	if len(queuePolicies) == 0 {
		queuePolicies = []SyntheticRelayQueuePolicy{
			{Channel: SyntheticChannelFabricControl, Capacity: 64, Droppable: false},
			{Channel: SyntheticChannelRouteControl, Capacity: 64, Droppable: false},
			{Channel: SyntheticChannelTelemetry, Capacity: 16, Droppable: true},
		}
	}
	acceptedTypes := cfg.AllowedMessageTypes
	if len(acceptedTypes) == 0 {
		acceptedTypes = []string{
			SyntheticMessageProbe,
			SyntheticMessageProbeAck,
			SyntheticMessageRouteHealth,
			SyntheticMessageRouteHealthAck,
			SyntheticMessageTelemetry,
			SyntheticMessageTestService,
			SyntheticMessageTestServiceAck,
		}
	}
	clock := cfg.Now
	if clock == nil {
		clock = func() time.Time { return time.Now().UTC() }
	}

	scheduler := &SyntheticRelayScheduler{
		enabled:             cfg.Enabled,
		local:               cfg.Local,
		policies:            map[string]SyntheticRelayQueuePolicy{},
		allowedChannels:     map[string]struct{}{},
		allowedMessageTypes: map[string]struct{}{},
		priorityOrder:       make([]string, 0, len(queuePolicies)),
		now:                 clock,
		logger:              cfg.Logger,
		queues:              map[string][]SyntheticEnvelope{},
		metrics: SyntheticRelayQueueMetrics{
			QueueDepths: map[string]int{},
		},
	}

	// The order of the policy list becomes the dequeue priority order.
	for _, queuePolicy := range queuePolicies {
		if queuePolicy.Channel == "" {
			continue
		}
		if queuePolicy.Capacity <= 0 {
			queuePolicy.Capacity = 1
		}
		scheduler.policies[queuePolicy.Channel] = queuePolicy
		scheduler.allowedChannels[queuePolicy.Channel] = struct{}{}
		scheduler.priorityOrder = append(scheduler.priorityOrder, queuePolicy.Channel)
	}
	for _, extra := range cfg.AllowedChannels {
		if extra == "" {
			continue
		}
		scheduler.allowedChannels[extra] = struct{}{}
	}
	for _, accepted := range acceptedTypes {
		if accepted == "" {
			continue
		}
		scheduler.allowedMessageTypes[accepted] = struct{}{}
	}
	return scheduler
}
// Enqueue validates envelope and appends it to the queue of its channel.
//
// An invalid envelope is counted as rejected, logged, and its validation error
// is returned. When the channel queue is at capacity, a non-droppable queue
// rejects the envelope with ErrSyntheticRelayQueueFull, while a droppable
// queue evicts its oldest envelope and reports that in the result (Dropped and
// DroppedSequence). A nil or disabled scheduler returns
// ErrMeshRuntimeDisabled.
//
// Log entries are emitted after the mutex is released so the logger never runs
// under the scheduler lock.
func (s *SyntheticRelayScheduler) Enqueue(envelope SyntheticEnvelope) (SyntheticRelayEnqueueResult, error) {
	if err := s.validateEnvelope(envelope); err != nil {
		s.reject(envelope, err)
		return SyntheticRelayEnqueueResult{}, err
	}
	policy := s.policies[envelope.Channel]
	result := SyntheticRelayEnqueueResult{
		Channel:          envelope.Channel,
		QueueCapacity:    policy.Capacity,
		AcceptedSequence: envelope.Sequence,
	}
	s.mu.Lock()
	queue := s.queues[envelope.Channel]
	if len(queue) >= policy.Capacity {
		if !policy.Droppable {
			s.metrics.Rejected++
			s.metrics.LastRejectReason = ErrSyntheticRelayQueueFull.Error()
			s.mu.Unlock()
			s.log(SyntheticLogEntry{
				Event:         "fabric_relay_rejected",
				RouteID:       envelope.RouteID,
				ClusterID:     envelope.ClusterID,
				LocalNodeID:   s.local.NodeID,
				Channel:       envelope.Channel,
				MessageType:   envelope.MessageType,
				Reason:        ErrSyntheticRelayQueueFull.Error(),
				QueueDepth:    len(queue),
				QueueCapacity: policy.Capacity,
				OccurredAt:    s.now(),
			})
			return SyntheticRelayEnqueueResult{}, ErrSyntheticRelayQueueFull
		}
		result.Dropped = true
		result.DroppedSequence = queue[0].Sequence
		// Clear the evicted slot before reslicing: the backing array outlives
		// the reslice and would otherwise keep the dropped envelope reachable.
		queue[0] = SyntheticEnvelope{}
		queue = queue[1:]
		s.metrics.Dropped++
	}
	queue = append(queue, envelope)
	s.queues[envelope.Channel] = queue
	result.QueueDepth = len(queue)
	s.metrics.Enqueued++
	s.metrics.QueueDepths[envelope.Channel] = len(queue)
	s.mu.Unlock()
	s.log(SyntheticLogEntry{
		Event:           "fabric_relay_enqueued",
		RouteID:         envelope.RouteID,
		ClusterID:       envelope.ClusterID,
		LocalNodeID:     s.local.NodeID,
		Channel:         envelope.Channel,
		MessageType:     envelope.MessageType,
		QueueDepth:      result.QueueDepth,
		QueueCapacity:   result.QueueCapacity,
		Dropped:         result.Dropped,
		DroppedSequence: result.DroppedSequence,
		OccurredAt:      s.now(),
	})
	return result, nil
}
// Dequeue removes and returns the oldest envelope from the first non-empty
// queue in priority order.
//
// It returns ErrMeshRuntimeDisabled when the scheduler is nil or disabled
// (matching how Enqueue treats a nil scheduler) and
// ErrSyntheticRelayQueueEmpty when every queue is empty. The log entry is
// emitted after the mutex is released.
func (s *SyntheticRelayScheduler) Dequeue() (SyntheticEnvelope, error) {
	if s == nil || !s.enabled {
		return SyntheticEnvelope{}, ErrMeshRuntimeDisabled
	}
	s.mu.Lock()
	for _, channel := range s.priorityOrder {
		queue := s.queues[channel]
		if len(queue) == 0 {
			continue
		}
		envelope := queue[0]
		// Clear the vacated slot so the backing array does not keep the
		// returned envelope's data reachable after it leaves the queue.
		queue[0] = SyntheticEnvelope{}
		queue = queue[1:]
		s.queues[channel] = queue
		s.metrics.Dequeued++
		s.metrics.QueueDepths[channel] = len(queue)
		s.mu.Unlock()
		s.log(SyntheticLogEntry{
			Event:         "fabric_relay_dequeued",
			RouteID:       envelope.RouteID,
			ClusterID:     envelope.ClusterID,
			LocalNodeID:   s.local.NodeID,
			Channel:       envelope.Channel,
			MessageType:   envelope.MessageType,
			QueueDepth:    len(queue),
			QueueCapacity: s.policies[channel].Capacity,
			OccurredAt:    s.now(),
		})
		return envelope, nil
	}
	s.mu.Unlock()
	return SyntheticEnvelope{}, ErrSyntheticRelayQueueEmpty
}
// SnapshotQueueMetrics returns a copy of the scheduler counters together with
// the current depth of every queue. The returned QueueDepths map is owned by
// the caller. A nil scheduler yields zeroed metrics with an empty map, in line
// with the other nil-tolerant methods.
func (s *SyntheticRelayScheduler) SnapshotQueueMetrics() SyntheticRelayQueueMetrics {
	if s == nil {
		return SyntheticRelayQueueMetrics{QueueDepths: map[string]int{}}
	}
	s.mu.Lock()
	defer s.mu.Unlock()
	depths := make(map[string]int, len(s.metrics.QueueDepths))
	for channel, depth := range s.metrics.QueueDepths {
		depths[channel] = depth
	}
	// Live queue lengths take precedence over the recorded depths.
	for channel, queue := range s.queues {
		depths[channel] = len(queue)
	}
	return SyntheticRelayQueueMetrics{
		Enqueued:         s.metrics.Enqueued,
		Dequeued:         s.metrics.Dequeued,
		Dropped:          s.metrics.Dropped,
		Rejected:         s.metrics.Rejected,
		LastRejectReason: s.metrics.LastRejectReason,
		QueueDepths:      depths,
	}
}
// validateEnvelope checks envelope against the scheduler's configuration and
// returns the error of the first failing check, or nil when it may be queued.
// Checks run in this order: scheduler enabled, protocol version, route ID,
// cluster, sender, recipient (must be the local node), TTL, hop count, loop
// detection, channel allowed and backed by a policy, message type allowed.
func (s *SyntheticRelayScheduler) validateEnvelope(envelope SyntheticEnvelope) error {
	if s == nil || !s.enabled {
		return ErrMeshRuntimeDisabled
	}
	self := s.local
	switch {
	case envelope.ProtocolVersion != ProtocolVersion:
		return ErrUnsupportedSyntheticMessage
	case envelope.RouteID == "":
		return ErrRouteIDRequired
	case envelope.ClusterID == "" || envelope.ClusterID != self.ClusterID:
		return ErrClusterMismatch
	case envelope.From.ClusterID != self.ClusterID || envelope.From.NodeID == "":
		return ErrNodeMismatch
	case envelope.To.ClusterID != self.ClusterID || envelope.To.NodeID != self.NodeID:
		return ErrNodeMismatch
	case envelope.TTL <= 0:
		return ErrTTLExhausted
	case envelope.HopCount <= 0:
		return ErrInvalidRoutePath
	case contains(envelope.Visited, self.NodeID):
		return ErrLoopDetected
	}
	// A channel must be both allowed and have a queue policy.
	_, channelAllowed := s.allowedChannels[envelope.Channel]
	_, channelQueued := s.policies[envelope.Channel]
	if !channelAllowed || !channelQueued {
		return ErrUnauthorizedChannel
	}
	if _, typeAllowed := s.allowedMessageTypes[envelope.MessageType]; !typeAllowed {
		return ErrUnsupportedSyntheticMessage
	}
	return nil
}
// reject records a rejected envelope: it increments the Rejected counter,
// stores the error text as the last reject reason, and emits a
// "fabric_relay_rejected" log entry. It does nothing on a nil scheduler.
func (s *SyntheticRelayScheduler) reject(envelope SyntheticEnvelope, err error) {
	var reason string
	if err != nil {
		reason = err.Error()
	}
	if s == nil {
		return
	}

	s.mu.Lock()
	s.metrics.Rejected++
	s.metrics.LastRejectReason = reason
	s.mu.Unlock()

	// Logged outside the lock so the logger never runs under the mutex.
	entry := SyntheticLogEntry{
		Event:       "fabric_relay_rejected",
		RouteID:     envelope.RouteID,
		ClusterID:   envelope.ClusterID,
		LocalNodeID: s.local.NodeID,
		Channel:     envelope.Channel,
		MessageType: envelope.MessageType,
		Reason:      reason,
		OccurredAt:  s.now(),
	}
	s.log(entry)
}
// log forwards entry to the configured logger; it is a no-op when no logger
// was configured.
func (s *SyntheticRelayScheduler) log(entry SyntheticLogEntry) {
	if s.logger == nil {
		return
	}
	s.logger(entry)
}
@@ -0,0 +1,213 @@
package mesh
import (
"errors"
"testing"
)
// TestSyntheticRelaySchedulerDequeuesByQoSPriority enqueues telemetry, route
// control and fabric control (lowest priority first) and expects them back in
// the order fabric control, route control, telemetry.
func TestSyntheticRelaySchedulerDequeuesByQoSPriority(t *testing.T) {
	sched := testRelayScheduler()
	lowPriority := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
	midPriority := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)
	highPriority := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 3)

	if _, err := sched.Enqueue(lowPriority); err != nil {
		t.Fatalf("enqueue telemetry: %v", err)
	}
	if _, err := sched.Enqueue(midPriority); err != nil {
		t.Fatalf("enqueue route control: %v", err)
	}
	if _, err := sched.Enqueue(highPriority); err != nil {
		t.Fatalf("enqueue fabric control: %v", err)
	}

	gotFirst, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue first: %v", err)
	}
	gotSecond, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue second: %v", err)
	}
	gotThird, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue third: %v", err)
	}

	if channel := gotFirst.Channel; channel != SyntheticChannelFabricControl {
		t.Fatalf("first channel = %q, want fabric_control", channel)
	}
	if channel := gotSecond.Channel; channel != SyntheticChannelRouteControl {
		t.Fatalf("second channel = %q, want route_control", channel)
	}
	if channel := gotThird.Channel; channel != SyntheticChannelTelemetry {
		t.Fatalf("third channel = %q, want telemetry", channel)
	}
}
// TestSyntheticRelaySchedulerDropsOldestTelemetryOnly fills the capacity-1
// droppable telemetry queue, enqueues a second envelope, and expects the first
// to be evicted (sequence 1) while the second remains and is dequeued.
func TestSyntheticRelaySchedulerDropsOldestTelemetryOnly(t *testing.T) {
	sched := testRelayScheduler()
	older := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
	newer := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 2)

	firstResult, err := sched.Enqueue(older)
	if err != nil || firstResult.Dropped {
		t.Fatalf("enqueue first result=%+v err=%v", firstResult, err)
	}
	secondResult, err := sched.Enqueue(newer)
	if err != nil {
		t.Fatalf("enqueue second: %v", err)
	}
	if evictedOldest := secondResult.Dropped && secondResult.DroppedSequence == 1; !evictedOldest {
		t.Fatalf("result = %+v, want dropped sequence 1", secondResult)
	}

	survivor, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue: %v", err)
	}
	if survivor.Sequence != 2 {
		t.Fatalf("dequeued sequence = %d, want 2", survivor.Sequence)
	}

	stats := sched.SnapshotQueueMetrics()
	if stats.Dropped != 1 || stats.Enqueued != 2 {
		t.Fatalf("metrics = %+v, want one drop and two enqueues", stats)
	}
}
// TestSyntheticRelaySchedulerRejectsFullReliableQueue fills the capacity-1
// non-droppable fabric-control queue and expects a second enqueue to fail with
// ErrSyntheticRelayQueueFull, leaving the first envelope in place.
func TestSyntheticRelaySchedulerRejectsFullReliableQueue(t *testing.T) {
	sched := testRelayScheduler()
	accepted := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
	overflow := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 2)

	if _, err := sched.Enqueue(accepted); err != nil {
		t.Fatalf("enqueue first: %v", err)
	}
	if _, err := sched.Enqueue(overflow); !errors.Is(err, ErrSyntheticRelayQueueFull) {
		t.Fatalf("err = %v, want ErrSyntheticRelayQueueFull", err)
	}

	head, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue: %v", err)
	}
	if head.Sequence != 1 {
		t.Fatalf("dequeued sequence = %d, want 1", head.Sequence)
	}

	stats := sched.SnapshotQueueMetrics()
	if stats.Dropped != 0 || stats.Rejected != 1 {
		t.Fatalf("metrics = %+v, want no drop and one rejection", stats)
	}
}
// TestSyntheticRelaySchedulerRejectsInvalidEnvelopes applies one invalidating
// mutation per subtest to an otherwise valid envelope and checks that Enqueue
// fails with the matching sentinel error.
func TestSyntheticRelaySchedulerRejectsInvalidEnvelopes(t *testing.T) {
	type rejectionCase struct {
		name   string
		mutate func(*SyntheticEnvelope)
		want   error
	}
	cases := []rejectionCase{
		{
			name:   "wrong cluster",
			mutate: func(env *SyntheticEnvelope) { env.ClusterID = "cluster-2" },
			want:   ErrClusterMismatch,
		},
		{
			name:   "wrong node",
			mutate: func(env *SyntheticEnvelope) { env.To.NodeID = "node-x" },
			want:   ErrNodeMismatch,
		},
		{
			name:   "unauthorized channel",
			mutate: func(env *SyntheticEnvelope) { env.Channel = "rdp_render" },
			want:   ErrUnauthorizedChannel,
		},
		{
			name:   "unsupported message",
			mutate: func(env *SyntheticEnvelope) { env.MessageType = "rdp.input" },
			want:   ErrUnsupportedSyntheticMessage,
		},
		{
			name:   "ttl exhausted",
			mutate: func(env *SyntheticEnvelope) { env.TTL = 0 },
			want:   ErrTTLExhausted,
		},
		{
			// "node-r" is the test scheduler's own node ID.
			name:   "loop detected",
			mutate: func(env *SyntheticEnvelope) { env.Visited = append(env.Visited, "node-r") },
			want:   ErrLoopDetected,
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			sched := testRelayScheduler()
			env := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
			tc.mutate(&env)
			if _, err := sched.Enqueue(env); !errors.Is(err, tc.want) {
				t.Fatalf("err = %v, want %v", err, tc.want)
			}
		})
	}
}
func TestSyntheticRelaySchedulerDisabledRejects(t *testing.T) {
scheduler := NewSyntheticRelayScheduler(SyntheticRelaySchedulerConfig{
Enabled: false,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
})
_, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1))
if !errors.Is(err, ErrMeshRuntimeDisabled) {
t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
if _, err := scheduler.Dequeue(); !errors.Is(err, ErrMeshRuntimeDisabled) {
t.Fatalf("dequeue err = %v, want ErrMeshRuntimeDisabled", err)
}
}
// TestSyntheticRelaySchedulerQueueDepthSnapshot enqueues one envelope on each
// of two channels and expects the metrics snapshot to report depth 1 for both.
func TestSyntheticRelaySchedulerQueueDepthSnapshot(t *testing.T) {
	sched := testRelayScheduler()
	fabricEnv := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
	if _, err := sched.Enqueue(fabricEnv); err != nil {
		t.Fatalf("enqueue fabric control: %v", err)
	}
	routeEnv := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)
	if _, err := sched.Enqueue(routeEnv); err != nil {
		t.Fatalf("enqueue route control: %v", err)
	}

	depths := sched.SnapshotQueueMetrics().QueueDepths
	if depths[SyntheticChannelFabricControl] != 1 {
		t.Fatalf("fabric_control depth = %d, want 1", depths[SyntheticChannelFabricControl])
	}
	if depths[SyntheticChannelRouteControl] != 1 {
		t.Fatalf("route_control depth = %d, want 1", depths[SyntheticChannelRouteControl])
	}
}
// testRelayScheduler returns an enabled scheduler for node-r in cluster-1 with
// capacity-1 queues: fabric control and route control are reliable, telemetry
// is droppable.
func testRelayScheduler() *SyntheticRelayScheduler {
	tinyQueues := []SyntheticRelayQueuePolicy{
		{Channel: SyntheticChannelFabricControl, Capacity: 1, Droppable: false},
		{Channel: SyntheticChannelRouteControl, Capacity: 1, Droppable: false},
		{Channel: SyntheticChannelTelemetry, Capacity: 1, Droppable: true},
	}
	cfg := SyntheticRelaySchedulerConfig{
		Enabled:       true,
		Local:         PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
		QueuePolicies: tinyQueues,
	}
	return NewSyntheticRelayScheduler(cfg)
}
// testRelayEnvelope builds an envelope from node-a to node-r on the
// "route-relay-scheduler" test route and overrides its channel, message type
// and sequence number with the given values.
func testRelayEnvelope(channel string, messageType string, sequence uint64) SyntheticEnvelope {
	relayRoute := testRoute("route-relay-scheduler", []string{"node-a", "node-r", "node-b"})
	env := testEnvelope(relayRoute, "node-a", "node-r")
	env.Channel, env.MessageType, env.Sequence = channel, messageType, sequence
	return env
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,432 @@
package mesh
import (
"context"
"encoding/json"
"errors"
"testing"
"time"
)
// syntheticTestTransport is an in-memory transport for tests: it delivers an
// envelope by calling Receive on the runtime registered for the target node.
type syntheticTestTransport struct {
// nodes maps a node ID to its runtime. The map is shared between copies of
// the transport, so runtimes may be registered after the copies are made.
nodes map[string]*SyntheticRuntime
}
// SendSynthetic hands envelope to the runtime registered under nextNodeID and
// returns its reply. It returns ErrSyntheticPeerUnavailable when no runtime is
// registered for that node.
func (t syntheticTestTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	if target := t.nodes[nextNodeID]; target != nil {
		return target.Receive(ctx, envelope)
	}
	return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
}
// TestSyntheticRuntimeDirectProbe sends a probe over a two-node route and
// checks the ack's type, its peers, the recorded path, and that node-b counted
// one created ack.
func TestSyntheticRuntimeDirectProbe(t *testing.T) {
	route := testRoute("route-direct", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, route)
	receiver := testRuntime("node-b", transport, route)
	// Register after construction; the runtimes share the transport's map.
	transport.nodes["node-a"] = sender
	transport.nodes["node-b"] = receiver

	ack, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-direct")
	if err != nil {
		t.Fatalf("send probe: %v", err)
	}
	if ack.MessageType != SyntheticMessageProbeAck {
		t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
	}
	if ack.From.NodeID != "node-b" || ack.To.NodeID != "node-a" {
		t.Fatalf("unexpected ack peers: from=%+v to=%+v", ack.From, ack.To)
	}

	path := decodeAckPayload(t, ack).Path
	if len(path) != 2 || path[0] != "node-a" || path[1] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-b", path)
	}
	if created := receiver.SnapshotMetrics().ProbeAcksCreated; created != 1 {
		t.Fatalf("ProbeAcksCreated = %d, want 1", created)
	}
}
// TestSyntheticRuntimeSingleRelayProbe sends a probe over a three-node route
// and checks that the ack records the full path and that the relay node
// counted one forwarded probe.
func TestSyntheticRuntimeSingleRelayProbe(t *testing.T) {
	route := testRoute("route-relay", []string{"node-a", "node-r", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, route)
	relay := testRuntime("node-r", transport, route)
	receiver := testRuntime("node-b", transport, route)
	transport.nodes["node-a"] = sender
	transport.nodes["node-r"] = relay
	transport.nodes["node-b"] = receiver

	ack, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-relay")
	if err != nil {
		t.Fatalf("send probe: %v", err)
	}

	path := decodeAckPayload(t, ack).Path
	if len(path) != 3 || path[0] != "node-a" || path[1] != "node-r" || path[2] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", path)
	}
	if forwarded := relay.SnapshotMetrics().ProbesForwarded; forwarded != 1 {
		t.Fatalf("ProbesForwarded = %d, want 1", forwarded)
	}
}
// TestSyntheticRuntimeDisabledRejectsProbe checks that SendProbe on a runtime
// built with Enabled set to false fails with ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeDisabledRejectsProbe(t *testing.T) {
	route := testRoute("route-disabled", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	disabled := NewSyntheticRuntime(cfg)

	_, err := disabled.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-disabled")
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// TestSyntheticRuntimeRejectsWrongCluster delivers an envelope stamped with
// "cluster-2" and expects Receive to fail with ErrClusterMismatch.
func TestSyntheticRuntimeRejectsWrongCluster(t *testing.T) {
	route := testRoute("route-wrong-cluster", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)

	env := testEnvelope(route, "node-a", "node-b")
	env.ClusterID = "cluster-2"

	if _, err := receiver.Receive(context.Background(), env); !errors.Is(err, ErrClusterMismatch) {
		t.Fatalf("err = %v, want ErrClusterMismatch", err)
	}
}
// TestSyntheticRuntimeRejectsWrongNode delivers to node-b an envelope
// addressed to node-c and expects Receive to fail with ErrNodeMismatch.
func TestSyntheticRuntimeRejectsWrongNode(t *testing.T) {
	route := testRoute("route-wrong-node", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)
	misaddressed := testEnvelope(route, "node-a", "node-c")

	if _, err := receiver.Receive(context.Background(), misaddressed); !errors.Is(err, ErrNodeMismatch) {
		t.Fatalf("err = %v, want ErrNodeMismatch", err)
	}
}
// TestSyntheticRuntimeRejectsUnauthorizedChannel sends a probe on the
// "rdp_render" channel and expects ErrUnauthorizedChannel.
func TestSyntheticRuntimeRejectsUnauthorizedChannel(t *testing.T) {
	route := testRoute("route-unauthorized", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	_, err := sender.SendProbe(context.Background(), route.RouteID, "rdp_render", "probe-unauthorized")
	if errors.Is(err, ErrUnauthorizedChannel) {
		return
	}
	t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
}
// TestSyntheticRuntimeRejectsExpiredRoute verifies that a route whose
// ExpiresAt lies one minute in the past is rejected with ErrRouteExpired.
func TestSyntheticRuntimeRejectsExpiredRoute(t *testing.T) {
	expired := testRoute("route-expired", []string{"node-a", "node-b"})
	expired.ExpiresAt = time.Now().UTC().Add(-time.Minute)
	sender := testRuntime("node-a", syntheticTestTransport{}, expired)

	if _, err := sender.SendProbe(context.Background(), expired.RouteID, SyntheticChannelFabricControl, "probe-expired"); !errors.Is(err, ErrRouteExpired) {
		t.Fatalf("err = %v, want ErrRouteExpired", err)
	}
}
// TestSyntheticRuntimeRejectsTTLExhaustion verifies that a three-hop route
// limited to MaxTTL=1 fails with ErrTTLExhausted when probed from node-a.
func TestSyntheticRuntimeRejectsTTLExhaustion(t *testing.T) {
	route := testRoute("route-ttl", []string{"node-a", "node-r", "node-b"})
	route.MaxTTL = 1

	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, route)
	relay := testRuntime("node-r", transport, route)
	// Only the sender and the relay are registered; node-b is left out.
	transport.nodes["node-a"], transport.nodes["node-r"] = sender, relay

	_, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-ttl")
	if errors.Is(err, ErrTTLExhausted) {
		return
	}
	t.Fatalf("err = %v, want ErrTTLExhausted", err)
}
// TestSyntheticRuntimeRejectsLoop verifies that node-b rejects an envelope
// whose Visited list already contains node-b with ErrLoopDetected.
func TestSyntheticRuntimeRejectsLoop(t *testing.T) {
	route := testRoute("route-loop", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)

	looped := testEnvelope(route, "node-a", "node-b")
	looped.Visited = []string{"node-a", "node-b"}

	if _, err := receiver.Receive(context.Background(), looped); !errors.Is(err, ErrLoopDetected) {
		t.Fatalf("err = %v, want ErrLoopDetected", err)
	}
}
// TestSyntheticRuntimeRejectsUnavailablePeer verifies that probing through a
// transport with no registered nodes fails with ErrSyntheticPeerUnavailable.
func TestSyntheticRuntimeRejectsUnavailablePeer(t *testing.T) {
	route := testRoute("route-missing-peer", []string{"node-a", "node-b"})
	emptyTransport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", emptyTransport, route)

	_, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-missing-peer")
	if errors.Is(err, ErrSyntheticPeerUnavailable) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
}
// TestSyntheticRuntimeRouteHealthProbeRecordsSuccess sends a route health
// probe over a direct two-node route and checks the ack type, the recorded
// route observation (state, counters, versions) and the sender's metrics.
func TestSyntheticRuntimeRouteHealthProbeRecordsSuccess(t *testing.T) {
	ctx := context.Background()
	route := testRoute("route-health", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	source := testRuntime("node-a", transport, route)
	destination := testRuntime("node-b", transport, route)
	transport.nodes["node-a"], transport.nodes["node-b"] = source, destination

	result, err := source.SendRouteHealthProbe(ctx, route.RouteID, SyntheticChannelFabricControl, "probe-health")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}
	if result.Ack.MessageType != SyntheticMessageRouteHealthAck {
		t.Fatalf("MessageType = %q, want %q", result.Ack.MessageType, SyntheticMessageRouteHealthAck)
	}
	if result.FallbackUsed {
		t.Fatal("FallbackUsed = true, want false")
	}

	observation, found := source.SnapshotRouteObservation(route.RouteID)
	if !found {
		t.Fatal("route observation missing")
	}
	if !(observation.State == SyntheticRouteStateHealthy && observation.SuccessCount == 1) {
		t.Fatalf("observation = %+v, want healthy success", observation)
	}
	// The observation must carry the versions stamped on the route by testRoute.
	versionsMatch := observation.PolicyVersion == "policy-v1" &&
		observation.PeerDirectoryVersion == "peers-v1" &&
		observation.RouteVersion == "route-v1"
	if !versionsMatch {
		t.Fatalf("observation versions = %+v", observation)
	}

	metrics := source.SnapshotMetrics()
	if !(metrics.RouteHealthProbesSent == 1 && metrics.RouteDeliveriesSucceeded == 1) {
		t.Fatalf("metrics = %+v, want health probe success", metrics)
	}
}
// TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig configures every
// node with two definitions of the same route ID: a regular one via node-old
// (Routes) and a health-specific one via node-new (RouteHealthRoutes). It
// checks that route health probes travel the health-specific path while
// regular probes keep using the regular path.
func TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig(t *testing.T) {
	ctx := context.Background()
	base := testRoute("route-effective-health", []string{"node-a", "node-old", "node-b"})
	effective := testRoute("route-effective-health", []string{"node-a", "node-new", "node-b"})
	effective.RouteVersion = "decision-v1"

	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	newNode := func(nodeID string) *SyntheticRuntime {
		return testRuntimeWithRouteHealth(nodeID, transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
	}
	source := newNode("node-a")
	oldRelay := newNode("node-old")
	newRelay := newNode("node-new")
	destination := newNode("node-b")
	transport.nodes["node-a"] = source
	transport.nodes["node-old"] = oldRelay
	transport.nodes["node-new"] = newRelay
	transport.nodes["node-b"] = destination

	// Phase 1: the route health probe must follow the health-specific route.
	health, err := source.SendRouteHealthProbe(ctx, base.RouteID, SyntheticChannelFabricControl, "probe-health-effective")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}
	wantHealthPath := []string{"node-a", "node-new", "node-b"}
	if gotHealthPath := decodeAckPayload(t, health.Ack).Path; !sameStrings(gotHealthPath, wantHealthPath) {
		t.Fatalf("route health path = %v, want %v", gotHealthPath, wantHealthPath)
	}
	if newRelay.SnapshotMetrics().ProbesForwarded != 1 {
		t.Fatalf("node-new forwarded = %d, want 1", newRelay.SnapshotMetrics().ProbesForwarded)
	}
	if oldRelay.SnapshotMetrics().ProbesForwarded != 0 {
		t.Fatalf("node-old forwarded = %d, want 0 before regular probe", oldRelay.SnapshotMetrics().ProbesForwarded)
	}
	observation, found := source.SnapshotRouteObservation(base.RouteID)
	if !(found && observation.RouteVersion == "decision-v1") {
		t.Fatalf("route health observation = %+v, want decision route version", observation)
	}

	// Phase 2: a regular probe must still follow the regular route.
	probe, err := source.SendProbe(ctx, base.RouteID, SyntheticChannelFabricControl, "probe-regular")
	if err != nil {
		t.Fatalf("send regular probe: %v", err)
	}
	wantProbePath := []string{"node-a", "node-old", "node-b"}
	if gotProbePath := decodeAckPayload(t, probe).Path; !sameStrings(gotProbePath, wantProbePath) {
		t.Fatalf("regular probe path = %v, want %v", gotProbePath, wantProbePath)
	}
	if oldRelay.SnapshotMetrics().ProbesForwarded != 1 {
		t.Fatalf("node-old forwarded = %d, want 1 after regular probe", oldRelay.SnapshotMetrics().ProbesForwarded)
	}
}
// TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable leaves
// the preferred route's relay (node-r) unregistered in the transport and
// checks that the health probe succeeds over the direct fallback route, with
// both route observations and the fallback metrics recorded on the sender.
func TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable(t *testing.T) {
	preferred := testRoute("route-preferred", []string{"node-a", "node-r", "node-b"})
	fallback := testRoute("route-fallback", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	source := testRuntime("node-a", transport, preferred, fallback)
	destination := testRuntime("node-b", transport, preferred, fallback)
	transport.nodes["node-a"], transport.nodes["node-b"] = source, destination

	fallbackIDs := []string{fallback.RouteID}
	result, err := source.SendRouteHealthProbeWithFallback(context.Background(), preferred.RouteID, fallbackIDs, SyntheticChannelFabricControl, "probe-fallback")
	if err != nil {
		t.Fatalf("send route health probe with fallback: %v", err)
	}
	if !result.FallbackUsed {
		t.Fatal("FallbackUsed = false, want true")
	}
	if result.SelectedRouteID != fallback.RouteID {
		t.Fatalf("SelectedRouteID = %q, want %q", result.SelectedRouteID, fallback.RouteID)
	}

	preferredObservation, found := source.SnapshotRouteObservation(preferred.RouteID)
	if !found {
		t.Fatal("preferred route observation missing")
	}
	if !(preferredObservation.State == SyntheticRouteStateFailed && preferredObservation.FailureCount == 1) {
		t.Fatalf("preferred observation = %+v, want failed", preferredObservation)
	}

	fallbackObservation, found := source.SnapshotRouteObservation(fallback.RouteID)
	if !found {
		t.Fatal("fallback route observation missing")
	}
	if !(fallbackObservation.State == SyntheticRouteStateHealthy && fallbackObservation.SuccessCount == 1) {
		t.Fatalf("fallback observation = %+v, want healthy", fallbackObservation)
	}

	metrics := source.SnapshotMetrics()
	promoted := metrics.FallbackRoutesUsed == 1 && metrics.WarmRoutesPromoted == 1 && metrics.RouteDeliveriesFailed == 1
	if !promoted {
		t.Fatalf("metrics = %+v, want fallback promotion and one failed delivery", metrics)
	}
}
// TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange records one route
// observation, then invalidates the cache with a newer policy version and
// checks that the observation is dropped and the invalidation is counted.
func TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange(t *testing.T) {
	route := testRoute("route-cache", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	source := testRuntime("node-a", transport, route)
	destination := testRuntime("node-b", transport, route)
	transport.nodes["node-a"], transport.nodes["node-b"] = source, destination

	_, err := source.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}
	if _, cached := source.SnapshotRouteObservation(route.RouteID); !cached {
		t.Fatal("route observation missing before invalidation")
	}

	newer := SyntheticRouteCacheVersion{PolicyVersion: "policy-v2"}
	if invalidated := source.InvalidateRouteCache("policy_changed", newer); invalidated != 1 {
		t.Fatalf("invalidated = %d, want 1", invalidated)
	}
	if _, cached := source.SnapshotRouteObservation(route.RouteID); cached {
		t.Fatal("route observation still present after invalidation")
	}
	if source.SnapshotMetrics().RouteCacheInvalidations != 1 {
		t.Fatalf("RouteCacheInvalidations = %d, want 1", source.SnapshotMetrics().RouteCacheInvalidations)
	}
}
// TestSyntheticRuntimeRouteCacheKeepsCurrentVersion checks that invalidating
// with the same route, policy and peer-directory versions that testRoute
// stamps on the route removes nothing from the observation cache.
func TestSyntheticRuntimeRouteCacheKeepsCurrentVersion(t *testing.T) {
	route := testRoute("route-cache-current", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	source := testRuntime("node-a", transport, route)
	destination := testRuntime("node-b", transport, route)
	transport.nodes["node-a"], transport.nodes["node-b"] = source, destination

	_, err := source.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache-current")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}

	current := SyntheticRouteCacheVersion{
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
	if invalidated := source.InvalidateRouteCache("same_versions", current); invalidated != 0 {
		t.Fatalf("invalidated = %d, want 0", invalidated)
	}
	if _, cached := source.SnapshotRouteObservation(route.RouteID); !cached {
		t.Fatal("route observation missing after same-version invalidation")
	}
}
// TestSyntheticRuntimeRouteHealthDisabledRejects verifies that a runtime built
// with Enabled=false rejects SendRouteHealthProbeWithFallback with
// ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeRouteHealthDisabledRejects(t *testing.T) {
	route := testRoute("route-health-disabled", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	disabled := NewSyntheticRuntime(cfg)

	fallbackIDs := []string{"route-fallback"}
	_, err := disabled.SendRouteHealthProbeWithFallback(context.Background(), route.RouteID, fallbackIDs, SyntheticChannelFabricControl, "probe-disabled-health")
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// testRuntime builds an enabled SyntheticRuntime for nodeID in "cluster-1"
// with the given transport and routes, using MaxTTL and MaxHops of 8.
func testRuntime(nodeID string, transport SyntheticTransport, routes ...SyntheticRoute) *SyntheticRuntime {
	cfg := SyntheticRuntimeConfig{
		Enabled:   true,
		Local:     PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID},
		Routes:    routes,
		Transport: transport,
		MaxTTL:    8,
		MaxHops:   8,
	}
	return NewSyntheticRuntime(cfg)
}
// testRuntimeWithRouteHealth builds an enabled SyntheticRuntime for nodeID in
// "cluster-1" like testRuntime, and additionally sets RouteHealthRoutes to
// routeHealthRoutes.
func testRuntimeWithRouteHealth(nodeID string, transport SyntheticTransport, routes []SyntheticRoute, routeHealthRoutes []SyntheticRoute) *SyntheticRuntime {
	cfg := SyntheticRuntimeConfig{
		Enabled:           true,
		Local:             PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID},
		Routes:            routes,
		RouteHealthRoutes: routeHealthRoutes,
		Transport:         transport,
		MaxTTL:            8,
		MaxHops:           8,
	}
	return NewSyntheticRuntime(cfg)
}
// testRoute returns a route in "cluster-1" over hops. The first hop is the
// source and the last hop the destination, so hops must be non-empty. The
// route allows only SyntheticChannelFabricControl, expires one hour from now
// and carries the fixed versions "route-v1", "policy-v1" and "peers-v1".
func testRoute(routeID string, hops []string) SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]
	expiry := time.Now().UTC().Add(time.Hour)

	route := SyntheticRoute{
		RouteID:              routeID,
		ClusterID:            "cluster-1",
		SourceNodeID:         source,
		DestinationNodeID:    destination,
		Hops:                 hops,
		AllowedChannels:      []string{SyntheticChannelFabricControl},
		ExpiresAt:            expiry,
		MaxTTL:               8,
		MaxHops:              8,
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
	return route
}
// testEnvelope builds a probe envelope for route, sent from fromNodeID to
// toNodeID on SyntheticChannelFabricControl. The envelope starts with TTL 8,
// HopCount 1, Sequence 1 and a Visited list containing only fromNodeID; its
// payload is a JSON-encoded SyntheticProbePayload with ProbeID "probe-test".
//
// It panics if the payload cannot be marshalled, matching
// mustMarshalTestServiceRequest, so a broken fixture fails loudly instead of
// producing an envelope with an empty payload.
func testEnvelope(route SyntheticRoute, fromNodeID string, toNodeID string) SyntheticEnvelope {
	payload, err := json.Marshal(SyntheticProbePayload{
		ProbeID: "probe-test",
		SentAt:  time.Now().UTC(),
	})
	if err != nil {
		panic(err)
	}
	return SyntheticEnvelope{
		ProtocolVersion: ProtocolVersion,
		RouteID:         route.RouteID,
		ClusterID:       route.ClusterID,
		From:            PeerIdentity{ClusterID: route.ClusterID, NodeID: fromNodeID},
		To:              PeerIdentity{ClusterID: route.ClusterID, NodeID: toNodeID},
		Channel:         SyntheticChannelFabricControl,
		MessageType:     SyntheticMessageProbe,
		TTL:             8,
		HopCount:        1,
		Visited:         []string{fromNodeID},
		Sequence:        1,
		SentAt:          time.Now().UTC(),
		Payload:         payload,
	}
}
// decodeAckPayload unmarshals envelope.Payload into a SyntheticProbeAckPayload
// and fails the calling test immediately if the payload is not valid JSON.
func decodeAckPayload(t *testing.T, envelope SyntheticEnvelope) SyntheticProbeAckPayload {
	t.Helper()

	decoded := SyntheticProbeAckPayload{}
	err := json.Unmarshal(envelope.Payload, &decoded)
	if err != nil {
		t.Fatalf("decode ack payload: %v", err)
	}
	return decoded
}
@@ -0,0 +1,235 @@
package mesh
import (
"context"
"encoding/json"
"errors"
"strings"
"testing"
)
// TestSyntheticRuntimeTestServiceDirectRoute sends a test service request over
// a direct two-node route and checks the ack type, the echoed payload, the
// reported path and the sender's test service metrics.
func TestSyntheticRuntimeTestServiceDirectRoute(t *testing.T) {
	route := testServiceRoute("route-test-service-direct", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	source := testRuntime("node-a", transport, route)
	destination := testRuntime("node-b", transport, route)
	transport.nodes["node-a"], transport.nodes["node-b"] = source, destination

	request := testServiceRequest("request-direct", "hello")
	result, err := source.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}
	if result.Ack.MessageType != SyntheticMessageTestServiceAck {
		t.Fatalf("MessageType = %q, want %q", result.Ack.MessageType, SyntheticMessageTestServiceAck)
	}
	if result.Response.EchoPayload != "hello" {
		t.Fatalf("EchoPayload = %q, want hello", result.Response.EchoPayload)
	}

	path := result.Response.Path
	if !(len(path) == 2 && path[0] == "node-a" && path[1] == "node-b") {
		t.Fatalf("Path = %#v, want node-a -> node-b", result.Response.Path)
	}

	metrics := source.SnapshotMetrics()
	if !(metrics.TestServiceRequestsSent == 1 && metrics.TestServiceDeliveriesSucceeded == 1) {
		t.Fatalf("metrics = %+v, want one test service success", metrics)
	}
}
// TestSyntheticRuntimeTestServiceSingleRelayRoute sends a test service request
// through one relay (node-r) and checks the three-hop path in the response and
// that the relay counted exactly one forwarded message.
func TestSyntheticRuntimeTestServiceSingleRelayRoute(t *testing.T) {
	route := testServiceRoute("route-test-service-relay", []string{"node-a", "node-r", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	source := testRuntime("node-a", transport, route)
	relay := testRuntime("node-r", transport, route)
	destination := testRuntime("node-b", transport, route)
	transport.nodes["node-a"] = source
	transport.nodes["node-r"] = relay
	transport.nodes["node-b"] = destination

	request := testServiceRequest("request-relay", "relay")
	result, err := source.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}

	path := result.Response.Path
	if !(len(path) == 3 && path[0] == "node-a" && path[1] == "node-r" && path[2] == "node-b") {
		t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", result.Response.Path)
	}
	if relay.SnapshotMetrics().ProbesForwarded != 1 {
		t.Fatalf("ProbesForwarded = %d, want 1", relay.SnapshotMetrics().ProbesForwarded)
	}
}
// TestSyntheticRuntimeTestServiceUsesForcedFallback leaves the preferred
// route's relay (node-r) unregistered in the transport and checks that the
// request is delivered over the direct fallback route, with the fallback and
// the single preferred-route failure reflected in the sender's metrics.
func TestSyntheticRuntimeTestServiceUsesForcedFallback(t *testing.T) {
	preferred := testServiceRoute("route-test-service-preferred", []string{"node-a", "node-r", "node-b"})
	fallback := testServiceRoute("route-test-service-fallback", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	source := testRuntime("node-a", transport, preferred, fallback)
	destination := testRuntime("node-b", transport, preferred, fallback)
	transport.nodes["node-a"], transport.nodes["node-b"] = source, destination

	fallbackIDs := []string{fallback.RouteID}
	request := testServiceRequest("request-fallback", "fallback")
	result, err := source.SendTestServiceWithFallback(context.Background(), preferred.RouteID, fallbackIDs, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service with fallback: %v", err)
	}
	if !result.FallbackUsed {
		t.Fatal("FallbackUsed = false, want true")
	}
	if result.SelectedRouteID != fallback.RouteID {
		t.Fatalf("SelectedRouteID = %q, want %q", result.SelectedRouteID, fallback.RouteID)
	}
	if result.Response.EchoPayload != "fallback" {
		t.Fatalf("EchoPayload = %q, want fallback", result.Response.EchoPayload)
	}

	metrics := source.SnapshotMetrics()
	expected := metrics.TestServiceFallbacksUsed == 1 &&
		metrics.TestServiceDeliveriesFailed == 1 &&
		metrics.TestServiceDeliveriesSucceeded == 1
	if !expected {
		t.Fatalf("metrics = %+v, want fallback success with one preferred failure", metrics)
	}
}
// TestSyntheticRuntimeTestServiceRejectsWrongOrganization verifies that a
// request whose OrganizationID is "org-other" is rejected with
// ErrSyntheticOrganizationMismatch.
func TestSyntheticRuntimeTestServiceRejectsWrongOrganization(t *testing.T) {
	route := testServiceRoute("route-test-service-wrong-org", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	foreign := testServiceRequest("request-wrong-org", "hello")
	foreign.OrganizationID = "org-other"

	if _, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, foreign); !errors.Is(err, ErrSyntheticOrganizationMismatch) {
		t.Fatalf("err = %v, want ErrSyntheticOrganizationMismatch", err)
	}
}
// TestSyntheticRuntimeTestServiceRejectsUnsupportedService verifies that a
// request with ServiceType "rdp" is rejected with
// ErrUnsupportedSyntheticService.
func TestSyntheticRuntimeTestServiceRejectsUnsupportedService(t *testing.T) {
	route := testServiceRoute("route-test-service-unsupported", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	unsupported := testServiceRequest("request-unsupported", "hello")
	unsupported.ServiceType = "rdp"

	if _, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, unsupported); !errors.Is(err, ErrUnsupportedSyntheticService) {
		t.Fatalf("err = %v, want ErrUnsupportedSyntheticService", err)
	}
}
// TestSyntheticRuntimeTestServiceRejectsOversizedPayload verifies that a
// five-byte payload is rejected with ErrSyntheticPayloadTooLarge when the
// runtime is configured with MaxTestPayloadBytes=4.
func TestSyntheticRuntimeTestServiceRejectsOversizedPayload(t *testing.T) {
	route := testServiceRoute("route-test-service-oversized", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled:             true,
		Local:               PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:              []SyntheticRoute{route},
		MaxTestPayloadBytes: 4,
	}
	sender := NewSyntheticRuntime(cfg)

	oversized := testServiceRequest("request-oversized", "12345")
	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, oversized)
	if errors.Is(err, ErrSyntheticPayloadTooLarge) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticPayloadTooLarge", err)
}
// TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel verifies that a
// test service request on SyntheticChannelFabricControl is rejected with
// ErrUnauthorizedChannel, since testServiceRoute allows only
// SyntheticChannelRouteControl.
func TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel(t *testing.T) {
	route := testServiceRoute("route-test-service-channel", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)
	request := testServiceRequest("request-channel", "hello")

	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelFabricControl, request)
	if errors.Is(err, ErrUnauthorizedChannel) {
		return
	}
	t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
}
// TestSyntheticRuntimeTestServiceDisabledRejects verifies that a runtime built
// with Enabled=false rejects SendTestService with ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeTestServiceDisabledRejects(t *testing.T) {
	route := testServiceRoute("route-test-service-disabled", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	disabled := NewSyntheticRuntime(cfg)

	request := testServiceRequest("request-disabled", "hello")
	_, err := disabled.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// TestSyntheticRelaySchedulerAcceptsTestServiceMessage enqueues a test service
// envelope on the relay scheduler and checks that the dequeued envelope still
// has message type SyntheticMessageTestService.
func TestSyntheticRelaySchedulerAcceptsTestServiceMessage(t *testing.T) {
	scheduler := testRelayScheduler()

	queued := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageTestService, 42)
	queued.Payload = mustMarshalTestServiceRequest(testServiceRequest("request-relay-scheduler", "hello"))

	_, err := scheduler.Enqueue(queued)
	if err != nil {
		t.Fatalf("enqueue test service: %v", err)
	}

	dequeued, err := scheduler.Dequeue()
	if err != nil {
		t.Fatalf("dequeue test service: %v", err)
	}
	if dequeued.MessageType == SyntheticMessageTestService {
		return
	}
	t.Fatalf("MessageType = %q, want %q", dequeued.MessageType, SyntheticMessageTestService)
}
// testServiceRoute returns testRoute(routeID, hops) with AllowedChannels
// replaced so that only SyntheticChannelRouteControl is permitted.
func testServiceRoute(routeID string, hops []string) SyntheticRoute {
	serviceRoute := testRoute(routeID, hops)
	serviceRoute.AllowedChannels = []string{SyntheticChannelRouteControl}
	return serviceRoute
}
// testServiceRequest builds a test service request with the given request ID
// and payload, using SyntheticDefaultTestOrganizationID and
// SyntheticTestServiceType for the organization and service type.
func testServiceRequest(requestID string, payload string) SyntheticTestServiceRequest {
	var request SyntheticTestServiceRequest
	request.RequestID = requestID
	request.OrganizationID = SyntheticDefaultTestOrganizationID
	request.ServiceType = SyntheticTestServiceType
	request.Payload = payload
	return request
}
// mustMarshalTestServiceRequest JSON-encodes request and panics if encoding
// fails; it is a fixture helper for tests that need a raw envelope payload.
func mustMarshalTestServiceRequest(request SyntheticTestServiceRequest) []byte {
	encoded, err := json.Marshal(request)
	if err == nil {
		return encoded
	}
	panic(err)
}
// TestSyntheticRuntimeTestServiceRejectsMissingRequestID verifies that a
// request with an empty RequestID is rejected with ErrSyntheticRequestInvalid.
func TestSyntheticRuntimeTestServiceRejectsMissingRequestID(t *testing.T) {
	route := testServiceRoute("route-test-service-missing-request", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)
	anonymous := testServiceRequest("", "hello")

	if _, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, anonymous); !errors.Is(err, ErrSyntheticRequestInvalid) {
		t.Fatalf("err = %v, want ErrSyntheticRequestInvalid", err)
	}
}
// TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary verifies that a
// payload of exactly MaxTestPayloadBytes (8) bytes is accepted and echoed back
// unchanged when both nodes are configured with that limit.
func TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary(t *testing.T) {
	route := testServiceRoute("route-test-service-max", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}

	// Both nodes share the same configuration apart from their node ID.
	newNode := func(nodeID string) *SyntheticRuntime {
		return NewSyntheticRuntime(SyntheticRuntimeConfig{
			Enabled:             true,
			Local:               PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID},
			Routes:              []SyntheticRoute{route},
			Transport:           transport,
			MaxTestPayloadBytes: 8,
		})
	}
	source := newNode("node-a")
	destination := newNode("node-b")
	transport.nodes["node-a"], transport.nodes["node-b"] = source, destination

	request := testServiceRequest("request-max", strings.Repeat("a", 8))
	result, err := source.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}
	if result.Response.EchoPayload != strings.Repeat("a", 8) {
		t.Fatalf("EchoPayload = %q", result.Response.EchoPayload)
	}
}
@@ -0,0 +1,18 @@
package relay
import "errors"
// ErrProductionForwardingDisabled is returned by
// Skeleton.ForwardProductionPayload, which never forwards payloads.
var ErrProductionForwardingDisabled = errors.New("relay skeleton does not forward production payloads before an approved production mesh stage")

// Skeleton is a placeholder relay identified by a cluster and a node ID. It
// can accept control connections but refuses to forward production payloads.
type Skeleton struct {
	ClusterID string
	NodeID    string
}

// AcceptControlConnection reports whether the skeleton has both a cluster ID
// and a node ID configured.
func (s Skeleton) AcceptControlConnection() bool {
	if s.ClusterID == "" {
		return false
	}
	return s.NodeID != ""
}

// ForwardProductionPayload ignores its argument and always returns
// ErrProductionForwardingDisabled.
func (s Skeleton) ForwardProductionPayload(_ []byte) error {
	return ErrProductionForwardingDisabled
}
@@ -0,0 +1,16 @@
package relay
import (
"errors"
"testing"
)
// TestRelaySkeletonAcceptsControlButNotPayloadForwarding verifies that a fully
// identified skeleton accepts control connections while payload forwarding
// fails with ErrProductionForwardingDisabled.
func TestRelaySkeletonAcceptsControlButNotPayloadForwarding(t *testing.T) {
	skeleton := Skeleton{ClusterID: "cluster-1", NodeID: "node-relay"}

	if accepted := skeleton.AcceptControlConnection(); !accepted {
		t.Fatal("relay skeleton should accept control connection metadata")
	}

	err := skeleton.ForwardProductionPayload([]byte("rdp"))
	if errors.Is(err, ErrProductionForwardingDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrProductionForwardingDisabled", err)
}
@@ -0,0 +1,129 @@
package state
import (
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"os"
"path/filepath"
"time"
)
// FileName is the name of the identity file inside the state directory; the
// functions in this package that take a directory join it with that directory.
const FileName = "identity.json"
// Identity is the node identity record that this package persists as JSON.
type Identity struct {
// NodeID is left empty by LoadOrCreate and set by MarkApprovedWithAuthority.
NodeID string `json:"node_id"`
ClusterID string `json:"cluster_id"`
NodeName string `json:"node_name"`
// NodeFingerprint and PublicKey are random tokens generated by LoadOrCreate.
NodeFingerprint string `json:"node_fingerprint"`
PublicKey string `json:"public_key"`
// IdentityStatus is "new" after LoadOrCreate, "pending_approval" after
// MarkEnrollmentSubmitted, and the caller-supplied status after approval.
IdentityStatus string `json:"identity_status"`
// PendingJoinRequestID is set by MarkEnrollmentSubmitted and cleared on approval.
PendingJoinRequestID string `json:"pending_join_request_id,omitempty"`
// ClusterAuthorityPublicKey and ClusterAuthorityFingerprint are written by
// MarkApprovedWithAuthority.
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key,omitempty"`
ClusterAuthorityFingerprint string `json:"cluster_authority_fingerprint,omitempty"`
CreatedAt time.Time `json:"created_at"`
// UpdatedAt is overwritten with the current UTC time on every Save.
UpdatedAt time.Time `json:"updated_at"`
}
// LoadOrCreate returns the identity stored in dir, creating and persisting a
// new one when the identity file does not exist.
//
// When a file already exists it is returned as-is and clusterID and nodeName
// are ignored. Any load error other than "file does not exist" is returned
// unchanged. A newly created identity gets freshly generated fingerprint and
// public-key tokens, status "new" and identical CreatedAt/UpdatedAt values.
func LoadOrCreate(dir, clusterID, nodeName string) (Identity, error) {
	path := filepath.Join(dir, FileName)

	switch existing, err := Load(path); {
	case err == nil:
		return existing, nil
	case !errors.Is(err, os.ErrNotExist):
		return Identity{}, err
	}

	now := time.Now().UTC()
	fingerprint, err := randomToken("rap-node-fp")
	if err != nil {
		return Identity{}, err
	}
	publicKey, err := randomToken("rap-node-pub")
	if err != nil {
		return Identity{}, err
	}

	created := Identity{
		ClusterID:       clusterID,
		NodeName:        nodeName,
		NodeFingerprint: fingerprint,
		PublicKey:       publicKey,
		IdentityStatus:  "new",
		CreatedAt:       now,
		UpdatedAt:       now,
	}
	if saveErr := Save(path, created); saveErr != nil {
		return Identity{}, saveErr
	}
	return created, nil
}
// Load reads the file at path and decodes it as a JSON Identity. Read errors
// (including a missing file) and decode errors are returned unchanged together
// with a zero Identity.
func Load(path string) (Identity, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return Identity{}, readErr
	}

	decoded := Identity{}
	if decodeErr := json.Unmarshal(raw, &decoded); decodeErr != nil {
		return Identity{}, decodeErr
	}
	return decoded, nil
}
// Save writes identity to path as indented JSON, creating the parent directory
// with mode 0700 if needed.
//
// UpdatedAt is set to the current UTC time on the persisted copy only:
// identity is passed by value, so the caller's value is not modified.
//
// The file is replaced atomically. The payload is written to a temporary file
// in the same directory, synced, and then renamed over path, so a crash or a
// failed write cannot leave a truncated identity file behind. The temporary
// file is created by os.CreateTemp with mode 0600, so the result is
// owner-only even when it replaces a file that had looser permissions.
func Save(path string, identity Identity) error {
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o700); err != nil {
		return err
	}
	identity.UpdatedAt = time.Now().UTC()
	payload, err := json.MarshalIndent(identity, "", " ")
	if err != nil {
		return err
	}

	// The temp file must live in the target directory so the rename below
	// stays on one filesystem.
	tmp, err := os.CreateTemp(dir, filepath.Base(path)+".tmp-*")
	if err != nil {
		return err
	}
	tmpPath := tmp.Name()
	committed := false
	defer func() {
		if !committed {
			// Best-effort cleanup; the original error is what the caller needs.
			_ = os.Remove(tmpPath)
		}
	}()

	if _, err := tmp.Write(payload); err != nil {
		_ = tmp.Close()
		return err
	}
	if err := tmp.Sync(); err != nil {
		_ = tmp.Close()
		return err
	}
	if err := tmp.Close(); err != nil {
		return err
	}
	if err := os.Rename(tmpPath, path); err != nil {
		return err
	}
	committed = true
	return nil
}
// MarkEnrollmentSubmitted loads the identity stored in dir, records clusterID
// and joinRequestID on it, sets its status to "pending_approval" and saves it.
// The identity file must already exist; load and save errors are returned
// unchanged.
func MarkEnrollmentSubmitted(dir, clusterID, joinRequestID string) (Identity, error) {
	path := filepath.Join(dir, FileName)

	pending, err := Load(path)
	if err != nil {
		return Identity{}, err
	}

	pending.ClusterID = clusterID
	pending.PendingJoinRequestID = joinRequestID
	pending.IdentityStatus = "pending_approval"

	if saveErr := Save(path, pending); saveErr != nil {
		return Identity{}, saveErr
	}
	return pending, nil
}
// MarkApproved is MarkApprovedWithAuthority with an empty cluster authority
// public key and fingerprint.
//
// NOTE(review): MarkApprovedWithAuthority assigns the authority fields
// unconditionally, so calling MarkApproved replaces any previously stored
// authority key and fingerprint with empty strings. Confirm this is intended.
func MarkApproved(dir string, nodeID, clusterID, status string) (Identity, error) {
	const noAuthority = ""
	return MarkApprovedWithAuthority(dir, nodeID, clusterID, status, noAuthority, noAuthority)
}
// MarkApprovedWithAuthority loads the identity stored in dir, records the
// assigned nodeID, clusterID and status, clears the pending join request ID,
// stores the given cluster authority public key and fingerprint, and saves
// the result. The authority fields are overwritten even when the arguments
// are empty. Load and save errors are returned unchanged.
func MarkApprovedWithAuthority(dir string, nodeID, clusterID, status, authorityPublicKey, authorityFingerprint string) (Identity, error) {
	path := filepath.Join(dir, FileName)

	approved, err := Load(path)
	if err != nil {
		return Identity{}, err
	}

	approved.NodeID = nodeID
	approved.ClusterID = clusterID
	approved.IdentityStatus = status
	approved.PendingJoinRequestID = ""
	approved.ClusterAuthorityPublicKey = authorityPublicKey
	approved.ClusterAuthorityFingerprint = authorityFingerprint

	if saveErr := Save(path, approved); saveErr != nil {
		return Identity{}, saveErr
	}
	return approved, nil
}
// randomToken returns prefix, an underscore, and 32 bytes read from
// crypto/rand encoded as unpadded URL-safe base64.
func randomToken(prefix string) (string, error) {
	entropy := make([]byte, 32)
	if _, err := rand.Read(entropy); err != nil {
		return "", err
	}
	encoded := base64.RawURLEncoding.EncodeToString(entropy)
	return prefix + "_" + encoded, nil
}
@@ -0,0 +1,55 @@
package state
import (
"path/filepath"
"testing"
)
// TestLoadOrCreatePersistsIdentity verifies that LoadOrCreate generates a
// fingerprint and public key and that the fingerprint read back from disk
// matches the one returned.
func TestLoadOrCreatePersistsIdentity(t *testing.T) {
	dir := t.TempDir()

	created, err := LoadOrCreate(dir, "cluster-1", "node-a")
	if err != nil {
		t.Fatalf("load or create: %v", err)
	}
	if !(created.NodeFingerprint != "" && created.PublicKey != "") {
		t.Fatalf("identity missing generated fields: %+v", created)
	}

	reloaded, err := Load(filepath.Join(dir, FileName))
	if err != nil {
		t.Fatalf("load identity: %v", err)
	}
	if reloaded.NodeFingerprint == created.NodeFingerprint {
		return
	}
	t.Fatal("identity fingerprint was not persisted")
}
// TestMarkApprovedUpdatesIdentity walks an identity through creation,
// enrollment submission and approval, then checks that approval sets the node
// ID and status and clears the pending join request ID.
func TestMarkApprovedUpdatesIdentity(t *testing.T) {
	dir := t.TempDir()

	_, err := LoadOrCreate(dir, "cluster-1", "node-a")
	if err != nil {
		t.Fatalf("load or create: %v", err)
	}
	_, err = MarkEnrollmentSubmitted(dir, "cluster-1", "join-request-1")
	if err != nil {
		t.Fatalf("mark enrollment submitted: %v", err)
	}

	approved, err := MarkApproved(dir, "node-1", "cluster-1", "active")
	if err != nil {
		t.Fatalf("mark approved: %v", err)
	}
	asExpected := approved.NodeID == "node-1" &&
		approved.IdentityStatus == "active" &&
		approved.PendingJoinRequestID == ""
	if !asExpected {
		t.Fatalf("unexpected approved identity: %+v", approved)
	}
}
// TestMarkApprovedWithAuthorityPinsClusterAuthority verifies that the
// authority public key and fingerprint passed to MarkApprovedWithAuthority
// appear on the returned identity.
func TestMarkApprovedWithAuthorityPinsClusterAuthority(t *testing.T) {
	dir := t.TempDir()

	_, err := LoadOrCreate(dir, "cluster-1", "node-a")
	if err != nil {
		t.Fatalf("load or create: %v", err)
	}

	approved, err := MarkApprovedWithAuthority(dir, "node-1", "cluster-1", "active", "public-key-b64", "rap-ca-ed25519-test")
	if err != nil {
		t.Fatalf("mark approved with authority: %v", err)
	}
	pinned := approved.ClusterAuthorityPublicKey == "public-key-b64" &&
		approved.ClusterAuthorityFingerprint == "rap-ca-ed25519-test"
	if !pinned {
		t.Fatalf("authority pin was not persisted: %+v", approved)
	}
}
@@ -0,0 +1,40 @@
package supervisor
import (
"context"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
)
// Supervisor turns a list of desired workloads into reported workload statuses.
type Supervisor interface {
// Apply receives the desired workloads and returns workload status reports,
// or an error.
Apply(ctx context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error)
}
// StubSupervisor is a Supervisor implementation whose Apply method only
// produces status reports from the desired workloads it is given.
type StubSupervisor struct {
// Version is reported for workloads that do not specify their own version.
Version string
}
// Apply returns one status per desired workload, in the same order. A workload
// whose DesiredState is "disabled" is reported as "stopped"; every other
// workload is reported as "degraded". The reported version is the workload's
// own Version, or s.Version when that is empty. The context is ignored and the
// returned error is always nil.
func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) {
	statuses := make([]client.WorkloadStatusRequest, len(desired))
	for i := range desired {
		workload := desired[i]

		reported := "stopped"
		if workload.DesiredState != "disabled" {
			reported = "degraded"
		}

		version := s.Version
		if workload.Version != "" {
			version = workload.Version
		}

		payload := map[string]any{
			"supervisor":    "stub",
			"desired_state": workload.DesiredState,
			"service_type":  workload.ServiceType,
		}
		statuses[i] = client.WorkloadStatusRequest{
			ReportedState: reported,
			RuntimeMode:   workload.RuntimeMode,
			Version:       version,
			StatusPayload: payload,
		}
	}
	return statuses, nil
}
@@ -0,0 +1,35 @@
package supervisor
import (
"context"
"testing"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
)
// TestStubSupervisorReportsDegradedForEnabledWorkload verifies that the stub
// supervisor returns exactly one status for one enabled workload and reports
// it as "degraded".
func TestStubSupervisorReportsDegradedForEnabledWorkload(t *testing.T) {
	stub := StubSupervisor{Version: "test"}
	desired := []client.DesiredWorkload{
		{ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"},
	}

	statuses, err := stub.Apply(context.Background(), desired)
	if err != nil {
		t.Fatalf("apply desired workload: %v", err)
	}
	if count := len(statuses); count != 1 {
		t.Fatalf("statuses length = %d", count)
	}
	if reported := statuses[0].ReportedState; reported != "degraded" {
		t.Fatalf("ReportedState = %q", reported)
	}
}
// TestStubSupervisorReportsStoppedForDisabledWorkload verifies that the stub
// supervisor returns exactly one status for one disabled workload and reports
// it as "stopped".
func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) {
	statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
		{ServiceType: "relay-node", DesiredState: "disabled", RuntimeMode: "container"},
	})
	if err != nil {
		t.Fatalf("apply desired workload: %v", err)
	}
	// Check the length before indexing so an empty result fails the test
	// with a message instead of panicking.
	if len(statuses) != 1 {
		t.Fatalf("statuses length = %d", len(statuses))
	}
	if statuses[0].ReportedState != "stopped" {
		t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
	}
}
+19
View File
@@ -0,0 +1,19 @@
# Build stage: compiles the rap-api binary from ./cmd/api.
FROM golang:1.23-bookworm AS build
WORKDIR /src
# Copy only the module files first so the dependency download layer can be
# reused while go.mod and go.sum are unchanged.
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Build for linux with cgo disabled.
RUN CGO_ENABLED=0 GOOS=linux go build -o /out/rap-api ./cmd/api
# Runtime stage: slim Debian image with CA certificates and the built binary.
FROM debian:bookworm-slim
RUN apt-get update \
&& apt-get install -y --no-install-recommends ca-certificates \
&& rm -rf /var/lib/apt/lists/*
COPY --from=build /out/rap-api /usr/local/bin/rap-api
ENTRYPOINT ["/usr/local/bin/rap-api"]
+270
View File
@@ -0,0 +1,270 @@
# Backend Foundation
Production-oriented Go backend skeleton for the remote access platform.
## Scope included
- configuration loading from environment
- HTTP server bootstrap with graceful shutdown
- PostgreSQL and Redis connectivity wiring
- migrations scaffold
- auth foundation with access/refresh tokens, hashed refresh rotation, trusted devices, and persisted auth sessions
- persistent session storage foundation for remote sessions, attachments, resource policies, and audit events
- session broker orchestration for start, attach, detach, takeover, terminate, failure, and detached-session recovery
- Redis-backed live session state, controller binding, attach tokens, heartbeat keys, worker routing, and reconnect support
- Redis-backed worker registration, lease lifecycle, heartbeat tracking, stale lease recovery, and routing queues
- worker assignment queueing and worker event ingestion for the minimal real RDP worker runtime
- websocket live plane with attach handshake, ping/pong heartbeat, state messages, takeover detection, and transport reconnect flow
- module boundaries for auth, resources, session broker, and websocket gateway
- worker registry scaffold to prepare later RDP worker integration
- per-resource certificate verification policy for RDP connections with `strict` default and explicit `ignore` override
- platform-core v2 foundations for organizations, memberships, identity sources, nodes, and node-agent control plane
- Data Plane v1 contract scaffolding for optional session response candidates/tokens, with current backend gateway behavior preserved as fallback
- production resource secret-readiness guard for rejecting plaintext credential-like metadata and requiring `secret_ref` for RDP/VNC/SSH resources in production mode
- encrypted resource secret storage/resolver MVP for production `secret_ref` usage
## Entry point
Run the API from `cmd/api`.
## Local dev
- backend: `pwsh -File scripts/smoke/run-backend.ps1`
- infra: `pwsh -File scripts/smoke/start-infra.ps1`
- migrations: `pwsh -File scripts/smoke/apply-migrations.ps1`
- worker image build: `docker build --tag rap-rdp-worker:dev --file workers/rdp-worker/Dockerfile workers/rdp-worker`
- end-to-end smoke path: [scripts/smoke/README.md](../scripts/smoke/README.md)
## Configuration
Use `configs/api.example.env` as the starting point for local environment variables.
Resource secret-readiness is controlled by `APP_ENV`:
- in `APP_ENV=production` or `APP_ENV=prod`, RDP/VNC/SSH resources must carry
`secret_ref` and must not include plaintext credential-like fields in
`metadata`
- in development and smoke environments, plaintext metadata remains allowed
until the encrypted secret resolver is implemented
- the production guard is enforced both on resource create/update and on
session start, so legacy plaintext resources cannot be started in production
accidentally
- `SECRET_ENCRYPTION_KEY_B64` or `SECRET_ENCRYPTION_KEY_FILE` supplies the
AES-256-GCM master key for the MVP encrypted store; production mode refuses
to start without one
- `SECRET_ENCRYPTION_KEY_ID` labels the active key version in stored records
- `PUT /api/v1/resources/{resourceID}/secret` creates or rotates a resource
secret and updates `resources.secret_ref`; plaintext is never returned by the
API
- session assignment keeps PostgreSQL metadata safe: `remote_sessions.metadata`
stores `secret_ref`, while resolved credentials are merged only into the
transient worker assignment after session/worker/lease checks
See `docs/architecture/SECURITY_SECRETS_READINESS.md` for the target
secret-reference model and remaining resolver/PKI gaps.
Data Plane v1 contract scaffolding is controlled by:
- `DATA_PLANE_TOKEN_TTL`, default `1m`
- `DATA_PLANE_TOKEN_PRIVATE_KEY_FILE`, optional path to an RSA private key PEM used to sign RS256 data-plane tokens
- `DATA_PLANE_TOKEN_PRIVATE_KEY_PEM`, optional inline RSA private key PEM; used when file path is not configured
- `DATA_PLANE_BACKEND_GATEWAY_URL`, default `/api/v1/gateway/ws`
- `DATA_PLANE_DIRECT_WORKER_WSS_URL_TEMPLATE`, optional; supports `{worker_id}` replacement
- `DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME`, default `false`; advertises
`runtime_transport=json_v1` only after the worker direct JSON bridge is
deployed and verified
- `DATA_PLANE_DIRECT_WORKER_BINARY_RENDER`, default `false`; when the direct
JSON runtime is enabled, advertises `render_transport=binary_v1` so DP-2
clients can request binary render frames over direct worker WSS. Binary
render candidates also advertise `supported_color_modes=["full_color","grayscale"]`
and `default_color_mode="full_color"` for the DP-3A grayscale foundation.
- `DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE`, default `smoke_insecure`; allowed
values are `smoke_insecure`, `public_ca`, and `platform_ca`.
- `DATA_PLANE_DIRECT_WORKER_TLS_CA_REF`, optional label for the platform CA or
trust bundle version advertised to clients.
Data-plane tokens are RS256-signed. The backend must hold only the private key;
workers receive only the matching public key for validation. If no private key
is configured, the backend omits the optional `data_plane` offer and the
backend gateway fallback remains unchanged.
If no direct worker WSS URL template is configured, session responses still include the backend gateway fallback candidate only.
If the URL template is configured but `DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME`
is `false`, the direct candidate is still present for contract visibility but is
not marked data-capable; DP-1D Windows clients will skip it and use the backend
gateway fallback.
If `DATA_PLANE_DIRECT_WORKER_BINARY_RENDER` is `false`, direct worker WSS
remains JSON/base64 for render. If it is `true`, only direct worker WSS render
is binary; backend gateway fallback remains JSON/base64.
In production, the backend does not advertise direct worker WSS when
`DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure`; it keeps the backend
gateway fallback instead. Trusted direct candidates include `tls_trust_mode`,
`production_trusted`, `smoke_only`, and optional `tls_ca_ref` metadata. See
`docs/architecture/DIRECT_WORKER_TLS_PKI.md`.
## Module layout
- `internal/platform` shared runtime, config, infra, and bootstrap concerns
- `internal/modules/auth` auth and trusted-device boundary
- `internal/modules/organization` organization model, org roles, and memberships
- `internal/modules/identitysource` local/LDAP/OIDC identity source model and future mapping foundations
- `internal/modules/resource` remote resource inventory boundary
- `internal/modules/sessionbroker` persistent session lifecycle, orchestration, audit, and Redis live-state boundary
- `internal/modules/sessiongateway` websocket attach/reconnect/takeover transport boundary
- `internal/modules/worker` worker registration, lease coordination, and control-plane routing boundary for future C++ RDP workers
- `internal/modules/node` node inventory, capabilities, enabled services, update policy, and partition state
- `internal/modules/nodeagent` node-agent registration, health, service status, and update/rollback control interface
- `pkg/contracts` cross-module contracts for sessions and worker control
## Backend responsibilities
- PostgreSQL remains the source of truth for auth sessions, devices, remote sessions, attachments, resource policies, and audit events
- Redis is used only for live routing and coordination: attach tokens, controller bindings, live session cache, worker registration, worker leases, heartbeats, and routing queues
- `worker:control:<worker_id>` carries worker assignments, `worker:queue:<session_id>` carries live control/input envelopes, and `worker:events` carries worker-reported lifecycle events back into broker processing
- Session broker owns state transitions and orchestration rules; websocket handlers call broker services instead of talking to postgres repositories directly
- Worker runtime stays behind interfaces and Redis coordination so the backend remains isolated from FreeRDP implementation details while the minimal real RDP worker plugs into the control plane
- RDP certificate verification is configured per resource through `certificate_verification_mode`
- resources are now org-scoped in PostgreSQL and remote sessions persist their owning organization without changing the proven worker/session runtime contracts
- session start/attach/takeover responses may include optional `data_plane` candidates and a short-lived signed data-plane token for DP-1 direct worker WSS migration; existing clients continue to use the current gateway path, and direct realtime use remains gated by explicit candidate metadata
## Authorization model
- `platform_admin` and `platform_recovery_admin` have global access across organizations, resources, and sessions
- in `INSTALLATION_AUTHORITY_MODE=strict`, platform-admin power is effective only
when the user also has a valid signed row in `platform_role_grants`; changing
`users.platform_role` in PostgreSQL alone no longer grants owner access
- first-owner bootstrap is available at
`POST /api/v1/installation/bootstrap-owner` and requires a Product Root
Ed25519 signature over an activation manifest in strict mode
- production (`APP_ENV=production` or `prod`) requires strict installation
authority plus `INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_B64` or
`INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_FILE`
- legacy/dev installs can keep database-role behavior, and insecure first-owner
bootstrap is available only when
`INSTALLATION_INSECURE_BOOTSTRAP_ENABLED=true`
- `org_owner` and `org_admin` can create and update resources inside their organization and can manage any remote session inside that organization
- active non-admin memberships such as `org_operator`, `org_member`, and `org_viewer` are deny-by-default for admin actions; they can only access org-scoped reads and operate on their own session flows where the session broker explicitly allows it
- session start always authorizes the actor against the resource organization before worker reservation
- attach, detach, takeover, and terminate authorize against the owning remote session organization before any state transition is written
- worker-facing events do not bypass this model for user-originated commands; internal worker failure and heartbeat paths remain broker-internal control-plane operations
## Migration safety
- `000005_platform_core_v2` bootstraps a single `default` organization and backfills existing `resources.organization_id` and `remote_sessions.organization_id` into that organization before setting `NOT NULL`
- `000006_default_org_memberships_backfill` safely restores access continuity by inserting missing active memberships for existing users into the `default` organization
- the backfill is idempotent because it only inserts rows missing under the `(organization_id, user_id)` uniqueness constraint
- platform administrators are backfilled as `org_owner` in the default organization, while other existing users are backfilled as `org_member`
- if `000005` fails before the `NOT NULL` step, PostgreSQL rolls back the transaction and leaves pre-v2 rows untouched; if `000006` is rerun, it skips already-created memberships rather than duplicating them
## Platform-Core V2 Notes
- `organizations`, `organization_memberships`, and `organization_roles` establish multi-tenant ownership and basic org-scoped authorization boundaries
- `identity_sources` and `identity_mappings` are foundation-only in this phase; full LDAP/OIDC sync and claim/group ingestion are intentionally deferred
- `nodes`, `node_capabilities`, `node_services`, `node_update_policies`, `node_partition_states`, and `node_agent_update_runs` provide the first control-plane model for node and node-agent lifecycle
- current proven RDP session lifecycle remains preserved: the session broker still orchestrates the same worker/session behavior, but it now records organization ownership via org-scoped resources
- PostgreSQL remains the source of truth for organizations, memberships, org-scoped resources, identity sources, nodes, node-agent state, and session lifecycle state
## Resource Certificate Verification
- `strict` is the default and keeps normal certificate validation enabled in the worker runtime
- `ignore` must be explicitly stored on the resource and allows that one RDP connection to skip certificate validation
- the backend passes this policy through session assignment data; it is not a global backend toggle
## Messaging Model
- HTTP errors now use a structured envelope:
- `error.code`
- `error.message_key`
- `error.fallback_message`
- `error.details`
- `error.trace_id`
- `internal/platform/httpx` owns error normalization and trace-id generation so handlers can keep calling `WriteError(...)` without changing business logic.
- For `5xx` responses, user-facing payloads are normalized to an English generic fallback message while logs and diagnostics can still keep raw internal details elsewhere.
- For `4xx` responses, stable `code` and `message_key` are derived from the current fallback message, so clients can localize without depending on raw English text as the primary contract.
## WebSocket Messaging
- Session gateway envelopes keep the existing `type` and `payload` contract.
- User-facing websocket events now also include `event` with:
- `code`
- `message_key`
- `fallback_message`
- `details`
- `trace_id`
- `session.taken_over`, terminal `session.state`, `transport.closed`, and protocol-level errors now carry this structured event object.
- Existing payload semantics remain intact for compatibility with the already proven session lifecycle.
## Message Rules
- Keep English as the only development language for `fallback_message`, logs, and diagnostics.
- New HTTP handlers should prefer `httpx.WriteError(...)` for user-facing failures instead of hand-building `"error": "..."` JSON.
- New websocket user-facing notifications should populate `TransportEnvelope.Event` with a stable `code` and `message_key`.
- Do not use raw human-readable English text as the primary client contract; it should only remain as fallback text.
- This messaging layer is now runtime-proven against the live Windows smoke flow for invalid-login errors, websocket takeover delivery, websocket state fallback rendering, and worker-death failure handling.
## Clipboard Policy
RDP text clipboard is controlled per resource through `resource_policies.clipboard_mode`.
Allowed values are `disabled`, `client_to_server`, `server_to_client`, and
`bidirectional`; the default is `disabled`. The legacy `clipboard_enabled`
column is retained only for compatibility and migration/backfill, while new
runtime decisions use `clipboard_mode`.
Clipboard enforcement happens in the real data path:
- `sessionbroker.ResourcePolicy.ClipboardMode` is loaded from PostgreSQL and
embedded into the session assignment metadata sent to the worker.
- `sessiongateway.Module.handleEnvelope` blocks client-to-server clipboard
envelopes unless the session is `active` and the policy allows that direction.
- `worker.EventProcessor` sends worker-originated clipboard text through
`sessionbroker.Service.UpdateWorkerClipboardText`, which applies the same
active-state and server-to-client policy checks before updating live state.
- Clipboard messages carry `sequence_id`, `origin`, and `content_hash` so
clients and workers can avoid feedback loops across reattach/takeover paths.
- Redis stores clipboard text only as transient live state for routing to the
active controller; PostgreSQL remains authoritative for policy/session state.
## File Upload Policy
Stage 5.1 introduces client-to-server file upload as a policy-gated RDP
feature. The authoritative policy field is
`resource_policies.file_transfer_mode`; allowed values are `disabled`,
`client_to_server`, `server_to_client`, and `bidirectional`, but only
`client_to_server` behavior is implemented in this stage. The default is
`disabled`. The legacy `file_transfer_enabled` column is retained only as a
derived compatibility flag and must not be treated as the primary policy.
Enforcement is deliberately duplicated in the real data path:
- `resource.Module` exposes `file_transfer_mode` in resource create, update,
list, and read payloads.
- `sessionbroker.Service.StartRemoteSession` embeds `file_transfer_mode` into
assignment metadata and requests the worker `file-transfer` capability only
when client-to-server upload is allowed.
- `sessiongateway.Module.handleFileUploadStart` and
`handleFileUploadChunk` require an active session, current controller,
allowed policy mode, valid UUID `transfer_id`, safe file name, 25 MiB max
file size, and 256 KiB max chunk size before routing chunks to the worker.
- Redis is used only to route bounded upload envelopes to the worker. The file
itself is written by the worker to controlled worker storage; PostgreSQL
remains authoritative for policy and session state.
## File Download Policy
Stage 5.2 adds a runtime-proven server-to-client download path for RDP. The
policy field remains `resource_policies.file_transfer_mode`; `server_to_client`
and `bidirectional` allow download, while `disabled` and `client_to_server`
block it. The default remains `disabled`.
The v1 download model uses only the restricted `RAP_Transfers\ToClient`
drop-zone inside the existing per-session visible transfer directory. Backend
gateway accepts only `file_download.start`, `file_download.ack`, and
`file_download.cancel` from the current controller of an active session and
routes them to the worker after policy validation. Worker-origin
`file_download.*` events are stored only as transient live state for
backend-gateway fallback delivery; PostgreSQL remains authoritative for
session/resource/policy state and must not store file contents.
The direct worker WSS path is also lifecycle-gated: detach returns
`file_download.blocked`, old-controller takeover returns `session.taken_over`,
and worker failure closes the direct transport after PostgreSQL transitions the
session to `failed`.
+24
View File
@@ -0,0 +1,24 @@
package main
import (
"context"
"log"
"os/signal"
"syscall"
"github.com/example/remote-access-platform/backend/internal/platform/runtime"
)
// main is the API process entry point. It derives a context that is cancelled
// on SIGINT or SIGTERM, builds the application with runtime.NewApp, and runs it
// until Run returns. Any bootstrap or run error terminates the process through
// log.Fatalf.
func main() {
	rootCtx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer cancel()

	application, bootErr := runtime.NewApp(rootCtx)
	if bootErr != nil {
		log.Fatalf("bootstrap app: %v", bootErr)
	}

	runErr := application.Run(rootCtx)
	if runErr != nil {
		log.Fatalf("run app: %v", runErr)
	}
}
+94
View File
@@ -0,0 +1,94 @@
package main
import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"net/url"
	"os"
	"os/signal"
	"strings"
	"time"

	"github.com/gorilla/websocket"
)
// main runs a minimal websocket client against the session gateway.
//
// It dials -gateway-url with the attach token as the attach_token query
// parameter, prints every received message to stdout, and sends a
// {"type":"heartbeat"} JSON message every -heartbeat-interval. The process
// exits when:
//   - a session.taken_over message arrives and -exit-on-taken-over is set,
//   - the read loop fails (fatal),
//   - a heartbeat write fails (fatal),
//   - the process is interrupted, or -duration elapses (when positive).
func main() {
	var (
		gatewayURL        = flag.String("gateway-url", "ws://127.0.0.1:8080/api/v1/gateway/ws", "websocket gateway url")
		attachToken       = flag.String("attach-token", "", "short-lived attach token")
		duration          = flag.Duration("duration", 90*time.Second, "maximum time to keep the client attached")
		heartbeatInterval = flag.Duration("heartbeat-interval", 10*time.Second, "client heartbeat interval")
		exitOnTakenOver   = flag.Bool("exit-on-taken-over", true, "exit successfully after receiving session.taken_over")
	)
	flag.Parse()

	if *attachToken == "" {
		log.Fatal("attach-token is required")
	}
	// time.NewTicker panics on a non-positive interval; reject it up front with
	// a readable error instead of a stack trace after the dial.
	if *heartbeatInterval <= 0 {
		log.Fatal("heartbeat-interval must be positive")
	}

	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
	defer cancel()

	// A non-positive duration means "no deadline": only an interrupt or a
	// connection event ends the run.
	if *duration > 0 {
		var timeoutCancel context.CancelFunc
		ctx, timeoutCancel = context.WithTimeout(ctx, *duration)
		defer timeoutCancel()
	}

	targetURL, err := url.Parse(*gatewayURL)
	if err != nil {
		log.Fatalf("parse gateway url: %v", err)
	}
	query := targetURL.Query()
	query.Set("attach_token", *attachToken)
	targetURL.RawQuery = query.Encode()

	conn, _, err := websocket.DefaultDialer.DialContext(ctx, targetURL.String(), nil)
	if err != nil {
		log.Fatalf("dial websocket: %v", err)
	}
	defer conn.Close()

	// The reader goroutine sends exactly one value and then returns; the
	// buffer of one lets it finish even if main has already left the select.
	done := make(chan error, 1)
	go func() {
		for {
			_, payload, err := conn.ReadMessage()
			if err != nil {
				done <- err
				return
			}
			fmt.Println(string(payload))
			if *exitOnTakenOver && containsTakenOver(payload) {
				done <- nil
				return
			}
		}
	}()

	heartbeatTicker := time.NewTicker(*heartbeatInterval)
	defer heartbeatTicker.Stop()

	// All writes happen on this goroutine, so the connection never sees
	// concurrent writers.
	for {
		select {
		case err := <-done:
			if err != nil {
				log.Fatalf("websocket read: %v", err)
			}
			return
		case <-heartbeatTicker.C:
			if err := conn.WriteJSON(map[string]any{"type": "heartbeat"}); err != nil {
				log.Fatalf("heartbeat write: %v", err)
			}
		case <-ctx.Done():
			// Once Done is closed ctx.Err() is always context.Canceled or
			// context.DeadlineExceeded; both are normal ways to end the run.
			return
		}
	}
}
// containsTakenOver reports whether payload is a gateway envelope whose
// top-level "type" field equals "session.taken_over".
//
// The payload is decoded as JSON so the result does not depend on how the
// sender formats whitespace, and so the marker appearing inside a nested field
// of some other envelope is not mistaken for a takeover. When the payload does
// not decode into a JSON object, the function falls back to the raw substring
// match on `"type":"session.taken_over"`.
func containsTakenOver(payload []byte) bool {
	var envelope struct {
		Type string `json:"type"`
	}
	if err := json.Unmarshal(payload, &envelope); err != nil {
		return strings.Contains(string(payload), `"type":"session.taken_over"`)
	}
	return envelope.Type == "session.taken_over"
}
+44
View File
@@ -0,0 +1,44 @@
APP_NAME=rap-api
APP_ENV=development
HTTP_HOST=0.0.0.0
HTTP_PORT=8080
HTTP_READ_TIMEOUT=15s
HTTP_WRITE_TIMEOUT=15s
HTTP_IDLE_TIMEOUT=60s
HTTP_SHUTDOWN_TIMEOUT=10s
POSTGRES_DSN=postgres://rap_user:rap_password@localhost:5432/remote_access_platform?sslmode=disable
POSTGRES_MAX_CONNS=20
POSTGRES_MIN_CONNS=2
POSTGRES_CONNECT_TIMEOUT=5s
REDIS_ADDR=localhost:6379
REDIS_PASSWORD=
REDIS_DB=0
REDIS_DIAL_TIMEOUT=5s
AUTH_ACCESS_TOKEN_TTL=15m
AUTH_REFRESH_TOKEN_TTL=720h
AUTH_ISSUER=rap-api
AUTH_ACCESS_TOKEN_SECRET=change-me-access-secret
AUTH_REFRESH_HASH_SECRET=change-me-refresh-hash-secret
DATA_PLANE_TOKEN_TTL=1m
DATA_PLANE_TOKEN_PRIVATE_KEY_FILE=
DATA_PLANE_TOKEN_PRIVATE_KEY_PEM=
DATA_PLANE_BACKEND_GATEWAY_URL=/api/v1/gateway/ws
DATA_PLANE_DIRECT_WORKER_WSS_URL_TEMPLATE=
DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME=false
DATA_PLANE_DIRECT_WORKER_BINARY_RENDER=false
DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure
DATA_PLANE_DIRECT_WORKER_TLS_CA_REF=
SECRET_ENCRYPTION_KEY_B64=
SECRET_ENCRYPTION_KEY_FILE=
SECRET_ENCRYPTION_KEY_ID=local-v1
SESSION_HEARTBEAT_TTL=90s
SESSION_DETACH_GRACE_PERIOD=30m
SESSION_ATTACH_TOKEN_TTL=2m
SESSION_LIVE_STATE_TTL=2m
SESSION_RECOVERY_BATCH_SIZE=100
WORKER_LEASE_TTL=45s
WORKER_HEARTBEAT_TTL=15s
WORKER_STALE_LEASE_GRACE_PERIOD=30s
WEBSOCKET_WRITE_TIMEOUT=10s
WEBSOCKET_PING_INTERVAL=20s
WEBSOCKET_PONG_WAIT=40s
+23
View File
@@ -0,0 +1,23 @@
module github.com/example/remote-access-platform/backend
go 1.23.2
require (
github.com/go-chi/chi/v5 v5.2.1
github.com/golang-jwt/jwt/v5 v5.2.2
github.com/google/uuid v1.6.0
github.com/gorilla/websocket v1.5.3
github.com/jackc/pgx/v5 v5.7.4
github.com/redis/go-redis/v9 v9.8.0
golang.org/x/crypto v0.37.0
)
require (
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/puddle/v2 v2.2.2 // indirect
golang.org/x/sync v0.13.0 // indirect
golang.org/x/text v0.24.0 // indirect
)
+46
View File
@@ -0,0 +1,46 @@
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/go-chi/chi/v5 v5.2.1 h1:KOIHODQj58PmL80G2Eak4WdvUzjSJSm0vG72crDCqb8=
github.com/go-chi/chi/v5 v5.2.1/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops=
github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8=
github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
github.com/jackc/pgx/v5 v5.7.4 h1:9wKznZrhWa2QiHL+NjTSPP6yjl3451BX3imWDnokYlg=
github.com/jackc/pgx/v5 v5.7.4/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ=
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/redis/go-redis/v9 v9.8.0 h1:q3nRvjrlge/6UD7eTu/DSg2uYiU2mCL0G/uzBWqhicI=
github.com/redis/go-redis/v9 v9.8.0/go.mod h1:huWgSWd8mW6+m0VPhJjSSQ+d6Nh1VICQ6Q5lHuCH/Iw=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE=
golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc=
golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+19
View File
@@ -0,0 +1,19 @@
package auth
import "errors"
// Sentinel errors defined by the auth package. Each value carries only a fixed
// message, so callers can match on identity (for example with errors.Is).
// NOTE(review): the HTTP handlers pass errors to Service.MapError; presumably
// that is where these are translated to status codes — confirm.
var (
// Login, refresh, auth-session, and device errors.
ErrInvalidCredentials = errors.New("invalid credentials")
ErrInvalidRefreshToken = errors.New("invalid refresh token")
ErrAuthSessionRevoked = errors.New("auth session revoked")
ErrDeviceRevoked = errors.New("device revoked")
ErrDeviceNotTrusted = errors.New("device not trusted")
ErrAuthSessionNotFound = errors.New("auth session not found")
ErrTrustedDeviceMissing = errors.New("trusted device not found")
// Installation bootstrap errors.
ErrInstallationAlreadyBootstrapped = errors.New("installation is already bootstrapped")
ErrInstallationActivationRequired = errors.New("signed installation activation is required")
ErrInvalidInstallationActivation = errors.New("invalid installation activation")
ErrInsecureBootstrapDisabled = errors.New("insecure installation bootstrap is disabled")
ErrInvalidBootstrapOwner = errors.New("invalid bootstrap owner")
)
+114
View File
@@ -0,0 +1,114 @@
package auth
import (
"encoding/json"
"time"
)
// DeviceTrustStatus is the string-typed trust state stored on a Device.
type DeviceTrustStatus string
// Known DeviceTrustStatus values.
const (
DeviceTrustStatusPending DeviceTrustStatus = "pending"
DeviceTrustStatusTrusted DeviceTrustStatus = "trusted"
DeviceTrustStatusRevoked DeviceTrustStatus = "revoked"
)
// User is the auth module's view of a user account.
//
// The struct is embedded in AuthResult and BootstrapOwnerResult, both of which
// carry JSON tags. Fields without tags are encoded under their Go names (for
// example "ID" and "Email"); those names are left unchanged here.
type User struct {
	ID    string
	Email string
	// PasswordHash must never leave the process. It is excluded from JSON
	// encoding so that encoding a User, or any struct embedding one, cannot
	// expose the stored hash.
	PasswordHash string `json:"-"`
	MFAEnabled   bool
	CreatedAt    time.Time
	UpdatedAt    time.Time
}
// Device describes a client device associated with a user, identified by a
// fingerprint and carrying a DeviceTrustStatus.
//
// The pointer-typed fields (TrustedAt, LastSeenAt, RevokedAt, RevokedReason)
// are optional and may be nil. The struct has no JSON tags, so encoding/json
// emits the Go field names.
type Device struct {
ID string
UserID string
Fingerprint string
Label string
TrustStatus DeviceTrustStatus
TrustedAt *time.Time
LastSeenAt *time.Time
RevokedAt *time.Time
RevokedReason *string
CreatedAt time.Time
UpdatedAt time.Time
}
// AuthSession is a persisted authentication session binding a user to a device
// together with refresh-token bookkeeping.
//
// The struct is embedded in AuthResult, which carries JSON tags. Fields without
// tags are encoded under their Go names; those names are left unchanged here.
// The pointer-typed fields are optional and may be nil.
type AuthSession struct {
	ID       string
	UserID   string
	DeviceID string
	// RefreshTokenHash is server-side verification material. It is excluded
	// from JSON encoding so that encoding an AuthSession, or any struct
	// embedding one, cannot expose it.
	RefreshTokenHash string `json:"-"`
	RefreshExpiresAt time.Time
	LastSeenAt       *time.Time
	LastRotatedAt    *time.Time
	RevokedAt        *time.Time
	RevokedReason    *string
	CreatedAt        time.Time
	UpdatedAt        time.Time
}
// LoginCommand is the JSON request body decoded by the login handler and
// passed to Service.Login.
type LoginCommand struct {
Email string `json:"email"`
Password string `json:"password"`
DeviceFingerprint string `json:"device_fingerprint"`
DeviceLabel string `json:"device_label"`
// NOTE(review): presumably asks for the device to be marked trusted on
// login — confirm against Service.Login.
TrustDevice bool `json:"trust_device"`
}
// RefreshCommand is the JSON request body decoded by the refresh handler and
// passed to Service.Refresh.
type RefreshCommand struct {
RefreshToken string `json:"refresh_token"`
}
// BootstrapOwnerCommand is the JSON request body decoded by the
// bootstrap-owner handler and passed to Service.BootstrapOwner.
type BootstrapOwnerCommand struct {
Email string `json:"email"`
Password string `json:"password"`
// ActivationPayload is kept as raw, undecoded JSON bytes.
// NOTE(review): presumably kept raw so the signature can be verified over
// the exact bytes received — confirm in Service.BootstrapOwner.
ActivationPayload json.RawMessage `json:"activation_payload"`
ActivationSignature string `json:"activation_signature"`
}
// RevokeAuthSessionCommand is the JSON request body decoded by the auth-session
// revoke handler and passed to Service.RevokeAuthSession.
//
// NOTE(review): UserID is taken from the request body as sent by the caller;
// confirm that the caller's identity is authenticated and authorized elsewhere.
type RevokeAuthSessionCommand struct {
UserID string `json:"user_id"`
AuthSessionID string `json:"auth_session_id"`
Reason string `json:"reason"`
}
// RevokeDeviceCommand carries the user, device, and reason for a device
// revocation request.
type RevokeDeviceCommand struct {
UserID string `json:"user_id"`
DeviceID string `json:"device_id"`
Reason string `json:"reason"`
}
// TokenPair holds an access token and a refresh token together with the
// expiry time of each.
type TokenPair struct {
AccessToken string `json:"access_token"`
AccessTokenExpiresAt time.Time `json:"access_token_expires_at"`
RefreshToken string `json:"refresh_token"`
RefreshTokenExpiresAt time.Time `json:"refresh_token_expires_at"`
}
// AuthResult bundles the user, device, auth session, and issued tokens under
// the JSON keys "user", "device", "auth_session", and "tokens".
// NOTE(review): presumably the value returned by Service.Login and
// Service.Refresh and written to the client as-is — confirm. The nested User,
// Device, and AuthSession values are encoded with their own field rules.
type AuthResult struct {
User User `json:"user"`
Device Device `json:"device"`
AuthSession AuthSession `json:"auth_session"`
Tokens TokenPair `json:"tokens"`
}
// InstallationStatus describes the bootstrap and authority state of the
// installation. Fields tagged omitempty are left out of the JSON output when
// they hold their zero value (empty string or nil pointer).
type InstallationStatus struct {
Bootstrapped bool `json:"bootstrapped"`
AuthorityState string `json:"authority_state"`
InstallID string `json:"install_id,omitempty"`
BootstrappedOwnerEmail string `json:"bootstrapped_owner_email,omitempty"`
BootstrappedAt *time.Time `json:"bootstrapped_at,omitempty"`
AuthorityMode string `json:"authority_mode"`
StrictAuthority bool `json:"strict_authority"`
RootFingerprint string `json:"root_fingerprint,omitempty"`
InsecureBootstrapAllowed bool `json:"insecure_bootstrap_allowed"`
}
// BootstrapOwnerResult bundles the installation status, the user, and a
// platform role string under the JSON keys "installation", "user", and
// "platform_role".
// NOTE(review): presumably the value returned by Service.BootstrapOwner and
// written to the client with 201 Created — confirm.
type BootstrapOwnerResult struct {
Installation InstallationStatus `json:"installation"`
User User `json:"user"`
PlatformRole string `json:"platform_role"`
}
+173
View File
@@ -0,0 +1,173 @@
package auth
import (
"encoding/json"
"net/http"
"github.com/go-chi/chi/v5"
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
"github.com/example/remote-access-platform/backend/internal/platform/module"
)
// Module exposes the auth Service over HTTP: it registers the routes and
// delegates each request to the service.
type Module struct {
service *Service
}
// NewModule builds the auth HTTP module around the given service. The deps
// argument is accepted but not read by this constructor.
func NewModule(deps module.Dependencies, service *Service) *Module {
	authModule := new(Module)
	authModule.service = service
	return authModule
}
// Name returns the module's identifier, "auth".
func (m *Module) Name() string {
	const moduleName = "auth"
	return moduleName
}
// RegisterRoutes mounts the module's endpoints on router, relative to whatever
// prefix router already has:
//
//	GET  /installation/status
//	POST /installation/bootstrap-owner
//	POST /auth/login
//	POST /auth/refresh
//	POST /auth/sessions/revoke
//	GET  /auth/devices
//	POST /auth/devices/{deviceID}/revoke
func (m *Module) RegisterRoutes(router chi.Router) {
	installationRoutes := func(sub chi.Router) {
		sub.Get("/status", m.handleInstallationStatus)
		sub.Post("/bootstrap-owner", m.handleBootstrapOwner)
	}
	authRoutes := func(sub chi.Router) {
		sub.Post("/login", m.handleLogin)
		sub.Post("/refresh", m.handleRefresh)
		sub.Post("/sessions/revoke", m.handleRevokeAuthSession)
		sub.Get("/devices", m.handleTrustedDevices)
		sub.Post("/devices/{deviceID}/revoke", m.handleRevokeTrustedDevice)
	}

	router.Route("/installation", installationRoutes)
	router.Route("/auth", authRoutes)
}
// handleInstallationStatus asks the service for the installation status and
// writes it as {"installation": ...} with 200 OK. On failure the error is
// mapped through Service.MapError and written as an error response.
func (m *Module) handleInstallationStatus(w http.ResponseWriter, r *http.Request) {
	installation, err := m.service.InstallationStatus(r.Context())
	if err == nil {
		httpx.WriteJSON(w, http.StatusOK, map[string]any{"installation": installation})
		return
	}

	code, msg := m.service.MapError(err)
	httpx.WriteError(w, code, msg)
}
// handleBootstrapOwner decodes a BootstrapOwnerCommand from the request body
// and forwards it to Service.BootstrapOwner. A body that fails to decode yields
// 400; a service error is mapped through Service.MapError; success is written
// with 201 Created.
func (m *Module) handleBootstrapOwner(w http.ResponseWriter, r *http.Request) {
	var req BootstrapOwnerCommand
	decodeErr := json.NewDecoder(r.Body).Decode(&req)
	if decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid installation bootstrap payload")
		return
	}

	outcome, err := m.service.BootstrapOwner(r.Context(), req)
	if err == nil {
		httpx.WriteJSON(w, http.StatusCreated, outcome)
		return
	}

	code, msg := m.service.MapError(err)
	httpx.WriteError(w, code, msg)
}
// handleLogin decodes a LoginCommand from the request body and performs the
// login. It answers 400 on an undecodable body, the mapped service error on
// failure, and 200 with the auth result on success.
func (m *Module) handleLogin(w http.ResponseWriter, r *http.Request) {
	var cmd LoginCommand
	decodeErr := json.NewDecoder(r.Body).Decode(&cmd)
	if decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid login payload")
		return
	}

	outcome, err := m.service.Login(r.Context(), cmd)
	if err != nil {
		code, msg := m.service.MapError(err)
		httpx.WriteError(w, code, msg)
		return
	}

	httpx.WriteJSON(w, http.StatusOK, outcome)
}
// handleRefresh decodes a RefreshCommand from the request body and rotates
// the refresh token. It answers 400 on an undecodable body, the mapped
// service error on failure, and 200 with the new auth result on success.
func (m *Module) handleRefresh(w http.ResponseWriter, r *http.Request) {
	var cmd RefreshCommand
	decodeErr := json.NewDecoder(r.Body).Decode(&cmd)
	if decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid refresh payload")
		return
	}

	outcome, err := m.service.Refresh(r.Context(), cmd)
	if err != nil {
		code, msg := m.service.MapError(err)
		httpx.WriteError(w, code, msg)
		return
	}

	httpx.WriteJSON(w, http.StatusOK, outcome)
}
// handleRevokeAuthSession decodes a RevokeAuthSessionCommand from the request
// body and revokes that auth session. It answers 400 on an undecodable body,
// the mapped service error on failure, and 202 with a "revoked" status
// message on success.
//
// NOTE(review): the command, including its user ID, comes straight from the
// request body; nothing in this handler ties it to an authenticated caller —
// confirm that upstream middleware enforces authorization.
func (m *Module) handleRevokeAuthSession(w http.ResponseWriter, r *http.Request) {
	var cmd RevokeAuthSessionCommand
	decodeErr := json.NewDecoder(r.Body).Decode(&cmd)
	if decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid auth session revoke payload")
		return
	}

	err := m.service.RevokeAuthSession(r.Context(), cmd)
	if err != nil {
		code, msg := m.service.MapError(err)
		httpx.WriteError(w, code, msg)
		return
	}

	confirmation := httpx.NewMessage(
		"auth.session.revoked",
		"status.auth.session.revoked",
		"Auth session revoked.",
		nil,
		"",
	)
	body := map[string]any{
		"status":  "revoked",
		"message": confirmation,
	}
	httpx.WriteJSON(w, http.StatusAccepted, body)
}
// handleTrustedDevices lists the trusted devices of the user named by the
// user_id query parameter. It answers 400 when the parameter is missing, the
// mapped service error on failure, and 200 with a "devices" list on success.
//
// NOTE(review): user_id is taken from the query string; nothing in this
// handler ties it to an authenticated caller — confirm that upstream
// middleware enforces authorization.
func (m *Module) handleTrustedDevices(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query()
	owner := query.Get("user_id")
	if len(owner) == 0 {
		httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
		return
	}

	trusted, err := m.service.ListTrustedDevices(r.Context(), owner)
	if err != nil {
		code, msg := m.service.MapError(err)
		httpx.WriteError(w, code, msg)
		return
	}

	body := map[string]any{"devices": trusted}
	httpx.WriteJSON(w, http.StatusOK, body)
}
// handleRevokeTrustedDevice revokes the device named by the {deviceID} URL
// parameter for the user given in the JSON body. It answers 400 on an
// undecodable body, the mapped service error on failure, and 202 with a
// "revoked" status message on success.
//
// NOTE(review): user_id comes from the request body; nothing in this handler
// ties it to an authenticated caller — confirm that upstream middleware
// enforces authorization.
func (m *Module) handleRevokeTrustedDevice(w http.ResponseWriter, r *http.Request) {
	var body struct {
		UserID string `json:"user_id"`
		Reason string `json:"reason"`
	}
	decodeErr := json.NewDecoder(r.Body).Decode(&body)
	if decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid device revoke payload")
		return
	}

	cmd := RevokeDeviceCommand{
		UserID:   body.UserID,
		DeviceID: chi.URLParam(r, "deviceID"),
		Reason:   body.Reason,
	}
	if err := m.service.RevokeTrustedDevice(r.Context(), cmd); err != nil {
		code, msg := m.service.MapError(err)
		httpx.WriteError(w, code, msg)
		return
	}

	confirmation := httpx.NewMessage(
		"auth.device.revoked",
		"status.auth.device.revoked",
		"Trusted device revoked.",
		nil,
		"",
	)
	httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
		"status":  "revoked",
		"message": confirmation,
	})
}
@@ -0,0 +1,525 @@
package auth
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres"
)
// postgresStore is the Postgres-backed Store. Its db handle is either the
// connection pool (NewPostgresStore) or an open transaction
// (PostgresTransactor.WithinTransaction).
type postgresStore struct {
db postgresplatform.DBTX
}
// PostgresTransactor runs callbacks against a Store bound to a single
// database transaction taken from pool.
type PostgresTransactor struct {
pool *pgxpool.Pool
}
// NewPostgresStore returns a Store whose repositories run their queries
// directly on the connection pool, outside any explicit transaction.
func NewPostgresStore(pool *pgxpool.Pool) Store {
	pooled := new(postgresStore)
	pooled.db = pool
	return pooled
}
// NewPostgresTransactor returns a transactor that opens its transactions on
// the given pool.
func NewPostgresTransactor(pool *pgxpool.Pool) *PostgresTransactor {
	transactor := new(PostgresTransactor)
	transactor.pool = pool
	return transactor
}
// WithinTransaction invokes fn with a Store bound to a transaction opened via
// postgresplatform.WithTransaction and returns whatever that helper returns.
func (t *PostgresTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error {
	runInTx := func(tx pgx.Tx) error {
		txStore := &postgresStore{db: tx}
		return fn(txStore)
	}
	return postgresplatform.WithTransaction(ctx, t.pool, runInTx)
}
// Users returns a user repository sharing this store's database handle.
func (s *postgresStore) Users() UserRepository {
	repo := new(postgresUserRepository)
	repo.db = s.db
	return repo
}
// Devices returns a device repository sharing this store's database handle.
func (s *postgresStore) Devices() DeviceRepository {
	repo := new(postgresDeviceRepository)
	repo.db = s.db
	return repo
}
// AuthSessions returns an auth-session repository sharing this store's
// database handle.
func (s *postgresStore) AuthSessions() AuthSessionRepository {
	repo := new(postgresAuthSessionRepository)
	repo.db = s.db
	return repo
}
// Installation returns an installation repository sharing this store's
// database handle.
func (s *postgresStore) Installation() InstallationRepository {
	repo := new(postgresInstallationRepository)
	repo.db = s.db
	return repo
}
// postgresUserRepository implements UserRepository on the users table.
type postgresUserRepository struct {
db postgresplatform.DBTX
}
// postgresDeviceRepository implements DeviceRepository on the devices table.
type postgresDeviceRepository struct {
db postgresplatform.DBTX
}
// postgresAuthSessionRepository implements AuthSessionRepository on the
// auth_sessions table.
type postgresAuthSessionRepository struct {
db postgresplatform.DBTX
}
// postgresInstallationRepository implements InstallationRepository on the
// installation_authority table and the tables written during bootstrap.
type postgresInstallationRepository struct {
db postgresplatform.DBTX
}
// GetByEmail loads the user whose email column equals email exactly as given.
// It returns (nil, nil) when no row matches.
func (r *postgresUserRepository) GetByEmail(ctx context.Context, email string) (*User, error) {
	const selectByEmail = `
SELECT id::text, email, password_hash, mfa_enabled, created_at, updated_at
FROM users
WHERE email = $1
`
	row := r.db.QueryRow(ctx, selectByEmail, email)
	return scanOptionalUser(row)
}
// GetByID loads the user with the given ID, which the query casts to uuid.
// It returns (nil, nil) when no row matches.
func (r *postgresUserRepository) GetByID(ctx context.Context, userID string) (*User, error) {
	const selectByID = `
SELECT id::text, email, password_hash, mfa_enabled, created_at, updated_at
FROM users
WHERE id = $1::uuid
`
	row := r.db.QueryRow(ctx, selectByID, userID)
	return scanOptionalUser(row)
}
// Upsert inserts the device identified by (user_id, device_fingerprint) or
// refreshes its label and last-seen time when it already exists, and returns
// the resulting row.
//
// A new device starts as 'trusted' when params.TrustRequested is set and as
// 'pending' otherwise. On conflict the existing trust status is kept when it
// is 'revoked' or 'trusted'; otherwise it is promoted to 'trusted' only when
// trust was requested.
func (r *postgresDeviceRepository) Upsert(ctx context.Context, params UpsertDeviceParams) (*Device, error) {
	const upsertDevice = `
INSERT INTO devices (
user_id,
device_fingerprint,
device_label,
trust_status,
trusted_at,
last_seen_at,
created_at,
updated_at
) VALUES (
$1::uuid,
$2,
$3,
CASE WHEN $4 THEN 'trusted' ELSE 'pending' END,
CASE WHEN $4 THEN $5::timestamptz ELSE NULL::timestamptz END,
$5::timestamptz,
$5::timestamptz,
$5::timestamptz
)
ON CONFLICT (user_id, device_fingerprint) DO UPDATE SET
device_label = EXCLUDED.device_label,
last_seen_at = EXCLUDED.last_seen_at,
updated_at = EXCLUDED.updated_at,
trust_status = CASE
WHEN devices.trust_status = 'revoked' THEN devices.trust_status
WHEN devices.trust_status = 'trusted' THEN devices.trust_status
WHEN EXCLUDED.trust_status = 'trusted' THEN 'trusted'
ELSE devices.trust_status
END,
trusted_at = CASE
WHEN devices.trust_status = 'trusted' THEN devices.trusted_at
WHEN EXCLUDED.trust_status = 'trusted' THEN EXCLUDED.trusted_at
ELSE devices.trusted_at
END
RETURNING
id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''),
trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at
`
	row := r.db.QueryRow(
		ctx,
		upsertDevice,
		params.UserID,
		params.Fingerprint,
		params.Label,
		params.TrustRequested,
		params.SeenAt,
	)
	return scanDevice(row)
}
// GetByIDForUser loads the device with the given ID only when it belongs to
// userID. It returns (nil, nil) when no such row exists.
func (r *postgresDeviceRepository) GetByIDForUser(ctx context.Context, userID, deviceID string) (*Device, error) {
	const selectDevice = `
SELECT id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''),
trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at
FROM devices
WHERE id = $1::uuid AND user_id = $2::uuid
`
	row := r.db.QueryRow(ctx, selectDevice, deviceID, userID)
	found, err := scanDevice(row)
	// scanDevice wraps with %w, so the sentinel is still reachable here.
	if err != nil && errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	return found, err
}
// ListTrustedByUser returns the user's devices whose trust_status is
// 'trusted' and that have no revoked_at, newest first.
//
// The result is a nil slice when the user has no such devices.
// NOTE(review): that nil slice is encoded as JSON null by
// handleTrustedDevices — confirm clients accept null for an empty list.
func (r *postgresDeviceRepository) ListTrustedByUser(ctx context.Context, userID string) ([]Device, error) {
	const selectTrusted = `
SELECT id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''),
trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at
FROM devices
WHERE user_id = $1::uuid AND trust_status = 'trusted' AND revoked_at IS NULL
ORDER BY created_at DESC
`
	rows, queryErr := r.db.Query(ctx, selectTrusted, userID)
	if queryErr != nil {
		return nil, fmt.Errorf("query trusted devices: %w", queryErr)
	}
	defer rows.Close()

	var trusted []Device
	for rows.Next() {
		next, scanErr := scanDevice(rows)
		if scanErr != nil {
			return nil, scanErr
		}
		trusted = append(trusted, *next)
	}
	// rows.Err reports any failure that ended the iteration early.
	return trusted, rows.Err()
}
// Revoke marks the user's device as 'revoked' and records the revocation time
// and reason. It does not report whether any row was actually updated.
func (r *postgresDeviceRepository) Revoke(ctx context.Context, params RevokeDeviceParams) error {
	const revokeDevice = `
UPDATE devices
SET trust_status = 'revoked',
revoked_at = $3,
revoked_reason = $4,
updated_at = $3
WHERE id = $1::uuid AND user_id = $2::uuid
`
	_, err := r.db.Exec(ctx, revokeDevice, params.DeviceID, params.UserID, params.RevokedAt, params.Reason)
	if err == nil {
		return nil
	}
	return fmt.Errorf("revoke device: %w", err)
}
// Create inserts a new auth session row from the given session value.
func (r *postgresAuthSessionRepository) Create(ctx context.Context, session AuthSession) error {
	const insertSession = `
INSERT INTO auth_sessions (
id,
user_id,
device_id,
refresh_token_hash,
refresh_expires_at,
last_seen_at,
created_at,
updated_at
) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8)
`
	args := []any{
		session.ID,
		session.UserID,
		session.DeviceID,
		session.RefreshTokenHash,
		session.RefreshExpiresAt,
		session.LastSeenAt,
		session.CreatedAt,
		session.UpdatedAt,
	}
	_, err := r.db.Exec(ctx, insertSession, args...)
	if err == nil {
		return nil
	}
	return fmt.Errorf("create auth session: %w", err)
}
// GetByID loads the auth session with the given ID without taking a row lock.
// It returns (nil, nil) when the session does not exist.
func (r *postgresAuthSessionRepository) GetByID(ctx context.Context, authSessionID string) (*AuthSession, error) {
	const noLock = ""
	return r.getByID(ctx, authSessionID, noLock)
}
// GetByIDForUpdate loads the auth session with the given ID using
// SELECT ... FOR UPDATE. It returns (nil, nil) when the session does not
// exist.
func (r *postgresAuthSessionRepository) GetByIDForUpdate(ctx context.Context, authSessionID string) (*AuthSession, error) {
	const rowLock = " FOR UPDATE"
	return r.getByID(ctx, authSessionID, rowLock)
}
// getByID loads one auth session by ID. suffix is appended verbatim to the
// SQL text (the callers in this file pass "" or " FOR UPDATE"), so it must
// never carry untrusted input. A missing row yields (nil, nil).
func (r *postgresAuthSessionRepository) getByID(ctx context.Context, authSessionID string, suffix string) (*AuthSession, error) {
	const selectSession = `
SELECT id::text, user_id::text, device_id::text, refresh_token_hash, refresh_expires_at,
last_seen_at, last_rotated_at, revoked_at, revoked_reason, created_at, updated_at
FROM auth_sessions
WHERE id = $1::uuid`
	row := r.db.QueryRow(ctx, selectSession+suffix, authSessionID)
	found, err := scanAuthSession(row)
	// scanAuthSession wraps with %w, so the sentinel is still reachable here.
	if err != nil && errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	return found, err
}
// Rotate stores a new refresh token hash and expiry on a non-revoked session
// and stamps last_seen_at, last_rotated_at and updated_at. It does not report
// whether any row was actually updated.
func (r *postgresAuthSessionRepository) Rotate(ctx context.Context, params RotateAuthSessionParams) error {
	const rotateSession = `
UPDATE auth_sessions
SET refresh_token_hash = $2,
refresh_expires_at = $3,
last_seen_at = $4,
last_rotated_at = $5,
updated_at = $5
WHERE id = $1::uuid AND revoked_at IS NULL
`
	_, err := r.db.Exec(
		ctx,
		rotateSession,
		params.AuthSessionID,
		params.RefreshTokenHash,
		params.RefreshExpiresAt,
		params.LastSeenAt,
		params.LastRotatedAt,
	)
	if err == nil {
		return nil
	}
	return fmt.Errorf("rotate auth session: %w", err)
}
// Touch sets last_seen_at and updated_at of a non-revoked session to seenAt.
func (r *postgresAuthSessionRepository) Touch(ctx context.Context, authSessionID string, seenAt time.Time) error {
	const touchSession = `
UPDATE auth_sessions
SET last_seen_at = $2, updated_at = $2
WHERE id = $1::uuid AND revoked_at IS NULL
`
	_, err := r.db.Exec(ctx, touchSession, authSessionID, seenAt)
	if err == nil {
		return nil
	}
	return fmt.Errorf("touch auth session: %w", err)
}
// Revoke stamps revoked_at and revoked_reason on the user's session if it is
// not already revoked. It does not report whether any row was actually
// updated.
func (r *postgresAuthSessionRepository) Revoke(ctx context.Context, params RevokeAuthSessionParams) error {
	const revokeSession = `
UPDATE auth_sessions
SET revoked_at = $3,
revoked_reason = $4,
updated_at = $3
WHERE id = $1::uuid AND user_id = $2::uuid AND revoked_at IS NULL
`
	_, err := r.db.Exec(ctx, revokeSession, params.AuthSessionID, params.UserID, params.RevokedAt, params.Reason)
	if err == nil {
		return nil
	}
	return fmt.Errorf("revoke auth session: %w", err)
}
// RevokeByDevice revokes every not-yet-revoked session that the user holds on
// the given device, recording revokedAt and reason on each.
func (r *postgresAuthSessionRepository) RevokeByDevice(ctx context.Context, userID, deviceID, reason string, revokedAt time.Time) error {
	const revokeDeviceSessions = `
UPDATE auth_sessions
SET revoked_at = $3,
revoked_reason = $4,
updated_at = $3
WHERE user_id = $1::uuid AND device_id = $2::uuid AND revoked_at IS NULL
`
	_, err := r.db.Exec(ctx, revokeDeviceSessions, userID, deviceID, revokedAt, reason)
	if err == nil {
		return nil
	}
	return fmt.Errorf("revoke auth sessions by device: %w", err)
}
// GetStatus reads the installation_authority row with id = 1. When the row
// exists the returned state has Bootstrapped set; when it does not, an
// "unbootstrapped" placeholder state is returned instead of an error.
func (r *postgresInstallationRepository) GetStatus(ctx context.Context) (*InstallationAuthorityState, error) {
	const selectAuthority = `
SELECT install_id, authority_state, product_root_key_fingerprint, bootstrapped_owner_email, bootstrapped_at
FROM installation_authority
WHERE id = 1
`
	var state InstallationAuthorityState
	err := r.db.QueryRow(ctx, selectAuthority).Scan(
		&state.InstallID,
		&state.AuthorityState,
		&state.ProductRootFingerprint,
		&state.BootstrappedOwnerEmail,
		&state.BootstrappedAt,
	)
	switch {
	case err == nil:
		state.Bootstrapped = true
		return &state, nil
	case errors.Is(err, pgx.ErrNoRows):
		return &InstallationAuthorityState{
			Bootstrapped:   false,
			AuthorityState: "unbootstrapped",
		}, nil
	default:
		return nil, fmt.Errorf("get installation status: %w", err)
	}
}
// BootstrapOwner performs the one-time installation bootstrap and returns the
// owner user. It is meant to run inside a transaction (the service calls it
// through Transactor.WithinTransaction) and executes these steps in order:
//
//  1. fail with ErrInstallationAlreadyBootstrapped if the authority row exists;
//  2. upsert the owner user by lower-cased, trimmed email, overwriting the
//     password hash and platform role of an existing user with that email;
//  3. insert the singleton installation_authority row (id = 1);
//  4. revoke any active grant of the same role for this user and install ID;
//  5. insert the bootstrap platform role grant;
//  6. upsert an active 'org_owner' membership in the organization whose slug
//     is 'default' (a no-op when no such organization exists).
//
// Any database failure is returned wrapped with the step that failed.
func (r *postgresInstallationRepository) BootstrapOwner(ctx context.Context, params BootstrapOwnerParams) (*User, error) {
	// Fast path: refuse early when the installation is already bootstrapped.
	// FOR UPDATE only locks a row that exists, so this check alone cannot
	// serialize two concurrent first-time bootstraps; the insert below does.
	var existingInstallID string
	lockErr := r.db.QueryRow(ctx, `
SELECT install_id
FROM installation_authority
WHERE id = 1
FOR UPDATE
`).Scan(&existingInstallID)
	switch {
	case lockErr == nil:
		return nil, ErrInstallationAlreadyBootstrapped
	case !errors.Is(lockErr, pgx.ErrNoRows):
		return nil, fmt.Errorf("lock installation authority: %w", lockErr)
	}

	email := strings.ToLower(strings.TrimSpace(params.Email))
	now := params.Now.UTC()

	user, err := scanOptionalUser(r.db.QueryRow(ctx, `
INSERT INTO users (email, password_hash, mfa_enabled, platform_role, created_at, updated_at)
VALUES ($1, $2, FALSE, $3, $4, $4)
ON CONFLICT (email) DO UPDATE SET
password_hash = EXCLUDED.password_hash,
platform_role = EXCLUDED.platform_role,
updated_at = EXCLUDED.updated_at
RETURNING id::text, email, password_hash, mfa_enabled, created_at, updated_at
`, email, params.PasswordHash, params.Role, now))
	if err != nil {
		return nil, fmt.Errorf("upsert bootstrap owner: %w", err)
	}
	if user == nil {
		return nil, fmt.Errorf("upsert bootstrap owner returned no user")
	}

	// An absent activation payload is stored as an empty JSON object.
	payload := json.RawMessage(`{}`)
	if len(params.ActivationPayload) > 0 {
		payload = params.ActivationPayload
	}

	// ON CONFLICT DO NOTHING makes the loser of a concurrent bootstrap race
	// insert zero rows instead of failing with a unique violation, so it can
	// be reported as "already bootstrapped" rather than as an internal error.
	inserted, err := r.db.Exec(ctx, `
INSERT INTO installation_authority (
id,
install_id,
authority_state,
product_root_key_fingerprint,
activation_payload,
activation_signature,
bootstrapped_owner_email,
bootstrapped_at,
created_at,
updated_at
) VALUES (
1,
$1,
'active',
$2,
$3::jsonb,
$4,
$5,
$6,
$6,
$6
)
ON CONFLICT DO NOTHING
`, params.InstallID, params.ProductRootKeyFingerprint, []byte(payload), params.ActivationSignature, email, now)
	if err != nil {
		return nil, fmt.Errorf("insert installation authority: %w", err)
	}
	if inserted.RowsAffected() == 0 {
		// Returning an error lets the surrounding transaction discard the
		// owner upsert performed above.
		return nil, ErrInstallationAlreadyBootstrapped
	}

	if _, err := r.db.Exec(ctx, `
UPDATE platform_role_grants
SET revoked_at = $4
WHERE user_id = $1::uuid
AND role = $2
AND install_id = $3
AND revoked_at IS NULL
`, user.ID, params.Role, params.InstallID, now); err != nil {
		return nil, fmt.Errorf("revoke superseded platform role grants: %w", err)
	}

	if _, err := r.db.Exec(ctx, `
INSERT INTO platform_role_grants (
user_id,
role,
install_id,
grant_payload,
grant_signature,
grant_source,
granted_at,
expires_at,
metadata
) VALUES (
$1::uuid,
$2,
$3,
$4::jsonb,
$5,
$6,
$7,
$8,
'{"bootstrap_owner":true}'::jsonb
)
`, user.ID, params.Role, params.InstallID, []byte(payload), params.ActivationSignature, params.GrantSource, now, params.ExpiresAt); err != nil {
		return nil, fmt.Errorf("insert platform role grant: %w", err)
	}

	if _, err := r.db.Exec(ctx, `
INSERT INTO organization_memberships (
organization_id,
user_id,
role_id,
status,
invited_by_user_id,
created_at,
updated_at
)
SELECT id, $1::uuid, 'org_owner', 'active', $1::uuid, $2, $2
FROM organizations
WHERE slug = 'default'
ON CONFLICT (organization_id, user_id) DO UPDATE SET
role_id = 'org_owner',
status = 'active',
invited_by_user_id = EXCLUDED.invited_by_user_id,
updated_at = EXCLUDED.updated_at
`, user.ID, now); err != nil {
		return nil, fmt.Errorf("upsert default organization owner membership: %w", err)
	}

	return user, nil
}
// scanner is the Scan method shared by single-row query results and row
// iterators, so the scan helpers below accept either.
type scanner interface {
Scan(dest ...any) error
}
// scanOptionalUser reads one user from row in the column order id, email,
// password_hash, mfa_enabled, created_at, updated_at. A pgx.ErrNoRows result
// is reported as (nil, nil); any other scan failure is wrapped and returned.
func scanOptionalUser(row scanner) (*User, error) {
	var found User
	err := row.Scan(
		&found.ID,
		&found.Email,
		&found.PasswordHash,
		&found.MFAEnabled,
		&found.CreatedAt,
		&found.UpdatedAt,
	)
	switch {
	case err == nil:
		return &found, nil
	case errors.Is(err, pgx.ErrNoRows):
		return nil, nil
	default:
		return nil, fmt.Errorf("scan user: %w", err)
	}
}
// scanDevice reads one device from row. Every scan failure, including
// pgx.ErrNoRows, is returned wrapped with %w so callers can still match it
// with errors.Is.
func scanDevice(row scanner) (*Device, error) {
	var (
		found                   Device
		trusted, seen, revoked  *time.Time
		reason                  *string
	)
	// The nullable columns are scanned into locals and copied over afterwards.
	err := row.Scan(
		&found.ID,
		&found.UserID,
		&found.Fingerprint,
		&found.Label,
		&found.TrustStatus,
		&trusted,
		&seen,
		&revoked,
		&reason,
		&found.CreatedAt,
		&found.UpdatedAt,
	)
	if err != nil {
		return nil, fmt.Errorf("scan device: %w", err)
	}

	found.TrustedAt, found.LastSeenAt = trusted, seen
	found.RevokedAt, found.RevokedReason = revoked, reason
	return &found, nil
}
// scanAuthSession reads one auth session from row. Every scan failure,
// including pgx.ErrNoRows, is returned wrapped with %w so callers can still
// match it with errors.Is.
func scanAuthSession(row scanner) (*AuthSession, error) {
	var (
		found                  AuthSession
		seen, rotated, revoked *time.Time
		reason                 *string
	)
	// The nullable columns are scanned into locals and copied over afterwards.
	err := row.Scan(
		&found.ID,
		&found.UserID,
		&found.DeviceID,
		&found.RefreshTokenHash,
		&found.RefreshExpiresAt,
		&seen,
		&rotated,
		&revoked,
		&reason,
		&found.CreatedAt,
		&found.UpdatedAt,
	)
	if err != nil {
		return nil, fmt.Errorf("scan auth session: %w", err)
	}

	found.LastSeenAt, found.LastRotatedAt = seen, rotated
	found.RevokedAt, found.RevokedReason = revoked, reason
	return &found, nil
}
@@ -0,0 +1,97 @@
package auth
import (
"context"
"encoding/json"
"time"
)
// UserRepository looks up users. The Postgres implementation returns
// (nil, nil) from both methods when no user matches.
type UserRepository interface {
GetByEmail(ctx context.Context, email string) (*User, error)
GetByID(ctx context.Context, userID string) (*User, error)
}
// DeviceRepository stores per-user devices and their trust status.
type DeviceRepository interface {
// Upsert creates the device or refreshes an existing one and returns it.
Upsert(ctx context.Context, params UpsertDeviceParams) (*Device, error)
// GetByIDForUser returns (nil, nil) in the Postgres implementation when the
// device does not exist for that user.
GetByIDForUser(ctx context.Context, userID, deviceID string) (*Device, error)
ListTrustedByUser(ctx context.Context, userID string) ([]Device, error)
Revoke(ctx context.Context, params RevokeDeviceParams) error
}
// AuthSessionRepository stores auth sessions and their refresh token hashes.
type AuthSessionRepository interface {
Create(ctx context.Context, session AuthSession) error
// GetByID and GetByIDForUpdate return (nil, nil) in the Postgres
// implementation when the session does not exist; the latter reads the row
// with FOR UPDATE.
GetByID(ctx context.Context, authSessionID string) (*AuthSession, error)
GetByIDForUpdate(ctx context.Context, authSessionID string) (*AuthSession, error)
// Rotate, Touch, Revoke and RevokeByDevice only affect sessions that are not
// already revoked in the Postgres implementation.
Rotate(ctx context.Context, params RotateAuthSessionParams) error
Touch(ctx context.Context, authSessionID string, seenAt time.Time) error
Revoke(ctx context.Context, params RevokeAuthSessionParams) error
RevokeByDevice(ctx context.Context, userID, deviceID, reason string, revokedAt time.Time) error
}
// InstallationRepository reads and performs the one-time installation
// bootstrap.
type InstallationRepository interface {
GetStatus(ctx context.Context) (*InstallationAuthorityState, error)
// BootstrapOwner returns ErrInstallationAlreadyBootstrapped in the Postgres
// implementation when the authority row already exists.
BootstrapOwner(ctx context.Context, params BootstrapOwnerParams) (*User, error)
}
// Store groups the auth repositories so they can be handed out together,
// either on a plain connection or bound to one transaction.
type Store interface {
Users() UserRepository
Devices() DeviceRepository
AuthSessions() AuthSessionRepository
Installation() InstallationRepository
}
// Transactor runs fn with a Store whose repositories share one transaction.
// NOTE(review): commit/rollback semantics live in the implementation's
// transaction helper, which is not defined in this file — presumably a
// non-nil error from fn rolls the transaction back; confirm.
type Transactor interface {
WithinTransaction(ctx context.Context, fn func(store Store) error) error
}
// UpsertDeviceParams is the input of DeviceRepository.Upsert.
type UpsertDeviceParams struct {
UserID string
Fingerprint string
Label string
// TrustRequested asks for the device to be stored as 'trusted'; otherwise a
// new device is stored as 'pending'.
TrustRequested bool
// SeenAt is used for last_seen_at and, on insert, for the created/updated
// and trusted timestamps.
SeenAt time.Time
}
// RotateAuthSessionParams is the input of AuthSessionRepository.Rotate.
type RotateAuthSessionParams struct {
AuthSessionID string
RefreshTokenHash string
RefreshExpiresAt time.Time
LastSeenAt time.Time
// LastRotatedAt is also written to updated_at by the Postgres implementation.
LastRotatedAt time.Time
}
// RevokeAuthSessionParams is the input of AuthSessionRepository.Revoke. The
// session is matched on both AuthSessionID and UserID.
type RevokeAuthSessionParams struct {
AuthSessionID string
UserID string
Reason string
RevokedAt time.Time
}
// RevokeDeviceParams is the input of DeviceRepository.Revoke. The device is
// matched on both DeviceID and UserID.
type RevokeDeviceParams struct {
UserID string
DeviceID string
Reason string
RevokedAt time.Time
}
// InstallationAuthorityState is the stored bootstrap record returned by
// InstallationRepository.GetStatus.
type InstallationAuthorityState struct {
// Bootstrapped is true when the authority row exists.
Bootstrapped bool
// AuthorityState is the stored state, or "unbootstrapped" when no row exists.
AuthorityState string
InstallID string
ProductRootFingerprint string
BootstrappedOwnerEmail string
BootstrappedAt *time.Time
}
// BootstrapOwnerParams is the input of InstallationRepository.BootstrapOwner.
type BootstrapOwnerParams struct {
Email string
// PasswordHash is stored as given; the service passes a bcrypt hash.
PasswordHash string
Role string
InstallID string
ProductRootKeyFingerprint string
// ActivationPayload is stored as JSON; the Postgres implementation stores
// an empty object when it is empty.
ActivationPayload json.RawMessage
// ActivationSignature is stored on both the authority row and the grant.
ActivationSignature string
GrantSource string
// ExpiresAt is the optional expiry recorded on the platform role grant.
ExpiresAt *time.Time
// Now is converted to UTC and used for every timestamp written.
Now time.Time
}
+440
View File
@@ -0,0 +1,440 @@
package auth
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"github.com/google/uuid"
"golang.org/x/crypto/bcrypt"
"github.com/example/remote-access-platform/backend/internal/platform/authority"
"github.com/example/remote-access-platform/backend/internal/platform/module"
)
// Service implements login, refresh, session and device revocation, and the
// installation bootstrap.
type Service struct {
cfg module.Config
// store is used for reads outside a transaction.
store Store
// transactor supplies a transaction-bound Store for multi-step writes.
transactor Transactor
tokenManager *TokenManager
// authority may be nil when no verifier was supplied and none could be
// built from configuration; callers check for nil before using it.
authority *authority.Verifier
// now is the clock; NewService sets it to time.Now.
now func() time.Time
}
// NewService wires the auth service from its dependencies.
//
// The first entry of verifiers, when present, becomes the authority verifier.
// Otherwise one is built from deps.Config.Installation; if that fails the
// error is discarded and the service runs without a verifier (authority stays
// nil).
func NewService(deps module.Dependencies, store Store, transactor Transactor, verifiers ...*authority.Verifier) *Service {
	var chosen *authority.Verifier
	switch {
	case len(verifiers) > 0:
		chosen = verifiers[0]
	default:
		// A construction failure deliberately leaves chosen nil.
		if built, err := authority.NewVerifier(deps.Config.Installation); err == nil {
			chosen = built
		}
	}

	authCfg := deps.Config.Auth
	tokens := NewTokenManager(TokenConfig{
		Issuer:            authCfg.Issuer,
		AccessTokenSecret: authCfg.AccessTokenSecret,
		RefreshHashSecret: authCfg.RefreshHashSecret,
		AccessTokenTTL:    authCfg.AccessTokenTTL,
		RefreshTokenTTL:   authCfg.RefreshTokenTTL,
	})

	svc := new(Service)
	svc.cfg = deps.Config
	svc.store = store
	svc.transactor = transactor
	svc.tokenManager = tokens
	svc.authority = chosen
	svc.now = time.Now
	return svc
}
// Login checks the email/password pair and, inside one transaction, upserts
// the calling device, issues a refresh and an access token, and stores a new
// auth session.
//
// It returns ErrInvalidCredentials for an unknown email or a wrong password
// and ErrDeviceRevoked when the device is in the revoked state.
//
// NOTE(review): cmd.Email is looked up exactly as received, while
// BootstrapOwner stores emails trimmed and lower-cased — confirm that the
// email column or the caller normalizes case.
func (s *Service) Login(ctx context.Context, cmd LoginCommand) (*AuthResult, error) {
	account, err := s.store.Users().GetByEmail(ctx, cmd.Email)
	switch {
	case err != nil:
		return nil, err
	case account == nil:
		return nil, ErrInvalidCredentials
	}
	if bcrypt.CompareHashAndPassword([]byte(account.PasswordHash), []byte(cmd.Password)) != nil {
		return nil, ErrInvalidCredentials
	}

	var outcome AuthResult
	issuedAt := s.now().UTC()

	openSession := func(store Store) error {
		device, err := store.Devices().Upsert(ctx, UpsertDeviceParams{
			UserID:         account.ID,
			Fingerprint:    cmd.DeviceFingerprint,
			Label:          cmd.DeviceLabel,
			TrustRequested: cmd.TrustDevice,
			SeenAt:         issuedAt,
		})
		if err != nil {
			return err
		}
		if device.TrustStatus == DeviceTrustStatusRevoked {
			return ErrDeviceRevoked
		}

		sessionID := uuid.NewString()
		refreshRaw, refreshDigest, refreshExpiry, err := s.tokenManager.IssueRefreshToken(sessionID, issuedAt)
		if err != nil {
			return err
		}
		accessRaw, accessExpiry, err := s.tokenManager.IssueAccessToken(account.ID, sessionID, device.ID, issuedAt)
		if err != nil {
			return err
		}

		created := AuthSession{
			ID:               sessionID,
			UserID:           account.ID,
			DeviceID:         device.ID,
			RefreshTokenHash: refreshDigest,
			RefreshExpiresAt: refreshExpiry,
			CreatedAt:        issuedAt,
			UpdatedAt:        issuedAt,
			LastSeenAt:       &issuedAt,
		}
		if err := store.AuthSessions().Create(ctx, created); err != nil {
			return err
		}

		outcome = AuthResult{
			User:        *account,
			Device:      *device,
			AuthSession: created,
			Tokens: TokenPair{
				AccessToken:           accessRaw,
				AccessTokenExpiresAt:  accessExpiry,
				RefreshToken:          refreshRaw,
				RefreshTokenExpiresAt: refreshExpiry,
			},
		}
		return nil
	}

	if err := s.transactor.WithinTransaction(ctx, openSession); err != nil {
		return nil, err
	}
	return &outcome, nil
}
// Refresh exchanges a refresh token for a new access/refresh token pair and
// rotates the refresh hash stored on the auth session.
//
// It returns:
//   - ErrInvalidRefreshToken when the token is malformed, its session ID is
//     not a UUID, the session is unknown, the token has expired, or its hash
//     does not match the stored one;
//   - ErrAuthSessionRevoked when the session is already revoked;
//   - ErrInvalidCredentials when the session's user no longer exists;
//   - ErrTrustedDeviceMissing / ErrDeviceRevoked when the session's device is
//     gone or revoked.
//
// On expiry and on a hash mismatch the session is revoked, and that
// revocation is committed even though the call itself fails.
func (s *Service) Refresh(ctx context.Context, cmd RefreshCommand) (*AuthResult, error) {
	rawSessionID, err := s.tokenManager.ParseRefreshToken(cmd.RefreshToken)
	if err != nil {
		return nil, err
	}
	// The repository casts the ID with ::uuid. Validating it here turns a
	// malformed ID into an invalid-token error instead of a database error
	// that MapError would report as an internal failure.
	parsedID, err := uuid.Parse(rawSessionID)
	if err != nil {
		return nil, ErrInvalidRefreshToken
	}
	authSessionID := parsedID.String()

	var (
		result AuthResult
		// rejection carries a refusal whose side effect (the revocation) must
		// be committed. The transaction callback returns nil in that case,
		// because returning an error is presumed to roll the transaction back
		// and would undo the revocation.
		rejection error
	)
	now := s.now().UTC()

	revokeAndReject := func(store Store, session *AuthSession, reason string) error {
		if err := store.AuthSessions().Revoke(ctx, RevokeAuthSessionParams{
			AuthSessionID: session.ID,
			UserID:        session.UserID,
			Reason:        reason,
			RevokedAt:     now,
		}); err != nil {
			return err
		}
		rejection = ErrInvalidRefreshToken
		return nil
	}

	if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
		rejection = nil

		session, err := store.AuthSessions().GetByIDForUpdate(ctx, authSessionID)
		if err != nil {
			return err
		}
		if session == nil {
			return ErrInvalidRefreshToken
		}
		if session.RevokedAt != nil {
			return ErrAuthSessionRevoked
		}
		if now.After(session.RefreshExpiresAt) {
			return revokeAndReject(store, session, "refresh_token_expired")
		}
		// A token that names a live session but carries a different secret is
		// treated as reuse of a rotated token, and the session is revoked.
		if s.tokenManager.HashRefreshToken(cmd.RefreshToken) != session.RefreshTokenHash {
			return revokeAndReject(store, session, "refresh_rotation_reuse_detected")
		}

		user, err := store.Users().GetByID(ctx, session.UserID)
		if err != nil {
			return err
		}
		if user == nil {
			return ErrInvalidCredentials
		}

		device, err := store.Devices().GetByIDForUser(ctx, session.UserID, session.DeviceID)
		if err != nil {
			return err
		}
		if device == nil {
			return ErrTrustedDeviceMissing
		}
		if device.TrustStatus == DeviceTrustStatusRevoked {
			return ErrDeviceRevoked
		}

		refreshToken, refreshHash, refreshExpiresAt, err := s.tokenManager.IssueRefreshToken(session.ID, now)
		if err != nil {
			return err
		}
		accessToken, accessExpiresAt, err := s.tokenManager.IssueAccessToken(user.ID, session.ID, device.ID, now)
		if err != nil {
			return err
		}

		if err := store.AuthSessions().Rotate(ctx, RotateAuthSessionParams{
			AuthSessionID:    session.ID,
			RefreshTokenHash: refreshHash,
			RefreshExpiresAt: refreshExpiresAt,
			LastSeenAt:       now,
			LastRotatedAt:    now,
		}); err != nil {
			return err
		}

		result = AuthResult{
			User:   *user,
			Device: *device,
			AuthSession: AuthSession{
				ID:               session.ID,
				UserID:           session.UserID,
				DeviceID:         session.DeviceID,
				RefreshTokenHash: refreshHash,
				RefreshExpiresAt: refreshExpiresAt,
				LastSeenAt:       &now,
				LastRotatedAt:    &now,
				// Keep the original creation time; the rotation sets the
				// stored updated_at to the rotation time.
				CreatedAt: session.CreatedAt,
				UpdatedAt: now,
			},
			Tokens: TokenPair{
				AccessToken:           accessToken,
				AccessTokenExpiresAt:  accessExpiresAt,
				RefreshToken:          refreshToken,
				RefreshTokenExpiresAt: refreshExpiresAt,
			},
		}
		return nil
	}); err != nil {
		return nil, err
	}

	if rejection != nil {
		return nil, rejection
	}
	return &result, nil
}
// InstallationStatus loads the stored installation record and combines it
// with the authority verifier's settings.
func (s *Service) InstallationStatus(ctx context.Context) (*InstallationStatus, error) {
	stored, err := s.store.Installation().GetStatus(ctx)
	if err == nil {
		return s.installationStatusFromRecord(stored), nil
	}
	return nil, err
}
// BootstrapOwner creates the first owner account and records the
// installation authority.
//
// The email is trimmed and lower-cased and the password is trimmed before
// validation. ErrInvalidBootstrapOwner is returned when the email is empty or
// has no "@", or when the password is shorter than 12 or longer than 72
// bytes.
//
// In strict authority mode a signed activation is required
// (ErrInstallationActivationRequired), must verify, and must name the same
// owner email (ErrInvalidInstallationActivation); role, install ID and expiry
// are taken from it. Otherwise the verifier must allow insecure bootstrap
// (ErrInsecureBootstrapDisabled), and a "dev-insecure" activation with a
// fresh install ID is generated locally.
//
// NOTE(review): the password is trimmed here but Login compares cmd.Password
// untrimmed — confirm clients never send surrounding whitespace.
func (s *Service) BootstrapOwner(ctx context.Context, cmd BootstrapOwnerCommand) (*BootstrapOwnerResult, error) {
	// bcrypt only accepts passwords of up to 72 bytes. Rejecting longer input
	// here reports a validation error instead of letting the hashing step
	// fail and surface as an internal error.
	const (
		minPasswordBytes = 12
		maxPasswordBytes = 72
	)

	email := strings.ToLower(strings.TrimSpace(cmd.Email))
	password := strings.TrimSpace(cmd.Password)
	if email == "" || !strings.Contains(email, "@") {
		return nil, ErrInvalidBootstrapOwner
	}
	if len(password) < minPasswordBytes || len(password) > maxPasswordBytes {
		return nil, ErrInvalidBootstrapOwner
	}

	now := s.now().UTC()
	role := authority.PlatformRoleAdmin
	installID := ""
	grantSource := "installation_activation"
	rootFingerprint := ""
	activationPayload := cmd.ActivationPayload
	activationSignature := strings.TrimSpace(cmd.ActivationSignature)
	var expiresAt *time.Time

	if s.strictAuthority() {
		if len(activationPayload) == 0 || activationSignature == "" {
			return nil, ErrInstallationActivationRequired
		}
		activation, err := s.authority.VerifyActivation(activationPayload, activationSignature)
		if err != nil {
			return nil, fmt.Errorf("%w: %v", ErrInvalidInstallationActivation, err)
		}
		// The activation must have been issued for this exact owner.
		if !strings.EqualFold(activation.OwnerEmail, email) {
			return nil, ErrInvalidInstallationActivation
		}
		role = activation.PlatformRole
		installID = activation.InstallID
		expiresAt = activation.ExpiresAt
		rootFingerprint = s.authority.RootFingerprint()
	} else {
		if s.authority == nil || !s.authority.AllowInsecureBootstrap() {
			return nil, ErrInsecureBootstrapDisabled
		}
		installID = uuid.NewString()
		grantSource = "dev_insecure"
		rootFingerprint = "dev-insecure"
		devPayload, err := json.Marshal(authority.ActivationPayload{
			SchemaVersion: authority.ActivationSchemaVersion,
			InstallID:     installID,
			OwnerEmail:    email,
			PlatformRole:  role,
			IssuedAt:      now,
			Environment:   s.cfg.App.Env,
		})
		if err != nil {
			return nil, err
		}
		activationPayload = json.RawMessage(devPayload)
		activationSignature = "dev-insecure"
	}

	passwordHash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
	if err != nil {
		return nil, fmt.Errorf("hash bootstrap owner password: %w", err)
	}

	var user *User
	if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
		created, err := store.Installation().BootstrapOwner(ctx, BootstrapOwnerParams{
			Email:                     email,
			PasswordHash:              string(passwordHash),
			Role:                      role,
			InstallID:                 installID,
			ProductRootKeyFingerprint: rootFingerprint,
			ActivationPayload:         activationPayload,
			ActivationSignature:       activationSignature,
			GrantSource:               grantSource,
			ExpiresAt:                 expiresAt,
			Now:                       now,
		})
		if err != nil {
			return err
		}
		user = created
		return nil
	}); err != nil {
		return nil, err
	}

	// Re-read the status so the response reflects what was actually stored.
	status, err := s.InstallationStatus(ctx)
	if err != nil {
		return nil, err
	}
	return &BootstrapOwnerResult{
		Installation: *status,
		User:         *user,
		PlatformRole: role,
	}, nil
}
// RevokeAuthSession revokes one auth session inside a transaction. It returns
// ErrAuthSessionNotFound when the session does not exist or belongs to a
// different user than cmd.UserID.
func (s *Service) RevokeAuthSession(ctx context.Context, cmd RevokeAuthSessionCommand) error {
	revoke := func(store Store) error {
		existing, err := store.AuthSessions().GetByIDForUpdate(ctx, cmd.AuthSessionID)
		if err != nil {
			return err
		}
		// A foreign session is reported exactly like a missing one.
		owned := existing != nil && existing.UserID == cmd.UserID
		if !owned {
			return ErrAuthSessionNotFound
		}

		params := RevokeAuthSessionParams{
			AuthSessionID: cmd.AuthSessionID,
			UserID:        cmd.UserID,
			Reason:        cmd.Reason,
			RevokedAt:     s.now().UTC(),
		}
		return store.AuthSessions().Revoke(ctx, params)
	}
	return s.transactor.WithinTransaction(ctx, revoke)
}
// RevokeTrustedDevice revokes a trusted device and, in the same transaction,
// every active auth session on it. It returns ErrTrustedDeviceMissing when
// the user has no such device and ErrDeviceNotTrusted when the device is not
// currently in the trusted state.
func (s *Service) RevokeTrustedDevice(ctx context.Context, cmd RevokeDeviceCommand) error {
	revoke := func(store Store) error {
		target, err := store.Devices().GetByIDForUser(ctx, cmd.UserID, cmd.DeviceID)
		switch {
		case err != nil:
			return err
		case target == nil:
			return ErrTrustedDeviceMissing
		case target.TrustStatus != DeviceTrustStatusTrusted:
			return ErrDeviceNotTrusted
		}

		revokedAt := s.now().UTC()
		params := RevokeDeviceParams{
			UserID:    cmd.UserID,
			DeviceID:  cmd.DeviceID,
			Reason:    cmd.Reason,
			RevokedAt: revokedAt,
		}
		if err := store.Devices().Revoke(ctx, params); err != nil {
			return err
		}

		// Sessions record the device revocation together with its reason.
		sessionReason := "device_revoked:" + cmd.Reason
		return store.AuthSessions().RevokeByDevice(ctx, cmd.UserID, cmd.DeviceID, sessionReason, revokedAt)
	}
	return s.transactor.WithinTransaction(ctx, revoke)
}
// ListTrustedDevices returns the user's trusted devices as reported by the
// device repository, outside any transaction.
func (s *Service) ListTrustedDevices(ctx context.Context, userID string) ([]Device, error) {
	devices := s.store.Devices()
	return devices.ListTrustedByUser(ctx, userID)
}
// MapError translates a service error into an HTTP status code and a client
// message. A nil error maps to (0, ""). Known sentinels map to 4xx codes,
// some with a fixed message and some with the error's own text. Anything else
// maps to 500.
//
// NOTE(review): the 500 branch, and the branches that return err.Error(),
// place the error text in the response — confirm that exposing it to clients
// is intended.
func (s *Service) MapError(err error) (int, string) {
	if err == nil {
		return 0, ""
	}

	// Sentinels answered with a fixed message, checked in this order.
	fixed := []struct {
		target  error
		status  int
		message string
	}{
		{ErrInvalidCredentials, 401, "invalid credentials"},
		{ErrInvalidRefreshToken, 401, "invalid refresh token"},
		{ErrAuthSessionRevoked, 401, "auth session revoked"},
		{ErrDeviceRevoked, 403, "device revoked"},
		{ErrDeviceNotTrusted, 409, "device is not trusted"},
	}
	for _, rule := range fixed {
		if errors.Is(err, rule.target) {
			return rule.status, rule.message
		}
	}

	// Sentinels answered with the error's own text, checked in this order.
	passthrough := []struct {
		target error
		status int
	}{
		{ErrAuthSessionNotFound, 404},
		{ErrTrustedDeviceMissing, 404},
		{ErrInstallationActivationRequired, 400},
		{ErrInvalidInstallationActivation, 400},
		{ErrInvalidBootstrapOwner, 400},
		{ErrInsecureBootstrapDisabled, 403},
		{ErrInstallationAlreadyBootstrapped, 409},
	}
	for _, rule := range passthrough {
		if errors.Is(err, rule.target) {
			return rule.status, err.Error()
		}
	}

	return 500, fmt.Sprintf("internal error: %v", err)
}
// installationStatusFromRecord merges the stored installation record with
// the authority verifier's settings. A nil record is treated as
// "unbootstrapped". Without a verifier the mode is authority.ModeLegacy and
// the strict and insecure flags are false. A root fingerprint stored on the
// record takes precedence over the verifier's.
func (s *Service) installationStatusFromRecord(record *InstallationAuthorityState) *InstallationStatus {
	if record == nil {
		record = &InstallationAuthorityState{AuthorityState: "unbootstrapped"}
	}

	view := &InstallationStatus{
		Bootstrapped:           record.Bootstrapped,
		AuthorityState:         record.AuthorityState,
		InstallID:              record.InstallID,
		BootstrappedOwnerEmail: record.BootstrappedOwnerEmail,
		BootstrappedAt:         record.BootstrappedAt,
		AuthorityMode:          authority.ModeLegacy,
	}

	if verifier := s.authority; verifier != nil {
		view.AuthorityMode = verifier.Mode()
		view.StrictAuthority = verifier.Strict()
		view.RootFingerprint = verifier.RootFingerprint()
		view.InsecureBootstrapAllowed = verifier.AllowInsecureBootstrap()
	}
	if stored := record.ProductRootFingerprint; stored != "" {
		view.RootFingerprint = stored
	}
	return view
}
// strictAuthority reports whether a verifier is configured and is strict.
func (s *Service) strictAuthority() bool {
	if s.authority == nil {
		return false
	}
	return s.authority.Strict()
}
+95
View File
@@ -0,0 +1,95 @@
package auth
import (
"crypto/hmac"
"crypto/rand"
"crypto/sha256"
"encoding/base64"
"fmt"
"strings"
"time"
"github.com/golang-jwt/jwt/v5"
)
// TokenManager issues HS256-signed access tokens and random refresh tokens,
// and computes the keyed hash under which refresh tokens are stored.
type TokenManager struct {
issuer string
// accessSecret is the HMAC key used to sign access tokens.
accessSecret []byte
// refreshHashSecret is the HMAC-SHA256 key used by HashRefreshToken.
refreshHashSecret []byte
accessTTL time.Duration
refreshTTL time.Duration
}
// AccessClaims is the access token's claim set: the registered JWT claims
// plus the auth session ("sid") and device ("did") the token was issued for.
type AccessClaims struct {
AuthSessionID string `json:"sid"`
DeviceID string `json:"did"`
jwt.RegisteredClaims
}
// NewTokenManager builds a TokenManager from cfg, converting both secrets to
// byte slices. No validation is performed on the configuration.
func NewTokenManager(cfg TokenConfig) *TokenManager {
	manager := new(TokenManager)
	manager.issuer = cfg.Issuer
	manager.accessSecret = []byte(cfg.AccessTokenSecret)
	manager.refreshHashSecret = []byte(cfg.RefreshHashSecret)
	manager.accessTTL = cfg.AccessTokenTTL
	manager.refreshTTL = cfg.RefreshTokenTTL
	return manager
}
// TokenConfig carries the settings NewTokenManager needs.
type TokenConfig struct {
// Issuer is written to the "iss" claim of access tokens.
Issuer string
// AccessTokenSecret is the HS256 signing key for access tokens.
AccessTokenSecret string
// RefreshHashSecret is the HMAC key used to hash refresh tokens.
RefreshHashSecret string
// AccessTokenTTL and RefreshTokenTTL are added to the issue time to obtain
// the respective expiry.
AccessTokenTTL time.Duration
RefreshTokenTTL time.Duration
}
// IssueAccessToken creates an HS256-signed JWT for userID (the subject) bound
// to the given auth session and device.
//
// The token is valid from now until now plus the configured access TTL. It
// returns the signed token string and its expiry time, or a wrapped error if
// signing fails.
func (m *TokenManager) IssueAccessToken(userID, authSessionID, deviceID string, now time.Time) (string, time.Time, error) {
	expiry := now.Add(m.accessTTL)

	registered := jwt.RegisteredClaims{
		Issuer:    m.issuer,
		Subject:   userID,
		ExpiresAt: jwt.NewNumericDate(expiry),
		IssuedAt:  jwt.NewNumericDate(now),
		NotBefore: jwt.NewNumericDate(now),
	}
	payload := AccessClaims{
		AuthSessionID:    authSessionID,
		DeviceID:         deviceID,
		RegisteredClaims: registered,
	}

	encoded, err := jwt.NewWithClaims(jwt.SigningMethodHS256, payload).SignedString(m.accessSecret)
	if err != nil {
		return "", time.Time{}, fmt.Errorf("sign access token: %w", err)
	}
	return encoded, expiry, nil
}
// IssueRefreshToken generates a new opaque refresh token for authSessionID.
//
// The raw token is "<authSessionID>.<secret>", where secret is 32 random bytes
// in unpadded URL-safe base64. It returns the raw token, its HashRefreshToken
// digest, and the expiry (now plus the refresh TTL). If the random source
// fails, it returns a wrapped error and zero values.
func (m *TokenManager) IssueRefreshToken(authSessionID string, now time.Time) (raw string, hash string, expiresAt time.Time, err error) {
	var entropy [32]byte
	if _, readErr := rand.Read(entropy[:]); readErr != nil {
		return "", "", time.Time{}, fmt.Errorf("read random refresh secret: %w", readErr)
	}

	raw = authSessionID + "." + base64.RawURLEncoding.EncodeToString(entropy[:])
	return raw, m.HashRefreshToken(raw), now.Add(m.refreshTTL), nil
}
// HashRefreshToken returns the HMAC-SHA256 of token, keyed with the refresh
// hash secret and encoded as unpadded URL-safe base64.
func (m *TokenManager) HashRefreshToken(token string) string {
	keyed := hmac.New(sha256.New, m.refreshHashSecret)
	// hash.Hash documents that Write never returns an error.
	_, _ = keyed.Write([]byte(token))
	digest := keyed.Sum(nil)
	return base64.RawURLEncoding.EncodeToString(digest)
}
// ParseRefreshToken extracts the auth session ID, which is everything before
// the first "." in a raw refresh token.
//
// It returns ErrInvalidRefreshToken when the token has no "." or the session
// ID part is empty. The secret part is not inspected here.
func (m *TokenManager) ParseRefreshToken(token string) (string, error) {
	// dot < 0: no separator; dot == 0: empty session ID.
	dot := strings.Index(token, ".")
	if dot <= 0 {
		return "", ErrInvalidRefreshToken
	}
	return token[:dot], nil
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,34 @@
package cluster
import (
"encoding/json"
"testing"
)
// TestMeshLatestObservationKeySeparatesRouteHealthByRoute checks that a
// synthetic_route_health observation is keyed by its route_id.
func TestMeshLatestObservationKeySeparatesRouteHealthByRoute(t *testing.T) {
	const want = "synthetic_route_health:route-1"
	payload := json.RawMessage(`{
"observation_type":"synthetic_route_health",
"route_id":"route-1"
}`)
	if got := meshLatestObservationKey(payload); got != want {
		t.Fatalf("key = %q", got)
	}
}
// TestMeshLatestObservationKeySeparatesConnectionManagerMode checks that a
// peer_connection_manager observation is keyed by transport mode and relay
// node ID.
func TestMeshLatestObservationKeySeparatesConnectionManagerMode(t *testing.T) {
	const want = "peer_connection_manager:relay_control:node-r"
	payload := json.RawMessage(`{
"observation_type":"peer_connection_manager",
"transport_mode":"relay_control",
"relay_node_id":"node-r"
}`)
	if got := meshLatestObservationKey(payload); got != want {
		t.Fatalf("key = %q", got)
	}
}
// TestMeshLatestObservationKeyDefaults checks that an empty observation
// object maps to the "default" key.
func TestMeshLatestObservationKeyDefaults(t *testing.T) {
	const want = "default"
	payload := json.RawMessage(`{}`)
	if got := meshLatestObservationKey(payload); got != want {
		t.Fatalf("key = %q", got)
	}
}
@@ -0,0 +1,91 @@
package cluster
import (
"context"
"encoding/json"
"time"
)
// Repository is the persistence contract used by the cluster package. Every
// method takes a context as its first argument. The section comments below
// only group methods by the resource named in their signatures; the exact
// semantics of each method are defined by the implementation, which is not
// shown here.
type Repository interface {
// Platform roles and cluster records.
GetPlatformRole(ctx context.Context, userID string) (string, error)
ListClusters(ctx context.Context) ([]Cluster, error)
GetCluster(ctx context.Context, clusterID string) (Cluster, error)
CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error)
UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error)
GetClusterAuthority(ctx context.Context, clusterID string) (ClusterAuthorityKey, error)
EnsureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error)
// Cluster nodes and node groups.
ListClusterNodes(ctx context.Context, clusterID string) ([]ClusterNode, error)
ListNodeGroups(ctx context.Context, clusterID string) ([]ClusterNodeGroup, error)
CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error)
AssignNodeToGroup(ctx context.Context, input AssignNodeGroupInput) (ClusterNode, error)
// Join tokens (stored by hash) and join requests.
CreateJoinToken(ctx context.Context, input CreateJoinTokenInput, tokenHash string) (NodeJoinToken, error)
SetJoinTokenAuthority(ctx context.Context, clusterID, tokenID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinToken, error)
GetValidJoinTokenByHash(ctx context.Context, clusterID, tokenHash string) (NodeJoinToken, error)
RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error)
ExpireJoinTokens(ctx context.Context, clusterID string) error
CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput, joinTokenID string) (NodeJoinRequest, error)
GetJoinRequestForBootstrap(ctx context.Context, input GetJoinRequestBootstrapInput) (NodeJoinRequest, error)
ListJoinRequests(ctx context.Context, clusterID string) ([]NodeJoinRequest, error)
ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error)
SetJoinRequestApprovalAuthority(ctx context.Context, clusterID, joinRequestID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinRequest, error)
RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error)
// Node roles, membership, heartbeats and identity.
AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error)
ListNodeRoleAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeRoleAssignment, error)
AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error)
RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error)
ListNodeHeartbeats(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error)
RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error
DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error
// Fabric testing flags and node telemetry.
UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error)
ListFabricTestingFlags(ctx context.Context) ([]FabricTestingFlag, error)
GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) (EffectiveNodeTestingFlags, error)
RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error)
ListNodeTelemetry(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error)
// Desired and reported workload state.
SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error)
ListDesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error)
ReportWorkloadStatus(ctx context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error)
ListLatestWorkloadStatuses(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadStatus, error)
// Mesh links, route intents and QoS policies.
ReportMeshLink(ctx context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error)
ListMeshLinks(ctx context.Context, clusterID string) ([]MeshLinkObservation, error)
CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error)
ListRouteIntents(ctx context.Context, clusterID string) ([]MeshRouteIntent, error)
ListQoSPolicies(ctx context.Context, clusterID string) ([]MeshQoSPolicy, error)
// Fabric entry points and egress pools.
ListFabricEntryPoints(ctx context.Context, clusterID string) ([]FabricEntryPoint, error)
CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error)
SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error)
ListFabricEntryPointNodes(ctx context.Context, clusterID, entryPointID string) ([]FabricEntryPointNode, error)
ListFabricEgressPools(ctx context.Context, clusterID string) ([]FabricEgressPool, error)
CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error)
SetFabricEgressPoolNode(ctx context.Context, input SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error)
ListFabricEgressPoolNodes(ctx context.Context, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error)
// Cluster authority state and admin summaries.
GetClusterAuthorityState(ctx context.Context, clusterID string) (ClusterAuthorityState, error)
UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error)
ListClusterAdminSummaries(ctx context.Context) ([]ClusterAdminSummary, error)
// VPN connections, route policies, allowed nodes, leases and assignments.
CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error)
ListVPNConnections(ctx context.Context, clusterID string) ([]VPNConnection, error)
GetVPNConnection(ctx context.Context, clusterID, vpnConnectionID string) (VPNConnection, error)
UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error)
UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error)
ListVPNConnectionRoutePolicies(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error)
SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error)
ListVPNConnectionAllowedNodes(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error)
AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput, expiresAt time.Time, fencingToken string) (VPNConnectionLease, error)
RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput, expiresAt time.Time) (VPNConnectionLease, error)
ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error)
FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error)
GetActiveVPNConnectionLease(ctx context.Context, clusterID, vpnConnectionID string) (VPNConnectionLease, error)
CheckVPNLeaseOwnerEligibility(ctx context.Context, clusterID, vpnConnectionID, ownerNodeID string) (VPNLeaseOwnerEligibility, error)
ExpireStaleVPNConnectionLeases(ctx context.Context, clusterID string, now time.Time) ([]VPNConnectionLease, error)
ListNodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error)
ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error)
// Audit trail.
RecordAudit(ctx context.Context, event ClusterAuditEvent) error
ListAuditEvents(ctx context.Context, clusterID string, limit int) ([]ClusterAuditEvent, error)
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+43
View File
@@ -0,0 +1,43 @@
package cluster
import (
"crypto/rand"
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"errors"
"strings"
"time"
)
// joinTokenHashPrefix is prepended to the hex digest produced by
// hashJoinToken, marking the hash algorithm in the stored value.
const joinTokenHashPrefix = "sha256:"
// generateJoinToken returns a fresh join token: the literal prefix "rap_join_"
// followed by 32 random bytes in unpadded URL-safe base64. Any error from the
// random source is returned unchanged.
func generateJoinToken() (string, error) {
	entropy := make([]byte, 32)
	if _, err := rand.Read(entropy); err != nil {
		return "", err
	}
	encoded := base64.RawURLEncoding.EncodeToString(entropy)
	return "rap_join_" + encoded, nil
}
// hashJoinToken returns joinTokenHashPrefix followed by the lowercase hex
// SHA-256 digest of token with surrounding whitespace removed. It returns an
// error when the trimmed token is empty.
func hashJoinToken(token string) (string, error) {
	token = strings.TrimSpace(token)
	if len(token) == 0 {
		return "", errors.New("join token is required")
	}
	digest := sha256.Sum256([]byte(token))
	return joinTokenHashPrefix + hex.EncodeToString(digest[:]), nil
}
// isPlatformAdminRole reports whether role is PlatformRoleAdmin or
// PlatformRoleRecoveryAdmin.
func isPlatformAdminRole(role string) bool {
	switch role {
	case PlatformRoleAdmin, PlatformRoleRecoveryAdmin:
		return true
	default:
		return false
	}
}
// isAllowedNodeRole reports whether role is a key of allowedNodeRoles.
func isAllowedNodeRole(role string) bool {
	if _, known := allowedNodeRoles[role]; known {
		return true
	}
	return false
}
// defaultJoinTokenExpiry returns the instant 30 minutes after now.
func defaultJoinTokenExpiry(now time.Time) time.Time {
	const lifetime = 30 * time.Minute
	return now.Add(lifetime)
}
@@ -0,0 +1,344 @@
package identitysource
import (
"context"
"encoding/json"
"errors"
"net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
"github.com/example/remote-access-platform/backend/internal/platform/module"
)
// Module implements the identity-source HTTP endpoints on top of a Postgres
// connection pool.
type Module struct {
// db is the pool every handler queries.
db *pgxpool.Pool
}
// IdentitySource is the JSON representation of a row in identity_sources.
// Its fields correspond one-to-one to the columns selected by the handlers.
type IdentitySource struct {
ID string `json:"id"`
OrganizationID string `json:"organization_id"`
Kind string `json:"kind"`
Name string `json:"name"`
Status string `json:"status"`
// Config is raw JSON, written to the database with a ::jsonb cast.
Config json.RawMessage `json:"config"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// IdentityMapping is the JSON representation of a row in identity_mappings,
// linked to its parent through IdentitySourceID.
type IdentityMapping struct {
ID string `json:"id"`
IdentitySourceID string `json:"identity_source_id"`
MappingType string `json:"mapping_type"`
// ExternalSelector and InternalTarget are raw JSON documents (jsonb
// columns); their schema is not defined in this file.
ExternalSelector json.RawMessage `json:"external_selector"`
InternalTarget json.RawMessage `json:"internal_target"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// upsertIdentitySourceRequest is the JSON body accepted by both the create
// and update identity-source endpoints. decodeRequest validates it and fills
// in defaults.
type upsertIdentitySourceRequest struct {
// ActorUserID is required by decodeRequest; it is not written to the
// database by the handlers in this file.
ActorUserID string `json:"actor_user_id"`
OrganizationID string `json:"organization_id"`
Kind string `json:"kind"`
Name string `json:"name"`
// Status defaults to "active" when omitted.
Status string `json:"status"`
// Config defaults to an empty JSON object when omitted.
Config json.RawMessage `json:"config"`
// IdentityMappings is the full set of mappings for the source; on update
// the existing mappings are deleted and replaced by this list.
IdentityMappings []struct {
MappingType string `json:"mapping_type"`
ExternalSelector json.RawMessage `json:"external_selector"`
InternalTarget json.RawMessage `json:"internal_target"`
} `json:"identity_mappings"`
}
// NewModule builds the identity-source module, taking its database pool from
// deps.Infra.DB.
func NewModule(deps module.Dependencies) *Module {
	mod := new(Module)
	mod.db = deps.Infra.DB
	return mod
}
// Name returns the module identifier, "identitysource".
func (m *Module) Name() string {
	const moduleName = "identitysource"
	return moduleName
}
// RegisterRoutes mounts the identity-source endpoints under
// /identity-sources: list and create on the collection, get and update on
// /{identitySourceID}.
func (m *Module) RegisterRoutes(router chi.Router) {
	mount := func(sub chi.Router) {
		sub.Get("/", m.listIdentitySources)
		sub.Post("/", m.createIdentitySource)
		sub.Get("/{identitySourceID}", m.getIdentitySource)
		sub.Put("/{identitySourceID}", m.updateIdentitySource)
	}
	router.Route("/identity-sources", mount)
}
// listIdentitySources handles GET /identity-sources?organization_id=...
//
// It responds 400 when organization_id is missing, 500 on any database or
// scan error, and otherwise 200 with {"identity_sources": [...]} ordered
// newest first.
func (m *Module) listIdentitySources(w http.ResponseWriter, r *http.Request) {
	orgID := r.URL.Query().Get("organization_id")
	if orgID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "organization_id is required")
		return
	}
	rows, err := m.db.Query(r.Context(), `
SELECT id, organization_id, kind, name, status, config, created_at, updated_at
FROM identity_sources
WHERE organization_id = $1
ORDER BY created_at DESC
`, orgID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	defer rows.Close()

	var items []IdentitySource
	for rows.Next() {
		item, err := scanIdentitySource(rows)
		if err != nil {
			httpx.WriteError(w, http.StatusInternalServerError, err.Error())
			return
		}
		items = append(items, item)
	}
	// Next returns false both at end-of-data and on a mid-stream failure;
	// only Err tells them apart. Without this check a broken result stream
	// would be served as a complete (truncated) list.
	if err := rows.Err(); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"identity_sources": items})
}
// getIdentitySource handles GET /identity-sources/{identitySourceID}.
//
// It responds 404 when the source does not exist, 500 on any other lookup
// failure, and otherwise 200 with the source and its mappings.
func (m *Module) getIdentitySource(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	sourceID := chi.URLParam(r, "identitySourceID")

	source, err := m.getByID(ctx, sourceID)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		httpx.WriteError(w, http.StatusNotFound, "identity source not found")
		return
	case err != nil:
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	mappings, err := m.listMappings(ctx, sourceID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	body := map[string]any{
		"identity_source":   source,
		"identity_mappings": mappings,
	}
	httpx.WriteJSON(w, http.StatusOK, body)
}
// createIdentitySource handles POST /identity-sources.
//
// The source row and all requested mappings are inserted in one transaction.
// It responds 400 for an invalid payload, 500 for any database failure, and
// 201 with the created source and mappings on success.
func (m *Module) createIdentitySource(w http.ResponseWriter, r *http.Request) {
	const insertSource = `
INSERT INTO identity_sources (id, organization_id, kind, name, status, config, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6::jsonb, $7, $8)
`
	ctx := r.Context()
	serverError := func(cause error) {
		httpx.WriteError(w, http.StatusInternalServerError, cause.Error())
	}

	req, err := decodeRequest(r)
	if err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}

	stamp := time.Now().UTC()
	source := IdentitySource{
		ID:             uuid.NewString(),
		OrganizationID: req.OrganizationID,
		Kind:           req.Kind,
		Name:           req.Name,
		Status:         req.Status,
		Config:         req.Config,
		CreatedAt:      stamp,
		UpdatedAt:      stamp,
	}

	tx, err := m.db.Begin(ctx)
	if err != nil {
		serverError(err)
		return
	}
	defer tx.Rollback(ctx)

	_, err = tx.Exec(ctx, insertSource,
		source.ID, source.OrganizationID, source.Kind, source.Name, source.Status,
		[]byte(source.Config), source.CreatedAt, source.UpdatedAt)
	if err != nil {
		serverError(err)
		return
	}

	mappings, err := upsertMappings(ctx, tx, source.ID, req.IdentityMappings)
	if err != nil {
		serverError(err)
		return
	}
	if err = tx.Commit(ctx); err != nil {
		serverError(err)
		return
	}

	httpx.WriteJSON(w, http.StatusCreated, map[string]any{
		"identity_source":   source,
		"identity_mappings": mappings,
	})
}
// updateIdentitySource handles PUT /identity-sources/{identitySourceID}.
//
// In one transaction it overwrites the source row, deletes all of its
// mappings and inserts the requested ones. After commit the source is re-read
// for the response. It responds 400 for an invalid payload, 404 when no row
// matches the ID, 500 for any database failure, and 200 on success.
func (m *Module) updateIdentitySource(w http.ResponseWriter, r *http.Request) {
	const updateSource = `
UPDATE identity_sources
SET organization_id = $2, kind = $3, name = $4, status = $5, config = $6::jsonb, updated_at = $7
WHERE id = $1
`
	ctx := r.Context()
	serverError := func(cause error) {
		httpx.WriteError(w, http.StatusInternalServerError, cause.Error())
	}

	req, err := decodeRequest(r)
	if err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}
	sourceID := chi.URLParam(r, "identitySourceID")

	tx, err := m.db.Begin(ctx)
	if err != nil {
		serverError(err)
		return
	}
	defer tx.Rollback(ctx)

	result, err := tx.Exec(ctx, updateSource,
		sourceID, req.OrganizationID, req.Kind, req.Name, req.Status, []byte(req.Config), time.Now().UTC())
	if err != nil {
		serverError(err)
		return
	}
	if result.RowsAffected() == 0 {
		httpx.WriteError(w, http.StatusNotFound, "identity source not found")
		return
	}

	// Mappings are replaced wholesale: drop the old set, insert the new one.
	_, err = tx.Exec(ctx, `DELETE FROM identity_mappings WHERE identity_source_id = $1`, sourceID)
	if err != nil {
		serverError(err)
		return
	}
	mappings, err := upsertMappings(ctx, tx, sourceID, req.IdentityMappings)
	if err != nil {
		serverError(err)
		return
	}
	if err = tx.Commit(ctx); err != nil {
		serverError(err)
		return
	}

	source, err := m.getByID(ctx, sourceID)
	if err != nil {
		serverError(err)
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{
		"identity_source":   source,
		"identity_mappings": mappings,
	})
}
// decodeRequest parses and validates the JSON body shared by the create and
// update identity-source endpoints.
//
// It returns an error when the body is not valid JSON, when any of
// actor_user_id, organization_id, kind or name is empty, or when config is
// not valid JSON. Defaults applied: status "active", config "{}", and "{}"
// for any mapping's external_selector / internal_target that was omitted.
func decodeRequest(r *http.Request) (*upsertIdentitySourceRequest, error) {
	var req upsertIdentitySourceRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		return nil, errors.New("invalid identity source payload")
	}
	if req.ActorUserID == "" || req.OrganizationID == "" || req.Kind == "" || req.Name == "" {
		return nil, errors.New("actor_user_id, organization_id, kind, and name are required")
	}
	if req.Status == "" {
		req.Status = "active"
	}
	if len(req.Config) == 0 {
		req.Config = json.RawMessage(`{}`)
	}
	if !json.Valid(req.Config) {
		return nil, errors.New("config must be valid json")
	}
	// Index into the slice: ranging by value would hand us a copy of each
	// element and the defaults below would be silently discarded.
	for i := range req.IdentityMappings {
		mapping := &req.IdentityMappings[i]
		if len(mapping.ExternalSelector) == 0 {
			mapping.ExternalSelector = json.RawMessage(`{}`)
		}
		if len(mapping.InternalTarget) == 0 {
			mapping.InternalTarget = json.RawMessage(`{}`)
		}
	}
	return &req, nil
}
// getByID loads a single identity source by primary key. The error from
// scanning the row is returned as-is; callers in this file test it against
// pgx.ErrNoRows to detect a missing source.
func (m *Module) getByID(ctx context.Context, id string) (IdentitySource, error) {
	const selectByID = `
SELECT id, organization_id, kind, name, status, config, created_at, updated_at
FROM identity_sources
WHERE id = $1
`
	return scanIdentitySource(m.db.QueryRow(ctx, selectByID, id))
}
// listMappings returns every mapping belonging to sourceID, oldest first.
// The result is nil when the source has no mappings.
func (m *Module) listMappings(ctx context.Context, sourceID string) ([]IdentityMapping, error) {
	const selectMappings = `
SELECT id, identity_source_id, mapping_type, external_selector, internal_target, created_at, updated_at
FROM identity_mappings
WHERE identity_source_id = $1
ORDER BY created_at ASC
`
	cursor, err := m.db.Query(ctx, selectMappings, sourceID)
	if err != nil {
		return nil, err
	}
	defer cursor.Close()

	var found []IdentityMapping
	for cursor.Next() {
		mapping, scanErr := scanIdentityMapping(cursor)
		if scanErr != nil {
			return nil, scanErr
		}
		found = append(found, mapping)
	}
	return found, cursor.Err()
}
// upsertMappings inserts one identity_mappings row per requested mapping
// inside tx and returns the inserted rows in request order.
//
// Every row gets a fresh UUID and the same created/updated timestamp. An
// omitted external_selector or internal_target is stored as "{}". Despite the
// name, this only inserts; callers that replace mappings delete the old rows
// first. The first failing insert aborts and returns its error.
func upsertMappings(ctx context.Context, tx pgx.Tx, sourceID string, requested []struct {
	MappingType      string          `json:"mapping_type"`
	ExternalSelector json.RawMessage `json:"external_selector"`
	InternalTarget   json.RawMessage `json:"internal_target"`
}) ([]IdentityMapping, error) {
	const insertMapping = `
INSERT INTO identity_mappings (
id, identity_source_id, mapping_type, external_selector, internal_target, created_at, updated_at
) VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6, $7)
`
	orEmptyObject := func(raw json.RawMessage) json.RawMessage {
		if len(raw) == 0 {
			return json.RawMessage(`{}`)
		}
		return raw
	}

	stamp := time.Now().UTC()
	inserted := make([]IdentityMapping, 0, len(requested))
	for i := range requested {
		row := IdentityMapping{
			ID:               uuid.NewString(),
			IdentitySourceID: sourceID,
			MappingType:      requested[i].MappingType,
			ExternalSelector: orEmptyObject(requested[i].ExternalSelector),
			InternalTarget:   orEmptyObject(requested[i].InternalTarget),
			CreatedAt:        stamp,
			UpdatedAt:        stamp,
		}
		_, err := tx.Exec(ctx, insertMapping,
			row.ID, row.IdentitySourceID, row.MappingType,
			[]byte(row.ExternalSelector), []byte(row.InternalTarget),
			row.CreatedAt, row.UpdatedAt)
		if err != nil {
			return nil, err
		}
		inserted = append(inserted, row)
	}
	return inserted, nil
}
// rowScanner is the single Scan method the scan helpers need, so they accept
// both the result of QueryRow and the rows iterated from Query.
type rowScanner interface {
Scan(dest ...any) error
}
// scanIdentitySource reads one identity_sources row from row, with columns in
// the order used by the SELECT statements in this file. An empty config is
// normalized to "{}". On scan failure it returns the zero value and the error.
func scanIdentitySource(row rowScanner) (IdentitySource, error) {
	var src IdentitySource
	err := row.Scan(
		&src.ID, &src.OrganizationID, &src.Kind, &src.Name,
		&src.Status, &src.Config, &src.CreatedAt, &src.UpdatedAt,
	)
	switch {
	case err != nil:
		return IdentitySource{}, err
	case len(src.Config) == 0:
		src.Config = json.RawMessage(`{}`)
	}
	return src, nil
}
// scanIdentityMapping reads one identity_mappings row from row, with columns
// in the order used by listMappings. Empty selector/target values are
// normalized to "{}". On scan failure it returns the zero value and the error.
func scanIdentityMapping(row rowScanner) (IdentityMapping, error) {
	var mapping IdentityMapping
	err := row.Scan(
		&mapping.ID, &mapping.IdentitySourceID, &mapping.MappingType,
		&mapping.ExternalSelector, &mapping.InternalTarget,
		&mapping.CreatedAt, &mapping.UpdatedAt,
	)
	if err != nil {
		return IdentityMapping{}, err
	}
	for _, doc := range []*json.RawMessage{&mapping.ExternalSelector, &mapping.InternalTarget} {
		if len(*doc) == 0 {
			*doc = json.RawMessage(`{}`)
		}
	}
	return mapping, nil
}
+458
View File
@@ -0,0 +1,458 @@
package node
import (
"context"
"encoding/json"
"errors"
"net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
"github.com/example/remote-access-platform/backend/internal/platform/module"
)
// Module implements the node HTTP endpoints on top of a Postgres connection
// pool.
type Module struct {
// db is the pool every handler queries.
db *pgxpool.Pool
}
// Node is the JSON representation of a row in the nodes table. Pointer fields
// are nullable columns and are omitted from JSON when nil.
type Node struct {
ID string `json:"id"`
OwnerOrganizationID *string `json:"owner_organization_id,omitempty"`
NodeKey string `json:"node_key"`
Name string `json:"name"`
OwnershipType string `json:"ownership_type"`
// The four state fields below are initialized by createNode to "pending",
// "unknown", "unknown" and "healthy" respectively; updateNode does not
// modify them.
RegistrationStatus string `json:"registration_status"`
HealthStatus string `json:"health_status"`
VersionState string `json:"version_state"`
PartitionState string `json:"partition_state"`
DesiredVersion *string `json:"desired_version,omitempty"`
ReportedVersion *string `json:"reported_version,omitempty"`
LastSeenAt *time.Time `json:"last_seen_at,omitempty"`
// Metadata is raw JSON, written to the database with a ::jsonb cast.
Metadata json.RawMessage `json:"metadata"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// NodeCapability is the JSON representation of a row in node_capabilities:
// one named capability of a node with a raw JSON value.
type NodeCapability struct {
NodeID string `json:"node_id"`
Capability string `json:"capability"`
Value json.RawMessage `json:"value"`
UpdatedAt time.Time `json:"updated_at"`
}
// NodeService is the JSON representation of a row in node_services: one
// service type on a node with its desired and reported state.
type NodeService struct {
NodeID string `json:"node_id"`
ServiceType string `json:"service_type"`
Enabled bool `json:"enabled"`
DesiredState string `json:"desired_state"`
// ReportedState is inserted as 'unknown' (and LastReportedAt as NULL) by
// persistNode.
ReportedState string `json:"reported_state"`
LastReportedAt *time.Time `json:"last_reported_at,omitempty"`
Metadata json.RawMessage `json:"metadata"`
UpdatedAt time.Time `json:"updated_at"`
}
// NodeUpdatePolicy is the JSON representation of a row in
// node_update_policies, one per node (node_id is the upsert conflict key).
type NodeUpdatePolicy struct {
NodeID string `json:"node_id"`
// Mode and Channel default to "manual" and "stable" in decodeNodeRequest.
Mode string `json:"mode"`
Channel string `json:"channel"`
// MaintenanceWindow is raw JSON; its schema is not defined in this file.
MaintenanceWindow json.RawMessage `json:"maintenance_window"`
Canary bool `json:"canary"`
AutomaticRollout bool `json:"automatic_rollout"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// NodePartitionState is the JSON representation of a row in
// node_partition_states, one per node (node_id is the upsert conflict key).
type NodePartitionState struct {
NodeID string `json:"node_id"`
// ClusterState and RecoveryMode default to "healthy" and "normal" in
// decodeNodeRequest.
ClusterState string `json:"cluster_state"`
RecoveryMode string `json:"recovery_mode"`
Notes *string `json:"notes,omitempty"`
UpdatedAt time.Time `json:"updated_at"`
}
// upsertNodeRequest is the JSON body accepted by both the create and update
// node endpoints. decodeNodeRequest validates it and fills in defaults.
type upsertNodeRequest struct {
// ActorUserID is required by decodeNodeRequest; it is not written to the
// database by the handlers in this file.
ActorUserID string `json:"actor_user_id"`
OwnerOrganizationID *string `json:"owner_organization_id"`
NodeKey string `json:"node_key"`
Name string `json:"name"`
OwnershipType string `json:"ownership_type"`
DesiredVersion *string `json:"desired_version"`
// Metadata defaults to an empty JSON object when omitted.
Metadata json.RawMessage `json:"metadata"`
// Capabilities is the full capability set; persistNode deletes the node's
// existing capabilities and inserts these.
Capabilities []struct {
Capability string `json:"capability"`
Value json.RawMessage `json:"value"`
} `json:"capabilities"`
// Services is the full service set; persistNode deletes the node's
// existing services and inserts these. DesiredState defaults to "disabled".
Services []struct {
ServiceType string `json:"service_type"`
Enabled bool `json:"enabled"`
DesiredState string `json:"desired_state"`
Metadata json.RawMessage `json:"metadata"`
} `json:"services"`
// UpdatePolicy is upserted per node; Mode defaults to "manual" and
// Channel to "stable".
UpdatePolicy struct {
Mode string `json:"mode"`
Channel string `json:"channel"`
MaintenanceWindow json.RawMessage `json:"maintenance_window"`
Canary bool `json:"canary"`
AutomaticRollout bool `json:"automatic_rollout"`
} `json:"update_policy"`
// PartitionState is upserted per node; ClusterState defaults to "healthy"
// and RecoveryMode to "normal".
PartitionState struct {
ClusterState string `json:"cluster_state"`
RecoveryMode string `json:"recovery_mode"`
Notes *string `json:"notes"`
} `json:"partition_state"`
}
// NewModule builds the node module, taking its database pool from
// deps.Infra.DB.
func NewModule(deps module.Dependencies) *Module {
	mod := new(Module)
	mod.db = deps.Infra.DB
	return mod
}
// Name returns the module identifier, "node".
func (m *Module) Name() string {
	const moduleName = "node"
	return moduleName
}
// RegisterRoutes mounts the node endpoints under /nodes: list and create on
// the collection, get and update on /{nodeID}.
func (m *Module) RegisterRoutes(router chi.Router) {
	mount := func(sub chi.Router) {
		sub.Get("/", m.listNodes)
		sub.Post("/", m.createNode)
		sub.Get("/{nodeID}", m.getNode)
		sub.Put("/{nodeID}", m.updateNode)
	}
	router.Route("/nodes", mount)
}
// listNodes handles GET /nodes.
//
// It responds 500 on any database or scan error and otherwise 200 with
// {"nodes": [...]} ordered newest first.
func (m *Module) listNodes(w http.ResponseWriter, r *http.Request) {
	rows, err := m.db.Query(r.Context(), `
SELECT id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
FROM nodes
ORDER BY created_at DESC
`)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	defer rows.Close()

	var items []Node
	for rows.Next() {
		item, err := scanNode(rows)
		if err != nil {
			httpx.WriteError(w, http.StatusInternalServerError, err.Error())
			return
		}
		items = append(items, item)
	}
	// Next returns false both at end-of-data and on a mid-stream failure;
	// only Err tells them apart. Without this check a broken result stream
	// would be served as a complete (truncated) list.
	if err := rows.Err(); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"nodes": items})
}
// getNode handles GET /nodes/{nodeID}.
//
// It responds 404 when the node does not exist and 200 with the node plus its
// capabilities, services, update policy and partition state otherwise.
//
// Failures while loading the related data are reported as 500 instead of
// being dropped, so a database error can no longer produce a 200 response
// with silently missing sections. A missing update-policy or partition-state
// row (pgx.ErrNoRows) is still tolerated: the value returned by the lookup is
// passed through unchanged.
func (m *Module) getNode(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	nodeID := chi.URLParam(r, "nodeID")

	item, err := m.getNodeByID(ctx, nodeID)
	if err != nil {
		if errors.Is(err, pgx.ErrNoRows) {
			httpx.WriteError(w, http.StatusNotFound, "node not found")
			return
		}
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	caps, err := m.listCapabilities(ctx, nodeID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	services, err := m.listServices(ctx, nodeID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// The single-row lookups are optional: "no row" is not an error here.
	updatePolicy, err := m.getUpdatePolicy(ctx, nodeID)
	if err != nil && !errors.Is(err, pgx.ErrNoRows) {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	partitionState, err := m.getPartitionState(ctx, nodeID)
	if err != nil && !errors.Is(err, pgx.ErrNoRows) {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	httpx.WriteJSON(w, http.StatusOK, map[string]any{
		"node":            item,
		"capabilities":    caps,
		"services":        services,
		"update_policy":   updatePolicy,
		"partition_state": partitionState,
	})
}
// createNode handles POST /nodes.
//
// A new node starts with registration status "pending", health and version
// state "unknown" and partition state "healthy". It responds 400 for an
// invalid payload, 500 when persisting fails, and 201 with the created node.
func (m *Module) createNode(w http.ResponseWriter, r *http.Request) {
	req, err := decodeNodeRequest(r)
	if err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}

	// Read the clock once so a freshly created node has identical
	// CreatedAt and UpdatedAt values.
	now := time.Now().UTC()
	item := Node{
		ID:                  uuid.NewString(),
		OwnerOrganizationID: req.OwnerOrganizationID,
		NodeKey:             req.NodeKey,
		Name:                req.Name,
		OwnershipType:       req.OwnershipType,
		RegistrationStatus:  "pending",
		HealthStatus:        "unknown",
		VersionState:        "unknown",
		PartitionState:      "healthy",
		DesiredVersion:      req.DesiredVersion,
		Metadata:            req.Metadata,
		CreatedAt:           now,
		UpdatedAt:           now,
	}
	if err := m.persistNode(r.Context(), item, req, true); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusCreated, map[string]any{"node": item})
}
// updateNode handles PUT /nodes/{nodeID}.
//
// It loads the existing node, overwrites the client-editable fields (owner,
// key, name, ownership type, desired version, metadata), stamps UpdatedAt and
// persists the result together with the related rows. It responds 400 for an
// invalid payload, 404 when the node does not exist, 500 on database
// failures, and 200 with the updated node.
func (m *Module) updateNode(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	req, err := decodeNodeRequest(r)
	if err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}

	current, err := m.getNodeByID(ctx, chi.URLParam(r, "nodeID"))
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		httpx.WriteError(w, http.StatusNotFound, "node not found")
		return
	case err != nil:
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	current.OwnerOrganizationID = req.OwnerOrganizationID
	current.NodeKey = req.NodeKey
	current.Name = req.Name
	current.OwnershipType = req.OwnershipType
	current.DesiredVersion = req.DesiredVersion
	current.Metadata = req.Metadata
	current.UpdatedAt = time.Now().UTC()

	if err = m.persistNode(ctx, current, req, false); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"node": current})
}
// decodeNodeRequest parses and validates the JSON body shared by the create
// and update node endpoints.
//
// It returns an error when the body is not valid JSON, when any of
// actor_user_id, node_key, name or ownership_type is empty, or when metadata
// is not valid JSON. Defaults applied: metadata "{}", update policy mode
// "manual" / channel "stable" / maintenance window "{}", partition cluster
// state "healthy" / recovery mode "normal", capability values "{}", service
// desired state "disabled" and service metadata "{}".
func decodeNodeRequest(r *http.Request) (*upsertNodeRequest, error) {
	// Small helpers that fill a field only when the client left it empty.
	defaultJSON := func(doc *json.RawMessage) {
		if len(*doc) == 0 {
			*doc = json.RawMessage(`{}`)
		}
	}
	defaultString := func(field *string, fallback string) {
		if *field == "" {
			*field = fallback
		}
	}

	var req upsertNodeRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		return nil, errors.New("invalid node payload")
	}
	for _, required := range []string{req.ActorUserID, req.NodeKey, req.Name, req.OwnershipType} {
		if required == "" {
			return nil, errors.New("actor_user_id, node_key, name, and ownership_type are required")
		}
	}

	defaultJSON(&req.Metadata)
	if !json.Valid(req.Metadata) {
		return nil, errors.New("metadata must be valid json")
	}

	defaultString(&req.UpdatePolicy.Mode, "manual")
	defaultString(&req.UpdatePolicy.Channel, "stable")
	defaultJSON(&req.UpdatePolicy.MaintenanceWindow)

	defaultString(&req.PartitionState.ClusterState, "healthy")
	defaultString(&req.PartitionState.RecoveryMode, "normal")

	for idx := range req.Capabilities {
		defaultJSON(&req.Capabilities[idx].Value)
	}
	for idx := range req.Services {
		defaultString(&req.Services[idx].DesiredState, "disabled")
		defaultJSON(&req.Services[idx].Metadata)
	}
	return &req, nil
}
// persistNode writes a node and all of its related rows in one transaction.
//
// When create is true the node row is inserted; otherwise the editable
// columns of the existing row are updated. In both cases the node's
// capabilities and services are replaced with the ones in req, and its update
// policy and partition state are upserted.
//
// On the update path it returns pgx.ErrNoRows when no node row matches
// item.ID (for example, the node was deleted after the caller loaded it), so
// related rows are never written for a node that no longer exists. Any other
// database error is returned unchanged and the transaction is rolled back.
func (m *Module) persistNode(ctx context.Context, item Node, req *upsertNodeRequest, create bool) error {
	tx, err := m.db.Begin(ctx)
	if err != nil {
		return err
	}
	// Runs on every return path; after a successful Commit the transaction is
	// already closed and the result of Rollback is intentionally ignored.
	defer tx.Rollback(ctx)

	// One timestamp for every related row written by this transaction.
	now := time.Now().UTC()

	if create {
		_, err = tx.Exec(ctx, `
INSERT INTO nodes (
id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13::jsonb,$14,$15)
`, item.ID, item.OwnerOrganizationID, item.NodeKey, item.Name, item.OwnershipType, item.RegistrationStatus, item.HealthStatus, item.VersionState, item.PartitionState, item.DesiredVersion, item.ReportedVersion, item.LastSeenAt, []byte(item.Metadata), item.CreatedAt, item.UpdatedAt)
		if err != nil {
			return err
		}
	} else {
		tag, err := tx.Exec(ctx, `
UPDATE nodes
SET owner_organization_id=$2, node_key=$3, name=$4, ownership_type=$5, desired_version=$6, metadata=$7::jsonb, updated_at=$8
WHERE id=$1
`, item.ID, item.OwnerOrganizationID, item.NodeKey, item.Name, item.OwnershipType, item.DesiredVersion, []byte(item.Metadata), item.UpdatedAt)
		if err != nil {
			return err
		}
		if tag.RowsAffected() == 0 {
			return pgx.ErrNoRows
		}
	}

	// Capabilities are replaced wholesale.
	if _, err := tx.Exec(ctx, `DELETE FROM node_capabilities WHERE node_id = $1`, item.ID); err != nil {
		return err
	}
	for _, capability := range req.Capabilities {
		if _, err := tx.Exec(ctx, `
INSERT INTO node_capabilities (node_id, capability, value, updated_at)
VALUES ($1, $2, $3::jsonb, $4)
`, item.ID, capability.Capability, []byte(capability.Value), now); err != nil {
			return err
		}
	}

	// Services are replaced wholesale; reported state restarts at 'unknown'.
	if _, err := tx.Exec(ctx, `DELETE FROM node_services WHERE node_id = $1`, item.ID); err != nil {
		return err
	}
	for _, service := range req.Services {
		if _, err := tx.Exec(ctx, `
INSERT INTO node_services (
node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at
) VALUES ($1, $2, $3, $4, 'unknown', NULL, $5::jsonb, $6)
`, item.ID, service.ServiceType, service.Enabled, service.DesiredState, []byte(service.Metadata), now); err != nil {
			return err
		}
	}

	if _, err := tx.Exec(ctx, `
INSERT INTO node_update_policies (
node_id, mode, channel, maintenance_window, canary, automatic_rollout, created_at, updated_at
) VALUES ($1,$2,$3,$4::jsonb,$5,$6,$7,$8)
ON CONFLICT (node_id) DO UPDATE SET
mode = EXCLUDED.mode,
channel = EXCLUDED.channel,
maintenance_window = EXCLUDED.maintenance_window,
canary = EXCLUDED.canary,
automatic_rollout = EXCLUDED.automatic_rollout,
updated_at = EXCLUDED.updated_at
`, item.ID, req.UpdatePolicy.Mode, req.UpdatePolicy.Channel, []byte(req.UpdatePolicy.MaintenanceWindow), req.UpdatePolicy.Canary, req.UpdatePolicy.AutomaticRollout, now, now); err != nil {
		return err
	}

	if _, err := tx.Exec(ctx, `
INSERT INTO node_partition_states (node_id, cluster_state, recovery_mode, notes, updated_at)
VALUES ($1,$2,$3,$4,$5)
ON CONFLICT (node_id) DO UPDATE SET
cluster_state = EXCLUDED.cluster_state,
recovery_mode = EXCLUDED.recovery_mode,
notes = EXCLUDED.notes,
updated_at = EXCLUDED.updated_at
`, item.ID, req.PartitionState.ClusterState, req.PartitionState.RecoveryMode, req.PartitionState.Notes, now); err != nil {
		return err
	}

	return tx.Commit(ctx)
}
// getNodeByID selects the node row whose id equals nodeID and decodes it with
// scanNode. Whatever error the scan produces (including the driver's
// "no rows" error) is returned to the caller unchanged.
func (m *Module) getNodeByID(ctx context.Context, nodeID string) (Node, error) {
	const selectNode = `
SELECT id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
FROM nodes
WHERE id = $1
`
	return scanNode(m.db.QueryRow(ctx, selectNode, nodeID))
}
// listCapabilities returns every node_capabilities row for nodeID, ordered by
// capability name. A scan failure aborts with a nil slice; an iteration error
// reported by rows.Err is returned together with the rows collected so far.
func (m *Module) listCapabilities(ctx context.Context, nodeID string) ([]NodeCapability, error) {
	const selectCapabilities = `SELECT node_id, capability, value, updated_at FROM node_capabilities WHERE node_id = $1 ORDER BY capability`

	rows, queryErr := m.db.Query(ctx, selectCapabilities, nodeID)
	if queryErr != nil {
		return nil, queryErr
	}
	defer rows.Close()

	var capabilities []NodeCapability
	for rows.Next() {
		var capability NodeCapability
		scanErr := rows.Scan(&capability.NodeID, &capability.Capability, &capability.Value, &capability.UpdatedAt)
		if scanErr != nil {
			return nil, scanErr
		}
		capabilities = append(capabilities, capability)
	}
	return capabilities, rows.Err()
}
// listServices returns every node_services row for nodeID, ordered by service
// type. A scan failure aborts with a nil slice; an iteration error reported by
// rows.Err is returned together with the rows collected so far.
func (m *Module) listServices(ctx context.Context, nodeID string) ([]NodeService, error) {
	const selectServices = `
SELECT node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at
FROM node_services WHERE node_id = $1 ORDER BY service_type
`
	rows, queryErr := m.db.Query(ctx, selectServices, nodeID)
	if queryErr != nil {
		return nil, queryErr
	}
	defer rows.Close()

	var services []NodeService
	for rows.Next() {
		var svc NodeService
		scanErr := rows.Scan(
			&svc.NodeID,
			&svc.ServiceType,
			&svc.Enabled,
			&svc.DesiredState,
			&svc.ReportedState,
			&svc.LastReportedAt,
			&svc.Metadata,
			&svc.UpdatedAt,
		)
		if scanErr != nil {
			return nil, scanErr
		}
		services = append(services, svc)
	}
	return services, rows.Err()
}
// getUpdatePolicy fetches the node_update_policies row for nodeID.
// It returns (nil, nil) when the node has no policy row, and (nil, err) for
// any other scan failure.
func (m *Module) getUpdatePolicy(ctx context.Context, nodeID string) (*NodeUpdatePolicy, error) {
	const selectPolicy = `
SELECT node_id, mode, channel, maintenance_window, canary, automatic_rollout, created_at, updated_at
FROM node_update_policies WHERE node_id = $1
`
	var policy NodeUpdatePolicy
	err := m.db.QueryRow(ctx, selectPolicy, nodeID).Scan(
		&policy.NodeID,
		&policy.Mode,
		&policy.Channel,
		&policy.MaintenanceWindow,
		&policy.Canary,
		&policy.AutomaticRollout,
		&policy.CreatedAt,
		&policy.UpdatedAt,
	)
	switch {
	case err == nil:
		return &policy, nil
	case errors.Is(err, pgx.ErrNoRows):
		// Absence of a policy is not an error for callers.
		return nil, nil
	default:
		return nil, err
	}
}
// getPartitionState fetches the node_partition_states row for nodeID.
// It returns (nil, nil) when no row exists, and (nil, err) for any other scan
// failure.
func (m *Module) getPartitionState(ctx context.Context, nodeID string) (*NodePartitionState, error) {
	const selectState = `
SELECT node_id, cluster_state, recovery_mode, notes, updated_at
FROM node_partition_states WHERE node_id = $1
`
	var state NodePartitionState
	err := m.db.QueryRow(ctx, selectState, nodeID).Scan(
		&state.NodeID,
		&state.ClusterState,
		&state.RecoveryMode,
		&state.Notes,
		&state.UpdatedAt,
	)
	switch {
	case err == nil:
		return &state, nil
	case errors.Is(err, pgx.ErrNoRows):
		// Absence of a partition-state row is not an error for callers.
		return nil, nil
	default:
		return nil, err
	}
}
// rowScanner is the single Scan method that scanNode needs from a query
// result, so the helper can accept any value exposing Scan(dest ...any).
type rowScanner interface {
Scan(dest ...any) error
}
// scanNode decodes one nodes row (columns in the order used by getNodeByID)
// into a Node. On a scan error it returns the zero Node and the error. When
// the scanned metadata is empty it is replaced with an empty JSON object.
func scanNode(row rowScanner) (Node, error) {
	var node Node
	// Destinations are listed in the same order as the selected columns.
	targets := []any{
		&node.ID,
		&node.OwnerOrganizationID,
		&node.NodeKey,
		&node.Name,
		&node.OwnershipType,
		&node.RegistrationStatus,
		&node.HealthStatus,
		&node.VersionState,
		&node.PartitionState,
		&node.DesiredVersion,
		&node.ReportedVersion,
		&node.LastSeenAt,
		&node.Metadata,
		&node.CreatedAt,
		&node.UpdatedAt,
	}
	if scanErr := row.Scan(targets...); scanErr != nil {
		return Node{}, scanErr
	}
	if len(node.Metadata) == 0 {
		node.Metadata = json.RawMessage(`{}`)
	}
	return node, nil
}
@@ -0,0 +1,356 @@
package nodeagent
import (
"encoding/json"
"errors"
"net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"github.com/jackc/pgx/v5/pgxpool"
clustermodule "github.com/example/remote-access-platform/backend/internal/modules/cluster"
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
"github.com/example/remote-access-platform/backend/internal/platform/module"
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
)
// Module is the node-agent HTTP module. Its handlers talk to the database
// pool directly for node/service/update bookkeeping and delegate enrollment
// and VPN-assignment operations to the cluster service.
type Module struct {
db *pgxpool.Pool
cluster *clustermodule.Service
}
// NewModule builds the node-agent module from the shared dependencies.
// The cluster store is backed by deps.Infra.DB. When an encryption key is
// configured and an encryptor can be built from it, the encryptor is attached
// to the store; if building the encryptor fails the error is dropped and the
// store is used without one.
func NewModule(deps module.Dependencies) *Module {
	store := clustermodule.NewPostgresStore(deps.Infra.DB)

	keyBase64 := deps.Config.Secret.EncryptionKeyBase64
	if keyBase64 != "" {
		encryptor, err := secrets.NewEncryptor(keyBase64, deps.Config.Secret.EncryptionKeyID)
		if err == nil {
			store.WithClusterKeyEncryptor(encryptor)
		}
	}

	mod := &Module{
		db:      deps.Infra.DB,
		cluster: clustermodule.NewService(store),
	}
	return mod
}
// Name returns the module identifier, "nodeagent".
func (m *Module) Name() string {
	const moduleName = "nodeagent"
	return moduleName
}
// RegisterRoutes mounts all node-agent endpoints under /node-agents on the
// given router.
func (m *Module) RegisterRoutes(router chi.Router) {
	mount := func(agents chi.Router) {
		// Enrollment and legacy registration.
		agents.Post("/enroll", m.enrollAgent)
		agents.Post("/enrollments/{requestID}/bootstrap", m.bootstrapEnrollment)
		agents.Post("/register", m.registerAgent)

		// Per-node reporting.
		agents.Post("/{nodeID}/health", m.reportHealth)
		agents.Post("/{nodeID}/services/status", m.reportServiceStatus)

		// Update and rollback flow.
		agents.Post("/{nodeID}/update-manifest/request", m.requestUpdateManifest)
		agents.Post("/{nodeID}/update-result", m.acknowledgeUpdateResult)
		agents.Post("/{nodeID}/rollback-result", m.reportRollbackResult)

		// VPN assignments scoped to a cluster.
		agents.Get("/{nodeID}/clusters/{clusterID}/vpn-assignments/desired", m.listVPNAssignments)
		agents.Post("/{nodeID}/clusters/{clusterID}/vpn-assignments/{vpnConnectionID}/status", m.reportVPNAssignmentStatus)
	}
	router.Route("/node-agents", mount)
}
// enrollAgent handles POST /node-agents/enroll. It decodes the enrollment
// body, forwards it to the cluster service as a join request and answers 202
// with status "pending_approval" and the created join request. A body that
// cannot be decoded, or any error from the cluster service, yields a 400.
func (m *Module) enrollAgent(w http.ResponseWriter, r *http.Request) {
	type enrollRequest struct {
		ClusterID            string          `json:"cluster_id"`
		JoinToken            string          `json:"join_token"`
		NodeName             string          `json:"node_name"`
		NodeFingerprint      string          `json:"node_fingerprint"`
		PublicKey            string          `json:"public_key"`
		ReportedCapabilities json.RawMessage `json:"reported_capabilities"`
		ReportedFacts        json.RawMessage `json:"reported_facts"`
		RequestedRoles       json.RawMessage `json:"requested_roles"`
	}

	var body enrollRequest
	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid agent enrollment payload")
		return
	}

	input := clustermodule.CreateJoinRequestInput{
		ClusterID:            body.ClusterID,
		JoinToken:            body.JoinToken,
		NodeName:             body.NodeName,
		NodeFingerprint:      body.NodeFingerprint,
		PublicKey:            body.PublicKey,
		ReportedCapabilities: body.ReportedCapabilities,
		ReportedFacts:        body.ReportedFacts,
		RequestedRoles:       body.RequestedRoles,
	}
	joinRequest, err := m.cluster.CreateJoinRequest(r.Context(), input)
	if err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}

	response := map[string]any{
		"status":       "pending_approval",
		"join_request": joinRequest,
	}
	httpx.WriteJSON(w, http.StatusAccepted, response)
}
// bootstrapEnrollment handles POST /node-agents/enrollments/{requestID}/bootstrap.
// It combines the requestID path parameter with the cluster id, fingerprint
// and public key from the body, asks the cluster service for the join-request
// bootstrap data and writes it back with 200. Decode failures and service
// errors both yield a 400.
func (m *Module) bootstrapEnrollment(w http.ResponseWriter, r *http.Request) {
	type bootstrapRequest struct {
		ClusterID       string `json:"cluster_id"`
		NodeFingerprint string `json:"node_fingerprint"`
		PublicKey       string `json:"public_key"`
	}

	var body bootstrapRequest
	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid enrollment bootstrap payload")
		return
	}

	input := clustermodule.GetJoinRequestBootstrapInput{
		ClusterID:       body.ClusterID,
		JoinRequestID:   chi.URLParam(r, "requestID"),
		NodeFingerprint: body.NodeFingerprint,
		PublicKey:       body.PublicKey,
	}
	bootstrap, err := m.cluster.GetJoinRequestBootstrap(r.Context(), input)
	if err != nil {
		httpx.WriteError(w, http.StatusBadRequest, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, bootstrap)
}
// registerAgent handles POST /node-agents/register, the legacy direct
// registration path. It requires node_key, name and ownership_type, upserts
// the nodes row keyed by node_key (marking it 'active') and responds 200 with
// the node id returned by the database, which on conflict is the id of the
// existing row rather than the freshly generated one.
func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) {
	type registerRequest struct {
		NodeKey             string          `json:"node_key"`
		Name                string          `json:"name"`
		OwnershipType       string          `json:"ownership_type"`
		OwnerOrganizationID *string         `json:"owner_organization_id"`
		ReportedVersion     *string         `json:"reported_version"`
		Metadata            json.RawMessage `json:"metadata"`
	}

	var body registerRequest
	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid agent registration payload")
		return
	}

	missingRequired := body.NodeKey == "" || body.Name == "" || body.OwnershipType == ""
	if missingRequired {
		httpx.WriteError(w, http.StatusBadRequest, "node_key, name, and ownership_type are required")
		return
	}

	metadata := body.Metadata
	if len(metadata) == 0 {
		metadata = json.RawMessage(`{}`)
	}

	const upsertNode = `
INSERT INTO nodes (
id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
) VALUES ($1, $2, $3, $4, $5, 'active', 'unknown', 'unknown', 'healthy', NULL, $6, $7, $8::jsonb, $9, $10)
ON CONFLICT (node_key) DO UPDATE SET
name = EXCLUDED.name,
ownership_type = EXCLUDED.ownership_type,
owner_organization_id = EXCLUDED.owner_organization_id,
registration_status = 'active',
reported_version = EXCLUDED.reported_version,
last_seen_at = EXCLUDED.last_seen_at,
metadata = EXCLUDED.metadata,
updated_at = EXCLUDED.updated_at
RETURNING id
`
	registeredAt := time.Now().UTC()
	// Candidate id for a new row; overwritten by RETURNING id below.
	nodeID := uuid.NewString()

	row := m.db.QueryRow(r.Context(), upsertNode,
		nodeID,
		body.OwnerOrganizationID,
		body.NodeKey,
		body.Name,
		body.OwnershipType,
		body.ReportedVersion,
		registeredAt,
		[]byte(metadata),
		registeredAt,
		registeredAt,
	)
	if err := row.Scan(&nodeID); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	response := map[string]any{
		"node_id": nodeID,
		"status":  "registered",
		"legacy":  true,
		"warning": "direct node-agent registration is retained for compatibility; production enrollment must use /node-agents/enroll",
	}
	httpx.WriteJSON(w, http.StatusOK, response)
}
// reportHealth handles POST /node-agents/{nodeID}/health. It updates the
// node's health status, last-seen timestamp and metadata, and updates the
// reported version only when the body supplies one.
//
// Responses:
//   - 400 when the body cannot be decoded,
//   - 500 when the UPDATE fails,
//   - 404 when no nodes row matches nodeID (previously this case was silently
//     acknowledged with 202 even though nothing was written),
//   - 202 when the row was updated.
//
// An empty health_status defaults to "unknown" and empty metadata defaults
// to "{}".
// NOTE(review): metadata is overwritten on every report, so a report without
// metadata resets the stored value to {} — confirm this is intended.
func (m *Module) reportHealth(w http.ResponseWriter, r *http.Request) {
	var payload struct {
		HealthStatus    string          `json:"health_status"`
		ReportedVersion *string         `json:"reported_version"`
		Metadata        json.RawMessage `json:"metadata"`
	}
	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid node health payload")
		return
	}
	if payload.HealthStatus == "" {
		payload.HealthStatus = "unknown"
	}
	if len(payload.Metadata) == 0 {
		payload.Metadata = json.RawMessage(`{}`)
	}

	tag, err := m.db.Exec(r.Context(), `
UPDATE nodes
SET health_status = $2, reported_version = COALESCE($3, reported_version), last_seen_at = $4, metadata = $5::jsonb, updated_at = $4
WHERE id = $1
`, chi.URLParam(r, "nodeID"), payload.HealthStatus, payload.ReportedVersion, time.Now().UTC(), []byte(payload.Metadata))
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// Zero affected rows means the node id does not exist; do not claim the
	// report was accepted.
	if tag.RowsAffected() == 0 {
		httpx.WriteError(w, http.StatusNotFound, "node not found")
		return
	}
	httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"})
}
// reportServiceStatus handles POST /node-agents/{nodeID}/services/status.
// For each reported service it upserts a node_services row: a new row is
// inserted disabled with desired_state 'disabled'; an existing row only has
// its reported state, report time, metadata and updated_at refreshed.
// Services are written one statement at a time; the first failing statement
// produces a 500 and stops processing. On success it responds 202.
func (m *Module) reportServiceStatus(w http.ResponseWriter, r *http.Request) {
	type serviceReport struct {
		ServiceType   string          `json:"service_type"`
		ReportedState string          `json:"reported_state"`
		Metadata      json.RawMessage `json:"metadata"`
	}
	var body struct {
		Services []serviceReport `json:"services"`
	}
	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid node service status payload")
		return
	}

	const upsertService = `
INSERT INTO node_services (
node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at
) VALUES ($1, $2, FALSE, 'disabled', $3, $4, $5::jsonb, $4)
ON CONFLICT (node_id, service_type) DO UPDATE SET
reported_state = EXCLUDED.reported_state,
last_reported_at = EXCLUDED.last_reported_at,
metadata = EXCLUDED.metadata,
updated_at = EXCLUDED.updated_at
`
	ctx := r.Context()
	nodeID := chi.URLParam(r, "nodeID")
	reportedAt := time.Now().UTC()

	for i := range body.Services {
		report := body.Services[i]
		metadata := report.Metadata
		if len(metadata) == 0 {
			metadata = json.RawMessage(`{}`)
		}
		_, execErr := m.db.Exec(ctx, upsertService, nodeID, report.ServiceType, report.ReportedState, reportedAt, []byte(metadata))
		if execErr != nil {
			httpx.WriteError(w, http.StatusInternalServerError, execErr.Error())
			return
		}
	}
	httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"})
}
// listVPNAssignments handles
// GET /node-agents/{nodeID}/clusters/{clusterID}/vpn-assignments/desired.
// It asks the cluster service for the node's VPN assignments and returns them
// with 200; service errors are translated by writeClusterServiceError. The
// response always reports runtime_execution_enabled as false.
func (m *Module) listVPNAssignments(w http.ResponseWriter, r *http.Request) {
	clusterID := chi.URLParam(r, "clusterID")
	nodeID := chi.URLParam(r, "nodeID")

	assignments, err := m.cluster.ListNodeVPNAssignments(r.Context(), clusterID, nodeID)
	if failed := writeClusterServiceError(w, err); failed {
		return
	}

	response := map[string]any{
		"vpn_assignments":           assignments,
		"runtime_execution_enabled": false,
	}
	httpx.WriteJSON(w, http.StatusOK, response)
}
// reportVPNAssignmentStatus handles
// POST /node-agents/{nodeID}/clusters/{clusterID}/vpn-assignments/{vpnConnectionID}/status.
// It forwards the observed status to the cluster service and responds 202
// with the resulting status record. When the body omits observed_at the zero
// time is passed to the service. Service errors are translated by
// writeClusterServiceError.
func (m *Module) reportVPNAssignmentStatus(w http.ResponseWriter, r *http.Request) {
	type statusReport struct {
		ObservedStatus string          `json:"observed_status"`
		StatusPayload  json.RawMessage `json:"status_payload"`
		ObservedAt     *time.Time      `json:"observed_at"`
	}

	var body statusReport
	if decodeErr := json.NewDecoder(r.Body).Decode(&body); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid vpn assignment status payload")
		return
	}

	var observedAt time.Time
	if body.ObservedAt != nil {
		observedAt = *body.ObservedAt
	}

	input := clustermodule.ReportNodeVPNAssignmentStatusInput{
		ClusterID:       chi.URLParam(r, "clusterID"),
		NodeID:          chi.URLParam(r, "nodeID"),
		VPNConnectionID: chi.URLParam(r, "vpnConnectionID"),
		ObservedStatus:  body.ObservedStatus,
		StatusPayload:   body.StatusPayload,
		ObservedAt:      observedAt,
	}
	status, err := m.cluster.ReportNodeVPNAssignmentStatus(r.Context(), input)
	if failed := writeClusterServiceError(w, err); failed {
		return
	}

	response := map[string]any{
		"vpn_assignment_status":     status,
		"runtime_execution_enabled": false,
	}
	httpx.WriteJSON(w, http.StatusAccepted, response)
}
// requestUpdateManifest handles POST /node-agents/{nodeID}/update-manifest/request.
// It reads the node's desired version together with its update policy and
// returns them as a "manifest" object with 200.
//
// The policy is LEFT JOINed, so a node without a node_update_policies row
// produces NULL for mode, channel, canary and automatic_rollout. Those
// columns are therefore scanned into pointers; scanning NULL into plain
// string/bool destinations fails and used to turn "no policy" into a 500.
// Absent policy fields are emitted as JSON null, like desired_version.
//
// Any query/scan error (including an unknown node id) yields a 500.
func (m *Module) requestUpdateManifest(w http.ResponseWriter, r *http.Request) {
	nodeID := chi.URLParam(r, "nodeID")

	var (
		desiredVersion    *string
		mode, channel     *string
		canary, automatic *bool
	)
	if err := m.db.QueryRow(r.Context(), `
SELECT n.desired_version, p.mode, p.channel, p.canary, p.automatic_rollout
FROM nodes n
LEFT JOIN node_update_policies p ON p.node_id = n.id
WHERE n.id = $1
`, nodeID).Scan(&desiredVersion, &mode, &channel, &canary, &automatic); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	httpx.WriteJSON(w, http.StatusOK, map[string]any{
		"manifest": map[string]any{
			"node_id":           nodeID,
			"desired_version":   desiredVersion,
			"mode":              mode,
			"channel":           channel,
			"canary":            canary,
			"automatic_rollout": automatic,
		},
	})
}
// acknowledgeUpdateResult handles POST /node-agents/{nodeID}/update-result by
// recording the reported run with action "update".
func (m *Module) acknowledgeUpdateResult(w http.ResponseWriter, r *http.Request) {
	const action = "update"
	m.recordUpdateRun(w, r, action)
}
// reportRollbackResult handles POST /node-agents/{nodeID}/rollback-result by
// recording the reported run with action "rollback".
func (m *Module) reportRollbackResult(w http.ResponseWriter, r *http.Request) {
	const action = "rollback"
	m.recordUpdateRun(w, r, action)
}
// recordUpdateRun stores the outcome an agent reports for an update or
// rollback (action is "update" or "rollback") and, when the run succeeded,
// updates the node's reported version and version state.
//
// Both writes run in a single transaction. Previously the node UPDATE ran
// after the run had been inserted and its error was discarded, so a failed
// UPDATE left a "succeeded" run recorded against a node whose version state
// was never changed while the agent still received 202. Now either both
// changes are committed or neither is, and any failure is reported as 500.
//
// An empty status defaults to "acknowledged" and an empty payload to "{}".
// completed_at is only set for the 'succeeded' and 'failed' statuses.
// On success the response is 202 with the generated run_id.
func (m *Module) recordUpdateRun(w http.ResponseWriter, r *http.Request, action string) {
	var payload struct {
		TargetVersion string          `json:"target_version"`
		Status        string          `json:"status"`
		Payload       json.RawMessage `json:"payload"`
	}
	if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid update result payload")
		return
	}
	if payload.Status == "" {
		payload.Status = "acknowledged"
	}
	if len(payload.Payload) == 0 {
		payload.Payload = json.RawMessage(`{}`)
	}

	ctx := r.Context()
	nodeID := chi.URLParam(r, "nodeID")
	now := time.Now().UTC()
	runID := uuid.NewString()

	tx, err := m.db.Begin(ctx)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// Rollback after a successful Commit is a no-op; its error carries no
	// additional information for the client and is intentionally ignored.
	defer func() { _ = tx.Rollback(ctx) }()

	if _, err := tx.Exec(ctx, `
INSERT INTO node_agent_update_runs (
id, node_id, action, target_version, status, requested_at, acknowledged_at, completed_at, payload
) VALUES ($1, $2, $3, $4, $5, $6, $6, CASE WHEN $5 IN ('succeeded', 'failed') THEN $6 ELSE NULL END, $7::jsonb)
`, runID, nodeID, action, payload.TargetVersion, payload.Status, now, []byte(payload.Payload)); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	// Only a succeeded run changes the node; the resulting version_state
	// depends on whether it was an update or a rollback.
	if payload.Status == "succeeded" {
		var nodeUpdate string
		switch action {
		case "update":
			nodeUpdate = `
UPDATE nodes
SET reported_version = $2, version_state = 'current', updated_at = $3
WHERE id = $1
`
		case "rollback":
			nodeUpdate = `
UPDATE nodes
SET reported_version = $2, version_state = 'rollback', updated_at = $3
WHERE id = $1
`
		}
		if nodeUpdate != "" {
			if _, err := tx.Exec(ctx, nodeUpdate, nodeID, payload.TargetVersion, now); err != nil {
				httpx.WriteError(w, http.StatusInternalServerError, err.Error())
				return
			}
		}
	}

	if err := tx.Commit(ctx); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted", "run_id": runID})
}
// writeClusterServiceError writes an HTTP error response for err and reports
// whether it did so. A nil err writes nothing and returns false. Otherwise
// the status is 403 for the VPN lease-owner errors, 400 for an invalid
// payload and 500 for everything else, with err.Error() as the message.
func writeClusterServiceError(w http.ResponseWriter, err error) bool {
	if err == nil {
		return false
	}

	status := http.StatusInternalServerError
	if errors.Is(err, clustermodule.ErrVPNLeaseOwnerNotAllowed) || errors.Is(err, clustermodule.ErrVPNLeaseOwnerRoleRequired) {
		status = http.StatusForbidden
	} else if errors.Is(err, clustermodule.ErrInvalidPayload) {
		status = http.StatusBadRequest
	}

	httpx.WriteError(w, status, err.Error())
	return true
}
@@ -0,0 +1,21 @@
package organization
import "testing"
// TestTenantSafeTopologyExposureDoesNotExposeCoreMesh checks that the
// topology exposure marker is exactly the tenant-safe value and is not equal
// to any of the forbidden internal-topology tokens.
// NOTE(review): the forbidden-token check compares for equality, not
// containment; a substring check would be a stronger guard.
func TestTenantSafeTopologyExposureDoesNotExposeCoreMesh(t *testing.T) {
	const want = "tenant_safe_no_core_mesh_topology"

	got := tenantSafeTopologyExposure()

	for _, forbidden := range [...]string{
		"core_node_id",
		"mesh_route",
		"cluster_private_topology",
		"certificate_serial",
	} {
		if got == forbidden {
			t.Fatalf("topology exposure leaked forbidden token %q", forbidden)
		}
	}

	if got != want {
		t.Fatalf("unexpected topology exposure marker: %q", got)
	}
}
@@ -0,0 +1,518 @@
package organization
import (
"context"
"encoding/json"
"errors"
"net/http"
"strings"
"time"
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/example/remote-access-platform/backend/internal/platform/authority"
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
"github.com/example/remote-access-platform/backend/internal/platform/module"
)
// Organization membership role identifiers as stored in
// organization_memberships.role_id. ensureOrgAccess treats RoleOrgOwner and
// RoleOrgAdmin as the roles that satisfy an admin-required check.
const (
RoleOrgOwner = "org_owner"
RoleOrgAdmin = "org_admin"
RoleOrgOperator = "org_operator"
RoleOrgMember = "org_member"
RoleOrgViewer = "org_viewer"
)
// Module is the organization HTTP module. It queries the database pool
// directly and uses the authority verifier when resolving a user's effective
// platform role (see getPlatformRole).
type Module struct {
db *pgxpool.Pool
authority *authority.Verifier
}
// Organization is the JSON representation of a row of the organizations
// table, populated by scanOrganization.
type Organization struct {
ID string `json:"id"`
Slug string `json:"slug"`
Name string `json:"name"`
Status string `json:"status"`
// Metadata is raw JSON; scanOrganization substitutes {} when it is empty.
Metadata json.RawMessage `json:"metadata"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// Membership is the JSON representation of a row of the
// organization_memberships table, populated by scanMembership.
type Membership struct {
ID string `json:"id"`
OrganizationID string `json:"organization_id"`
UserID string `json:"user_id"`
// RoleID holds the membership role identifier (e.g. RoleOrgOwner).
RoleID string `json:"role_id"`
Status string `json:"status"`
// InvitedByUser is omitted from JSON when nil.
InvitedByUser *string `json:"invited_by_user_id,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// AdminSummary is the payload returned by the admin-summary endpoint; it is
// assembled by loadAdminSummary.
type AdminSummary struct {
OrganizationID string `json:"organization_id"`
// ResourceCount is the number of resources rows for the organization.
ResourceCount int64 `json:"resource_count"`
// ActiveSessionCount counts remote_sessions rows in state 'active'.
ActiveSessionCount int64 `json:"active_session_count"`
// ServiceEndpoints holds the per-protocol resource counts.
ServiceEndpoints []ServiceSummary `json:"service_endpoints"`
ConnectorStatus map[string]any `json:"connector_status"`
// RecentAudit holds at most 20 audit events, newest first.
RecentAudit []OrgAuditEvent `json:"recent_audit"`
// TopologyExposure carries the marker from tenantSafeTopologyExposure.
TopologyExposure string `json:"topology_exposure"`
}
// ServiceSummary is one row of the per-protocol resource count produced by
// loadAdminSummary.
type ServiceSummary struct {
Protocol string `json:"protocol"`
Count int64 `json:"count"`
}
// OrgAuditEvent is an audit_events row as exposed in AdminSummary.RecentAudit.
type OrgAuditEvent struct {
// ID is the audit event id, selected as text by loadAdminSummary.
ID string `json:"id"`
EventType string `json:"event_type"`
TargetType string `json:"target_type"`
TargetID string `json:"target_id"`
Payload json.RawMessage `json:"payload"`
CreatedAt time.Time `json:"created_at"`
}
// createOrganizationRequest is the JSON body accepted by createOrganization.
// actor_user_id, slug and name are required; metadata is optional.
type createOrganizationRequest struct {
ActorUserID string `json:"actor_user_id"`
Slug string `json:"slug"`
Name string `json:"name"`
Metadata json.RawMessage `json:"metadata"`
}
// addMembershipRequest is the JSON body accepted by addMembership; all three
// fields are required.
type addMembershipRequest struct {
ActorUserID string `json:"actor_user_id"`
UserID string `json:"user_id"`
RoleID string `json:"role_id"`
}
// NewModule builds the organization module from the shared dependencies.
// The error returned by authority.NewVerifier is discarded, so the stored
// verifier is whatever value NewVerifier returned alongside it.
// NOTE(review): presumably a nil verifier is tolerated downstream — confirm.
func NewModule(deps module.Dependencies) *Module {
	verifier, _ := authority.NewVerifier(deps.Config.Installation)

	mod := &Module{
		db:        deps.Infra.DB,
		authority: verifier,
	}
	return mod
}
// Name returns the module identifier, "organization".
func (m *Module) Name() string {
	const moduleName = "organization"
	return moduleName
}
// RegisterRoutes mounts the organization endpoints under /organizations on
// the given router.
func (m *Module) RegisterRoutes(router chi.Router) {
	mount := func(orgs chi.Router) {
		// Collection endpoints.
		orgs.Get("/", m.listOrganizations)
		orgs.Post("/", m.createOrganization)

		// Single-organization endpoints.
		orgs.Get("/{organizationID}", m.getOrganization)
		orgs.Get("/{organizationID}/admin-summary", m.getAdminSummary)
		orgs.Get("/{organizationID}/memberships", m.listMemberships)
		orgs.Post("/{organizationID}/memberships", m.addMembership)
	}
	router.Route("/organizations", mount)
}
// listOrganizations handles GET /organizations?user_id=... .
// Platform admins receive every organization; other users receive only the
// organizations in which they hold an active membership. Results are ordered
// newest first.
//
// Responses: 400 when user_id is missing, 500 when resolving the platform
// role, querying, scanning or iterating fails, 200 with the list otherwise.
//
// The iteration error from rows.Err is checked after the loop; without that
// check a failure part-way through the result set would return a truncated
// list with status 200.
func (m *Module) listOrganizations(w http.ResponseWriter, r *http.Request) {
	userID := r.URL.Query().Get("user_id")
	if userID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
		return
	}
	platformRole, err := m.getPlatformRole(r.Context(), userID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	var rows pgx.Rows
	if isPlatformAdmin(platformRole) {
		rows, err = m.db.Query(r.Context(), `
SELECT id, slug, name, status, metadata, created_at, updated_at
FROM organizations
ORDER BY created_at DESC
`)
	} else {
		rows, err = m.db.Query(r.Context(), `
SELECT o.id, o.slug, o.name, o.status, o.metadata, o.created_at, o.updated_at
FROM organizations o
INNER JOIN organization_memberships om ON om.organization_id = o.id
WHERE om.user_id = $1 AND om.status = 'active'
ORDER BY o.created_at DESC
`, userID)
	}
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	defer rows.Close()

	var organizations []Organization
	for rows.Next() {
		org, err := scanOrganization(rows)
		if err != nil {
			httpx.WriteError(w, http.StatusInternalServerError, err.Error())
			return
		}
		organizations = append(organizations, org)
	}
	// rows.Next returning false can also mean the stream failed.
	if err := rows.Err(); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"organizations": organizations})
}
// getOrganization handles GET /organizations/{organizationID}?user_id=... .
// The caller must pass ensureOrgAccess without the admin requirement.
//
// Responses: 400 when user_id is missing; 403 when access is denied (or the
// access check reports "no rows"); 500 for other access-check or load
// failures; 404 when the organization row does not exist; 200 otherwise.
func (m *Module) getOrganization(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	organizationID := chi.URLParam(r, "organizationID")

	requesterID := r.URL.Query().Get("user_id")
	if requesterID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
		return
	}

	if accessErr := m.ensureOrgAccess(ctx, organizationID, requesterID, false); accessErr != nil {
		denied := errors.Is(accessErr, pgx.ErrNoRows) || errors.Is(accessErr, errForbidden)
		if denied {
			httpx.WriteError(w, http.StatusForbidden, accessErr.Error())
		} else {
			httpx.WriteError(w, http.StatusInternalServerError, accessErr.Error())
		}
		return
	}

	organization, loadErr := m.getOrganizationByID(ctx, organizationID)
	switch {
	case loadErr == nil:
		httpx.WriteJSON(w, http.StatusOK, map[string]any{"organization": organization})
	case errors.Is(loadErr, pgx.ErrNoRows):
		httpx.WriteError(w, http.StatusNotFound, "organization not found")
	default:
		httpx.WriteError(w, http.StatusInternalServerError, loadErr.Error())
	}
}
// getAdminSummary handles
// GET /organizations/{organizationID}/admin-summary?actor_user_id=... .
// The actor must pass ensureOrgAccess with the admin requirement.
//
// Responses: 400 when actor_user_id is missing; 403 when access is denied;
// 500 when the access check or the summary load fails for another reason;
// 200 with the summary otherwise.
//
// Access-check failures are mapped the same way getOrganization maps them:
// only errForbidden / "no rows" become 403. Previously every error from the
// access check, including database failures, was reported as 403.
func (m *Module) getAdminSummary(w http.ResponseWriter, r *http.Request) {
	orgID := chi.URLParam(r, "organizationID")
	actorUserID := r.URL.Query().Get("actor_user_id")
	if actorUserID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "actor_user_id is required")
		return
	}
	if err := m.ensureOrgAccess(r.Context(), orgID, actorUserID, true); err != nil {
		status := http.StatusInternalServerError
		if errors.Is(err, pgx.ErrNoRows) || errors.Is(err, errForbidden) {
			status = http.StatusForbidden
		}
		httpx.WriteError(w, status, err.Error())
		return
	}
	summary, err := m.loadAdminSummary(r.Context(), orgID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"admin_summary": summary})
}
// loadAdminSummary gathers the admin dashboard data for one organization:
// the total resource count, the number of remote sessions in state 'active',
// resource counts grouped by protocol, and the 20 most recent audit events
// joined to the organization's remote sessions. The connector status values
// are fixed "not_implemented" placeholders. The first failing query, scan or
// iteration aborts with a zero AdminSummary and the error.
func (m *Module) loadAdminSummary(ctx context.Context, orgID string) (AdminSummary, error) {
	const (
		countResources = `
SELECT COUNT(*)
FROM resources
WHERE organization_id = $1::uuid
`
		countActiveSessions = `
SELECT COUNT(*)
FROM remote_sessions
WHERE organization_id = $1::uuid
AND state = 'active'
`
		countByProtocol = `
SELECT protocol, COUNT(*)
FROM resources
WHERE organization_id = $1::uuid
GROUP BY protocol
ORDER BY protocol
`
		recentAuditEvents = `
SELECT ae.id::text, ae.event_type, ae.target_type, ae.target_id, ae.payload, ae.created_at
FROM audit_events ae
LEFT JOIN remote_sessions rs ON rs.id = ae.remote_session_id
WHERE rs.organization_id = $1::uuid
ORDER BY ae.created_at DESC
LIMIT 20
`
	)

	var resources int64
	if err := m.db.QueryRow(ctx, countResources, orgID).Scan(&resources); err != nil {
		return AdminSummary{}, err
	}

	var activeSessions int64
	if err := m.db.QueryRow(ctx, countActiveSessions, orgID).Scan(&activeSessions); err != nil {
		return AdminSummary{}, err
	}

	protocolRows, err := m.db.Query(ctx, countByProtocol, orgID)
	if err != nil {
		return AdminSummary{}, err
	}
	defer protocolRows.Close()

	var endpoints []ServiceSummary
	for protocolRows.Next() {
		var entry ServiceSummary
		if scanErr := protocolRows.Scan(&entry.Protocol, &entry.Count); scanErr != nil {
			return AdminSummary{}, scanErr
		}
		endpoints = append(endpoints, entry)
	}
	if iterErr := protocolRows.Err(); iterErr != nil {
		return AdminSummary{}, iterErr
	}

	eventRows, err := m.db.Query(ctx, recentAuditEvents, orgID)
	if err != nil {
		return AdminSummary{}, err
	}
	defer eventRows.Close()

	var events []OrgAuditEvent
	for eventRows.Next() {
		var event OrgAuditEvent
		scanErr := eventRows.Scan(
			&event.ID,
			&event.EventType,
			&event.TargetType,
			&event.TargetID,
			&event.Payload,
			&event.CreatedAt,
		)
		if scanErr != nil {
			return AdminSummary{}, scanErr
		}
		events = append(events, event)
	}
	if iterErr := eventRows.Err(); iterErr != nil {
		return AdminSummary{}, iterErr
	}

	summary := AdminSummary{
		OrganizationID:     orgID,
		ResourceCount:      resources,
		ActiveSessionCount: activeSessions,
		ServiceEndpoints:   endpoints,
		ConnectorStatus: map[string]any{
			"vpn":       "not_implemented",
			"connector": "not_implemented",
		},
		RecentAudit:      events,
		TopologyExposure: tenantSafeTopologyExposure(),
	}
	return summary, nil
}
// tenantSafeTopologyExposure returns the fixed marker placed in
// AdminSummary.TopologyExposure.
func tenantSafeTopologyExposure() string {
	const marker = "tenant_safe_no_core_mesh_topology"
	return marker
}
// createOrganization handles POST /organizations. Only platform admins may
// create organizations. It inserts the organization and an active
// RoleOrgOwner membership for the actor in one transaction and responds 201
// with both records.
//
// Responses: 400 for an undecodable body, missing/blank required fields or
// invalid metadata JSON; 403 when the actor is not a platform admin; 500 for
// role-lookup or database failures.
//
// The slug is normalized (trimmed, lower-cased) BEFORE validation, and the
// name is checked after trimming, so whitespace-only values are rejected.
// Previously a slug such as "  " passed the emptiness check and was then
// stored as the empty string. The name itself is stored as submitted.
//
// The returned membership carries invited_by_user_id, matching the value
// written to the row.
func (m *Module) createOrganization(w http.ResponseWriter, r *http.Request) {
	var req createOrganizationRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid organization payload")
		return
	}
	slug := normalizeSlug(req.Slug)
	if req.ActorUserID == "" || strings.TrimSpace(req.Name) == "" || slug == "" {
		httpx.WriteError(w, http.StatusBadRequest, "actor_user_id, slug, and name are required")
		return
	}
	role, err := m.getPlatformRole(r.Context(), req.ActorUserID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	if !isPlatformAdmin(role) {
		httpx.WriteError(w, http.StatusForbidden, "platform admin role is required")
		return
	}
	if len(req.Metadata) == 0 {
		req.Metadata = json.RawMessage(`{}`)
	}
	if !json.Valid(req.Metadata) {
		httpx.WriteError(w, http.StatusBadRequest, "metadata must be valid json")
		return
	}

	now := time.Now().UTC()
	org := Organization{
		ID:        uuid.NewString(),
		Slug:      slug,
		Name:      req.Name,
		Status:    "active",
		Metadata:  req.Metadata,
		CreatedAt: now,
		UpdatedAt: now,
	}
	membership := Membership{
		ID:             uuid.NewString(),
		OrganizationID: org.ID,
		UserID:         req.ActorUserID,
		RoleID:         RoleOrgOwner,
		Status:         "active",
		InvitedByUser:  &req.ActorUserID,
		CreatedAt:      now,
		UpdatedAt:      now,
	}

	tx, err := m.db.Begin(r.Context())
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	// Rollback after a successful Commit is a no-op.
	defer tx.Rollback(r.Context())

	if _, err := tx.Exec(r.Context(), `
INSERT INTO organizations (id, slug, name, status, metadata, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5::jsonb, $6, $7)
`, org.ID, org.Slug, org.Name, org.Status, []byte(org.Metadata), org.CreatedAt, org.UpdatedAt); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	if _, err := tx.Exec(r.Context(), `
INSERT INTO organization_memberships (
id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
`, membership.ID, membership.OrganizationID, membership.UserID, membership.RoleID, membership.Status, membership.InvitedByUser, membership.CreatedAt, membership.UpdatedAt); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	if err := tx.Commit(r.Context()); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusCreated, map[string]any{
		"organization": org,
		"membership":   membership,
	})
}
// listMemberships handles
// GET /organizations/{organizationID}/memberships?user_id=... .
// The caller must pass ensureOrgAccess with the admin requirement. All
// memberships of the organization are returned, newest first.
//
// Responses: 400 when user_id is missing; 403 when access is denied; 500 when
// the access check, query, scan or iteration fails; 200 otherwise.
//
// Two defects of the previous version are addressed: every access-check error
// (including database failures) was reported as 403, and rows.Err was never
// consulted, so a failure part-way through the result set returned a
// truncated list with status 200.
func (m *Module) listMemberships(w http.ResponseWriter, r *http.Request) {
	orgID := chi.URLParam(r, "organizationID")
	userID := r.URL.Query().Get("user_id")
	if userID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
		return
	}
	if err := m.ensureOrgAccess(r.Context(), orgID, userID, true); err != nil {
		status := http.StatusInternalServerError
		if errors.Is(err, pgx.ErrNoRows) || errors.Is(err, errForbidden) {
			status = http.StatusForbidden
		}
		httpx.WriteError(w, status, err.Error())
		return
	}
	rows, err := m.db.Query(r.Context(), `
SELECT id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at
FROM organization_memberships
WHERE organization_id = $1
ORDER BY created_at DESC
`, orgID)
	if err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	defer rows.Close()

	var memberships []Membership
	for rows.Next() {
		membership, err := scanMembership(rows)
		if err != nil {
			httpx.WriteError(w, http.StatusInternalServerError, err.Error())
			return
		}
		memberships = append(memberships, membership)
	}
	// rows.Next returning false can also mean the stream failed.
	if err := rows.Err(); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"memberships": memberships})
}
// addMembership handles POST /organizations/{organizationID}/memberships.
// The actor must pass ensureOrgAccess with the admin requirement. The
// membership is upserted on (organization_id, user_id): an existing row gets
// its role, status ('active'), inviter and updated_at replaced.
//
// Responses: 400 for an undecodable body or missing fields; 403 when access
// is denied; 500 when the access check or the upsert fails; 201 with the
// stored membership otherwise.
//
// The statement uses RETURNING id, created_at so the response reflects the
// row that actually exists. Previously, when the upsert hit the conflict
// branch, the response contained a freshly generated id and created_at that
// were never written to the database. Access-check errors are mapped like in
// getOrganization instead of always being reported as 403.
//
// NOTE(review): role_id is not validated against the Role* constants here —
// confirm whether the database constrains it.
func (m *Module) addMembership(w http.ResponseWriter, r *http.Request) {
	orgID := chi.URLParam(r, "organizationID")
	var req addMembershipRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid membership payload")
		return
	}
	if req.ActorUserID == "" || req.UserID == "" || req.RoleID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "actor_user_id, user_id, and role_id are required")
		return
	}
	if err := m.ensureOrgAccess(r.Context(), orgID, req.ActorUserID, true); err != nil {
		status := http.StatusInternalServerError
		if errors.Is(err, pgx.ErrNoRows) || errors.Is(err, errForbidden) {
			status = http.StatusForbidden
		}
		httpx.WriteError(w, status, err.Error())
		return
	}

	now := time.Now().UTC()
	membership := Membership{
		ID:             uuid.NewString(),
		OrganizationID: orgID,
		UserID:         req.UserID,
		RoleID:         req.RoleID,
		Status:         "active",
		InvitedByUser:  &req.ActorUserID,
		CreatedAt:      now,
		UpdatedAt:      now,
	}
	// Scan back the persisted id/created_at: on conflict they belong to the
	// pre-existing row, not to the values generated above.
	if err := m.db.QueryRow(r.Context(), `
INSERT INTO organization_memberships (
id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
ON CONFLICT (organization_id, user_id) DO UPDATE SET
role_id = EXCLUDED.role_id,
status = 'active',
invited_by_user_id = EXCLUDED.invited_by_user_id,
updated_at = EXCLUDED.updated_at
RETURNING id, created_at
`, membership.ID, membership.OrganizationID, membership.UserID, membership.RoleID, membership.Status, membership.InvitedByUser, membership.CreatedAt, membership.UpdatedAt).Scan(&membership.ID, &membership.CreatedAt); err != nil {
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusCreated, map[string]any{"membership": membership})
}
// errForbidden is returned by ensureOrgAccess when the user has no active
// membership in the organization or lacks the required admin role.
var errForbidden = errors.New("forbidden")
// ensureOrgAccess decides whether userID may act on organization orgID.
// Platform admins are always allowed. Any other user needs an active
// membership in the organization; when adminRequired is true that membership
// must have role RoleOrgOwner or RoleOrgAdmin.
//
// It returns nil when access is granted, errForbidden when it is denied, and
// the underlying error when the role lookup or membership query fails.
func (m *Module) ensureOrgAccess(ctx context.Context, orgID, userID string, adminRequired bool) error {
	platformRole, err := m.getPlatformRole(ctx, userID)
	if err != nil {
		return err
	}
	if isPlatformAdmin(platformRole) {
		return nil
	}

	const selectMembershipRole = `
SELECT role_id
FROM organization_memberships
WHERE organization_id = $1 AND user_id = $2 AND status = 'active'
`
	var memberRole string
	scanErr := m.db.QueryRow(ctx, selectMembershipRole, orgID, userID).Scan(&memberRole)
	switch {
	case scanErr == nil:
		// Active membership found; fall through to the role check.
	case errors.Is(scanErr, pgx.ErrNoRows):
		return errForbidden
	default:
		return scanErr
	}

	if !adminRequired {
		return nil
	}
	switch memberRole {
	case RoleOrgOwner, RoleOrgAdmin:
		return nil
	}
	return errForbidden
}
// getPlatformRole resolves the effective platform role of userID by
// delegating to authority.EffectivePlatformRole with the module's database
// pool and authority verifier.
func (m *Module) getPlatformRole(ctx context.Context, userID string) (string, error) {
	role, err := authority.EffectivePlatformRole(ctx, m.db, m.authority, userID)
	return role, err
}
// isPlatformAdmin reports whether role is one of the platform-level admin
// roles: "platform_admin" or "platform_recovery_admin". The comparison is
// exact and case-sensitive.
func isPlatformAdmin(role string) bool {
	switch role {
	case "platform_admin", "platform_recovery_admin":
		return true
	default:
		return false
	}
}
// getOrganizationByID selects the organizations row whose id equals orgID and
// decodes it with scanOrganization. Scan errors (including the driver's
// "no rows" error) are returned unchanged.
func (m *Module) getOrganizationByID(ctx context.Context, orgID string) (Organization, error) {
	const selectOrganization = `
SELECT id, slug, name, status, metadata, created_at, updated_at
FROM organizations
WHERE id = $1
`
	return scanOrganization(m.db.QueryRow(ctx, selectOrganization, orgID))
}
// rowScanner is the single Scan method that scanOrganization and
// scanMembership need; both the result of QueryRow and the rows iterator
// returned by Query are passed to them in this file.
type rowScanner interface {
Scan(dest ...any) error
}
// scanOrganization decodes one organizations row (id, slug, name, status,
// metadata, created_at, updated_at) into an Organization. On a scan error it
// returns the zero Organization and the error. Empty metadata is replaced
// with an empty JSON object.
func scanOrganization(row rowScanner) (Organization, error) {
	var result Organization
	// Destinations are listed in the same order as the selected columns.
	targets := []any{
		&result.ID,
		&result.Slug,
		&result.Name,
		&result.Status,
		&result.Metadata,
		&result.CreatedAt,
		&result.UpdatedAt,
	}
	if scanErr := row.Scan(targets...); scanErr != nil {
		return Organization{}, scanErr
	}
	if len(result.Metadata) == 0 {
		result.Metadata = json.RawMessage(`{}`)
	}
	return result, nil
}
// scanMembership decodes one organization_memberships row (id,
// organization_id, user_id, role_id, status, invited_by_user_id, created_at,
// updated_at) into a Membership. On a scan error it returns the zero
// Membership and the error.
func scanMembership(row rowScanner) (Membership, error) {
	var result Membership
	// Destinations are listed in the same order as the selected columns.
	targets := []any{
		&result.ID,
		&result.OrganizationID,
		&result.UserID,
		&result.RoleID,
		&result.Status,
		&result.InvitedByUser,
		&result.CreatedAt,
		&result.UpdatedAt,
	}
	if scanErr := row.Scan(targets...); scanErr != nil {
		return Membership{}, scanErr
	}
	return result, nil
}
// normalizeSlug returns in with leading and trailing whitespace removed and
// all letters lower-cased.
func normalizeSlug(in string) string {
	trimmed := strings.TrimSpace(in)
	return strings.ToLower(trimmed)
}
+639
View File
@@ -0,0 +1,639 @@
package resource
import (
"context"
"encoding/json"
"errors"
"net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/example/remote-access-platform/backend/internal/platform/authority"
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
"github.com/example/remote-access-platform/backend/internal/platform/module"
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
)
// String values accepted for the per-resource option fields of Resource.
const (
// Values for Resource.CertificateVerificationMode.
CertificateVerificationModeStrict = "strict"
CertificateVerificationModeIgnore = "ignore"
// Values for Resource.RenderQualityProfile.
RenderQualityProfileLowBandwidth = "low_bandwidth"
RenderQualityProfileBalanced = "balanced"
RenderQualityProfileHighQuality = "high_quality"
RenderQualityProfileTextPriority = "text_priority"
// Values for Resource.ClipboardMode.
ClipboardModeDisabled = "disabled"
ClipboardModeClientToServer = "client_to_server"
ClipboardModeServerToClient = "server_to_client"
ClipboardModeBidirectional = "bidirectional"
// Values for Resource.FileTransferMode.
FileTransferModeDisabled = "disabled"
FileTransferModeClientToServer = "client_to_server"
FileTransferModeServerToClient = "server_to_client"
FileTransferModeBidirectional = "bidirectional"
)
// Module implements the resource HTTP module. It carries the dependencies the
// handlers use: the database pool, the application environment name, the
// resource secret store and the authority verifier.
type Module struct {
db *pgxpool.Pool // pool used for queries and transactions
appEnv string // passed to secrets.ValidateResourceSecretReadiness
secretStore *secrets.ResourceSecretStore // may be nil; the secret endpoint then answers 503
authority *authority.Verifier // passed to authority.EffectivePlatformRole
}
// Resource is the JSON representation of a resource returned by the handlers
// in this module. ClipboardMode and FileTransferMode are read from the
// resource_policies table; RenderQualityProfile is derived from Metadata when
// a row is scanned.
type Resource struct {
ID string `json:"id"`
OrganizationID string `json:"organization_id"`
Name string `json:"name"`
Address string `json:"address"`
Protocol string `json:"protocol"`
SecretRef *string `json:"secret_ref,omitempty"` // nil is omitted from the JSON output
CertificateVerificationMode string `json:"certificate_verification_mode"`
RenderQualityProfile string `json:"render_quality_profile"`
ClipboardMode string `json:"clipboard_mode"`
FileTransferMode string `json:"file_transfer_mode"`
Metadata json.RawMessage `json:"metadata"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// upsertResourceRequest is the JSON body accepted by the create and update
// resource endpoints. It is decoded and normalized by decodeUpsertRequest.
type upsertResourceRequest struct {
ActorUserID string `json:"actor_user_id"` // user performing the change; required
OrganizationID string `json:"organization_id"` // required
Name string `json:"name"` // required
Address string `json:"address"` // required
Protocol string `json:"protocol"` // defaults to "rdp" when empty
SecretRef *string `json:"secret_ref"`
CertificateVerificationMode string `json:"certificate_verification_mode"`
RenderQualityProfile string `json:"render_quality_profile"`
ClipboardMode string `json:"clipboard_mode"`
FileTransferMode string `json:"file_transfer_mode"`
Metadata json.RawMessage `json:"metadata"`
}
// upsertResourceSecretRequest is the JSON body accepted by the resource secret
// endpoint. Payload and Metadata are forwarded unchanged to the secret store.
type upsertResourceSecretRequest struct {
ActorUserID string `json:"actor_user_id"` // user performing the change; required
Payload json.RawMessage `json:"payload"`
Metadata json.RawMessage `json:"metadata"`
}
// NewModule builds the resource module from deps. Only the first element of
// secretStores is used; when none is given the module has no secret store.
func NewModule(deps module.Dependencies, secretStores ...*secrets.ResourceSecretStore) *Module {
	mod := &Module{
		db:     deps.Infra.DB,
		appEnv: deps.Config.App.Env,
	}
	if len(secretStores) > 0 {
		mod.secretStore = secretStores[0]
	}
	// The error from NewVerifier is discarded here, so the stored verifier is
	// whatever NewVerifier returned alongside it.
	// NOTE(review): confirm that a failed verifier construction is meant to be
	// tolerated silently.
	mod.authority, _ = authority.NewVerifier(deps.Config.Installation)
	return mod
}
// Name returns the identifier of this module, "resource".
func (m *Module) Name() string {
	const moduleName = "resource"
	return moduleName
}
// RegisterRoutes mounts the resource endpoints under "/resources" on router:
// list and create on the collection, get and update on a single resource, and
// a secret upsert below a single resource.
func (m *Module) RegisterRoutes(router chi.Router) {
	mount := func(sub chi.Router) {
		sub.Get("/", m.listResources)
		sub.Post("/", m.createResource)
		sub.Get("/{resourceID}", m.getResource)
		sub.Put("/{resourceID}", m.updateResource)
		sub.Put("/{resourceID}/secret", m.upsertResourceSecret)
	}
	router.Route("/resources", mount)
}
// listResources handles GET /resources. The "user_id" query parameter is
// required and "organization_id" is an optional filter. Callers whose platform
// role is platform_admin or platform_recovery_admin get every resource; all
// other callers only get resources in organizations where they hold an active
// membership. The response is {"resources": [...]} ordered by created_at
// descending.
func (m *Module) listResources(w http.ResponseWriter, r *http.Request) {
userID := r.URL.Query().Get("user_id")
orgID := r.URL.Query().Get("organization_id")
if userID == "" {
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
return
}
platformRole, err := m.getPlatformRole(r.Context(), userID)
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// Base query shared by both branches; the clauses appended below must keep
// their placeholder numbers in step with the order of args.
query := `
SELECT r.id, r.organization_id, r.name, r.address, r.protocol, r.secret_ref,
r.certificate_verification_mode, r.metadata, r.created_at, r.updated_at,
COALESCE(rp.clipboard_mode, 'disabled') AS clipboard_mode,
COALESCE(rp.file_transfer_mode, 'disabled') AS file_transfer_mode
FROM resources r
LEFT JOIN resource_policies rp ON rp.resource_id = r.id
`
args := make([]any, 0, 2)
if platformRole == "platform_admin" || platformRole == "platform_recovery_admin" {
// Platform-level roles: no membership join, only the optional org filter.
if orgID != "" {
query += ` WHERE r.organization_id = $1`
args = append(args, orgID)
}
query += ` ORDER BY r.created_at DESC`
} else {
// Everyone else: restrict to organizations with an active membership.
query += `
INNER JOIN organization_memberships om ON om.organization_id = r.organization_id
WHERE om.user_id = $1 AND om.status = 'active'
`
args = append(args, userID)
if orgID != "" {
query += ` AND r.organization_id = $2`
args = append(args, orgID)
}
query += ` ORDER BY r.created_at DESC`
}
rows, err := m.db.Query(r.Context(), query, args...)
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
defer rows.Close()
// Start from an empty (non-nil) slice so an empty result encodes as [].
resources := make([]Resource, 0)
for rows.Next() {
resource, err := scanResource(rows)
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
resources = append(resources, resource)
}
// Report an error that ended the iteration early.
if err := rows.Err(); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
httpx.WriteJSON(w, http.StatusOK, map[string]any{"resources": resources})
}
// getResource handles GET /resources/{resourceID}. The "user_id" query
// parameter is required. The resource is loaded first (404 when missing, 500
// on other load errors) and then the caller's access to the resource's
// organization is checked without requiring an admin role (403 on failure).
// On success the response is {"resource": ...}.
func (m *Module) getResource(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	userID := r.URL.Query().Get("user_id")
	if userID == "" {
		httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
		return
	}

	found, err := m.getByID(ctx, chi.URLParam(r, "resourceID"))
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		httpx.WriteError(w, http.StatusNotFound, "resource not found")
		return
	case err != nil:
		httpx.WriteError(w, http.StatusInternalServerError, err.Error())
		return
	}

	if accessErr := m.ensureResourceAccess(ctx, found.OrganizationID, userID, false); accessErr != nil {
		httpx.WriteError(w, http.StatusForbidden, accessErr.Error())
		return
	}
	httpx.WriteJSON(w, http.StatusOK, map[string]any{"resource": found})
}
// createResource handles POST /resources. The body is decoded and normalized
// by decodeUpsertRequest and checked with
// secrets.ValidateResourceSecretReadiness (400 on failure). The actor must
// pass ensureResourceAccess with adminRequired=true for the target
// organization (403 on failure). The resources row and its resource_policies
// row are written in one transaction, and the created resource is returned
// with status 201.
func (m *Module) createResource(w http.ResponseWriter, r *http.Request) {
req, err := decodeUpsertRequest(r)
if err != nil {
httpx.WriteError(w, http.StatusBadRequest, err.Error())
return
}
if err := secrets.ValidateResourceSecretReadiness(req.Protocol, req.SecretRef, req.Metadata, m.appEnv); err != nil {
httpx.WriteError(w, http.StatusBadRequest, err.Error())
return
}
// The same timestamp is used for created_at/updated_at of both rows.
now := time.Now().UTC()
resource := Resource{
ID: uuid.NewString(),
OrganizationID: req.OrganizationID,
Name: req.Name,
Address: req.Address,
Protocol: req.Protocol,
SecretRef: req.SecretRef,
CertificateVerificationMode: req.CertificateVerificationMode,
RenderQualityProfile: req.RenderQualityProfile,
ClipboardMode: req.ClipboardMode,
FileTransferMode: req.FileTransferMode,
Metadata: req.Metadata,
CreatedAt: now,
UpdatedAt: now,
}
if err := m.ensureResourceAccess(r.Context(), req.OrganizationID, req.ActorUserID, true); err != nil {
httpx.WriteError(w, http.StatusForbidden, err.Error())
return
}
tx, err := m.db.Begin(r.Context())
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// Rolls back on every early return below; the result is ignored.
defer tx.Rollback(r.Context())
// The render quality profile has no column in this INSERT; it is carried
// inside the metadata document produced by decodeUpsertRequest.
if _, err := tx.Exec(r.Context(), `
INSERT INTO resources (
id, organization_id, name, address, protocol, secret_ref, certificate_verification_mode, metadata, created_at, updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9, $10)
`, resource.ID, resource.OrganizationID, resource.Name, resource.Address, resource.Protocol, resource.SecretRef, resource.CertificateVerificationMode, []byte(resource.Metadata), resource.CreatedAt, resource.UpdatedAt); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
if err := upsertResourcePolicy(r.Context(), tx, resource.ID, resource.ClipboardMode, resource.FileTransferMode, now); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
if err := tx.Commit(r.Context()); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// The response echoes the in-memory value; it is not re-read from the database.
httpx.WriteJSON(w, http.StatusCreated, map[string]any{"resource": resource})
}
// updateResource handles PUT /resources/{resourceID}. The body is decoded and
// validated like in createResource. The existing resource is loaded (404 when
// missing) and the actor must pass ensureResourceAccess with
// adminRequired=true for the resource's current organization and, when the
// request names a different organization, for that organization as well (403
// on failure). The resources row and its resource_policies row are updated in
// one transaction; the resource is then re-read and returned with status 200.
func (m *Module) updateResource(w http.ResponseWriter, r *http.Request) {
req, err := decodeUpsertRequest(r)
if err != nil {
httpx.WriteError(w, http.StatusBadRequest, err.Error())
return
}
if err := secrets.ValidateResourceSecretReadiness(req.Protocol, req.SecretRef, req.Metadata, m.appEnv); err != nil {
httpx.WriteError(w, http.StatusBadRequest, err.Error())
return
}
resourceID := chi.URLParam(r, "resourceID")
existing, err := m.getByID(r.Context(), resourceID)
if err != nil {
if errors.Is(err, pgx.ErrNoRows) {
httpx.WriteError(w, http.StatusNotFound, "resource not found")
return
}
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// Access check against the organization that currently owns the resource.
if err := m.ensureResourceAccess(r.Context(), existing.OrganizationID, req.ActorUserID, true); err != nil {
httpx.WriteError(w, http.StatusForbidden, err.Error())
return
}
// Moving the resource to another organization needs access there too.
if req.OrganizationID != existing.OrganizationID {
if err := m.ensureResourceAccess(r.Context(), req.OrganizationID, req.ActorUserID, true); err != nil {
httpx.WriteError(w, http.StatusForbidden, err.Error())
return
}
}
now := time.Now().UTC()
tx, err := m.db.Begin(r.Context())
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// Rolls back on every early return below; the result is ignored.
defer tx.Rollback(r.Context())
tag, err := tx.Exec(r.Context(), `
UPDATE resources
SET
organization_id = $2,
name = $3,
address = $4,
protocol = $5,
secret_ref = $6,
certificate_verification_mode = $7,
metadata = $8::jsonb,
updated_at = $9
WHERE id = $1
`, resourceID, req.OrganizationID, req.Name, req.Address, req.Protocol, req.SecretRef, req.CertificateVerificationMode, []byte(req.Metadata), now)
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// No row matched the id, although the resource was found by the earlier read.
if tag.RowsAffected() == 0 {
httpx.WriteError(w, http.StatusNotFound, "resource not found")
return
}
if err := upsertResourcePolicy(r.Context(), tx, resourceID, req.ClipboardMode, req.FileTransferMode, now); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
if err := tx.Commit(r.Context()); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// Re-read after commit so the response reflects the stored row and policy.
resource, err := m.getByID(r.Context(), resourceID)
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
httpx.WriteJSON(w, http.StatusOK, map[string]any{"resource": resource})
}
// upsertResourceSecret handles PUT /resources/{resourceID}/secret. It answers
// 503 when the module has no secret store. The body must carry actor_user_id,
// and the actor must pass ensureResourceAccess with adminRequired=true for the
// resource's organization. In one transaction it upserts the secret through
// the secret store, points resources.secret_ref at the resulting reference and
// writes a "resource_secret_rotated" audit event. The response is
// {"secret": descriptor} with status 200.
func (m *Module) upsertResourceSecret(w http.ResponseWriter, r *http.Request) {
if m.secretStore == nil {
httpx.WriteError(w, http.StatusServiceUnavailable, "resource secret encryption is not configured")
return
}
var req upsertResourceSecretRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
httpx.WriteError(w, http.StatusBadRequest, "invalid resource secret payload")
return
}
if req.ActorUserID == "" {
httpx.WriteError(w, http.StatusBadRequest, "actor_user_id is required")
return
}
resourceID := chi.URLParam(r, "resourceID")
resource, err := m.getByID(r.Context(), resourceID)
if err != nil {
if errors.Is(err, pgx.ErrNoRows) {
httpx.WriteError(w, http.StatusNotFound, "resource not found")
return
}
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
if err := m.ensureResourceAccess(r.Context(), resource.OrganizationID, req.ActorUserID, true); err != nil {
httpx.WriteError(w, http.StatusForbidden, err.Error())
return
}
tx, err := m.db.Begin(r.Context())
if err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
// Rolls back on every early return below; the result is ignored.
defer tx.Rollback(r.Context())
// Bind the store to the transaction so the secret write commits or rolls
// back together with the statements below.
secretStore := m.secretStore.WithDB(tx)
secretRef := secrets.DefaultResourceSecretRef(resource.OrganizationID, resource.ID)
descriptor, err := secretStore.Upsert(r.Context(), secrets.UpsertResourceSecretCommand{
OrganizationID: resource.OrganizationID,
ResourceID: resource.ID,
Protocol: resource.Protocol,
SecretRef: secretRef,
Payload: req.Payload,
Metadata: req.Metadata,
ActorUserID: req.ActorUserID,
})
// Every Upsert failure is reported as 400.
// NOTE(review): confirm Upsert cannot fail for server-side reasons that
// would deserve a 5xx instead.
if err != nil {
httpx.WriteError(w, http.StatusBadRequest, err.Error())
return
}
if _, err := tx.Exec(r.Context(), `
UPDATE resources
SET secret_ref = $2, updated_at = $3
WHERE id = $1::uuid
`, resource.ID, descriptor.SecretRef, time.Now().UTC()); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
if err := writeAuditEvent(r.Context(), tx, "resource_secret_rotated", req.ActorUserID, "resource_secret", descriptor.SecretRef, map[string]any{
"resource_id": resource.ID,
"organization_id": resource.OrganizationID,
"protocol": resource.Protocol,
"version": descriptor.Version,
"secret_ref": descriptor.SecretRef,
}); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
if err := tx.Commit(r.Context()); err != nil {
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
return
}
httpx.WriteJSON(w, http.StatusOK, map[string]any{"secret": descriptor})
}
// decodeUpsertRequest decodes the JSON body of r into an upsertResourceRequest
// and normalizes it. Name, actor_user_id, organization_id and address are
// required (checked in that order); an empty protocol becomes "rdp"; the
// certificate verification mode, render quality profile, clipboard mode and
// file transfer mode are normalized; and the metadata document is rewritten by
// normalizeMetadata. The first failure is returned as an error and the request
// is nil.
func decodeUpsertRequest(r *http.Request) (*upsertResourceRequest, error) {
	req := new(upsertResourceRequest)
	if decodeErr := json.NewDecoder(r.Body).Decode(req); decodeErr != nil {
		return nil, errors.New("invalid resource payload")
	}

	// Required fields, checked in declaration order.
	required := []struct {
		value   string
		message string
	}{
		{req.Name, "name is required"},
		{req.ActorUserID, "actor_user_id is required"},
		{req.OrganizationID, "organization_id is required"},
		{req.Address, "address is required"},
	}
	for _, field := range required {
		if field.value == "" {
			return nil, errors.New(field.message)
		}
	}

	if req.Protocol == "" {
		req.Protocol = "rdp"
	}

	var err error
	if req.CertificateVerificationMode, err = normalizeCertificateVerificationMode(req.CertificateVerificationMode); err != nil {
		return nil, err
	}
	if req.RenderQualityProfile, err = normalizeRenderQualityProfile(req.RenderQualityProfile); err != nil {
		return nil, err
	}
	if req.ClipboardMode, err = normalizeClipboardMode(req.ClipboardMode); err != nil {
		return nil, err
	}
	if req.FileTransferMode, err = normalizeFileTransferMode(req.FileTransferMode); err != nil {
		return nil, err
	}
	// Metadata is normalized last because it embeds the normalized mode and profile.
	if req.Metadata, err = normalizeMetadata(req.Metadata, req.CertificateVerificationMode, req.RenderQualityProfile); err != nil {
		return nil, err
	}
	return req, nil
}
// normalizeCertificateVerificationMode validates mode. The empty string
// defaults to strict; "strict" and "ignore" are returned as-is; any other
// value yields an error.
func normalizeCertificateVerificationMode(mode string) (string, error) {
	if mode == "" || mode == CertificateVerificationModeStrict {
		return CertificateVerificationModeStrict, nil
	}
	if mode == CertificateVerificationModeIgnore {
		return CertificateVerificationModeIgnore, nil
	}
	return "", errors.New("certificate_verification_mode must be one of: strict, ignore")
}
// normalizeClipboardMode validates mode. The empty string defaults to
// disabled; the four known clipboard modes are returned unchanged; any other
// value yields an error.
func normalizeClipboardMode(mode string) (string, error) {
	if mode == "" {
		return ClipboardModeDisabled, nil
	}
	known := mode == ClipboardModeDisabled ||
		mode == ClipboardModeClientToServer ||
		mode == ClipboardModeServerToClient ||
		mode == ClipboardModeBidirectional
	if !known {
		return "", errors.New("clipboard_mode must be one of: disabled, client_to_server, server_to_client, bidirectional")
	}
	return mode, nil
}
// normalizeFileTransferMode validates mode. The empty string defaults to
// disabled; the four known file transfer modes are returned unchanged; any
// other value yields an error.
func normalizeFileTransferMode(mode string) (string, error) {
	if mode == "" {
		return FileTransferModeDisabled, nil
	}
	known := mode == FileTransferModeDisabled ||
		mode == FileTransferModeClientToServer ||
		mode == FileTransferModeServerToClient ||
		mode == FileTransferModeBidirectional
	if !known {
		return "", errors.New("file_transfer_mode must be one of: disabled, client_to_server, server_to_client, bidirectional")
	}
	return mode, nil
}
// normalizeMetadata validates the raw metadata document and returns a copy
// with "certificate_verification_mode" and "render_quality_profile" set to the
// given values (overwriting any existing entries).
//
// An empty document and a JSON null are both treated as an empty object.
// Input that is not valid JSON, or is valid JSON but not an object, yields an
// error and a nil result.
func normalizeMetadata(raw json.RawMessage, certificateVerificationMode, renderQualityProfile string) (json.RawMessage, error) {
	if len(raw) == 0 {
		raw = json.RawMessage(`{}`)
	}
	if !json.Valid(raw) {
		return nil, errors.New("metadata must be valid json")
	}
	var metadata map[string]any
	if err := json.Unmarshal(raw, &metadata); err != nil {
		return nil, errors.New("metadata must be a json object")
	}
	// A JSON null unmarshals without error and leaves the map nil; writing to
	// a nil map panics, so start from an empty object in that case.
	if metadata == nil {
		metadata = make(map[string]any, 2)
	}
	metadata["certificate_verification_mode"] = certificateVerificationMode
	metadata["render_quality_profile"] = renderQualityProfile
	encoded, err := json.Marshal(metadata)
	if err != nil {
		return nil, err
	}
	return json.RawMessage(encoded), nil
}
// getByID loads a single resource by id, joined with its resource_policies
// row (clipboard and file transfer modes fall back to 'disabled' when no
// policy row exists). The scan error is returned unchanged, so callers can
// test it against pgx.ErrNoRows.
func (m *Module) getByID(ctx context.Context, resourceID string) (Resource, error) {
	const selectByID = `
SELECT r.id, r.organization_id, r.name, r.address, r.protocol, r.secret_ref,
r.certificate_verification_mode, r.metadata, r.created_at, r.updated_at,
COALESCE(rp.clipboard_mode, 'disabled') AS clipboard_mode,
COALESCE(rp.file_transfer_mode, 'disabled') AS file_transfer_mode
FROM resources r
LEFT JOIN resource_policies rp ON rp.resource_id = r.id
WHERE r.id = $1
`
	return scanResource(m.db.QueryRow(ctx, selectByID, resourceID))
}
// ensureResourceAccess reports whether userID may act on resources of orgID.
// Users whose platform role is platform_admin or platform_recovery_admin are
// always allowed. Everyone else needs an active membership in the
// organization and, when adminRequired is true, the membership role org_owner
// or org_admin. A denied request yields the error "forbidden"; lookup failures
// are returned as-is.
func (m *Module) ensureResourceAccess(ctx context.Context, orgID, userID string, adminRequired bool) error {
	platformRole, err := m.getPlatformRole(ctx, userID)
	if err != nil {
		return err
	}
	switch platformRole {
	case "platform_admin", "platform_recovery_admin":
		return nil
	}

	var membershipRole string
	lookupErr := m.db.QueryRow(ctx, `
SELECT role_id
FROM organization_memberships
WHERE organization_id = $1 AND user_id = $2 AND status = 'active'
`, orgID, userID).Scan(&membershipRole)
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		// No active membership in this organization.
		return errors.New("forbidden")
	case lookupErr != nil:
		return lookupErr
	}

	if !adminRequired {
		return nil
	}
	if membershipRole == "org_owner" || membershipRole == "org_admin" {
		return nil
	}
	return errors.New("forbidden")
}
// getPlatformRole returns the result of authority.EffectivePlatformRole for
// userID, using the module's database pool and authority verifier.
func (m *Module) getPlatformRole(ctx context.Context, userID string) (string, error) {
	role, err := authority.EffectivePlatformRole(ctx, m.db, m.authority, userID)
	return role, err
}
// rowScanner is the single-method scanning surface accepted by scanResource.
// In this file it is satisfied both by the result of QueryRow and by the rows
// iterator returned from Query.
type rowScanner interface {
// Scan copies the current row's column values into dest.
Scan(dest ...any) error
}
// scanResource reads one Resource from row. The scan order is id,
// organization id, name, address, protocol, secret ref, certificate
// verification mode, metadata, created/updated timestamps, clipboard mode and
// file transfer mode. Empty values then receive defaults: metadata becomes
// "{}", the certificate mode becomes strict, and the clipboard and file
// transfer modes become disabled. The render quality profile is not scanned;
// it is derived from the metadata document.
func scanResource(row rowScanner) (Resource, error) {
	var out Resource
	columns := []any{
		&out.ID,
		&out.OrganizationID,
		&out.Name,
		&out.Address,
		&out.Protocol,
		&out.SecretRef,
		&out.CertificateVerificationMode,
		&out.Metadata,
		&out.CreatedAt,
		&out.UpdatedAt,
		&out.ClipboardMode,
		&out.FileTransferMode,
	}
	if err := row.Scan(columns...); err != nil {
		return Resource{}, err
	}

	if len(out.Metadata) == 0 {
		out.Metadata = json.RawMessage(`{}`)
	}
	if out.CertificateVerificationMode == "" {
		out.CertificateVerificationMode = CertificateVerificationModeStrict
	}
	if out.RenderQualityProfile == "" {
		out.RenderQualityProfile = renderQualityProfileFromMetadata(out.Metadata)
	}
	if out.ClipboardMode == "" {
		out.ClipboardMode = ClipboardModeDisabled
	}
	if out.FileTransferMode == "" {
		out.FileTransferMode = FileTransferModeDisabled
	}
	return out, nil
}
// upsertResourcePolicy inserts the resource_policies row for resourceID inside
// tx, or updates it when one already exists. now is used for both created_at
// and updated_at on insert and for updated_at on update.
//
// clipboard_enabled is true for every clipboard mode other than disabled.
// file_transfer_enabled is true only for client_to_server and bidirectional.
// NOTE(review): server_to_client therefore stores file_transfer_enabled =
// false; confirm this asymmetry with the clipboard flag is intended.
func upsertResourcePolicy(ctx context.Context, tx pgx.Tx, resourceID, clipboardMode, fileTransferMode string, now time.Time) error {
	clipboardOn := clipboardMode != ClipboardModeDisabled

	var fileTransferOn bool
	switch fileTransferMode {
	case FileTransferModeClientToServer, FileTransferModeBidirectional:
		fileTransferOn = true
	}

	const upsertPolicy = `
INSERT INTO resource_policies (
resource_id, clipboard_enabled, clipboard_mode, file_transfer_enabled, file_transfer_mode, created_at, updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $6)
ON CONFLICT (resource_id) DO UPDATE SET
clipboard_enabled = EXCLUDED.clipboard_enabled,
clipboard_mode = EXCLUDED.clipboard_mode,
file_transfer_enabled = EXCLUDED.file_transfer_enabled,
file_transfer_mode = EXCLUDED.file_transfer_mode,
updated_at = EXCLUDED.updated_at
`
	_, execErr := tx.Exec(ctx, upsertPolicy, resourceID, clipboardOn, clipboardMode, fileTransferOn, fileTransferMode, now)
	return execErr
}
// writeAuditEvent inserts one audit_events row inside tx. The row receives a
// freshly generated id, the current UTC time, and payload encoded as JSON. An
// empty actorUserID is stored as NULL (via NULLIF in the statement).
func writeAuditEvent(ctx context.Context, tx pgx.Tx, eventType, actorUserID, targetType, targetID string, payload map[string]any) error {
	body, marshalErr := json.Marshal(payload)
	if marshalErr != nil {
		return marshalErr
	}

	eventID := uuid.NewString()
	createdAt := time.Now().UTC()
	const insertEvent = `
INSERT INTO audit_events (
id, actor_user_id, event_type, target_type, target_id, payload, created_at
) VALUES (
$1::uuid, NULLIF($2, '')::uuid, $3, $4, $5, $6::jsonb, $7
)
`
	_, execErr := tx.Exec(ctx, insertEvent, eventID, actorUserID, eventType, targetType, targetID, body, createdAt)
	return execErr
}
// normalizeRenderQualityProfile validates profile. The empty string defaults
// to balanced; the four known profiles are returned unchanged; any other value
// yields an error.
func normalizeRenderQualityProfile(profile string) (string, error) {
	if profile == "" {
		return RenderQualityProfileBalanced, nil
	}
	known := profile == RenderQualityProfileBalanced ||
		profile == RenderQualityProfileLowBandwidth ||
		profile == RenderQualityProfileHighQuality ||
		profile == RenderQualityProfileTextPriority
	if !known {
		return "", errors.New("render_quality_profile must be one of: low_bandwidth, balanced, high_quality, text_priority")
	}
	return profile, nil
}
// renderQualityProfileFromMetadata extracts the "render_quality_profile" entry
// from a metadata document. It returns balanced when the document is empty,
// cannot be decoded as a JSON object, has no such string entry, or names a
// profile that is not one of the four known values.
func renderQualityProfileFromMetadata(raw json.RawMessage) string {
	if len(raw) == 0 {
		return RenderQualityProfileBalanced
	}
	var decoded map[string]any
	if json.Unmarshal(raw, &decoded) != nil {
		return RenderQualityProfileBalanced
	}
	profile, isString := decoded["render_quality_profile"].(string)
	if !isString {
		return RenderQualityProfileBalanced
	}
	known := profile == RenderQualityProfileLowBandwidth ||
		profile == RenderQualityProfileBalanced ||
		profile == RenderQualityProfileHighQuality ||
		profile == RenderQualityProfileTextPriority
	if known {
		return profile
	}
	return RenderQualityProfileBalanced
}
@@ -0,0 +1,219 @@
package sessionbroker
import (
"crypto/rsa"
"fmt"
"strings"
"time"
"github.com/golang-jwt/jwt/v5"
"github.com/google/uuid"
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
)
// TLS trust modes for the direct worker WSS candidate. public_ca and
// platform_ca count as production-trusted; smoke_insecure is the value every
// unrecognized mode normalizes to.
const (
directWorkerTLSTrustModeSmokeInsecure = "smoke_insecure"
directWorkerTLSTrustModePublicCA = "public_ca"
directWorkerTLSTrustModePlatformCA = "platform_ca"
)
// DataPlaneTokenClaims is the claim set of a data-plane token: the session,
// attachment, user, organization, worker and resource the token is scoped to,
// the channels it allows, and the embedded registered JWT claims.
type DataPlaneTokenClaims struct {
SessionID string `json:"session_id"`
AttachmentID string `json:"attachment_id"`
UserID string `json:"user_id"`
OrganizationID string `json:"organization_id"`
ClusterID string `json:"cluster_id,omitempty"` // omitted from the token when empty
WorkerID string `json:"worker_id"`
ResourceID string `json:"resource_id"`
AllowedChannels []string `json:"allowed_channels"`
// ExpiresAtValue is serialized as "expires_at", separately from the
// registered claims' own expiry; buildDataPlaneOffer sets both to the same instant.
ExpiresAtValue time.Time `json:"expires_at"`
jwt.RegisteredClaims
}
// buildDataPlaneOffer creates a signed data-plane token for the given session
// and attachment and wraps it, together with the connection candidates, in a
// DataPlaneOffer. It returns (nil, nil) when no positive token TTL or no
// private key is configured. The preferred candidate type is that of the first
// candidate, or the backend gateway type when there are no candidates.
func (s *Service) buildDataPlaneOffer(session RemoteSession, attachment SessionAttachment) (*sessioncontracts.DataPlaneOffer, error) {
	if s.cfg.DataPlane.TokenTTL <= 0 || s.cfg.DataPlane.TokenPrivateKeyPEM == "" {
		return nil, nil
	}

	issuedAt := s.now().UTC()
	expiry := issuedAt.Add(s.cfg.DataPlane.TokenTTL)
	channels := dataPlaneAllowedChannelsFromSession(session)

	registered := jwt.RegisteredClaims{
		ID:        uuid.NewString(),
		Issuer:    s.cfg.Auth.Issuer,
		Subject:   attachment.UserID,
		Audience:  jwt.ClaimStrings{"rap-data-plane", "worker:" + session.WorkerID},
		IssuedAt:  jwt.NewNumericDate(issuedAt),
		NotBefore: jwt.NewNumericDate(issuedAt),
		ExpiresAt: jwt.NewNumericDate(expiry),
	}
	signed, err := signDataPlaneToken(DataPlaneTokenClaims{
		SessionID:        session.ID,
		AttachmentID:     attachment.ID,
		UserID:           attachment.UserID,
		OrganizationID:   session.OrganizationID,
		WorkerID:         session.WorkerID,
		ResourceID:       session.ResourceID,
		AllowedChannels:  channels,
		ExpiresAtValue:   expiry,
		RegisteredClaims: registered,
	}, s.cfg.DataPlane.TokenPrivateKeyPEM)
	if err != nil {
		return nil, err
	}

	candidates := s.buildDataPlaneCandidates(session)
	offer := &sessioncontracts.DataPlaneOffer{
		Preferred:  sessioncontracts.DataPlaneCandidateBackendGateway,
		Token:      signed,
		ExpiresAt:  expiry,
		Candidates: candidates,
	}
	// Candidates are appended direct-first, so the head is the preferred one.
	if len(candidates) > 0 {
		offer.Preferred = candidates[0].Type
	}
	return offer, nil
}
// buildDataPlaneCandidates lists the data-plane endpoints offered for session.
// A direct worker WSS candidate (priority 10) comes first when a direct URL
// can be built and advertising it is allowed; a backend gateway candidate
// (priority 100) follows when a gateway URL is configured. The direct
// candidate's metadata stays nil unless the JSON runtime flag is enabled.
func (s *Service) buildDataPlaneCandidates(session RemoteSession) []sessioncontracts.DataPlaneCandidate {
	var candidates []sessioncontracts.DataPlaneCandidate

	directURL := s.directWorkerWSSURL(session.WorkerID)
	if directURL != "" && s.canAdvertiseDirectWorkerWSS() {
		var metadata map[string]any
		if s.cfg.DataPlane.DirectWorkerJSONRuntime {
			metadata = map[string]any{
				"runtime_transport": "json_v1",
				"traffic_ready":     true,
			}
			s.addDirectWorkerTLSTrustMetadata(metadata)
			// Binary render capabilities are only advertised on top of the JSON runtime.
			if s.cfg.DataPlane.DirectWorkerBinaryRender {
				metadata["render_transport"] = "binary_v1"
				metadata["binary_render"] = true
				metadata["supported_color_modes"] = []string{"full_color", "grayscale"}
				metadata["default_color_mode"] = "full_color"
			}
		}
		direct := sessioncontracts.DataPlaneCandidate{
			Type:     sessioncontracts.DataPlaneCandidateDirectWorkerWSS,
			URL:      directURL,
			WorkerID: session.WorkerID,
			Priority: 10,
			Metadata: metadata,
		}
		candidates = append(candidates, direct)
	}

	if gatewayURL := s.cfg.DataPlane.BackendGatewayURL; gatewayURL != "" {
		gateway := sessioncontracts.DataPlaneCandidate{
			Type:     sessioncontracts.DataPlaneCandidateBackendGateway,
			URL:      gatewayURL,
			Priority: 100,
		}
		candidates = append(candidates, gateway)
	}
	return candidates
}
// canAdvertiseDirectWorkerWSS reports whether the direct worker candidate may
// be offered: always outside production environments, and in production only
// when the configured TLS trust mode is production-trusted.
func (s *Service) canAdvertiseDirectWorkerWSS() bool {
	if !secrets.IsProductionEnv(s.cfg.App.Env) {
		return true
	}
	mode := normalizeDirectWorkerTLSTrustMode(s.cfg.DataPlane.DirectWorkerTLSTrustMode)
	return directWorkerTLSTrustModeIsProductionTrusted(mode)
}
// addDirectWorkerTLSTrustMetadata writes the TLS trust description of the
// direct worker endpoint into metadata: the normalized trust mode, whether it
// is production-trusted, whether it is the smoke-only mode, and the CA
// reference when one is configured. metadata must be non-nil.
func (s *Service) addDirectWorkerTLSTrustMetadata(metadata map[string]any) {
	mode := normalizeDirectWorkerTLSTrustMode(s.cfg.DataPlane.DirectWorkerTLSTrustMode)
	smokeOnly := mode == directWorkerTLSTrustModeSmokeInsecure

	metadata["tls_trust_mode"] = mode
	metadata["production_trusted"] = directWorkerTLSTrustModeIsProductionTrusted(mode)
	metadata["smoke_only"] = smokeOnly
	if caRef := s.cfg.DataPlane.DirectWorkerTLSCARef; caRef != "" {
		metadata["tls_ca_ref"] = caRef
	}
}
// normalizeDirectWorkerTLSTrustMode trims and lower-cases mode and maps it to
// one of the known trust modes. Anything other than public_ca or platform_ca
// becomes smoke_insecure.
func normalizeDirectWorkerTLSTrustMode(mode string) string {
	cleaned := strings.ToLower(strings.TrimSpace(mode))
	if cleaned == directWorkerTLSTrustModePublicCA || cleaned == directWorkerTLSTrustModePlatformCA {
		return cleaned
	}
	return directWorkerTLSTrustModeSmokeInsecure
}
// directWorkerTLSTrustModeIsProductionTrusted reports whether mode is exactly
// public_ca or platform_ca. The comparison is literal; callers normalize first.
func directWorkerTLSTrustModeIsProductionTrusted(mode string) bool {
	switch mode {
	case directWorkerTLSTrustModePublicCA, directWorkerTLSTrustModePlatformCA:
		return true
	default:
		return false
	}
}
// directWorkerWSSURL expands the configured direct worker URL template for
// workerID by replacing every "{worker_id}" placeholder. It returns "" when
// the trimmed template or workerID is empty. workerID is inserted verbatim.
func (s *Service) directWorkerWSSURL(workerID string) string {
	if workerID == "" {
		return ""
	}
	urlTemplate := strings.TrimSpace(s.cfg.DataPlane.DirectWorkerWSSURLTemplate)
	if urlTemplate == "" {
		return ""
	}
	return strings.ReplaceAll(urlTemplate, "{worker_id}", workerID)
}
// signDataPlaneToken parses privateKeyPEM as an RSA private key and returns
// claims signed as an RS256 JWT. Parse and signing failures are wrapped with a
// short description of the failing step.
func signDataPlaneToken(claims DataPlaneTokenClaims, privateKeyPEM string) (string, error) {
	key, err := jwt.ParseRSAPrivateKeyFromPEM([]byte(privateKeyPEM))
	if err != nil {
		return "", fmt.Errorf("parse data-plane private key: %w", err)
	}
	signed, err := jwt.NewWithClaims(jwt.SigningMethodRS256, claims).SignedString(key)
	if err != nil {
		return "", fmt.Errorf("sign data-plane token: %w", err)
	}
	return signed, nil
}
// parseDataPlaneToken parses tokenValue with publicKey and returns its claims.
// Tokens whose signing method is not RS256 are rejected by the key callback;
// parse errors are returned unchanged, and a token reported as not valid
// yields an error.
func parseDataPlaneToken(tokenValue string, publicKey *rsa.PublicKey) (*DataPlaneTokenClaims, error) {
	// Only hand out the key for RS256.
	keyFor := func(candidate *jwt.Token) (any, error) {
		if candidate.Method != jwt.SigningMethodRS256 {
			return nil, fmt.Errorf("unexpected data-plane signing method: %s", candidate.Header["alg"])
		}
		return publicKey, nil
	}

	var claims DataPlaneTokenClaims
	parsed, err := jwt.ParseWithClaims(tokenValue, &claims, keyFor)
	if err != nil {
		return nil, err
	}
	if !parsed.Valid {
		return nil, fmt.Errorf("data-plane token invalid")
	}
	return &claims, nil
}
// dataPlaneAllowedChannelsFromSession returns the data-plane channels a token
// for session may use. Control, input, render and telemetry are always
// included. When the session metadata carries a "policy" object, the clipboard
// channel is added for any non-empty clipboard mode other than disabled, and
// the file upload / file download channels are added according to the file
// transfer mode.
func dataPlaneAllowedChannelsFromSession(session RemoteSession) []string {
	allowed := []string{
		sessioncontracts.DataPlaneChannelControl,
		sessioncontracts.DataPlaneChannelInput,
		sessioncontracts.DataPlaneChannelRender,
		sessioncontracts.DataPlaneChannelTelemetry,
	}

	policy, _ := decodeJSONMap(session.Metadata)["policy"].(map[string]any)
	if policy == nil {
		return allowed
	}

	clipboardMode, _ := policy["clipboard_mode"].(string)
	if clipboardMode != "" && clipboardMode != string(ResourceClipboardModeDisabled) {
		allowed = append(allowed, sessioncontracts.DataPlaneChannelClipboard)
	}

	rawTransferMode, _ := policy["file_transfer_mode"].(string)
	transferMode := ResourceFileTransferMode(rawTransferMode)
	if fileTransferAllowsClientToServer(transferMode) {
		allowed = append(allowed, sessioncontracts.DataPlaneChannelFileUpload)
	}
	if fileTransferAllowsServerToClient(transferMode) {
		allowed = append(allowed, sessioncontracts.DataPlaneChannelFileDownload)
	}
	return allowed
}
// attachDataPlaneOffer fills the gateway URL and the data-plane offer on
// result. It does nothing when result or its attachment is nil. The gateway
// URL is set before the offer is built, so it stays set even when building the
// offer fails; in that case the offer is left untouched and the error is
// returned.
func (s *Service) attachDataPlaneOffer(result *SessionControlResult) error {
	if result == nil || result.Attachment == nil {
		return nil
	}
	result.GatewayURL = s.cfg.DataPlane.BackendGatewayURL

	offer, err := s.buildDataPlaneOffer(result.Session, *result.Attachment)
	if err == nil {
		result.DataPlane = offer
	}
	return err
}
@@ -0,0 +1,357 @@
package sessionbroker
import (
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"encoding/json"
"encoding/pem"
"slices"
"strings"
"testing"
"time"
"github.com/example/remote-access-platform/backend/internal/platform/config"
"github.com/example/remote-access-platform/backend/internal/platform/module"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
)
// TestDataPlaneTokenScopeValidation builds a data-plane offer for a session
// whose policy enables bidirectional clipboard and client-to-server file
// transfer, parses the issued token with the matching public key, and checks
// the scope claims, the jti, both expiry values and the allowed channels.
func TestDataPlaneTokenScopeValidation(t *testing.T) {
// Truncated to whole seconds so the expiry comparisons below are exact.
now := time.Now().UTC().Truncate(time.Second)
privateKeyPEM, publicKey := testRS256Key(t)
service := &Service{
cfg: module.Config{
Auth: config.AuthConfig{
Issuer: "rap-api-test",
},
DataPlane: config.DataPlaneConfig{
TokenTTL: time.Minute,
TokenPrivateKeyPEM: privateKeyPEM,
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
},
},
// Fixed clock so the token timestamps are deterministic.
now: func() time.Time { return now },
}
session := RemoteSession{
ID: "session-1",
OrganizationID: "org-1",
ResourceID: "resource-1",
WorkerID: "worker-1",
Metadata: mustJSON(t, map[string]any{"policy": map[string]any{"clipboard_mode": "bidirectional", "file_transfer_mode": "client_to_server"}}),
}
attachment := SessionAttachment{
ID: "attachment-1",
UserID: "user-1",
}
offer, err := service.buildDataPlaneOffer(session, attachment)
if err != nil {
t.Fatalf("buildDataPlaneOffer returned error: %v", err)
}
if offer == nil {
t.Fatal("expected data-plane offer")
}
claims, err := parseDataPlaneToken(offer.Token, publicKey)
if err != nil {
t.Fatalf("parseDataPlaneToken returned error: %v", err)
}
// Scope claims must mirror the session and attachment.
assertEqual(t, claims.SessionID, session.ID, "session_id")
assertEqual(t, claims.AttachmentID, attachment.ID, "attachment_id")
assertEqual(t, claims.UserID, attachment.UserID, "user_id")
assertEqual(t, claims.OrganizationID, session.OrganizationID, "organization_id")
assertEqual(t, claims.WorkerID, session.WorkerID, "worker_id")
assertEqual(t, claims.ResourceID, session.ResourceID, "resource_id")
if claims.ID == "" {
t.Fatal("expected jti")
}
// Both the registered expiry and the custom expires_at claim equal now + TTL.
if claims.ExpiresAt == nil || !claims.ExpiresAt.Time.Equal(now.Add(time.Minute)) {
t.Fatalf("unexpected expires_at: %v", claims.ExpiresAt)
}
if !claims.ExpiresAtValue.Equal(now.Add(time.Minute)) {
t.Fatalf("unexpected expires_at claim value: %v", claims.ExpiresAtValue)
}
// Base channels plus clipboard and file upload, as enabled by the policy above.
for _, channel := range []string{
sessioncontracts.DataPlaneChannelControl,
sessioncontracts.DataPlaneChannelInput,
sessioncontracts.DataPlaneChannelRender,
sessioncontracts.DataPlaneChannelTelemetry,
sessioncontracts.DataPlaneChannelClipboard,
sessioncontracts.DataPlaneChannelFileUpload,
} {
if !slices.Contains(claims.AllowedChannels, channel) {
t.Fatalf("expected allowed channel %q in %v", channel, claims.AllowedChannels)
}
}
}
// TestDataPlaneOfferResponseShapeCompatibility attaches a data-plane offer to
// a SessionControlResult that already carries a session, an attachment and an
// attach token, then checks that the JSON response keeps those fields, gains
// data_plane and gateway_url, and lists the direct worker candidate (with its
// json_v1 and smoke-only metadata) ahead of the backend gateway candidate.
func TestDataPlaneOfferResponseShapeCompatibility(t *testing.T) {
now := time.Now().UTC().Truncate(time.Second)
privateKeyPEM, _ := testRS256Key(t)
service := &Service{
cfg: module.Config{
Auth: config.AuthConfig{Issuer: "rap-api-test"},
DataPlane: config.DataPlaneConfig{
TokenTTL: time.Minute,
TokenPrivateKeyPEM: privateKeyPEM,
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
DirectWorkerJSONRuntime: true,
DirectWorkerTLSTrustMode: "smoke_insecure",
},
},
// Fixed clock so the token timestamps are deterministic.
now: func() time.Time { return now },
}
result := &SessionControlResult{
Session: RemoteSession{
ID: "session-1",
OrganizationID: "org-1",
ResourceID: "resource-1",
WorkerID: "worker-1",
Metadata: mustJSON(t, map[string]any{"policy": map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "disabled"}}),
},
Attachment: &SessionAttachment{ID: "attachment-1", UserID: "user-1"},
AttachToken: &sessioncontracts.AttachTokenClaims{
Token: "existing-attach-token",
SessionID: "session-1",
AttachmentID: "attachment-1",
UserID: "user-1",
WorkerID: "worker-1",
ExpiresAt: now.Add(2 * time.Minute),
},
}
if err := service.attachDataPlaneOffer(result); err != nil {
t.Fatalf("attachDataPlaneOffer returned error: %v", err)
}
// Round-trip through JSON to inspect the wire shape rather than the Go struct.
payload, err := json.Marshal(result)
if err != nil {
t.Fatalf("marshal response: %v", err)
}
var decoded map[string]any
if err := json.Unmarshal(payload, &decoded); err != nil {
t.Fatalf("decode response: %v", err)
}
if decoded["session"] == nil || decoded["attachment"] == nil || decoded["attach_token"] == nil {
t.Fatalf("response lost existing fields: %s", payload)
}
if decoded["data_plane"] == nil || decoded["gateway_url"] == nil {
t.Fatalf("response missing data-plane fields: %s", payload)
}
if result.DataPlane == nil {
t.Fatal("expected data-plane offer")
}
// The direct worker candidate is listed first and is therefore preferred.
if result.DataPlane.Preferred != sessioncontracts.DataPlaneCandidateDirectWorkerWSS {
t.Fatalf("unexpected preferred candidate: %s", result.DataPlane.Preferred)
}
if len(result.DataPlane.Candidates) != 2 {
t.Fatalf("expected direct and fallback candidates, got %d", len(result.DataPlane.Candidates))
}
// The {worker_id} placeholder of the template is expanded with the session's worker.
if result.DataPlane.Candidates[0].URL != "wss://worker-1.worker.example.test/rap/v1/data-plane" {
t.Fatalf("unexpected direct candidate URL: %s", result.DataPlane.Candidates[0].URL)
}
if result.DataPlane.Candidates[0].Metadata["runtime_transport"] != "json_v1" {
t.Fatalf("direct candidate is missing json_v1 runtime metadata: %#v", result.DataPlane.Candidates[0].Metadata)
}
if result.DataPlane.Candidates[0].Metadata["traffic_ready"] != true {
t.Fatalf("direct candidate is missing traffic_ready metadata: %#v", result.DataPlane.Candidates[0].Metadata)
}
// smoke_insecure trust mode: flagged smoke-only and not production-trusted.
if result.DataPlane.Candidates[0].Metadata["smoke_only"] != true {
t.Fatalf("direct candidate should be marked smoke-only by default: %#v", result.DataPlane.Candidates[0].Metadata)
}
if result.DataPlane.Candidates[0].Metadata["production_trusted"] != false {
t.Fatalf("smoke direct candidate must not be production-trusted: %#v", result.DataPlane.Candidates[0].Metadata)
}
if !strings.Contains(result.DataPlane.Candidates[1].URL, "/api/v1/gateway/ws") {
t.Fatalf("unexpected backend candidate URL: %s", result.DataPlane.Candidates[1].URL)
}
}
func TestDataPlaneDirectCandidateMetadataRequiresRuntimeFlag(t *testing.T) {
service := &Service{
cfg: module.Config{
DataPlane: config.DataPlaneConfig{
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
DirectWorkerTLSTrustMode: "smoke_insecure",
},
},
}
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
if len(candidates) != 2 {
t.Fatalf("expected direct and fallback candidates, got %d", len(candidates))
}
if candidates[0].Metadata != nil {
t.Fatalf("direct candidate must not advertise json_v1 before runtime flag is enabled: %#v", candidates[0].Metadata)
}
}
func TestDataPlaneDirectCandidateAdvertisesBinaryRenderOnlyWhenEnabled(t *testing.T) {
service := &Service{
cfg: module.Config{
DataPlane: config.DataPlaneConfig{
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
DirectWorkerJSONRuntime: true,
DirectWorkerBinaryRender: true,
DirectWorkerTLSTrustMode: "platform_ca",
DirectWorkerTLSCARef: "rap-platform-ca:v1",
},
},
}
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
if len(candidates) != 2 {
t.Fatalf("expected direct and fallback candidates, got %d", len(candidates))
}
if candidates[0].Metadata["render_transport"] != "binary_v1" {
t.Fatalf("direct candidate is missing binary render metadata: %#v", candidates[0].Metadata)
}
if candidates[0].Metadata["binary_render"] != true {
t.Fatalf("direct candidate is missing binary_render metadata: %#v", candidates[0].Metadata)
}
if candidates[0].Metadata["default_color_mode"] != "full_color" {
t.Fatalf("direct candidate is missing default_color_mode metadata: %#v", candidates[0].Metadata)
}
if candidates[0].Metadata["production_trusted"] != true || candidates[0].Metadata["tls_trust_mode"] != "platform_ca" {
t.Fatalf("direct candidate is missing production trust metadata: %#v", candidates[0].Metadata)
}
if candidates[0].Metadata["tls_ca_ref"] != "rap-platform-ca:v1" {
t.Fatalf("direct candidate is missing tls_ca_ref metadata: %#v", candidates[0].Metadata)
}
modes, ok := candidates[0].Metadata["supported_color_modes"].([]string)
if !ok || !slices.Contains(modes, "full_color") || !slices.Contains(modes, "grayscale") {
t.Fatalf("direct candidate is missing supported_color_modes metadata: %#v", candidates[0].Metadata)
}
}
func TestDataPlaneDirectCandidateOmittedInProductionWhenSmokeOnly(t *testing.T) {
service := &Service{
cfg: module.Config{
App: config.AppConfig{Env: "production"},
DataPlane: config.DataPlaneConfig{
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
DirectWorkerJSONRuntime: true,
DirectWorkerTLSTrustMode: "smoke_insecure",
},
},
}
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
if len(candidates) != 1 {
t.Fatalf("expected fallback-only candidates in production with smoke TLS, got %d", len(candidates))
}
if candidates[0].Type != sessioncontracts.DataPlaneCandidateBackendGateway {
t.Fatalf("production must not advertise smoke-only direct candidate: %#v", candidates)
}
}
func TestDataPlaneDirectCandidateAdvertisedInProductionWhenTrusted(t *testing.T) {
service := &Service{
cfg: module.Config{
App: config.AppConfig{Env: "production"},
DataPlane: config.DataPlaneConfig{
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
DirectWorkerJSONRuntime: true,
DirectWorkerTLSTrustMode: "public_ca",
},
},
}
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
if len(candidates) != 2 {
t.Fatalf("expected trusted direct and fallback candidates, got %d", len(candidates))
}
if candidates[0].Metadata["production_trusted"] != true || candidates[0].Metadata["tls_trust_mode"] != "public_ca" {
t.Fatalf("trusted production direct candidate metadata mismatch: %#v", candidates[0].Metadata)
}
}
func TestDataPlaneCandidatesFallbackOnlyWhenDirectTemplateMissing(t *testing.T) {
service := &Service{
cfg: module.Config{
DataPlane: config.DataPlaneConfig{
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
},
},
}
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
if len(candidates) != 1 {
t.Fatalf("expected fallback-only candidate list, got %d", len(candidates))
}
if candidates[0].Type != sessioncontracts.DataPlaneCandidateBackendGateway {
t.Fatalf("unexpected candidate type: %s", candidates[0].Type)
}
}
// TestDataPlaneAllowedChannelsRespectRuntimePolicy feeds sessions whose
// metadata embeds different "policy" objects into
// dataPlaneAllowedChannelsFromSession and checks, per scenario, which channels
// must be present and which must be absent in the returned list.
func TestDataPlaneAllowedChannelsRespectRuntimePolicy(t *testing.T) {
	type policyCase struct {
		name     string
		policy   map[string]any
		expected []string
		blocked  []string
	}

	scenarios := []policyCase{
		{
			name:   "disabled policies expose only control input render telemetry",
			policy: map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "disabled"},
			expected: []string{
				sessioncontracts.DataPlaneChannelControl,
				sessioncontracts.DataPlaneChannelInput,
				sessioncontracts.DataPlaneChannelRender,
				sessioncontracts.DataPlaneChannelTelemetry,
			},
			blocked: []string{
				sessioncontracts.DataPlaneChannelClipboard,
				sessioncontracts.DataPlaneChannelFileUpload,
			},
		},
		{
			name:     "clipboard policy adds clipboard channel",
			policy:   map[string]any{"clipboard_mode": "server_to_client", "file_transfer_mode": "disabled"},
			expected: []string{sessioncontracts.DataPlaneChannelClipboard},
			blocked:  []string{sessioncontracts.DataPlaneChannelFileUpload},
		},
		{
			name:     "client upload policy adds file upload channel",
			policy:   map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "client_to_server"},
			expected: []string{sessioncontracts.DataPlaneChannelFileUpload},
			blocked:  []string{sessioncontracts.DataPlaneChannelClipboard},
		},
	}

	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			metadata := mustJSON(t, map[string]any{"policy": scenario.policy})
			allowed := dataPlaneAllowedChannelsFromSession(RemoteSession{Metadata: metadata})

			for _, want := range scenario.expected {
				if slices.Contains(allowed, want) {
					continue
				}
				t.Fatalf("expected channel %q in %v", want, allowed)
			}
			for _, banned := range scenario.blocked {
				if !slices.Contains(allowed, banned) {
					continue
				}
				t.Fatalf("did not expect channel %q in %v", banned, allowed)
			}
		})
	}
}
// mustJSON marshals value with encoding/json and returns the encoded bytes.
// A marshal failure aborts the calling test via t.Fatalf.
func mustJSON(t *testing.T, value any) []byte {
	t.Helper()

	encoded, marshalErr := json.Marshal(value)
	if marshalErr == nil {
		return encoded
	}
	t.Fatalf("marshal test metadata: %v", marshalErr)
	return encoded // not reached: Fatalf stops the test goroutine
}
// testRS256Key generates a fresh 2048-bit RSA key pair for tests. It returns
// the private key as a PEM string (block type "RSA PRIVATE KEY", PKCS#1 DER
// body) together with a pointer to the matching public key. Key generation
// failure aborts the calling test.
func testRS256Key(t *testing.T) (string, *rsa.PublicKey) {
	t.Helper()

	key, genErr := rsa.GenerateKey(rand.Reader, 2048)
	if genErr != nil {
		t.Fatalf("generate RSA key: %v", genErr)
	}

	block := pem.Block{
		Type:  "RSA PRIVATE KEY",
		Bytes: x509.MarshalPKCS1PrivateKey(key),
	}
	pemText := string(pem.EncodeToMemory(&block))
	return pemText, &key.PublicKey
}
// assertEqual aborts the calling test when got differs from want; name is
// interpolated into the failure message to identify the compared value.
func assertEqual(t *testing.T, got, want, name string) {
	t.Helper()
	if got == want {
		return
	}
	t.Fatalf("unexpected %s: got %q want %q", name, got, want)
}
@@ -0,0 +1,15 @@
package sessionbroker
import "errors"
// Sentinel errors of the sessionbroker package. Each is a distinct value
// created with errors.New, so callers can match them with errors.Is.
var (
// ErrSessionNotFound reports that a remote session could not be found.
ErrSessionNotFound = errors.New("remote session not found")
// ErrAttachmentNotFound reports that a session attachment could not be found.
ErrAttachmentNotFound = errors.New("session attachment not found")
// ErrActiveControllerPresent reports that an active controller already exists.
ErrActiveControllerPresent = errors.New("active controller already present")
// ErrTakeoverNotAllowed reports that a takeover is not allowed.
ErrTakeoverNotAllowed = errors.New("takeover not allowed")
// ErrTrustedDeviceRequired reports that a trusted device is required.
ErrTrustedDeviceRequired = errors.New("trusted device required")
// ErrAccessDenied reports that access was denied.
ErrAccessDenied = errors.New("access denied")
// ErrSessionNotAttachable reports that the session cannot be attached to.
ErrSessionNotAttachable = errors.New("session is not attachable")
// ErrSessionNotTerminable reports that the session cannot be terminated.
ErrSessionNotTerminable = errors.New("session is not terminable")
// ErrAttachTokenInvalid reports an attach token that is invalid or expired.
ErrAttachTokenInvalid = errors.New("attach token invalid or expired")
)
@@ -0,0 +1,65 @@
package sessionbroker
import (
"context"
"time"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
)
// LiveStateStore is the storage contract for per-session live data: session
// state snapshots, controller bindings, attach tokens, attachment heartbeats
// and worker routes. Every method takes a context; several writes take a ttl
// (presumably an expiry for the stored entry — confirm against the
// implementation).
type LiveStateStore interface {
// Live session state, keyed by session ID.
UpsertSession(ctx context.Context, state LiveSessionState) error
GetSession(ctx context.Context, sessionID string) (*LiveSessionState, error)
DeleteSession(ctx context.Context, sessionID string) error
// Controller binding for a session.
BindController(ctx context.Context, binding sessioncontracts.ControllerBinding, ttl time.Duration) error
GetControllerBinding(ctx context.Context, sessionID string) (*sessioncontracts.ControllerBinding, error)
ClearControllerBinding(ctx context.Context, sessionID string) error
// Attach tokens: stored with their claims, later consumed by token string.
StoreAttachToken(ctx context.Context, claims sessioncontracts.AttachTokenClaims, ttl time.Duration) error
ConsumeAttachToken(ctx context.Context, token string) (*sessioncontracts.AttachTokenClaims, error)
// Heartbeat for one attachment of one session.
TouchAttachmentHeartbeat(ctx context.Context, sessionID, attachmentID string, ttl time.Duration) error
// Worker route for a session.
UpdateWorkerRoute(ctx context.Context, route WorkerRoute, ttl time.Duration) error
GetWorkerRoute(ctx context.Context, sessionID string) (*WorkerRoute, error)
DeleteWorkerRoute(ctx context.Context, sessionID string) error
}
// LiveSessionState is the JSON-serializable snapshot of a session's live
// state handled by LiveStateStore. Fields tagged omitempty are left out of
// the encoded JSON when they hold their zero value (or nil for pointers).
type LiveSessionState struct {
// Identity, lifecycle state and current controller/attachment.
SessionID string `json:"session_id"`
ResourceID string `json:"resource_id"`
WorkerID string `json:"worker_id"`
State sessioncontracts.State `json:"state"`
ControllerID string `json:"controller_id"`
AttachmentID string `json:"attachment_id"`
TakeoverVersion int `json:"takeover_version"`
// Render-related fields.
RenderQualityProfile string `json:"render_quality_profile,omitempty"`
RenderState string `json:"render_state,omitempty"`
RenderWidth int `json:"render_width,omitempty"`
RenderHeight int `json:"render_height,omitempty"`
RenderFrameSequence int64 `json:"render_frame_sequence,omitempty"`
RenderFrameFormat string `json:"render_frame_format,omitempty"`
RenderFrameData string `json:"render_frame_data,omitempty"`
LastInputCorrelationID string `json:"last_input_correlation_id,omitempty"`
WorkerFrameCapturedAt string `json:"worker_frame_captured_at,omitempty"`
// Cursor fields. Because of omitempty, CursorVisible=false and zero
// coordinates are not emitted.
CursorX int `json:"cursor_x,omitempty"`
CursorY int `json:"cursor_y,omitempty"`
CursorVisible bool `json:"cursor_visible,omitempty"`
DirtyRectangles int `json:"dirty_rectangles,omitempty"`
LastRenderAt *time.Time `json:"last_render_at,omitempty"`
// Clipboard fields.
ClipboardSequence int64 `json:"clipboard_sequence,omitempty"`
ClipboardText string `json:"clipboard_text,omitempty"`
ClipboardOrigin string `json:"clipboard_origin,omitempty"`
ClipboardContentHash string `json:"clipboard_content_hash,omitempty"`
ClipboardUpdatedAt *time.Time `json:"clipboard_updated_at,omitempty"`
// File download fields.
FileDownloadSequence int64 `json:"file_download_sequence,omitempty"`
FileDownloadType string `json:"file_download_type,omitempty"`
FileDownloadPayload map[string]any `json:"file_download_payload,omitempty"`
FileDownloadUpdatedAt *time.Time `json:"file_download_updated_at,omitempty"`
// UpdatedAt is always encoded (no omitempty).
UpdatedAt time.Time `json:"updated_at"`
}
// WorkerRoute is the JSON-serializable record associating a session with a
// worker, a lease and a control stream, as stored through LiveStateStore.
type WorkerRoute struct {
SessionID string `json:"session_id"`
WorkerID string `json:"worker_id"`
LeaseID string `json:"lease_id"`
// ControlStream names the control stream for the session; its exact format
// is not visible here.
ControlStream string `json:"control_stream"`
UpdatedAt time.Time `json:"updated_at"`
}
@@ -0,0 +1,132 @@
package sessionbroker
import (
"time"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
)
// AttachmentRole is the string-typed role of a session attachment.
type AttachmentRole string
// Known attachment roles; "controller" is the only one declared here.
const (
AttachmentRoleController AttachmentRole = "controller"
)
// AttachmentState is the string-typed lifecycle state of a session attachment.
type AttachmentState string
// Declared attachment states.
const (
AttachmentStateAttaching AttachmentState = "attaching"
AttachmentStateActive AttachmentState = "active"
AttachmentStateDetached AttachmentState = "detached"
AttachmentStateSuperseded AttachmentState = "superseded"
AttachmentStateRevoked AttachmentState = "revoked"
AttachmentStateClosed AttachmentState = "closed"
)
// ResourceTakeoverPolicy is the string-typed takeover policy of a resource.
type ResourceTakeoverPolicy string
// Declared takeover policies.
const (
ResourceTakeoverPolicyTrustedDevice ResourceTakeoverPolicy = "trusted_device"
ResourceTakeoverPolicySameUser ResourceTakeoverPolicy = "same_user"
ResourceTakeoverPolicyAdminOnly ResourceTakeoverPolicy = "admin_only"
)
// ResourceClipboardMode is the string-typed clipboard mode of a resource.
type ResourceClipboardMode string
// Declared clipboard modes: disabled, one direction only, or both directions.
const (
ResourceClipboardModeDisabled ResourceClipboardMode = "disabled"
ResourceClipboardModeClientToServer ResourceClipboardMode = "client_to_server"
ResourceClipboardModeServerToClient ResourceClipboardMode = "server_to_client"
ResourceClipboardModeBidirectional ResourceClipboardMode = "bidirectional"
)
// ResourceFileTransferMode is the string-typed file transfer mode of a resource.
type ResourceFileTransferMode string
// Declared file transfer modes: disabled, one direction only, or both directions.
const (
ResourceFileTransferModeDisabled ResourceFileTransferMode = "disabled"
ResourceFileTransferModeClientToServer ResourceFileTransferMode = "client_to_server"
ResourceFileTransferModeServerToClient ResourceFileTransferMode = "server_to_client"
ResourceFileTransferModeBidirectional ResourceFileTransferMode = "bidirectional"
)
// RemoteSession is the domain record of a remote session.
type RemoteSession struct {
ID string
OrganizationID string
ResourceID string
Protocol string
State sessioncontracts.State
// WorkerID may be empty when no worker is recorded for the session.
WorkerID string
ControllerUserID string
// DetachDeadlineAt and LastHeartbeatAt are nil when unset.
DetachDeadlineAt *time.Time
LastHeartbeatAt *time.Time
TakeoverVersion int
RenderQualityProfile string
// Metadata holds raw bytes; presumably a JSON document — confirm against
// the code that writes it.
Metadata []byte
CreatedAt time.Time
UpdatedAt time.Time
}
// SessionAttachment is the domain record of one user/device attachment to a
// remote session.
type SessionAttachment struct {
ID string
RemoteSessionID string
UserID string
DeviceID string
Role AttachmentRole
State AttachmentState
// SupersededBy and TakeoverOf reference other attachment IDs; nil when unset.
SupersededBy *string
TakeoverOf *string
// Timestamps are nil until the corresponding event is recorded.
AttachedAt *time.Time
DetachedAt *time.Time
LastInputAt *time.Time
// Metadata holds raw bytes; presumably a JSON document — confirm against
// the code that writes it.
Metadata []byte
CreatedAt time.Time
UpdatedAt time.Time
}
// ResourcePolicy is the per-resource session policy record.
type ResourcePolicy struct {
ResourceID string
MaxConcurrentSessions int
TakeoverPolicy ResourceTakeoverPolicy
RequireTrustedDevice bool
// DetachGracePeriod is a duration in memory.
DetachGracePeriod time.Duration
// Clipboard settings: a boolean flag plus a mode.
ClipboardEnabled bool
ClipboardMode ResourceClipboardMode
// File transfer settings: a boolean flag plus a mode.
FileTransferEnabled bool
FileTransferMode ResourceFileTransferMode
CreatedAt time.Time
UpdatedAt time.Time
}
// ResourceRuntimeSpec describes a resource as needed at session runtime:
// identity, address, protocol, an optional secret reference and the
// certificate verification mode.
type ResourceRuntimeSpec struct {
ID string
OrganizationID string
Name string
Address string
Protocol string
// SecretRef is nil when the resource has no secret reference.
SecretRef *string
CertificateVerificationMode string
// Metadata holds raw bytes; presumably a JSON document — confirm against
// the code that writes it.
Metadata []byte
}
// AuditEvent is a single audit record: who acted (optional user/device), the
// event type, the target it concerns, and an opaque payload.
type AuditEvent struct {
ID string
// ActorUserID and ActorDeviceID are nil when the actor is not recorded.
ActorUserID *string
ActorDeviceID *string
EventType string
TargetType string
TargetID string
// RemoteSessionID is nil when the event is not tied to a remote session.
RemoteSessionID *string
// Payload holds raw bytes; presumably a JSON document — confirm against
// the code that writes it.
Payload []byte
CreatedAt time.Time
}
// Audit event type names. They are untyped string constants; AuditEvent has a
// string EventType field that can hold them.
const (
AuditEventSessionStarted = "session_started"
AuditEventSessionAttached = "session_attached"
AuditEventSessionDetached = "session_detached"
AuditEventSessionTakenOver = "session_taken_over"
AuditEventSessionTerminated = "session_terminated"
AuditEventSessionFailed = "session_failed"
// Secret access events use the "resource_" prefix in their wire value.
AuditEventSecretAccessed = "resource_secret_accessed"
AuditEventSecretAccessDenied = "resource_secret_access_denied"
)
@@ -0,0 +1,164 @@
package sessionbroker
import (
"encoding/json"
"net/http"
"github.com/go-chi/chi/v5"
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
)
// Module bundles the session broker Service so it can be mounted as an HTTP
// module.
type Module struct {
	service *Service
}

// NewModule returns a Module backed by the given service.
func NewModule(service *Service) *Module {
	mod := new(Module)
	mod.service = service
	return mod
}

// Name returns the fixed module identifier "session-broker".
func (m *Module) Name() string {
	const moduleName = "session-broker"
	return moduleName
}

// Service returns the Service the module was constructed with.
func (m *Module) Service() *Service {
	svc := m.service
	return svc
}
// RegisterRoutes mounts the session endpoints under "/sessions" on router:
// GET / lists sessions, POST / starts one, and POST /{sessionID}/<action>
// handles attach, detach, takeover, terminate and fail.
func (m *Module) RegisterRoutes(router chi.Router) {
	mount := func(sessions chi.Router) {
		sessions.Get("/", m.listSessions)
		sessions.Post("/", m.startSession)
		sessions.Post("/{sessionID}/attach", m.attachSession)
		sessions.Post("/{sessionID}/detach", m.detachSession)
		sessions.Post("/{sessionID}/takeover", m.takeoverSession)
		sessions.Post("/{sessionID}/terminate", m.terminateSession)
		sessions.Post("/{sessionID}/fail", m.markFailed)
	}
	router.Route("/sessions", mount)
}
// listSessions handles GET /sessions. It requires a non-empty "user_id" query
// parameter (400 otherwise), asks the service for that user's sessions and
// responds 200 with {"sessions": ...}. Service errors are translated with
// MapError.
//
// NOTE(review): the user id is taken straight from the query string; whether
// the caller is authorized to list that user's sessions is not checked in
// this handler — confirm it is enforced elsewhere.
func (m *Module) listSessions(w http.ResponseWriter, r *http.Request) {
	requestedUser := r.URL.Query().Get("user_id")
	if len(requestedUser) == 0 {
		httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
		return
	}

	sessions, listErr := m.service.ListSessions(r.Context(), requestedUser)
	if listErr != nil {
		code, text := m.service.MapError(listErr)
		httpx.WriteError(w, code, text)
		return
	}

	body := map[string]any{"sessions": sessions}
	httpx.WriteJSON(w, http.StatusOK, body)
}
// startSession handles POST /sessions. The JSON body is decoded into a
// StartRemoteSessionCommand (400 on decode failure) and passed to the service;
// success responds 201 with the service result, failures are translated with
// MapError.
func (m *Module) startSession(w http.ResponseWriter, r *http.Request) {
	command := StartRemoteSessionCommand{}
	if decodeErr := json.NewDecoder(r.Body).Decode(&command); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid start session payload")
		return
	}

	outcome, startErr := m.service.StartRemoteSession(r.Context(), command)
	if startErr != nil {
		code, text := m.service.MapError(startErr)
		httpx.WriteError(w, code, text)
		return
	}
	httpx.WriteJSON(w, http.StatusCreated, outcome)
}
// attachSession handles POST /sessions/{sessionID}/attach. The JSON body is
// decoded into an AttachToSessionCommand (400 on decode failure); success
// responds 200 with the service result, failures are translated with MapError.
func (m *Module) attachSession(w http.ResponseWriter, r *http.Request) {
	command := AttachToSessionCommand{}
	if decodeErr := json.NewDecoder(r.Body).Decode(&command); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid attach session payload")
		return
	}
	// Assigned after decoding, so the path parameter is what the service sees.
	command.SessionID = chi.URLParam(r, "sessionID")

	outcome, attachErr := m.service.AttachToSession(r.Context(), command)
	if attachErr != nil {
		code, text := m.service.MapError(attachErr)
		httpx.WriteError(w, code, text)
		return
	}
	httpx.WriteJSON(w, http.StatusOK, outcome)
}
// detachSession handles POST /sessions/{sessionID}/detach. The JSON body is
// decoded into a DetachFromSessionCommand (400 on decode failure); success
// responds 202 with the service result, failures are translated with MapError.
func (m *Module) detachSession(w http.ResponseWriter, r *http.Request) {
	command := DetachFromSessionCommand{}
	if decodeErr := json.NewDecoder(r.Body).Decode(&command); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid detach session payload")
		return
	}
	// Assigned after decoding, so the path parameter is what the service sees.
	command.SessionID = chi.URLParam(r, "sessionID")

	outcome, detachErr := m.service.DetachFromSession(r.Context(), command)
	if detachErr != nil {
		code, text := m.service.MapError(detachErr)
		httpx.WriteError(w, code, text)
		return
	}
	httpx.WriteJSON(w, http.StatusAccepted, outcome)
}
// takeoverSession handles POST /sessions/{sessionID}/takeover. The JSON body
// is decoded into a TakeoverSessionCommand (400 on decode failure); success
// responds 200 with the service result, failures are translated with MapError.
func (m *Module) takeoverSession(w http.ResponseWriter, r *http.Request) {
	command := TakeoverSessionCommand{}
	if decodeErr := json.NewDecoder(r.Body).Decode(&command); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid takeover session payload")
		return
	}
	// Assigned after decoding, so the path parameter is what the service sees.
	command.SessionID = chi.URLParam(r, "sessionID")

	outcome, takeoverErr := m.service.TakeoverSession(r.Context(), command)
	if takeoverErr != nil {
		code, text := m.service.MapError(takeoverErr)
		httpx.WriteError(w, code, text)
		return
	}
	httpx.WriteJSON(w, http.StatusOK, outcome)
}
// terminateSession handles POST /sessions/{sessionID}/terminate. The JSON body
// is decoded into a TerminateSessionCommand (400 on decode failure); success
// responds 202 with a "terminated" status and a localizable message, failures
// are translated with MapError.
func (m *Module) terminateSession(w http.ResponseWriter, r *http.Request) {
	command := TerminateSessionCommand{}
	if decodeErr := json.NewDecoder(r.Body).Decode(&command); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid terminate session payload")
		return
	}
	// Assigned after decoding, so the path parameter is what the service sees.
	command.SessionID = chi.URLParam(r, "sessionID")

	terminateErr := m.service.TerminateSession(r.Context(), command)
	if terminateErr != nil {
		code, text := m.service.MapError(terminateErr)
		httpx.WriteError(w, code, text)
		return
	}

	notice := httpx.NewMessage(
		"session.terminated",
		"status.session.terminated",
		"Session terminated.",
		nil,
		"",
	)
	httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
		"status":  "terminated",
		"message": notice,
	})
}
// markFailed handles POST /sessions/{sessionID}/fail. The JSON body is decoded
// into a MarkSessionFailedCommand (400 on decode failure); success responds
// 202 with a "failed" status and a localizable message, failures are
// translated with MapError.
func (m *Module) markFailed(w http.ResponseWriter, r *http.Request) {
	command := MarkSessionFailedCommand{}
	if decodeErr := json.NewDecoder(r.Body).Decode(&command); decodeErr != nil {
		httpx.WriteError(w, http.StatusBadRequest, "invalid fail session payload")
		return
	}
	// Assigned after decoding, so the path parameter is what the service sees.
	command.SessionID = chi.URLParam(r, "sessionID")

	failErr := m.service.MarkSessionFailed(r.Context(), command)
	if failErr != nil {
		code, text := m.service.MapError(failErr)
		httpx.WriteError(w, code, text)
		return
	}

	notice := httpx.NewMessage(
		"session.failed",
		"status.session.failed",
		"Session marked as failed.",
		nil,
		"",
	)
	httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
		"status":  "failed",
		"message": notice,
	})
}
@@ -0,0 +1,17 @@
package sessionbroker
import (
"context"
workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
)
// WorkerOrchestrator is the session broker's view of the worker layer:
// reserving and releasing worker leases, preparing and tearing down
// attachments on a worker, and validating that a session's runtime is still
// usable.
type WorkerOrchestrator interface {
// Lease management.
Reserve(ctx context.Context, request workercontracts.AttachRequest) (*workercontracts.WorkerLease, error)
GetSessionLease(ctx context.Context, sessionID string) (*workercontracts.WorkerLease, error)
ReleaseSessionLease(ctx context.Context, sessionID string) error
// Attachment lifecycle notifications.
PrepareAttachment(ctx context.Context, session RemoteSession, attachment SessionAttachment, runtimeMetadata map[string]any) error
NotifyDetachment(ctx context.Context, session RemoteSession, attachment SessionAttachment) error
TerminateRemoteSession(ctx context.Context, sessionID, attachmentID string) error
// ValidateSessionRuntime returns a bool, a string and an error; presumably
// (valid, reason, err) — confirm against the implementation.
ValidateSessionRuntime(ctx context.Context, sessionID, workerID string) (bool, string, error)
}
@@ -0,0 +1,607 @@
package sessionbroker
import (
"context"
"encoding/json"
"errors"
"fmt"
"time"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/example/remote-access-platform/backend/internal/platform/authority"
postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres"
)
// postgresStore holds a database handle (connection pool or transaction, via
// the DBTX interface) and an optional authority verifier, and hands out the
// Postgres-backed repositories built on them.
type postgresStore struct {
db postgresplatform.DBTX
// authority may be nil when no verifier was supplied to the constructor.
authority *authority.Verifier
}
// PostgresTransactor runs callbacks inside a database transaction, giving
// each callback a store bound to that transaction.
type PostgresTransactor struct {
pool *pgxpool.Pool
// authority may be nil when no verifier was supplied to the constructor.
authority *authority.Verifier
}
// NewPostgresStore returns a Store that issues queries directly on pool.
// verifiers is an optional argument: only the first element is used, and the
// store's verifier stays nil when none is given.
func NewPostgresStore(pool *pgxpool.Pool, verifiers ...*authority.Verifier) Store {
	store := &postgresStore{db: pool}
	if len(verifiers) != 0 {
		store.authority = verifiers[0]
	}
	return store
}
// NewPostgresTransactor returns a PostgresTransactor bound to pool.
// verifiers is an optional argument: only the first element is used, and the
// transactor's verifier stays nil when none is given.
func NewPostgresTransactor(pool *pgxpool.Pool, verifiers ...*authority.Verifier) *PostgresTransactor {
	transactor := &PostgresTransactor{pool: pool}
	if len(verifiers) != 0 {
		transactor.authority = verifiers[0]
	}
	return transactor
}
// WithinTransaction delegates to postgresplatform.WithTransaction on the
// transactor's pool and invokes fn with a Store whose queries go through the
// provided pgx.Tx. The error returned by WithTransaction is returned as is.
func (t *PostgresTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error {
	runInTx := func(tx pgx.Tx) error {
		txStore := &postgresStore{db: tx, authority: t.authority}
		return fn(txStore)
	}
	return postgresplatform.WithTransaction(ctx, t.pool, runInTx)
}
// RemoteSessions returns a remote session repository sharing the store's
// database handle.
func (s *postgresStore) RemoteSessions() RemoteSessionRepository {
	repo := new(postgresRemoteSessionRepository)
	repo.db = s.db
	return repo
}

// SessionAttachments returns a session attachment repository sharing the
// store's database handle.
func (s *postgresStore) SessionAttachments() SessionAttachmentRepository {
	repo := new(postgresSessionAttachmentRepository)
	repo.db = s.db
	return repo
}

// ResourcePolicies returns a resource policy repository sharing the store's
// database handle.
func (s *postgresStore) ResourcePolicies() ResourcePolicyRepository {
	repo := new(postgresResourcePolicyRepository)
	repo.db = s.db
	return repo
}

// ResourceRuntime returns a resource runtime repository sharing the store's
// database handle.
func (s *postgresStore) ResourceRuntime() ResourceRuntimeRepository {
	repo := new(postgresResourceRuntimeRepository)
	repo.db = s.db
	return repo
}

// AuditEvents returns an audit event repository sharing the store's database
// handle.
func (s *postgresStore) AuditEvents() AuditEventRepository {
	repo := new(postgresAuditEventRepository)
	repo.db = s.db
	return repo
}

// Access returns an access repository sharing the store's database handle and
// its (possibly nil) authority verifier.
func (s *postgresStore) Access() AccessRepository {
	repo := new(postgresAccessRepository)
	repo.db = s.db
	repo.authority = s.authority
	return repo
}
// postgresRemoteSessionRepository runs remote_sessions queries on db.
type postgresRemoteSessionRepository struct {
db postgresplatform.DBTX
}
// postgresSessionAttachmentRepository runs session_attachments queries on db.
type postgresSessionAttachmentRepository struct {
db postgresplatform.DBTX
}
// postgresResourcePolicyRepository runs resource_policies queries on db.
type postgresResourcePolicyRepository struct {
db postgresplatform.DBTX
}
// postgresResourceRuntimeRepository reads resource runtime data through db.
type postgresResourceRuntimeRepository struct {
db postgresplatform.DBTX
}
// postgresAuditEventRepository accesses audit events through db.
type postgresAuditEventRepository struct {
db postgresplatform.DBTX
}
// postgresAccessRepository performs access lookups through db; authority is
// the optional verifier handed down from the store and may be nil.
type postgresAccessRepository struct {
db postgresplatform.DBTX
authority *authority.Verifier
}
// Create inserts session as a new remote_sessions row. An empty WorkerID is
// stored as NULL (NULLIF in the statement) and Metadata is passed through
// jsonPayload before being cast to jsonb. RenderQualityProfile is not part of
// the inserted column list. Any Exec error is wrapped with
// "create remote session".
func (r *postgresRemoteSessionRepository) Create(ctx context.Context, session RemoteSession) error {
	const query = `
INSERT INTO remote_sessions (
id, organization_id, resource_id, protocol, state, worker_id, controller_user_id, detach_deadline_at,
last_heartbeat_at, takeover_version, metadata, created_at, updated_at
) VALUES (
$1::uuid, $2::uuid, $3::uuid, $4, $5, NULLIF($6, ''), $7::uuid, $8, $9, $10, $11::jsonb, $12, $13
)
`
	// Positional arguments, in placeholder order $1..$13.
	args := []any{
		session.ID,
		session.OrganizationID,
		session.ResourceID,
		session.Protocol,
		session.State,
		session.WorkerID,
		session.ControllerUserID,
		session.DetachDeadlineAt,
		session.LastHeartbeatAt,
		session.TakeoverVersion,
		jsonPayload(session.Metadata),
		session.CreatedAt,
		session.UpdatedAt,
	}

	_, execErr := r.db.Exec(ctx, query, args...)
	if execErr != nil {
		return fmt.Errorf("create remote session: %w", execErr)
	}
	return nil
}
// GetByID loads a remote session by ID without any row lock. It returns
// (nil, nil) when no row matches.
func (r *postgresRemoteSessionRepository) GetByID(ctx context.Context, sessionID string) (*RemoteSession, error) {
	const noLock = ""
	return r.getByID(ctx, sessionID, noLock)
}

// GetByIDForUpdate loads a remote session by ID with " FOR UPDATE" appended to
// the query. It returns (nil, nil) when no row matches.
func (r *postgresRemoteSessionRepository) GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error) {
	const rowLock = " FOR UPDATE"
	return r.getByID(ctx, sessionID, rowLock)
}
// getByID selects one remote_sessions row by ID. suffix is appended verbatim
// to the SQL text, so it must be a trusted constant (the callers in this file
// pass "" or " FOR UPDATE"). A missing row yields (nil, nil); any other scan
// error is returned unchanged.
func (r *postgresRemoteSessionRepository) getByID(ctx context.Context, sessionID string, suffix string) (*RemoteSession, error) {
	const selectByID = `
SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text,
detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at
FROM remote_sessions
WHERE id = $1::uuid`

	row := r.db.QueryRow(ctx, selectByID+suffix, sessionID)
	found, scanErr := scanRemoteSession(row)
	switch {
	case errors.Is(scanErr, pgx.ErrNoRows):
		return nil, nil
	default:
		return found, scanErr
	}
}
// ListByController returns every remote session whose controller_user_id
// equals userID, most recently updated first. The result is nil when no row
// matches.
//
// Errors: a failed query or a failed row iteration is wrapped with
// "list remote sessions"; a scan error is returned as produced by
// scanRemoteSession. On any error the returned slice is nil, so callers never
// see a partially filled result.
func (r *postgresRemoteSessionRepository) ListByController(ctx context.Context, userID string) ([]RemoteSession, error) {
	const query = `
SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text,
detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at
FROM remote_sessions
WHERE controller_user_id = $1::uuid
ORDER BY updated_at DESC
`
	rows, err := r.db.Query(ctx, query, userID)
	if err != nil {
		return nil, fmt.Errorf("list remote sessions: %w", err)
	}
	defer rows.Close()

	var sessions []RemoteSession
	for rows.Next() {
		item, err := scanRemoteSession(rows)
		if err != nil {
			return nil, err
		}
		sessions = append(sessions, *item)
	}
	// rows.Next returning false can mean "done" or "failed"; rows.Err tells
	// them apart. Wrap the failure like the query error above and drop the
	// rows read so far.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("list remote sessions: iterate rows: %w", err)
	}
	return sessions, nil
}
// CountLiveByResource counts the remote sessions of resourceID whose state is
// one of 'starting', 'active', 'detached' or 'reconnecting'. A query or scan
// failure is wrapped with "count live remote sessions" and reported with a
// zero count.
func (r *postgresRemoteSessionRepository) CountLiveByResource(ctx context.Context, resourceID string) (int, error) {
	const query = `
SELECT COUNT(*)
FROM remote_sessions
WHERE resource_id = $1::uuid AND state IN ('starting', 'active', 'detached', 'reconnecting')
`
	var liveCount int
	row := r.db.QueryRow(ctx, query, resourceID)
	if scanErr := row.Scan(&liveCount); scanErr != nil {
		return 0, fmt.Errorf("count live remote sessions: %w", scanErr)
	}
	return liveCount, nil
}
// ListDetachedExpired returns up to limit remote sessions in state 'detached'
// whose detach_deadline_at is set and not later than before, earliest
// deadline first. The result is nil when no row matches.
//
// Errors: a failed query or a failed row iteration is wrapped with
// "list detached expired sessions"; a scan error is returned as produced by
// scanRemoteSession. On any error the returned slice is nil, so callers never
// see a partially filled result.
func (r *postgresRemoteSessionRepository) ListDetachedExpired(ctx context.Context, before time.Time, limit int) ([]RemoteSession, error) {
	const query = `
SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text,
detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at
FROM remote_sessions
WHERE state = 'detached' AND detach_deadline_at IS NOT NULL AND detach_deadline_at <= $1
ORDER BY detach_deadline_at ASC
LIMIT $2
`
	rows, err := r.db.Query(ctx, query, before, limit)
	if err != nil {
		return nil, fmt.Errorf("list detached expired sessions: %w", err)
	}
	defer rows.Close()

	var sessions []RemoteSession
	for rows.Next() {
		item, err := scanRemoteSession(rows)
		if err != nil {
			return nil, err
		}
		sessions = append(sessions, *item)
	}
	// rows.Next returning false can mean "done" or "failed"; rows.Err tells
	// them apart. Wrap the failure like the query error above and drop the
	// rows read so far.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("list detached expired sessions: iterate rows: %w", err)
	}
	return sessions, nil
}
// UpdateState overwrites state, worker_id, detach_deadline_at,
// last_heartbeat_at, takeover_version and updated_at of the remote session
// identified by params.RemoteSessionID. An empty WorkerID is stored as NULL.
// The number of affected rows is not inspected, so an unknown ID is not
// reported as an error. Exec failures are wrapped with
// "update remote session state".
func (r *postgresRemoteSessionRepository) UpdateState(ctx context.Context, params UpdateRemoteSessionStateParams) error {
	const query = `
UPDATE remote_sessions
SET state = $2,
worker_id = NULLIF($3, ''),
detach_deadline_at = $4,
last_heartbeat_at = $5,
takeover_version = $6,
updated_at = $7
WHERE id = $1::uuid
`
	// Positional arguments, in placeholder order $1..$7.
	args := []any{
		params.RemoteSessionID,
		params.State,
		params.WorkerID,
		params.DetachDeadlineAt,
		params.LastHeartbeatAt,
		params.TakeoverVersion,
		params.UpdatedAt,
	}

	_, execErr := r.db.Exec(ctx, query, args...)
	if execErr != nil {
		return fmt.Errorf("update remote session state: %w", execErr)
	}
	return nil
}
// Create inserts attachment as a new session_attachments row. SupersededBy
// and TakeoverOf are passed through stringValue and stored as NULL when that
// yields an empty string (NULLIF in the statement); Metadata is passed
// through jsonPayload before being cast to jsonb. Exec failures are wrapped
// with "create session attachment".
func (r *postgresSessionAttachmentRepository) Create(ctx context.Context, attachment SessionAttachment) error {
	const query = `
INSERT INTO session_attachments (
id, remote_session_id, user_id, device_id, role, state, superseded_by,
takeover_of, attached_at, detached_at, last_input_at, metadata, created_at, updated_at
) VALUES (
$1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6, NULLIF($7, '')::uuid,
NULLIF($8, '')::uuid, $9, $10, $11, $12::jsonb, $13, $14
)
`
	// Positional arguments, in placeholder order $1..$14.
	args := []any{
		attachment.ID,
		attachment.RemoteSessionID,
		attachment.UserID,
		attachment.DeviceID,
		attachment.Role,
		attachment.State,
		stringValue(attachment.SupersededBy),
		stringValue(attachment.TakeoverOf),
		attachment.AttachedAt,
		attachment.DetachedAt,
		attachment.LastInputAt,
		jsonPayload(attachment.Metadata),
		attachment.CreatedAt,
		attachment.UpdatedAt,
	}

	_, execErr := r.db.Exec(ctx, query, args...)
	if execErr != nil {
		return fmt.Errorf("create session attachment: %w", execErr)
	}
	return nil
}
// GetByID loads a session attachment by ID without any row lock. It returns
// (nil, nil) when no row matches.
func (r *postgresSessionAttachmentRepository) GetByID(ctx context.Context, attachmentID string) (*SessionAttachment, error) {
	const noLock = ""
	return r.getByID(ctx, attachmentID, noLock)
}

// GetByIDForUpdate loads a session attachment by ID with " FOR UPDATE"
// appended to the query. It returns (nil, nil) when no row matches.
func (r *postgresSessionAttachmentRepository) GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error) {
	const rowLock = " FOR UPDATE"
	return r.getByID(ctx, attachmentID, rowLock)
}
// getByID selects one session_attachments row by ID. suffix is appended
// verbatim to the SQL text, so it must be a trusted constant (the callers in
// this file pass "" or " FOR UPDATE"). A missing row yields (nil, nil); any
// other scan error is returned unchanged.
func (r *postgresSessionAttachmentRepository) getByID(ctx context.Context, attachmentID string, suffix string) (*SessionAttachment, error) {
	const selectByID = `
SELECT id::text, remote_session_id::text, user_id::text, device_id::text, role, state,
superseded_by::text, takeover_of::text, attached_at, detached_at, last_input_at, metadata, created_at, updated_at
FROM session_attachments
WHERE id = $1::uuid`

	row := r.db.QueryRow(ctx, selectByID+suffix, attachmentID)
	found, scanErr := scanSessionAttachment(row)
	switch {
	case errors.Is(scanErr, pgx.ErrNoRows):
		return nil, nil
	default:
		return found, scanErr
	}
}
// ListByRemoteSession returns all attachments of the given remote session,
// without filtering by state and without row locks.
func (r *postgresSessionAttachmentRepository) ListByRemoteSession(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) {
	const noFilter = ""
	return r.listByRemoteSession(ctx, remoteSessionID, noFilter)
}

// ListActiveByRemoteSessionForUpdate returns the attachments of the given
// remote session whose state is 'attaching', 'active' or 'reconnecting',
// selecting them FOR UPDATE.
func (r *postgresSessionAttachmentRepository) ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) {
	const activeLocked = " AND state IN ('attaching', 'active', 'reconnecting') FOR UPDATE"
	return r.listByRemoteSession(ctx, remoteSessionID, activeLocked)
}
// listByRemoteSession selects the session_attachments rows of remoteSessionID.
// suffix is appended verbatim to the SQL text, so it must be a trusted
// constant (the callers in this file pass "" or a fixed state filter with
// FOR UPDATE). The result is nil when no row matches.
//
// Errors: a failed query or a failed row iteration is wrapped with
// "list session attachments"; a scan error is returned as produced by
// scanSessionAttachment. On any error the returned slice is nil, so callers
// never see a partially filled result.
func (r *postgresSessionAttachmentRepository) listByRemoteSession(ctx context.Context, remoteSessionID string, suffix string) ([]SessionAttachment, error) {
	query := `
SELECT id::text, remote_session_id::text, user_id::text, device_id::text, role, state,
superseded_by::text, takeover_of::text, attached_at, detached_at, last_input_at, metadata, created_at, updated_at
FROM session_attachments
WHERE remote_session_id = $1::uuid` + suffix
	rows, err := r.db.Query(ctx, query, remoteSessionID)
	if err != nil {
		return nil, fmt.Errorf("list session attachments: %w", err)
	}
	defer rows.Close()

	var attachments []SessionAttachment
	for rows.Next() {
		item, err := scanSessionAttachment(rows)
		if err != nil {
			return nil, err
		}
		attachments = append(attachments, *item)
	}
	// rows.Next returning false can mean "done" or "failed"; rows.Err tells
	// them apart. Wrap the failure like the query error above and drop the
	// rows read so far.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("list session attachments: iterate rows: %w", err)
	}
	return attachments, nil
}
// UpdateState overwrites state, detached_at, last_input_at and updated_at of
// the attachment identified by params.AttachmentID. The number of affected
// rows is not inspected, so an unknown ID is not reported as an error. Exec
// failures are wrapped with "update session attachment state".
func (r *postgresSessionAttachmentRepository) UpdateState(ctx context.Context, params UpdateSessionAttachmentStateParams) error {
	const query = `
UPDATE session_attachments
SET state = $2,
detached_at = $3,
last_input_at = $4,
updated_at = $5
WHERE id = $1::uuid
`
	// Positional arguments, in placeholder order $1..$5.
	args := []any{
		params.AttachmentID,
		params.State,
		params.DetachedAt,
		params.LastInputAt,
		params.UpdatedAt,
	}

	_, execErr := r.db.Exec(ctx, query, args...)
	if execErr != nil {
		return fmt.Errorf("update session attachment state: %w", execErr)
	}
	return nil
}
// Supersede marks the attachment params.PreviousAttachmentID as 'superseded',
// records params.NextAttachmentID in superseded_by and sets detached_at and
// updated_at. The number of affected rows is not inspected. Exec failures are
// wrapped with "supersede attachment".
func (r *postgresSessionAttachmentRepository) Supersede(ctx context.Context, params SupersedeAttachmentParams) error {
	const query = `
UPDATE session_attachments
SET state = 'superseded',
superseded_by = $2::uuid,
detached_at = $3,
updated_at = $4
WHERE id = $1::uuid
`
	// Positional arguments, in placeholder order $1..$4.
	args := []any{
		params.PreviousAttachmentID,
		params.NextAttachmentID,
		params.DetachedAt,
		params.UpdatedAt,
	}

	_, execErr := r.db.Exec(ctx, query, args...)
	if execErr != nil {
		return fmt.Errorf("supersede attachment: %w", execErr)
	}
	return nil
}
// GetByResourceID loads the resource_policies row of resourceID. A NULL
// file_transfer_mode is read as 'disabled' (COALESCE in the query). A missing
// row yields (nil, nil); any other scan error is returned unchanged.
func (r *postgresResourcePolicyRepository) GetByResourceID(ctx context.Context, resourceID string) (*ResourcePolicy, error) {
	const query = `
SELECT resource_id::text, max_concurrent_sessions, takeover_policy, require_trusted_device,
detach_grace_period_seconds, clipboard_enabled, clipboard_mode, file_transfer_enabled,
COALESCE(file_transfer_mode, 'disabled'), created_at, updated_at
FROM resource_policies
WHERE resource_id = $1::uuid
`
	row := r.db.QueryRow(ctx, query, resourceID)
	found, scanErr := scanResourcePolicy(row)
	switch {
	case errors.Is(scanErr, pgx.ErrNoRows):
		return nil, nil
	default:
		return found, scanErr
	}
}
// Upsert inserts the policy row for policy.ResourceID or, when one already
// exists, overwrites every column except resource_id and created_at.
//
// The clipboard and file transfer modes are normalized first. The two
// *_enabled columns are derived from the normalized modes rather than read
// from policy.ClipboardEnabled / policy.FileTransferEnabled:
// clipboard_enabled is true unless the mode is disabled, and
// file_transfer_enabled is whatever fileTransferAllowsClientToServer reports
// for the mode. The grace period is stored as whole seconds (fraction
// truncated). Exec failures are wrapped with "upsert resource policy".
func (r *postgresResourcePolicyRepository) Upsert(ctx context.Context, policy ResourcePolicy) error {
	const query = `
INSERT INTO resource_policies (
resource_id, max_concurrent_sessions, takeover_policy, require_trusted_device,
detach_grace_period_seconds, clipboard_enabled, clipboard_mode, file_transfer_enabled, file_transfer_mode, created_at, updated_at
) VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
ON CONFLICT (resource_id) DO UPDATE SET
max_concurrent_sessions = EXCLUDED.max_concurrent_sessions,
takeover_policy = EXCLUDED.takeover_policy,
require_trusted_device = EXCLUDED.require_trusted_device,
detach_grace_period_seconds = EXCLUDED.detach_grace_period_seconds,
clipboard_enabled = EXCLUDED.clipboard_enabled,
clipboard_mode = EXCLUDED.clipboard_mode,
file_transfer_enabled = EXCLUDED.file_transfer_enabled,
file_transfer_mode = EXCLUDED.file_transfer_mode,
updated_at = EXCLUDED.updated_at
`
	clipMode := normalizeClipboardMode(policy.ClipboardMode)
	transferMode := normalizeFileTransferMode(policy.FileTransferMode)

	// Positional arguments, in placeholder order $1..$11.
	args := []any{
		policy.ResourceID,
		policy.MaxConcurrentSessions,
		policy.TakeoverPolicy,
		policy.RequireTrustedDevice,
		int(policy.DetachGracePeriod.Seconds()),
		clipMode != ResourceClipboardModeDisabled,
		clipMode,
		fileTransferAllowsClientToServer(transferMode),
		transferMode,
		policy.CreatedAt,
		policy.UpdatedAt,
	}

	_, execErr := r.db.Exec(ctx, query, args...)
	if execErr != nil {
		return fmt.Errorf("upsert resource policy: %w", execErr)
	}
	return nil
}
// GetByID loads the runtime-relevant columns of the resource with the given
// ID.
//
// It returns (nil, nil) when the resource does not exist. secret_ref is
// scanned through a *string so the result's SecretRef can be nil.
func (r *postgresResourceRuntimeRepository) GetByID(ctx context.Context, resourceID string) (*ResourceRuntimeSpec, error) {
	const query = `
SELECT id::text, organization_id::text, name, address, protocol, secret_ref, certificate_verification_mode, metadata
FROM resources
WHERE id = $1::uuid
`
	var (
		spec      ResourceRuntimeSpec
		secretRef *string
		metadata  []byte
	)
	err := r.db.QueryRow(ctx, query, resourceID).Scan(
		&spec.ID,
		&spec.OrganizationID,
		&spec.Name,
		&spec.Address,
		&spec.Protocol,
		&secretRef,
		&spec.CertificateVerificationMode,
		&metadata,
	)
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, fmt.Errorf("get resource runtime spec: %w", err)
	}
	spec.SecretRef = secretRef
	spec.Metadata = metadata
	return &spec, nil
}
// Create inserts one audit event row.
//
// The optional actor and session references are flattened to strings with
// stringValue; the SQL turns empty strings back into NULL via NULLIF. The
// payload goes through jsonPayload, so the jsonb column always receives
// syntactically valid JSON.
func (r *postgresAuditEventRepository) Create(ctx context.Context, event AuditEvent) error {
	const query = `
INSERT INTO audit_events (
id, actor_user_id, actor_device_id, event_type, target_type, target_id,
remote_session_id, payload, created_at
) VALUES (
$1::uuid, NULLIF($2, '')::uuid, NULLIF($3, '')::uuid, $4, $5, $6,
NULLIF($7, '')::uuid, $8::jsonb, $9
)
`
	_, err := r.db.Exec(ctx, query,
		event.ID,
		stringValue(event.ActorUserID),
		stringValue(event.ActorDeviceID),
		event.EventType,
		event.TargetType,
		event.TargetID,
		stringValue(event.RemoteSessionID),
		jsonPayload(event.Payload),
		event.CreatedAt,
	)
	if err != nil {
		return fmt.Errorf("create audit event: %w", err)
	}
	return nil
}
// IsTrustedDevice reports whether a device row exists that has the given ID,
// belongs to userID, has trust_status 'trusted' and has not been revoked.
//
// The query binds the device ID as $1 and the user ID as $2, which is the
// reverse of this method's parameter order.
func (r *postgresAccessRepository) IsTrustedDevice(ctx context.Context, userID, deviceID string) (bool, error) {
	const query = `
SELECT EXISTS(
SELECT 1 FROM devices
WHERE id = $1::uuid AND user_id = $2::uuid AND trust_status = 'trusted' AND revoked_at IS NULL
)
`
	trusted := false
	err := r.db.QueryRow(ctx, query, deviceID, userID).Scan(&trusted)
	if err != nil {
		return false, fmt.Errorf("check trusted device: %w", err)
	}
	return trusted, nil
}
// GetPlatformRole delegates to authority.EffectivePlatformRole, passing the
// repository's database handle and authority configuration, and returns its
// result unchanged.
func (r *postgresAccessRepository) GetPlatformRole(ctx context.Context, userID string) (string, error) {
	role, err := authority.EffectivePlatformRole(ctx, r.db, r.authority, userID)
	return role, err
}
// GetOrganizationRole returns the role_id of userID's active membership in
// organizationID.
//
// The boolean result is false (with a nil error) when no active membership
// row exists.
func (r *postgresAccessRepository) GetOrganizationRole(ctx context.Context, organizationID, userID string) (string, bool, error) {
	const query = `
SELECT role_id
FROM organization_memberships
WHERE organization_id = $1::uuid AND user_id = $2::uuid AND status = 'active'
`
	var role string
	err := r.db.QueryRow(ctx, query, organizationID, userID).Scan(&role)
	switch {
	case err == nil:
		return role, true, nil
	case errors.Is(err, pgx.ErrNoRows):
		return "", false, nil
	default:
		return "", false, fmt.Errorf("get organization role: %w", err)
	}
}
// scanner is the minimal row abstraction the scan* helpers depend on. In this
// file both the value returned by r.db.QueryRow and the rows iterator from
// r.db.Query are passed where a scanner is expected.
type scanner interface {
// Scan reads the current row into dest.
Scan(dest ...any) error
}
// scanRemoteSession reads one remote-session row into a RemoteSession.
//
// The row must expose thirteen columns in this order: id, organization id,
// resource id, protocol, state, worker id, controller user id, detach
// deadline, last heartbeat, takeover version, metadata, created at, updated
// at. The render quality profile is derived from the metadata bytes.
func scanRemoteSession(row scanner) (*RemoteSession, error) {
	var (
		session          RemoteSession
		detachDeadlineAt *time.Time
		lastHeartbeatAt  *time.Time
		metadata         []byte
	)
	dest := []any{
		&session.ID,
		&session.OrganizationID,
		&session.ResourceID,
		&session.Protocol,
		&session.State,
		&session.WorkerID,
		&session.ControllerUserID,
		&detachDeadlineAt,
		&lastHeartbeatAt,
		&session.TakeoverVersion,
		&metadata,
		&session.CreatedAt,
		&session.UpdatedAt,
	}
	if err := row.Scan(dest...); err != nil {
		return nil, fmt.Errorf("scan remote session: %w", err)
	}
	session.DetachDeadlineAt = detachDeadlineAt
	session.LastHeartbeatAt = lastHeartbeatAt
	session.Metadata = metadata
	session.RenderQualityProfile = renderQualityProfileFromSessionMetadata(metadata)
	return &session, nil
}
// scanSessionAttachment reads one session-attachment row into a
// SessionAttachment.
//
// The row must expose fourteen columns in this order: id, remote session id,
// user id, device id, role, state, superseded by, takeover of, attached at,
// detached at, last input at, metadata, created at, updated at. The two
// references and three timestamps in the middle are scanned through pointers
// so they may come back nil.
func scanSessionAttachment(row scanner) (*SessionAttachment, error) {
	var (
		attachment   SessionAttachment
		supersededBy *string
		takeoverOf   *string
		attachedAt   *time.Time
		detachedAt   *time.Time
		lastInputAt  *time.Time
		metadata     []byte
	)
	dest := []any{
		&attachment.ID,
		&attachment.RemoteSessionID,
		&attachment.UserID,
		&attachment.DeviceID,
		&attachment.Role,
		&attachment.State,
		&supersededBy,
		&takeoverOf,
		&attachedAt,
		&detachedAt,
		&lastInputAt,
		&metadata,
		&attachment.CreatedAt,
		&attachment.UpdatedAt,
	}
	if err := row.Scan(dest...); err != nil {
		return nil, fmt.Errorf("scan session attachment: %w", err)
	}
	attachment.SupersededBy = supersededBy
	attachment.TakeoverOf = takeoverOf
	attachment.AttachedAt = attachedAt
	attachment.DetachedAt = detachedAt
	attachment.LastInputAt = lastInputAt
	attachment.Metadata = metadata
	return &attachment, nil
}
// scanResourcePolicy reads one resource-policy row into a ResourcePolicy.
//
// After scanning, the stored grace period (whole seconds) is converted to a
// time.Duration, both modes are normalized, and the two "enabled" flags are
// recomputed from the normalized modes, overriding whatever booleans were
// stored in the row.
func scanResourcePolicy(row scanner) (*ResourcePolicy, error) {
	var (
		policy             ResourcePolicy
		detachGraceSeconds int
	)
	dest := []any{
		&policy.ResourceID,
		&policy.MaxConcurrentSessions,
		&policy.TakeoverPolicy,
		&policy.RequireTrustedDevice,
		&detachGraceSeconds,
		&policy.ClipboardEnabled,
		&policy.ClipboardMode,
		&policy.FileTransferEnabled,
		&policy.FileTransferMode,
		&policy.CreatedAt,
		&policy.UpdatedAt,
	}
	if err := row.Scan(dest...); err != nil {
		return nil, fmt.Errorf("scan resource policy: %w", err)
	}
	policy.DetachGracePeriod = time.Duration(detachGraceSeconds) * time.Second

	policy.ClipboardMode = normalizeClipboardMode(policy.ClipboardMode)
	policy.ClipboardEnabled = policy.ClipboardMode != ResourceClipboardModeDisabled

	policy.FileTransferMode = normalizeFileTransferMode(policy.FileTransferMode)
	policy.FileTransferEnabled = fileTransferAllowsClientToServer(policy.FileTransferMode)
	return &policy, nil
}
// jsonPayload returns payload unchanged when it is non-empty, syntactically
// valid JSON, and the empty object `{}` otherwise.
func jsonPayload(payload []byte) []byte {
	if len(payload) > 0 && json.Valid(payload) {
		return payload
	}
	return []byte(`{}`)
}
// stringValue dereferences value, mapping a nil pointer to the empty string.
func stringValue(value *string) string {
	if value != nil {
		return *value
	}
	return ""
}
@@ -0,0 +1,140 @@
package sessionbroker
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/redis/go-redis/v9"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
)
// RedisLiveStateStore keeps live session state, controller bindings, attach
// tokens, attachment heartbeats and worker routes in Redis, JSON-encoded
// under "live:"-prefixed keys.
type RedisLiveStateStore struct {
// client is the Redis connection every operation goes through.
client *redis.Client
}
// NewRedisLiveStateStore returns a store that uses client for all Redis
// operations. The client is stored as given.
func NewRedisLiveStateStore(client *redis.Client) *RedisLiveStateStore {
	store := new(RedisLiveStateStore)
	store.client = client
	return store
}
// UpsertSession stores state as JSON under the session's live-state key,
// replacing any previous value. The key is written without an expiration.
func (s *RedisLiveStateStore) UpsertSession(ctx context.Context, state LiveSessionState) error {
	// A zero TTL is passed through to the client's Set call.
	const noExpiry time.Duration = 0
	key := liveSessionKey(state.SessionID)
	return s.setJSON(ctx, key, state, noExpiry)
}
// GetSession loads the live state for sessionID. It returns (nil, nil) when
// no live state is stored.
func (s *RedisLiveStateStore) GetSession(ctx context.Context, sessionID string) (*LiveSessionState, error) {
	state := new(LiveSessionState)
	found, err := s.getJSON(ctx, liveSessionKey(sessionID), state)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}
	return state, nil
}
// DeleteSession removes the live-state key for sessionID and returns the
// client's error, unwrapped.
func (s *RedisLiveStateStore) DeleteSession(ctx context.Context, sessionID string) error {
	key := liveSessionKey(sessionID)
	result := s.client.Del(ctx, key)
	return result.Err()
}
// BindController stores binding as JSON under the controller key of
// binding.SessionID, with the given ttl.
func (s *RedisLiveStateStore) BindController(ctx context.Context, binding sessioncontracts.ControllerBinding, ttl time.Duration) error {
	key := controllerBindingKey(binding.SessionID)
	return s.setJSON(ctx, key, binding, ttl)
}
// GetControllerBinding loads the controller binding for sessionID. It returns
// (nil, nil) when no binding is stored.
func (s *RedisLiveStateStore) GetControllerBinding(ctx context.Context, sessionID string) (*sessioncontracts.ControllerBinding, error) {
	binding := new(sessioncontracts.ControllerBinding)
	found, err := s.getJSON(ctx, controllerBindingKey(sessionID), binding)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}
	return binding, nil
}
// ClearControllerBinding removes the controller-binding key for sessionID and
// returns the client's error, unwrapped.
func (s *RedisLiveStateStore) ClearControllerBinding(ctx context.Context, sessionID string) error {
	key := controllerBindingKey(sessionID)
	result := s.client.Del(ctx, key)
	return result.Err()
}
// StoreAttachToken stores claims as JSON under the key derived from
// claims.Token, with the given ttl.
func (s *RedisLiveStateStore) StoreAttachToken(ctx context.Context, claims sessioncontracts.AttachTokenClaims, ttl time.Duration) error {
	key := attachTokenKey(claims.Token)
	return s.setJSON(ctx, key, claims, ttl)
}
// ConsumeAttachToken fetches and deletes the claims stored for token in a
// single GETDEL command.
//
// It returns (nil, nil) when the token key does not exist. Because the key is
// already deleted by the time decoding happens, a payload that fails to
// decode is lost.
func (s *RedisLiveStateStore) ConsumeAttachToken(ctx context.Context, token string) (*sessioncontracts.AttachTokenClaims, error) {
	payload, err := s.client.GetDel(ctx, attachTokenKey(token)).Result()
	switch {
	case err == redis.Nil:
		return nil, nil
	case err != nil:
		return nil, fmt.Errorf("consume attach token: %w", err)
	}

	claims := new(sessioncontracts.AttachTokenClaims)
	if err := json.Unmarshal([]byte(payload), claims); err != nil {
		return nil, fmt.Errorf("decode attach token: %w", err)
	}
	return claims, nil
}
// TouchAttachmentHeartbeat writes the current UTC time, formatted as
// RFC 3339 with nanoseconds, to the heartbeat key of the given attachment,
// with the given ttl. The client's error is returned unwrapped.
func (s *RedisLiveStateStore) TouchAttachmentHeartbeat(ctx context.Context, sessionID, attachmentID string, ttl time.Duration) error {
	key := attachmentHeartbeatKey(sessionID, attachmentID)
	stamp := time.Now().UTC().Format(time.RFC3339Nano)
	return s.client.Set(ctx, key, stamp, ttl).Err()
}
// UpdateWorkerRoute stores route as JSON under the worker-route key of
// route.SessionID, with the given ttl.
func (s *RedisLiveStateStore) UpdateWorkerRoute(ctx context.Context, route WorkerRoute, ttl time.Duration) error {
	key := workerRouteKey(route.SessionID)
	return s.setJSON(ctx, key, route, ttl)
}
// GetWorkerRoute loads the worker route for sessionID. It returns (nil, nil)
// when no route is stored.
func (s *RedisLiveStateStore) GetWorkerRoute(ctx context.Context, sessionID string) (*WorkerRoute, error) {
	route := new(WorkerRoute)
	found, err := s.getJSON(ctx, workerRouteKey(sessionID), route)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}
	return route, nil
}
// DeleteWorkerRoute removes the worker-route key for sessionID and returns
// the client's error, unwrapped.
func (s *RedisLiveStateStore) DeleteWorkerRoute(ctx context.Context, sessionID string) error {
	key := workerRouteKey(sessionID)
	result := s.client.Del(ctx, key)
	return result.Err()
}
// setJSON marshals value to JSON and writes it to key with the given ttl.
// Encoding and write failures are wrapped separately; the write error
// includes the key name.
func (s *RedisLiveStateStore) setJSON(ctx context.Context, key string, value any, ttl time.Duration) error {
	encoded, err := json.Marshal(value)
	if err != nil {
		return fmt.Errorf("encode redis payload: %w", err)
	}
	err = s.client.Set(ctx, key, encoded, ttl).Err()
	if err != nil {
		return fmt.Errorf("set redis key %s: %w", key, err)
	}
	return nil
}
// getJSON reads key and unmarshals its JSON value into dest.
//
// The boolean result is true only when the key existed and decoded cleanly.
// A missing key yields (false, nil); read and decode failures yield false
// with a wrapped error that names the key.
func (s *RedisLiveStateStore) getJSON(ctx context.Context, key string, dest any) (bool, error) {
	raw, err := s.client.Get(ctx, key).Result()
	switch {
	case err == redis.Nil:
		return false, nil
	case err != nil:
		return false, fmt.Errorf("get redis key %s: %w", key, err)
	}
	if decodeErr := json.Unmarshal([]byte(raw), dest); decodeErr != nil {
		return false, fmt.Errorf("decode redis key %s: %w", key, decodeErr)
	}
	return true, nil
}
// liveSessionKey returns the Redis key holding the live state of sessionID.
func liveSessionKey(sessionID string) string {
	key := "live:session:"
	key += sessionID
	return key
}
// controllerBindingKey returns the Redis key holding the controller binding
// of sessionID.
func controllerBindingKey(sessionID string) string {
	key := "live:session:" + sessionID
	key += ":controller"
	return key
}
// attachTokenKey returns the Redis key under which the claims for an attach
// token are stored.
func attachTokenKey(token string) string {
	key := "live:attach:"
	key += token
	return key
}
// attachmentHeartbeatKey returns the Redis key holding the heartbeat
// timestamp of one attachment within a session.
func attachmentHeartbeatKey(sessionID, attachmentID string) string {
	key := "live:session:" + sessionID
	key += ":attachment:" + attachmentID
	key += ":heartbeat"
	return key
}
// workerRouteKey returns the Redis key holding the worker route of sessionID.
func workerRouteKey(sessionID string) string {
	key := "live:session:" + sessionID
	key += ":worker-route"
	return key
}
@@ -0,0 +1,85 @@
package sessionbroker
import (
"context"
"time"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
)
// RemoteSessionRepository is the persistence contract for remote session
// records.
//
// NOTE(review): the in-package test fake returns (nil, nil) from GetByID for
// an unknown ID; confirm the production implementation follows the same
// not-found convention.
type RemoteSessionRepository interface {
// Create stores a new remote session.
Create(ctx context.Context, session RemoteSession) error
// GetByID looks up a session by its ID.
GetByID(ctx context.Context, sessionID string) (*RemoteSession, error)
// GetByIDForUpdate looks up a session by its ID; presumably it also locks
// the row for the surrounding transaction — TODO confirm.
GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error)
// ListByController lists sessions for the given controller user ID.
ListByController(ctx context.Context, userID string) ([]RemoteSession, error)
// CountLiveByResource counts live sessions for a resource; which states
// count as "live" is decided by the implementation.
CountLiveByResource(ctx context.Context, resourceID string) (int, error)
// ListDetachedExpired lists at most limit sessions; presumably detached
// sessions whose detach deadline lies before the given time — TODO confirm.
ListDetachedExpired(ctx context.Context, before time.Time, limit int) ([]RemoteSession, error)
// UpdateState applies params to the session named by params.RemoteSessionID.
UpdateState(ctx context.Context, params UpdateRemoteSessionStateParams) error
}
// SessionAttachmentRepository is the persistence contract for the attachment
// records of remote sessions.
type SessionAttachmentRepository interface {
// Create stores a new attachment.
Create(ctx context.Context, attachment SessionAttachment) error
// GetByID looks up an attachment by its ID.
GetByID(ctx context.Context, attachmentID string) (*SessionAttachment, error)
// GetByIDForUpdate looks up an attachment by its ID; presumably it also
// locks the row for the surrounding transaction — TODO confirm.
GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error)
// ListByRemoteSession lists the attachments of one remote session.
ListByRemoteSession(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error)
// ListActiveByRemoteSessionForUpdate lists the active attachments of one
// remote session; presumably with row locks — TODO confirm.
ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error)
// UpdateState writes the state and timestamps in params to the attachment
// named by params.AttachmentID.
UpdateState(ctx context.Context, params UpdateSessionAttachmentStateParams) error
// Supersede marks params.PreviousAttachmentID as superseded by
// params.NextAttachmentID.
Supersede(ctx context.Context, params SupersedeAttachmentParams) error
}
// ResourcePolicyRepository is the persistence contract for per-resource
// session policies.
type ResourcePolicyRepository interface {
// GetByResourceID loads the policy of a resource. The Postgres
// implementation returns (nil, nil) when the resource has no policy row.
GetByResourceID(ctx context.Context, resourceID string) (*ResourcePolicy, error)
// Upsert creates the policy for policy.ResourceID or replaces the existing
// one.
Upsert(ctx context.Context, policy ResourcePolicy) error
}
// AuditEventRepository is the write-only persistence contract for audit
// events.
type AuditEventRepository interface {
// Create stores one audit event.
Create(ctx context.Context, event AuditEvent) error
}
// Store groups every repository the session broker uses behind a single
// handle. A Store is what Transactor.WithinTransaction passes to its
// callback.
type Store interface {
// RemoteSessions returns the remote session repository.
RemoteSessions() RemoteSessionRepository
// SessionAttachments returns the session attachment repository.
SessionAttachments() SessionAttachmentRepository
// ResourcePolicies returns the resource policy repository.
ResourcePolicies() ResourcePolicyRepository
// ResourceRuntime returns the resource runtime-spec repository.
ResourceRuntime() ResourceRuntimeRepository
// AuditEvents returns the audit event repository.
AuditEvents() AuditEventRepository
// Access returns the access (trust and role) repository.
Access() AccessRepository
}
// Transactor runs a unit of work against a Store.
type Transactor interface {
// WithinTransaction calls fn with a Store. Presumably the Store is bound to
// a single database transaction that is committed when fn returns nil and
// rolled back otherwise — TODO confirm against the implementation. The
// in-package test fake just invokes fn with its store and returns fn's
// result.
WithinTransaction(ctx context.Context, fn func(store Store) error) error
}
// UpdateRemoteSessionStateParams carries the values for
// RemoteSessionRepository.UpdateState.
//
// The in-package test fake overwrites every listed field on the stored
// session, so callers should supply the full desired values rather than only
// the ones that changed. NOTE(review): confirm the production implementation
// behaves the same way.
type UpdateRemoteSessionStateParams struct {
// RemoteSessionID selects the session to update.
RemoteSessionID string
// State is the new session state.
State sessioncontracts.State
// WorkerID is the worker ID to record on the session.
WorkerID string
// DetachDeadlineAt is the new detach deadline; may be nil.
DetachDeadlineAt *time.Time
// LastHeartbeatAt is the new last-heartbeat time; may be nil.
LastHeartbeatAt *time.Time
// TakeoverVersion is the takeover version to record.
TakeoverVersion int
// UpdatedAt is the modification timestamp to record.
UpdatedAt time.Time
}
// UpdateSessionAttachmentStateParams carries the values for
// SessionAttachmentRepository.UpdateState.
//
// The Postgres implementation assigns state, detached_at, last_input_at and
// updated_at unconditionally, so a nil DetachedAt or LastInputAt is passed
// straight through as the new column value (presumably NULL — verify against
// the driver's encoding of nil pointers).
type UpdateSessionAttachmentStateParams struct {
// AttachmentID selects the attachment to update.
AttachmentID string
// State is the new attachment state.
State AttachmentState
// DetachedAt is the new detach time; may be nil.
DetachedAt *time.Time
// LastInputAt is the new last-input time; may be nil.
LastInputAt *time.Time
// UpdatedAt is the modification timestamp to record.
UpdatedAt time.Time
}
// SupersedeAttachmentParams carries the values for
// SessionAttachmentRepository.Supersede.
type SupersedeAttachmentParams struct {
// PreviousAttachmentID is the attachment being marked as superseded.
PreviousAttachmentID string
// NextAttachmentID is recorded as the superseding attachment.
NextAttachmentID string
// DetachedAt is recorded as the previous attachment's detach time.
DetachedAt time.Time
// UpdatedAt is the modification timestamp to record.
UpdatedAt time.Time
}
// AccessRepository answers the trust and role questions the session broker
// needs for authorization decisions.
type AccessRepository interface {
// IsTrustedDevice reports whether deviceID is a trusted device of userID.
// The Postgres implementation additionally requires that the device is not
// revoked.
IsTrustedDevice(ctx context.Context, userID, deviceID string) (bool, error)
// GetPlatformRole returns the platform-level role of userID.
GetPlatformRole(ctx context.Context, userID string) (string, error)
// GetOrganizationRole returns the role of userID within organizationID. In
// the Postgres implementation the boolean is false when the user has no
// active membership in that organization.
GetOrganizationRole(ctx context.Context, organizationID, userID string) (string, bool, error)
}
// ResourceRuntimeRepository exposes the runtime view of a resource.
type ResourceRuntimeRepository interface {
// GetByID loads the runtime spec of a resource. The Postgres implementation
// returns (nil, nil) when the resource does not exist.
GetByID(ctx context.Context, resourceID string) (*ResourceRuntimeSpec, error)
}
@@ -0,0 +1,138 @@
package sessionbroker
import (
"context"
"encoding/json"
"errors"
"testing"
"github.com/example/remote-access-platform/backend/internal/platform/config"
"github.com/example/remote-access-platform/backend/internal/platform/module"
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
)
// fakeSecretResolver is a test double for the secret resolver passed to
// NewService. It replays a canned outcome and records the last request it
// received.
type fakeSecretResolver struct {
// response is returned by ResolveForSession when err is nil.
response *secrets.ResolvedResourceSecret
// err, when non-nil, is returned instead of response.
err error
// request holds the most recent request passed to ResolveForSession.
request secrets.ResolveResourceSecretRequest
}
// testAppConfig builds an application config for tests with a fixed name and
// the given environment.
func testAppConfig(env string) config.AppConfig {
	cfg := config.AppConfig{Name: "rap-api-test"}
	cfg.Env = env
	return cfg
}
// ResolveForSession records req and then returns the configured error, or
// the configured response when no error is set.
func (r *fakeSecretResolver) ResolveForSession(_ context.Context, req secrets.ResolveResourceSecretRequest) (*secrets.ResolvedResourceSecret, error) {
	r.request = req
	if r.err == nil {
		return r.response, nil
	}
	return nil, r.err
}
// TestRuntimeAssignmentMetadataMergesResolvedSecretWithoutMutatingSessionMetadata
// checks three things about runtimeAssignmentMetadata when a resolver is
// configured:
//   - the resolved secret payload is merged into resource.metadata of the
//     returned assignment metadata,
//   - the session's own metadata bytes are left free of the plaintext secret,
//   - the resolver request carries the lease ID and worker ID.
//
// Nested lookups use checked type assertions so an unexpectedly shaped result
// fails with a diagnostic instead of panicking.
func TestRuntimeAssignmentMetadataMergesResolvedSecretWithoutMutatingSessionMetadata(t *testing.T) {
	resolver := &fakeSecretResolver{
		response: &secrets.ResolvedResourceSecret{
			Descriptor: secrets.ResourceSecretDescriptor{Version: 3},
			Payload:    json.RawMessage(`{"username":"user","password":"secret","domain":"corp"}`),
		},
	}
	service := NewService(module.Dependencies{
		Config: module.Config{App: testAppConfig("production")},
	}, nil, nil, nil, nil, resolver)
	sessionMetadata := mustJSON(t, map[string]any{
		"resource": map[string]any{
			"id":              "resource-1",
			"organization_id": "org-1",
			"secret_ref":      "rap-secret://org/org-1/resources/resource-1/primary",
			"metadata": map[string]any{
				"rdp_host": "host",
			},
		},
	})
	session := RemoteSession{
		ID:             "session-1",
		OrganizationID: "org-1",
		ResourceID:     "resource-1",
		WorkerID:       "worker-1",
		Metadata:       sessionMetadata,
	}

	metadata, secretRef, version, err := service.runtimeAssignmentMetadata(context.Background(), session, &workercontracts.WorkerLease{LeaseID: "lease-1"})
	if err != nil {
		t.Fatalf("runtimeAssignmentMetadata returned error: %v", err)
	}
	if secretRef == "" || version != 3 {
		t.Fatalf("expected secret ref and version, got ref=%q version=%d", secretRef, version)
	}

	resource, ok := metadata["resource"].(map[string]any)
	if !ok {
		t.Fatalf("assignment metadata has no resource object, got %T", metadata["resource"])
	}
	resourceMetadata, ok := resource["metadata"].(map[string]any)
	if !ok {
		t.Fatalf("assignment resource has no metadata object, got %T", resource["metadata"])
	}
	if resourceMetadata["username"] != "user" || resourceMetadata["password"] != "secret" || resourceMetadata["domain"] != "corp" {
		t.Fatalf("resolved secret was not merged: %#v", resourceMetadata)
	}

	// The session's stored metadata must not have been modified in place.
	var persisted map[string]any
	if err := json.Unmarshal(session.Metadata, &persisted); err != nil {
		t.Fatalf("decode persisted metadata: %v", err)
	}
	persistedResource, ok := persisted["resource"].(map[string]any)
	if !ok {
		t.Fatalf("persisted metadata has no resource object, got %T", persisted["resource"])
	}
	persistedMetadata, ok := persistedResource["metadata"].(map[string]any)
	if !ok {
		t.Fatalf("persisted resource has no metadata object, got %T", persistedResource["metadata"])
	}
	if _, leaked := persistedMetadata["password"]; leaked {
		t.Fatalf("session metadata was mutated with plaintext secret")
	}

	if resolver.request.LeaseID != "lease-1" || resolver.request.WorkerID != "worker-1" {
		t.Fatalf("resolver request missed lease/worker proof: %#v", resolver.request)
	}
}
func TestRuntimeAssignmentMetadataRequiresResolverInProduction(t *testing.T) {
service := NewService(module.Dependencies{
Config: module.Config{App: testAppConfig("production")},
}, nil, nil, nil, nil)
session := RemoteSession{
ID: "session-1",
OrganizationID: "org-1",
ResourceID: "resource-1",
WorkerID: "worker-1",
Metadata: mustJSON(t, map[string]any{
"resource": map[string]any{
"secret_ref": "rap-secret://org/org-1/resources/resource-1/primary",
},
}),
}
_, _, _, err := service.runtimeAssignmentMetadata(context.Background(), session, &workercontracts.WorkerLease{LeaseID: "lease-1"})
if !errors.Is(err, secrets.ErrSecretEncryptionKeyMissing) {
t.Fatalf("expected missing resolver error, got %v", err)
}
}
// TestRuntimeAssignmentMetadataAllowsDevelopmentMetadataWithoutResolver checks
// that, with the app environment set to "development" and no resolver passed
// to NewService, credentials already present in the session metadata are
// passed through, no error is returned, and no secret reference is reported.
//
// Nested lookups use checked type assertions so an unexpectedly shaped result
// fails with a diagnostic instead of panicking.
func TestRuntimeAssignmentMetadataAllowsDevelopmentMetadataWithoutResolver(t *testing.T) {
	service := NewService(module.Dependencies{
		Config: module.Config{App: testAppConfig("development")},
	}, nil, nil, nil, nil)
	session := RemoteSession{
		ID:             "session-1",
		OrganizationID: "org-1",
		ResourceID:     "resource-1",
		WorkerID:       "worker-1",
		Metadata: mustJSON(t, map[string]any{
			"resource": map[string]any{
				"secret_ref": "rap-secret://org/org-1/resources/resource-1/primary",
				"metadata": map[string]any{
					"username": "dev-user",
					"password": "dev-password",
				},
			},
		}),
	}

	metadata, secretRef, _, err := service.runtimeAssignmentMetadata(context.Background(), session, nil)
	if err != nil {
		t.Fatalf("development metadata should not require resolver: %v", err)
	}
	if secretRef != "" {
		t.Fatalf("development fallback should not audit resolver use, got %q", secretRef)
	}

	resource, ok := metadata["resource"].(map[string]any)
	if !ok {
		t.Fatalf("assignment metadata has no resource object, got %T", metadata["resource"])
	}
	resourceMetadata, ok := resource["metadata"].(map[string]any)
	if !ok {
		t.Fatalf("assignment resource has no metadata object, got %T", resource["metadata"])
	}
	if resourceMetadata["password"] != "dev-password" {
		t.Fatalf("development metadata was not preserved")
	}
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,391 @@
package sessionbroker
import (
"context"
"io"
"log/slog"
"testing"
"time"
"github.com/example/remote-access-platform/backend/internal/platform/module"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
)
// TestHandleWorkerConnectedIgnoresTerminalSession checks that a "connected"
// event for an already terminated session returns no error, leaves the
// stored state alone, performs no repository update, and does not recreate
// live state.
func TestHandleWorkerConnectedIgnoresTerminalSession(t *testing.T) {
	const sessionID = "session-1"
	service, store, live, _ := newStaleWorkerEventTestService()
	store.remote.sessions[sessionID] = RemoteSession{
		ID:       sessionID,
		State:    sessioncontracts.StateTerminated,
		WorkerID: "worker-1",
	}

	err := service.HandleWorkerConnected(context.Background(), sessionID)
	if err != nil {
		t.Fatalf("HandleWorkerConnected returned error for stale terminal event: %v", err)
	}

	state := store.remote.sessions[sessionID].State
	if state != sessioncontracts.StateTerminated {
		t.Fatalf("stale connected event changed terminal state to %q", state)
	}
	if updates := store.remote.updateCount; updates != 0 {
		t.Fatalf("stale connected event updated authoritative session %d times", updates)
	}
	if upserts := live.upsertCount; upserts != 0 {
		t.Fatalf("stale connected event recreated live state %d times", upserts)
	}
}
// TestUpdateWorkerRenderTelemetryIgnoresTerminalSession checks that render
// telemetry for an already terminated session returns no error and neither
// upserts nor leaves behind any live state.
func TestUpdateWorkerRenderTelemetryIgnoresTerminalSession(t *testing.T) {
	const sessionID = "session-1"
	service, store, live, _ := newStaleWorkerEventTestService()
	store.remote.sessions[sessionID] = RemoteSession{
		ID:       sessionID,
		State:    sessioncontracts.StateTerminated,
		WorkerID: "worker-1",
	}

	telemetry := map[string]any{
		"render_state":   "ready",
		"width":          1280,
		"height":         720,
		"frame_sequence": int64(99),
		"frame_data":     "stale-frame",
	}
	if err := service.UpdateWorkerRenderTelemetry(context.Background(), sessionID, telemetry); err != nil {
		t.Fatalf("UpdateWorkerRenderTelemetry returned error for stale terminal event: %v", err)
	}

	if upserts := live.upsertCount; upserts != 0 {
		t.Fatalf("stale render event recreated live state %d times", upserts)
	}
	if leftover := live.sessions[sessionID]; leftover != nil {
		t.Fatalf("stale render event left live state behind: %#v", leftover)
	}
}
// TestMarkSessionFailedTransitionsActiveSession checks the full effect of
// failing an active session: the session becomes failed, its attachment is
// closed, exactly one audit event is written, live state is deleted, and the
// session lease is released once.
func TestMarkSessionFailedTransitionsActiveSession(t *testing.T) {
	const (
		sessionID    = "session-1"
		attachmentID = "attachment-1"
	)
	service, store, live, orchestrator := newStaleWorkerEventTestService()
	store.remote.sessions[sessionID] = RemoteSession{
		ID:              sessionID,
		State:           sessioncontracts.StateActive,
		WorkerID:        "worker-1",
		TakeoverVersion: 3,
	}
	store.attachments.items[attachmentID] = SessionAttachment{
		ID:              attachmentID,
		RemoteSessionID: sessionID,
		State:           AttachmentStateActive,
	}
	live.sessions[sessionID] = &LiveSessionState{SessionID: sessionID, State: sessioncontracts.StateActive}

	command := MarkSessionFailedCommand{SessionID: sessionID, Reason: "worker_lost"}
	if err := service.MarkSessionFailed(context.Background(), command); err != nil {
		t.Fatalf("MarkSessionFailed returned error: %v", err)
	}

	if state := store.remote.sessions[sessionID].State; state != sessioncontracts.StateFailed {
		t.Fatalf("expected failed state, got %q", state)
	}
	if state := store.attachments.items[attachmentID].State; state != AttachmentStateClosed {
		t.Fatalf("expected attachment closed, got %q", state)
	}
	if store.audit.createCount != 1 {
		t.Fatalf("expected one audit event, got %d", store.audit.createCount)
	}
	if live.sessions[sessionID] != nil {
		t.Fatal("expected failed session live state to be deleted")
	}
	if orchestrator.releaseCount != 1 {
		t.Fatalf("expected session lease release, got %d", orchestrator.releaseCount)
	}
}
// TestMarkSessionFailedAlreadyFailedIsIdempotent checks that failing a
// session that is already failed returns no error and leaves it failed.
func TestMarkSessionFailedAlreadyFailedIsIdempotent(t *testing.T) {
	const sessionID = "session-1"
	service, store, _, _ := newStaleWorkerEventTestService()
	store.remote.sessions[sessionID] = RemoteSession{
		ID:              sessionID,
		State:           sessioncontracts.StateFailed,
		WorkerID:        "worker-1",
		TakeoverVersion: 1,
	}

	command := MarkSessionFailedCommand{SessionID: sessionID, Reason: "duplicate_worker_failure"}
	if err := service.MarkSessionFailed(context.Background(), command); err != nil {
		t.Fatalf("duplicate MarkSessionFailed returned error: %v", err)
	}

	if state := store.remote.sessions[sessionID].State; state != sessioncontracts.StateFailed {
		t.Fatalf("duplicate terminal event changed state to %q", state)
	}
}
// newStaleWorkerEventTestService wires a Service to in-memory fakes and
// returns the service together with the fakes so tests can seed and inspect
// them. Logging is discarded and the service clock is pinned to Unix second
// 100 (UTC).
func newStaleWorkerEventTestService() (*Service, *staleWorkerEventTestStore, *staleWorkerEventLiveState, *staleWorkerEventOrchestrator) {
	store := &staleWorkerEventTestStore{
		remote:      &staleWorkerEventRemoteSessions{sessions: map[string]RemoteSession{}},
		attachments: &staleWorkerEventAttachments{items: map[string]SessionAttachment{}},
		policies:    &staleWorkerEventPolicies{},
		audit:       &staleWorkerEventAudit{},
	}
	live := &staleWorkerEventLiveState{sessions: map[string]*LiveSessionState{}}
	orchestrator := &staleWorkerEventOrchestrator{}

	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
	deps := module.Dependencies{Infra: module.Infra{Logger: logger}}
	transactor := staleWorkerEventTransactor{store: store}

	service := NewService(deps, store, transactor, live, orchestrator)
	fixedNow := time.Unix(100, 0).UTC()
	service.now = func() time.Time { return fixedNow }
	return service, store, live, orchestrator
}
// staleWorkerEventTransactor is a Transactor fake that runs the callback
// directly against a fixed Store, with no transaction semantics.
type staleWorkerEventTransactor struct {
// store is the Store handed to every callback.
store Store
}
// WithinTransaction invokes fn with the fake's store and returns fn's result.
// ctx is not used.
func (t staleWorkerEventTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error {
return fn(t.store)
}
// staleWorkerEventTestStore is a Store fake assembled from in-memory
// repository fakes. Tests reach into its fields to seed and inspect data.
type staleWorkerEventTestStore struct {
remote *staleWorkerEventRemoteSessions
attachments *staleWorkerEventAttachments
policies *staleWorkerEventPolicies
audit *staleWorkerEventAudit
}
// RemoteSessions returns the in-memory remote session fake.
func (s *staleWorkerEventTestStore) RemoteSessions() RemoteSessionRepository { return s.remote }
// SessionAttachments returns the in-memory attachment fake.
func (s *staleWorkerEventTestStore) SessionAttachments() SessionAttachmentRepository {
return s.attachments
}
// ResourcePolicies returns the policy fake.
func (s *staleWorkerEventTestStore) ResourcePolicies() ResourcePolicyRepository { return s.policies }
// ResourceRuntime returns a fresh stateless runtime-spec fake.
func (s *staleWorkerEventTestStore) ResourceRuntime() ResourceRuntimeRepository {
return staleWorkerEventResourceRuntime{}
}
// AuditEvents returns the counting audit fake.
func (s *staleWorkerEventTestStore) AuditEvents() AuditEventRepository { return s.audit }
// Access returns a fresh stateless access fake.
func (s *staleWorkerEventTestStore) Access() AccessRepository { return staleWorkerEventAccess{} }
// staleWorkerEventRemoteSessions is an in-memory RemoteSessionRepository fake
// keyed by session ID. It counts UpdateState calls so tests can assert that
// stale events do not touch the stored session.
type staleWorkerEventRemoteSessions struct {
	sessions    map[string]RemoteSession
	updateCount int
}

// Create stores session under its ID, replacing any existing entry.
func (r *staleWorkerEventRemoteSessions) Create(_ context.Context, session RemoteSession) error {
	r.sessions[session.ID] = session
	return nil
}

// GetByID returns a pointer to a copy of the stored session, or (nil, nil)
// when the ID is unknown.
func (r *staleWorkerEventRemoteSessions) GetByID(_ context.Context, sessionID string) (*RemoteSession, error) {
	if stored, found := r.sessions[sessionID]; found {
		return &stored, nil
	}
	return nil, nil
}

// GetByIDForUpdate behaves exactly like GetByID; the fake has no locking.
func (r *staleWorkerEventRemoteSessions) GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error) {
	return r.GetByID(ctx, sessionID)
}

// ListByController always returns no sessions.
func (r *staleWorkerEventRemoteSessions) ListByController(_ context.Context, _ string) ([]RemoteSession, error) {
	return nil, nil
}

// CountLiveByResource always returns zero.
func (r *staleWorkerEventRemoteSessions) CountLiveByResource(_ context.Context, _ string) (int, error) {
	return 0, nil
}

// ListDetachedExpired always returns no sessions.
func (r *staleWorkerEventRemoteSessions) ListDetachedExpired(_ context.Context, _ time.Time, _ int) ([]RemoteSession, error) {
	return nil, nil
}

// UpdateState copies every field of params onto the stored session and
// increments updateCount. An unknown ID results in a new entry built from the
// zero RemoteSession.
func (r *staleWorkerEventRemoteSessions) UpdateState(_ context.Context, params UpdateRemoteSessionStateParams) error {
	id := params.RemoteSessionID
	updated := r.sessions[id]
	updated.State = params.State
	updated.WorkerID = params.WorkerID
	updated.DetachDeadlineAt = params.DetachDeadlineAt
	updated.LastHeartbeatAt = params.LastHeartbeatAt
	updated.TakeoverVersion = params.TakeoverVersion
	updated.UpdatedAt = params.UpdatedAt
	r.sessions[id] = updated
	r.updateCount++
	return nil
}
// staleWorkerEventAttachments is an in-memory SessionAttachmentRepository
// fake keyed by attachment ID.
type staleWorkerEventAttachments struct {
	items map[string]SessionAttachment
}

// Create stores attachment under its ID, replacing any existing entry.
func (r *staleWorkerEventAttachments) Create(_ context.Context, attachment SessionAttachment) error {
	r.items[attachment.ID] = attachment
	return nil
}

// GetByID returns a pointer to a copy of the stored attachment, or (nil, nil)
// when the ID is unknown.
func (r *staleWorkerEventAttachments) GetByID(_ context.Context, attachmentID string) (*SessionAttachment, error) {
	if stored, found := r.items[attachmentID]; found {
		return &stored, nil
	}
	return nil, nil
}

// GetByIDForUpdate behaves exactly like GetByID; the fake has no locking.
func (r *staleWorkerEventAttachments) GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error) {
	return r.GetByID(ctx, attachmentID)
}

// ListByRemoteSession returns every stored attachment of remoteSessionID, in
// map iteration order. The result is an empty, non-nil slice when nothing
// matches.
func (r *staleWorkerEventAttachments) ListByRemoteSession(_ context.Context, remoteSessionID string) ([]SessionAttachment, error) {
	matches := []SessionAttachment{}
	for id := range r.items {
		if candidate := r.items[id]; candidate.RemoteSessionID == remoteSessionID {
			matches = append(matches, candidate)
		}
	}
	return matches, nil
}

// ListActiveByRemoteSessionForUpdate delegates to ListByRemoteSession, so the
// fake returns attachments in every state, not only active ones.
func (r *staleWorkerEventAttachments) ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) {
	return r.ListByRemoteSession(ctx, remoteSessionID)
}

// UpdateState copies the state and timestamps of params onto the stored
// attachment. An unknown ID results in a new entry built from the zero
// SessionAttachment.
func (r *staleWorkerEventAttachments) UpdateState(_ context.Context, params UpdateSessionAttachmentStateParams) error {
	id := params.AttachmentID
	updated := r.items[id]
	updated.State = params.State
	updated.DetachedAt = params.DetachedAt
	updated.LastInputAt = params.LastInputAt
	updated.UpdatedAt = params.UpdatedAt
	r.items[id] = updated
	return nil
}

// Supersede marks the previous attachment as superseded and records its
// successor, detach time and update time.
func (r *staleWorkerEventAttachments) Supersede(_ context.Context, params SupersedeAttachmentParams) error {
	// Local copies give the stored pointers their own backing values.
	successorID := params.NextAttachmentID
	detachedAt := params.DetachedAt

	id := params.PreviousAttachmentID
	previous := r.items[id]
	previous.State = AttachmentStateSuperseded
	previous.SupersededBy = &successorID
	previous.DetachedAt = &detachedAt
	previous.UpdatedAt = params.UpdatedAt
	r.items[id] = previous
	return nil
}
// staleWorkerEventPolicies is a no-op ResourcePolicyRepository fake.
type staleWorkerEventPolicies struct{}
// GetByResourceID always reports that no policy exists.
func (r *staleWorkerEventPolicies) GetByResourceID(_ context.Context, _ string) (*ResourcePolicy, error) {
return nil, nil
}
// Upsert discards the policy and reports success.
func (r *staleWorkerEventPolicies) Upsert(_ context.Context, _ ResourcePolicy) error {
return nil
}
// staleWorkerEventAudit is an AuditEventRepository fake that only counts how
// many events were created.
type staleWorkerEventAudit struct {
// createCount is the number of Create calls so far.
createCount int
}
// Create discards the event, increments createCount and reports success.
func (r *staleWorkerEventAudit) Create(_ context.Context, _ AuditEvent) error {
r.createCount++
return nil
}
// staleWorkerEventResourceRuntime is a no-op ResourceRuntimeRepository fake.
type staleWorkerEventResourceRuntime struct{}
// GetByID always reports that no runtime spec exists.
func (staleWorkerEventResourceRuntime) GetByID(_ context.Context, _ string) (*ResourceRuntimeSpec, error) {
return nil, nil
}
// staleWorkerEventAccess is a no-op AccessRepository fake that grants
// nothing: no device is trusted and no roles are returned.
type staleWorkerEventAccess struct{}
// IsTrustedDevice always returns false.
func (staleWorkerEventAccess) IsTrustedDevice(_ context.Context, _, _ string) (bool, error) {
return false, nil
}
// GetPlatformRole always returns the empty role.
func (staleWorkerEventAccess) GetPlatformRole(_ context.Context, _ string) (string, error) {
return "", nil
}
// GetOrganizationRole always reports that no membership exists.
func (staleWorkerEventAccess) GetOrganizationRole(_ context.Context, _, _ string) (string, bool, error) {
return "", false, nil
}
// staleWorkerEventLiveState is an in-memory live-state fake. Only the session
// operations hold state; bindings, attach tokens, heartbeats and worker
// routes are no-ops. It counts UpsertSession calls so tests can assert that
// stale events do not recreate live state.
type staleWorkerEventLiveState struct {
	sessions    map[string]*LiveSessionState
	upsertCount int
}

// UpsertSession stores a private copy of state and increments upsertCount.
func (s *staleWorkerEventLiveState) UpsertSession(_ context.Context, state LiveSessionState) error {
	snapshot := new(LiveSessionState)
	*snapshot = state
	s.sessions[state.SessionID] = snapshot
	s.upsertCount++
	return nil
}

// GetSession returns a copy of the stored state, or (nil, nil) when nothing
// is stored for sessionID.
func (s *staleWorkerEventLiveState) GetSession(_ context.Context, sessionID string) (*LiveSessionState, error) {
	if stored := s.sessions[sessionID]; stored != nil {
		snapshot := *stored
		return &snapshot, nil
	}
	return nil, nil
}

// DeleteSession removes any stored state for sessionID.
func (s *staleWorkerEventLiveState) DeleteSession(_ context.Context, sessionID string) error {
	delete(s.sessions, sessionID)
	return nil
}

// BindController is a no-op.
func (s *staleWorkerEventLiveState) BindController(_ context.Context, _ sessioncontracts.ControllerBinding, _ time.Duration) error {
	return nil
}

// GetControllerBinding always reports that no binding exists.
func (s *staleWorkerEventLiveState) GetControllerBinding(_ context.Context, _ string) (*sessioncontracts.ControllerBinding, error) {
	return nil, nil
}

// ClearControllerBinding is a no-op.
func (s *staleWorkerEventLiveState) ClearControllerBinding(_ context.Context, _ string) error {
	return nil
}

// StoreAttachToken is a no-op.
func (s *staleWorkerEventLiveState) StoreAttachToken(_ context.Context, _ sessioncontracts.AttachTokenClaims, _ time.Duration) error {
	return nil
}

// ConsumeAttachToken always reports that the token does not exist.
func (s *staleWorkerEventLiveState) ConsumeAttachToken(_ context.Context, _ string) (*sessioncontracts.AttachTokenClaims, error) {
	return nil, nil
}

// TouchAttachmentHeartbeat is a no-op.
func (s *staleWorkerEventLiveState) TouchAttachmentHeartbeat(_ context.Context, _, _ string, _ time.Duration) error {
	return nil
}

// UpdateWorkerRoute is a no-op.
func (s *staleWorkerEventLiveState) UpdateWorkerRoute(_ context.Context, _ WorkerRoute, _ time.Duration) error {
	return nil
}

// GetWorkerRoute always reports that no route exists.
func (s *staleWorkerEventLiveState) GetWorkerRoute(_ context.Context, _ string) (*WorkerRoute, error) {
	return nil, nil
}

// DeleteWorkerRoute is a no-op.
func (s *staleWorkerEventLiveState) DeleteWorkerRoute(_ context.Context, _ string) error {
	return nil
}
// staleWorkerEventOrchestrator is a worker-orchestrator fake. Every operation
// succeeds without doing anything, except that lease releases are counted.
type staleWorkerEventOrchestrator struct {
// releaseCount is the number of ReleaseSessionLease calls so far.
releaseCount int
}
// Reserve returns no lease and no error.
func (o *staleWorkerEventOrchestrator) Reserve(_ context.Context, _ workercontracts.AttachRequest) (*workercontracts.WorkerLease, error) {
return nil, nil
}
// GetSessionLease returns no lease and no error.
func (o *staleWorkerEventOrchestrator) GetSessionLease(_ context.Context, _ string) (*workercontracts.WorkerLease, error) {
return nil, nil
}
// ReleaseSessionLease increments releaseCount and reports success.
func (o *staleWorkerEventOrchestrator) ReleaseSessionLease(_ context.Context, _ string) error {
o.releaseCount++
return nil
}
// PrepareAttachment is a no-op.
func (o *staleWorkerEventOrchestrator) PrepareAttachment(_ context.Context, _ RemoteSession, _ SessionAttachment, _ map[string]any) error {
return nil
}
// NotifyDetachment is a no-op.
func (o *staleWorkerEventOrchestrator) NotifyDetachment(_ context.Context, _ RemoteSession, _ SessionAttachment) error {
return nil
}
// TerminateRemoteSession is a no-op.
func (o *staleWorkerEventOrchestrator) TerminateRemoteSession(_ context.Context, _, _ string) error {
return nil
}
// ValidateSessionRuntime always returns true with an empty string and no
// error.
func (o *staleWorkerEventOrchestrator) ValidateSessionRuntime(_ context.Context, _, _ string) (bool, string, error) {
return true, "", nil
}
@@ -0,0 +1,44 @@
package sessionbroker
import (
"fmt"
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
)
// allowedTransitions is the session state machine expressed as a lookup
// table: the outer key is the current state and the inner set holds every
// state that may follow it. A state that does not appear as an outer key has
// no outgoing transitions in this table. Entries are listed alphabetically;
// map ordering carries no meaning.
var allowedTransitions = map[sessioncontracts.State]map[sessioncontracts.State]struct{}{
	// From Active.
	sessioncontracts.StateActive: {
		sessioncontracts.StateDetached:     {},
		sessioncontracts.StateFailed:       {},
		sessioncontracts.StateReconnecting: {},
		sessioncontracts.StateTerminated:   {},
	},
	// From Detached.
	sessioncontracts.StateDetached: {
		sessioncontracts.StateFailed:       {},
		sessioncontracts.StateReconnecting: {},
		sessioncontracts.StateTerminated:   {},
	},
	// From Reconnecting.
	sessioncontracts.StateReconnecting: {
		sessioncontracts.StateActive:     {},
		sessioncontracts.StateDetached:   {},
		sessioncontracts.StateFailed:     {},
		sessioncontracts.StateTerminated: {},
	},
	// From Starting.
	sessioncontracts.StateStarting: {
		sessioncontracts.StateActive:     {},
		sessioncontracts.StateFailed:     {},
		sessioncontracts.StateTerminated: {},
	},
}
// validateTransition reports whether a session may move from one state to
// another.
//
// It returns nil when from and to are equal, or when allowedTransitions lists
// to as a successor of from. Every other pair yields an error naming both
// states.
func validateTransition(from, to sessioncontracts.State) error {
	// A transition to the same state is always accepted.
	if from == to {
		return nil
	}

	// If from has no entry the inner map is nil, and indexing a nil map simply
	// reports the key as absent, so one chained lookup covers both cases.
	if _, permitted := allowedTransitions[from][to]; permitted {
		return nil
	}

	return fmt.Errorf("invalid session state transition: %s -> %s", from, to)
}
File diff suppressed because it is too large Load Diff
+29
View File
@@ -0,0 +1,29 @@
package worker
// SessionEvent is the JSON-serializable envelope for a single session event.
// The struct tags below define the field names used on the wire.
type SessionEvent struct {
// Type names the kind of event; presumably one of the SessionEvent*
// constants declared in this file (confirm against producers).
Type string `json:"type"`
// SessionID is serialized as "session_id".
SessionID string `json:"session_id"`
// WorkerID is serialized as "worker_id".
WorkerID string `json:"worker_id"`
// Payload carries free-form event data; it is omitted from the JSON output
// when the map is empty or nil.
Payload map[string]any `json:"payload,omitempty"`
}
// Session event type identifiers. These are untyped string constants;
// presumably they are the values carried in SessionEvent.Type (confirm against
// producers and consumers). The string values are wire data and must not be
// changed casually.
const (
// Session lifecycle.
SessionEventConnected = "session_connected"
SessionEventHeartbeat = "session_heartbeat"
SessionEventFailed = "session_failed"
SessionEventTerminated = "session_terminated"
// Display, rendering and cursor.
SessionEventDisplayReady = "session_display_ready"
SessionEventRenderReady = "session_render_ready"
SessionEventRenderDirty = "session_render_dirty"
SessionEventRenderResized = "session_render_resized"
SessionEventCursorUpdated = "session_cursor_updated"
SessionEventFrame = "session_frame"
// Clipboard.
SessionEventClipboardText = "session_clipboard_text"
// File upload. Note the value is "session_file_upload_completed" although
// the identifier is SessionEventFileUploaded.
SessionEventFileUploaded = "session_file_upload_completed"
// File download.
SessionEventFileDownloadAvailable = "session_file_download_available"
SessionEventFileDownloadChunk = "session_file_download_chunk"
SessionEventFileDownloadProgress = "session_file_download_progress"
SessionEventFileDownloadCompleted = "session_file_download_completed"
SessionEventFileDownloadFailed = "session_file_download_failed"
SessionEventFileDownloadBlocked = "session_file_download_blocked"
)
@@ -0,0 +1,52 @@
package worker
import (
"context"
"errors"
"time"
"github.com/example/remote-access-platform/backend/internal/modules/sessionbroker"
)
// LeaseMonitor periodically asks the worker Service for stale leases and
// marks the corresponding sessions as failed through the session broker.
// Construct it with NewLeaseMonitor and drive it with Run.
type LeaseMonitor struct {
// service is queried for stale leases via RecoverStaleLeases.
service *Service
// broker is used to mark the session of each stale lease as failed.
broker *sessionbroker.Service
// interval is the period of the ticker that drives Run.
interval time.Duration
}
// NewLeaseMonitor builds a LeaseMonitor around the given worker service and
// session broker.
//
// interval is the polling period used by Run. A zero or negative interval is
// replaced by a 15 second default.
func NewLeaseMonitor(service *Service, broker *sessionbroker.Service, interval time.Duration) *LeaseMonitor {
	// Start from the default period and override it only with a usable value.
	monitor := &LeaseMonitor{
		service:  service,
		broker:   broker,
		interval: 15 * time.Second,
	}
	if interval > 0 {
		monitor.interval = interval
	}
	return monitor
}
// Run polls for stale worker leases once per interval until ctx is done.
//
// On every tick it calls RecoverStaleLeases on the worker service and, for
// each lease returned, asks the broker to mark that lease's session as failed
// with the reason "worker_lease_stale_or_worker_missing".
//
// Run returns nil when ctx is cancelled. It returns immediately with the
// error if RecoverStaleLeases fails, or if MarkSessionFailed fails with
// anything other than sessionbroker.ErrSessionNotFound or
// sessionbroker.ErrSessionNotTerminable; those two are skipped.
func (m *LeaseMonitor) Run(ctx context.Context) error {
	ticker := time.NewTicker(m.interval)
	defer ticker.Stop()

	for {
		// Block until either shutdown is requested or the next tick arrives.
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
		}

		staleLeases, recoverErr := m.service.RecoverStaleLeases(ctx)
		if recoverErr != nil {
			return recoverErr
		}

		for _, staleLease := range staleLeases {
			markErr := m.broker.MarkSessionFailed(ctx, sessionbroker.MarkSessionFailedCommand{
				SessionID: staleLease.SessionID,
				Reason:    "worker_lease_stale_or_worker_missing",
			})
			// These two sentinel errors are tolerated, so move on to the next lease.
			if markErr == nil ||
				errors.Is(markErr, sessionbroker.ErrSessionNotFound) ||
				errors.Is(markErr, sessionbroker.ErrSessionNotTerminable) {
				continue
			}
			return markErr
		}
	}
}

Some files were not shown because too many files have changed in this diff Show More