Fallback across VPN fabric endpoint candidates

This commit is contained in:
2026-05-16 10:54:11 +03:00
parent 4ebc6629e6
commit 850bd67b6a
3 changed files with 116 additions and 47 deletions
@@ -4628,63 +4628,82 @@ func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity st
if meshState == nil || meshState.VPNFabricInbox == nil || assignment.VPNConnectionID == "" || nextHop == "" { if meshState == nil || meshState.VPNFabricInbox == nil || assignment.VPNConnectionID == "" || nextHop == "" {
return nil return nil
} }
target, ok := vpnFabricSessionTarget(meshState, nextHop) targets := vpnFabricSessionTargets(meshState, nextHop)
if !ok { if len(targets) == 0 {
log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=peer_endpoint_missing", assignment.VPNConnectionID, nextHop) log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=peer_endpoint_missing", assignment.VPNConnectionID, nextHop)
return nil return nil
} }
dialCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
if meshState.VPNFabricSessionPeers == nil { if meshState.VPNFabricSessionPeers == nil {
meshState.VPNFabricSessionPeers = mesh.NewFabricSessionPeerManager() meshState.VPNFabricSessionPeers = mesh.NewFabricSessionPeerManager()
} }
if meshState.VPNFabricTransport == nil { if meshState.VPNFabricTransport == nil {
meshState.VPNFabricTransport = mesh.NewWebSocketFabricTransport(meshState.VPNFabricSessionPeers) meshState.VPNFabricTransport = mesh.NewWebSocketFabricTransport(meshState.VPNFabricSessionPeers)
} }
target.PeerID = nextHop token := fabricSessionGatewayToken(identity, assignment, nextHop)
target.Token = fabricSessionGatewayToken(identity, assignment, nextHop) for index, target := range targets {
target.Timeout = 3 * time.Second dialCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
target.OutboundBuffer = 256 target.PeerID = nextHop
target.InboundBuffer = 256 target.Token = token
target.ErrorBuffer = 16 target.Timeout = 3 * time.Second
carrier, target, err := mesh.FabricTransportForTarget(target, meshState.VPNFabricTransport, nil) target.OutboundBuffer = 256
if err != nil { target.InboundBuffer = 256
log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=transport_select_failed error=%v", assignment.VPNConnectionID, nextHop, err) target.ErrorBuffer = 16
return nil carrier, selectedTarget, err := mesh.FabricTransportForTarget(target, meshState.VPNFabricTransport, nil)
} if err != nil {
session, err := carrier.Connect(dialCtx, target) cancel()
if err != nil { log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=transport_select_failed error=%v", assignment.VPNConnectionID, nextHop, index, target.Endpoint, target.Transport, err)
log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=session_open_failed error=%v", assignment.VPNConnectionID, nextHop, err) continue
return nil }
} session, err := carrier.Connect(dialCtx, selectedTarget)
streamID := uint64(time.Now().UnixNano()) if err != nil {
if streamID == 0 { cancel()
streamID = 1 log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=session_open_failed error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, err)
} continue
if err := session.Send(dialCtx, fabricproto.Frame{ }
Type: fabricproto.FrameOpenStream, streamID := uint64(time.Now().UnixNano())
StreamID: streamID, if streamID == 0 {
TrafficClass: fabricproto.TrafficClassInteractive, streamID = 1
}); err != nil { }
log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=stream_open_failed error=%v", assignment.VPNConnectionID, nextHop, err) if err := session.Send(dialCtx, fabricproto.Frame{
return nil Type: fabricproto.FrameOpenStream,
} StreamID: streamID,
return &vpnruntime.FabricSessionPacketTransport{ TrafficClass: fabricproto.TrafficClassInteractive,
Sender: session, }); err != nil {
Receiver: session, cancel()
Inbox: meshState.VPNFabricInbox, _ = session.Close()
StreamID: streamID, log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=stream_open_failed error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, err)
VPNConnectionID: assignment.VPNConnectionID, continue
SendDirection: vpnruntime.FabricDirectionGatewayToClient, }
ReceiveDirection: vpnruntime.FabricDirectionClientToGateway, cancel()
TrafficClass: vpnruntime.FabricTrafficClassInteractive, return &vpnruntime.FabricSessionPacketTransport{
Sender: session,
Receiver: session,
Inbox: meshState.VPNFabricInbox,
StreamID: streamID,
VPNConnectionID: assignment.VPNConnectionID,
SendDirection: vpnruntime.FabricDirectionGatewayToClient,
ReceiveDirection: vpnruntime.FabricDirectionClientToGateway,
TrafficClass: vpnruntime.FabricTrafficClassInteractive,
}
} }
log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=all_candidates_failed candidates=%d", assignment.VPNConnectionID, nextHop, len(targets))
return nil
} }
func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh.FabricTransportTarget, bool) { func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh.FabricTransportTarget, bool) {
if meshState == nil { targets := vpnFabricSessionTargets(meshState, nextHop)
if len(targets) == 0 {
return mesh.FabricTransportTarget{}, false return mesh.FabricTransportTarget{}, false
} }
return targets[0], true
}
func vpnFabricSessionTargets(meshState *syntheticMeshState, nextHop string) []mesh.FabricTransportTarget {
if meshState == nil {
return nil
}
out := make([]mesh.FabricTransportTarget, 0, len(meshState.PeerEndpointCandidates[nextHop])+1)
seen := map[string]struct{}{}
if candidates := meshState.PeerEndpointCandidates[nextHop]; len(candidates) > 0 { if candidates := meshState.PeerEndpointCandidates[nextHop]; len(candidates) > 0 {
ranked := mesh.RankPeerEndpointCandidates(candidates, mesh.EndpointCandidateScoreOptions{ ranked := mesh.RankPeerEndpointCandidates(candidates, mesh.EndpointCandidateScoreOptions{
ChannelClass: mesh.SyntheticChannelFabricControl, ChannelClass: mesh.SyntheticChannelFabricControl,
@@ -4696,18 +4715,26 @@ func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh
if endpoint == "" { if endpoint == "" {
continue continue
} }
return mesh.FabricTransportTarget{ key := item.Candidate.Transport + "\x00" + endpoint
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
out = append(out, mesh.FabricTransportTarget{
Endpoint: endpoint, Endpoint: endpoint,
Transport: item.Candidate.Transport, Transport: item.Candidate.Transport,
PeerCertSHA256: endpointCandidateTLSCertSHA256(item.Candidate), PeerCertSHA256: endpointCandidateTLSCertSHA256(item.Candidate),
}, true })
} }
} }
endpoint := strings.TrimRight(strings.TrimSpace(meshState.PeerEndpoints[nextHop]), "/") endpoint := strings.TrimRight(strings.TrimSpace(meshState.PeerEndpoints[nextHop]), "/")
if endpoint == "" { if endpoint != "" {
return mesh.FabricTransportTarget{}, false key := "\x00" + endpoint
if _, ok := seen[key]; !ok {
out = append(out, mesh.FabricTransportTarget{Endpoint: endpoint})
}
} }
return mesh.FabricTransportTarget{Endpoint: endpoint}, true return out
} }
func endpointCandidateTLSCertSHA256(candidate mesh.PeerEndpointCandidate) string { func endpointCandidateTLSCertSHA256(candidate mesh.PeerEndpointCandidate) string {
@@ -809,6 +809,45 @@ func TestVPNFabricSessionTargetFallsBackToLegacyPeerEndpoint(t *testing.T) {
} }
} }
func TestVPNFabricSessionTargetsIncludeRankedCandidatesThenLegacyFallback(t *testing.T) {
now := time.Now().UTC()
targets := vpnFabricSessionTargets(&syntheticMeshState{
PeerEndpoints: map[string]string{
"node-b": "https://node-b-legacy.example.test:443/",
},
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-wss",
NodeID: "node-b",
Transport: "wss",
Address: "https://node-b.example.test:443",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 10,
LastVerifiedAt: &now,
},
{
EndpointID: "node-b-quic",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://node-b.example.test:19443",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 10,
LastVerifiedAt: &now,
},
},
},
}, "node-b")
if len(targets) != 3 {
t.Fatalf("target count = %d, want 3: %+v", len(targets), targets)
}
if targets[0].Transport != "direct_quic" || targets[1].Transport != "wss" || targets[2].Endpoint != "https://node-b-legacy.example.test:443" {
t.Fatalf("targets not ordered by ranked candidates then fallback: %+v", targets)
}
}
func TestHeartbeatPayloadReportsMeshListenerFailureWithoutKillingHeartbeat(t *testing.T) { func TestHeartbeatPayloadReportsMeshListenerFailureWithoutKillingHeartbeat(t *testing.T) {
now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC) now := time.Date(2026, 4, 30, 9, 0, 0, 0, time.UTC)
payload := heartbeatPayload(config.Config{ payload := heartbeatPayload(config.Config{
@@ -316,6 +316,9 @@ legacy peer endpoints when the control plane has not published candidates yet.
The temporary self-signed QUIC listener advertises its SHA-256 certificate The temporary self-signed QUIC listener advertises its SHA-256 certificate
fingerprint in endpoint metadata, and the QUIC client can pin that fingerprint fingerprint in endpoint metadata, and the QUIC client can pin that fingerprint
instead of disabling verification while the cluster CA path is being finished. instead of disabling verification while the cluster CA path is being finished.
VPN fabric-session dialing now walks all ranked endpoint candidates before
falling back to the legacy peer endpoint, so a failed QUIC candidate does not
block WebSocket/HTTPS compatibility transport.
Deliverables: Deliverables: