Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions api/v1alpha1/seinode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,24 +290,28 @@ const (
// ConditionSeiNodePaused mirrors spec.paused: True when paused.
ConditionSeiNodePaused = "Paused"

// ConditionStateSyncReady gates the state-sync-bearing plan. Always-present
// once reconciled. True means canonical syncers are configured and the plan
// may proceed; False fails closed (no state-sync plan built, and peers are
// never used as witnesses). It is a configured-count gate: witness
// reliability comes from curating the canonical-syncer set, and the sidecar
// establishes the trust point from them as it does today.
// ConditionStateSyncReady gates the ConfigureStateSync-bearing plan, which is
// built for any snapshot bootstrap (stateSync or s3 — both apply via CometBFT
// state-sync and need rpc-server witnesses). Always-present once reconciled.
// True means canonical syncers are configured and the plan may proceed; False
// fails closed (no such plan built, and peers are never used as witnesses). It
// is a configured-count gate: witness reliability comes from curating the
// canonical-syncer set, and the sidecar establishes the trust point from them
// as it does today.
ConditionStateSyncReady = "StateSyncReady"
)

// Reasons for the StateSyncReady condition.
const (
// ReasonStateSyncReady: state-sync enabled and >=2 canonical syncers are
// configured for the chain; the state-sync-bearing plan may proceed.
// ReasonStateSyncReady: the node bootstraps from a snapshot (stateSync or s3)
// and >=2 canonical syncers are configured for the chain; the
// ConfigureStateSync-bearing plan may proceed.
ReasonStateSyncReady = "Ready"
// ReasonStateSyncNoSyncersConfigured: state-sync enabled but the canonical-
// syncer source yields <2 entries for the chain (fail closed).
// ReasonStateSyncNoSyncersConfigured: the node bootstraps from a snapshot but
// the canonical-syncer source yields <2 entries for the chain (fail closed).
ReasonStateSyncNoSyncersConfigured = "NoSyncersConfigured"
// ReasonStateSyncNotApplicable: the node does not enable state-sync.
// ReasonStateSyncNotApplicable: the node does not bootstrap from a snapshot
// (e.g. a genesis node), so it carries no ConfigureStateSync task to gate.
ReasonStateSyncNotApplicable = "NotApplicable"
// ReasonStateSyncSyncerSourceError: reading or parsing the canonical-syncer
// source file failed for a reason other than absence (transient). Fails
Expand Down
3 changes: 3 additions & 0 deletions internal/controller/node/plan_execution_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ func newProgressionReconciler(t *testing.T, mock *mockSidecarClient, objs ...cli
},
},
}
// Snapshot-bootstrap nodes pass through the fail-closed StateSyncReady gate;
// wire syncers for every fixture chain so progression tests proceed.
withSyncers(t, r, fixtureSyncers())
return r, c
}

Expand Down
22 changes: 22 additions & 0 deletions internal/controller/node/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,26 @@ func newNodeReconcilerWithSidecar(t *testing.T, mock *mockSidecarClient, objs ..
},
},
}
// Wire canonical syncers for every fixture chain so snapshot-bootstrap nodes
// (s3 and stateSync both carry ConfigureStateSync and pass through the
// fail-closed StateSyncReady gate). Tests asserting fail-closed override this
// via withSyncers/writeSyncerFile.
withSyncers(t, r, fixtureSyncers())
return r, c
}

// fixtureSyncers returns a >=2-syncer set for every chain used by node test
// fixtures, so a snapshot-bootstrap node clears the StateSyncReady gate
// regardless of which fixture chain it carries.
func fixtureSyncers() map[string][]string {
chains := []string{defaultTestChainID, testChainID, atlantic2ChainID, pacific1ChainID, "test"}
out := make(map[string][]string, len(chains))
for _, ch := range chains {
out[ch] = []string{syncerA, syncerB}
}
return out
}

func nodeReqFor(name, namespace string) ctrl.Request { //nolint:unparam // test helper designed for reuse
return ctrl.Request{NamespacedName: types.NamespacedName{Name: name, Namespace: namespace}}
}
Expand All @@ -89,6 +106,11 @@ func getSeiNode(t *testing.T, ctx context.Context, c client.Client, name, namesp
const (
testImageV2 = "ghcr.io/sei-protocol/seid:v2.0.0"
testRevision = "rev-2"
// defaultTestChainID must match the chainID the snapshot/genesis fixtures
// hardcode in testhelpers_test.go (not enforced by the compiler).
defaultTestChainID = "sei-test"
atlantic2ChainID = "atlantic-2"
pacific1ChainID = "pacific-1"
)

func TestNodeReconcile_NotFound(t *testing.T) {
Expand Down
10 changes: 7 additions & 3 deletions internal/controller/node/statesync.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ const minCanonicalSyncers = 2
// the caller's single status patch flushes it. Run before the Failed/Paused
// early-returns so StateSyncReady is seeded on every path.
//
// It resolves for any snapshot-bootstrap node (see planner.needsStateSyncWitnesses
// for why witnesses are required); a genesis node (snap == nil) is NotApplicable.
//
// Fail-closed is enforced downstream: the planner declines to build a state-sync
// plan whenever StateSyncReady is not True (see ResolvePlan), so this method only
// resolves the condition. It returns blocked=true whenever state-sync is enabled
Expand All @@ -34,11 +37,12 @@ const minCanonicalSyncers = 2
// and unblocks once GitOps provisions or fixes the syncers.
func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (blocked bool) {
snap := node.Spec.SnapshotSource()
if snap == nil || snap.StateSync == nil {
// State-sync disabled: no state-sync task in the plan to gate.
if snap == nil {
// No snapshot source (e.g. genesis validator): no ConfigureStateSync task
// in the plan to gate.
node.Status.ResolvedStateSyncers = nil
setStateSyncReady(node, metav1.ConditionFalse, seiv1alpha1.ReasonStateSyncNotApplicable,
"node does not enable state sync")
"node does not bootstrap from a snapshot")
return false
}

Expand Down
56 changes: 41 additions & 15 deletions internal/controller/node/statesync_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,32 +196,56 @@ func TestStateSyncGate_UnconfiguredSource_FailsClosed(t *testing.T) {
g := NewWithT(t)

node := stateSyncNode("n", testChainID)
r, _ := newNodeReconciler(t, node) // Platform leaves ControllerConfigFile empty.
r, _ := newNodeReconciler(t, node)
r.Platform.ControllerConfigFile = "" // unconfigured source

blocked := r.reconcileStateSyncGate(node)
g.Expect(blocked).To(BeTrue())
g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured))
}

func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) {
// An s3-restore node applies its snapshot via CometBFT state-sync
// (use-local-snapshot), so it ALSO needs canonical rpc-server witnesses to
// verify the trust point. The gate must resolve syncers for it (Ready when >=2
// are configured) rather than declaring NotApplicable — the latter left
// ResolvedStateSyncers nil and the sidecar fell back to unreachable peers.
func TestStateSyncGate_S3Restore_ResolvesSyncers(t *testing.T) {
g := NewWithT(t)

// S3 snapshot node: state sync not enabled.
node := newSnapshotNode("n", testNamespace)
node := newSnapshotNode("n", testNamespace) // s3 source, ChainID "sei-test"
r, _ := newNodeReconciler(t, node)
withSyncers(t, r, map[string][]string{testChainID: {syncerA, syncerB}})
withSyncers(t, r, map[string][]string{node.Spec.ChainID: {syncerA, syncerB}})

blocked := r.reconcileStateSyncGate(node)
g.Expect(blocked).To(BeFalse())

cond := stateSyncCondition(node)
g.Expect(cond.Status).To(Equal(metav1.ConditionTrue))
g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncReady))
g.Expect(node.Status.ResolvedStateSyncers).To(Equal([]string{syncerA, syncerB}))
}

// An s3-restore node with <2 configured syncers must fail closed — same as a
// stateSync node. This is the bug being fixed: previously it was NotApplicable
// and proceeded witness-less.
func TestStateSyncGate_S3Restore_OneSyncer_FailsClosed(t *testing.T) {
g := NewWithT(t)

node := newSnapshotNode("n", testNamespace)
r, _ := newNodeReconciler(t, node)
withSyncers(t, r, map[string][]string{node.Spec.ChainID: {syncerSingle}})

blocked := r.reconcileStateSyncGate(node)
g.Expect(blocked).To(BeTrue())

cond := stateSyncCondition(node)
g.Expect(cond.Status).To(Equal(metav1.ConditionFalse))
g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNotApplicable))
g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured))
g.Expect(node.Status.ResolvedStateSyncers).To(BeNil())
}

// A node with no snapshot source at all (e.g. genesis validator) is treated as
// state-sync-disabled: NotApplicable, never blocks the plan.
// A node with no snapshot source at all (e.g. genesis validator) carries no
// ConfigureStateSync task: NotApplicable, never blocks the plan.
func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) {
g := NewWithT(t)

Expand All @@ -248,19 +272,21 @@ func TestStateSyncGate_FailClosedClearsStaleSyncers(t *testing.T) {

node := stateSyncNode("n", testChainID)
node.Status.ResolvedStateSyncers = []string{"old-a:26657", "old-b:26657"}
r, _ := newNodeReconciler(t, node) // unconfigured source → fail closed
r, _ := newNodeReconciler(t, node)
r.Platform.ControllerConfigFile = "" // unconfigured source → fail closed

blocked := r.reconcileStateSyncGate(node)
g.Expect(blocked).To(BeTrue())
g.Expect(node.Status.ResolvedStateSyncers).To(BeNil())
}

// The gate must not block (or even set ResolvedStateSyncers on) a non-state-sync
// node — regression guard for the full reconcile path.
func TestStateSyncGate_NonStateSyncNodeUnaffected(t *testing.T) {
// The gate must not block (or even set ResolvedStateSyncers on) a node that
// carries no ConfigureStateSync task — a genesis node with no snapshot source.
// Regression guard for the full reconcile path.
func TestStateSyncGate_GenesisNodeUnaffected(t *testing.T) {
g := NewWithT(t)

node := newSnapshotNode("n", testNamespace)
node := newGenesisNode("n", testNamespace)
r, _ := newNodeReconciler(t, node)

blocked := r.reconcileStateSyncGate(node)
Expand Down Expand Up @@ -365,8 +391,8 @@ func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) {
wantReason: seiv1alpha1.ReasonStateSyncNoSyncersConfigured,
},
{
name: "state-sync disabled",
node: newSnapshotNode("paused-s3", testNamespace),
name: "no snapshot source",
node: newGenesisNode("paused-gen", testNamespace),
withSyncers: false,
wantReason: seiv1alpha1.ReasonStateSyncNotApplicable,
},
Expand Down
7 changes: 7 additions & 0 deletions internal/planner/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@
// - Conditions are always-present: condition state is expressed as
// True/False/Unknown with a stable Reason, never by removal (see
// CLAUDE.md "Conditions"). isConditionTrue asserts on Status, not presence.
// - State-sync witnesses gate every snapshot bootstrap: needsStateSyncWitnesses
// (snap != nil) drives both the ConfigureStateSync task insertion
// (buildSidecarProgression) and the fail-closed plan blocker
// (stateSyncBlocksPlan), so a node bootstrapping from a snapshot — s3 or
// stateSync — can never plan ConfigureStateSync without >=2 resolved
// canonical-syncer witnesses. Genesis (snap == nil) carries no such task.
// Guarded by TestStateSyncGate_S3Restore_OneSyncer_FailsClosed.
//
// # Zero-Value & Sentinel Semantics
//
Expand Down
31 changes: 24 additions & 7 deletions internal/planner/planner.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,17 +186,19 @@ func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNod

// stateSyncBlocksPlan reports whether the fail-closed state-sync gate must
// suppress plan construction this reconcile. It fires only on the init path
// (pre-Running), which is the only path that builds a state-sync-bearing plan
// via buildSidecarProgression — a Running node's update plans carry no
// (pre-Running), which is the only path that builds a ConfigureStateSync-bearing
// plan via buildSidecarProgression — a Running node's update plans carry no
// state-sync task, so an image roll must not be blocked by a syncer-source
// blip. The gate trips when state-sync is enabled and the controller-resolved
// StateSyncReady condition is not True (NoSyncersConfigured or the transient
// SyncerSourceError). A missing condition (not yet resolved) does not gate.
// blip. The gate trips when the node carries a ConfigureStateSync task (any
// snapshot source — both stateSync and s3-restore apply via CometBFT state-sync
// and need rpc-server witnesses) and the controller-resolved StateSyncReady
// condition is not True (NoSyncersConfigured or the transient SyncerSourceError).
// A missing condition (not yet resolved) does not gate.
func stateSyncBlocksPlan(node *seiv1alpha1.SeiNode) bool {
if node.Status.Phase == seiv1alpha1.PhaseRunning {
return false
}
if !hasStateSync(node.Spec.SnapshotSource()) {
if !needsStateSyncWitnesses(node.Spec.SnapshotSource()) {
return false
}
cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady)
Expand Down Expand Up @@ -342,7 +344,7 @@ func buildSidecarProgression(snap *seiv1alpha1.SnapshotSource) ([]string, error)
if prog, err = insertBefore(prog, TaskConfigApply, TaskConfigureGenesis); err != nil {
return nil, err
}
if snap != nil {
if needsStateSyncWitnesses(snap) {
if prog, err = insertBefore(prog, TaskConfigValidate, TaskConfigureStateSync); err != nil {
return nil, err
}
Expand Down Expand Up @@ -477,6 +479,21 @@ func hasStateSync(snap *seiv1alpha1.SnapshotSource) bool {
return snap != nil && snap.StateSync != nil
}

// needsStateSyncWitnesses reports whether the node's plan carries a
// ConfigureStateSync task and therefore requires canonical rpc-server witnesses.
// Both bootstrap-from-snapshot modes apply the snapshot through CometBFT
// state-sync — stateSync fetches chunks from peers; s3 stages a snapshot under
// data/snapshots and applies it with use-local-snapshot=true — and both verify
// the trust point against >=2 reachable rpc-servers. A genesis node
// (snap == nil) carries no such task and needs no witnesses.
//
// This is the single predicate behind both the ConfigureStateSync insertion
// (buildSidecarProgression) and the fail-closed gate (stateSyncBlocksPlan),
// keeping plan-insertion and witness-resolution in lockstep.
func needsStateSyncWitnesses(snap *seiv1alpha1.SnapshotSource) bool {
return snap != nil
}

func bootstrapMode(snap *seiv1alpha1.SnapshotSource) string {
if hasS3Snapshot(snap) {
return "snapshot"
Expand Down
Loading