diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go new file mode 100644 index 0000000000..1e2174ad86 --- /dev/null +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -0,0 +1,254 @@ +package data + +import ( + "context" + + "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" + "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" +) + +// Store is the durable backing store for data.State. It persists the two +// kinds of finalized records the consensus state machine produces — +// finalized blocks (indexed by GlobalBlockNumber and by header hash) and +// FullCommitQCs (each covering a contiguous range of GlobalBlockNumbers) — +// and provides the read API needed for crash recovery and runtime lookups. +// +// Replaces the WAL-based DataWAL used today; the contract here is a +// superset of what DataWAL.Blocks + DataWAL.CommitQCs provide today, plus +// a by-hash block index the WAL does not have. +// +// # What this does NOT store +// +// Deliberately scoped to blocks + QCs. In particular: +// +// - AppHash (per-height execution result). On restart the app reports +// its last committed (height, AppHash) via app.Info().LastBlockAppHash; +// runExecute injects that back via PushAppHash, and AppHashes for +// heights above the app's last committed height are regenerated by +// re-executing the replayed blocks. data.State.inner.appProposals is +// in-memory only. +// - Per-tx execution results, logs, events. Those live in the receipt +// store (canonical txHash → execution result lookup, unified for +// EVM and Cosmos txs per the Giga Transaction Query proposal). +// +// GlobalBlock.FinalAppState is extracted from qc.Proposal().App() on +// read — derivable from the persisted QC, no separate AppHash record +// needed. +// +// # Concurrency +// +// All methods are safe for concurrent use. Implementations should expect +// concurrent writes (WriteBlock + WriteQC interleaved from a single +// background persistence loop) and concurrent reads from RPC handlers +// and peer-sync streams. +// +// # Durability and crash safety +// +// Writes are two-phase: WriteBlock and WriteQC return without +// guaranteeing the record is on disk. Flush blocks until all +// previously-returned Writes are durable. A Write+Flush pair is +// individually atomic on disk — after a crash a reader either sees +// the full record (if Flush returned before the crash) or none of it +// (otherwise); partial writes are not possible. +// +// The two-phase shape is intentional: a disk-backed implementation +// needs an fsync per durability boundary, and fsync-per-Write at +// dozens of blocks/sec is real disk bandwidth. The typical pattern +// is "write a batch of records, then Flush once" — e.g. +// runPersist drains every block + QC currently queued for +// persistence, writes them all, calls Flush, and only then advances +// nextBlockToPersist. The implementation is free to begin writing as +// soon as the first record arrives (so this batches better than +// blocking on a closed input), and the consumer never pays an fsync +// per record. +// +// Writes are not atomic with respect to one another even within a +// single Flush — a crash between two Writes (or mid-Flush) leaves +// some records on disk and others not. Reconciliation of +// cross-record inconsistencies (e.g. a block written without its QC, +// or vice versa) is the caller's responsibility on startup (see +// DataWAL.reconcile for the rules the current WAL uses). +// +// Read-your-writes is provided within a single session regardless of +// Flush — a Write followed by a Read in the same process always +// observes the Write. Flush is about disk durability, not in-process +// visibility. +// +// Implementations are required to make sure that durability happens +// reasonably eagerly even without an explicit Flush — a node that +// stops calling Flush should still see its writes eventually land on +// disk. Flush is "wait until durable now," not "tell the +// implementation to start writing." +// +// # Ordering and the GlobalRange convention +// +// A FullCommitQC's GlobalRange is a half-open interval +// [GlobalRange.First(), GlobalRange.Next()) — First inclusive, Next +// exclusive. The QC therefore covers GlobalBlockNumbers First, First+1, +// ..., Next-1, and Next is also the First of the next contiguous QC. +// +// QCs must be written contiguously — each WriteQC's +// qc.QC().GlobalRange(committee).First must equal the previous WriteQC's +// GlobalRange().Next (the caller is data.State.runPersist, which +// guarantees this). Implementations may validate and reject out-of-order +// writes but need not. +// +// Blocks may be written in any GlobalBlockNumber order; the consumer +// (data.State) writes them in ascending order today but the contract +// does not require it. +type Store interface { + // WriteBlock persists a finalized block at GlobalBlockNumber n. + // + // n is required because *types.Block does NOT carry its + // GlobalBlockNumber — block.Header().BlockNumber() returns the + // per-lane BlockNumber, a different typedef. The lane→global + // mapping lives in the QC's GlobalRange. Implementations must + // record n alongside the block so ReadBlockByNumber can recover it + // and ReadAll can reconstruct (n, *Block) pairs. + // + // The block's hash (block.Header().Hash()) is indexed automatically + // so ReadBlockByHash works after this returns — the caller does not + // supply it separately. + // + // Idempotent on duplicate: a second WriteBlock with the same + // (n, block.Header().Hash()) pair is a no-op. Writing a different + // block at an already-occupied n, or the same block under a + // different n, is a contract violation — implementations are free + // to error or to corrupt state in that case. + // + // May return before the block is on disk. Callers that need crash + // durability before some external observable action (e.g. + // runPersist advancing nextBlockToPersist, which gates the + // AppVote runExecute issues) must call Flush. See the Store type + // doc for the two-phase write/flush contract. + WriteBlock(ctx context.Context, n types.GlobalBlockNumber, block *types.Block) error + + // WriteQC persists a FullCommitQC. + // + // The QC carries its GlobalRange internally + // (qc.QC().GlobalRange(committee)) — no range argument needed. The + // caller guarantees that successive WriteQC calls form a contiguous + // sequence: each call's First equals the previous call's Next (or + // committee.FirstBlock() for the very first call). Implementations + // may reject out-of-sequence writes but need not. + // + // Idempotent on duplicate: a second WriteQC for a QC with the same + // GlobalRange().First is a no-op. + // + // May return before the QC is on disk. See the Store type doc for + // the two-phase write/flush contract and WriteBlock for the + // rationale. + WriteQC(ctx context.Context, qc *types.FullCommitQC) error + + // PruneBefore removes: + // - every block with GlobalBlockNumber < n + // - every QC whose GlobalRange().Next ≤ n (the QC's entire + // covered range is strictly below the retention watermark; a + // QC straddling n stays) + // + // Idempotent: calling with n ≤ the existing retention watermark is + // a no-op. Pruning is permitted to be asynchronous — entries may + // remain readable briefly after PruneBefore returns, but will + // eventually become unreadable. + // + // Callers must ensure no in-flight reader is holding a pointer + // returned from a Read* call for a record being pruned. Pruning a + // record still being processed is undefined. + PruneBefore(ctx context.Context, n types.GlobalBlockNumber) error + + // Flush blocks until every Write that has returned before Flush is + // called is durable on disk. Writes made concurrently with Flush + // may or may not be durable when Flush returns (but are otherwise + // eventually durable — implementations write to disk on their own + // schedule even without an explicit Flush). + // + // The expected pattern is "write a batch of records, then Flush + // once" rather than "Flush after every Write." The implementation + // is free to begin writing as records arrive, so this still + // batches well even when the caller doesn't pre-buffer. + // + // data.State.runPersist will use this: drain every block + QC + // queued for persistence, write them all, call Flush, then + // advance nextBlockToPersist (the watermark gating AppVote + // issuance). + Flush(ctx context.Context) error + + // ReadAll returns a snapshot of all blocks and QCs not yet pruned, + // for startup replay. Intended to be called once at construction by + // data.State.NewState; afterwards the in-memory cursors track + // everything. + // + // Blocks are returned in ascending GlobalBlockNumber order, QCs in + // ascending GlobalRange().First order. The two slices are + // independent — there is no required alignment between them + // (DataWAL.reconcile handles cross-WAL drift; the same logic will + // run over Loaded). + // + // May allocate proportional to retention. For typical Sei retention + // windows this is fine; if a future implementation expects + // orders-of-magnitude larger retention, consider switching to an + // iterator API before adopting it. + ReadAll(ctx context.Context) (*Loaded, error) + + // ReadBlockByNumber returns the block at GlobalBlockNumber n. + // + // Returns utils.None if no block has been written at n, or the + // block at n has been pruned. Implementations must not block + // waiting for a future write — "not yet written" is reported as + // utils.None identical to "never written". Blocking semantics + // (wait for a write at n) live above this interface, in + // data.State. + ReadBlockByNumber(ctx context.Context, n types.GlobalBlockNumber) (utils.Option[*types.Block], error) + + // ReadBlockByHash returns the block whose header hashes to the + // given value. The hash is the same value as block.Header().Hash() + // for the block that was passed to WriteBlock. + // + // Returns utils.None if no such block has been written, or it has + // been pruned. Like ReadBlockByNumber, this is non-blocking. + ReadBlockByHash(ctx context.Context, hash types.BlockHeaderHash) (utils.Option[*types.Block], error) + + // ReadQCByBlockNumber returns the FullCommitQC whose + // GlobalRange().First ≤ n < GlobalRange().Next — i.e. the QC that + // finalizes the block at n. Because a single QC covers multiple + // blocks, the same *FullCommitQC is returned for every n in its + // range. + // + // Returns utils.None if no QC has been written that covers n yet, + // or n is below the retention watermark. Non-blocking. + ReadQCByBlockNumber(ctx context.Context, n types.GlobalBlockNumber) (utils.Option[*types.FullCommitQC], error) + + // Close releases resources held by the store. After Close returns, + // no other method may be called on the Store; doing so is + // undefined. + Close(ctx context.Context) error +} + +// Loaded is the result of Store.ReadAll — a point-in-time view of every +// block and QC not yet pruned, used by data.State.NewState to rebuild +// in-memory state at startup. +type Loaded struct { + // Blocks is the set of persisted blocks in ascending + // GlobalBlockNumber order. Each entry pairs the block with the + // GlobalBlockNumber it was written at; the Block type does not + // carry a global number on its own — its header carries a per-lane + // BlockNumber, which is distinct. + Blocks []BlockEntry + + // QCs is the set of persisted FullCommitQCs in ascending + // GlobalRange().First order. Each QC covers a contiguous range + // [First, Next); successive entries' ranges are contiguous (no + // gaps), but the first QC's First is not required to equal + // committee.FirstBlock() — entries with GlobalRange().Next at or + // below the retention watermark have been pruned. + QCs []*types.FullCommitQC +} + +// BlockEntry pairs a finalized block with its GlobalBlockNumber. The +// GlobalBlockNumber is the index data.State uses to address the block; +// it is not stored inside *types.Block itself, so the Store records it +// alongside on WriteBlock and returns it on ReadAll. +type BlockEntry struct { + Number types.GlobalBlockNumber + Block *types.Block +}